You can solve this with an RNN. First, let's create an example dataframe to work with:
import pandas as pd
import numpy as np
test_df = pd.DataFrame({'year':range(2008,2020)})
# 0-java, 1-php, 2-python
for ind in range(3): test_df['frac_%i' % ind] = np.random.rand(2020-2008)
test_df = test_df.drop('year',axis=1)
# the array of fractions
data = test_df.values
Before dropping the column year, test_df looks like:
year frac_0 frac_1 frac_2
0 2008 0.457123 0.780754 0.978396
1 2009 0.578795 0.323664 0.909824
2 2010 0.707996 0.477242 0.948976
3 2011 0.455918 0.627572 0.137039
4 2012 0.272352 0.144968 0.831693
5 2013 0.064729 0.233168 0.554654
6 2014 0.754608 0.570530 0.968355
7 2015 0.435918 0.264335 0.727189
8 2016 0.699624 0.455323 0.237246
9 2017 0.824758 0.995260 0.333113
10 2018 0.597993 0.384319 0.750074
11 2019 0.598657 0.533934 0.072334
When using an RNN for time series analysis, the first step is to cast the task as a supervised regression problem, that is, we need to create a table in which each row is
observations of the past n_in years | observations of the year to predict
Here is a function that can help you achieve this (I learned it from this wonderful post):
def series_to_supervised(data, n_in, n_out):
    df = pd.DataFrame(data)
    cols = list()
    # past observations: t-n_in, ..., t-1
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
    # observations to predict: t, ..., t+n_out-1
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    agg = pd.concat(cols, axis=1)
    # drop the rows with NaNs introduced by the shifting
    agg.dropna(inplace=True)
    return agg.values
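For intuition, here is a quick sanity check on a toy array (the toy variable and its values are purely illustrative): with n_in=2 and n_out=1, each output row holds the two previous years followed by the year to predict.
toy = np.arange(15).reshape(5, 3)   # 5 "years", 3 "fractions" per year
sup = series_to_supervised(toy, 2, 1)
print(sup.shape)  # (3, 9): 5 rows minus the 2 lost to the shifts; 2*3 inputs + 3 targets
print(sup[0])     # [0. 1. 2. 3. 4. 5. 6. 7. 8.] -> years 0 and 1 as inputs, year 2 as target (floats because of the NaN shifts)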
With this function we can create the desired supervised-learning array:
n_in, n_out = 2, 1
data = series_to_supervised(data, n_in, n_out)
n_in is the number of past years we want to use to make predictions, and n_out is the number of years we want to predict. In this case, we predict one year from the data of the past two years.
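As a quick check on the 12-year example frame above, the resulting array should have 10 rows and 9 columns:
print(data.shape)  # (10, 9): 12 years minus the 2 rows lost to the shifts; 2*3 inputs + 3 targets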
Now that we have prepared the data, we can train an RNN model:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
# inputs: the 2*3 past-year fractions, reshaped to (samples, timesteps=1, features=6)
# targets: the 3 fractions of the year to predict
x, y = data[:, None, :-n_out*3], data[:, n_in*3:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=49)
model = Sequential()
model.add(LSTM(4, name='lstm_0'))
model.add(Dropout(0.2, name='dropout_0'))
model.add(Dense(3, activation='tanh'))  # one output per language
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
# fit
history = model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=50,verbose=0)
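To sanity-check the fit, you can inspect the recorded training and validation losses (a minimal sketch; with the compile settings above, Keras stores them under the standard loss/val_loss keys):
# compare training and validation error over the last few epochs
print(history.history['loss'][-3:])
print(history.history['val_loss'][-3:])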
With this model, you can predict the fractions for 2020, 2021 and 2022 recursively, feeding each prediction back in as part of the next input:
# predict 2020 with 2018 and 2019
last_two_years = np.hstack((test_df.values[-2],test_df.values[-1]))[None,None,:]
frac_2020 = model.predict(last_two_years)
# predict 2021 with 2019 and 2020
last_two_years = np.hstack((test_df.values[-1],frac_2020.ravel()))[None,None,:]
frac_2021 = model.predict(last_two_years)
# predict 2022 with 2020 and 2021
last_two_years = np.hstack((frac_2020.ravel(),frac_2021.ravel()))[None,None,:]
frac_2022 = model.predict(last_two_years)
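If you want the forecasts in the same tabular form as test_df, you can stack them into a small frame (a sketch; the forecast_df name is just for illustration, and the frac_i column names mirror the convention used above):
forecast_df = pd.DataFrame(
    np.vstack((frac_2020, frac_2021, frac_2022)),
    columns=['frac_0', 'frac_1', 'frac_2'],
    index=[2020, 2021, 2022],
)
print(forecast_df)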
The full script:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
def series_to_supervised(data, n_in, n_out):
    df = pd.DataFrame(data)
    cols = list()
    # past observations: t-n_in, ..., t-1
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
    # observations to predict: t, ..., t+n_out-1
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    agg = pd.concat(cols, axis=1)
    # drop the rows with NaNs introduced by the shifting
    agg.dropna(inplace=True)
    return agg.values
test_df = pd.DataFrame({'year':range(2008,2020)})
# 0-java, 1-php, 2-python
for ind in range(3): test_df['frac_%i' % ind] = np.random.rand(2020-2008)
test_df = test_df.drop('year',axis=1)
# the array of fractions
data = test_df.values
# cast the task as a supervised regression task
n_in, n_out = 2, 1
data = series_to_supervised(data, n_in, n_out)
# train test split
# reshape inputs to (samples, timesteps=1, features) and split off the targets
x, y = data[:, None, :-n_out*3], data[:, n_in*3:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=49)
model = Sequential()
model.add(LSTM(4,name='lstm_0'))
model.add(Dropout(0.2,name='dropout_0'))
model.add(Dense(3,activation='tanh'))
model.compile(loss='mse',optimizer='adam',metrics=['mse'])
# fit
history = model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=50,verbose=0)
# predict 2020 with 2018 and 2019
last_two_years = np.hstack((test_df.values[-2],test_df.values[-1]))[None,None,:]
frac_2020 = model.predict(last_two_years)
# predict 2021 with 2019 and 2020
last_two_years = np.hstack((test_df.values[-1],frac_2020.ravel()))[None,None,:]
frac_2021 = model.predict(last_two_years)
# predict 2022 with 2020 and 2021
last_two_years = np.hstack((frac_2020.ravel(),frac_2021.ravel()))[None,None,:]
frac_2022 = model.predict(last_two_years)
print(frac_2020,frac_2021,frac_2022)