I am vertically concatenating two datasets into a single DataFrame. After fitting a RandomForestRegressor model, I want to plot the actual and predicted values and annotate the plots.
The two datasets I am using can be found at this link.
My prediction code and my attempt at plotting the values are shown below:
import glob
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
# Load every "cubic*.csv" from the working directory, build lag features,
# fit a random forest on a random 60/40 split and plot actual vs. predicted
# values, marking where each source file ends in the test set.

# Sorted so that the concatenation order does not depend on glob's ordering.
csv_paths = sorted(glob.glob(os.path.join('', "cubic*.csv")))
frames = [pd.read_csv(path) for path in csv_paths]
df = pd.concat(frames, ignore_index=True)

# Source file name of every row, aligned with df's index. Kept outside df so
# the feature matrix below is not polluted with a string column.
source = pd.Series(
    np.repeat([os.path.basename(path) for path in csv_paths],
              [len(frame) for frame in frames]),
    index=df.index)

# time is scaled by 100 and stored as a string with ten decimals.
# NOTE(review): plotting string values makes matplotlib treat the x axis as
# categorical - confirm this is what is wanted for the first figure.
df['time'] = pd.Series(["{0:.10f}".format(val * 100) for val in df['time']],
                       index=df.index)

# Lag features: X_t1 and X_t2 are X shifted down by one and two rows.
# NOTE(review): the shift runs over the concatenated frame, so the first rows
# of a later file receive lags from the end of the previous file.
for i in range(1, 3):
    df['X_t' + str(i)] = df['X'].shift(i)
print(df)
# The shifted columns are NaN on the leading rows; drop every incomplete row.
df.dropna(inplace=True)

X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
# time is only used for plotting, not as a model feature.
X_train = X_train.drop('time', axis=1)
X_test = X_test.drop('time', axis=1)

print(X.shape)
print(df['Y'].shape)
print()
print("Size of X_train:", len(X_train))
print("Size of Y_train:", len(y_train))
print("Size of X_test:", len(X_test))
print("Size of Y_test:", len(y_test))
print(X_train.shape)
print(y_train.shape)
print()

# NOTE(review): the original snippet used `reg` without ever creating it;
# default hyper-parameters are an assumption - adjust as needed.
reg = RandomForestRegressor()
reg.fit(X_train, y_train)

# Figure 1: the raw target against time.
fig, ax = plt.subplots()
ax.plot(df['time'].values, df['Y'].values)

# Restore the original row order of the test set before predicting, so that
# rows from the same file are contiguous on the x axis.
y_test = y_test.sort_index()
X_test = X_test.sort_index()
modelPred_test = reg.predict(X_test)
index_values = range(0, len(y_test))

# Figure 2: actual test values against their position in the test set.
fig, ax = plt.subplots()
ax.plot(pd.Series(index_values), y_test.values)

# Figure 3: predicted (column 0) and actual (column 1) test values together.
plotsInOne = pd.DataFrame(pd.concat([pd.Series(modelPred_test), pd.Series(y_test.values)], axis=1))
fig, ax = plt.subplots()
plotsInOne.plot(ax=ax)
ax.legend(loc='best')

# Annotate, for every source file, the last test sample that came from it.
# The positional index (0 .. len-1) is the x coordinate used by figure 3.
test_source = source.loc[X_test.index]
last_position = pd.Series(index_values, index=test_source.values).groupby(level=0).max()
for file_name, position in last_position.items():
    ax.annotate('{0} ends'.format(file_name),
                xy=(position, modelPred_test[position]),
                xycoords='data',
                xytext=(-30, 30),
                textcoords='offset points',
                arrowprops=dict(arrowstyle="->"))
When I take only a single dataset (for example, cubic31.csv) as
# Load only the first dataset into df.
df = pd.read_csv('cubic31.csv')
and apply the plot commands
# Plot Y against time for the frame currently held in df.
fig, ax = plt.subplots()
ax.plot(df['time'].values, df['Y'].values)
# NOTE(review): this second call opens another (empty) figure and rebinds
# fig/ax - confirm it is intended.
fig, ax = plt.subplots()
this is the plot I got.
When I take the second dataset (cubic32.csv) as
# Load only the second dataset into df. The file name follows the
# "cubic32.csv" named in the surrounding text and the "cubic*.csv" pattern
# used when both files are combined.
df = pd.read_csv('cubic32.csv')
and apply the plot commands
# Plot Y against time for the frame currently held in df.
fig, ax = plt.subplots()
ax.plot(df['time'].values, df['Y'].values)
# NOTE(review): this second call opens another (empty) figure and rebinds
# fig/ax - confirm it is intended.
fig, ax = plt.subplots()
this is the plot I got.
However, if I combine the two datasets as
# Read every file matching "cubic*.csv" and stack the frames vertically.
# NOTE(review): no ignore_index=True here, so each file's own row labels are
# kept and the combined index can contain duplicates.
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('', "cubic*.csv"))))
, this is the plot I got.
I want to put a mark where each dataset ends in the plot (as shown by the red arrows in the plots). I tried the following, but it ONLY points at the end of the second dataset and not at the end of the first one, as shown below:
# Label the end of the predicted test series with an arrow; the text is
# offset by (-30, 30) points from the annotated data coordinate.
plt.annotate(
    'annote test!',
    xy=(len(modelPred_test), modelPred_test[-1]),
    xycoords='data',
    xytext=(-30, 30),
    textcoords='offset points',
    arrowprops={'arrowstyle': "->"},
)
How can I change the plotting code so that the plots are annotated and labelled automatically (for example: "this is where cubic31.csv ends", "this is where cubic32.csv ends", etc.), as shown below?