2

I am vertically appending two different datasets (dataframes) into a single series. I want to annotate the plots after fitting a model with RandomForestRegressor and plotting the actual and predicted values. The two datasets I am considering can be found at this link.

My prediction code and my attempt at plotting the values are shown below:

import glob
import os    
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns


# Vertically append every cubic*.csv in the working directory into one frame
# with a fresh 0..n-1 index. sorted() pins the file order (and therefore the
# row order); glob.glob() on its own returns paths in arbitrary order.
csv_paths = sorted(glob.glob(os.path.join('', "cubic*.csv")))
df = pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)

# Rescale time by 100 and store it as a fixed-precision string.
# NOTE(review): the column now holds strings, so the time plot below will
# presumably get a categorical x axis -- confirm this is intended.
df['time'] = pd.Series(["{0:.10f}".format(val * 100) for val in df['time']],
                       index=df.index)

# Lag features: X_t1 and X_t2 hold X shifted down by one and two rows.
for i in range(1, 3):
    df['X_t' + str(i)] = df['X'].shift(i)

print(df)

# Drop every row containing NaN (the shifts put NaN in the first two rows).
df.dropna(inplace=True)

# Features are every column except the target Y; 'time' is removed from the
# train/test feature matrices right after the split.
X = df.drop('Y', axis=1)
y = df['Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
X_train = X_train.drop('time', axis=1)
X_test = X_test.drop('time', axis=1)

print(X.shape)
print(df['Y'].shape)

print()
print("Size of X_train:", len(X_train))
print("Size of Y_train:", len(y_train))
print("Size of X_test:", len(X_test))
print("Size of Y_test:", len(y_test))

print(X_train.shape)
print(y_train.shape)

print()

# The original script called reg.predict() without ever creating or fitting
# `reg`; fit the imported RandomForestRegressor on the training split.
# NOTE(review): default hyper-parameters are an assumption -- adjust as needed.
reg = RandomForestRegressor()
reg.fit(X_train, y_train)

####### to add the trendline
fig, ax = plt.subplots()
ax.plot(df['time'].values, df['Y'].values)

# Put the shuffled test rows back into index order before predicting, so
# predictions and actual values line up row by row. Rebinding (rather than
# sorting in place) avoids mutating objects handed back by train_test_split.
y_test = y_test.sort_index()
X_test = X_test.sort_index()
modelPred_test = reg.predict(X_test)

# Second figure: actual test values against their 0..n-1 position.
fig, ax = plt.subplots()
index_values = range(0, len(y_test))
ax.plot(pd.Series(index_values), y_test.values)

# The annotation has to come after the prediction: the original referenced
# modelPred_test before it was assigned.
# NOTE(review): it is drawn on the actual-values axes, matching the original
# statement order; move it below plotsInOne.plot() if it should mark the
# combined plot instead.
ax.annotate('annote test!',
            xy=(len(modelPred_test), modelPred_test[-1]),
            xycoords='data',
            xytext=(-30, 30),
            textcoords='offset points',
            arrowprops=dict(arrowstyle="->"))

# Column 0 = predicted values, column 1 = actual values.
plotsInOne = pd.concat([pd.Series(modelPred_test), pd.Series(y_test.values)], axis=1)

plt.figure()
plotsInOne.plot()
plt.legend(loc='best')

When I take only a single dataset (for example: cubic31.csv) as df = pd.read_csv('cubic31.csv') and apply the plot commands

    # First figure: Y plotted against time for the frame loaded above.
    fig, ax = plt.subplots()
    ax.plot(df['time'].values, df['Y'].values)
    # Creates a second figure and rebinds fig/ax to it; nothing is drawn on it here.
    fig, ax = plt.subplots()

This is the plot I got:

enter image description here

When we take the second dataset (cubic32.csv) as

df = pd.read_csv('cubic32.csv') and apply the plot commands

    # First figure: Y plotted against time for the frame loaded above.
    fig, ax = plt.subplots()
    ax.plot(df['time'].values, df['Y'].values)
    # Creates a second figure and rebinds fig/ax to it; nothing is drawn on it here.
    fig, ax = plt.subplots()

This is the plot I got:

enter image description here

However, if I combine the two datasets as

df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('', "cubic*.csv")))), this is the plot I got:

enter image description here

I want to put a mark where each plot ends (as shown by the red arrow in the plots). I tried the following, but it points ONLY at the end of the second plot and not at the end of the first, as shown below:

# Single arrow anchored at x = number of predictions, y = last predicted value
# (both in data coordinates); the label sits 30 points left of and above it.
plt.annotate('annote test!', 
             xy=(len(modelPred_test), modelPred_test[-1]),  
             xycoords='data',
             xytext=(-30,30),
             textcoords='offset points',
             arrowprops=dict(arrowstyle="->"))

enter image description here

How can we change the plotting commands so that the plots are annotated and labelled automatically (for example: this is where cubic31.csv ends, this is where cubic32.csv ends, etc.), as shown below?

enter image description here

Desta Haileselassie Hagos
  • 23,140
  • 7
  • 48
  • 53

1 Answer

3

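Consider using positional indexing (df.ix[] in older pandas; df.iloc[] in pandas 1.0 and later, where .ix was removed) in the annotate call's xy coordinates, retrieving the last row's position from df.shape (the tuple of the number of rows and columns). The code below demonstrates this with random data (seeded to be reproducible). The second dataframe is half the size of the first. It also shows annotated plots for both a horizontal merge and a vertical append: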

Data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def _seeded_frame(seed, n_rows):
    """Return a frame of *n_rows* seeded random ``Y`` values and ``time`` 0..n_rows-1."""
    np.random.seed(seed)
    # Values are drawn one scalar at a time, each from [0, 1000).
    y_values = [np.random.randint(0, 1000) for _ in range(n_rows)]
    return pd.DataFrame({'Y': y_values, 'time': range(n_rows)})


# DATAFRAME 1: seed 33, 50 rows
df1 = _seeded_frame(33, 50)

# DATAFRAME 2: seed 64, 25 rows (half the size of the first)
df2 = _seeded_frame(64, 25)

Individual Annotate

def runplot(df, title):
    """Plot ``Y`` against ``time`` in a new figure and mark the last row.

    Args:
        df: DataFrame with ``time`` and ``Y`` columns.
        title: Title text for the figure.
    """
    fig, ax = plt.subplots()
    ax.plot(df['time'].values, df['Y'].values)
    ax.set_title(title)

    # An empty frame has no final point to annotate.
    if df.empty:
        return

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.iloc[-1]`` selects the
    # last row by position whatever the index labels are. The arrow is
    # anchored on the last plotted point (last ``time``, last ``Y``) rather
    # than on the row count, which sits one step past the data when ``time``
    # is 0..n-1 and is unrelated to the x axis for any other time column.
    ax.annotate('annote test!',
                xy=(df['time'].iloc[-1], df['Y'].iloc[-1]),
                xycoords='data',
                xytext=(-30, 30),
                textcoords='offset points',
                arrowprops=dict(arrowstyle="->"))


# One annotated figure per sample frame, in the same order as before.
for frame, heading in ((df1, 'Dataframe 1'), (df2, 'Dataframe 2')):
    runplot(frame, heading)

Dataframe 1 Plot Dataframe 2 Plot

Horizontal Merge / Vertical Append Annotate

def runplot_merge(left_df, right_df, df, title):
    """Plot the combined frame and mark where each source frame ends.

    Args:
        left_df: First source DataFrame (``time`` and ``Y`` columns).
        right_df: Second source DataFrame (``time`` and ``Y`` columns).
        df: Combined DataFrame whose ``time``/``Y`` data is drawn.
        title: Title text for the figure.
    """
    fig, ax = plt.subplots()
    ax.plot(df['time'].values, df['Y'].values)
    ax.set_title(title)

    labelled_sources = (
        ('annote test 1!', left_df),
        ('annote test 2!', right_df),
    )
    for label, frame in labelled_sources:
        # An empty source frame has no final point to annotate.
        if frame.empty:
            continue
        # ``DataFrame.ix`` was removed in pandas 1.0; ``.iloc[-1]`` selects
        # the last row by position. The arrow is anchored on that frame's
        # last (time, Y) point instead of on its row count, which is not an
        # x-axis value.
        ax.annotate(label,
                    xy=(frame['time'].iloc[-1], frame['Y'].iloc[-1]),
                    xycoords='data',
                    xytext=(-30, 30),
                    textcoords='offset points',
                    arrowprops=dict(arrowstyle="->"))


# Horizontal merge: df2's columns are placed to the right of df1's.
sources = [df1, df2]
df_wide = pd.concat(sources, axis=1)
runplot_merge(df1, df2, df_wide, 'Horizontal Merge')

# Vertical append: stack the rows, renumber the index from 0, then order by time.
df_long = pd.concat(sources, ignore_index=True).sort_values('time')
runplot_merge(df1, df2, df_long, 'Vertical Append')

# Display every open figure, then clear and close the current one.
plt.show()
plt.clf()
plt.close()

Horizontal Merge Plot Vertical Append

Parfait
  • 104,375
  • 17
  • 94
  • 125
  • 1
    I did not mean for you to copy my solution but to show how you produced your version from the posted linked data. I still believe that desired plot uses a different dataset. And you want 100 annotations in one plot? I would read all dataframes into a list, `pd.concat()` using list, then iterate through list passing dfs into `plt.annotate(...)` but plot concatenated df. – Parfait Aug 10 '17 at 01:17