I'm new to ARIMA and am attempting to model a dataset in Python using auto-ARIMA. I'm using auto-ARIMA because I believe it will be better at choosing the values of p, d and q; however, the results are poor and I need some guidance. Please see my reproducible attempt below.
Attempt as follows:
# DEPENDENCIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pmdarima as pm
from pmdarima.model_selection import train_test_split
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import ADFTest
from pmdarima import auto_arima
from sklearn.metrics import r2_score
# CREATE DATA
data_plot = pd.DataFrame(data removed)
# SET INDEX
data_plot['date_index'] = pd.to_datetime(data_plot['date']
data_plot.set_index('date_index', inplace=True)
# CREATE ARIMA DATASET
arima_data = data_plot[['value']]
arima_data
# PLOT DATA
arima_data['value'].plot(figsize=(7,4))
The above steps result in a dataset that should look like this.
# Dickey-Fuller test for stationarity, run at the 5% significance level (alpha = 0.05).
# The bare call on the last line is left as an expression so a notebook echoes its result.
# NOTE(review): should_diff presumably returns a (p-value, should-difference flag) pair -
# confirm against the pmdarima ADFTest documentation.
adf_test = ADFTest(alpha = 0.05)
adf_test.should_diff(arima_data)
Result = 0.9867, indicating non-stationary data, which should be handled by an appropriate order of differencing later in the auto-ARIMA process.
# Assign training and test subsets - the last 24 observations are held out for testing
# (intended as an 80:20 split; the percentages printed below show the actual ratio).
print('Dataset dimensions;', arima_data.shape)
train_data = arima_data[:-24]
test_data = arima_data[-24:]
# Report each subset's own share of the full dataset. The test line previously reused
# len(train_data), so both lines printed the training percentage.
print('Training data dimension:', train_data.shape, round((len(train_data)/len(arima_data)*100),2),'% of dataset')
print('Test data dimension:', test_data.shape, round((len(test_data)/len(arima_data)*100),2),'% of dataset')
# Plot training & test data
plt.plot(train_data)
plt.plot(test_data)
# Run auto arima
# Stepwise seasonal search on the training data with both differencing orders pinned
# (d=1, D=1); the AIC of the selected model is printed afterwards.
arima_model = auto_arima(
    train_data,
    # Non-seasonal orders: p and q start at 0 and are capped at 5; d is fixed at 1.
    start_p=0, max_p=5,
    start_q=0, max_q=5,
    # NOTE(review): with d given explicitly, max_d is presumably unused - confirm.
    d=1, max_d=5,
    # Seasonal orders with a period of 12 observations; D is fixed at 1.
    seasonal=True, m=12,
    start_P=0, max_P=5,
    start_Q=0, max_Q=5,
    # NOTE(review): with D given explicitly, max_D is presumably unused - confirm.
    D=1, max_D=5,
    stationary=False,
    # Search strategy.
    # NOTE(review): random_state and n_fits look like random-search options and
    # presumably have no effect while stepwise=True - confirm in the pmdarima docs.
    stepwise=True,
    random_state=20, n_fits=50,
    # Diagnostics: log each candidate fit; warn (rather than raise) on a failed fit.
    trace=True,
    error_action='warn',
    suppress_warnings=True,
)
print(arima_model.aic())
Output suggests best model is 'ARIMA(1,1,1)(0,1,0)[12]'
with AIC 1725.35484
# Store predicted values and view resultant df
# Forecast exactly as many periods as there are test rows. The original asked for 25
# periods against a 24-row test set, so the forecast length did not match the index.
forecast = arima_model.predict(n_periods=len(test_data))
# NOTE(review): depending on the pmdarima version, predict() may return a NumPy array or
# a pandas Series carrying its own index - confirm for the installed version.
# np.asarray drops any such index so the values are paired positionally with
# test_data.index instead of being re-aligned by label (which could yield all-NaN).
prediction = pd.DataFrame(
    {'predicted_value': np.asarray(forecast)},
    index=test_data.index,
)
prediction
# Plot prediction against test and training trends
# One labelled line per series, drawn in order on a single 7x4 figure.
plt.figure(figsize=(7, 4))
for series, label in (
    (train_data, "Training"),
    (test_data, "Test"),
    (prediction, "Predicted"),
):
    plt.plot(series, label=label)
plt.legend(loc='upper right')
plt.show()
# Finding r2 model score
# test_data is a slice of arima_data, so adding a column to it in place can trigger
# pandas' SettingWithCopyWarning. assign() returns a new frame instead, and selecting
# the 'predicted_value' Series explicitly avoids assigning a whole DataFrame to a column.
# The two frames share test_data's index, so values line up row for row.
test_data = test_data.assign(predicted_value=prediction['predicted_value'])
r2_score(test_data['value'], test_data['predicted_value'])
Result: -6.985