I have this time series data:
library(forecast)
library(lubridate)
set.seed(123)
weeks <- rep(seq(as.Date("2010-01-01"), as.Date("2023-01-01"), by = "week"), each = 1)
counts <- rpois(length(weeks), lambda = 50)
df <- data.frame(Week = as.character(weeks), Count = counts)
# Convert Week column to Date format
df$Week <- as.Date(df$Week)
# Create a time series object
ts_data <- ts(df$Count, frequency = 52, start = c(year(min(df$Week)), 1))
I want to fit a time series model to this data and implement a "rolling window cross validation"
As I understand, this involves (ordering the data in chronological order):
- Fit a model to the first 60 data points, predict the next 5 and record the error (e.g. rmse, mae, mape)
- Next, fit a model model to the first 65 points, predict the next 5 and record the error
- etc.
Based on the answer I got in a previous question (R: Sequentually Storing Rows from Data Frame into Lists), I tried to apply this logic for my question to create the "chunks" of data required for training and testing the model:
x <- seq(from = 60, to = 679, by = 5)
y <- x[-length(x)] + 1
#train <- lapply(x, \(x) df[seq_len(x), ])
#test <- lapply(y, \(x) df[seq(from = x, by = 1, length.out = 5), ])
train <- lapply(x, function(x) {
df[seq_len(x), ]
})
test <- lapply(y, function(x) {
df[seq(from = x, by = 1, length.out = 5), ]
})
Then, I tried to run the models (training):
train_list = list()
for (i in 1:length(train))
{
data_i = data.frame(train[i])
data_i = ts(data_i$Count, frequency = 52, start = c(year(min(data_i$Week)), 1))
fit_arima_i <- auto.arima( data_i , seasonal = TRUE, lambda = "auto")
train_list[[i]] = fit_arima_i
print(fit_arima_i)
}
And then testing:
test_list = list()
mae = list()
rmse = list()
mape = list()
for (i in 1:length(test)) {
tryCatch({
test_i <- data.frame(test[i])
fcast_i <- forecast(train_list[[i]], h = 5)
mae[i] <- sum(abs(as.numeric(fcast_i$mean) - test_i$Count))
rmse[i] <- sqrt(mean((as.numeric(fcast_i$mean) - test_i$Count)^2))
mape[i] <- mean(abs((as.numeric(fcast_i$mean) - test_i$Count) / test_i$Count)) * 100
#print(c(sum(abs(as.numeric(fcast_i$mean) - test_i$Count)), sqrt(mean((as.numeric(fcast_i$mean) - test_i$Count)^2)), mean(abs((as.numeric(fcast_i$mean) - test_i$Count) / test_i$Count)) * 100))
}, error = function(e) {
# Handle the error here
message(paste0("Error occurred for i = ", i, ": ", e))
})
}
Can someone please tell me if I am doing this correctly? Perhaps any alternate ways to do this with some pre-existing package?
Thanks!
Note 1: An extra approach using matrices (to store errors up to 5 forecasts ahead)
rmse = matrix(0, nrow = length(test) , ncol = 5)
mae = matrix(0, nrow = length(test) , ncol = 5)
mape = matrix(0, nrow = length(test) , ncol = 5)
for (i in 1:length(test)) {
tryCatch({
test_i <- data.frame(test[i])
fcast_1_i <- forecast(train_list[[i]], h = 1)
fcast_2_i <- forecast(train_list[[i]], h = 2)
fcast_3_i <- forecast(train_list[[i]], h = 3)
fcast_4_i <- forecast(train_list[[i]], h = 4)
fcast_5_i <- forecast(train_list[[i]], h = 5)
mae[i,1] <- sum(abs(as.numeric(fcast_1_i$mean) - test_i[1,2]))
rmse[i,1] <- sqrt(mean((as.numeric(fcast_1_i$mean) - test_i[1,2])^2))
mape[i,1] <- mean(abs((as.numeric(fcast_1_i$mean) - test_i[1,2]) / test_i[1,2])) * 100
mae[i,2] <- sum(abs(as.numeric(fcast_2_i$mean) - test_i[1:2,2]))
rmse[i,2] <- sqrt(mean((as.numeric(fcast_2_i$mean) - test_i[1:2,2])^2))
mape[i,2] <- mean(abs((as.numeric(fcast_2_i$mean) - test_i[1:2,2]) / test_i[1:2,2])) * 100
mae[i,3] <- sum(abs(as.numeric(fcast_3_i$mean) - test_i[1:3,2]))
rmse[i,3] <- sqrt(mean((as.numeric(fcast_3_i$mean) - test_i[1:3,2])^2))
mape[i,3] <- mean(abs((as.numeric(fcast_3_i$mean) - test_i[1:3,2]) / test_i[1:3,2])) * 100
mae[i,4] <- sum(abs(as.numeric(fcast_4_i$mean) - test_i[1:4,2]))
rmse[i,4] <- sqrt(mean((as.numeric(fcast_4_i$mean) - test_i[1:4,2])^2))
mape[i,4] <- mean(abs((as.numeric(fcast_4_i$mean) - test_i[1:4,2]) / test_i[1:4,2])) * 100
mae[i,5] <- sum(abs(as.numeric(fcast_5_i$mean) - test_i$Count))
rmse[i,5] <- sqrt(mean((as.numeric(fcast_5_i$mean) - test_i$Count)^2))
mape[i,5] <- mean(abs((as.numeric(fcast_5_i$mean) - test_i$Count) / test_i$Count)) * 100
}, error = function(e) {
# Handle the error here
message(paste0("Error occurred for i = ", i, ": ", e))
})
}
Note 2: The same approach using sub-loops for efficiency:
n_horizon <- 5
rmse <- matrix(0, nrow = length(test), ncol = n_horizon)
mae <- matrix(0, nrow = length(test), ncol = n_horizon)
mape <- matrix(0, nrow = length(test), ncol = n_horizon)
for (i in 1:length(test)) {
tryCatch({
test_i <- data.frame(test[i])
fcast <- list()
for (h in 1:n_horizon) {
fcast[[h]] <- forecast(train_list[[i]], h = h)
mae[i, h] <- sum(abs(as.numeric(fcast[[h]]$mean) - test_i[1:h, 2]))
rmse[i, h] <- sqrt(mean((as.numeric(fcast[[h]]$mean) - test_i[1:h, 2])^2))
mape[i, h] <- mean(abs((as.numeric(fcast[[h]]$mean) - test_i[1:h, 2]) / test_i[1:h, 2])) * 100
}
}, error = function(e) {
message(paste0("Error occurred for i = ", i, ": ", e))
})
}
Note 3: Plot the Results
# Create the plot
par(mfrow = c(1, 3))
# Plot for MAE
plot(1:n_horizon, colMeans(mae_arima, na.rm = TRUE), type = "b", xlab = "Forecast Horizon (in months)", ylab = "MAE",
main = "MAE for ARIMA and ETS Models", col = "blue", lty = 1)
lines(1:n_horizon, colMeans(mae_ets, na.rm = TRUE), type = "b", col = "green", lty = 2)
legend("topright", c("ARIMA", "ETS"), col = c("blue", "green"), lty = c(1, 2))
# Plot for RMSE
plot(1:n_horizon, colMeans(rmse_arima, na.rm = TRUE), type = "b", xlab = "Forecast Horizon (in months)", ylab = "RMSE",
main = "RMSE for ARIMA and ETS Models", col = "red", lty = 3)
lines(1:n_horizon, colMeans(rmse_ets, na.rm = TRUE), type = "b", col = "orange", lty = 4)
legend("topright", c("ARIMA", "ETS"), col = c("red", "orange"), lty = c(3, 4))
# Plot for MAPE
plot(1:n_horizon, colMeans(mape_arima, na.rm = TRUE), type = "b", xlab = "Forecast Horizon (in months)", ylab = "MAPE",
main = "MAPE for ARIMA and ETS Models", col = "red", lty = 3)
lines(1:n_horizon, colMeans(mape_ets, na.rm = TRUE), type = "b", col = "orange", lty = 4)
legend("topright", c("ARIMA", "ETS"), col = c("red", "orange"), lty = c(3, 4))