-1

I have this time series data:

library(forecast)
library(lubridate)

set.seed(123)

weeks <- rep(seq(as.Date("2010-01-01"), as.Date("2023-01-01"), by = "week"), each = 1)
counts <- rpois(length(weeks), lambda = 50)
df <- data.frame(Week = as.character(weeks), Count = counts)

# Convert Week column to Date format
df$Week <- as.Date(df$Week)

# Create a time series object
ts_data <- ts(df$Count, frequency = 52, start = c(year(min(df$Week)), 1))

I want to fit a time series model to this data and implement a "rolling window cross validation"

As I understand, this involves (ordering the data in chronological order):

  • Fit a model to the first 60 data points, predict the next 5 and record the error (e.g. rmse, mae, mape)
  • Next, fit a model model to the first 65 points, predict the next 5 and record the error
  • etc.

enter image description here

Based on the answer I got in a previous question (R: Sequentually Storing Rows from Data Frame into Lists), I tried to apply this logic for my question to create the "chunks" of data required for training and testing the model:

x <- seq(from = 60, to = 679, by = 5)
y <- x[-length(x)] + 1

#train <- lapply(x, \(x) df[seq_len(x), ])
#test <- lapply(y, \(x) df[seq(from = x, by = 1, length.out = 5), ])

train <- lapply(x, function(x) {
    df[seq_len(x), ]
})

test <- lapply(y, function(x) {
    df[seq(from = x, by = 1, length.out = 5), ]
})

Then, I tried to run the models (training):

train_list = list()

for (i in 1:length(train))
{
    data_i = data.frame(train[i])
    data_i = ts(data_i$Count, frequency = 52, start = c(year(min(data_i$Week)), 1))
    fit_arima_i <- auto.arima( data_i , seasonal = TRUE, lambda = "auto")
train_list[[i]] = fit_arima_i
print(fit_arima_i)
}

And then testing:

 test_list = list()
mae = list()
rmse = list()
mape = list()

for (i in 1:length(test)) {
    tryCatch({
        test_i <- data.frame(test[i])
        fcast_i <- forecast(train_list[[i]], h = 5)
        mae[i] <- sum(abs(as.numeric(fcast_i$mean) - test_i$Count))
        rmse[i] <- sqrt(mean((as.numeric(fcast_i$mean) - test_i$Count)^2))
        mape[i] <- mean(abs((as.numeric(fcast_i$mean) - test_i$Count) / test_i$Count)) * 100
        #print(c(sum(abs(as.numeric(fcast_i$mean) - test_i$Count)), sqrt(mean((as.numeric(fcast_i$mean) - test_i$Count)^2)), mean(abs((as.numeric(fcast_i$mean) - test_i$Count) / test_i$Count)) * 100))
    }, error = function(e) {
        # Handle the error here
        message(paste0("Error occurred for i = ", i, ": ", e))
    })
}

Can someone please tell me if I am doing this correctly? Perhaps any alternate ways to do this with some pre-existing package?

Thanks!

Note 1: An extra approach using matrices (to store errors up to 5 forecasts ahead)

rmse = matrix(0, nrow = length(test) , ncol = 5)
mae = matrix(0, nrow = length(test) , ncol = 5)
mape = matrix(0, nrow = length(test) , ncol = 5)    

for (i in 1:length(test)) {
    tryCatch({
        test_i <- data.frame(test[i])
        fcast_1_i <- forecast(train_list[[i]], h = 1)
        
        fcast_2_i <- forecast(train_list[[i]], h = 2)
        fcast_3_i <- forecast(train_list[[i]], h = 3)
        fcast_4_i <- forecast(train_list[[i]], h = 4)
        fcast_5_i <- forecast(train_list[[i]], h = 5)
        
        mae[i,1] <- sum(abs(as.numeric(fcast_1_i$mean) -  test_i[1,2]))
        rmse[i,1] <- sqrt(mean((as.numeric(fcast_1_i$mean) -  test_i[1,2])^2))
        mape[i,1] <- mean(abs((as.numeric(fcast_1_i$mean) -  test_i[1,2]) /  test_i[1,2])) * 100
        
        
        mae[i,2] <- sum(abs(as.numeric(fcast_2_i$mean) -  test_i[1:2,2]))
        rmse[i,2] <- sqrt(mean((as.numeric(fcast_2_i$mean) -  test_i[1:2,2])^2))
        mape[i,2] <- mean(abs((as.numeric(fcast_2_i$mean) -  test_i[1:2,2]) /  test_i[1:2,2])) * 100
        
        mae[i,3] <- sum(abs(as.numeric(fcast_3_i$mean) -  test_i[1:3,2]))
        rmse[i,3] <- sqrt(mean((as.numeric(fcast_3_i$mean) -  test_i[1:3,2])^2))
        mape[i,3] <- mean(abs((as.numeric(fcast_3_i$mean) -  test_i[1:3,2]) /  test_i[1:3,2])) * 100
        
                   
        mae[i,4] <- sum(abs(as.numeric(fcast_4_i$mean) -  test_i[1:4,2]))
        rmse[i,4] <- sqrt(mean((as.numeric(fcast_4_i$mean) -  test_i[1:4,2])^2))
        mape[i,4] <- mean(abs((as.numeric(fcast_4_i$mean) -  test_i[1:4,2]) /  test_i[1:4,2])) * 100
                    
        
        mae[i,5] <- sum(abs(as.numeric(fcast_5_i$mean) - test_i$Count))
        rmse[i,5] <- sqrt(mean((as.numeric(fcast_5_i$mean) - test_i$Count)^2))
        mape[i,5] <- mean(abs((as.numeric(fcast_5_i$mean) - test_i$Count) / test_i$Count)) * 100
        
        
    }, error = function(e) {
        # Handle the error here
        message(paste0("Error occurred for i = ", i, ": ", e))
    })
} 

Note 2: The same approach using sub-loops for efficiency:

n_horizon <- 5
rmse <- matrix(0, nrow = length(test), ncol = n_horizon)
mae <- matrix(0, nrow = length(test), ncol = n_horizon)
mape <- matrix(0, nrow = length(test), ncol = n_horizon)    

for (i in 1:length(test)) {
  tryCatch({
    test_i <- data.frame(test[i])
    fcast <- list()
    
    for (h in 1:n_horizon) {
      fcast[[h]] <- forecast(train_list[[i]], h = h)
      mae[i, h] <- sum(abs(as.numeric(fcast[[h]]$mean) - test_i[1:h, 2]))
      rmse[i, h] <- sqrt(mean((as.numeric(fcast[[h]]$mean) - test_i[1:h, 2])^2))
      mape[i, h] <- mean(abs((as.numeric(fcast[[h]]$mean) - test_i[1:h, 2]) / test_i[1:h, 2])) * 100
    }
    
  }, error = function(e) {
    message(paste0("Error occurred for i = ", i, ": ", e))
  })
}

Note 3: Plot the Results

# Create the plot
par(mfrow = c(1, 3))

# Plot for MAE
plot(1:n_horizon, colMeans(mae_arima, na.rm = TRUE), type = "b", xlab = "Forecast Horizon (in months)", ylab = "MAE",
     main = "MAE for ARIMA and ETS Models", col = "blue", lty = 1)
lines(1:n_horizon, colMeans(mae_ets, na.rm = TRUE), type = "b", col = "green", lty = 2)
legend("topright", c("ARIMA", "ETS"), col = c("blue", "green"), lty = c(1, 2))

# Plot for RMSE
plot(1:n_horizon, colMeans(rmse_arima, na.rm = TRUE), type = "b", xlab = "Forecast Horizon (in months)", ylab = "RMSE",
     main = "RMSE for ARIMA and ETS Models", col = "red", lty = 3)
lines(1:n_horizon, colMeans(rmse_ets, na.rm = TRUE), type = "b", col = "orange", lty = 4)
legend("topright", c("ARIMA", "ETS"), col = c("red", "orange"), lty = c(3, 4))

# Plot for MAPE
plot(1:n_horizon, colMeans(mape_arima, na.rm = TRUE), type = "b", xlab = "Forecast Horizon (in months)", ylab = "MAPE",
     main = "MAPE for ARIMA and ETS Models", col = "red", lty = 3)
lines(1:n_horizon, colMeans(mape_ets, na.rm = TRUE), type = "b", col = "orange", lty = 4)
legend("topright", c("ARIMA", "ETS"), col = c("red", "orange"), lty = c(3, 4))
stats_noob
  • 5,401
  • 4
  • 27
  • 83
  • 8
    A suggesting to the way you ask questions: Try to shorten and simplify your question such that other can catch your point immediately. Better avoid using huge dataset or complicate logics when describing the problem. – ThomasIsCoding Mar 19 '23 at 21:16
  • @ ThomasIsCoding: thank you for your reply! I try to add the full context so that students in the future who have similar questions can see the full thought process and use my questions as a learning tutorial... but i agree with you... perhaps I should write shorter questions – stats_noob Mar 19 '23 at 21:24
  • 3
    This question is too open ended. If you don't have any programming questions and you are unclear on the underlying theory, Cross Validated might be a better site for your questions. If your objective is to create useful tutorials for future students, writing a blog post and submitting to the r-bloggers aggregator or contributing a package vignette would be a much better format than lengthy questions like this where it is unclear what your actually asking. – Matt Summersgill Mar 21 '23 at 14:00

2 Answers2

2

We can create ts_data directly from weeks and counts and can eliminate lubridate as a dependency without increasing the code size.

As there are not a whole number of weeks in the year the alignment in ts_data will get successively off as the years progress. ts works better with monthly, quarterly or annual data and if the only reason to do this is to demonstrate these functions to students use one of those to eliminate distracting complications; however, below we mostly stick to your setup.

1) tsCV After setting up the data run tsCV from the forecast package. For each week tsCV calls fcast to predict the next 5 weeks . Since ts_data has length 157 tsCV invokes fcast 157 times. giving a 157 x 5 matrix of errors such that the ith row is the errors from predicting ts_data[i + 1:5] based on head(ts_datam, i). These errors are then used to calculate the desired metrics, as shown.

library(forecast)

set.seed(123)

from <- "2010-01-01"
to <- "2013-01-01"

weeks <- seq(as.Date(from), as.Date(to), by = "week")
counts <- rpois(length(weeks), lambda = 50)
ts_data <- ts(counts, frequency = 52, start = substr(from, 1, 4))

fcast <- function(x, h) forecast(auto.arima(x), h = h)
errors <- tsCV(ts_data, fcast, h = 5)

n <- length(ts_data)
metrics <- data.frame(
  MAE = rowMeans(abs(errors)),
  RMSE = sqrt(rowMeans(errors^2)),
  MAPE = sapply(1:n, function(i) 100 * mean(abs(errors[i, ]/ts_data[i + 1:5])))
)

head(metrics)
##        MAE      RMSE     MAPE
## 1 9.400000 10.285913 17.75126
## 2 7.600000  9.186947 17.13744
## 3 7.933333  8.957926 16.61123
## 4 9.400000  9.909591 19.21178
## 5 6.840000  8.301807 16.01443
## 6 6.566667  8.392126 15.61435

A few comments on the above code. The ith row of the output from tsCV equals:

i <- 60  # use 60 as an example
ts_data[i + 1:5] - fcast(head(ts_data, i), 5)$mean[1:5]
## [1]   6.760717  -1.964939 -13.559583   5.512663  -5.474460

errors[i, ]  # same
##        h=1        h=2        h=3        h=4        h=5 
##   6.760717  -1.964939 -13.559583   5.512663  -5.474460 

Be very careful when forming such expressions since indexing a ts class object will turn it into a non-ts object. Had we written

# not the same - head(ts_data, i) is replaced with ts_data[1:i]
i <- 60
ts_data[i + 1:5] - fcast(ts_data[1:i], 5)$mean[1:5]
## [1]   6.274882  -1.470550 -13.470550   5.529450  -5.470550

then ts_data[1:i] would not be a ts object and although auto.arima would convert it to a ts object in the conversion the frequency of 52 would be lost resulting in the different answer just shown.

The forecast package provides head.ts and subset.ts and the stats package in base R provides window.ts to circumvent these problems.

The zoo package, which the forecast package imports, can also be used to avoid these problems since zoo objects can be indexed and converted seamlessly to and from ts objects. For example

identical(head(ts_data, 5), ts_data[1:5])
## [1] FALSE

library(zoo)
z <- as.zoo(ts_data)
identical(head(ts_data, 5), as.ts(z[1:5]))
## [1] TRUE

2) accuracy Another approach is to iteratively apply the accuracy function found in the forecast package. Replace all the lines above from the tsCV line onwards with the code below. This gives the same metric values as in (1).

# calculate ith row of metrics
accuracy1 <- function(i) {
  fc <- fcast(head(ts_data, i), h = 5) 
  actual <- ts_data[i+1:5]
  accuracy(fc, actual)["Test set", c("MAE", "RMSE", "MAPE")]
}

n <- length(ts_data)
metrics2 <- do.call("rbind", lapply(1:n, accuracy1))
head(metrics2)
##           MAE      RMSE     MAPE
## [1,] 9.400000 10.285913 17.75126
## [2,] 7.600000  9.186947 17.13744
## [3,] 7.933333  8.957926 16.61123
## [4,] 9.400000  9.909591 19.21178
## [5,] 6.840000  8.301807 16.01443
## [6,] 6.566667  8.392126 15.61435

IF instead it were desired to predict points 61-65 using the first 60 points and then use the first 65 points to predict points 66-70 and so on returning the 3 metrics use seq(60, n, 5) in place of 1:n above.

G. Grothendieck
  • 254,981
  • 17
  • 203
  • 341
  • thank you so much for your answer! I am sorry I never replied earlier - I thought I did but I must have forgotten – stats_noob Aug 18 '23 at 21:20
  • one question I wanted to ask - is what I did initially correct from a methodology perspective (even though it might not have been very efficient)? thank you so much! – stats_noob Aug 18 '23 at 21:21
1

I would simplify your code.

  • use descriptive names, start_vec_train and start_vec_test instead of x and y;
  • don't create a time series, then data.frames, then back to time series;
  • use lapply/mapply instead of for loops;
  • define functions rmse, mae and mape

The ugly start_vec_test[length(start_vec_test)] is because I have train and test data until the end but the last test series is of length 4 only.

library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(lubridate)
#> Loading required package: timechange
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

set.seed(123)

weeks <- rep(seq(as.Date("2010-01-01"), as.Date("2023-01-01"), by = "week"), each = 1)
counts <- rpois(length(weeks), lambda = 50)
df <- data.frame(Week =weeks, Count = counts)
str(df)
#> 'data.frame':    679 obs. of  2 variables:
#>  $ Week : Date, format: "2010-01-01" "2010-01-08" ...
#>  $ Count: int  46 58 38 50 62 53 41 37 58 52 ...

# Create a time series object
ts_data <- ts(df$Count, frequency = 52, start = c(year(min(df$Week)), 1))
plot(ts_data)


# define functions
mse <- function(x, y) mean((x - y)^2)
rmse <- function(x, y) sqrt(mse(x, y))
mae <- function(x, y) sum(abs(x - y))
mape <- function(x, y) mean(abs( (x - y)/y )) * 100

# both vectors have the same length
start_vec_train <- seq(from = 60, to = 679, by = 5)
start_vec_test <- start_vec_train + 1

train <- lapply(start_vec_train, function(.x) {
  ts(ts_data[seq_len(.x)], frequency = 52, start = c(time(ts_data)[1], 1))
})

test <- lapply(start_vec_test, function(.x) {
  length_out <- 5L - (.x == start_vec_test[length(start_vec_test)])
  y <- ts_data[seq(from = .x, by = 1, length.out = length_out)]
  ts(y, start = time(ts_data)[.x], frequency = 52)
})

model_list <- lapply(train, \(x) {
  auto.arima(x, seasonal = TRUE, lambda = "auto")
})

pred_list <- mapply(
  \(fit, .y) {
    h <- if(.y == start_vec_test[length(start_vec_test)]) 4 else 5
    forecast(fit, h = h)
  },
  model_list, start_vec_test,
  SIMPLIFY = FALSE
)

stats_list <- mapply(
  \(fit, tst) {
    c(
      mae = mae(fit$mean, tst),
      rmse = rmse(fit$mean, tst),
      mape = mape(fit$mean, tst)
    )
  }, 
  pred_list, test,
  SIMPLIFY = FALSE
)

stats_mat <- do.call(rbind, stats_list)
head(stats_mat)
#>           mae     rmse      mape
#> [1,] 16.96667 3.963304  7.213142
#> [2,] 11.86154 3.094527  5.111530
#> [3,] 16.72857 3.928156  6.288358
#> [4,] 22.90667 5.262766  9.089086
#> [5,] 32.12500 7.938238 11.867540
#> [6,] 22.02353 5.020015  8.778799

Created on 2023-03-26 with reprex v2.0.2

Rui Barradas
  • 70,273
  • 8
  • 34
  • 66
  • @ Rui Barradas: Thank you so much for your answer! Just a question - what did you think of the code I provided in my question? Are the mathematical details correct? thank you so much! – stats_noob Mar 26 '23 at 08:37
  • @stats_noob In your code you compute the the forecasts from 1 to 5 steps ahead, then the metrics and then average those metrics. This doesn't make sense, errors for 1 step ahead, 2 steps ahead, etc cannot be averaged. – Rui Barradas Mar 26 '23 at 16:21
  • hi, how are things? I am revisiting this problem - I wanted to ask a question: from a methodology perspective, is what I have initially done correct? thank you so much – stats_noob Aug 18 '23 at 21:16