I am trying to fit a random forest model on my MacBook Air M2 (base model, 128GB, 8GB RAM).
This is the code I am trying to execute:
# TEST
library(dplyr) # for data manipulation
library(caret) # for machine learning
# Select the columns needed for the analysis. DepDelay is taken from the
# source data rather than recomputed: DepTime and CRSDepTime are clock times
# encoded as HHMM integers, so subtracting them does not yield minutes
# (e.g. 1724 - 1655 = 69, while 16:55 -> 17:24 is a 29-minute delay).
selected_cols <- c(
  "Year", "Month", "DayofMonth", "DayOfWeek",
  "DepTime", "CRSDepTime", "ArrTime", "CRSArrTime",
  "UniqueCarrier", "FlightNum", "Origin", "Dest", "Distance",
  "Cancelled", "CancellationCode", "DepDelay"
)
# all_of() marks selected_cols as an external character vector and fails with
# a clear error if any listed column is missing from the data.
flight_data <- Flight_Details_2003 %>%
  select(all_of(selected_cols)) %>%
  # Keep flights that were not cancelled and did not depart early. filter()
  # also drops rows where the condition evaluates to NA (missing DepDelay).
  filter(Cancelled == 0, DepDelay >= 0)
# Create a new data frame with the total departure delay per origin airport
# and date.
airport_data <- flight_data %>%
  group_by(Year, Month, DayofMonth, Origin) %>%
  # .groups = "drop" returns an ungrouped result, so later steps do not
  # silently operate per (Year, Month, DayofMonth) group.
  summarize(TotalDelay = sum(DepDelay), .groups = "drop")
# Merge the delay data with itself to create a delay-by-airport-by-date matrix.
# NOTE: joining only on the date pairs every airport with every airport
# (including itself) for each day, so the row count grows with the square of
# the number of airports per day. This is the main driver of the table's size.
delay_matrix <- merge(airport_data, airport_data, by = c("Year", "Month", "DayofMonth"))
# Build the outcome: "Yes" when the first airport (.x) had a positive total
# delay on that date and the second airport (.y) had none, otherwise "No".
# NOTE(review): this flags pairs where the second airport was NOT delayed;
# confirm that this matches the intended definition of a cascading delay.
# The helper delay totals are dropped afterwards so they are not used as
# predictors.
delay_matrix <- delay_matrix %>%
  mutate(
    CascadingDelay = factor(
      ifelse(TotalDelay.x > 0 & TotalDelay.y == 0, "Yes", "No"),
      levels = c("No", "Yes")
    )
  ) %>%
  select(-all_of(c("TotalDelay.x", "TotalDelay.y")))
# Split data into training (70%) and testing (30%) sets.
set.seed(123)
train_index <- createDataPartition(delay_matrix$CascadingDelay, p = 0.7, list = FALSE)
train_data <- delay_matrix[train_index, ]
test_data <- delay_matrix[-train_index, ]
# Randomly sample a smaller portion of the training data to work around the
# error "vector memory exhausted (limit reached?)".
set.seed(123)
# Cap the sample size at the number of available rows: sample() without
# replacement errors when asked for more rows than exist.
sample_size <- min(10000, nrow(train_data))
train_data_sample <- train_data[sample(seq_len(nrow(train_data)), size = sample_size), ]
# Train a random forest with 10-fold cross-validation on the down-sampled
# training set. Fitting on the full train_data is what triggers
# "vector memory exhausted", so the sample created above must be used here.
cv_control <- trainControl(method = "cv", number = 10)
model <- train(
  CascadingDelay ~ .,
  data = train_data_sample,
  method = "rf",
  trControl = cv_control
)
# Check model accuracy on the held-out data: confusionMatrix() compares
# predicted classes against the true classes, so predictions have to be
# generated from the fitted model first.
# NOTE(review): test_data is not down-sampled; if predict() runs into the same
# memory limit, evaluate on a random subset of test_data instead.
test_predictions <- predict(model, newdata = test_data)
confusionMatrix(data = test_predictions, reference = test_data$CascadingDelay)
This is what my dataframe consists of:
> str(Flight_Details_2003)
'data.frame': 6488540 obs. of 29 variables:
$ Year : int 2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
$ Month : int 1 1 1 1 1 1 1 1 1 1 ...
$ DayofMonth : int 29 30 31 1 2 3 4 5 6 1 ...
$ DayOfWeek : int 3 4 5 3 4 5 6 7 1 3 ...
$ DepTime : int 1651 1654 1724 1033 1053 1031 1031 1035 1031 1713 ...
$ CRSDepTime : int 1655 1655 1655 1035 1035 1035 1035 1035 1035 1710 ...
$ ArrTime : int 1912 1910 1936 1625 1726 1640 1626 1636 1653 1851 ...
$ CRSArrTime : int 1913 1913 1913 1634 1634 1634 1634 1634 1634 1847 ...
$ UniqueCarrier : chr "UA" "UA" "UA" "UA" ...
$ FlightNum : int 1017 1017 1017 1018 1018 1018 1018 1018 1018 1020 ...
$ TailNum : chr "N202UA" "N311UA" "N317UA" "N409UA" ...
$ ActualElapsedTime: int 141 136 132 232 273 249 235 241 262 98 ...
$ CRSElapsedTime : int 138 138 138 239 239 239 239 239 239 97 ...
$ AirTime : int 119 108 110 215 214 223 219 227 241 62 ...
$ ArrDelay : int -1 -3 23 -9 52 6 -8 2 19 4 ...
$ DepDelay : int -4 -1 29 -2 18 -4 -4 0 -4 3 ...
$ Origin : chr "ORD" "ORD" "ORD" "OAK" ...
$ Dest : chr "MSY" "MSY" "MSY" "ORD" ...
$ Distance : int 837 837 837 1835 1835 1835 1835 1835 1835 413 ...
$ TaxiIn : int 5 2 5 6 13 13 5 5 7 7 ...
$ TaxiOut : int 17 26 17 11 46 13 11 9 14 29 ...
$ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
$ CancellationCode : chr NA NA NA NA ...
$ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
$ CarrierDelay : int NA NA NA NA NA NA NA NA NA NA ...
$ WeatherDelay : int NA NA NA NA NA NA NA NA NA NA ...
$ NASDelay : int NA NA NA NA NA NA NA NA NA NA ...
$ SecurityDelay : int NA NA NA NA NA NA NA NA NA NA ...
$ LateAircraftDelay: int NA NA NA NA NA NA NA NA NA NA ...
I run into the error at this line:
model <- train(CascadingDelay ~ ., data = train_data, method = "rf", trControl = trainControl(method = "cv", number = 10))
Error: vector memory exhausted (limit reached?)