0

I am trying to run a random forest model on my MacBook Air M2 (base model, 128GB storage, 8GB RAM).

This is the code I was trying to execute.

# TEST
library(dplyr) # for data manipulation
library(caret) # for machine learning 

# Select relevant columns for analysis.
# DepDelay is included because the dataset already records the departure
# delay for every flight, so it does not have to be re-derived.
selected_cols <- c(
  "Year", "Month", "DayofMonth", "DayOfWeek", "DepTime", "CRSDepTime",
  "ArrTime", "CRSArrTime", "UniqueCarrier", "FlightNum", "Origin", "Dest",
  "Distance", "Cancelled", "CancellationCode", "DepDelay"
)

# all_of() makes it explicit that selected_cols is an external character
# vector of column names (and errors clearly if a column is missing).
flight_data <- Flight_Details_2003 %>%
  select(all_of(selected_cols))

# Keep only flights that were not cancelled and did not leave early.
# DepTime and CRSDepTime are clock times stored as HHMM integers, so
# DepTime - CRSDepTime is not a duration: 1724 - 1655 gives 69 for a flight
# whose recorded DepDelay is 29. The recorded DepDelay column is used instead.
# filter() also drops rows where DepDelay is NA.
flight_data <- flight_data %>%
  filter(Cancelled == 0, DepDelay >= 0)

# Create a new data frame with the total departure delay by airport and date.
# .groups = "drop" returns an ungrouped result so later steps do not carry
# leftover grouping on Year/Month/DayofMonth.
airport_data <- flight_data %>%
  group_by(Year, Month, DayofMonth, Origin) %>%
  summarize(TotalDelay = sum(DepDelay), .groups = "drop")

# Build the delay-by-airport-by-date matrix and its binary label in one pass.
# The self-merge on the calendar date pairs every airport's daily total
# (Origin.x / TotalDelay.x) with every airport's total for the same day
# (Origin.y / TotalDelay.y).
delay_matrix <- merge(
  x = airport_data,
  y = airport_data,
  by = c("Year", "Month", "DayofMonth")
) %>%
  # Flag pairs where the first airport accumulated delay and the second had none.
  # NOTE(review): the stated goal is "a delay occurred in the destination
  # airport due to a delay in the origin airport"; confirm that
  # TotalDelay.y == 0 (rather than > 0) is the intended condition.
  mutate(CascadingDelay = ifelse(TotalDelay.x > 0 & TotalDelay.y == 0, 1, 0)) %>%
  # The raw totals are only needed to derive the label, so drop them.
  select(-c("TotalDelay.x", "TotalDelay.y")) %>%
  # Encode the 0/1 flag as a two-level factor for classification.
  mutate(
    CascadingDelay = factor(CascadingDelay, levels = c(0, 1), labels = c("No", "Yes"))
  )

# Split data into training and testing sets (stratified on the label)
set.seed(123)
train_index <- createDataPartition(delay_matrix$CascadingDelay, p = 0.7, list = FALSE)
train_data <- delay_matrix[train_index, ]
test_data <- delay_matrix[-train_index, ]

# Randomly sample a smaller portion of the training data to avoid the
# "vector memory exhausted (limit reached?)" error.
# The size is capped at the number of available rows so sample() cannot fail
# when train_data has fewer than 10000 rows.
set.seed(123)
sample_size <- min(10000, nrow(train_data))
train_data_sample <- train_data[sample(seq_len(nrow(train_data)), size = sample_size), ]

# Train a random forest with 10-fold cross-validation.
# The model must be fit on train_data_sample: passing the full train_data
# here ignores the subsample above and is what exhausts memory.
model <- train(
  CascadingDelay ~ .,
  data = train_data_sample,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)

# Check model accuracy on the held-out data.
# confusionMatrix() needs predicted classes versus true classes; given the
# fitted model itself it would not evaluate the test set.
# NOTE(review): test_data may be very large; if prediction also runs out of
# memory, evaluate on a random subset of test_data instead.
test_predictions <- predict(model, newdata = test_data)
confusionMatrix(test_predictions, test_data$CascadingDelay)

This is what my dataframe consists of:

> str(Flight_Details_2003)
'data.frame':   6488540 obs. of  29 variables:
 $ Year             : int  2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
 $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
 $ DayofMonth       : int  29 30 31 1 2 3 4 5 6 1 ...
 $ DayOfWeek        : int  3 4 5 3 4 5 6 7 1 3 ...
 $ DepTime          : int  1651 1654 1724 1033 1053 1031 1031 1035 1031 1713 ...
 $ CRSDepTime       : int  1655 1655 1655 1035 1035 1035 1035 1035 1035 1710 ...
 $ ArrTime          : int  1912 1910 1936 1625 1726 1640 1626 1636 1653 1851 ...
 $ CRSArrTime       : int  1913 1913 1913 1634 1634 1634 1634 1634 1634 1847 ...
 $ UniqueCarrier    : chr  "UA" "UA" "UA" "UA" ...
 $ FlightNum        : int  1017 1017 1017 1018 1018 1018 1018 1018 1018 1020 ...
 $ TailNum          : chr  "N202UA" "N311UA" "N317UA" "N409UA" ...
 $ ActualElapsedTime: int  141 136 132 232 273 249 235 241 262 98 ...
 $ CRSElapsedTime   : int  138 138 138 239 239 239 239 239 239 97 ...
 $ AirTime          : int  119 108 110 215 214 223 219 227 241 62 ...
 $ ArrDelay         : int  -1 -3 23 -9 52 6 -8 2 19 4 ...
 $ DepDelay         : int  -4 -1 29 -2 18 -4 -4 0 -4 3 ...
 $ Origin           : chr  "ORD" "ORD" "ORD" "OAK" ...
 $ Dest             : chr  "MSY" "MSY" "MSY" "ORD" ...
 $ Distance         : int  837 837 837 1835 1835 1835 1835 1835 1835 413 ...
 $ TaxiIn           : int  5 2 5 6 13 13 5 5 7 7 ...
 $ TaxiOut          : int  17 26 17 11 46 13 11 9 14 29 ...
 $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ CancellationCode : chr  NA NA NA NA ...
 $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ CarrierDelay     : int  NA NA NA NA NA NA NA NA NA NA ...
 $ WeatherDelay     : int  NA NA NA NA NA NA NA NA NA NA ...
 $ NASDelay         : int  NA NA NA NA NA NA NA NA NA NA ...
 $ SecurityDelay    : int  NA NA NA NA NA NA NA NA NA NA ...
 $ LateAircraftDelay: int  NA NA NA NA NA NA NA NA NA NA ...

I run into the error at the following call:

model <- train(CascadingDelay ~ ., data = train_data, method = "rf", trControl = trainControl(method = "cv", number = 10))
Error: vector memory exhausted (limit reached?)

Joseph Ng
  • 13
  • 3
  • I have tried adding: options(java.parameters = "-Xmx8000m") rsession-memory-limit-mb=8000 rsession-memory-init-mb=8000 and many other methods, but all to no avail. I even shrank the sample size to 10000 – Joseph Ng Mar 04 '23 at 07:38
  • give it a try on Rstudio Cloud https://posit.cloud/ – Wael Mar 04 '23 at 11:59

0 Answers0