Would it make more sense to remove missing data from just the Time_of_Day column using drop_na(Time_of_Day), or to remove it from the whole data frame using na.omit()? Also, when I pipe na.omit() right after the following code and reuse this data frame, the NA values in the Time_of_Day column reappear.
> dput(head(ABIA))
structure(list(Year = c(2008L, 2008L, 2008L, 2008L, 2008L, 2008L
), Month = c(1L, 1L, 1L, 1L, 1L, 1L), DayofMonth = c(1L, 1L,
1L, 1L, 1L, 1L), DayOfWeek = c(2L, 2L, 2L, 2L, 2L, 2L), DepTime = c(120L,
555L, 600L, 601L, 601L, 636L), CRSDepTime = c(1935L, 600L, 600L,
605L, 600L, 645L), ArrTime = c(309L, 826L, 728L, 727L, 654L,
934L), CRSArrTime = c(2130L, 835L, 729L, 750L, 700L, 932L), UniqueCarrier = c("9E",
"AA", "YV", "9E", "AA", "NW"), FlightNum = c(5746L, 1614L, 2883L,
5743L, 1157L, 1674L), TailNum = c("84129E", "N438AA", "N922FJ",
"89189E", "N4XAAA", "N967N"), ActualElapsedTime = c(109L, 151L,
148L, 86L, 53L, 178L), CRSElapsedTime = c(115L, 155L, 149L, 105L,
60L, 167L), AirTime = c(88L, 133L, 125L, 70L, 38L, 145L), ArrDelay = c(339L,
-9L, -1L, -23L, -6L, 2L), DepDelay = c(345L, -5L, 0L, -4L, 1L,
-9L), Origin = c("MEM", "AUS", "AUS", "AUS", "AUS", "AUS"), Dest = c("AUS",
"ORD", "PHX", "MEM", "DFW", "MSP"), Distance = c(559L, 978L,
872L, 559L, 190L, 1042L), TaxiIn = c(3L, 7L, 7L, 4L, 5L, 11L),
TaxiOut = c(18L, 11L, 16L, 12L, 10L, 22L), Cancelled = c(0L,
0L, 0L, 0L, 0L, 0L), CancellationCode = c("", "", "", "",
"", ""), Diverted = c(0L, 0L, 0L, 0L, 0L, 0L), CarrierDelay = c(339L,
NA, NA, NA, NA, NA), WeatherDelay = c(0L, NA, NA, NA, NA,
NA), NASDelay = c(0L, NA, NA, NA, NA, NA), SecurityDelay = c(0L,
NA, NA, NA, NA, NA), LateAircraftDelay = c(0L, NA, NA, NA,
NA, NA)), row.names = c(NA, 6L), class = "data.frame")
> str(ABIA)
'data.frame': 99260 obs. of 29 variables:
$ Year : int 2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
$ Month : int 1 1 1 1 1 1 1 1 1 1 ...
$ DayofMonth : int 1 1 1 1 1 1 1 1 1 1 ...
$ DayOfWeek : int 2 2 2 2 2 2 2 2 2 2 ...
$ DepTime : int 120 555 600 601 601 636 646 650 650 654 ...
$ CRSDepTime : int 1935 600 600 605 600 645 655 700 650 700 ...
$ ArrTime : int 309 826 728 727 654 934 735 841 1139 1117 ...
$ CRSArrTime : int 2130 835 729 750 700 932 750 857 1145 1133 ...
$ UniqueCarrier : chr "9E" "AA" "YV" "9E" ...
$ FlightNum : int 5746 1614 2883 5743 1157 1674 340 541 1182 1060 ...
$ TailNum : chr "84129E" "N438AA" "N922FJ" "89189E" ...
$ ActualElapsedTime: int 109 151 148 86 53 178 49 111 169 203 ...
$ CRSElapsedTime : int 115 155 149 105 60 167 55 117 175 213 ...
$ AirTime : int 88 133 125 70 38 145 28 94 153 177 ...
$ ArrDelay : int 339 -9 -1 -23 -6 2 -15 -16 -6 -16 ...
$ DepDelay : int 345 -5 0 -4 1 -9 -9 -10 0 -6 ...
$ Origin : chr "MEM" "AUS" "AUS" "AUS" ...
$ Dest : chr "AUS" "ORD" "PHX" "MEM" ...
$ Distance : int 559 978 872 559 190 1042 140 650 1242 1522 ...
$ TaxiIn : int 3 7 7 4 5 11 6 6 4 13 ...
$ TaxiOut : int 18 11 16 12 10 22 15 11 12 13 ...
$ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
$ CancellationCode : chr "" "" "" "" ...
$ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
$ CarrierDelay : int 339 NA NA NA NA NA NA NA NA NA ...
$ WeatherDelay : int 0 NA NA NA NA NA NA NA NA NA ...
$ NASDelay : int 0 NA NA NA NA NA NA NA NA NA ...
$ SecurityDelay : int 0 NA NA NA NA NA NA NA NA NA ...
$ LateAircraftDelay: int 0 NA NA NA NA NA NA NA NA NA ...
# Create the Time_of_Day column by bucketing DepTime.
#
# DepTime is treated as a clock time encoded as an hhmm integer (e.g. 555 is
# 05:55) -- NOTE(review): confirm against the data dictionary. Under that
# encoding, departures just after midnight are stored as 0-59 (00:05 is 5),
# not as 2400-2459, so 0:59 is listed explicitly under "Late Night"; without
# it those flights fall through to NA alongside the truly missing DepTime
# values. The 2300:2459 range is kept so every previously labelled value
# keeps its label.
#
# Rows whose DepTime is NA or outside every range get Time_of_Day = NA.
# To discard only those rows, append `%>% tidyr::drop_na(Time_of_Day)` to this
# pipeline (keeping the assignment so the result is stored). na.omit() would
# instead drop every row with an NA in *any* column, e.g. the *Delay columns.
ABIA_Time_of_Day <- ABIA %>%
  mutate(
    # case_when() evaluates the conditions in order; the first match wins.
    Time_of_Day = case_when(
      DepTime %in% 100:559 ~ "Early Morning",
      DepTime %in% 600:1159 ~ "Morning",
      DepTime %in% 1200:1659 ~ "Afternoon",
      DepTime %in% 1700:1959 ~ "Evening",
      DepTime %in% 2000:2259 ~ "Night",
      DepTime %in% c(0:59, 2300:2459) ~ "Late Night",
      TRUE ~ NA_character_
    )
  )
Here is where I reuse it:
#group new df by Carrier and Time of Day
group_cols <- c("Time_of_Day", "UniqueCarrier")
ABIA_Time_Carrier <- ABIA_Time_of_Day %>%
group_by(across(all_of(group_cols))) %>%
summarize(count = n(),
mean_ArrDelay = mean(ArrDelay, na.rm = TRUE)