0

Would it make more sense to remove missing data from just the Time_of_Day column using drop_na(Time_of_Day), or to remove it from the whole data frame using na.omit()? Also, when I pipe na.omit() right after the following code and then reuse this data frame, the NA values in the Time_of_Day column reappear.

    > dput(head(ABIA))
structure(list(Year = c(2008L, 2008L, 2008L, 2008L, 2008L, 2008L
), Month = c(1L, 1L, 1L, 1L, 1L, 1L), DayofMonth = c(1L, 1L, 
1L, 1L, 1L, 1L), DayOfWeek = c(2L, 2L, 2L, 2L, 2L, 2L), DepTime = c(120L, 
555L, 600L, 601L, 601L, 636L), CRSDepTime = c(1935L, 600L, 600L, 
605L, 600L, 645L), ArrTime = c(309L, 826L, 728L, 727L, 654L, 
934L), CRSArrTime = c(2130L, 835L, 729L, 750L, 700L, 932L), UniqueCarrier = c("9E", 
"AA", "YV", "9E", "AA", "NW"), FlightNum = c(5746L, 1614L, 2883L, 
5743L, 1157L, 1674L), TailNum = c("84129E", "N438AA", "N922FJ", 
"89189E", "N4XAAA", "N967N"), ActualElapsedTime = c(109L, 151L, 
148L, 86L, 53L, 178L), CRSElapsedTime = c(115L, 155L, 149L, 105L, 
60L, 167L), AirTime = c(88L, 133L, 125L, 70L, 38L, 145L), ArrDelay = c(339L, 
-9L, -1L, -23L, -6L, 2L), DepDelay = c(345L, -5L, 0L, -4L, 1L, 
-9L), Origin = c("MEM", "AUS", "AUS", "AUS", "AUS", "AUS"), Dest = c("AUS", 
"ORD", "PHX", "MEM", "DFW", "MSP"), Distance = c(559L, 978L, 
872L, 559L, 190L, 1042L), TaxiIn = c(3L, 7L, 7L, 4L, 5L, 11L), 
    TaxiOut = c(18L, 11L, 16L, 12L, 10L, 22L), Cancelled = c(0L, 
    0L, 0L, 0L, 0L, 0L), CancellationCode = c("", "", "", "", 
    "", ""), Diverted = c(0L, 0L, 0L, 0L, 0L, 0L), CarrierDelay = c(339L, 
    NA, NA, NA, NA, NA), WeatherDelay = c(0L, NA, NA, NA, NA, 
    NA), NASDelay = c(0L, NA, NA, NA, NA, NA), SecurityDelay = c(0L, 
    NA, NA, NA, NA, NA), LateAircraftDelay = c(0L, NA, NA, NA, 
    NA, NA)), row.names = c(NA, 6L), class = "data.frame")
        
    > str(ABIA)
           'data.frame':    99260 obs. of  29 variables:
     $ Year             : int  2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
     $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
     $ DayofMonth       : int  1 1 1 1 1 1 1 1 1 1 ...
     $ DayOfWeek        : int  2 2 2 2 2 2 2 2 2 2 ...
     $ DepTime          : int  120 555 600 601 601 636 646 650 650 654 ...
     $ CRSDepTime       : int  1935 600 600 605 600 645 655 700 650 700 ...
     $ ArrTime          : int  309 826 728 727 654 934 735 841 1139 1117 ...
     $ CRSArrTime       : int  2130 835 729 750 700 932 750 857 1145 1133 ...
     $ UniqueCarrier    : chr  "9E" "AA" "YV" "9E" ...
     $ FlightNum        : int  5746 1614 2883 5743 1157 1674 340 541 1182 1060 ...
     $ TailNum          : chr  "84129E" "N438AA" "N922FJ" "89189E" ...
     $ ActualElapsedTime: int  109 151 148 86 53 178 49 111 169 203 ...
     $ CRSElapsedTime   : int  115 155 149 105 60 167 55 117 175 213 ...
     $ AirTime          : int  88 133 125 70 38 145 28 94 153 177 ...
     $ ArrDelay         : int  339 -9 -1 -23 -6 2 -15 -16 -6 -16 ...
     $ DepDelay         : int  345 -5 0 -4 1 -9 -9 -10 0 -6 ...
     $ Origin           : chr  "MEM" "AUS" "AUS" "AUS" ...
     $ Dest             : chr  "AUS" "ORD" "PHX" "MEM" ...
     $ Distance         : int  559 978 872 559 190 1042 140 650 1242 1522 ...
     $ TaxiIn           : int  3 7 7 4 5 11 6 6 4 13 ...
     $ TaxiOut          : int  18 11 16 12 10 22 15 11 12 13 ...
     $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
     $ CancellationCode : chr  "" "" "" "" ...
     $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
     $ CarrierDelay     : int  339 NA NA NA NA NA NA NA NA NA ...
     $ WeatherDelay     : int  0 NA NA NA NA NA NA NA NA NA ...
     $ NASDelay         : int  0 NA NA NA NA NA NA NA NA NA ...
     $ SecurityDelay    : int  0 NA NA NA NA NA NA NA NA NA ...
     $ LateAircraftDelay: int  0 NA NA NA NA NA NA NA NA NA ...

 # Create the time-of-day column from DepTime (integer clock time, hhmm).
 #
 # The bins are contiguous thresholds rather than `%in%` integer ranges, so
 # no value between 0 and 2459 can fall into a gap:
 #   * 0-99 is treated as "Late Night": a departure between 00:00 and 00:59
 #     is the integer 0-59 in hhmm form, which the range 2300:2459 never
 #     matched and therefore silently became NA.
 #   * Values between two ranges (e.g. 1160) are now assigned to the bin
 #     they sit in instead of becoming NA.
 # Time_of_Day is NA only when DepTime itself is missing or lies outside
 # 0-2459 (presumably cancelled flights have no DepTime -- verify against
 # the Cancelled column).
 ABIA_Time_of_Day <- ABIA %>%
   mutate(
     Time_of_Day = case_when(
       # The guard comes first so that NA and impossible values stay NA.
       is.na(DepTime) | DepTime < 0 | DepTime > 2459 ~ NA_character_,
       # Late night wraps around midnight: 23:00 onwards and before 01:00.
       DepTime < 100 | DepTime >= 2300 ~ "Late Night",
       DepTime < 600 ~ "Early Morning",
       DepTime < 1200 ~ "Morning",
       DepTime < 1700 ~ "Afternoon",
       DepTime < 2000 ~ "Evening",
       # Everything left is in 2000-2299.
       TRUE ~ "Night"
     )
   )

  

    

Here is where I reuse it

   # Group the new data frame by carrier and time of day, then summarise.
   # Produces one row per (Time_of_Day, UniqueCarrier) pair with:
   #   count         - number of flights in the group
   #   mean_ArrDelay - mean arrival delay, ignoring missing ArrDelay values
   # Rows whose Time_of_Day is NA form their own group here; remove them
   # beforehand (and assign the result) if that group is not wanted.
   group_cols <- c("Time_of_Day", "UniqueCarrier")
   ABIA_Time_Carrier <- ABIA_Time_of_Day %>%
     group_by(across(all_of(group_cols))) %>%
     summarize(
       count = n(),
       mean_ArrDelay = mean(ArrDelay, na.rm = TRUE),
       # Return an ungrouped result so later verbs are not silently applied
       # per Time_of_Day group.
       .groups = "drop"
     )
kapsha
  • 35
  • 4
  • Welcome to SO! Can you help us help you by sharing a [minimal reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) of your issue? For example, edit the question to include the output of `dput(head(ABIA))` and possibly also `str(ABIA)`. – Dan Adams Feb 10 '22 at 17:23
  • First, your code has some issues: what happens when one inputs a value like 1160? – Onyambu Feb 10 '22 at 17:26
  • `drop_na(Time_of_Day)` will remove rows that have a missing value in the `Time_of_Day` column. `na.omit(ABIA_Time_of_Day)` will drop rows that have a missing value in any column. Use whichever one is appropriate. – Gregor Thomas Feb 10 '22 at 17:33
  • As to *"when I pipe na.omit right after the following code and reuse this data frame, the NA values in the Time_of_Day reappear"*, make sure you are examining the object you assigned the `na.omit` result to with the `<-` assignment. – Gregor Thomas Feb 10 '22 at 17:35
  • You might also be interested in simplifying your code with the `cut` function. See the R FAQ [on binning data](https://stackoverflow.com/q/5570293/903061) for an example. – Gregor Thomas Feb 10 '22 at 17:36

0 Answers0