2

I need to fill the null values of all the numerical columns with each column's median value in a data frame. I did the following code.

median_forNumericalNulls <- function(dataframe){
  
  nums <- unlist(lapply(dataframe, is.numeric))  
  
  df_num <- dataframe[ , nums]
  
  df_num[] <- lapply(df_num, function(x) { 
    x[is.na(x)] <- median(x, na.rm = TRUE)
    x
  })      

  return(dataframe)
  
}

median_forNumericalNulls(A)

A is the parent table, which consists of both numerical as well as categorical variables. How can I replace the columns of 'A' dataframe with the output of the function median_forNumericalNulls?

Is there a better way that we can do the same?

user3789200
  • 1,166
  • 2
  • 25
  • 45

3 Answers3

2

May be we need to change the function to directly subset the columns and updating the columns, instead of creating another object and then updating

median_forNumericalNulls <- function(dataframe){
  
  nums <- unlist(lapply(dataframe, is.numeric))  
  
  df_num <- dataframe[ , nums]
  
  dataframe[nums] <- lapply(dataframe[nums], function(x) { 
    x[is.na(x)] <- median(x, na.rm = TRUE)
    x
  })      
  dataframe
  
}

-testing

A <- median_forNumericalNulls(A)

Also, this can be done in a compact way with na.aggregate though

library(zoo)
A <- na.aggregate(A, FUN = median)

Or using tidyverse

library(dplyr)
A <- A %>%
   mutate(across(where(is.numeric), 
         ~ replace(., is.na(.), median(., na.rm = TRUE))))
akrun
  • 874,273
  • 37
  • 540
  • 662
  • @akrun can you look at this question: https://stackoverflow.com/questions/69404370/how-to-fix-a-problem-with-legends-margins-in-plotbase it was rewritten from another one and can't be seen in newest. thx. – manro Oct 17 '21 at 21:39
  • @manro I tested that, but couldn't reproduce your second plot though – akrun Oct 17 '21 at 21:43
2

Here is another approach how you could do it: Example:

librara(dplyr)
iris1 <- iris %>% 
  select(1, 2, 5)
head(iris1, 10) %>% 
  as_tibble() %>% 
  mutate(across(where(is.numeric), ~ifelse(.<= 3, NA, .))) %>% 
  mutate(across(where(is.numeric), ~ifelse(is.na(.), median(.,na.rm = TRUE), .)))
   Sepal.Length Sepal.Width Species
          <dbl>       <dbl> <fct>  
 1          5.1         3.5 setosa 
 2          4.9         3.4 setosa 
 3          4.7         3.2 setosa 
 4          4.6         3.1 setosa 
 5          5           3.6 setosa 
 6          5.4         3.9 setosa 
 7          4.6         3.4 setosa 
 8          5           3.4 setosa 
 9          4.4         3.4 setosa 
10          4.9         3.1 setosa 
TarJae
  • 72,363
  • 6
  • 19
  • 66
1

Base R solution:

# Function to deteremine data.frame vector indices that are numeric:
# resolve_num_cols => function() 
resolve_num_cols <- function(df){
  # Store a vector of numeric column names: 
  # num_cols => logical vector
  num_cols <- which(
    vapply(
      df, 
      is.numeric, 
      logical(1),
      USE.NAMES = FALSE
    )
  )
  # Explicitly define the returned object: logical vector => env
  return(num_cols)
}

# Function to impute median values for each numeric vector in data.frame
# impute_median_vals_in_df => function()
impute_median_vals_in_df <- function(df, num_col_idx){
  # Replace the na. values in each numeric vector: df => data.frame 
  df[,num_col_idx] <- lapply(
    num_col_idx, 
    function(col_idx){
      df[,col_idx] <- ifelse(
        is.na(df[,col_idx]),
        median(df[,col_idx], na.rm = TRUE),
        df[,col_idx]
      )
    }
  )
  # Return the data.frame object: data.frame => env
  return(df)
}

# Apply the function to resolve the numeric vectors in data.frame: 
# num_cols => integer vector
num_cols <- resolve_num_cols(df1)

# Apply the function: clean_df => data.frame
clean_df <- impute_median_vals_in_df(df1, num_cols)

Data used:

# Import data: df1 => data.frame
df1 <- structure(list(mpg = c(NA, 21, NA, 21.4, 18.7, 18.1, 14.3, 24.4, 
22.8, 19.2, NA, 16.4, 17.3, NA, NA, 10.4, 14.7, 32.4, 30.4, 33.9, 
21.5, NA, 15.2, 13.3, 19.2, 27.3, 26, NA, NA, 19.7, 15, 21.4), 
cyl = c(NA, 6, NA, 6, 8, NA, NA, 4, 4, 6, 6, 8, 8, 8, 8, 
8, 8, NA, NA, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, NA, NA, 4), disp = c(160, 
160, 108, 258, 360, 225, 360, 146.7, 140.8, 167.6, 167.6, 
NA, NA, 275.8, NA, 460, 440, 78.7, 75.7, 71.1, NA, 318, NA, 
350, 400, NA, NA, 95.1, 351, NA, 301, 121), hp = c(110, 110, 
93, 110, NA, 105, 245, 62, 95, 123, 123, 180, NA, 180, 205, 
215, NA, 66, NA, 65, 97, NA, NA, 245, 175, 66, 91, NA, 264, 
175, NA, 109), drat = c(3.9, 3.9, 3.85, 3.08, 3.15, 2.76, 
3.21, NA, 3.92, 3.92, 3.92, 3.07, NA, 3.07, 2.93, 3, 3.23, 
4.08, NA, 4.22, NA, 2.76, 3.15, 3.73, 3.08, NA, NA, 3.77, 
4.22, 3.62, NA, NA), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 
3.46, 3.57, NA, 3.15, 3.44, 3.44, 4.07, NA, NA, 5.25, 5.424, 
5.345, 2.2, 1.615, 1.835, NA, NA, 3.435, 3.84, NA, NA, NA, 
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61, 
19.44, NA, NA, NA, 20, NA, 18.3, 18.9, 17.4, 17.6, 18, NA, 
17.82, NA, 19.47, 18.52, 19.9, NA, 16.87, NA, 15.41, 17.05, 
18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6), vs = c(0, NA, 
1, 1, NA, NA, 0, NA, 1, NA, 1, 0, 0, 0, 0, 0, 0, 1, NA, 1, 
1, 0, 0, 0, NA, 1, NA, 1, 0, 0, 0, 1), am = c(NA, NA, NA, 
0, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, NA, 
0, 0, 0, NA, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3, NA, 
3, 3, NA, NA, 4, 4, 3, 3, 3, 3, 3, NA, 4, 4, 4, 3, 3, 3, 
3, 3, 4, 5, NA, NA, NA, NA, 4), carb = c(4, 4, 1, 1, 2, 1, 
4, NA, NA, 4, NA, 3, NA, 3, NA, 4, 4, 1, 2, 1, 1, 2, 2, 4, 
NA, NA, 2, 2, NA, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag", 
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", 
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", 
"Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", 
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", 
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", 
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", 
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", 
"Volvo 142E"), class = "data.frame")
hello_friend
  • 5,682
  • 1
  • 11
  • 15