0

I have a dataframe called df:

# Example customer data: nine rows and seven columns.
df <- data.frame(
  id = as.numeric(1:9),
  customer = c(
    "Alice", "Bob", "Carlos", "Chuck", "Craig",
    "Heidi", "Judy", "Rupert", "Wendy"
  ),
  Balance = c(100, 75, 56, 172, 450, 777, 1001, 25, 968),
  Hour = c(1, 23, 4, 5, 6, 12, 14, 17, 17),
  InDebt = c(1, 1, 1, 1, 0, 0, 0, 1, 1),
  DueDay = c("Mon", "Tue", "Wed", "Fri", "Sun", "Sat", "Thu", "Mon", "Wed"),
  AppBooked = c(1, 1, 1, 0, 0, 1, 0, 1, 1)
)

I want to convert certain columns (in this instance Hour, InDebt, AppBooked) to factors and at present have been using the following method:

# Convert each of the three columns to a factor, one assignment per column.
df[["Hour"]] <- as.factor(df[["Hour"]])
df[["InDebt"]] <- as.factor(df[["InDebt"]])
df[["AppBooked"]] <- as.factor(df[["AppBooked"]])

Although this works, I wanted to see if there was a quicker way of doing this (particularly if I have to do it for 100 different column names). I'm basically interested in converting a specific set of numeric variables into factors (but not all numeric variables). I've been trying to get this to work using the lapply function and dplyr, storing my column names in a vector, however, I'm having difficulty getting it to work:

# Names of the columns that should become factors.
colsasfactors <- (c("Hour", "InDebt", "AppBooked"))

# NOTE(review): this attempt does not work as written. The pipe passes df as
# lapply()'s first argument, so the logical vector produced by %in% lands in
# the FUN position and `factor` is passed along as an extra argument. The name
# `colsasfactor` also differs from `colsasfactors` defined above.
df <- df %>%
        lapply(colnames(df) %in% colsasfactor, factor)

Clearly, I'm missing something (my suspicion being that I'm using %in% incorrectly).

JGW
  • 314
  • 4
  • 18

5 Answers

2

Also, you can use across() with mutate(), selecting each variable by its index position. If you have 100 variables, it can be easier to use index numbers than names:

library(dplyr)
# Data
# stringsAsFactors = FALSE keeps customer and DueDay as character columns.
# Spelled out as FALSE because F is an ordinary variable that can be reassigned.
df <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
  customer = c(
    "Alice", "Bob", "Carlos", "Chuck", "Craig",
    "Heidi", "Judy", "Rupert", "Wendy"
  ),
  Balance = c(100, 75, 56, 172, 450, 777, 1001, 25, 968),
  Hour = c(1, 23, 4, 5, 6, 12, 14, 17, 17),
  InDebt = c(1, 1, 1, 1, 0, 0, 0, 1, 1),
  DueDay = c("Mon", "Tue", "Wed", "Fri", "Sun", "Sat", "Thu", "Mon", "Wed"),
  AppBooked = c(1, 1, 1, 0, 0, 1, 0, 1, 1),
  stringsAsFactors = FALSE
)
# Code
# Positions 4, 5 and 7 are the Hour, InDebt and AppBooked columns.
df <- mutate(df, across(c(4, 5, 7), factor))

Output:

'data.frame':   9 obs. of  7 variables:
 $ id       : num  1 2 3 4 5 6 7 8 9
 $ customer : chr  "Alice" "Bob" "Carlos" "Chuck" ...
 $ Balance  : num  100 75 56 172 450 ...
 $ Hour     : Factor w/ 8 levels "1","4","5","6",..: 1 8 2 3 4 5 6 7 7
 $ InDebt   : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 1 2 2
 $ DueDay   : chr  "Mon" "Tue" "Wed" "Fri" ...
 $ AppBooked: Factor w/ 2 levels "0","1": 2 2 2 1 1 2 1 2 2

Or using your variable vector colsasfactors:

# Code 2
# all_of() marks colsasfactors as an external character vector of column
# names (rather than a column of df) and errors if any name is missing.
df <- df %>% mutate(across(all_of(colsasfactors), factor))

Output:

'data.frame':   9 obs. of  7 variables:
 $ id       : num  1 2 3 4 5 6 7 8 9
 $ customer : chr  "Alice" "Bob" "Carlos" "Chuck" ...
 $ Balance  : num  100 75 56 172 450 ...
 $ Hour     : Factor w/ 8 levels "1","4","5","6",..: 1 8 2 3 4 5 6 7 7
 $ InDebt   : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 1 2 2
 $ DueDay   : chr  "Mon" "Tue" "Wed" "Fri" ...
 $ AppBooked: Factor w/ 2 levels "0","1": 2 2 2 1 1 2 1 2 2
Duck
  • 39,058
  • 13
  • 42
  • 84
1

You can use mutate_at:

# Scoped-verb form: apply factor() to the three columns listed in vars().
df <- mutate_at(df, vars(Hour, InDebt, AppBooked), factor)
YOLO
  • 20,181
  • 5
  • 20
  • 40
1

You can use the newer across() function with mutate

# Convert the three named columns to factors in a single mutate() call.
df <- mutate(df, across(c(Hour, InDebt, AppBooked), as.factor))

You can also use mutate_at, but it has now been superseded in dplyr by across().

al_the_man
  • 295
  • 4
  • 6
0

You could use the data.table package to do that:

# Example data.
df <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
  customer = c(
    "Alice", "Bob", "Carlos", "Chuck", "Craig",
    "Heidi", "Judy", "Rupert", "Wendy"
  ),
  Balance = c(100, 75, 56, 172, 450, 777, 1001, 25, 968),
  Hour = c(1, 23, 4, 5, 6, 12, 14, 17, 17),
  InDebt = c(1, 1, 1, 1, 0, 0, 0, 1, 1),
  DueDay = c(
    "Mon", "Tue", "Wed", "Fri", "Sun",
    "Sat", "Thu", "Mon", "Wed"
  ),
  AppBooked = c(1, 1, 1, 0, 0, 1, 0, 1, 1)
)

# library() stops with an error if data.table is not installed, whereas
# require() only returns FALSE and lets the script fail later on.
library(data.table)
setDT(df) # convert df to a data.table by reference

# Columns to convert.
cols <- c("Hour", "InDebt", "AppBooked")
# Replace the columns named in cols with their factor versions, in place.
# .SDcols restricts .SD to those columns; (cols) makes := use the names
# stored in cols rather than creating a column called "cols".
df[, (cols) := lapply(.SD, as.factor), .SDcols = cols]

sapply(df, class) # check column classes
andschar
  • 3,504
  • 2
  • 27
  • 35
0

Another way you can try

# List-style indexing (df[cols]) always returns a data frame, so this also
# works when colsasfactors holds a single name; df[, cols] would drop a single
# column to a plain vector and lapply() would then iterate over its elements.
df[colsasfactors] <- lapply(df[colsasfactors], as.factor)
str(df)
# 'data.frame': 9 obs. of  7 variables:
#  $ id       : num  1 2 3 4 5 6 7 8 9
#  $ customer : chr  "Alice" "Bob" "Carlos" "Chuck" ...
#  $ Balance  : num  100 75 56 172 450 ...
#  $ Hour     : Factor w/ 8 levels "1","4","5","6",..: 1 8 2 3 4 5 6 7 7
#  $ InDebt   : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 1 2 2
#  $ DueDay   : chr  "Mon" "Tue" "Wed" "Fri" ...
#  $ AppBooked: Factor w/ 2 levels "0","1": 2 2 2 1 1 2 1 2 2
Tho Vu
  • 1,304
  • 2
  • 8
  • 20