In R, the problem with the model.matrix approach proposed by @Maurits Evers is that, except for the first factor, the function drops the first level of each factor. Sometimes this is what you want, but sometimes it is not (depending on the problem, as pointed out by @Maurits Evers).
Several functions scattered across different packages can do this (e.g. in the caret package;
see here for several examples).
I use the following function, inspired by this Stack Overflow answer by @Jaap:
#'
#' Transform factors from a data.frame into dummy variables (one-hot encoding)
#'
#' Every unordered factor is expanded into 0/1 indicator columns, one per
#' level (unlike contrast matrices, which drop the first level). Special cases:
#' \itemize{
#'   \item a factor with exactly two levels yields a single column (0/1 on the
#'     second level);
#'   \item a factor with a single level yields a single column;
#'   \item ordered factors and logicals are converted with `as.numeric()`;
#'   \item every other column (numeric, character, ...) is returned untouched.
#' }
#' Indicator columns are named `<column>.<level>`, subject to the usual
#' `data.frame` name sanitising. A missing value in a factor gives `NA` in
#' each of its indicator columns.
#'
#' @param df A data.frame whose columns are to be encoded.
#' @return A data.frame with the same number of rows as `df`, in which the
#'   unordered factors are replaced by their indicator columns.
#'
make_dummies <- function(df) {
  # Encode one column: indicator matrix for unordered factors, numeric codes
  # for ordered factors and logicals, anything else passed through as is.
  encode_column <- function(x) {
    if (is.factor(x) && !is.ordered(x)) {
      lev <- levels(x)
      # n x length(lev) integer matrix; cell (i, j) is 1 when x[i] == lev[j]
      indicators <- outer(x, lev, function(value, level) 1L * (value == level))
      colnames(indicators) <- lev
      if (length(lev) == 2L) {
        # Two levels carry one bit of information: keep the second level only.
        indicators <- indicators[, -1L, drop = FALSE]
      }
      return(indicators)
    }
    if (is.ordered(x) || is.logical(x)) {
      return(as.numeric(x))
    }
    x
  }

  encoded <- lapply(df, encode_column)

  # data.frame() prefixes the variable name only for multi-column matrices;
  # a one-column matrix would end up named after its level alone, so the
  # "<column>." prefix is added by hand in that case.
  for (i in seq_along(encoded)) {
    if (is.matrix(encoded[[i]]) && ncol(encoded[[i]]) == 1L) {
      colnames(encoded[[i]]) <- paste0(
        names(encoded)[i], ".", colnames(encoded[[i]])
      )
    }
  }

  # stringsAsFactors = FALSE keeps text columns untouched on R < 4.0 as well.
  as.data.frame(encoded, stringsAsFactors = FALSE)
}
Example:
# Example data: a numeric column, unordered factors with 2, 3, 4 and 1
# level(s), an ordered factor and a logical. The shorter vectors are recycled
# by data.frame() to the 12 rows of `num`.
df <- data.frame(
  num   = round(rnorm(12), digits = 1),
  sex   = factor(c("Male", "Female")),
  color = factor(c("black", "red", "yellow")),
  fac2  = factor(1:4),
  fac3  = factor("A"),
  size  = ordered(
    c("small", "middle", "big"),
    levels = c("small", "middle", "big")
  ),
  logi  = c(TRUE, FALSE)
)
print(df)
#> num sex color fac2 fac3 size logi
#> 1 0.0 Male black 1 A small TRUE
#> 2 -1.0 Female red 2 A middle FALSE
#> 3 1.3 Male yellow 3 A big TRUE
#> 4 1.4 Female black 4 A small FALSE
#> 5 -0.9 Male red 1 A middle TRUE
#> 6 0.1 Female yellow 2 A big FALSE
#> 7 1.4 Male black 3 A small TRUE
#> 8 0.1 Female red 4 A middle FALSE
#> 9 1.6 Male yellow 1 A big TRUE
#> 10 1.1 Female black 2 A small FALSE
#> 11 0.2 Male red 3 A middle TRUE
#> 12 0.3 Female yellow 4 A big FALSE
# One-hot encode the example data frame; the printed result follows.
make_dummies(df)
#> num sex.Male color.black color.red color.yellow fac2.1 fac2.2 fac2.3
#> 1 0.0 1 1 0 0 1 0 0
#> 2 -1.0 0 0 1 0 0 1 0
#> 3 1.3 1 0 0 1 0 0 1
#> 4 1.4 0 1 0 0 0 0 0
#> 5 -0.9 1 0 1 0 1 0 0
#> 6 0.1 0 0 0 1 0 1 0
#> 7 1.4 1 1 0 0 0 0 1
#> 8 0.1 0 0 1 0 0 0 0
#> 9 1.6 1 0 0 1 1 0 0
#> 10 1.1 0 1 0 0 0 1 0
#> 11 0.2 1 0 1 0 0 0 1
#> 12 0.3 0 0 0 1 0 0 0
#> fac2.4 fac3.A size logi
#> 1 0 1 1 1
#> 2 0 1 2 0
#> 3 0 1 3 1
#> 4 1 1 1 0
#> 5 0 1 2 1
#> 6 0 1 3 0
#> 7 0 1 1 1
#> 8 1 1 2 0
#> 9 0 1 3 1
#> 10 0 1 1 0
#> 11 0 1 2 1
#> 12 1 1 3 0
Created on 2018-03-19 by the reprex package (v0.2.0).