11

For those unfamiliar, one-hot encoding simply refers to converting a column of categories (i.e. a factor) into multiple columns of binary indicator variables where each new column corresponds to one of the classes of the original column. This example will explain it better:

dt <- data.table(
  ID=1:5, 
  Color=factor(c("green", "red", "red", "blue", "green"), levels=c("blue", "green", "red", "purple")),
  Shape=factor(c("square", "triangle", "square", "triangle", "cirlce"))
)

dt
   ID Color    Shape
1:  1 green   square
2:  2   red triangle
3:  3   red   square
4:  4  blue triangle
5:  5 green   cirlce

# one hot encode the colors
color.binarized <- dcast(dt[, list(V1=1, ID, Color)], ID ~ Color, fun=sum, value.var="V1", drop=c(TRUE, FALSE))

# Prepend Color_ in front of each one-hot-encoded feature
setnames(color.binarized, setdiff(colnames(color.binarized), "ID"), paste0("Color_", setdiff(colnames(color.binarized), "ID")))

# one hot encode the shapes
shape.binarized <- dcast(dt[, list(V1=1, ID, Shape)], ID ~ Shape, fun=sum, value.var="V1", drop=c(TRUE, FALSE))

# Prepend Shape_ in front of each one-hot-encoded feature
setnames(shape.binarized, setdiff(colnames(shape.binarized), "ID"), paste0("Shape_", setdiff(colnames(shape.binarized), "ID")))

# Join one-hot tables with original dataset
dt <- dt[color.binarized, on="ID"]
dt <- dt[shape.binarized, on="ID"]

dt
   ID Color    Shape Color_blue Color_green Color_red Color_purple Shape_cirlce Shape_square Shape_triangle
1:  1 green   square          0           1         0            0            0            1              0
2:  2   red triangle          0           0         1            0            0            0              1
3:  3   red   square          0           0         1            0            0            1              0
4:  4  blue triangle          1           0         0            0            0            0              1
5:  5 green   cirlce          0           1         0            0            1            0              0

This is something I do a lot, and as you can see it's pretty tedious (especially when my data has many factor columns). Is there an easier way to do this with data.table? In particular, I assumed dcast would allow me to one-hot-encode multiple columns at once, when I try doing something like

dcast(dt[, list(V1=1, ID, Color, Shape)], ID ~ Color + Shape, fun=sum, value.var="V1", drop=c(TRUE, FALSE))

I get column combinations

   ID blue_cirlce blue_square blue_triangle green_cirlce green_square green_triangle red_cirlce red_square red_triangle purple_cirlce purple_square purple_triangle
1:  1           0           0             0            0            1              0          0          0            0             0             0               0
2:  2           0           0             0            0            0              0          0          0            1             0             0               0
3:  3           0           0             0            0            0              0          0          1            0             0             0               0
4:  4           0           0             1            0            0              0          0          0            0             0             0               0
5:  5           0           0             0            1            0              0          0          0            0             0             0               0
Ben
  • 20,038
  • 30
  • 112
  • 189

5 Answers5

10

Here you go:

dcast(melt(dt, id.vars='ID'), ID ~ variable + value, fun = length)
#   ID Color_blue Color_green Color_red Shape_cirlce Shape_square Shape_triangle
#1:  1          0           1         0            0            1              0
#2:  2          0           0         1            0            0              1
#3:  3          0           0         1            0            1              0
#4:  4          1           0         0            0            0              1
#5:  5          0           1         0            1            0              0

To get the missing factors you can do the following:

res = dcast(melt(dt, id = 'ID', value.factor = T), ID ~ value, drop = F, fun = length)
setnames(res, c("ID", unlist(lapply(2:ncol(dt),
                             function(i) paste(names(dt)[i], levels(dt[[i]]), sep = "_")))))
res
#   ID Color_blue Color_green Color_red Color_purple Shape_cirlce Shape_square Shape_triangle
#1:  1          0           1         0            0            0            1              0
#2:  2          0           0         1            0            0            0              1
#3:  3          0           0         1            0            0            1              0
#4:  4          1           0         0            0            0            0              1
#5:  5          0           1         0            0            1            0              0
eddi
  • 49,088
  • 6
  • 104
  • 155
  • Ah, this looks so elegant, but unfortunately it's missing Color_purple (an unused color level). – Ben Oct 07 '16 at 17:17
  • Looks like I jumped the gun. Unfortunately this only works when the levels of each factor column are completely distinct. Pretty sure I can fix it though. You got me 90% of the way there. – Ben Oct 07 '16 at 18:24
  • @Ben perhaps start with this instead, and then you won't need to do the renaming later: `newdt = setDT(lapply(1:ncol(dt), function(i) if (is.factor(dt[[i]])) { factor(paste(names(dt)[i], levels(dt[[i]]), sep = "_"))[dt[[i]]] } else { dt[[i]] }))` – eddi Oct 07 '16 at 18:46
8

Using model.matrix:

> cbind(dt[, .(ID)], model.matrix(~ Color + Shape, dt))
   ID (Intercept) Colorgreen Colorred Colorpurple Shapesquare Shapetriangle
1:  1           1          1        0           0           1             0
2:  2           1          0        1           0           0             1
3:  3           1          0        1           0           1             0
4:  4           1          0        0           0           0             1
5:  5           1          1        0           0           0             0

This makes the most sense if you're doing modelling.

If you want to suppress the intercept (and restore the aliased column for the 1st variable):

> cbind(dt[, .(ID)], model.matrix(~ Color + Shape - 1, dt))
   ID Colorblue Colorgreen Colorred Colorpurple Shapesquare Shapetriangle
1:  1         0          1        0           0           1             0
2:  2         0          0        1           0           0             1
3:  3         0          0        1           0           1             0
4:  4         1          0        0           0           0             1
5:  5         0          1        0           0           0             0
Hong Ooi
  • 56,353
  • 13
  • 134
  • 187
  • `Matrix::sparse.model.matrix` would be better. – Dmitriy Selivanov Oct 07 '16 at 15:42
  • 1
    `Shapecirlce` is missing...? – eddi Oct 07 '16 at 17:36
  • 2
    @eddi `Shapecircle` can be inferred from the values of `Shapesquare` and `Shapetriangle`. Representing n levels requires n-1 columns in general. – Hong Ooi Oct 07 '16 at 17:45
  • 1
    @HongOoi while this is true, one-hot-encoding is frequently used for machine learning models which randomly sample subsets of columns (e.g. random forest, gradient boosting, etc.). With these models, it's generally better to include all the data columns since one missing column can't be inferred after column subsets are taken. – Ben Oct 08 '16 at 14:04
  • @Ben I'm not sure I understand your comment regarding using n columns instead of n-1 – learningAsIGo Apr 19 '18 at 01:39
  • Seems like I only have the missing level problem when I include more than one variable at a time in the model matrix formula. – gannawag Jan 11 '19 at 15:41
5

Here's a more generalized version of eddi's solution:

one_hot <- function(dt, cols="auto", dropCols=TRUE, dropUnusedLevels=FALSE){
  # One-Hot-Encode unordered factors in a data.table
  # If cols = "auto", each unordered factor column in dt will be encoded. (Or specifcy a vector of column names to encode)
  # If dropCols=TRUE, the original factor columns are dropped
  # If dropUnusedLevels = TRUE, unused factor levels are dropped

  # Automatically get the unordered factor columns
  if(cols[1] == "auto") cols <- colnames(dt)[which(sapply(dt, function(x) is.factor(x) & !is.ordered(x)))]

  # Build tempDT containing and ID column and 'cols' columns
  tempDT <- dt[, cols, with=FALSE]
  tempDT[, ID := .I]
  setcolorder(tempDT, unique(c("ID", colnames(tempDT))))
  for(col in cols) set(tempDT, j=col, value=factor(paste(col, tempDT[[col]], sep="_"), levels=paste(col, levels(tempDT[[col]]), sep="_")))

  # One-hot-encode
  if(dropUnusedLevels == TRUE){
    newCols <- dcast(melt(tempDT, id = 'ID', value.factor = T), ID ~ value, drop = T, fun = length)
  } else{
    newCols <- dcast(melt(tempDT, id = 'ID', value.factor = T), ID ~ value, drop = F, fun = length)
  }

  # Combine binarized columns with the original dataset
  result <- cbind(dt, newCols[, !"ID"])

  # If dropCols = TRUE, remove the original factor columns
  if(dropCols == TRUE){
    result <- result[, !cols, with=FALSE]
  }

  return(result)
}

Note that for large datasets it's probably better to use Matrix::sparse.model.matrix

Update (2017)

This is now in the package mltools.

Ben
  • 20,038
  • 30
  • 112
  • 189
  • 1
    your function works great for me, but since it turns n factor levels in n columns, it is not useful for creating models that are sensitive to multicollinearity. Is there an adjusted version of your function which produces n-1 dummy columns per factor column? – constiii Apr 24 '17 at 13:57
  • @Constantin No, but you could just drop one of the columns after encoding. – Ben Oct 16 '17 at 17:14
1

If no one posts a clean way to write this out by hand each time, you can always make a function/macro:

OHE <- function(dt, grp, encodeCols) {
        grpSymb = as.symbol(grp)
        for (col in encodeCols) {
                colSymb = as.symbol(col)
                eval(bquote(
                            dt[, .SD
                               ][, V1 := 1
                               ][, dcast(.SD, .(grpSymb) ~ .(colSymb), fun=sum, value.var='V1')
                               ][, setnames(.SD, setdiff(colnames(.SD), grp), sprintf("%s_%s", col, setdiff(colnames(.SD), grp)))
                               ][, dt <<- dt[.SD, on=grp]
                               ]
                            ))
        }
        dt
}

dtOHE = OHE(dt, 'ID', c('Color', 'Shape'))
dtOHE

   ID Color    Shape Color_blue Color_green Color_red Shape_cirlce Shape_square Shape_triangle
1:  1 green   square          0           1         0            0            1              0
2:  2   red triangle          0           0         1            0            0              1
3:  3   red   square          0           0         1            0            1              0
4:  4  blue triangle          1           0         0            0            0              1
5:  5 green   cirlce          0           1         0            1            0              0
Clayton Stanley
  • 7,513
  • 9
  • 32
  • 46
0

In few lines you can solve this problem:

library(tidyverse)
dt2 <- spread(dt,Color,Shape)
dt3 <- spread(dt,Shape,Color)

df <- cbind(dt2,dt3)

df2 <- apply(df, 2, function(x){sapply(x, function(y){
  ifelse(is.na(y),0,1)
})})

df2 <- as.data.frame(df2)

df <- cbind(dt,df2[,-1])

table image

Brian Tompsett - 汤莱恩
  • 5,753
  • 72
  • 57
  • 129
Arthur Vaz
  • 391
  • 1
  • 3
  • 11