No need for `dplyr`. Assuming `var` is stored as a factor already:
# Collect every existing level other than "Candy"; these are the levels that
# will be merged together.
non_c <- levels(dat$var)
non_c <- non_c[!non_c %in% "Candy"]
# Assigning a named list to levels() recodes the factor: each list name is a
# new level and each value holds the old level(s) mapped onto it.
levels(dat$var) <- setNames(list("Candy", non_c), c("Candy", "Non-Candy"))
See `?levels`.
This is much more efficient than the `ifelse` approach, which is bound to be slow:
# Benchmark three ways of collapsing a factor into "Candy" / "Non-Candy":
#   levels - assign a named list to levels()
#   dplyr  - mutate() + ifelse() + factor()
#   direct - factor() applied to a logical comparison
library(microbenchmark) # provides get_nanotime()
library(dplyr)          # provides %>% and mutate() for the dplyr variant

set.seed(01239)

# resample data
smp <- data.frame(sample(dat$var, 1e6, TRUE))
names(smp) <- "var"

# Each replication returns a named vector of three elapsed times, so
# `timings` is a 3 x 50 matrix with one row per approach.
timings <- replicate(50, {
  # copy data to facilitate reuse
  cop <- smp
  t0 <- get_nanotime()
  levs <- setdiff(levels(cop$var), "Candy")
  levels(cop$var) <- list(Candy = "Candy", "Non-Candy" = levs)
  t1 <- get_nanotime() - t0

  cop <- smp
  t0 <- get_nanotime()
  cop <- cop %>%
    mutate(candy.flag = factor(ifelse(var == "Candy", "Candy", "Non-Candy")))
  t2 <- get_nanotime() - t0

  cop <- smp
  t0 <- get_nanotime()
  cop$var <-
    factor(cop$var == "Candy", labels = c("Non-Candy", "Candy"))
  t3 <- get_nanotime() - t0

  c(levels = t1, dplyr = t2, direct = t3)
})

# Median time per approach (rows of `timings`), then the slowdown of the
# dplyr and direct approaches relative to the levels() approach.
x <- apply(timings, 1, median)
x[2:3] / x[1]
#    dplyr   direct
# 8.894303 4.962791
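That is, the `levels<-` approach is about 9 times faster than the `dplyr`/`ifelse` approach, and about 5 times faster than the direct `factor()` approach.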