Another base variant uses `setNames`, relying on the fact that indexing a named vector with `[]` returns the first match.
# Recode data$x through a named vector. The replacement values (named by the
# old values) come first, followed by every current value named by itself.
# Indexing by name returns the first match, so a value listed in lookup$old is
# replaced and any other value maps back to itself.
data$x <- local({
  keys <- c(lookup$old, data$x)
  values <- c(lookup$new, data$x)
  setNames(values, keys)[data$x]
})
# Variant: build the two named vectors separately and concatenate them
# data$x <- c(setNames(lookup$new, lookup$old), setNames(data$x, data$x))[data$x]
# Variant: pass data$x once into an anonymous function
# data$x <- (\(.) setNames(c(lookup$new, .), c(lookup$old, .))[.])(data$x)
data
#   id  x
# 1  1  a
# 2  2  a
# 3  3  B
# 4  4  C
# 5  5  d
# 6  6 AA
# 7  7  !
In case `data$x` is not a character vector, convert it with `as.character` before using it as the index.
# Same named-vector lookup, but the index is converted to character first so
# that the values are looked up by name rather than by position.
data$x <- local({
  translate <- setNames(c(lookup$new, data$x), c(lookup$old, data$x))
  translate[as.character(data$x)]
})
The same but using match
# Positional version of the same idea: match() returns the first position of
# each value in the combined key vector, and that position selects the
# corresponding entry of the combined value vector.
data$x <- local({
  keys <- c(lookup$old, data$x)
  values <- c(lookup$new, data$x)
  values[match(data$x, keys)]
})
or `fmatch` from the fastmatch package:
library(fastmatch)
# Same lookup as with match(), using fastmatch::fmatch() to find the positions.
data$x <- local({
  keys <- c(lookup$old, data$x)
  values <- c(lookup$new, data$x)
  values[fmatch(data$x, keys)]
})
In case `data$x` is long and has many duplicates, it might help to build the fallback part of the table from its `unique` values only.
# Use only the distinct values of data$x as the fallback part of the table,
# which keeps the table passed to match() short.
data$x <- local({
  distinct <- unique(data$x)
  c(lookup$new, distinct)[match(data$x, c(lookup$old, distinct))]
})
# Fallback restricted to the distinct values that are not listed in lookup$old
local({
  unmapped <- setdiff(data$x, lookup$old)
  c(lookup$new, unmapped)[match(data$x, c(lookup$old, unmapped))]
})
# Possibly faster: deduplicate first, so setdiff() only has to process the
# distinct values instead of the whole vector
local({
  unmapped <- setdiff(unique(data$x), lookup$old)
  c(lookup$new, unmapped)[match(data$x, c(lookup$old, unmapped))]
})
Here `kit::funique` or `collapse::funique` could be used instead of `unique`.
Another way is to use `==` together with `which`, and `rep` to repeat the replacement values.
# For every old value, collect the positions in data$x that currently hold it.
# All positions are gathered before anything is overwritten, so replacements
# that swap two values do not interfere with each other.
i <- lapply(lookup$old, function(old_value) which(data$x == old_value))
# Repeat each new value once per collected position and assign in a single step.
data$x[unlist(i, use.names = FALSE)] <- rep(lookup$new, times = lengths(i))
When `data$x` is a character vector it could instead be stored as a factor, and the levels of a factor can be renamed directly. (The question asked for a solution without factors, but this approach can be very fast.)
# Store the values as a factor and rename its levels: only the (few) levels
# are touched, not every element of the vector.
x <- factor(data$x)
# Rename only the old values that actually occur as levels. Pointing absent
# values at index nlevels(x) + 1 (via match()'s nomatch argument) would append
# an unused extra level to the factor.
present <- lookup$old %in% levels(x)
levels(x)[match(lookup$old[present], levels(x))] <- lookup$new[present]
x
#[1] a a B C d AA !
#Levels: ! a AA B C d
Benchmark taking data from @LMc
# Small example table whose x column is to be recoded.
data <- data.frame(
  id = seq_len(7L),
  x = c("A", "A", "B", "C", "D", "AA", ".")
)
# Replacement table: each value in `old` is replaced by the value of `new`
# in the same row.
lookup <- data.frame(
  old = c("A", "D", "."),
  new = c("D", "A", "!")
)
# Fixed seed so that the resampled benchmark data is reproducible.
set.seed(1)
# Blow the example up to 1e7 rows by drawing row indices with replacement.
data <- data[sample.int(nrow(data), size = 1E7, replace = TRUE), ]
library(data.table)
library(fastmatch)
# Plain data.table version of the data; time the in-place conversion.
dt_data <- data.table::copy(data)
system.time(setDT(dt_data))
#    user  system elapsed
#   0.001   0.000   0.001

# Keyed data.table versions of both tables; time conversion plus key creation.
dtK_data <- data.table::copy(data)
dtK_lookup <- data.table::copy(lookup)
system.time({
  setDT(dtK_data)
  setDT(dtK_lookup)
  setkey(dtK_data, x)
  setkey(dtK_lookup, old)
})
# Note: data.table uses multiple cores
#    user  system elapsed
#   0.373   0.067   0.141

# Version with x stored as a factor; time the conversion.
f_data <- data.table::copy(data)
system.time(f_data$x <- factor(f_data$x))
#    user  system elapsed
#   0.238   0.032   0.270
# Working copy that the base-R / dplyr / plyr candidates write their result to.
C0_data <- data.table::copy(data)
# Named list of unevaluated candidate expressions (alist() does not evaluate
# its arguments); the names label the rows of the benchmark result.
# Every candidate reads the unmodified `data$x` and `lookup`.
# NOTE(review): in the entries wrapped in an anonymous function `(\(.) ...)`,
# the assignment to C0_data$x happens inside that function and therefore
# modifies a function-local copy of C0_data.
# First entry, coalesceDeframe: index the named vector built by
# tibble::deframe(lookup) with data$x and fall back to data$x where that is NA.
fun <- alist(coalesceDeframe = C0_data$x <- dplyr::coalesce(tibble::deframe(lookup)[data$x], data$x),
# Same as above, with the named vector built by setNames().
coalesce = C0_data$x <- dplyr::coalesce(setNames(lookup$new, lookup$old)[data$x], data$x),
# dplyr::recode() with the old = new pairs spliced in as arguments.
"recode" = C0_data$x <- dplyr::recode(data$x, !!!setNames(lookup$new, lookup$old)),
# plyr::mapvalues(); the fourth positional argument is FALSE
# (presumably warn_missing - confirm against the plyr documentation).
mapvalues = C0_data$x <- plyr::mapvalues(data$x, lookup$old, lookup$new, FALSE),
# match() against lookup$old followed by the distinct values of data$x.
"unique" = (\(.) C0_data$x <- c(lookup$new, .)[match(data$x, c(lookup$old, .))])(unique(data$x)),
# match() with nomatch = 0: zero indices are dropped when subsetting
# lookup$new, so the right-hand side lines up with the positions where i > 0.
"match0" = {i <- match(data$x, lookup$old, nomatch = 0)
C0_data$x[i>0] <- lookup$new[i]},
# match() with the default NA for no match; only matched positions are assigned.
"matchNA" = {i <- match(data$x, lookup$old)
j <- which(!is.na(i))
C0_data$x[j] <- lookup$new[i[j]]},
# One which(==) pass per old value, then a single assignment using rep().
"rep" = {i <- lapply(lookup$old, \(s) which(s == data$x))
C0_data$x[unlist(i)] <- rep(lookup$new, lengths(i))},
# As "unique", with fmatch() in place of match().
"uniqueFmatch" = (\(.) C0_data$x <- c(lookup$new, .)[fmatch(data$x, c(lookup$old, .))])(unique(data$x)),
# data.table update join on x == old; `:=` assigns by reference.
# Cdt_data is expected to exist when the expression runs (it is not created here).
datatable = Cdt_data[lookup, x:=new, on=.(x=old)],
# Same update join using the keyed tables (CdtK_data is likewise created elsewhere).
datatableKey = CdtK_data[dtK_lookup, x:=new, on=.(x=old)],
# As "uniqueFmatch", with collapse::funique() in place of unique().
"collapseFmatch" = (\(.) C0_data$x <- c(lookup$new, .)[fmatch(data$x, c(lookup$old, .))])(collapse::funique(data$x)),
# As "uniqueFmatch", with kit::funique() in place of unique().
"kitFmatch" = (\(.) C0_data$x <- c(lookup$new, .)[fmatch(data$x, c(lookup$old, .))])(kit::funique(data$x)),
# Rename the levels of the factor column Cf_data$x (Cf_data is created elsewhere);
# old values without a matching level are directed to index nlevels + 1.
"factor" = levels(Cf_data$x)[match(lookup$old, levels(Cf_data$x), nlevels(Cf_data$x)+1L)] <- lookup$new
)
# Time every candidate expression in `fun` seven times.
# The setup expression creates fresh copies of the data.table and factor
# inputs, because those candidates modify their input, and then triggers a
# garbage collection.
bench <- microbenchmark::microbenchmark(
  list = fun,
  times = 7L,
  control = list(order = "block"),
  setup = {
    Cdt_data <- data.table::copy(dt_data)
    CdtK_data <- data.table::copy(dtK_data)
    Cf_data <- data.table::copy(f_data)
    gc()
  }
)
bench
# Plot the timing distributions on a linear scale.
ggplot2::autoplot(bench, log = FALSE)
Result
Unit: milliseconds
expr min lq mean median uq max neval
coalesceDeframe 1419.85752 1431.90215 1441.67163 1437.54078 1456.58562 1457.32759 7
coalesce 1416.81792 1418.58149 1422.67433 1420.02443 1424.95336 1434.80827 7
recode 1162.67033 1163.04287 1168.39498 1163.63317 1168.59404 1189.18758 7
mapvalues 613.99308 616.50627 618.56250 617.50594 617.96866 629.48865 7
match0 369.73388 379.07043 390.64808 385.51089 403.51618 414.11857 7
matchNA 360.17730 375.59077 378.24535 376.08590 385.57151 389.12971 7
rep 351.87641 352.77764 364.34904 357.27131 375.93110 383.87809 7
unique 340.98122 341.27901 341.87930 341.63321 342.35865 343.26537 7
uniqueFmatch 253.75138 253.90585 254.33802 254.14054 254.73900 255.18455 7
datatable 237.79038 252.08057 251.22063 253.78246 254.10755 254.59535 7
datatableKey 165.42183 165.55716 166.01792 165.68011 166.34137 167.22646 7
collapseFmatch 125.93940 126.10227 126.25538 126.13199 126.42614 126.65943 7
kitFmatch 125.72638 125.94671 127.43846 126.16227 126.96446 134.35822 7
factor 39.35195 39.43222 39.45809 39.44678 39.47576 39.59197 7

In this case, using a factor and updating its levels is fastest. Using a fast version of `unique` improves speed substantially.