If you are looking for a fast alternative to unique(),
have a look at kit::funique
or collapse::funique.
Benchmarks with data.table limited to one thread and then to four threads:
# Benchmark 1: data.table limited to a single thread.
# `x` (data.frame) and `y` (data.table built from `x`) are created by the
# data setup code further down in this file; run that first.
setDTthreads(1)
# Four ways of obtaining the distinct values of `company`:
#   dt       - data.table grouping with a dummy `j`, then reading the grouping
#              column back (keyby additionally sorts the groups, per the
#              data.table documentation)
#   base     - base R unique()
#   collapse - collapse::funique()
#   kit      - kit::funique()
microbenchmark::microbenchmark(
dt = y[,logical(1), keyby = company]$company,
base = unique(x$company),
collapse = collapse::funique(x$company),
kit = kit::funique(x$company))
# Recorded output of the call above (timings are machine dependent):
#Unit: milliseconds
# expr min lq mean median uq max neval
# dt 12.862388 13.575131 14.759180 14.248541 14.945780 49.930937 100
# base 12.939646 13.505176 14.734066 14.773846 15.415468 18.256204 100
# collapse 3.302862 3.589133 3.685685 3.692886 3.773045 4.063564 100
# kit 1.903043 2.433478 2.963308 2.882986 3.076537 6.183840 100
# Benchmark 2: the same four candidates, with data.table allowed to use
# up to four threads. `x` and `y` come from the data setup code further
# down in this file.
setDTthreads(4)
microbenchmark::microbenchmark(
dt = y[,logical(1), keyby = company]$company,
base = unique(x$company),
collapse = collapse::funique(x$company),
kit = kit::funique(x$company))
# Recorded output of the call above (timings are machine dependent).
# In these numbers only the `dt` row moves noticeably compared with the
# single-thread run (median about 7.6 ms versus about 14.2 ms).
#Unit: milliseconds
# expr min lq mean median uq max neval
# dt 5.480513 7.384032 7.873730 7.569420 8.346282 11.193741 100
# base 12.998406 13.295775 14.464446 13.736353 14.856721 47.320488 100
# collapse 3.333292 3.549712 3.655851 3.645528 3.737236 4.325676 100
# kit 1.881232 2.825040 2.959422 2.917149 3.004288 5.281440 100
Data and libraries used for the benchmarks:
# Load data.table before anything else so the RNG is seeded afterwards.
library(data.table)

# Fixed seed so the sampled data are reproducible.
set.seed(42)

# Number of rows in the benchmark data.
n <- 1e6

# Pools of values to draw from (duplicates are intentional: they weight
# how often each value is sampled).
company <- c("A", "S", "W", "L", "T", "T", "W", "A", "T", "W")
item <- c(
  "Thingy", "Thingy", "Widget", "Thingy", "Grommit",
  "Thingy", "Grommit", "Thingy", "Widget", "Thingy"
)
sales <- c(120, 140, 160, 180, 200, 120, 140, 160, 180, 200)

# `x`: base data.frame with n rows, each column sampled with replacement
# from its pool (columns are drawn in the order company, item, sales).
x <- data.frame(
  company = sample(company, size = n, replace = TRUE),
  item = sample(item, size = n, replace = TRUE),
  sales = sample(sales, size = n, replace = TRUE)
)

# `y`: the same data as a data.table, used by the data.table benchmark.
y <- as.data.table(x)