In response to this thread on Twitter, where @Guru_GyanKhoji asked how to quickly match a vector of 10k words against another vector of 10k words, I've been playing with some options as a learning exercise. I based this on the reprex posted in the thread by @Phioniater.
Requirements
Returns a list of named logical vectors, as in the initial RPub.
Optionally iterates across a ragged list of lookup vectors, as requested elsewhere in the Twitter thread.
I've yet to explore explicit parallelisation (except for data generation), but feel free to add it if you like; there's an untested sketch at the end.
Question
Any other suggestions? Are we still leaving substantial performance on the table?
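For concreteness, here's a toy sketch of the required return shape (hypothetical inputs, not the benchmark data): one logical vector per lookup vector, named by its tokens.
toy_dictionary <- c("apple", "pear")
toy_lookups <- list(c("apple", "kiwi"), "pear")
lapply(toy_lookups, \(x) setNames(x %in% toy_dictionary, x))
# -> list(c(apple = TRUE, kiwi = FALSE), c(pear = TRUE))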
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(data.table)
#>
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#>
#> between, first, last
library(fastmatch)
#>
#> Attaching package: 'fastmatch'
#> The following object is masked from 'package:dplyr':
#>
#> coalesce
library(foreach)
library(rlang, include.only = 'set_names') # Otherwise rlang masks data.table's :=
benchmarkme::get_cpu()
#> $vendor_id
#> [1] "GenuineIntel"
#>
#> $model_name
#> [1] "12th Gen Intel(R) Core(TM) i7-12700H"
#>
#> $no_of_cores
#> [1] 20
# Number of tokens in the dictionary
n_dictionary <- 1e4
# The list of lookup token vectors may be ragged (vectors of varying length).
# Number of vectors of tokens to look up in the dictionary
n_lookup_vectors <- 1e4
# Min number of tokens in any single lookup vector
n_tokens_min <- 1e4
# Max number of tokens in any single lookup vector
n_tokens_max <- 1e4
# Please read the warning below before setting this
generate_new_data <- TRUE
if (!generate_new_data) {
stop(
'Data generation takes ~45 seconds parallelised on the above architecture on Linux, but
will probably take ~5-10 minutes on a single core/older hardware, or on Windows.
With the default 1e4*1e4 settings, the `tokens` object is ~1.8GB in memory, or ~712MB as an RDS.
Set generate_new_data <- TRUE to accept this & continue.'
)
} else {
# Only used to speed up generation of test dataset
library(dqrng)
library(stringi)
library(parallel)
# Set up cluster for data generation. Defaults to a fork cluster on Linux
.n_cores <- if (detectCores() > 1) detectCores() - 1 else stop("I wouldn't recommend it...")
if(.Platform$OS.type == "unix") {
cl <- parallel::makeForkCluster(.n_cores)
} else {
cl <- parallel::makeCluster(.n_cores)
}
doParallel::registerDoParallel(cl)
## Data Generating Functions, adapted from https://rpubs.com/Bernhard/940382
gen_wordlist <- function(n, dictionary = c(), n_matches = 1e3) {
# Reusable function. If dictionary is supplied, it produces a
# vector of tokens with n_matches to the dictionary
# adapted from https://rpubs.com/Bernhard/940382
# Use of dqrng::dqsample & dqsample.int cuts sampling time by ~4x
starts<-seq_len(n) * 10
ends <- starts + 9
wl <- substring(stri_join(dqsample(letters, n*10, TRUE), collapse = ""), starts, ends)
if (length(dictionary) > 0) {
# splice in exactly n_matches dictionary words so the lists overlap
wl[dqsample.int(n, n_matches)] <- dictionary[dqsample.int(length(dictionary), n_matches)]
}
wl
}
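# Quick sanity check (illustrative only, not part of the benchmark):
# every generated token should be exactly 10 letters long
stopifnot(all(nchar(gen_wordlist(5)) == 10))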
gen_lookups <- function(n_lookups, n_tokens_min = 1e3, n_tokens_max = 1e5, dictionary, n_matches = 1e3) {
if (n_matches >= n_tokens_min) stop("Number of induced matches must be < n_tokens_min")
# Induce token vectors of various (integer) lengths, per the ragged-list requirement
n_tokens <- round(runif(n_lookups, n_tokens_min, n_tokens_max))
# Return a list of token vectors of those lengths
foreach(i = n_tokens,
.packages = c('dqrng', 'stringi'),
.export = c('gen_wordlist')) %dopar% {
gen_wordlist(i, dictionary = dictionary, n_matches = n_matches)
}
}
# define dictionary
dictionary <- gen_wordlist(n_dictionary)
# Generate a standard-length single-vector token list for initial benchmark
token_init <- gen_lookups(1,
n_tokens_min = 1e4,
n_tokens_max = 1e4,
dictionary
)
# Generate a list of lookup vectors of various lengths
tokens <- gen_lookups(n_lookup_vectors,
n_tokens_min = n_tokens_min,
n_tokens_max = n_tokens_max,
dictionary
)
parallel::stopCluster(cl)
}
##################################################
# Functions to test
# All expect a single dictionary vector as wl1 and a list of token vectors as wl2
# Naive part-vectorised for loop, using base %in%
for_loop <- function(wl1, wl2) {
lapply(wl2, \(x) {
# Prepopulate return vector
wl2.in.wl1 <- rep(NA, length(x))
names(wl2.in.wl1) <- x
for (i in seq_along(x)) {
wl2.in.wl1[i] <- x[i] %in% wl1
}
wl2.in.wl1
})
}
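# Note: calling %in% once per token re-hashes wl1 on every iteration, so this
# does roughly O(length(x) * length(wl1)) work per lookup vector; the
# vectorised variants below hash wl1 once and then do linear-time lookups.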
for_loop_set_names <- function(wl1, wl2) {
lapply(wl2, \(x) {
# Prepopulate return vector, testing effect of rlang::set_names
wl2.in.wl1 <- rep(NA, length(x)) |> set_names(x)
for (i in seq_along(x)) {
wl2.in.wl1[i] <- x[i] %in% wl1
}
wl2.in.wl1
})
}
# Part-vectorised for loop, but with fastmatch %fin%
for_loop_fmatch <- function(wl1, wl2) {
lapply(wl2, \(x) {
# Prepopulate return vector
wl2.in.wl1 <- rep(NA, length(x)) |> set_names(x)
for (i in seq_along(x)) {
# Fastmatch implementation of %in%
wl2.in.wl1[i] <- x[i] %fin% wl1
}
wl2.in.wl1
})
}
# Vectorised search using base %in%
vectorised_in <- function(wl1, wl2) {
lapply(wl2, \(x) {
(x %in% wl1) |> set_names(x)
})
}
# Vectorised search using data.table %chin%
vectorised_chin <- function(wl1, wl2) {
lapply(wl2, \(x) {
# Using data.table's %chin%
(x %chin% wl1) |> set_names(x)
})
}
# Vectorised match using fastmatch %fin%
vectorised_fin <- function(wl1, wl2) {
lapply(wl2, \(x) {
# fastmatch implementation of %in%
(x %fin% wl1) |> set_names(x)
})
}
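# Much of fastmatch's speed comes from fmatch() caching its hash table as an
# attribute on the table argument (wl1), so repeated lookups against the same
# dictionary only pay the hashing cost once.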
# Hash table lookup
hash_table <- function(wl1, wl2) {
lapply(wl2, \(x) {
# adapted from https://rpubs.com/Bernhard/940382
# an example with loops and a hash table -- can definitely be improved on
hash <- hashtab(size = length(wl1))
# put the whole of wl1 into the hash table, using a for loop for fun
for (word in wl1) sethash(hash, word, TRUE)
# look up each word of x in the hash table with sapply
sapply(x, \(word) gethash(hash, word, nomatch = FALSE))
})
}
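# Note: hashtab(), sethash() & gethash() live in utils, need R >= 4.2 and are
# flagged as experimental. The version above also rebuilds the hash for every
# lookup vector; an untested variant that hoists the build out of the lapply():
hash_table_hoisted <- function(wl1, wl2) {
hash <- hashtab(size = length(wl1))
for (word in wl1) sethash(hash, word, TRUE)
lapply(wl2, \(x) sapply(x, \(word) gethash(hash, word, nomatch = FALSE)))
}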
# Data table left join
data_table <- function(wl1, wl2) {
lapply(wl2, \(x) {
wl1_dt <- data.table(
token = wl1,
match = TRUE
)
wl2_dt <- data.table(
token = x
)
# left join
tbl <- wl1_dt[wl2_dt, on = 'token']
# Bit of munging, but it only adds ~1ms
(!is.na(tbl$match)) |> set_names(tbl$token)
})
}
# Data table left join, improved
data_table_2 <- function(wl1, wl2) {
lapply(wl2, \(x) {
wl1_dt <- data.table(
token = wl1,
match = TRUE
)
wl2_dt <- data.table(
token = x
)
# Uses a more efficient left join from https://stackoverflow.com/a/34600831/19455460
tbl <- wl2_dt[wl1_dt, on = 'token', match := i.match]
# Bit of munging, but it only adds ~1ms
(!is.na(tbl$match)) |> set_names(tbl$token)
})
}
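# The update join's smaller mem_alloc comes from adding `match` to wl2_dt by
# reference, rather than materialising a whole new joined table.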
#########################################################
# Benchmarking
# Run benchmarks on a single lookup vector
bench::mark(
.for_loop = for_loop(dictionary, token_init),
.for_loop_set_names = for_loop_set_names(dictionary, token_init),
.hash_table = hash_table(dictionary, token_init),
.data_table = data_table(dictionary, token_init),
.data_table_2 = data_table_2(dictionary, token_init),
.for_loop_fmatch = for_loop_fmatch(dictionary, token_init),
.vectorised_in = vectorised_in(dictionary, token_init),
.vectorised_chin = vectorised_chin(dictionary, token_init),
.vectorised_fin = vectorised_fin(dictionary, token_init),
check = TRUE,
min_iterations = 100,
max_iterations = 1e5
)[,c(1,2,3,5,7,9)] |>
mutate(speedup = as.numeric(max(median) / median))
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 9 × 5
#> expression min median mem_alloc speedup
#> <bch:expr> <bch:tm> <bch:tm> <bch:byt> <dbl>
#> 1 .for_loop 562.55ms 729.96ms 763.47MB 1
#> 2 .for_loop_set_names 564.52ms 696.27ms 763.47MB 1.05
#> 3 .hash_table 13.32ms 14.15ms 702.6KB 51.6
#> 4 .data_table 5.27ms 5.51ms 3.12MB 133.
#> 5 .data_table_2 5.48ms 5.64ms 850.42KB 129.
#> 6 .for_loop_fmatch 4.67ms 5.1ms 74.56KB 143.
#> 7 .vectorised_in 141.39µs 152.08µs 362.61KB 4800.
#> 8 .vectorised_chin 81.62µs 100.3µs 39.11KB 7278.
#> 9 .vectorised_fin 24.88µs 27.83µs 78.22KB 26233.
# Run benchmarks across all lookup vectors for the faster options
bench::mark(
#.for_loop = for_loop(dictionary, tokens),
#.for_loop_set_names = for_loop_set_names(dictionary, tokens[1]),
#.hash_table = hash_table(dictionary, tokens),
#.data_table = data_table(dictionary, tokens),
#.data_table_2 = data_table_2(dictionary, tokens),
#.for_loop_fmatch = for_loop_fmatch(dictionary, tokens),
.vectorised_in = vectorised_in(dictionary, tokens),
.vectorised_chin = vectorised_chin(dictionary, tokens),
.vectorised_fin = vectorised_fin(dictionary, tokens),
check = TRUE,
min_time = 30,
min_iterations = 10,
max_iterations = 1e6
)[,c(1,2,3,5,7,9)] |>
mutate(
speedup = as.numeric(max(median) / median)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 3 × 5
#> expression min median mem_alloc speedup
#> <bch:expr> <bch:tm> <bch:tm> <bch:byt> <dbl>
#> 1 .vectorised_in 3.48s 3.85s 3.46GB 1
#> 2 .vectorised_chin 1.34s 1.47s 382MB 2.62
#> 3 .vectorised_fin 546.35ms 1.04s 763.93MB 3.71
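On the parallelisation point from the top: a minimal, untested sketch (assuming a cluster like the one used for data generation) would shard the lookup list across workers. Each worker re-hashes the dictionary and the results must be serialised back, so it may well lose to single-threaded %fin% at these sizes.
# Untested sketch: shard the lookup list across cluster workers
par_lookup <- \(x, dict) setNames(fastmatch::fmatch(x, dict, nomatch = 0L) > 0L, x)
# e.g.:
# cl <- parallel::makeForkCluster(4)
# res <- parallel::parLapply(cl, tokens, par_lookup, dict = dictionary)
# parallel::stopCluster(cl)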
Created on 2022-09-10 by the reprex package (v2.0.1)