
In response to this thread on Twitter, where @Guru_GyanKhoji asked how to quickly match a vector of 10k words against another vector of 10k words, I've been playing with some options as a learning exercise. I based this on the reprex posted in the thread by @Phioniater.

Requirements

  1. Returns a list of named logical vectors, as done in the initial RPubs post

  2. Optionally iterates across a ragged list of lookup vectors, as mentioned elsewhere in the Twitter thread.

  3. I've yet to explore explicit parallelisation (except for data generation), but feel free to suggest parallel approaches if you like (a hedged sketch follows the benchmarks below).

Question

Any other suggestions? Are we still leaving substantial amounts of performance on the table?

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
library(fastmatch)
#> 
#> Attaching package: 'fastmatch'
#> The following object is masked from 'package:dplyr':
#> 
#>     coalesce
library(foreach)
library(rlang, include.only = 'set_names') # otherwise rlang masks data.table's `:=`

benchmarkme::get_cpu()
#> $vendor_id
#> [1] "GenuineIntel"
#> 
#> $model_name
#> [1] "12th Gen Intel(R) Core(TM) i7-12700H"
#> 
#> $no_of_cores
#> [1] 20

# Number of tokens in the dictionary
n_dictionary <- 1e4

# The list of lookup token vectors is defined as ragged.
# Number of vectors of tokens to look up in the dictionary
n_lookup_vectors <- 1e4
# Min number of tokens in any single lookup vector
n_tokens_min <- 1e4
# Max number of tokens in any single lookup vector
n_tokens_max <- 1e4

# Please read the warning below before setting this
generate_new_data <- TRUE

if (!generate_new_data) {
  stop(
    'Data generation takes ~45 seconds parallelised on the above architecture on Linux, but 
    will probably take ~5-10 minutes on a single core/older hardware, or on Windows. 
    With the defaults (1e4*1e4), the `tokens` object is ~1.8GB in memory, or ~712MB as an RDS.
    Set generate_new_data <- TRUE to accept this & continue.'
  )
} else {
  
  # Only used to speed up generation of test dataset
  library(dqrng)
  library(stringi)
  library(parallel)
  
  # Set up cluster for data generation. Defaults to ForkCluster on linux
  .n_cores <- if (detectCores() > 1) detectCores() - 1 else stop("I wouldn't recommend it...")
  
  if(.Platform$OS.type == "unix") {
    cl <- parallel::makeForkCluster(.n_cores)
  } else {
    cl <- parallel::makeCluster(.n_cores)
  }
  doParallel::registerDoParallel(cl)
  
  ## Data Generating Functions, adapted from https://rpubs.com/Bernhard/940382
  gen_wordlist <- function(n, dictionary = c(), n_matches = 1e3) {
    # Reusable function. If dictionary is supplied, it produces a 
    # vector of tokens with n_matches to the dictionary
    # adapted from https://rpubs.com/Bernhard/940382
    # Use of dqrng::dqsample & dqsample.int cuts sampling time by ~4x
    # Split one long random string into n 10-character words
    starts <- seq(1, n * 10, by = 10)
    ends <- starts + 9
    wl <- substring(stri_join(dqsample(letters, n * 10, TRUE), collapse = ""), starts, ends)
    if (length(dictionary) > 0) {
      # make sure about 1000 words are in both lists
      wl[dqsample.int(n, n_matches)] <- dictionary[dqsample.int(length(dictionary), n_matches)]
    }
    wl
  }
  
  gen_lookups <- function(n_lookups, n_tokens_min = 1e3, n_tokens_max = 1e5, dictionary, n_matches = 1e3) {
    if (n_matches >= n_tokens_min) stop("Number of induced matches must be < n_tokens_min")
    # Induce token lists to be of various lengths, to address problem.
    n_tokens <- runif(n_lookups, n_tokens_min, n_tokens_max)
    # Return list of vectors of token vectors of various lengths
    foreach(i = n_tokens, 
            .packages = c('dqrng', 'stringi'), 
            .export = c('gen_wordlist')) %dopar% {
              gen_wordlist(i, dictionary = dictionary, n_matches = n_matches)
            }
  }
  
  
  # define dictionary
  dictionary <- gen_wordlist(n_dictionary)
  
  # Generate a standard-length single-vector token list for initial benchmark

  token_init <- gen_lookups(1, 
                        n_tokens_min = 1e4, 
                        n_tokens_max = 1e4, 
                        dictionary
                )
  
  # Generate a list of lookup vectors of various lengths
  tokens <- gen_lookups(n_lookup_vectors, 
                        n_tokens_min = n_tokens_min, 
                        n_tokens_max = n_tokens_max, 
                        dictionary
            )
  
  parallel::stopCluster(cl)
}
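
# Quick sanity check (my own addition): each lookup vector was built with
# n_matches = 1e3 dictionary words induced, so roughly 1e3 tokens should match
sum(token_init[[1]] %in% dictionary)  # expect ~1e3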

##################################################
# Functions to test
# All expect a single vector dictionary in wl1 & a list of token vectors for wl2 

# Naive part-vectorised for loop, using base %in%
for_loop <- function(wl1, wl2) {
  lapply(wl2, \(x) {
    # Prepopulate return vector
    wl2.in.wl1 <- rep(NA, length(x))
    names(wl2.in.wl1) <- x
    
    for (i in seq_along(x)) {
      wl2.in.wl1[i] <- x[i] %in% wl1
    }
    
    wl2.in.wl1
  })
}

for_loop_set_names <- function(wl1, wl2) {
  lapply(wl2, \(x) {
    # Prepopulate return vector, testing effect of rlang::set_names
    wl2.in.wl1 <- rep(NA, length(x)) |> set_names(x)
    
    for (i in seq_along(x)) {
      wl2.in.wl1[i] <- x[i] %in% wl1
    }
    
    wl2.in.wl1
  })
}

# Part-vectorised for loop, but with fastmatch %fin%
for_loop_fmatch <- function(wl1, wl2) {
  lapply(wl2, \(x) {
    # Prepopulate return vector
    wl2.in.wl1 <- rep(NA, length(x))  |> set_names(x)
    
    for (i in seq_along(x)) {
      # Fastmatch implementation of %in%
      wl2.in.wl1[i] <- x[i] %fin% wl1
    }
    
    wl2.in.wl1
  })
}

# Vectorised search using base %in%
vectorised_in <- function(wl1, wl2) {
  lapply(wl2, \(x) {
    # %in% and |> share precedence and group left to right,
    # so this evaluates as set_names(x %in% wl1, x)
    x %in% wl1 |> set_names(x)
  })
}


# Vectorised search using data.table %chin%
vectorised_chin <- function(wl1, wl2) {
  lapply(wl2, \(x) {
    # Using data.table's %chin%
    x %chin% wl1 |> set_names(x)
  })
}

# Vectorised match using fastmatch %fin%
vectorised_fin <- function(wl1, wl2) {
  lapply(wl2, \(x) {
    # fastmatch implementation of %in%
    x %fin% wl1 |> set_names(x)
  })
}
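
# fastmatch builds a hash of `table` on first use and caches it as an
# attribute on the table itself, so repeated %fin% calls against the same
# dictionary skip re-hashing -- part of why vectorised_fin wins below.
# If I've read the fastmatch docs right, fmatch.hash() can build that hash
# up-front; a sketch (function name is my own, untimed here):
vectorised_fin_prehash <- function(wl1, wl2) {
  wl1 <- fmatch.hash(character(0), wl1)  # returns wl1 with the hash attached
  lapply(wl2, \(x) {
    x %fin% wl1 |> set_names(x)
  })
}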

# Hash table lookup
hash_table <- function(wl1, wl2) {
  lapply(wl2, \(x) {
    # adapted from https://rpubs.com/Bernhard/940382
    # an example with loops and a hash table -- can definitely be improved on
    # (note the table is rebuilt for every lookup vector)
    hash <- hashtab(size = length(wl1))
    
    # put the whole of wl1 into the hashtable, with a for loop for fun
    for (word in wl1) sethash(hash, word, TRUE) 
    
    # look up each word of x in the hash table with sapply
    sapply(x, \(word) gethash(hash, word, nomatch = FALSE))
  })
}
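
# The hash table above depends only on wl1, so it could be built once outside
# the lapply() rather than once per lookup vector; a sketch of that variant
# (function name is my own, untimed here):
hash_table_once <- function(wl1, wl2) {
  hash <- hashtab(size = length(wl1))
  for (word in wl1) sethash(hash, word, TRUE)
  lapply(wl2, \(x) {
    sapply(x, \(word) gethash(hash, word, nomatch = FALSE))
  })
}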

# Data table left join
data_table <- function(wl1, wl2) {
  lapply(wl2, \(x) {
    wl1_dt <- data.table(
      token = wl1,
      match = TRUE
    )
    wl2_dt <- data.table(
      token = x
    )
    # left join: wl1_dt[wl2_dt] returns one row per row of wl2_dt
    tbl <- wl1_dt[wl2_dt, on = 'token']
    # Bit of munging, but it only adds ~1ms
    # (`!` binds more loosely than `|>`, so the names are set before negation)
    !is.na(tbl$match) |> set_names(tbl$token)
  })
}

# Data table left join, improved
data_table_2 <- function(wl1, wl2) {
  lapply(wl2, \(x) {
    wl1_dt <- data.table(
      token = wl1,
      match = TRUE
    )
    wl2_dt <- data.table(
      token = x
    )
    # Uses a more efficient left join from https://stackoverflow.com/a/34600831/19455460
    tbl <- wl2_dt[wl1_dt, on = 'token', match := i.match]
    # Bit of munging, but it only adds ~1ms
    !is.na(tbl$match) |> set_names(tbl$token)
  })
}
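
# Likewise, the dictionary data.table depends only on wl1 and could be built
# (and keyed) once outside the lapply(); a sketch of that variant (function
# name is my own, untimed here):
data_table_3 <- function(wl1, wl2) {
  wl1_dt <- data.table(token = wl1, match = TRUE, key = 'token')
  lapply(wl2, \(x) {
    tbl <- wl1_dt[data.table(token = x), on = 'token']
    !is.na(tbl$match) |> set_names(tbl$token)
  })
}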

#########################################################
# Benchmarking

# Run benchmarks on a single lookup vector
bench::mark(
  .for_loop = for_loop(dictionary, token_init),
  .for_loop_set_names = for_loop_set_names(dictionary, token_init),
  .hash_table = hash_table(dictionary, token_init),
  .data_table = data_table(dictionary, token_init),
  .data_table_2 = data_table_2(dictionary, token_init),
  .for_loop_fmatch = for_loop_fmatch(dictionary, token_init),
  .vectorised_in = vectorised_in(dictionary, token_init),
  .vectorised_chin = vectorised_chin(dictionary, token_init),
  .vectorised_fin = vectorised_fin(dictionary, token_init),
  check = TRUE,
  min_iterations = 100,
  max_iterations = 1e5
)[,c(1,2,3,5,7,9)] |>
  mutate(speedup = as.numeric(max(median) / median))
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 9 × 5
#>   expression               min   median mem_alloc  speedup
#>   <bch:expr>          <bch:tm> <bch:tm> <bch:byt>    <dbl>
#> 1 .for_loop           562.55ms 729.96ms  763.47MB     1   
#> 2 .for_loop_set_names 564.52ms 696.27ms  763.47MB     1.05
#> 3 .hash_table          13.32ms  14.15ms   702.6KB    51.6 
#> 4 .data_table           5.27ms   5.51ms    3.12MB   133.  
#> 5 .data_table_2         5.48ms   5.64ms  850.42KB   129.  
#> 6 .for_loop_fmatch      4.67ms    5.1ms   74.56KB   143.  
#> 7 .vectorised_in      141.39µs 152.08µs  362.61KB  4800.  
#> 8 .vectorised_chin     81.62µs  100.3µs   39.11KB  7278.  
#> 9 .vectorised_fin      24.88µs  27.83µs   78.22KB 26233.
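
(The loop versions are slow mainly because each scalar `%in%` call re-hashes the entire dictionary, once per token; `%fin%` caches that hash, which is why the fastmatch loop is ~140x faster while still looping in R.)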

# Run benchmarks across all lookup vectors for the faster options
bench::mark(
  #.for_loop = for_loop(dictionary, tokens),
  #.for_loop_set_names = for_loop_set_names(dictionary, tokens[1]),
  #.hash_table = hash_table(dictionary, tokens),
  #.data_table = data_table(dictionary, tokens),
  #.data_table_2 = data_table_2(dictionary, tokens),
  #.for_loop_fmatch = for_loop_fmatch(dictionary, tokens),
  .vectorised_in = vectorised_in(dictionary, tokens),
  .vectorised_chin = vectorised_chin(dictionary, tokens),
  .vectorised_fin = vectorised_fin(dictionary, tokens),
  check = TRUE,
  min_time = 30,
  min_iterations = 10,
  max_iterations = 1e6
)[,c(1,2,3,5,7,9)] |>
  mutate(
    speedup = as.numeric(max(median) / median)
  )
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 3 × 5
#>   expression            min   median mem_alloc speedup
#>   <bch:expr>       <bch:tm> <bch:tm> <bch:byt>   <dbl>
#> 1 .vectorised_in      3.48s    3.85s    3.46GB    1   
#> 2 .vectorised_chin    1.34s    1.47s     382MB    2.62
#> 3 .vectorised_fin  546.35ms    1.04s  763.93MB    3.71
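
# Re requirement 3, a hedged sketch of explicit parallelisation: split the
# list of lookup vectors across cores with parallel::mclapply (fork-based, so
# Unix-only; untimed here, and result serialisation may eat the gains for
# work this cheap per vector):
vectorised_fin_parallel <- function(wl1, wl2, n_cores = parallel::detectCores() - 1) {
  parallel::mclapply(wl2, \(x) {
    x %fin% wl1 |> set_names(x)
  }, mc.cores = n_cores)
}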

Created on 2022-09-10 by the reprex package (v2.0.1)

  • Generally, the more information you can give, the more optimized a solution you can write up. For example: are both vectors sorted, the length of the words, which language (letter occurrences), are the word frequencies known, etc.? Then, depending on the info, implementing in C++ should give a large speedup. As for a cheap speedup here, `vapply` is faster than `lapply` – Donald Seinen Sep 10 '22 at 16:21
  • Thanks @DonaldSeinen. Yes, I wanted to use vapply, but struggled to work out how to give a default `FUN.VALUE` with a list of different-length vectors without resorting to lots of padding. Perhaps that'd still be faster. – sturu Sep 10 '22 at 16:33
  • The approaches look quite optimized for the general use case. Some slight optimizations in the slower functions are possible, for example using an environment instead of hashtab, or an explicit backticked `for()`. If the intention is to learn, a memoised function may be interesting to explore, although it will not give the optimal performance for a one-off task. – Donald Seinen Sep 10 '22 at 16:50
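
For reference, a minimal sketch of the environment-as-hash-table idea from the comments, using `vapply` at the inner level, where `FUN.VALUE = logical(1)` is known (the function name is my own):

env_table <- function(wl1, wl2) {
  # an environment is R's built-in hash table
  env <- new.env(hash = TRUE, size = length(wl1))
  for (word in wl1) assign(word, TRUE, envir = env)
  lapply(wl2, \(x) {
    # vapply works here because we apply it per token, so FUN.VALUE is a
    # known scalar; USE.NAMES gives the required named logical output
    vapply(x, \(word) exists(word, envir = env, inherits = FALSE),
           FUN.VALUE = logical(1))
  })
}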
