7

I'm looking for a speedy solution for randomly subsetting vectors nested in a list.

If we simulate the following data, we get a list l that holds 3 million vectors inside, each one is of length 5. But I want the length of each vector to vary. So I thought I should apply a function that randomly subsets each vector. The problem is, this method is not as speedy as I wished.

simulate data: the list l

library(stringi)

set.seed(123)
vec_n <- 15e6
vec_vals  <- 1:vec_n
vec_names <- stringi::stri_rand_strings(vec_n, 5)

my_named_vec <- setNames(vec_vals, vec_names)

split_func <- function(x, n) {
  unname(split(x, rep_len(1:n, length(x))))
}

l <- split_func(my_named_vec, n = vec_n / 5)

head(l)
#> [[1]]
#>    HmPsw    Qk8NP    Quo3T    8f0GH    nZmjN 
#>        1  3000001  6000001  9000001 12000001 
#> 
#> [[2]]
#>    2WtYS    ZaHFl    6YjId    jbGuA    tAG65 
#>        2  3000002  6000002  9000002 12000002 
#> 
#> [[3]]
#>    xSgZ6    jM5Uw    ujPOc    CTV5F    5JRT5 
#>        3  3000003  6000003  9000003 12000003 
#> 
#> [[4]]
#>    tF2Kx    r4ZCI    Ooklo    VOLHU    M6z6H 
#>        4  3000004  6000004  9000004 12000004 
#> 
#> [[5]]
#>    tgdze    w8d1B    FYERK    jlClo    NQfsF 
#>        5  3000005  6000005  9000005 12000005 
#> 
#> [[6]]
#>    hXaH9    gsY1u    CjBwC    Oqqty    dxJ4c 
#>        6  3000006  6000006  9000006 12000006

Now that we have l, I wish to subset each vector randomly: meaning that the number of elements being subsetted (per vector) will be random. So one option is to set the following utility function:

randomly_subset_vec <- function(x) {
  my_range <- 1:length(x)
  x[-sample(my_range, sample(my_range))]
}

lapply(head(l), randomly_subset_vec)
#> [[1]]
#>   Quo3T 
#> 6000001 
#> 
#> [[2]]
#>   6YjId   jbGuA 
#> 6000002 9000002 
#> 
#> [[3]]
#>   xSgZ6   jM5Uw   ujPOc   CTV5F 
#>       3 3000003 6000003 9000003 
#> 
#> [[4]]
#>   Ooklo 
#> 6000004 
#> 
#> [[5]]
#> named integer(0)
#> 
#> [[6]]
#>    CjBwC    Oqqty    dxJ4c 
#>  6000006  9000006 12000006

But running this procedure over the entire l takes forever. I've tried using rrapply which is a fast package for dealing with lists, and it takes "only" 110 seconds on my machine.

library(rrapply)
library(tictoc)

tic()
l_subsetted <- rrapply(object = l, f = randomly_subset_vec)
toc()
#> 110.23 sec elapsed

I will be happy with either of the following:

  1. Is there a speedier alternative to:
    rrapply(object = l, f = randomly_subset_vec)
    
  2. Or more generally, is there a speedier way to start with my_named_vec and arrive at l_subsetted?
Emman
  • 3,695
  • 2
  • 20
  • 44

8 Answers8

5

UPDATE 1 to fix the name behavior in stack for large objects

Your subsets don't include the full set, so this first removes a random element from each vector, then randomly retains all other elements:

library(stringi)

set.seed(123)
vec_n <- 15e6
vec_vals  <- 1:vec_n
vec_names <- stringi::stri_rand_strings(vec_n, 5)

my_named_vec <- setNames(vec_vals, vec_names)

split_func <- function(x, n) {
  unname(split(x, rep_len(1:n, length(x))))
}

l <- split_func(my_named_vec, n = vec_n / 5)
system.time({
  lenl <- lengths(l)
  # use stack to unlist the list while keeping the originating list index for each value
  vec_names <- names(unlist(l))
  blnKeep <- replace(sample(c(FALSE, TRUE), length(vec_names), replace = TRUE), ceiling(runif(length(l))*lenl) + c(0, head(cumsum(lenl), -1)), FALSE)
  temp <- stack(setNames(l, seq_along(l)))[blnKeep,]
  # re-list
  l_subsetted <- unname(split(setNames(temp$values, vec_names[blnKeep]), temp$ind))
})
#>    user  system elapsed 
#>  22.999   0.936  23.934
head(l_subsetted)
#> [[1]]
#>    HmPsw    nZmjN 
#>        1 12000001 
#> 
#> [[2]]
#>   2WtYS   6YjId 
#>       2 6000002 
#> 
#> [[3]]
#>   xSgZ6   jM5Uw   ujPOc 
#>       3 3000003 6000003 
#> 
#> [[4]]
#>   tF2Kx   r4ZCI 
#>       4 3000004 
#> 
#> [[5]]
#>    FYERK    NQfsF 
#>  6000005 12000005 
#> 
#> [[6]]
#>   gsY1u 
#> 3000006
Created on 2021-11-01 by the reprex package (v2.0.0)

UPDATE 2 for vectors of uniformly distributed lengths:

@runr is correct in the comments that the above code will result in binomially-distributed vector lengths, while the OP's original code results in uniformly-distributed vector lengths. Below is an example of how to use the same idea to get uniformly-distributed vector lengths. The code is more complex, but the run-time seems to be a bit faster (possibly due to circumventing stack):

library(stringi)
set.seed(123)
vec_n <- 15e6
vec_vals  <- 1:vec_n
vec_names <- stringi::stri_rand_strings(vec_n, 5)
my_named_vec <- setNames(vec_vals, vec_names)
split_func <- function(x, n) {
  unname(split(x, rep_len(1:n, length(x))))
}
l <- split_func(my_named_vec, n = vec_n / 5)

system.time({
  idx <- seq_along(l)
  lenl <- lengths(l)
  ul <- unlist(l)
  # get a random number of elements to remove from each vector
  nRemove <- ceiling(runif(length(l))*lenl)
  nRemove2 <- nRemove
  blnNotEmpty <- nRemove != lenl # will the subset vector have any elements?
  blnKeep <- rep(TRUE, length(l))
  
  # loop until the predetermined number of elements have been removed from each vector
  while (length(nRemove)) {
    # remove a random element from vectors that have too many
    ul <- ul[-(ceiling(runif(length(idx))*lenl[idx]) + c(0, head(cumsum(lenl), -1))[idx])]
    lenl[idx] <- lenl[idx] - 1L # decrement the vector lengths
    blnKeep <- nRemove != 1
    idx <- idx[blnKeep]
    nRemove <- nRemove[blnKeep] - 1L # decrement the number of elements left to remove
  }
  
  l_subsetted <- rep(list(integer(0)), length(l))
  l_subsetted[blnNotEmpty] <- unname(split(ul, rep.int(seq_along(l), lenl)))
})
#>    user  system elapsed 
#>  18.396   0.935  19.332
head(l_subsetted)
#> [[1]]
#>   Qk8NP   Quo3T   8f0GH 
#> 3000001 6000001 9000001 
#> 
#> [[2]]
#> integer(0)
#> 
#> [[3]]
#>    xSgZ6    ujPOc    CTV5F    5JRT5 
#>        3  6000003  9000003 12000003 
#> 
#> [[4]]
#>   tF2Kx   Ooklo   VOLHU 
#>       4 6000004 9000004 
#> 
#> [[5]]
#>    tgdze    w8d1B    jlClo    NQfsF 
#>        5  3000005  9000005 12000005 
#> 
#> [[6]]
#>    gsY1u    CjBwC    Oqqty    dxJ4c 
#>  3000006  6000006  9000006 12000006
# check that vector lengths are uniformly-distributed (lengths of 0-4 are equally likely)
table(lengths(l_subsetted))
#> 
#>      0      1      2      3      4 
#> 599633 599041 601209 600648 599469
Created on 2021-11-02 by the reprex package (v2.0.1)
jblood94
  • 10,340
  • 1
  • 10
  • 15
  • Thanks. This is a very nice solution. However, please note that your `l_subsetted` doesn't include the original alphanumeric vec names. Seems that they're gone somewhere in the process. – Emman Nov 01 '21 at 19:45
  • @Emman - I added `head(l_subsetted)`, which shows that the list elements are named vectors. Do you get a similar result? – jblood94 Nov 01 '21 at 19:59
  • Unfortunately I don't get the same output as yours. I even ran the code via `reprex()` and still no. Could you also try via `reprex()`? – Emman Nov 01 '21 at 20:06
  • @Emman - I put all the code in Reprex (with a smaller dataset). Still the same here. The machine I'm currently on is running an older version of R (3.6.0), so I ran it on https://rdrr.io/snippets/ (version 4.0.3) and the results were still the same. Is it possible one of the functions used is masked by another package? Which packages do you have loaded? – jblood94 Nov 01 '21 at 20:21
  • This is quite bizarre. When `vec_n <- 15e3` I get the correct alphanumeric names, but when `vec_n <- 15e6` then those names are replaced by numbers. I have no packages attached besides `stringi`. My R version is 4.1.1. Windows 10 PC. – Emman Nov 01 '21 at 21:10
  • Yep, I see it now. I honestly thought I had tested it on the full data set before I posted. The problem is in the `stack` function. Somewhere around 1e4 it starts dropping the names from the vectors. I'll post a workaround in a moment. – jblood94 Nov 01 '21 at 22:32
  • The workaround may be slightly slower, but not much. – jblood94 Nov 01 '21 at 22:36
  • 2
    @Emman observe the different distributions of lengths in this approach (versus the originally intended), whether they make sense in your case. E.g., call ``l_subsetted %>% lapply(.,length) %>% do.call(c,.) %>% table`` and observe a bell shaped histograme with a median value of ``2``. On the other hand, the original experiment in the OP's code would generate a uniform distribution. This can be a crucial difference to the intended experiment design – runr Nov 02 '21 at 12:25
  • @runr is correct. Please see UPDATE 2 in my answer to get uniformly-distributed vector lengths. – jblood94 Nov 02 '21 at 18:53
  • Great update! Does this scale with the CPU cores of the machine? – runr Nov 03 '21 at 22:58
  • @runr I've been tinkering with `parallel`, but I haven't had much luck. The problem is that `my_named_vec` is ~1GB, so passing the data to the nodes and back takes as long as processing the whole thing on a single node. That's even when I export only the needed subsets of `my_named_vec` (i.e., `clusterExport(cl[i], ...`). – jblood94 Nov 04 '21 at 14:01
2

Simplify the sampling function:

randomly_subset_vec_2 <- function(x) {
  my_range <- length(x)
  x[-sample(my_range, sample(my_range, 1))]
}

This alone can give a significant speed-up.
And though I have not tested it, given the problem description, to remove some elements (minus sign before sample) is to keep the others. Why not extract some elements (no minus sign) thereby keeping those?


Simpler and faster: To sample directly from x is the fastest so far.

randomly_subset_vec_3 <- function(x) {
  sample(x, sample(length(x), 1))
}
Rui Barradas
  • 70,273
  • 8
  • 34
  • 66
  • Thanks! `randomly_subset_vec_2` cuts down processing time from 110 seconds to 53 on my machine. So roughly x2 times faster. Not sure I understood your question about removing vs. extracting. Yes, I agree it's just the same thing. – Emman Nov 01 '21 at 15:25
  • @Emman I mean to remove the minus sign, see the edit. And the 2nd function is 35% faster. – Rui Barradas Nov 01 '21 at 15:36
2

Very rough and I'm not particularly proud of this. I'm sure there is a more elegant way but this ran in the matter of seconds on my machine

> # Make some fake data
> out <- lapply(1:3000000, function(i){sample(LETTERS, 5, replace = FALSE)})
> out[1:5]
[[1]]
[1] "D" "H" "C" "Y" "V"

[[2]]
[1] "M" "E" "H" "G" "S"

[[3]]
[1] "R" "P" "O" "L" "M"

[[4]]
[1] "C" "U" "G" "Q" "X"

[[5]]
[1] "Q" "L" "W" "O" "V"

> # Create list with ids to sample
> id <- lapply(1:3000000, function(i){sample(1:5, sample(1:5, 1), replace = FALSE)})
> id[1:5]
[[1]]
[1] 2

[[2]]
[1] 2 3 4 1 5

[[3]]
[1] 4

[[4]]
[1] 5

[[5]]
[1] 1 2

> # Extract the ids from the original data using the id list.
> # Like I said I'm not particularly proud of this but it gets the job
> # done quick enough on my computer
> out <- lapply(1:3000000, function(i){out[[i]][id[[i]]]})
> out[1:5]
[[1]]
[1] "H"

[[2]]
[1] "E" "H" "G" "M" "S"

[[3]]
[1] "L"

[[4]]
[1] "X"

[[5]]
[1] "Q" "L"
Dason
  • 60,663
  • 9
  • 131
  • 148
  • Thanks. While updating `out` indeed takes just a few seconds, computing `id` takes the majority of time. So overall, on my machine, the entire code you suggested (except the initial creation of `out`) is about 55 seconds. So x2 times faster than my original method. – Emman Nov 01 '21 at 15:41
  • I now wonder whether there's a way to create `id` first as a matrix with random values ranging 1-5, then somehow convert it to a list. – Emman Nov 01 '21 at 19:48
2

It seems that the largest bottleneck is running all the sample calls, so we could try the following. One way, is the solution by Julius Vainora. First, we generate funFast by Rcpp:

library(inline)
library(Rcpp)
src <- 
'
int num = as<int>(size), x = as<int>(n);
Rcpp::NumericVector vx = Rcpp::clone<Rcpp::NumericVector>(x);
Rcpp::NumericVector pr = Rcpp::clone<Rcpp::NumericVector>(prob);
Rcpp::NumericVector rnd = rexp(x) / pr;
for(int i= 0; i<vx.size(); ++i) vx[i] = i;
std::partial_sort(vx.begin(), vx.begin() + num, vx.end(), Comp(rnd));
vx = vx[seq(0, num - 1)] + 1;
return vx;
'
incl <- 
'
struct Comp{
  Comp(const Rcpp::NumericVector& v ) : _v(v) {}
  bool operator ()(int a, int b) { return _v[a] < _v[b]; }
  const Rcpp::NumericVector& _v;
};
'
funFast <- cxxfunction(signature(n = "Numeric", size = "integer", prob = "numeric"),
                       src, plugin = "Rcpp", include = incl)

Then, define an alternative to your randomly_subset_vec using funFast instead of sample:

'randomly_subset_vec_2' <- function(x) {
  range <- length(x)
  probs <- rep(1/range, range)
  
  o <- funFast(range, size = funFast(range, size = 1, prob = probs), prob = probs)
  return(x[-o])
}

tic();obj <- rrapply(object = l, f = randomly_subset_vec_2);toc();
runr
  • 1,142
  • 1
  • 9
  • 25
  • @Emman have you tried the approach? – runr Nov 02 '21 at 00:08
  • Sorry, yes. It runs in 36 seconds on my machine so right now it's the fastest. However, `cxxfunction()` has its own overhead, so overall your solution takes 45 seconds on my machine. I'll need to come up with a proper benchmarking rather than using `tictoc` – Emman Nov 02 '21 at 08:13
  • @Emman cxxfunction only compiles the c++ code. You can compile it once per session or load it already compiled.. – runr Nov 02 '21 at 11:46
  • @Emman in any case, it seems that specifically the sampling function is taking most of the time here, not the design or the way we do the subsetting or anything. See, e.g., ``profvis({ tic();obj <- lapply(l,randomly_subset_vec_2);toc(); })`` here ``profvis::profvis`` provides a flame-graph for the largest bottlenecks. Simplify the original function to make the samping size fixed (instead of random) and the code gets faster, since it does less ``funFast`` calls. Of course, I already assume you are going to parallelize this to all of the cpu cores, instead of current 1? – runr Nov 02 '21 at 12:13
  • But having the sample size random is inherent to the problem. And parallelizing is a good idea! I didn't consider that so far. – Emman Nov 02 '21 at 12:24
1

Maybe we can replace randomly_subset_vec with something simpler with sample and sample.int:

lapply(l, function(x) x[sample.int(5, sample(5, 1))])
GuedesBF
  • 8,409
  • 5
  • 19
  • 37
1

More efficient is probably to replace the many individual sample calls by a single larger sample call. Below is an approach that samples a large logical matrix keep (since l initially has a rectangular format) and keep only the entries for which keep evaluates to TRUE:

system.time({
  keep <- matrix(sample(c(TRUE, FALSE), size = vec_n, replace = TRUE), nrow = 5, ncol = length(l))
  l1 <- lapply(seq_along(l), function(i) l[[i]][keep[, i]])
})

#>    user  system elapsed 
#>   8.667   0.448   9.114

head(l1)

#> [[1]]
#>   HmPsw   Quo3T   8f0GH 
#>       1 6000001 9000001 
#> 
#> [[2]]
#>   2WtYS   ZaHFl   6YjId 
#>       2 3000002 6000002 
#> 
#> [[3]]
#>    xSgZ6    jM5Uw    ujPOc    CTV5F    5JRT5 
#>        3  3000003  6000003  9000003 12000003 
#> 
#> [[4]]
#>    M6z6H 
#> 12000004 
#> 
#> [[5]]
#>    tgdze    w8d1B    FYERK    jlClo    NQfsF 
#>        5  3000005  6000005  9000005 12000005 
#> 
#> [[6]]
#>   hXaH9   CjBwC   Oqqty 
#>       6 6000006 9000006

NB: here the order of the entries in l stays the same (i.e. no resampling), also list elements of l1 are not guaranteed to contain at least one value.

Joris C.
  • 5,721
  • 3
  • 12
  • 27
  • Also, please note the output of `table(lengths(l1))`. We get a non-uniform distribution, suggesting the randomness has a pattern (and thus not entirely random). See @runr's [comment](https://stackoverflow.com/questions/69798599/any-speedier-way-to-randomly-subset-vectors-inside-a-list#comment123398872_69799702). – Emman Nov 02 '21 at 21:47
  • 2
    @Emman, they're still random subsets. This would give binomially distributed vector lengths (with n = 5). It just depends on how you wish to sample. This answer randomly keeps/removes each element with probability 0.5, while the original post randomly samples a uniformly-distributed number of elements for each vector. – jblood94 Nov 02 '21 at 21:57
1

I'm putting this in a new answer so as to not further confuse my previous one.

I noticed from some of the comments that the vectors in l are intended to have all the same lengths (5) and that you may not need l at all. It's also a little unclear whether you want the lengths of l_subsetted to be between 0 and 4 or between 0 and 5. You also seem to be interested in the distribution of the lengths of l_subsetted (uniform vs. binomial).

Below is a generic function if length(unique(lengths(l))) == 1. It subsets directly from my_named_vec without creating l. It pretty consistently runs in the 5-13 second range.

set.seed(123)
vec_n <- 15e6L
my_named_vec <- setNames(1:vec_n, stringi::stri_rand_strings(vec_n, 5))

fSub <- function(nv, vecLen = 5L, maxLen = 5L, unif = FALSE) {
  # subset each named vector from the list l (l is not generated):
  # l <- unname(split(nv, rep_len(seq(length(nv)/vecLen), length(nv))))
  # INPUTS:
  #  nv: named vector whose length is a multiple of vecLen
  #  vecLen: the length of the vectors in l
  #  maxLen: the maximum length of the subsetted vectors
  #  unif: FALSE = binomial subset vector lengths
  #        TRUE = uniform subset vector lengths
  # OUTPUT: a list of named vectors subset from l
  
  nrw <- length(nv)%/%vecLen # length of the output list
  # get all possible logical indices for sampling each vector in l
  mKeep <- as.matrix(expand.grid(rep(list(c(TRUE, FALSE)), vecLen)), ncol = vecLen)
  nKeep <- rowSums(mKeep)
  # remove logical indices that would result in vectors greater than maxLen
  blnKeep <- nKeep <= maxLen
  mKeep <- mKeep[blnKeep,]
  nKeep <- nKeep[blnKeep]
  
  if (unif) {
    # sample mKeep with non-uniform probability in order to get uniform lengths
    iKeep <- sample(length(nKeep), nrw, replace = TRUE, prob = 1/choose(vecLen, nKeep))
  } else {
    iKeep <- sample(length(nKeep), nrw, replace = TRUE)
  }
  
  blnKeep <- c(mKeep[iKeep,])
  l <- rep(list(integer(0L)), nrw)
  l[iKeep != length(nKeep)] <- unname(split(nv[blnKeep], rep(1:nrw, vecLen)[blnKeep]))
  return(l)
}

lbinom5 <- fSub(my_named_vec) # binomial vector lengths (0 to 5)
lunif5 <- fSub(my_named_vec, unif = TRUE) # uniform vector lengths (0 to 5)
lbinom4 <- fSub(my_named_vec, maxLen = 4L) # binomial vector lenghts (0 to 4)
lunif4 <- fSub(my_named_vec, maxLen = 4L, unif = TRUE) # uniform vector lengths (0 to 4)

> microbenchmark::microbenchmark(
+   lbinom5 = {lbinom5 <- fSub(my_named_vec)},
+   lunif5 = {lunif5 <- fSub(my_named_vec, unif = TRUE)},
+   lbinom4 = {lbinom4 <- fSub(my_named_vec, maxLen = 4L)},
+   lunif4 = {lunif4 <- fSub(my_named_vec, maxLen = 4L, unif = TRUE)},
+   times = 10)
Unit: seconds
    expr      min       lq     mean    median       uq      max neval
 lbinom5 5.974837 8.060281 9.192600  9.014967 10.15609 13.01182    10
  lunif5 5.240133 6.618115 9.688577 10.799230 11.44718 12.73518    10
 lbinom4 5.082508 6.497218 8.636434  8.656817 11.40678 11.81519    10
  lunif4 5.468311 6.639423 8.310269  7.919579 10.28546 11.28075    10
jblood94
  • 10,340
  • 1
  • 10
  • 15
0

You can try the code below

lapply(
  l,
  function(x) {
    head(sample(x), sample(length(x), 1))
  }
)
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81