Optimize repeated fisher test over dataframe

Question

I have a code segment that I am trying to optimize to run a bit faster.

df1 <- df %>%
rowwise() %>%
    mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
                                         nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value) %>% 
    filter(oddsRatio > 1 & fisher < pval) %>% 
    mutate(test_direction = binom.test(x = forward, n = counts, p = 0.5)$p.value) %>%
    filter(test_direction < 0.05)

I've been trying to replace the first mutate/filter pair with

df1 <- df %>%
{.$fisher = fisher.test(matrix(c(.$counts, .$nt1_not_t2,
                                     .$nt2_not_t1, .$not_t1_or_t2), nrow = 2, ncol = 2))$p.value; .} %>% 
    vctrs::vec_slice(., .$oddsRatio > 1 & .$fisher < pval)

However the matrix, instead of having 4 values, is trying to use the entire dataset.

Warning message:
In matrix(c(.$counts, .$nt1_not_t2, .$nt2_not_t1, .$not_t1_or_t2),  :
  data length differs from size of matrix: [12184 != 2 x 2]

Not sure if there is some syntax I can add to avoid that, or another more performant way to achieve this.

Any advice would be great

EDIT:: using profvis()

[![profiling][1]][1]

some sample data

df <- structure(list(counts = c(114L, 55L, 57L, 95L, 514L, 65L, 694L, 
28L, 148L, 122L, 240L, 38L, 260L, 65L, 40L, 12L, 32L, 134L, 42L, 
16L, 37L, 33L, 63L, 16L, 20L, 13L, 12L, 17L, 15L, 26L, 548L, 
31L, 467L, 202L, 1696L, 1422L, 219L, 362L, 417L, 1449L, 2142L, 
241L, 128L, 1812L, 3281L, 677L, 1006L, 137L, 67L, 161L), forward = c(61L, 
37L, 32L, 47L, 233L, 43L, 568L, 15L, 75L, 76L, 149L, 27L, 177L, 
34L, 33L, 7L, 22L, 81L, 24L, 8L, 19L, 22L, 55L, 8L, 12L, 8L, 
7L, 10L, 12L, 13L, 407L, 17L, 260L, 119L, 861L, 906L, 144L, 199L, 
195L, 645L, 1223L, 166L, 73L, 844L, 2727L, 341L, 529L, 86L, 36L, 
87L), oddsRatio = c(2.81719639416155, 2.56249627110554, 3.0012284711951, 
3.2379086619481, 3.28262910798122, 1.78506192857701, 1.23683314379245, 
1.47200829293576, 1.60268857356236, 2.54327837666455, 2.65317932754055, 
2.7443152244971, 2.41170031230883, 1.39183344640434, 1.67403290633562, 
2.45502917152859, 1.52227974689146, 1.67590893004601, 1.5285951005419, 
1.88542317834637, 1.64599293496765, 1.23362495081645, 0.884202671972712, 
1.15874072081511, 1.23428571428571, 0.918351141954909, 0.942141312184571, 
1.63698770491803, 1.92011125705014, 1.67230461700034, 6.63377209451277, 
1.98838889786539, 3.21071805754822, 2.3432554142601, 3.03013725938027, 
11.6868669158062, 3.25457032130808, 3.5341881506799, 2.37982532860213, 
11.383770310192, 2.98290705092543, 1.43986166989779, 1.32458721885379, 
1.70316762338472, 1.20465352503506, 1.32048669611991, 1.38391148677588, 
1.70567801561587, 1.11338513259357, 1.18942080378251), not_t1_or_t2 = c(16736L, 
17180L, 17230L, 16989L, 12236L, 16869L, 4192L, 17243L, 15631L, 
16569L, 15416L, 17344L, 14981L, 16665L, 17139L, 17533L, 17201L, 
15903L, 17066L, 16400L, 12543L, 12161L, 4345L, 15346L, 14742L, 
15391L, 15681L, 15977L, 16568L, 14576L, 14024L, 14291L, 13801L, 
14039L, 11687L, 13734L, 14104L, 13968L, 13703L, 13701L, 10576L, 
13757L, 14009L, 9868L, 3491L, 12503L, 11656L, 14064L, 14137L, 
13875L), nt1_not_t2 = c(755L, 814L, 812L, 774L, 355L, 804L, 175L, 
841L, 721L, 747L, 629L, 831L, 609L, 804L, 829L, 857L, 837L, 735L, 
827L, 69L, 48L, 52L, 22L, 69L, 65L, 72L, 73L, 68L, 70L, 59L, 
3609L, 4126L, 3690L, 3955L, 2461L, 2735L, 3938L, 3795L, 3740L, 
2708L, 2015L, 3916L, 4029L, 2345L, 876L, 3480L, 3151L, 4020L, 
4090L, 3996L), nt2_not_t1 = c(897L, 453L, 403L, 644L, 5397L, 
764L, 13441L, 390L, 2002L, 1064L, 2217L, 289L, 2652L, 968L, 494L, 
100L, 432L, 1730L, 567L, 2017L, 5874L, 6256L, 14072L, 3071L, 
3675L, 3026L, 2736L, 2440L, 1849L, 3841L, 321L, 54L, 544L, 306L, 
2658L, 611L, 241L, 377L, 642L, 644L, 3769L, 588L, 336L, 4477L, 
10854L, 1842L, 2689L, 281L, 208L, 470L)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame")) ```


  [1]: https://i.stack.imgur.com/a1ebG.png

I imagine the slow bit here is performing the actual tests, not `mutate` or `filter`. Using different syntax seems unlikely to do much for you here. Have you tried profiling your code? If speed is a real issue, you can run the tests in parallel. — Axeman, Apr 28 '23 at 21:55
Yeah you are right, I did the profiling and its mainly the fisher test...What would you recommend for parallelization? — Dave R, Apr 28 '23 at 22:21
I like `future`, either using `future.apply` or `furrr`. So I would more or less do what Martin shows. — Axeman, Apr 28 '23 at 22:41

score 2 · Accepted Answer · answered Apr 28 '23 at 22:33

Based on Axeman's comment here a small benchmark using different approaches:

library(dplyr)
library(purrr)

my_fisher <- function(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2) {
  fisher.test(
    matrix(
      c(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), 
      nrow = 2, 
      ncol = 2
      )
  )$p.value
}

We apply this function using purrrs pmap-function and using furrrs future_pmap, a parallel version of pmap:

# purrr
df %>% 
  mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))

# furrr, choose workers based on actual hardware
library(furrr)
plan("future::multisession", workers = 4)

df %>% 
  mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))

Benchmarking:

library(microbenchmark)

microbenchmark(
  orig = df %>%
    rowwise() %>%
    mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
                                         nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value),
  purrr = df %>% 
    mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher)),
  furrr = df %>% 
    mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
)

returns

Unit: milliseconds
  expr      min       lq      mean    median       uq      max neval
  orig 108.7756 110.3752 116.08041 113.84570 118.3328 224.2154   100
 purrr 105.9916 108.0498 114.80299 113.86020 116.7421 224.0461   100
 furrr  86.1203  89.5472  93.77818  92.63145  96.1540 122.7134   100

So using furrr seems to give a small advantage regarding time / speed.

Optimize repeated fisher test over dataframe

1 Answers1