I have a code segment that I am trying to optimize to run a bit faster.
df1 <- df %>%
rowwise() %>%
mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value) %>%
filter(oddsRatio > 1 & fisher < pval) %>%
mutate(test_direction = binom.test(x = forward, n = counts, p = 0.5)$p.value) %>%
filter(test_direction < 0.05)
I've been trying to replace the first mutate/filter pair with
df1 <- df %>%
{.$fisher = fisher.test(matrix(c(.$counts, .$nt1_not_t2,
.$nt2_not_t1, .$not_t1_or_t2), nrow = 2, ncol = 2))$p.value; .} %>%
vctrs::vec_slice(., .$oddsRatio > 1 & .$fisher < pval)
However the matrix, instead of having 4 values, is trying to use the entire dataset.
Warning message:
In matrix(c(.$counts, .$nt1_not_t2, .$nt2_not_t1, .$not_t1_or_t2), :
data length differs from size of matrix: [12184 != 2 x 2]
Not sure if there is some syntax I can add to avoid that, or another more performant way to achieve this.
Any advice would be great
EDIT:: using profvis()
[![profiling][1]][1]
some sample data
df <- structure(list(counts = c(114L, 55L, 57L, 95L, 514L, 65L, 694L,
28L, 148L, 122L, 240L, 38L, 260L, 65L, 40L, 12L, 32L, 134L, 42L,
16L, 37L, 33L, 63L, 16L, 20L, 13L, 12L, 17L, 15L, 26L, 548L,
31L, 467L, 202L, 1696L, 1422L, 219L, 362L, 417L, 1449L, 2142L,
241L, 128L, 1812L, 3281L, 677L, 1006L, 137L, 67L, 161L), forward = c(61L,
37L, 32L, 47L, 233L, 43L, 568L, 15L, 75L, 76L, 149L, 27L, 177L,
34L, 33L, 7L, 22L, 81L, 24L, 8L, 19L, 22L, 55L, 8L, 12L, 8L,
7L, 10L, 12L, 13L, 407L, 17L, 260L, 119L, 861L, 906L, 144L, 199L,
195L, 645L, 1223L, 166L, 73L, 844L, 2727L, 341L, 529L, 86L, 36L,
87L), oddsRatio = c(2.81719639416155, 2.56249627110554, 3.0012284711951,
3.2379086619481, 3.28262910798122, 1.78506192857701, 1.23683314379245,
1.47200829293576, 1.60268857356236, 2.54327837666455, 2.65317932754055,
2.7443152244971, 2.41170031230883, 1.39183344640434, 1.67403290633562,
2.45502917152859, 1.52227974689146, 1.67590893004601, 1.5285951005419,
1.88542317834637, 1.64599293496765, 1.23362495081645, 0.884202671972712,
1.15874072081511, 1.23428571428571, 0.918351141954909, 0.942141312184571,
1.63698770491803, 1.92011125705014, 1.67230461700034, 6.63377209451277,
1.98838889786539, 3.21071805754822, 2.3432554142601, 3.03013725938027,
11.6868669158062, 3.25457032130808, 3.5341881506799, 2.37982532860213,
11.383770310192, 2.98290705092543, 1.43986166989779, 1.32458721885379,
1.70316762338472, 1.20465352503506, 1.32048669611991, 1.38391148677588,
1.70567801561587, 1.11338513259357, 1.18942080378251), not_t1_or_t2 = c(16736L,
17180L, 17230L, 16989L, 12236L, 16869L, 4192L, 17243L, 15631L,
16569L, 15416L, 17344L, 14981L, 16665L, 17139L, 17533L, 17201L,
15903L, 17066L, 16400L, 12543L, 12161L, 4345L, 15346L, 14742L,
15391L, 15681L, 15977L, 16568L, 14576L, 14024L, 14291L, 13801L,
14039L, 11687L, 13734L, 14104L, 13968L, 13703L, 13701L, 10576L,
13757L, 14009L, 9868L, 3491L, 12503L, 11656L, 14064L, 14137L,
13875L), nt1_not_t2 = c(755L, 814L, 812L, 774L, 355L, 804L, 175L,
841L, 721L, 747L, 629L, 831L, 609L, 804L, 829L, 857L, 837L, 735L,
827L, 69L, 48L, 52L, 22L, 69L, 65L, 72L, 73L, 68L, 70L, 59L,
3609L, 4126L, 3690L, 3955L, 2461L, 2735L, 3938L, 3795L, 3740L,
2708L, 2015L, 3916L, 4029L, 2345L, 876L, 3480L, 3151L, 4020L,
4090L, 3996L), nt2_not_t1 = c(897L, 453L, 403L, 644L, 5397L,
764L, 13441L, 390L, 2002L, 1064L, 2217L, 289L, 2652L, 968L, 494L,
100L, 432L, 1730L, 567L, 2017L, 5874L, 6256L, 14072L, 3071L,
3675L, 3026L, 2736L, 2440L, 1849L, 3841L, 321L, 54L, 544L, 306L,
2658L, 611L, 241L, 377L, 642L, 644L, 3769L, 588L, 336L, 4477L,
10854L, 1842L, 2689L, 281L, 208L, 470L)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame")) ```
[1]: https://i.stack.imgur.com/a1ebG.png