The mean and variance calculations need to be done by group, but the t-test and p-value calculation can be vectorized.
my.t.test.2 <- function(grp, x, y) {
    grp <- factor(grp)
    ## per-group summaries for x
    x_g <- split(x, grp)
    x_n <- lengths(x_g)
    x_mean <- vapply(x_g, mean, numeric(1))
    x_var <- vapply(x_g, var, numeric(1))
    ## per-group summaries for y
    y_g <- split(y, grp)
    y_n <- lengths(y_g)
    y_mean <- vapply(y_g, mean, numeric(1))
    y_var <- vapply(y_g, var, numeric(1))
    ## vectorized Welch t-statistic and Welch-Satterthwaite df
    x_se2 <- x_var / x_n
    y_se2 <- y_var / y_n
    se <- sqrt(x_se2 + y_se2)
    tstat <- (x_mean - y_mean) / se
    df <- se^4 / (x_se2^2 / (x_n - 1L) + y_se2^2 / (y_n - 1L))
    2 * pt(-abs(tstat), df)
}
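A quick sanity check on toy data (my own example, not from the original post) shows it agrees with t.test()'s default Welch test:
set.seed(123)
g <- rep(1:3, each = 5)
a <- rnorm(15); b <- rnorm(15)
## column 1: vectorized version; column 2: per-group t.test()
cbind(
    my.t.test.2(g, a, b),
    vapply(split(seq_along(g), g),
           function(i) t.test(a[i], b[i])$p.value, numeric(1))
)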
One can try to be super clever by avoiding dispatch (the 'reason' given for the slowness of mean()) and minimizing redundant calculation, e.g., of the lengths of each group.
my.t.test.2.1 <- compiler::cmpfun(function(grp, x, y) {
    grp <- factor(grp)
    x_g <- split.default(x, grp)
    ## group sizes computed once; this assumes x and y have the same
    ## number of observations in each group
    n <- lengths(x_g)
    n1 <- n - 1L
    x_mean <- vapply(x_g, mean.default, numeric(1), USE.NAMES = FALSE)
    x_var <- vapply(x_g, var, numeric(1), USE.NAMES = FALSE)
    y_g <- split.default(y, grp)
    y_mean <- vapply(y_g, mean.default, numeric(1), USE.NAMES = FALSE)
    y_var <- vapply(y_g, var, numeric(1), USE.NAMES = FALSE)
    x_se2 <- x_var / n
    y_se2 <- y_var / n
    se <- sqrt(x_se2 + y_se2)
    tstat <- (x_mean - y_mean) / se
    df <- se^4 / ((x_se2^2 + y_se2^2) / n1)
    2 * pt(-abs(tstat), df)
})
The canonical and other solutions can be wrapped to provide the same output
f0 <- function(df)
    df %>% group_by(id) %>% summarize(p.value = t.test(A, B)$p.value)

f1 <- function(df)
    df %>% group_by(id) %>% summarize(p.value = my.t.test(A, B))

f2 <- function(df)
    tibble(id = unique(df$id), p.value = my.t.test.2(df$id, df$A, df$B))

f2.1 <- function(df)
    tibble(id = unique(df$id), p.value = my.t.test.2.1(df$id, df$A, df$B))
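(The test data df1 was defined earlier in the thread and is not shown here; a plausible reconstruction, chosen to match the matrix built for rowttests() below, is 1000 groups of 8 paired observations:)
library(dplyr)
## hypothetical reconstruction of df1 -- not the original definition
set.seed(1)
df1 <- tibble(
    id = rep(1:1000, each = 8),
    A = rnorm(8000, mean = 26, sd = 5),
    B = rnorm(8000, mean = 25, sd = 7)
)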
f2.1() produces the same result as the canonical implementation and is about twice as fast; worrying about the speed of mean(), etc. (f2() vs. f2.1()) seems mostly to be misguided
> all.equal.default(f0(df1), f2.1(df1))
[1] TRUE
> microbenchmark(f0(df1), f1(df1), f2(df1), f2.1(df1), times = 5)
Unit: milliseconds
      expr      min       lq     mean   median       uq      max neval
   f0(df1) 374.2819 379.7749 380.8365 380.0094 381.2368 388.8794     5
   f1(df1) 249.6502 250.2525 251.8813 252.1965 253.3444 253.9630     5
   f2(df1) 154.1152 158.3243 159.8277 159.1076 162.7602 164.8311     5
 f2.1(df1) 151.0032 151.0149 152.3900 152.8105 153.2840 153.8373     5
For me the C++ implementation
my.t.test.cpp <- function(x, y = NULL) {
    nx <- length(x)
    mx <- sum_cpp(x) / nx    # group mean via Rcpp helper
    vx <- var_cpp(x, mx)     # group variance via Rcpp helper
    ny <- length(y)
    my <- sum_cpp(y) / ny
    vy <- var_cpp(y, my)
    ## Welch t-statistic and Welch-Satterthwaite df
    stderrx <- sqrt(vx / nx)
    stderry <- sqrt(vy / ny)
    stderr <- sqrt(stderrx^2 + stderry^2)
    df <- stderr^4 / (stderrx^4 / (nx - 1) + stderry^4 / (ny - 1))
    tstat <- (mx - my) / stderr
    2 * pt(-abs(tstat), df)
}
fcpp <- function(df)
    df %>% group_by(id) %>% summarize(p.value = my.t.test.cpp(A, B))
produces results equal to the canonical and clocks in at about 100 ms.
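(sum_cpp() and var_cpp() were defined earlier in the thread and are not reproduced here; a minimal sketch consistent with how they are called above -- var_cpp() taking a precomputed mean and using the n - 1 denominator -- might be:)
library(Rcpp)
## hypothetical definitions; the originals are elsewhere in the thread
cppFunction('double sum_cpp(NumericVector x) {
    double total = 0;
    for (int i = 0; i < x.size(); ++i)
        total += x[i];
    return total;
}')
cppFunction('double var_cpp(NumericVector x, double mean) {
    double ss = 0;
    for (int i = 0; i < x.size(); ++i)
        ss += (x[i] - mean) * (x[i] - mean);
    return ss / (x.size() - 1);
}')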
Profiling the 2.1 solution shows that most of the time is spent inside var(), where there is a call to stopifnot() as well as an argument-matching call to pmatch()
> var
function (x, y = NULL, na.rm = FALSE, use)
{
    ...
    na.method <- pmatch(use, c("all.obs", "complete.obs", "pairwise.complete.obs",
        "everything", "na.or.complete"))
    ...
    if (is.data.frame(x))
        x <- as.matrix(x)
    else stopifnot(is.atomic(x))
    ...
    .Call(C_cov, x, y, na.method, FALSE)
}
<bytecode: 0x5e1a440>
<environment: namespace:stats>
> Rprof(); x <- my.t.test.2.1(df1$id, df1$A, df1$B); Rprof(NULL); summaryRprof()
$by.self
                      self.time self.pct total.time total.pct
"withCallingHandlers"      0.04    28.57       0.08     57.14
"tryCatchList"             0.04    28.57       0.04     28.57
"vapply"                   0.02    14.29       0.14    100.00
"stopifnot"                0.02    14.29       0.12     85.71
"match.call"               0.02    14.29       0.02     14.29

$by.total
                      total.time total.pct self.time self.pct
"vapply"                    0.14    100.00      0.02     14.29
"my.t.test.2.1"             0.14    100.00      0.00      0.00
"stopifnot"                 0.12     85.71      0.02     14.29
"FUN"                       0.12     85.71      0.00      0.00
"withCallingHandlers"       0.08     57.14      0.04     28.57
"tryCatchList"              0.04     28.57      0.04     28.57
"tryCatch"                  0.04     28.57      0.00      0.00
"match.call"                0.02     14.29      0.02     14.29

$sample.interval
[1] 0.02

$sampling.time
[1] 0.14
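(The per-call overhead is easy to see in isolation; a quick check of my own, timings not shown, compares one call to var() against the underlying C routine:)
x <- rnorm(8)
## var() pays for pmatch() and stopifnot() on every call;
## 4L is the na.method code corresponding to use = "everything"
microbenchmark(
    var(x),
    .Call(stats:::C_cov, x, NULL, 4L, FALSE)
)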
So in the pursuit of speed one might avoid the argument checks and call the C function directly
my.t.test.2.2 <- compiler::cmpfun(function(grp, x, y) {
    ## bypass var()'s pmatch() and stopifnot() overhead by calling the
    ## C routine directly; 4L is the na.method code for use = "everything"
    var <- function(x)
        .Call(stats:::C_cov, x, NULL, 4L, FALSE)
    grp <- factor(grp)
    x_g <- split.default(x, grp)
    n <- lengths(x_g)
    n1 <- n - 1L
    x_mean <- vapply(x_g, mean.default, numeric(1), USE.NAMES = FALSE)
    x_var <- vapply(x_g, var, numeric(1), USE.NAMES = FALSE)
    y_g <- split.default(y, grp)
    y_mean <- vapply(y_g, mean.default, numeric(1), USE.NAMES = FALSE)
    y_var <- vapply(y_g, var, numeric(1), USE.NAMES = FALSE)
    x_se2 <- x_var / n
    y_se2 <- y_var / n
    se <- sqrt(x_se2 + y_se2)
    tstat <- (x_mean - y_mean) / se
    df <- se^4 / ((x_se2^2 + y_se2^2) / n1)
    2 * pt(-abs(tstat), df)
})
f2.2 <- function(df)
    tibble(id = unique(df$id), p.value = my.t.test.2.2(df$id, df$A, df$B))
This turns out to be quite performant, though it relies on stats:::C_cov, an unexported internal that could change between R versions.
> all.equal.default(f0(df1), f2.2(df1))
[1] TRUE
> microbenchmark(
+ f0(df1), f1(df1), f2(df1), f2.1(df1), f2.2(df1), fcpp(df1),
+ times = 5
+ )
Unit: milliseconds
      expr       min        lq      mean    median       uq       max neval
   f0(df1) 378.61985 379.25525 393.38371 379.56797 386.2806 443.19488     5
   f1(df1) 250.99802 252.45281 253.55140 253.34249 255.2801 255.68362     5
   f2(df1) 156.76073 158.63126 159.63693 160.33446 161.2260 161.23216     5
 f2.1(df1) 146.64555 148.28773 151.17250 151.38536 153.9363 155.60751     5
 f2.2(df1)  25.24441  25.62982  27.50898  26.11755  30.0836  30.46951     5
 fcpp(df1) 104.20851 104.50396 105.19383 104.62905 104.7876 107.84006     5
We can use the C++ variance implementation var_cpp() in place of the call to R's C code, with
my.t.test.2.2.cpp <- compiler::cmpfun(function(grp, x, y) {
    grp <- factor(grp)
    x_g <- split.default(x, grp)
    n <- lengths(x_g)
    n1 <- n - 1L
    x_mean <- vapply(x_g, mean.default, numeric(1), USE.NAMES = FALSE)
    x_var <- unlist(Map(var_cpp, x_g, x_mean))  # per-group variance via Rcpp helper
    y_g <- split.default(y, grp)
    y_mean <- vapply(y_g, mean.default, numeric(1), USE.NAMES = FALSE)
    y_var <- unlist(Map(var_cpp, y_g, y_mean))
    x_se2 <- x_var / n
    y_se2 <- y_var / n
    se <- sqrt(x_se2 + y_se2)
    tstat <- (x_mean - y_mean) / se
    df <- se^4 / ((x_se2^2 + y_se2^2) / n1)
    2 * pt(-abs(tstat), df)
})
f2.2.cpp <- function(df)
    tibble(id = unique(df$id), p.value = my.t.test.2.2.cpp(df$id, df$A, df$B))
for comparable performance
> microbenchmark(f2.2(df1), f2.2.cpp(df1), times = 20)
Unit: milliseconds
          expr      min       lq     mean   median       uq      max neval
     f2.2(df1) 25.11237 25.69622 30.27956 26.35570 29.81884 87.34955    20
 f2.2.cpp(df1) 24.88787 25.25171 26.80836 25.43498 29.06338 30.80012    20
I'm not sure which is more of a hack -- writing your own C++ code for the variance, or calling R's C code directly.
A faster C++ solution calculates the group mean and variance in a single call
cppFunction('List doit(IntegerVector group, NumericVector x) {
    // groups are assumed to be coded as consecutive integers 1..n_grp
    int n_grp = 0;
    for (int i = 0; i < group.size(); ++i)
        n_grp = group[i] > n_grp ? group[i] : n_grp;
    // accumulate count, sum, and sum of squares per group in one pass
    std::vector<int> n(n_grp);
    std::vector<double> sum(n_grp), sumsq(n_grp);
    for (int i = 0; i < group.size(); ++i) {
        n[ group[i] - 1 ] += 1;
        sum[ group[i] - 1 ] += x[i];
        sumsq[ group[i] - 1 ] += x[i] * x[i];
    }
    NumericVector mean(n_grp), var(n_grp);
    for (size_t i = 0; i < n.size(); ++i) {
        mean[i] = sum[i] / n[i];
        var[i] = (sumsq[i] - sum[i] * mean[i]) / (n[i] - 1);
    }
    // only n[0] is returned: all groups are assumed to be the same size
    return List::create(_["n"]=n[0], _["mean"]=mean, _["var"]=var);
}')
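A quick sanity check on toy data (my own example; expected values computed by hand):
str(doit(rep(1:2, each = 3), c(1, 2, 3, 10, 20, 30)))
## expected:
## List of 3
##  $ n   : int 3
##  $ mean: num [1:2] 2 20
##  $ var : num [1:2] 1 100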
my.t.test.2.3.cpp <- compiler::cmpfun(function(grp, x, y) {
    x <- doit(grp, x)
    y <- doit(grp, y)
    x_se2 <- x$var / x$n
    y_se2 <- y$var / y$n
    se <- sqrt(x_se2 + y_se2)
    tstat <- (x$mean - y$mean) / se
    df <- se^4 / ((x_se2^2 + y_se2^2) / (x$n - 1L))
    2 * pt(-abs(tstat), df)
})
f2.3.cpp <- function(df)
    tibble(
        id = unique(df$id),
        p.value = my.t.test.2.3.cpp(df$id, df$A, df$B)
    )
and this is fast
> all.equal.default(f0(df1), f2.3.cpp(df1))
[1] TRUE
> microbenchmark(f2.2(df1), f2.2.cpp(df1), f2.3.cpp(df1), times = 50)
Unit: milliseconds
          expr       min        lq      mean    median        uq       max neval
     f2.2(df1) 24.743364 25.445833 28.032135 25.873117 29.191020 88.642771    50
 f2.2.cpp(df1) 24.122380 24.867212 26.012985 25.369963 25.897866 30.783544    50
 f2.3.cpp(df1)  2.831635  2.946094  3.101408  2.992049  3.073788  7.191572    50
Another alternative is rowttests() from the Bioconductor package genefilter, which requires a matrix
set.seed(1)
m1 <- cbind(
    matrix(rnorm(8000, mean = 26, sd = 5), ncol = 8, byrow = TRUE),
    matrix(rnorm(8000, mean = 25, sd = 7), ncol = 8, byrow = TRUE)
)
f4 <- function(m1)
    genefilter::rowttests(m1, factor(rep(1:2, each = 8)))
and is also fast (note that rowttests() computes the classical equal-variance t-statistic rather than the Welch statistic used by t.test() by default, so its p-values are not directly comparable)
> microbenchmark(f2.3.cpp(df1), f4(m1), times=50)
Unit: milliseconds
          expr      min       lq     mean   median       uq      max neval
 f2.3.cpp(df1) 2.760877 2.796542 2.877030 2.845795 2.895441 3.286143    50
        f4(m1) 1.335288 1.359007 1.397601 1.377544 1.412606 1.693340    50
(some of the difference is in creating the tibble).