Another variant uses rep to subtract, from every element, the cumulative-sum
value at the most recent reset position. This calls cumsum only once, but the
drawback is that the running total is never actually reset: it can grow large
and cause an integer overflow (or inaccurate values for doubles), and an NA
will propagate to all following groups.
# Cumulative sum of DF$b that restarts at every 0, using a single cumsum().
# The running total at each reset position is subtracted from the whole
# segment that starts there; the first segment gets an offset of 0.
x <- cumsum(DF[["b"]])
i <- which(DF[["b"]] == 0)
segment_lengths <- diff(c(1L, i, length(x) + 1L))
segment_offsets <- rep(c(0, x[i]), times = segment_lengths)
x - segment_offsets
#[1] 1 0 1 2
Another way is to use Rcpp - in this case for integer vectors.
# Cumulative sum with reset, compiled with Rcpp (integer input only).
#   x: integer vector to accumulate.
#   z: value that triggers a reset (default 0). At a reset position the
#      running sum restarts at z, or at 0 when z is NA.
# Returns an integer vector of the same length as x.
# Deliberately minimal: NA values in x (other than a reset value of NA) and
# integer overflow of the running sum are not guarded against here.
Rcpp::cppFunction('IntegerVector csrA(const IntegerVector x, int z=0) {
  // R_xlen_t rather than int so the index cannot overflow on long vectors.
  const R_xlen_t n = x.size();
  IntegerVector out(no_init(n));
  const int init = z == NA_INTEGER ? 0 : z;
  int s = 0;
  for(R_xlen_t i = 0; i < n; ++i) {
    if(x[i] == z) s = init;
    else [[likely]] s += x[i];
    out[i] = s;
  }
  return out;
}')
# Apply the simple Rcpp variant to the example column (exact-name extraction).
csrA(DF[["b"]])
#[1] 1 0 1 2
A variant also taking care of NA might look like:
# Cumulative sum with reset that also handles NA, compiled with Rcpp.
#   x: integer vector to accumulate.
#   z: value that triggers a reset (default 0; may be NA). At a reset
#      position the running sum restarts at z, or at 0 when z is NA.
# Returns an integer vector of the same length as x. An NA in x makes the
# running sum NA until the next reset. If the running sum leaves the integer
# range it also becomes NA until the next reset and a warning is raised,
# instead of overflowing (signed overflow is undefined behaviour in C++).
Rcpp::cppFunction('IntegerVector csr(const IntegerVector x, int z=0) {
  // R_xlen_t rather than int so the index cannot overflow on long vectors.
  const R_xlen_t n = x.size();
  IntegerVector out(no_init(n));
  const int init = z == NA_INTEGER ? 0 : z;
  bool overflowed = false;
  int s = 0;
  for(R_xlen_t i = 0; i < n; ++i) {
    // NA is tested per element; no full-length logical mask is allocated.
    const int xi = x[i];
    if(xi == z) s = init;
    else [[likely]] if(xi == NA_INTEGER || s == NA_INTEGER) s = NA_INTEGER;
    else [[likely]] {
      // Add in 64 bits, then check the result still fits an R integer
      // (INT_MIN itself is reserved for NA_INTEGER).
      const long long t = static_cast<long long>(s) + xi;
      if(t > INT_MAX || t < -INT_MAX) {
        s = NA_INTEGER;
        overflowed = true;
      } else {
        s = static_cast<int>(t);
      }
    }
    out[i] = s;
  }
  if(overflowed) warning("integer overflow in csr; NA returned until next reset");
  return out;
}', includes = "#include <climits>")
# Reset at 0.
csr(c(2L, 4L, 3L, 0L, 3L, 5L), z = 0L)
#[1] 2 6 9 0 3 8
# An NA in the input propagates until the next reset.
csr(c(2L, NA, 3L, 0L, 3L, 5L), z = 0L)
#[1] 2 NA NA 0 3 8
# Reset at 1: the running sum restarts at the reset value itself.
csr(c(2L, 4L, 3L, 1L, 3L, 5L), z = 1L)
#[1] 2 6 9 1 4 9
# Reset at NA: the running sum restarts at 0.
csr(c(2L, 4L, 3L, NA, 3L, 5L), z = NA_integer_)
#[1] 2 6 9 0 3 8
Data
# Example data: `b` is the input column; `whatiwant` holds the values the
# snippets in this answer print for it (cumulative sum restarting at each 0).
DF <- data.frame(
  campaign = letters[seq_len(4)],
  date = c("jan", "feb", "march", "april"),
  b = c(1, 0, 1, 1),
  whatiwant = c(1, 0, 1, 2)
)
Benchmark - Based on @David Arenburg
# Reproducible benchmark input: 1e7 integers sampled from 0..100.
set.seed(123)
# Using 1e3 instead of 1e2 would lead to an integer overflow for whichRep
# and cummax.
x <- sample(0:100, size = 1e7, replace = TRUE)
library(data.table)
# Time every approach on the same vector; braced entries are candidates that
# need more than one statement.
bench::mark(
  ave = ave(x, cumsum(x == 0), FUN = cumsum),
  data.table = data.table(x)[, whatiwant := cumsum(x), by = rleid(x == 0L)]$whatiwant,
  cummax = {
    cs <- cumsum(x)
    cs - cummax((x == 0) * cs)
  },
  whichRep = {
    y <- cumsum(x)
    i <- which(x == 0)
    y - rep(c(0, y[i]), diff(c(1L, i, length(x) + 1L)))
  },
  RcppNA = csr(x),
  RcppSimple = csrA(x)
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_…¹
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 ave 1.06s 1.06s 0.945 751.8MB 3.78 1 4 1.06s
2 data.table 199.01ms 266.26ms 3.76 231.9MB 1.88 2 1 532.53ms
3 cummax 90.57ms 93.76ms 10.4 152.6MB 6.92 6 4 578.4ms
4 whichRep 74.5ms 77.05ms 12.9 195.6MB 11.1 7 6 541.63ms
5 RcppNA 39.55ms 40.84ms 24.2 76.3MB 5.60 13 3 536.1ms
6 RcppSimple 29.73ms 30.59ms 32.3 38.1MB 3.80 17 2 526.1ms