
I would like to aggregate a data.table based on intervals of a column (time). The idea here is that each interval should be a separate column with a different name in the output.

I've seen a similar question on SO, but I couldn't get my head around the problem. Help?

Reproducible example

library(data.table)

# sample data
set.seed(1L)
dt <- data.table(id = sample(LETTERS, 50, replace = TRUE),
                 time = sample(60, 50, replace = TRUE),
                 points = sample(1000, 50, replace = TRUE))

# simple summary by `id`
dt[, .(total = sum(points)), by = id]
>     id total
> 1:  J  2058
> 2:  T  1427
> 3:  C  1020

In the desired output, each column would be named after the interval it originates from. For example, with three intervals, say time < 10, time < 20, time < 30, the head of the output should be:

  id | total | subtotal_under10 | subtotal_under20 | subtotal_under30
rafa.pereira

3 Answers


Exclusive Subtotal Categories

set.seed(1L);
N <- 50L;
dt <- data.table(id=sample(LETTERS,N,T),time=sample(60L,N,T),points=sample(1000L,N,T));

breaks <- seq(0L,as.integer(ceiling((max(dt$time)+1L)/10)*10),10L);
cuts <- cut(dt$time,breaks,labels=paste0('subtotal_under',breaks[-1L]),right=F);
res <- dcast(dt[,.(subtotal=sum(points)),.(id,cut=cuts)],id~cut,value.var='subtotal');
res <- res[dt[,.(total=sum(points)),id]][order(id)];
res;

##     id subtotal_under10 subtotal_under20 subtotal_under30 subtotal_under40 subtotal_under50 subtotal_under60 total
##  1:  A               NA               NA              176               NA               NA              512   688
##  2:  B               NA               NA              599               NA               NA               NA   599
##  3:  C              527               NA               NA               NA               NA               NA   527
##  4:  D               NA               NA              174               NA               NA               NA   174
##  5:  E               NA              732              643               NA               NA               NA  1375
##  6:  F              634               NA               NA               NA               NA             1473  2107
##  7:  G               NA               NA             1410               NA               NA               NA  1410
##  8:  I               NA               NA               NA               NA               NA              596   596
##  9:  J              447               NA              640               NA               NA              354  1441
## 10:  K              508               NA               NA               NA               NA              454   962
## 11:  M               NA               14             1358               NA               NA               NA  1372
## 12:  N               NA               NA               NA               NA              730               NA   730
## 13:  O               NA               NA              271               NA               NA              259   530
## 14:  P               NA               NA               NA               NA               78               NA    78
## 15:  Q              602               NA              485               NA              925               NA  2012
## 16:  R               NA              599              357              479               NA               NA  1435
## 17:  S               NA              986              716              865               NA               NA  2567
## 18:  T               NA               NA               NA               NA              105               NA   105
## 19:  U               NA               NA               NA              239             1163              641  2043
## 20:  V               NA              683               NA               NA              929               NA  1612
## 21:  W               NA               NA               NA               NA              229               NA   229
## 22:  X              214              993               NA               NA               NA               NA  1207
## 23:  Y               NA              130              992               NA               NA               NA  1122
## 24:  Z               NA               NA               NA               NA              104               NA   104
##     id subtotal_under10 subtotal_under20 subtotal_under30 subtotal_under40 subtotal_under50 subtotal_under60 total

Cumulative Subtotal Categories

I've come up with a new solution based on the requirement of cumulative subtotals.

My objective was to avoid looping constructs such as lapply(), since I realized it should be possible to compute the desired result using only vectorized operations such as findInterval(), cumulative operations such as cumsum(), and vector indexing.

I succeeded, but I should warn you that the algorithm is fairly intricate in its logic. I'll try to explain it below.

breaks <- seq(0L,as.integer(ceiling((max(dt$time)+1L)/10)*10),10L);
ints <- findInterval(dt$time,breaks);
res <- dt[,{
    y <- ints[.I];
    o <- order(y);
    y <- y[o];
    w <- which(c(y[-length(y)]!=y[-1L],T));
    v <- rep(c(NA,w),diff(c(1L,y[w],length(breaks))));
    c(sum(points),as.list(cumsum(points[o])[v]));
},id][order(id)];
setnames(res,2:ncol(res),c('total',paste0('subtotal_under',breaks[-1L])));
res;
##     id total subtotal_under10 subtotal_under20 subtotal_under30 subtotal_under40 subtotal_under50 subtotal_under60
##  1:  A   688               NA               NA              176              176              176              688
##  2:  B   599               NA               NA              599              599              599              599
##  3:  C   527              527              527              527              527              527              527
##  4:  D   174               NA               NA              174              174              174              174
##  5:  E  1375               NA              732             1375             1375             1375             1375
##  6:  F  2107              634              634              634              634              634             2107
##  7:  G  1410               NA               NA             1410             1410             1410             1410
##  8:  I   596               NA               NA               NA               NA               NA              596
##  9:  J  1441              447              447             1087             1087             1087             1441
## 10:  K   962              508              508              508              508              508              962
## 11:  M  1372               NA               14             1372             1372             1372             1372
## 12:  N   730               NA               NA               NA               NA              730              730
## 13:  O   530               NA               NA              271              271              271              530
## 14:  P    78               NA               NA               NA               NA               78               78
## 15:  Q  2012              602              602             1087             1087             2012             2012
## 16:  R  1435               NA              599              956             1435             1435             1435
## 17:  S  2567               NA              986             1702             2567             2567             2567
## 18:  T   105               NA               NA               NA               NA              105              105
## 19:  U  2043               NA               NA               NA              239             1402             2043
## 20:  V  1612               NA              683              683              683             1612             1612
## 21:  W   229               NA               NA               NA               NA              229              229
## 22:  X  1207              214             1207             1207             1207             1207             1207
## 23:  Y  1122               NA              130             1122             1122             1122             1122
## 24:  Z   104               NA               NA               NA               NA              104              104
##     id total subtotal_under10 subtotal_under20 subtotal_under30 subtotal_under40 subtotal_under50 subtotal_under60

Explanation

breaks <- seq(0L,as.integer(ceiling((max(dt$time)+1L)/10)*10),10L);
breaks <- seq(0,ceiling(max(dt$time)/10)*10,10); ## old derivation, for reference

First, we derive breaks as before. I should mention that I realized there was a subtle bug in my original derivation algorithm. Namely, if the maximum time value is a multiple of 10, then the derived breaks vector would've been short by 1. Consider if we had a maximum time value of 60. The original calculation of the upper limit of the sequence would've been ceiling(60/10)*10, which is just 60 again. But it should be 70, since the value 60 technically belongs in the 60 <= time < 70 interval. I fixed this in the new code (and retroactively amended the old code) by adding 1 to the maximum time value when computing the upper limit of the sequence. I also changed two of the literals to integers and added an as.integer() coercion to preserve integerness.
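
For concreteness, here is a minimal sketch of the off-by-one case, using a hypothetical maximum time value of 60 (not taken from the sample data):

max_time <- 60L;                                        ## hypothetical maximum time value
seq(0,ceiling(max_time/10)*10,10);                      ## old:  0 10 20 30 40 50 60  (no interval for 60 itself)
seq(0L,as.integer(ceiling((max_time+1L)/10)*10),10L);   ## new:  0 10 20 30 40 50 60 70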


ints <- findInterval(dt$time,breaks);

Second, we precompute the interval indexes into which each time value falls. We can precompute this once for the entire table, because we'll be able to index out each id group's subset within the j argument of the subsequent data.table indexing operation. Note that findInterval() behaves perfectly for our purposes using the default arguments; we don't need to mess with rightmost.closed, all.inside, or left.open. This is because findInterval() by default uses lower <= value < upper logic, and it's impossible for values to fall below the lowest break (which is zero) or on or above the highest break (which must be greater than the maximum time value because of the way we derived it).
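
A quick illustration of that default behaviour, using a hypothetical breaks vector and a few arbitrary time values:

b <- seq(0L,70L,10L);                  ## hypothetical breaks covering 0 <= time < 70
findInterval(c(0L,9L,10L,59L,60L),b);
## [1] 1 1 2 6 7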


res <- dt[,{
    y <- ints[.I];
    o <- order(y);
    y <- y[o];
    w <- which(c(y[-length(y)]!=y[-1L],T));
    v <- rep(c(NA,w),diff(c(1L,y[w],length(breaks))));
    c(sum(points),as.list(cumsum(points[o])[v]));
},id][order(id)];

Third, we compute the aggregation using a data.table indexing operation, grouping by id. (Afterward we sort by id using a chained indexing operation, but that's not significant.) The j argument consists of 6 statements executed in a braced block which I will now explain one at a time.

y <- ints[.I];

This pulls out the interval indexes for the current id group in input order.

o <- order(y);

This captures the order of the group's records by interval. We will need this order for the cumulative summation of points, as well as for deriving which indexes in that cumulative sum represent the desired interval subtotals. Note that the within-interval orders (i.e. ties) are irrelevant, since we're only going to extract the final subtotal of each interval, which will be the same regardless of how order() breaks ties.

y <- y[o];

This actually reorders y to interval order.

w <- which(c(y[-length(y)]!=y[-1L],T));

This computes the endpoints of each run of interval indexes, in other words the indexes of only those elements that are the final element of their interval. This vector will always contain at least one index, will never contain more indexes than there are intervals, and its elements will be unique.

v <- rep(c(NA,w),diff(c(1L,y[w],length(breaks))));

This repeats each element of w according to its distance (measured in intervals) from the element that follows it. We use diff() on y[w] to compute these distances, appending a length(breaks) element so that the final element of w is handled properly. We also need to cover the case where the first interval (and zero or more subsequent intervals) is not represented in the group, in which case we must pad with NAs; this requires prepending an NA to w and prepending a 1 to the argument vector of diff().

c(sum(points),as.list(cumsum(points[o])[v]));

Finally, we can compute the group aggregation result. Since you want a total column and then separate subtotal columns, we need a list starting with the total aggregation, followed by one list component per subtotal value. points[o] gives us the target summation operand in interval order, which we then cumulatively sum, and then index with v to produce the correct sequence of cumulative subtotals. We must coerce the vector to a list using as.list(), and then prepend the list with the total aggregation, which is simply the sum of the entire points vector. The resulting list is then returned from the j expression.
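
To make the six statements concrete, here is a hand trace on a hypothetical group. The interval indexes and points below are invented for illustration and do not come from the sample data; breaks is assumed to be seq(0L,60L,10L), matching the columns shown above.

breaks <- seq(0L,60L,10L);                          ## 7 breaks, i.e. 6 intervals
y <- c(6L,3L,6L);                                   ## this group's interval indexes, input order (hypothetical)
points <- c(512L,176L,100L);                        ## this group's points, input order (hypothetical)
o <- order(y);                                      ## 2 1 3
y <- y[o];                                          ## 3 6 6
w <- which(c(y[-length(y)]!=y[-1L],T));             ## 1 3
v <- rep(c(NA,w),diff(c(1L,y[w],length(breaks))));  ## NA NA 1 1 1 3
cumsum(points[o])[v];                               ## NA NA 176 176 176 788
sum(points);                                        ## 788 (the total column)

The six values NA NA 176 176 176 788 become subtotal_under10 through subtotal_under60 for this group, showing the cumulative pattern seen in the output above.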


setnames(res,2:ncol(res),c('total',paste0('subtotal_under',breaks[-1L])));

Last, we set the column names. It is faster to set them once after the fact than to have them set repeatedly inside the j expression.


Benchmarking

For benchmarking, I wrapped my code in a function, and did the same for Mike's code. I decided to make my breaks variable a parameter with its derivation as the default argument, and I did the same for Mike's my_nums variable, but without a default argument.

Also note that for the identical() proofs-of-equivalence, I coerce the two results to matrix, because Mike's code always computes the total and subtotal columns as doubles, whereas my code preserves the type of the input points column (i.e. integer if it was integer, double if it was double). Coercing to matrix was the easiest way I could think of to verify that the actual data is equivalent.


library(data.table);
library(microbenchmark);

bgoldst <- function(dt,breaks=seq(0L,as.integer(ceiling((max(dt$time)+1L)/10)*10),10L)) { ints <- findInterval(dt$time,breaks); res <- dt[,{ y <- ints[.I]; o <- order(y); y <- y[o]; w <- which(c(y[-length(y)]!=y[-1L],T)); v <- rep(c(NA,w),diff(c(1L,y[w],length(breaks)))); c(sum(points),as.list(cumsum(points[o])[v])); },id][order(id)]; setnames(res,2:ncol(res),c('total',paste0('subtotal_under',breaks[-1L]))); res; };
mike <- function(dt,my_nums) { cols <- sapply(1:length(my_nums),function(x){return(paste0("subtotal_under",my_nums[x]))}); dt[,(cols) := lapply(my_nums,function(x) ifelse(time<x,points,NA))]; dt[,total := points]; dt[,lapply(.SD,function(x){ if (all(is.na(x))){ as.numeric(NA) } else{ as.numeric(sum(x,na.rm=TRUE)) } }),by=id, .SDcols=c("total",cols) ][order(id)]; };

## OP's sample input
set.seed(1L);
N <- 50L;
dt <- data.table(id=sample(LETTERS,N,T),time=sample(60L,N,T),points=sample(1000L,N,T));

identical(as.matrix(bgoldst(copy(dt))),as.matrix(mike(copy(dt),c(10,20,30,40,50,60))));
## [1] TRUE

microbenchmark(bgoldst(copy(dt)),mike(copy(dt),c(10,20,30,40,50,60)));
## Unit: milliseconds
##                                       expr      min       lq     mean   median       uq      max neval
##                          bgoldst(copy(dt)) 3.281380 3.484301 3.793532 3.588221 3.780023 6.322846   100
##  mike(copy(dt), c(10, 20, 30, 40, 50, 60)) 3.243746 3.442819 3.731326 3.526425 3.702832 5.618502   100

Mike's code is actually faster (usually) by a small amount for the OP's sample input.


## large input 1
set.seed(1L);
N <- 1e5L;
dt <- data.table(id=sample(LETTERS,N,T),time=sample(60L,N,T),points=sample(1000L,N,T));

identical(as.matrix(bgoldst(copy(dt))),as.matrix(mike(copy(dt),c(10,20,30,40,50,60,70))));
## [1] TRUE

microbenchmark(bgoldst(copy(dt)),mike(copy(dt),c(10,20,30,40,50,60,70)));
## Unit: milliseconds
##                                           expr      min       lq      mean   median        uq       max neval
##                              bgoldst(copy(dt)) 19.44409 19.96711  22.26597 20.36012  21.26289  62.37914   100
##  mike(copy(dt), c(10, 20, 30, 40, 50, 60, 70)) 94.35002 96.50347 101.06882 97.71544 100.07052 146.65323   100

For this much larger input, my code significantly outperforms Mike's.

In case you're wondering why I had to add the 70 to Mike's my_nums argument, it's because with so many more records, the probability of getting a 60 in the random generation of dt$time is extremely high, which requires the additional interval. You can see that the identical() call gives TRUE, so this is correct.


## large input 2
set.seed(1L);
N <- 1e6L;
dt <- data.table(id=sample(LETTERS,N,T),time=sample(60L,N,T),points=sample(1000L,N,T));

identical(as.matrix(bgoldst(copy(dt))),as.matrix(mike(copy(dt),c(10,20,30,40,50,60,70))));
## [1] TRUE

microbenchmark(bgoldst(copy(dt)),mike(copy(dt),c(10,20,30,40,50,60,70)));
## Unit: milliseconds
##                                           expr       min        lq      mean    median        uq       max neval
##                              bgoldst(copy(dt))  204.8841  207.2305  225.0254  210.6545  249.5497  312.0077   100
##  mike(copy(dt), c(10, 20, 30, 40, 50, 60, 70)) 1039.4480 1086.3435 1125.8285 1116.2700 1158.4772 1412.6840   100

For this even larger input, the performance difference is slightly more pronounced.

bgoldst
  • you get a point from me for the right idea, but this code is very poorly formatted and almost impossible to read and understand - at the very least add some spaces; the total part is also very weird - I have to say the +1 is pretty tenuous – eddi May 17 '16 at 23:02
  • @bgoldst, thank you for your answer but it is not giving the expected output. Check with @MikeyMike's output for a comparison. – rafa.pereira May 18 '16 at 10:51
  • There are 4 differences between our output: (1) Mike doesn't actually aggregate; he duplicates the aggregation values for each applicable id, whereas I aggregate into a one-row-per-id table; (2) I derive more subtotal columns to fully cover the range of the input values; (3) he fills with zero, I fill with NA; and (4) he interpreted that you wanted a cumulative subtotal, whereas I assumed you wanted exclusive subtotal categories (e.g. I thought `subtotal_under20` means `10 <= time < 20`). In hindsight, I shouldn't have made this assumption, and his interpretation now seems more logical to me. – bgoldst May 18 '16 at 11:21
  • Please comment if you want me to change any aspect of my solution, and if so, how. – bgoldst May 18 '16 at 11:21
  • I prefer `NA`'s to `0`s in this case. The crucial part here is to consider cumulative subtotals. – rafa.pereira May 18 '16 at 16:59
  • @bgoldst, thanks for such a detailed answer. Both solutions (yours and MikeyMike's) are really helpful. I think MikeyMike's answer is more intuitive and thus preferable in most situations (I gave him a vote). However, I'm working with a large dataset - 70 million rows. Your solution worked it out in 42 sec, while MikeyMike's made R collapse due to memory limits (I have 16 GB of RAM) – rafa.pereira May 19 '16 at 09:10
  • Great explanation. I think it's my all(is.na()) call that is taking a really long time no? Anyway you got my vote. – Mike H. May 19 '16 at 11:34
  • @MikeyMike Thanks! I'm not sure, but I think the memory issue with your solution may be related to the assignment of the subtotal columns prior to aggregating, which will require a lot of memory if there are many rows in the input table. The `lapply()` and `all(is.na())` calls probably won't be too hard on memory or processing time, because there are not too many iterations of the hidden loop. But your solution definitely beats mine on concision and intuitiveness. +1 for you too! – bgoldst May 19 '16 at 11:41
  • Ahhh I see that makes sense. Thanks! – Mike H. May 19 '16 at 12:17

I'm pretty sure something like this might work as well:

# sample data
set.seed(1)
dt <- data.table( id= sample(LETTERS,50,replace=TRUE),
                  time= sample(60,50,replace=TRUE),
                  points= sample(1000,50,replace=TRUE))

#Input numbers
my_nums <- c(10,20,30)

#Defining columns
cols <- sapply(1:length(my_nums),function(x){return(paste0("subtotal_under",my_nums[x]))})
dt[,(cols) := lapply(my_nums,function(x) ifelse(time<x,points,NA))]
dt[,total := sum((points)),by=id]
dt[,(cols):= lapply(.SD,sum,na.rm=TRUE),by=id, .SDcols=cols ]

head(dt)
   id time points subtotal_under10 subtotal_under20 subtotal_under30 total
1:  G   29    655                0                0             1410  1410
2:  J   52    354              447              447             1087  1441
3:  O   27    271                0                0              271   530
4:  X   15    993              214             1207             1207  1207
5:  F    5    634              634              634              634  2107
6:  X    6    214              214             1207             1207  1207

Edit: To aggregate columns, you can simply change to:

#Defining columns
cols <- sapply(1:length(my_nums),function(x){return(paste0("subtotal_under",my_nums[x]))})
dt[,(cols) := lapply(my_nums,function(x) ifelse(time<x,points,NA))]
dt[,total := points]
dt[,lapply(.SD,function(x){
                          if (all(is.na(x))){
                            as.numeric(NA)
                          } else{
                            as.numeric(sum(x,na.rm=TRUE))
                          }
          }),by=id, .SDcols=c("total",cols) ]

This should give the expected output of 1 row per ID.

Edit: Per the OP's comment below, changed so that 0s are NA, and changed so that an as.numeric() call isn't needed when building the columns.

Mike H.

After thinking about this for a while, I think I've arrived at a very simple and fast solution based on a conditional sum! The small problem is that I haven't figured out how to automate this code to create a larger number of columns without having to write each of them by hand. Any help here would be really welcome! (One possible way to automate it is sketched after the code below.)

library(data.table)

dt[, .( total = sum(points)
        , subtotal_under10 = sum(points[which( time < 10)])
        , subtotal_under20 = sum(points[which( time < 20)])
        , subtotal_under30 = sum(points[which( time < 30)])
        , subtotal_under40 = sum(points[which( time < 40)])
        , subtotal_under50 = sum(points[which( time < 50)])
        , subtotal_under60 = sum(points[which( time < 60)])), by=id][order(id)]
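
As a sketch only (not part of the original answer), one way the column creation might be automated is to build the list of conditional sums programmatically inside j; the cutoffs vector and its values below are assumed for illustration:

cutoffs <- seq(10L, 60L, 10L)
dt[, c(.(total = sum(points)),
       setNames(lapply(cutoffs, function(x) sum(points[which(time < x)])),
                paste0("subtotal_under", cutoffs))),
   by = id][order(id)]

This keeps the conditional-sum logic of the answer while letting the number of subtotal columns follow the length of cutoffs.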

microbenchmark

Using the same benchmark proposed by @bgoldst in another answer, this simple solution is much faster than the alternatives:

set.seed(1L)
N <- 1e6L
dt <- data.table(id=sample(LETTERS,N,T),time=sample(60L,N,T),points=sample(1000L,N,T))

library(microbenchmark)
microbenchmark(rafa(copy(dt)),bgoldst(copy(dt)),mike(copy(dt),c(10,20,30,40,50,60)))

#                                           expr    min     lq   mean median      uq     max neval cld
#                                 rafa(copy(dt))  95.79 102.45 117.25 110.09  116.95  278.50   100 a  
#                              bgoldst(copy(dt)) 192.53 201.85 211.04 207.50  213.26  354.17   100  b 
#      mike(copy(dt), c(10, 20, 30, 40, 50, 60)) 844.80 890.53 955.29 921.27 1041.96 1112.18   100   c
rafa.pereira