Assuming your Diff
column is numeric and those blanks really are NA
you could do:
library(data.table)
dt <- data.table(Group = c(1,1,1,1,1,2,2,2,3,3), Diff = c(NA,-16055,-1313, 45707,6569,NA,-7249,2,NA,-384724))
dt[,total := ifelse(seq_len(.N) == .N, sum(Diff, na.rm = T), NA), by = Group]
# Group Diff total
#1: 1 NA NA
#2: 1 -16055 NA
#3: 1 -1313 NA
#4: 1 45707 NA
#5: 1 6569 34908
#6: 2 NA NA
#7: 2 -7249 NA
#8: 2 2 -7247
#9: 3 NA NA
#10: 3 -384724 -384724
Alternatively you can do a merge as per Frank's suggestion in the comments:
dt[dt[, sum(Diff, na.rm=TRUE), by=Group], on=.(Group), mult="last", total := i.V1 ]
Timing Comparison:
To see how the timing compares between the @Frank's merge option, and my original solution, I varied the # groups and # observations (a high and low option for both) and ran a microbenchmark on the 4 separate datasets. The results are below and it looks like in any case other Frank's merge option is the fastest. I think the bottleneck in my solution is the ifelse
and if you could remove that, it would likely be faster although by how much is uncertain.
set.seed(1)
high_grp <- 1:10000; high_obs = 1000000;
low_grp <- 1:100; low_obs = 50000;
low_grp_high_obs <- data.table(Group = sample(low_grp, high_obs, replace = T), Diff = sample(-60000:60000, high_obs, replace = T))
high_grp_high_obs <- data.table(Group = sample(high_grp, high_obs, replace = T), Diff = sample(-60000:60000, high_obs, replace = T))
low_grp_low_obs <- data.table(Group = sample(low_grp, low_obs, replace = T), Diff = sample(-60000:60000, low_obs, replace = T))
high_grp_low_obs <- data.table(Group = sample(high_grp, low_obs, replace = T), Diff = sample(-60000:60000, low_obs, replace = T))
comparison_sets <- list("Low Group; High Obs" = low_grp_high_obs, "High Group; High Obs" = high_grp_high_obs,
"Low Group; Low Obs" = low_grp_low_obs, "High Group; Low Obs" = high_grp_low_obs)
comparison <- lapply(comparison_sets, function(dt) {microbenchmark::microbenchmark(orig = dt[,total := ifelse(seq_len(.N) == .N, sum(Diff, na.rm = T), NA), by = Group],
merge = dt[dt[, sum(Diff, na.rm=TRUE), by=Group], on=.(Group), mult="last", total := i.V1 ])} )
comparison
#$`Low Group; High Obs`
#Unit: milliseconds
# expr min lq mean median uq max neval
# orig 53.16160 58.00227 69.93443 60.08673 62.57489 191.1628 100
# merge 12.93931 15.15634 17.90187 15.56495 18.33738 147.9433 100
#
#$`High Group; High Obs`
#Unit: milliseconds
# expr min lq mean median uq max neval
# orig 143.60222 151.8497 161.65825 154.85638 158.2183 281.2311 100
# merge 23.18698 23.7380 29.20126 24.86465 29.9832 153.7919 100
#
#$`Low Group; Low Obs`
#Unit: milliseconds
# expr min lq mean median uq max neval
# orig 3.047569 3.190157 3.957012 3.378145 3.692857 8.087345 100
# merge 1.685882 1.808594 1.928094 1.846520 1.953369 5.998864 100
#
#$`High Group; Low Obs`
#Unit: milliseconds
# expr min lq mean median uq max neval
# orig 65.903991 68.727469 69.861163 69.857406 70.950330 76.351860 100
# merge 3.418077 3.595673 3.831805 3.855684 3.952869 5.069314 100