Dataset provided by OP does not expose the complexity of the task. In terms of addressing OP question so far only Mike's answer was the correct one.
In fact for a 8 rolling days, instead of 7 rolling days, due to d <= 0 & d >= -7
.
zoo
solution by @G. Grothendieck is almost valid, only if merge
would be made to each group of ID
.
Below second data.table solution, this time valid results, using dev RcppRoll which allows na.rm=TRUE
.
And slightly formatted Mike's solution output.
data<-as.data.frame(matrix(NA,42,3))
data$V1<-seq(as.Date("2014-05-01"),as.Date("2014-09-01"),by=3)
data$V2<-rep(1:6,7)
data$V3<-rep(c(1,2),21)
colnames(data)<-c("Date","USD","ID")
library(microbenchmark)
library(RcppRoll) # install_github("kevinushey/RcppRoll")
library(data.table) # install_github("Rdatatable/data.table")
correct_jan_dt = function(n, partial=TRUE){
DT = as.data.table(data) # this can be speedup by setDT()
date.range = DT[,range(Date)]
all.dates = seq.Date(date.range[1],date.range[2],by=1)
setkey(DT,ID,Date)
r = DT[CJ(unique(ID),all.dates)][, c("roll") := as.integer(roll_sumr(USD, n, normalize = FALSE, na.rm = TRUE)), by="ID"][!is.na(USD)]
# This could be simplified when `partial` arg will be implemented in [kevinushey/RcppRoll](https://github.com/kevinushey/RcppRoll)
if(isTRUE(partial)){
r[is.na(roll), roll := cumsum(USD), by="ID"][]
}
return(r[order(Date,ID)])
}
correct_mike_dt = function(){
data = as.data.table(data)[,ID2:=.GRP,by=c("ID")]
#Build reference table
Ref <- data[,list(Compare_Value=list(I(USD)),Compare_Date=list(I(Date))), by=c("ID2")]
#Use mapply to get last seven days of value by id
data[, c("roll") := mapply(RD = Date,NUM=ID2, function(RD, NUM){
d <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
sum((d <= 0 & d >= -7)*Ref$Compare_Value[[NUM]])})][,ID2:=NULL][]
}
identical(correct_mike_dt(), correct_jan_dt(n=8,partial=TRUE))
# [1] TRUE
microbenchmark(unit="relative", times=5L, correct_mike_dt(), correct_jan_dt(8))
# Unit: relative
# expr min lq mean median uq max neval
# correct_mike_dt() 274.0699 273.9892 267.2886 266.6009 266.2254 256.7296 5
# correct_jan_dt(8) 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 5
Looking forward for update from @Khashaa.
Edit (20150122.2): Below benchmarks do not answer OP question.
Timing on a bigger (still very tiny) dataset, 5439 rows:
library(zoo)
library(data.table)
library(dplyr)
library(RcppRoll)
library(microbenchmark)
data<-as.data.frame(matrix(NA,5439,3))
data$V1<-seq(as.Date("1970-01-01"),as.Date("2014-09-01"),by=3)
data$V2<-sample(1:6,5439,TRUE)
data$V3<-sample(c(1,2),5439,TRUE)
colnames(data)<-c("Date","USD","ID")
zoo_f = function(){
z <- read.zoo(data)
z0 <- merge(z, zoo(, seq(start(z), end(z), "day")), fill = 0) # expand to daily
roll <- function(x) rollsumr(x, 7, fill = NA)
transform(data, roll = ave(z0$USD, z0$ID, FUN = roll)[time(z)])
}
dt_f = function(){
DT = as.data.table(data) # this can be speedup by setDT()
date.range = DT[,range(Date)]
all.dates = seq.Date(date.range[1],date.range[2],by=1)
setkey(DT,Date)
DT[.(all.dates)
][order(Date), c("roll") := rowSums(setDT(shift(USD, 0:6, NA, "lag")),na.rm=FALSE), by="ID"
][!is.na(ID)]
}
dp_f = function(){
data %>% group_by(ID) %>%
mutate(roll=roll_sum(c(rep(NA,6), USD), 7))
}
dt2_f = function(){
# this can be speedup by setDT()
as.data.table(data)[, c("roll") := roll_sum(c(rep(NA,6), USD), 7), by="ID"][]
}
identical(as.data.table(zoo_f()),dt_f())
# [1] TRUE
identical(setDT(as.data.frame(dp_f())),dt_f())
# [1] TRUE
identical(dt2_f(),dt_f())
# [1] TRUE
microbenchmark(unit="relative", times=20L, zoo_f(), dt_f(), dp_f(), dt2_f())
# Unit: relative
# expr min lq mean median uq max neval
# zoo_f() 140.331889 141.891917 138.064126 139.381336 136.029019 137.730171 20
# dt_f() 14.917166 14.464199 15.210757 16.898931 16.543811 14.221987 20
# dp_f() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
# dt2_f() 1.536896 1.521983 1.500392 1.518641 1.629916 1.337903 20
Yet I'm not sure if my data.table code is already optimal.
Above functions did not answer OP question. Read the top of post for update. Mike's solution was the correct one.