I would like to use data.table instead of dplyr for the step
`dplyr::summarize(dplyr::across(ends_with("PV"), median), .groups = 'drop')`.
Also, do you think the processing time would be faster with data.table than with dplyr?
library(dplyr)
library(tidyr)
library(lubridate)
# Synthetic example data: 100,000 rows holding identifier, date, category and
# weekday columns, a reference column DR1, and 365 random integer columns
# named DRM01 ... DRM365.
# NOTE(review): no seed is set, so DR1 and the DRM columns differ on every run.
df1 <- data.frame(
  Id = rep(1:5, length.out = 100000),
  date1 = as.Date("2021-12-01"),
  # 50,000 consecutive days, each repeated twice, gives 100,000 rows.
  date2 = rep(seq(as.Date("2021-01-01"), length.out = 50000, by = 1), each = 2),
  Category = rep(c("ABC", "EFG"), length.out = 100000),
  Week = rep(
    c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
      "Saturday", "Sunday"),
    length.out = 100000
  ),
  # Argument names are written out in full (`replace`, `length.out`) instead
  # of relying on partial argument matching.
  DR1 = sample(200:250, 100000, replace = TRUE),
  # The unnamed list is expanded by data.frame() into one column per element.
  setNames(
    replicate(365, sample(0:100000, 100000), simplify = FALSE),
    # width = 2 pads only to two digits: DRM01 ... DRM99, DRM100 ... DRM365.
    paste0("DRM", formatC(1:365, width = 2, format = "d", flag = "0"))
  )
)
# Columns of df1 whose names start with "DRM".
subsetDRM <- select(df1, starts_with("DRM"))
# Append DR1 minus each DRM column as a new column named "<DRM name>_PV".
DR1_subsetDRM <- cbind(
  df1,
  setNames(df1$DR1 - subsetDRM, paste0(names(subsetDRM), "_PV"))
)
# Keep the identifier columns, DR1, and every column whose name ends in "PV".
subset_PV <- DR1_subsetDRM %>%
  select(Id, date2, Week, Category, DR1, ends_with("PV"))
# Median of every column ending in "PV" for each Id / Category / Week
# combination; `.groups = "drop"` returns the result ungrouped.
result_median <- dplyr::summarize(
  group_by(subset_PV, Id, Category, Week),
  dplyr::across(ends_with("PV"), median),
  .groups = "drop"
)