You could also use data.table:
library(data.table)
DT <- data.table(DF)
DT[, series_new := 1:.N, by = id]
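For example (a small made-up DF, assuming an id column as in the question), this adds a within-group counter while preserving the original row order:
DF <- data.frame(id = c("a", "a", "b", "a", "b"))
DT <- data.table(DF)
DT[, series_new := 1:.N, by = id]
# the three "a" rows get 1, 2, 3 and the two "b" rows get 1, 2,
# in the original row order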
And here is a version using tapply:
DF$series_new <- unlist(tapply(DF$id, DF$id, function(x) 1:length(x)))
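One caveat: unlist(tapply(...)) returns the counters in group order, so this only lines up with the rows if DF is already sorted by id. An order-safe base-R sketch (using ave, which is also benchmarked below) would be:
DF$series_new <- ave(seq_along(DF$id), DF$id, FUN = seq_along)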
Extending @Henrik's comparison above, both data.table and dplyr are quite a bit faster for large data sets.
library(data.table)
library(dplyr)
library(microbenchmark)
df <- data.frame(id = sample(letters[1:3], 100000, replace = TRUE), stringsAsFactors = FALSE)
dt <- data.table(df)
fun_orig <- function(df){
  for (i in unique(df$id)){
    df$series[df$id == i] <- 1:length(df$id[df$id == i])
  }
  df
}
fun_tapply <- function(df){
  df$series <- unlist(tapply(df$id, df$id, function(x) 1:length(x)))
  df
}
fun_ave <- function(df){
  transform(df, series = ave(df$id, df$id, FUN = seq_along))
}
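A small aside on fun_ave: since id is character here, ave() returns the per-group sequence coerced to character. A variant that keeps it numeric (fun_ave2 is just my illustrative name, not what was timed below) would be:
fun_ave2 <- function(df){
  transform(df, series = ave(seq_along(id), id, FUN = seq_along))
}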
fun_dp <- function(df){
  df %>%
    group_by(id) %>%
    mutate(series = seq_along(id))
}
fun_dt <- function(dt) dt[, 1:.N, by = id]
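Note that fun_dt as written returns a new two-column table (id plus the counter as V1) rather than adding a series column to dt; an in-place sketch (fun_dt2 is just an illustrative name, not what was timed below) would be:
fun_dt2 <- function(dt) dt[, series := 1:.N, by = id]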
microbenchmark(fun_dt(dt), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_dt(dt) 2.473253 2.597031 2.771771 3.76307 40.59909 1000
microbenchmark(fun_dp(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_dp(df) 2.71375 2.786829 2.914569 3.081609 40.48445 1000
microbenchmark(fun_orig(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_orig(df) 30.65534 31.93449 32.72991 33.88885 75.13967 1000
microbenchmark(fun_tapply(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_tapply(df) 56.67636 61.72207 66.37193 102.4189 124.6661 1000
microbenchmark(fun_ave(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_ave(df) 97.36992 103.161 107.5007 139.1362 157.9464 1000
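So on this 100,000-row example, data.table (~2.8 ms median) and dplyr (~2.9 ms) are roughly an order of magnitude faster than the original loop (~33 ms), with tapply (~66 ms) and ave (~108 ms) slower still.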