Only the X4 column update depends on previous values, so the loop can be mostly 'vectorized' (with a small optimization that avoids adding 1 to rind in each iteration) as
rind1 <- rind + 1L
for (i in seq_len(N))
    x$X4[rind1[i]] <- x$X4[rind1[i]] + x$X4[rind[i]]
x$X5[rind1] <- x$X4[rind1] * x$X3[rind1]
x$X5[rind1] <- trunc(x$X5[rind1] * 10^8) / 10^8
x$X1[rind] <- NA
na.omit(x)
X4 is a numeric vector, and the update can be made more efficient by operating on it directly rather than on a column of a data.frame
X4 <- x$X4
for (i in seq_len(N))
    X4[rind1[i]] <- X4[rind1[i]] + X4[rind[i]]
x$X4 <- X4
For comparison, we have
f0 <- function(nrow) {
    set.seed(123)
    df <- data.frame(replicate(5, runif(nrow)))
    df[,1:3] <- round(df[,1:3])
    x <- df; N <- nrow(df); i <- 1
    ind <- df[1:(N-1),1:3] == df[2:N,1:3]
    rind <- which(apply(ind,1,all))
    N <- length(rind)
    while(i <= N)
    {
        x$X4[rind[i]+1] <- x$X4[rind[i]+1] + x$X4[rind[i]]
        x$X5[rind[i]+1] <- x$X4[rind[i]+1] * x$X3[rind[i]+1]
        x$X5[rind[i]+1] <- trunc(x$X5[rind[i]+1]*10^8)/10^8
        x$X1[rind[i]] <- NA
        i <- i + 1
    }
    na.omit(x)
}
f1a <- function(nrow) {
    set.seed(123)
    df <- data.frame(replicate(5, runif(nrow)))
    df[,1:3] <- round(df[,1:3])
    x <- df; N <- nrow(df)
    ind <- df[1:(N-1),1:3] == df[2:N,1:3]
    rind <- which(apply(ind,1,all))
    rind1 <- rind + 1L
    for (i in seq_along(rind))
        x$X4[rind1[i]] <- x$X4[rind1[i]] + x$X4[rind[i]]
    x$X5[rind1] <- x$X4[rind1] * x$X3[rind1]
    x$X5[rind1] <- trunc(x$X5[rind1] * 10^8) / 10^8
    x$X1[rind] <- NA
    na.omit(x)
}
f4a <- function(nrow) {
    set.seed(123)
    df <- data.frame(replicate(5, runif(nrow)))
    df[,1:3] <- round(df[,1:3])
    x <- df; N <- nrow(df)
    ind <- df[1:(N-1),1:3] == df[2:N,1:3]
    rind <- which(apply(ind,1,all))
    rind1 <- rind + 1L
    X4 <- x$X4
    for (i in seq_along(rind))
        X4[rind1[i]] <- X4[rind1[i]] + X4[rind[i]]
    x$X4 <- X4
    x$X1[rind] <- NA
    x$X5[rind1] <- X4[rind1] * x$X3[rind1]
    x$X5[rind1] <- trunc(x$X5[rind1] * 10^8) / 10^8
    na.omit(x)
}
The results are the same
> identical(f0(1000), f1a(1000))
[1] TRUE
> identical(f0(1000), f4a(1000))
[1] TRUE
The speedup is substantial (using library(microbenchmark))
> microbenchmark(f0(10000), f1a(10000), f4a(10000), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
f0(10000) 346.35906 354.37637 361.15188 363.71627 366.74944 373.88275 10
f1a(10000) 124.71766 126.43532 127.99166 127.39257 129.51927 133.01573 10
f4a(10000) 41.70401 42.48141 42.90487 43.00584 43.32059 43.83757 10
The reason for the difference can be seen when R has been compiled with memory profiling enabled --
> tracemem(x)
[1] "<0x39d93a8>"
> tracemem(x$X4)
[1] "<0x6586e40>"
> x$X4[1] <- 1
tracemem[0x39d93a8 -> 0x39d9410]:
tracemem[0x6586e40 -> 0x670d870]:
tracemem[0x39d9410 -> 0x39d9478]:
tracemem[0x39d9478 -> 0x39d94e0]: $<-.data.frame $<-
tracemem[0x39d94e0 -> 0x39d9548]: $<-.data.frame $<-
>
Each line indicates a memory copy, so updating a single cell of the data frame incurs 5 copies of the outer structure or of the column vector itself. In contrast, a vector can be updated essentially without copies.
> tracemem(X4)
[1] "<0xdd44460>"
> X4[1] = 1
tracemem[0xdd44460 -> 0x9d26c10]:
> X4[1] = 2
>
(The first assignment is expensive because it duplicates the data.frame column; subsequent updates happen in place, because only X4 refers to the vector being updated and it therefore does not need to be duplicated.)
The data.frame implementation does seem to scale non-linearly
> microbenchmark(f1a(100), f1a(1000), f1a(10000), f1a(100000), times=10)
Unit: milliseconds
       expr         min          lq        mean      median          uq         max neval
   f1a(100)    2.372266    2.479458    2.551568    2.524818    2.640244    2.727221    10
  f1a(1000)   10.831288   11.100009   11.210483   11.194863   11.432533   11.581644    10
 f1a(10000)  130.011104  138.686445  139.556787  141.138329  141.522686  147.993499    10
 f1a(1e+05) 4092.439956 4117.818817 4145.809235 4143.634663 4172.282888 4216.129732    10
The reason is apparent in the second line of the tracemem output above -- updating a row triggers a copy of the entire column. So the algorithm scales as the number of rows to update times the length of the column, i.e., approximately quadratically.
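As a rough sanity check (my own arithmetic on the median timings reported above, not an additional benchmark), the ratio of successive medians should head toward 100x per 10-fold increase in nrow if the cost is really close to quadratic:
med <- c(`100`=2.52, `1000`=11.2, `10000`=141, `1e+05`=4144)  # f1a() medians above, in ms
med[-1] / med[-length(med)]   # roughly 4.4, 12.6, 29: growing toward quadratic as fixed costs are amortized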
f4a() appears to scale linearly
> microbenchmark(f4a(100), f4a(1000), f4a(10000), f4a(100000), f4a(1e6), times=10)
Unit: milliseconds
       expr         min          lq        mean      median          uq         max neval
   f4a(100)    1.741458    1.756095    1.827886    1.773887    1.929943    2.003700    10
  f4a(1000)    5.286016    5.517491    5.558091    5.569514    5.671840    5.764022    10
 f4a(10000)   42.906895   43.025385   43.880020   43.928631   44.633684   44.983002    10
 f4a(1e+05)  467.698285  478.919843  539.696364  552.896109  576.707913  644.927832    10
 f4a(1e+06) 5385.029968 5521.645185 5614.960871 5573.475270 5794.307470 5823.868167    10
One could try to be clever about vectorizing the remaining loop, but is that still necessary?
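For the record, here is one way the remaining loop might be vectorized (a sketch of my own, not used in any of the timings here; the name f5 is made up). It relies on the observation that the loop just computes a cumulative sum of X4 within each run of consecutive duplicate rows, and that na.omit() later keeps only the last row of each run. Like g0() below, it takes the data frame as an argument:
f5 <- function(df) {
    same <- rowSums(df[-nrow(df), 1:3] == df[-1, 1:3]) == 3L
    run <- cumsum(c(TRUE, !same))         # run id for consecutive duplicate rows
    df$X4 <- ave(df$X4, run, FUN=cumsum)  # running sum of X4 within each run
    rind <- which(same)                   # rows that are not the last of their run
    rind1 <- rind + 1L
    df$X1[rind] <- NA
    df$X5[rind1] <- trunc(df$X4[rind1] * df$X3[rind1] * 10^8) / 10^8
    na.omit(df)
}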
A tuned version of the data-processing part of the function uses negative indexing (e.g., -nrow(df)) to drop rows from the data frame, rowSums() instead of apply(), and unname() so that subset operations don't carry around unused names:
g0 <- function(df) {
    ind <- df[-nrow(df), 1:3] == df[-1, 1:3]
    rind <- unname(which(rowSums(ind) == ncol(ind)))
    rind1 <- rind + 1L
    X4 <- df$X4
    for (i in seq_along(rind))
        X4[rind1[i]] <- X4[rind1[i]] + X4[rind[i]]
    df$X4 <- X4
    df$X1[rind] <- NA
    df$X5[rind1] <- trunc(df$X4[rind1] * df$X3[rind1] * 10^8) / 10^8
    na.omit(df)
}
Compared to the data.table solution suggested by @Khashaa
g1 <- function(df) {
    x <- setDT(df)[, r:=rleid(X1, X2, X3),]
    x <- x[, .(X1=X1[.N], X2=X2[.N], X3=X3[.N], X4=sum(X4), X5=X5[.N]), by=r]
    x <- x[, X5:= trunc(X3 * X4 * 10^8)/10^8]
    x
}
the base R version performs favorably, with timings
> n_row <- 200000
> set.seed(123)
> df <- data.frame(replicate(5, runif(n_row)))
> df[,1:3] <- round(df[,1:3])
> system.time(g0res <- g0(df))
user system elapsed
0.247 0.000 0.247
> system.time(g1res <- g1(df))
user system elapsed
0.551 0.000 0.551
(The pre-tuning version f4a() takes about 760 ms, so more than twice as slow.)
The results from the data.table implementation are not correct
> head(g0res)
X1 X2 X3 X4 X5
1 0 1 1 0.4708851 0.8631978
2 1 1 0 0.8977670 0.8311355
3 0 1 0 0.7615472 0.6002179
4 1 1 1 0.6478515 0.5616587
5 1 0 0 0.5329256 0.5805195
6 0 1 1 0.8526255 0.4913130
> head(g1res)
r X1 X2 X3 X4 X5
1: 1 0 1 1 0.4708851 0.4708851
2: 2 1 1 0 0.8977670 0.0000000
3: 3 0 1 0 0.7615472 0.0000000
4: 4 1 1 1 0.6478515 0.6478515
5: 5 1 0 0 0.5329256 0.0000000
6: 6 0 1 1 0.8526255 0.8526255
and I'm not enough of a data.table wizard (barely a data.table user) to know what the correct formulation is.
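A possible corrected formulation (my guess only, not verified against g0() here) would recompute X5 only for groups with more than one row, since runs of length 1 keep their original X5 in the base R versions:
g1a <- function(df) {
    ## assumes library(data.table) is attached, as for g1()
    x <- as.data.table(df)
    x[, r := rleid(X1, X2, X3)]
    x <- x[, .(X1=X1[.N], X2=X2[.N], X3=X3[.N], X4=sum(X4),
               X5=if (.N > 1L) trunc(X3[.N] * sum(X4) * 10^8) / 10^8 else X5[.N]),
           by=r]
    x[, r := NULL]
    x[]
}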
Compiling (with the benefit presumably coming exclusively from the for loop?) increases speed by about 20%
> g0c <- compiler::cmpfun(g0)
> microbenchmark(g0(df), g0c(df), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
g0(df) 250.0750 262.941 276.1549 276.8848 281.1966 321.3778 10
g0c(df) 214.3132 219.940 228.0784 230.2098 235.4579 242.6636 10
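As a side note (mine, not part of the timings above): on R >= 3.4.0 functions are byte-compiled automatically on first or second use, so an explicit cmpfun() call mostly matters on older versions; the JIT behaviour is controlled with
compiler::enableJIT(3)   # 3 = compile closures and top-level loops; returns the previous JIT level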