One good method uses a for loop. It is not row-by-row: it operates on "all rows with an NA in column 'X'" at one time, for each column in cols (the vector of column names, left to right, defined below). Because each column is updated before it is read as the next column's prevcol, a filled value keeps propagating rightward across runs of consecutive NAs.
for (i in seq_along(cols)[-1]) {  # start at the second column; the first has nothing to its left
  prevcol <- cols[i-1]
  thiscol <- cols[i]
  # on rows where 'thiscol' is NA, pull the value from 'prevcol'
  dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol))]
}
dt
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 3 3 2 3 1 1 1 4 3 1
# 2: 1 1 2 2 1 2 2 1 2 4 1
# 3: 3 2 3 1 1 4 3 3 2 1 2
# 4: 2 3 3 1 2 2 1 4 3 4 2
# 5: 1 2 3 4 4 3 2 2 2 4 3
# 6: 4 1 4 2 1 4 4 3 3 4 3
Admittedly the use of get(.) is not perfect, but I think it will generally be okay.
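If the get(.) bothers you, here is a sketch of the same fill using data.table::set(), which takes column names directly and avoids non-standard evaluation (my variant, assuming the same cols vector of column names used elsewhere in this answer):
for (i in seq_along(cols)[-1]) {
  this <- dt[[ cols[i] ]]            # current column as a plain vector
  idx  <- which(is.na(this))         # rows that need a fill
  set(dt, i = idx, j = cols[i], value = dt[[ cols[i-1] ]][idx])
}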
Another method, about as fast (depending on the size of data):
dt[, (cols) := Reduce(function(prev,this) fcoalesce(this, prev), .SD, accumulate = TRUE), .SDcols = cols]
# same results
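To see what accumulate = TRUE does here, a toy illustration (my example, not part of the original answer): each accumulated element is the running coalesce of a column with everything to its left, which is exactly the row-wise fill.
Reduce(function(prev, this) fcoalesce(this, prev),
       list(c(1, NA), c(NA, 2), c(NA, NA)),
       accumulate = TRUE)
# [[1]]
# [1]  1 NA
# [[2]]
# [1] 1 2
# [[3]]
# [1] 1 2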
Benchmarking, since you said that performance is important with 2M rows. I'll go with 2M rows and update the method for randomizing the NAs.
library(data.table)
set.seed(456)
n <- 2e6   # 6e5 for the smaller comparison run below
dt <- data.table(a = sample(1:4, n, replace = TRUE), b = sample(1:4, n, replace = TRUE),
                 c = sample(1:4, n, replace = TRUE), d = sample(1:4, n, replace = TRUE),
                 e = sample(1:4, n, replace = TRUE), f = sample(1:4, n, replace = TRUE),
                 g = sample(1:4, n, replace = TRUE), h = sample(1:4, n, replace = TRUE),
                 i = sample(1:4, n, replace = TRUE), j = sample(1:4, n, replace = TRUE),
                 xx = sample(1:4, n, replace = TRUE))
# random (row, column) positions to blank out, roughly 11/20 of n cells
mtx <- cbind(sample(nrow(dt), ceiling(n*11/20), replace = TRUE),
             sample(ncol(dt), ceiling(n*11/20), replace = TRUE))
mtx <- mtx[!duplicated(mtx), ]
dt[mtx] <- NA
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 NA 1
# 4: 2 1 4 1 2 3 NA 4 4 4 3
# 5: 1 2 3 3 4 3 3 NA 1 4 1
# 6: 4 3 4 2 2 NA 4 1 2 4 2
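As a quick sanity check on the setup (my addition, output omitted): the count of injected NAs should land slightly under ceiling(n*11/20), since duplicated positions were dropped.
sum(is.na(dt))   # a bit under n*11/20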
Unfortunately, the transpose method fails at this size:
system.time({
dt2 = transpose(dt)
setnafill(dt2, type = 'locf')
dt2 = transpose(dt2)
setnames(dt2, names(dt))
})
# Error: cannot allocate vector of size 30.6 Gb
but the for loop (and Reduce, incidentally) works fine. (The transpose failure is not too surprising: flipping a 2M-row table produces a 2M-column table, a shape that data.table's column-oriented storage handles very poorly.)
cols <- setdiff(names(dt), "N")   # "N" is not a column here, so this keeps all columns
system.time({
for (i in seq_along(cols)[-1]) {
prevcol <- cols[i-1]
thiscol <- cols[i]
dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}
})
# user system elapsed
# 0.14 0.00 0.11
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 1 1
# 4: 2 1 4 1 2 3 3 4 4 4 3
# 5: 1 2 3 3 4 3 3 3 1 4 1
# 6: 4 3 4 2 2 2 4 1 2 4 2
If I simplify the problem set to 600K rows, then I can get both to work. (I don't know the tipover point for my system; it might be 1M, who knows. I just wanted to compare them side-by-side.) With n <- 6e5 and regenerating dt as above, I see the following data and simple timing:
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 3 1 3 4 NA 3 3 3 3
# 2: 1 3 2 2 4 3 1 2 2 4 1
# 3: 3 4 2 1 1 1 1 4 2 4 2
# 4: 2 4 1 NA 1 4 3 1 4 1 1
# 5: 1 NA 4 2 NA NA 4 4 2 2 NA
# 6: 4 1 4 4 1 2 3 3 1 1 2
sum(is.na(dt))
# [1] 321782
system.time({
dt2 = transpose(dt)
setnafill(dt2, type = 'locf')
dt2 = transpose(dt2)
setnames(dt2, names(dt))
})
# user system elapsed
# 4.27 4.50 7.74
sum(is.na(dt))   # 'dt' itself is unchanged; shown only as the baseline for the 'for' loop below
# [1] 321782
sum(is.na(dt2))  # remaining NAs are in rows whose leading column(s) are NA: nothing to their left to coalesce from
# [1] 30738
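A (slow, purely illustrative) check of that claim, not in the original: after the fill, any surviving NA should sit in a leading run of its row.
m <- as.matrix(dt2)
leading_only <- apply(m, 1, function(r) {
  w <- which(is.na(r))
  length(w) == 0L || all(w == seq_along(w))  # NA positions are exactly 1..k
})
all(leading_only)   # expect TRUE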
cols <- setdiff(names(dt),"N")
system.time({
for (i in seq_along(cols)[-1]) {
prevcol <- cols[i-1]
thiscol <- cols[i]
dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}
})
# user system elapsed
# 0.10 0.03 0.06
identical(dt, dt2)
# [1] TRUE
### regenerate `dt` so it has `NA`s again (re-run the set.seed(456) setup block above)
system.time({
dt[, (cols) := Reduce(function(prev,this) fcoalesce(this,prev), .SD, accumulate = TRUE), .SDcols = cols]
})
# user system elapsed
# 0.03 0.00 0.03
identical(dt, dt2)
# [1] TRUE
A more robust benchmark such as bench::mark is going to be encumbered a little by the need to copy(dt) on every pass. The overhead is not huge,
bench::mark(copy(dt))
# # A tibble: 1 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 copy(dt) 7.77ms 20.9ms 45.1 25.2MB 0 23 0 510ms <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [14 x 3]> <bch:tm [23]> <tibble [23 x 3]>
but it is still extra. As such, I'll benchmark the transpose code twice, once without and once with the copy, to compare it against the for and Reduce answers more honestly. (Note that bench::mark's default is to verify that all outputs are identical. This can be disabled, but I have not done that, so all code blocks here return the same results.)
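For reference, that verification is controlled by bench::mark's check argument; a not-run sketch:
# bench::mark(<expressions as below>, check = FALSE)   # skips the identical-output check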
bench::mark(
transpose1 = {
dt2 = transpose(dt)
setnafill(dt2, type = 'locf')
dt2 = transpose(dt2)
setnames(dt2, names(dt))
dt2
},
transpose2 = {
dt0 = copy(dt)
dt2 = transpose(dt0)
setnafill(dt2, type = 'locf')
dt2 = transpose(dt2)
setnames(dt2, names(dt0))
dt2
},
forloop = {
dt0 <- copy(dt)
for (i in seq_along(cols)[-1]) {
prevcol <- cols[i-1]
thiscol <- cols[i]
dt0[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}
dt0
},
reduce = {
dt0 <- copy(dt)
dt0[, (cols) := Reduce(function(prev,this) fcoalesce(this,prev), .SD, accumulate = TRUE), .SDcols = cols]
},
min_iterations = 10)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 transpose1 4.94s 5.48s 0.154 1.28GB 0.201 10 13 1.08m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [33,008 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 2 transpose2 5.85s 6.29s 0.130 1.3GB 0.259 10 20 1.29m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [15,316 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 3 forloop 48.37ms 130.91ms 2.87 71.14MB 0 10 0 3.49s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [191 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 4 reduce 48.08ms 75.82ms 4.70 71MB 0.470 10 1 2.13s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [38 x 3]> <bch:tm [10]> <tibble [10 x 3]>
From this:
- Time: after normalizing to milliseconds, the transpose's minimum of 4940ms compares poorly against the loop's 48ms; and
- Memory: 1.28GB compares poorly with 71MB.
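If relative numbers read more easily, bench can rescale everything against the fastest expression; this assumes the benchmark result above was saved to an object (hypothetical name bm):
# bm <- bench::mark(...)       # the four expressions above, saved
summary(bm, relative = TRUE)   # times and memory as multiples of the best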
(Edited to increase the benchmark's minimum iterations to 10.)