Whenever possible, I suggest you avoid for
loops and repeated rbind
for filtering your data. Using some sample data:
set.seed(2)
n <- 1e4
df <- data.frame(
row11 = sample(100, size=n, replace=TRUE),
row3 = sample(100, size=n, replace=TRUE)
)
dim(df)
# [1] 10000 2
head(df)
# row11 row3
# 1 19 5
# 2 71 27
# 3 58 31
# 4 17 52
# 5 95 37
# 6 95 79
Vectorize It!
cond1 <- df$row11 > 30
cond2 <- df$row11 < 40
cond3 <- df$row3 > 10
cond4 <- df$row3 < 15
str(cond1)
# logi [1:10000] FALSE TRUE TRUE FALSE TRUE TRUE ...
out1 <- df[ cond1 & cond2 & cond3 & cond4, ]
str(out1)
# 'data.frame': 31 obs. of 2 variables:
# $ row11: int 39 35 37 33 37 36 32 34 32 37 ...
# $ row3 : int 13 11 14 13 11 13 14 12 11 12 ...
(The use of cond1
, et al, as predefined logical
vectors is completely optional. This works just as well with literal conditionals inside the [...]
brackets. Additionally, I know that your data has more columns ... this works just fine with more columns.)
To see the benefit of using the vectorized approach over loops (either literal for
or using lapply
in a similar fashion):
library(microbenchmark)
microbenchmark(
vec = {
cond1 <- df$row11 > 30
cond2 <- df$row11 < 40
cond3 <- df$row3 > 10
cond4 <- df$row3 < 15
df[ cond1 & cond2 & cond3 & cond4, ]
},
forloop = {
out2 <- df[0,]
for (i in seq_len(nrow(df))) {
if (df$row11[i] > 30 && df$row11[i] < 40 &&
df$row3[i] > 10 && df$row3[i] < 15) {
out2 <- rbind(out2, df[i,,drop=FALSE])
}
}
},
lapp = {
out3 <- lapply(seq_len(nrow(df)), function(i) {
if (df$row11[i] > 30 && df$row11[i] < 40 &&
df$row3[i] > 10 && df$row3[i] < 15) {
df[i,,drop=FALSE]
}
})
do.call(rbind, out3)
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# vec 340.605 381.813 444.9889 409.1635 476.2635 758.519 100
# forloop 142056.061 154749.407 169612.1311 165602.7955 178100.6755 254283.720 100
# lapp 148903.885 161126.073 178910.3185 172380.4195 186945.8120 256529.009 100
That this means is that what I did in around 409μsec, the for
and lapply
implementations were three orders of magnitude higher.
Much Larger Data
For a demonstration closer to your data-size:
set.seed(2)
# 3 million rows
nr <- 3e6
# 20 columns
nc <- 20
df <- as.data.frame(setNames(lapply(seq_len(nc), function(i) sample(100, size=nr, replace=TRUE)),
paste0("row", seq_len(nc))))
str(df)
# 'data.frame': 3000000 obs. of 20 variables:
# $ row1 : int 19 71 58 17 95 95 13 84 47 55 ...
# $ row2 : int 55 86 45 12 20 4 53 53 9 56 ...
# $ row3 : int 78 100 93 86 67 61 45 41 82 32 ...
# $ row4 : int 2 8 71 33 10 61 84 6 12 72 ...
# $ row5 : int 31 27 32 75 100 54 80 2 52 10 ...
# $ row6 : int 35 84 37 100 61 27 8 89 18 69 ...
# $ row7 : int 100 28 54 34 18 68 25 96 8 9 ...
# $ row8 : int 47 4 50 4 46 34 64 88 17 73 ...
# $ row9 : int 45 91 13 1 78 17 40 78 81 39 ...
# $ row10: int 31 41 87 60 30 30 22 99 85 44 ...
# $ row11: int 83 90 10 51 88 27 21 48 87 27 ...
# $ row12: int 94 83 44 53 58 41 39 5 93 6 ...
# $ row13: int 65 90 8 55 85 100 14 41 44 99 ...
# $ row14: int 39 29 18 32 87 80 32 62 22 12 ...
# $ row15: int 33 15 58 46 7 4 61 35 32 60 ...
# $ row16: int 22 17 58 27 24 56 83 59 22 44 ...
# $ row17: int 38 28 7 40 95 21 13 53 78 64 ...
# $ row18: int 64 12 88 55 36 68 84 16 82 15 ...
# $ row19: int 48 53 75 62 61 31 36 23 4 18 ...
# $ row20: int 25 89 1 11 10 40 24 50 50 66 ...
system.time({
cond1 <- df$row11 > 30
cond2 <- df$row11 < 40
cond3 <- df$row3 > 10
cond4 <- df$row3 < 15
out1 <- df[ cond1 & cond2 & cond3 & cond4, ]
})
# user system elapsed
# 0.14 0.04 0.18
Reducing 3M rows to just over 10K in well under 1 second:
str(out1)
# 'data.frame': 10685 obs. of 20 variables:
# $ row1 : int 47 82 31 1 10 86 97 85 74 56 ...
# $ row2 : int 42 5 48 1 48 10 11 18 11 94 ...
# $ row3 : int 13 12 11 12 13 12 12 11 14 11 ...
# $ row4 : int 75 29 66 53 21 2 78 52 39 87 ...
# $ row5 : int 69 90 27 67 96 23 1 36 70 83 ...
# $ row6 : int 95 77 34 99 26 63 78 100 23 42 ...
# $ row7 : int 23 27 95 61 58 91 36 35 35 35 ...
# $ row8 : int 57 92 47 23 69 49 1 44 29 99 ...
# $ row9 : int 49 17 44 65 10 94 76 60 74 81 ...
# $ row10: int 85 86 77 76 54 29 12 14 87 68 ...
# $ row11: int 34 31 34 34 37 31 32 37 31 37 ...
# $ row12: int 15 69 35 53 92 67 47 73 66 55 ...
# $ row13: int 66 57 78 8 2 14 31 88 46 67 ...
# $ row14: int 41 83 28 47 98 61 79 93 35 79 ...
# $ row15: int 36 37 15 12 18 62 25 64 15 98 ...
# $ row16: int 72 60 93 31 27 84 37 78 34 76 ...
# $ row17: int 83 2 48 20 92 25 6 57 55 66 ...
# $ row18: int 45 88 86 71 92 27 20 82 89 43 ...
# $ row19: int 9 34 79 9 28 39 37 72 90 14 ...
# $ row20: int 59 3 44 35 65 54 41 50 87 18 ...