0

I'm working on a model that requires reading in and combining a large number of data tables. While the data tables will all have the same columns with identical headers, each will have a different number of rows. My current method for doing this is inefficient, and depending on the number of data tables to combine, can take hours.

My current approach uses rbind to combine data tables; below is a reproducible example with a much smaller data set:

library(data.table)

old.way <- function() {
wildfire_data <- data.table()

for(tile in 1:3) {
# Normally this data would be read in from an external file, but we'll make some dummy data for this example
new_wildfire_data <- data.table(x = sample(1:1e6,1000), y = sample(1:1e6,1000), total_PM10 = sample(1:1e6,1000),
total_PM2.5 = sample(1:1e6,1000), total_CH4 = sample(1:1e6,1000), total_CO = sample(1:1e6,1000), total_CO2 = sample(1:1e6,1000), total_NOx = sample(1:1e6,1000), total_SO2 = sample(1:1e6,1000), total_VOC = sample(1:1e6,1000), total_char = sample(1:1e6,1000))

wildfire_data <- rbind(wildfire_data,new_wildfire_data)
}
return(wildfire_data)
}

Looking at other questions, this looks like an inefficient method (Growing a data.frame in a memory-efficient manner), and I should instead be pre-allocating size and using data table's "set" function within a for loop, populating the empty data table. I tried that as well:

new.way <- function() {
num.needed.rows <- 3000

# Create a data table of a pre-allocated size    
wildfire_data <- data.table(x = integer(num.needed.rows), y = integer(num.needed.rows), total_PM10 = integer(num.needed.rows), total_PM2.5 = integer(num.needed.rows), total_CH4 = integer(num.needed.rows), total_CO = integer(num.needed.rows), total_CO2 = integer(num.needed.rows), total_NOx = integer(num.needed.rows), total_SO2 = integer(num.needed.rows), total_VOC = integer(num.needed.rows), total_char = integer(num.needed.rows))

start.row <- as.integer(0)

for(tile in 1:3) {
# Again, this data would normally be read in from an external file
new_wildfire_data <- data.table(x = sample(1:1e6,1000), y = sample(1:1e6,1000), total_PM10 = sample(1:1e6,1000),
total_PM2.5 = sample(1:1e6,1000), total_CH4 = sample(1:1e6,1000), total_CO = sample(1:1e6,1000), total_CO2 = sample(1:1e6,1000), total_NOx = sample(1:1e6,1000), total_SO2 = sample(1:1e6,1000), total_VOC = sample(1:1e6,1000), total_char = sample(1:1e6,1000))

for(raw.data.row.i in 1:nrow(new_wildfire_data)) {
set(wildfire_data,start.row + raw.data.row.i,"x", new_wildfire_data[raw.data.row.i,x])
set(wildfire_data,start.row + raw.data.row.i,"y", new_wildfire_data[raw.data.row.i,y])
set(wildfire_data,start.row + raw.data.row.i,"total_PM10", new_wildfire_data[raw.data.row.i,total_PM10])
set(wildfire_data,start.row + raw.data.row.i,"total_PM2.5", new_wildfire_data[raw.data.row.i,total_PM2.5])
set(wildfire_data,start.row + raw.data.row.i,"total_PM2.5", new_wildfire_data[raw.data.row.i,total_PM2.5])
set(wildfire_data,start.row + raw.data.row.i,"total_CH4", new_wildfire_data[raw.data.row.i,total_CH4])
set(wildfire_data,start.row + raw.data.row.i,"total_CO", new_wildfire_data[raw.data.row.i,total_CO])
set(wildfire_data,start.row + raw.data.row.i,"total_CO2", new_wildfire_data[raw.data.row.i,total_CO2])
set(wildfire_data,start.row + raw.data.row.i,"total_NOx", new_wildfire_data[raw.data.row.i,total_NOx])
set(wildfire_data,start.row + raw.data.row.i,"total_SO2", new_wildfire_data[raw.data.row.i,total_SO2])
set(wildfire_data,start.row + raw.data.row.i,"total_VOC", new_wildfire_data[raw.data.row.i,total_VOC])
set(wildfire_data,start.row + raw.data.row.i,"total_char", new_wildfire_data[raw.data.row.i,total_char])
}
start.row <- start.row + nrow(new_wildfire_data)
}
return(wildfire_data)
}



But the new way is much slower. Here's my benchmarking results:

library(microbenchmark)
microbenchmark(old.way(),new.way(),times=2

Unit: milliseconds
      expr         min          lq        mean      median          uq         max neval
 old.way()    24.29792    24.29792    25.06512    25.06512    25.83233    25.83233     2
 new.way() 12961.41358 12961.41358 13070.96187 13070.96187 13180.51016 13180.51016     2

Is there a proper way to use "set" that would result in greater efficiency over the use of "rbind"?

arharris
  • 13
  • 4
  • 1
    You look to be looping over every single row in `new_wildfire_data` - so this `set` code is being run 1 million times. I don't think you need to fill each row individually though. `set` should be able to run in chunks I would have thought. – thelatemail Jul 31 '19 at 00:35
  • 3
    you might want to use `rbindlist(lapply(filepaths, fread))`. i think `set` is for amending elements in `data.table` and growing horizontally (wide dirn) while `rbindlist` is for growing in vertically (long dirn) – chinsoon12 Jul 31 '19 at 01:09

1 Answers1

2

set is more often an alternative to := for fast assignment to elements of a data.table. This is one example of how it's normally used.

As chinsoon12 points out, rbindlist(lapply(filepaths, fread)) should be a faster solution here. In terms of the example given, one option would be to define a list of the correct dimensions and use rbindlist:

list.way <- function() {
wildfire_data_list <- vector("list", length = 3)
for(tile in 1:3) {
    # Normally this data would be read in from an external file, but we'll make some dummy data for this example
    new_wildfire_data <- data.table(x = sample(1:1e6,1000), y = sample(1:1e6,1000), total_PM10 = sample(1:1e6,1000),
                                    total_PM2.5 = sample(1:1e6,1000), total_CH4 = sample(1:1e6,1000), total_CO = sample(1:1e6,1000), total_CO2 = sample(1:1e6,1000), total_NOx = sample(1:1e6,1000), total_SO2 = sample(1:1e6,1000), total_VOC = sample(1:1e6,1000), total_char = sample(1:1e6,1000))

    wildfire_data_list[[tile]] <- new_wildfire_data
}
wildfire_data <- rbindlist(wildfire_data_list)
return(wildfire_data)
}
rsco
  • 68
  • 5
  • Implemented this method, and saw ~25% speed improvement over the original method ("old.way"). Thanks everyone! – arharris Jul 31 '19 at 19:10