I made a naive benchmark of the methods mentioned in the comments, and data.table::fwrite comes out ahead when the data is written uncompressed (but the resulting csv file is huge), followed by arrow::write_parquet, then compressed csv.gz via data.table::fwrite, and finally saveRDS.
library(readr)
library(data.table)
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
library(bench)
large_table <- as.data.frame(matrix(rnorm(1e7), ncol = 1e4))
test_saveRDS <- function(large_table) {
saveRDS(large_table, "test.RDS")
return(TRUE)
}
test_fwrite_uncomp <- function(large_table) {
data.table::fwrite(large_table, "test_dt.csv")
return(TRUE)
}
test_fwrite <- function(large_table) {
data.table::fwrite(large_table, "test_dt.csv.gz")
return(TRUE)
}
test_write_parquet <- function(large_table) {
arrow::write_parquet(large_table, "test.parquet")
return(TRUE)
}
bench::mark(
test_saveRDS(large_table),
test_fwrite(large_table),
test_fwrite_uncomp(large_table),
test_write_parquet(large_table)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 test_saveRDS(large_table) 3.68s 3.68s 0.271 8.63KB 0
#> 2 test_fwrite(large_table) 2.95s 2.95s 0.339 211.97KB 0
#> 3 test_fwrite_uncomp(large_table) 656.3ms 656.3ms 1.52 88KB 0
#> 4 test_write_parquet(large_table) 1.52s 1.52s 0.659 6.47MB 2.64
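To quantify the note above about the uncompressed csv being huge, a quick check with file.size() on the four output files would look roughly like this (a sketch: run it after the benchmark so the files exist, and expect the exact numbers to vary with the random data):

# Size on disk of each output file, in MB
files <- c("test.RDS", "test_dt.csv", "test_dt.csv.gz", "test.parquet")
data.frame(file = files, size_MB = round(file.size(files) / 1024^2, 1))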
saveRDS allocates the least memory during the write (the mem_alloc column), though.
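For completeness, reading the files back would look roughly like this (not benchmarked here; note that fread() needs the R.utils package installed to decompress the .gz file):

x_rds     <- readRDS("test.RDS")
x_csv     <- data.table::fread("test_dt.csv")
x_csv_gz  <- data.table::fread("test_dt.csv.gz")  # requires the R.utils package
x_parquet <- arrow::read_parquet("test.parquet")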