Right now arrow has no fixed-width file reader. However, we can make use of some readr
functionality and of how Datasets work. The process below involves creating an intermediate object, so it does not read directly from the fixed-width files. Sometimes that's OK, sometimes not. Hopefully it is useful to you, though:
library(dplyr)
library(readr)
library(gdata) ## easiest way to create fixed width file
library(arrow)
## Build a sample fixed width file that stands in for a "big" dataset
gdata::write.fwf(quakes, file = "data.fwf", colnames = FALSE)
## Scratch directory that will receive the intermediate parquet chunks
tdir <- file.path(tempdir(), "arrow-tmp")
dir.create(tdir, recursive = TRUE, showWarnings = FALSE)
## Callback for readr's chunked reader: parse one chunk of fixed width
## lines and save it as its own parquet file inside `tdir`.
##   x   - character vector holding the raw lines of the current chunk
##   pos - position of the chunk's first line in the source file; used to
##         give every chunk a distinct file name
## The field widths follow the layout write.fwf() produces for `quakes`:
## every value is followed by a one-character separator, so lat and long
## occupy 6 + 1 characters, depth and mag 3 + 1, and the trailing stations
## field 3. Widths that leave out the separator cut values mid-field
## (e.g. `stations` ends up holding the last digit of `mag`).
f <- function(x, pos) {
  col_spec <- fwf_widths(
    widths = c(7, 7, 4, 4, 3),
    col_names = names(quakes)
  )
  d <- read_fwf(paste0(x, "\n"), col_spec)
  arrow::write_parquet(d, file.path(tdir, paste0("temp-", pos, ".parquet")))
}
## Stream the fixed width file through the callback, 100 lines per chunk
read_lines_chunked(
  file = "data.fwf",
  callback = readr::SideEffectChunkCallback$new(f),
  chunk_size = 100
)
## One parquet file per chunk now sits in the scratch directory
fs::dir_tree(path = tdir)
#> /var/folders/6f/c414rjlx2l512x7_80hp126c0000gn/T//RtmpHsOQmC/arrow-tmp
#> ├── temp-1.parquet
#> ├── temp-101.parquet
#> ├── temp-201.parquet
#> ├── temp-301.parquet
#> ├── temp-401.parquet
#> ├── temp-501.parquet
#> ├── temp-601.parquet
#> ├── temp-701.parquet
#> ├── temp-801.parquet
#> └── temp-901.parquet
## Destination directory for the finished dataset
tdir_real <- file.path(tempdir(), "arrow-real")
## Treat the parquet chunks as a single Dataset, then rewrite it as
## parquet with the grouping column `stations` as the partition key
chunked_ds <- arrow::open_dataset(tdir)
arrow::write_dataset(group_by(chunked_ds, stations), tdir_real)
fs::dir_tree(path = tdir_real)
#> /var/folders/6f/c414rjlx2l512x7_80hp126c0000gn/T//RtmpHsOQmC/arrow-real
#> ├── stations=0
#> │ └── part-0.parquet
#> ├── stations=1
#> │ └── part-0.parquet
#> ├── stations=2
#> │ └── part-0.parquet
#> ├── stations=3
#> │ └── part-0.parquet
#> ├── stations=4
#> │ └── part-0.parquet
#> ├── stations=5
#> │ └── part-0.parquet
#> ├── stations=6
#> │ └── part-0.parquet
#> ├── stations=7
#> │ └── part-0.parquet
#> ├── stations=8
#> │ └── part-0.parquet
#> └── stations=9
#> └── part-0.parquet