Edit: updated to include a 3rd approach.
Providing a comparison of timings for my approach, @Ben Bolker's, and @akrun's. I don't benchmark often, so there may be a more elegant way to set this up; suggestions welcome.
I created one example with 500 rows and one with 250,000 rows. For each, I time the split into 10 columns and into 100 columns.
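If there is a cleaner way to set this up, something like the microbenchmark package is probably it. A minimal sketch of what that could look like (assuming the package is installed, and using the approach*() functions and df2.txt defined further down, so this is not what produced the numbers below):

library(microbenchmark)
## Each expression is run 10 times and the distribution of timings reported;
## approach1() writes into emptydf4 by reference, so repeated runs simply
## overwrite the same values
microbenchmark(
  loop      = approach1(emptydf4, df2, guide2),
  read.fwf  = approach2("df2.txt", guide2),
  substring = approach3(df2, guide2),
  times = 10
)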
Approach 1: empty data.table() populated using for loops
Approach 2: read.fwf()
Approach 3: setDT() with substring()
Here is the time each approach takes, in minutes:
> results
                   DataSize   Approach1   Approach2   Approach3
1      500 Rows, 10 Columns  0.01934705 0.002605216 0.001200851
2     500 Rows, 100 Columns  0.07892265 0.028603617 0.014927268
3  250,000 Rows, 10 Columns  6.84735728 1.527935565 1.585325948
4 250,000 Rows, 100 Columns 37.34443290 8.075678647 4.172232886
read.fwf() is clearly faster than the for loops in my approach, and the substring approach is the fastest overall. Interestingly, the three approaches scale differently depending on the number of columns versus the number of rows:
> # Time factor increase with column and row increases
> scaling
                                     Increase  Approach1  Approach2   Approach3
1   500 Rows: Increase from 10 to 100 Columns   4.079311  10.979366   12.430577
2  250k Rows: Increase from 10 to 100 Columns   5.453846   5.285353    2.631782
3  10 Columns: Increase from 500 to 250k Rows 353.922518 586.490999 1320.168952
4 100 Columns: Increase from 500 to 250k Rows 473.177640 282.330677  279.504118
It appears that the empty data.table approach scales better than read.fwf() and the substring approach when either the number of columns or the number of rows is small. Any thoughts on why this is?
Another thought: my data set has slightly fewer columns and rows than the biggest example here, yet it took almost an hour to parse. Each row in my data set is 700-800 characters, and the resulting columns are of varying widths. That is another dimension of performance worth considering.
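For reference, a guide with irregular widths like my real data could be generated along these lines (a sketch only; guide3 is hypothetical and is not used in the timings above):

set.seed(1)
## 50 fields of varying width, summing to roughly 750-800 characters per row
widths <- sample(2:30, 50, replace = TRUE)
guide3 <- data.frame(
  names = paste0("c", seq_along(widths)),
  begin = cumsum(c(1, head(widths, -1))),  # each field starts where the last ended
  end   = cumsum(widths),
  stringsAsFactors = FALSE
)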
Here is how I set this up.
Set up the data frames with random strings and the guide tables
library(stringi)
## 250,000 random 200-character alphanumeric strings
df1 <- data.frame(
v1=stri_rand_strings(n=250000, length=200, pattern="[A-Za-z0-9]"),
stringsAsFactors=FALSE
)
## First 500 rows for the small examples (the column name is restored
## before the approach 3 runs)
df2 <- as.data.frame(df1[1:500,])
## guide1: 100 contiguous 2-character fields covering all 200 characters
guide1 <- data.frame(
names=paste0(rep("c",100), 1:100),
begin=(1:100)*2-1,
end=(1:100)*2,
stringsAsFactors = FALSE
)
## guide2: 10 contiguous 20-character fields covering all 200 characters
guide2 <- data.frame(
names=paste0(rep("c",10), 1:10),
begin=(0:9)*20+1,
end=(1:10)*20,
stringsAsFactors = FALSE
)
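Both guides are meant to tile the 200-character strings with contiguous fields; a quick sanity check along these lines confirms it (my addition here, not part of the timing run):

## Each field should start right after the previous one ends, and the last
## field should end at character 200
stopifnot(
  all(guide1$begin == c(1, head(guide1$end, -1) + 1)), max(guide1$end) == 200,
  all(guide2$begin == c(1, head(guide2$end, -1) + 1)), max(guide2$end) == 200
)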
Set up the functions for the three approaches
approach1 <- function(emptydf, df, guide){
  ## Walk every row, pull each field out with substr(), then assign the
  ## whole row into the pre-allocated data.table by reference with :=
  for(y in 1:nrow(df)){
    split <- character()
    for(z in 1:nrow(guide)){
      extr <- substr(df[y,], guide[z, 2], guide[z, 3])
      split <- c(split, extr)
    }
    emptydf <- emptydf[y, names(emptydf) := as.list(split)]
  }
  return(emptydf)
}
approach2 <- function(path, guide){
  ## Field widths; end - begin + 1 works because the fields are contiguous
  read.fwf(path, widths = guide$end - guide$begin + 1)
}
approach3 <- function(df, guide){
  ## substring() is vectorised over begin/end, so each Map() call splits one
  ## string into all of its fields at once; the per-row vectors are then
  ## row-bound and converted to a data.table
  setDT(setNames(do.call(rbind.data.frame, Map(substring, df$v1,
    list(guide$begin), list(guide$end))), guide$names))[]
}
Approach 1: empty data.table() with for loops
## Pre-allocate the full-size empty data frame and blank out the NAs;
## the three smaller tables are subsets of it
emptydf1 <- data.frame(matrix(ncol = nrow(guide1),
nrow = nrow(df1)))
colnames(emptydf1) <- as.character(unlist(guide1[1]))
emptydf1[is.na(emptydf1)] <- ""
emptydf2 <- as.data.frame(emptydf1[, 1:10])
emptydf3 <- as.data.frame(emptydf1[1:500,])
emptydf4 <- as.data.frame(emptydf1[1:500,1:10])
setDT(emptydf1)
setDT(emptydf2)
setDT(emptydf3)
setDT(emptydf4)
## 500 rows and 10 columns
a0 <- Sys.time()
app1Out1 <- approach1(emptydf4, df2, guide2)
a1 <- Sys.time()
## 500 rows and 100 columns
b0 <- Sys.time()
app1Out2 <- approach1(emptydf3, df2, guide1)
b1 <- Sys.time()
## 250,000 rows and 10 columns
c0 <- Sys.time()
app1Out3 <- approach1(emptydf2, df1, guide2)
c1 <- Sys.time()
## 250,000 rows and 100 columns
d0 <- Sys.time()
app1Out4 <- approach1(emptydf1, df1, guide1)
d1 <- Sys.time()
Approach 2: read.fwf()
## Write the strings to disk so read.fwf() can read them back
writeLines(as.character(unlist(df1[1])), con="df1.txt")
writeLines(as.character(unlist(df2[1])), con="df2.txt")
## 500 rows and 10 columns
e0 <- Sys.time()
app2Out1 <- approach2("df2.txt", guide2)
e1 <- Sys.time()
## 500 rows and 100 columns
f0 <- Sys.time()
app2Out2 <- approach2("df2.txt", guide1)
f1 <- Sys.time()
## 250,000 rows and 10 columns
g0 <- Sys.time()
app2Out3 <- approach2("df1.txt", guide2)
g1 <- Sys.time()
## 250,000 rows and 100 columns
h0 <- Sys.time()
app2Out4 <- approach2("df1.txt", guide1)
h1 <- Sys.time()
Approach 3: setDT() with substring()
## Restore the column name lost when df2 was created by subsetting df1
names(df2) <- "v1"
## 500 rows and 10 columns
i0 <- Sys.time()
app3Out1 <- approach3(df2, guide2)
i1 <- Sys.time()
## 500 rows and 100 columns
j0 <- Sys.time()
app3Out2 <- approach3(df2, guide1)
j1 <- Sys.time()
## 250,000 rows and 10 columns
k0 <- Sys.time()
app3Out3 <- approach3(df1, guide2)
k1 <- Sys.time()
## 250,000 rows and 100 columns
l0 <- Sys.time()
app3Out4 <- approach3(df1, guide1)
l1 <- Sys.time()
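Before comparing speeds, it is worth confirming the three approaches agree. A quick check on the smallest case (my addition; it assumes R >= 4.0, where read.fwf() returns character columns by default):

## Compare cell values only; column names differ (read.fwf() uses V1, V2, ...)
stopifnot(
  all(as.matrix(app1Out1) == as.matrix(app2Out1)),
  all(as.matrix(app1Out1) == as.matrix(app3Out1))
)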
Set up the result tables
tests <- c("500 Rows, 10 Columns", "500 Rows, 100 Columns",
           "250,000 Rows, 10 Columns", "250,000 Rows, 100 Columns")
## Convert every timing to minutes explicitly; a bare as.numeric() on a
## difftime is fragile because difftime auto-selects secs or mins
app1 <- c(as.numeric(a1-a0, units="mins"), as.numeric(b1-b0, units="mins"),
          as.numeric(c1-c0, units="mins"), as.numeric(d1-d0, units="mins"))
app2 <- c(as.numeric(e1-e0, units="mins"), as.numeric(f1-f0, units="mins"),
          as.numeric(g1-g0, units="mins"), as.numeric(h1-h0, units="mins"))
app3 <- c(as.numeric(i1-i0, units="mins"), as.numeric(j1-j0, units="mins"),
          as.numeric(k1-k0, units="mins"), as.numeric(l1-l0, units="mins"))
results <- data.frame(
"DataSize"=tests,
"Approach1"=app1,
"Approach2"=app2,
"Approach3"=app3
)
# Time factor increase with column and row increases
scaling <- data.frame(
"Increase"=c("500 Rows: Increase from 10 to 100 Columns","250k Rows: Increase from 10 to 100 Columns",
"10 Columns: Increase from 500 to 250k Rows","100 Columns: Increase from 500 to 250k Rows"),
"Approach1"=c((results[2,2]/results[1,2]),(results[4,2]/results[3,2]),
(results[3,2]/results[1,2]),(results[4,2]/results[2,2])),
"Approach2"=c((results[2,3]/results[1,3]),(results[4,3]/results[3,3]),
(results[3,3]/results[1,3]),(results[4,3]/results[2,3])),
"Approach3"=c((results[2,4]/results[1,4]),(results[4,4]/results[3,4]),
(results[3,4]/results[1,4]),(results[4,4]/results[2,4]))
)
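The scaling table is just element-wise ratios of rows of results, so an equivalent, more compact construction would be (same numbers as above; scaling2 is just an alternative name):

## Element-wise ratios of the timing rows; row order matches scaling above
ratios <- rbind(results[2, 2:4] / results[1, 2:4],   # 500 rows, 10 -> 100 cols
                results[4, 2:4] / results[3, 2:4],   # 250k rows, 10 -> 100 cols
                results[3, 2:4] / results[1, 2:4],   # 10 cols, 500 -> 250k rows
                results[4, 2:4] / results[2, 2:4])   # 100 cols, 500 -> 250k rows
scaling2 <- data.frame(Increase = scaling$Increase, ratios)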