I have a a number of csv .txt files that have an inconsistent number and order of columns added to their left and right sides. Each change though, is preceded by a row of headers. read.table and fread struggle with these additional columns but after trawling countless stackoverflow Q on the topic [there are many! e.g. defining col.name or fill = TRUE or using ncols <- max(count.fields(file, sep = ",")) did not work], I found a way to deal with these inconsistencies using the headers here: Is there a better way in R to split a file with multiple sections, which produces a list of all the headed sections in a single file, which I can then rbindlist() on the common columns I want.
My question is how I use an if() loop or a function() to apply this code to a folder with a number these .txt files and load them into a list of lists perhaps? I am a beginner so am still trying to grasp nested looping and have been pulling my hair out for weeks! Can anyone help? Thanks.
Here is example code to work with:
My data are big files of lat/long info but I have created an example dataset below.
library(data.table)
tmp1 <- c("C,D,E,F", "1,1,1,1", "2,2,2,2", "C,D,E,F", "3,3,3,3", "4,4,4,4",
"5,5,5,5", "C,D,E,F", "6,6,6,6", "7,7,7,7", "8,8,8,8", "9,9,9,9",
"A,B,C,D,E,F", "10,10,10,10,10,10", "11,11,11,11,11,11", "A,B,C,D,E,F",
"12,12,12,12,12,12", "13,13,13,13,13,13", "14,14,14,14,14,14",
"15,15,15,15,15,15", "A,B,C,D,E,F,G,H", "16,16,16,16,16,16,16,16",
"17,17,17,17,17,17,17,17", "18,18,18,18,18,18,18,18", "A,B,C,D,E,F,G,H",
"19,19,19,19,19,19,19,19", "20,20,20,20,20,20,20,20")
tmp2 <- c("C,D,E,F", "21,21,21,21", "22,22,22,22", "C,D,E,F", "23,23,23,23",
"24,24,24,24", "25,25,25,25", "C,D,E,F", "26,26,26,26", "27,27,27,27",
"28,28,28,28", "29,29,29,29", "A,B,C,D,E,F", "30,30,30,30,30,30",
"31,31,31,31,31,31", "A,B,C,D,E,F", "32,32,32,32,32,32", "33,33,33,33,33,33",
"34,34,34,34,34,34", "35,35,35,35,35,35", "A,B,C,D,E,F,G,H",
"36,36,36,36,36,36,36,36", "37,37,37,37,37,37,37,37", "38,38,38,38,38,38,38,38",
"A,B,C,D,E,F,G,H", "39,39,39,39,39,39,39,39", "40,40,40,40,40,40,40,40")
tmp3 <- c("C,D,E,F", "21,21,21,21", "22,22,22,22", "C,D,E,F", "23,23,23,23",
"24,24,24,24", "25,25,25,25", "C,D,E,F", "26,26,26,26", "27,27,27,27",
"28,28,28,28", "29,29,29,29", "A,B,C,D,E,F", "30,30,30,30,30,30",
"31,31,31,31,31,31", "A,B,C,D,E,F", "32,32,32,32,32,32", "33,33,33,33,33,33",
"34,34,34,34,34,34", "35,35,35,35,35,35", "A,B,C,D,E,F,G,H",
"36,36,36,36,36,36,36,36", "37,37,37,37,37,37,37,37", "38,38,38,38,38,38,38,38",
"A,B,C,D,E,F,G,H", "39,39,39,39,39,39,39,39", "40,40,40,40,40,40,40,40")
tmp4 <- c("C,D,E,F", "61,61,61,61", "62,62,62,62", "C,D,E,F", "63,63,63,63",
"64,64,64,64", "65,65,65,65", "C,D,E,F", "66,66,66,66", "67,67,67,67",
"68,68,68,68", "69,69,69,69", "A,B,C,D,E,F", "70,70,70,70,70,70",
"71,71,71,71,71,71", "A,B,C,D,E,F", "72,72,72,72,72,72", "73,73,73,73,73,73",
"74,74,74,74,74,74", "75,75,75,75,75,75", "A,B,C,D,E,F,G,H",
"76,76,76,76,76,76,76,76", "77,77,77,77,77,77,77,77", "78,78,78,78,78,78,78,78",
"A,B,C,D,E,F,G,H", "79,79,79,79,79,79,79,79", "80,80,80,80,80,80,80,80")
wd <- getwd()
dir.create("tmpfolder")
write.table(tmp1, paste(wd,"/tmpfolder/tmp1.txt",sep=""), sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(tmp2, paste(wd,"/tmpfolder/tmp2.txt",sep=""), sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(tmp3, paste(wd,"/tmpfolder/tmp3.txt",sep=""), sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(tmp4, paste(wd,"/tmpfolder/tmp4.txt",sep=""), sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE) # an example of my current lack of ability to use loops!
file = file.path(paste(wd,"/tmpfolder/tmp1.txt",sep=""))
tmp = readLines(file)
sof <- (grep("C", tmp)) - 1
real_start <- sof + 1
real_end <- c(sof[-1] - 1, length(tmp))
to_read <- real_end - real_start + 1
my_dfs <- vector("list", length = length(real_start))
for(i in 1:length(my_dfs)){
my_dfs[[i]] <- suppressWarnings(
data.table::fread(file,
sep = ",",
skip = sof[i],
nrows = to_read[i],
fill = TRUE,
check.names = FALSE,
data.table = FALSE,
)
)
}