0

I have a a number of csv .txt files that have an inconsistent number and order of columns added to their left and right sides. Each change though, is preceded by a row of headers. read.table and fread struggle with these additional columns but after trawling countless stackoverflow Q on the topic [there are many! e.g. defining col.name or fill = TRUE or using ncols <- max(count.fields(file, sep = ",")) did not work], I found a way to deal with these inconsistencies using the headers here: Is there a better way in R to split a file with multiple sections, which produces a list of all the headed sections in a single file, which I can then rbindlist() on the common columns I want.

My question is how I use an if() loop or a function() to apply this code to a folder with a number these .txt files and load them into a list of lists perhaps? I am a beginner so am still trying to grasp nested looping and have been pulling my hair out for weeks! Can anyone help? Thanks.

Here is example code to work with:

My data are big files of lat/long info but I have created an example dataset below.

  library(data.table)

  tmp1 <- c("C,D,E,F", "1,1,1,1", "2,2,2,2", "C,D,E,F", "3,3,3,3", "4,4,4,4", 
            "5,5,5,5", "C,D,E,F", "6,6,6,6", "7,7,7,7", "8,8,8,8", "9,9,9,9", 
            "A,B,C,D,E,F", "10,10,10,10,10,10", "11,11,11,11,11,11", "A,B,C,D,E,F", 
            "12,12,12,12,12,12", "13,13,13,13,13,13", "14,14,14,14,14,14", 
            "15,15,15,15,15,15", "A,B,C,D,E,F,G,H", "16,16,16,16,16,16,16,16", 
            "17,17,17,17,17,17,17,17", "18,18,18,18,18,18,18,18", "A,B,C,D,E,F,G,H", 
            "19,19,19,19,19,19,19,19", "20,20,20,20,20,20,20,20")

  tmp2 <- c("C,D,E,F", "21,21,21,21", "22,22,22,22", "C,D,E,F", "23,23,23,23", 
            "24,24,24,24", "25,25,25,25", "C,D,E,F", "26,26,26,26", "27,27,27,27", 
            "28,28,28,28", "29,29,29,29", "A,B,C,D,E,F", "30,30,30,30,30,30", 
            "31,31,31,31,31,31", "A,B,C,D,E,F", "32,32,32,32,32,32", "33,33,33,33,33,33", 
            "34,34,34,34,34,34", "35,35,35,35,35,35", "A,B,C,D,E,F,G,H", 
            "36,36,36,36,36,36,36,36", "37,37,37,37,37,37,37,37", "38,38,38,38,38,38,38,38", 
            "A,B,C,D,E,F,G,H", "39,39,39,39,39,39,39,39", "40,40,40,40,40,40,40,40")

  tmp3 <- c("C,D,E,F", "21,21,21,21", "22,22,22,22", "C,D,E,F", "23,23,23,23", 
            "24,24,24,24", "25,25,25,25", "C,D,E,F", "26,26,26,26", "27,27,27,27", 
            "28,28,28,28", "29,29,29,29", "A,B,C,D,E,F", "30,30,30,30,30,30", 
            "31,31,31,31,31,31", "A,B,C,D,E,F", "32,32,32,32,32,32", "33,33,33,33,33,33", 
            "34,34,34,34,34,34", "35,35,35,35,35,35", "A,B,C,D,E,F,G,H", 
            "36,36,36,36,36,36,36,36", "37,37,37,37,37,37,37,37", "38,38,38,38,38,38,38,38", 
            "A,B,C,D,E,F,G,H", "39,39,39,39,39,39,39,39", "40,40,40,40,40,40,40,40")

  tmp4 <- c("C,D,E,F", "61,61,61,61", "62,62,62,62", "C,D,E,F", "63,63,63,63", 
            "64,64,64,64", "65,65,65,65", "C,D,E,F", "66,66,66,66", "67,67,67,67", 
            "68,68,68,68", "69,69,69,69", "A,B,C,D,E,F", "70,70,70,70,70,70", 
            "71,71,71,71,71,71", "A,B,C,D,E,F", "72,72,72,72,72,72", "73,73,73,73,73,73", 
            "74,74,74,74,74,74", "75,75,75,75,75,75", "A,B,C,D,E,F,G,H", 
            "76,76,76,76,76,76,76,76", "77,77,77,77,77,77,77,77", "78,78,78,78,78,78,78,78", 
            "A,B,C,D,E,F,G,H", "79,79,79,79,79,79,79,79", "80,80,80,80,80,80,80,80")

 wd <- getwd()
  dir.create("tmpfolder")
  write.table(tmp1, paste(wd,"/tmpfolder/tmp1.txt",sep=""), sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE)
  write.table(tmp2, paste(wd,"/tmpfolder/tmp2.txt",sep=""), sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE)
  write.table(tmp3, paste(wd,"/tmpfolder/tmp3.txt",sep=""), sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE)
  write.table(tmp4, paste(wd,"/tmpfolder/tmp4.txt",sep=""), sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE) # an example of my current lack of ability to use loops!

  
  file = file.path(paste(wd,"/tmpfolder/tmp1.txt",sep=""))
  tmp = readLines(file)
  sof <- (grep("C", tmp)) - 1
  real_start <- sof + 1
  real_end <- c(sof[-1] - 1, length(tmp))
  to_read <- real_end - real_start + 1
  my_dfs <- vector("list", length = length(real_start))
  for(i in 1:length(my_dfs)){
    my_dfs[[i]] <- suppressWarnings(
      data.table::fread(file,
                        sep = ",",
                        skip = sof[i],
                        nrows = to_read[i],
                        fill = TRUE,
                        check.names = FALSE,
                        data.table = FALSE,
      )
    )
  }
Skeiff
  • 31
  • 4
  • sounds like `rbindlist(fill=TRUE, use.names=TRUE)` + `lapply` + `fread` – jangorecki Jun 17 '21 at 07:29
  • Hi, thanks for responding, Yes, sure, as I said, "I can then rbindlist() on the common columns I want" e.g. ```DFs <- rbindlist(my_dfs, use.names = TRUE, fill = TRUE)```. My question though is how can apply the code which splits the individual file - using grep() on the headers etc - to a __folder__ full of these files without typing the code out or altering it for each file in turn in that folder. – Skeiff Jun 18 '21 at 12:29
  • ..for instance, using something like the for(i in names) described in [link](https://stackoverflow.com/questions/5319839/read-multiple-csv-files-into-separate-data-frames). I've tried but can't figure out how to include the grep() skip nrows on the headers etc code in that for() loop. – Skeiff Jun 18 '21 at 16:28

1 Answers1

1

you can just build a loop around your code like this:

for(i in 1:5)
{
  write.table(eval(parse(text=paste0("tmp",i))), paste(wd,"/tmpfolder/tmp",i,".txt",sep=""), sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE)
}

And the same for reading in the data

for(i in 1:5)
{
  file = file.path(paste(wd,"/tmpfolder/tmp",i,".txt",sep=""))
  d <- fread(file, sep = ",")
  if(i==1){d.out <- list(d)}else{d.out <- list(d.out, d)}
}
Dharman
  • 30,962
  • 25
  • 85
  • 135
wiebke
  • 111
  • 2