0

I have 600 txt files to clean and analyze, appending a one-line summary to a csv file for each. I am able to do this for one file at a time by copying and pasting the name of each txt file, one at a time. I would like to automate this so my code will see all 600 txt files in the directory, then run the code for each of them. Ultimately I should get a csv report with 600 lines summarizing the data of each report.

I'm definitely a newbie, so it's cool if you talk down if you have a suggestion. I think the help I need comes in the first 3 lines, and the final line. I've posted all of my code just in case.



# All 600 report files, with full paths so readLines() can open them.
# NOTE: the extension dot must be escaped ("\\.txt$"); a bare ".txt$"
# would also match names like "report_txt".
files <- list.files(path = "dataGRI", pattern = "\\.txt$", full.names = TRUE)

# Clean one report file, count occurrences of the stage word lists, and
# append a one-row summary to dataGRI/WordCount.csv.
#
# @param file Full path to a single .txt report (e.g. an element of `files`).
# @return Invisibly, the one-row summary data frame that was appended.
clean_text <- function(file) {

  # Company/report name = file name without directory or extension.
  company <- tools::file_path_sans_ext(basename(file))

  txt_raw <- readLines(file, warn = FALSE)

  # Drop 1-2 letter words, then collapse leading/trailing/repeated spaces.
  txt_format1 <- gsub(" *\\b[[:alpha:]]{1,2}\\b *", " ", txt_raw)
  txt_format1.5 <- gsub("^ +| +$|( ) +", "\\1", txt_format1)

  # Recombine all lines into one string, split on spaces to get words,
  # and keep only words longer than 3 characters.
  words <- unlist(str_split(str_c(txt_format1.5, collapse = " "), " "))
  words <- words[str_length(words) > 3]

  # Single cleaned document string used for pattern counting below.
  cleaned <- str_c(words, collapse = " ")

  # Total word count. length() gives the numeric count directly;
  # summary() on a character vector only reports Length/Class/Mode.
  wcount <- c(wordcount = length(words))

  # Stage word lists. NOTE(review): reading these once, outside the
  # function, would avoid re-reading them for each of the 600 files;
  # kept here so the function is self-contained.
  stage_words <- lapply(1:5, function(i) {
    readLines(file.path("wordlistGRI", paste0("stage", i, "r.txt")),
              warn = FALSE)
  })

  # Per-word substring counts within the cleaned document, one named
  # vector per stage (names = the stage words themselves).
  level_counts <- lapply(stage_words, function(w) {
    sapply(w, str_count, string = cleaned)
  })

  # Flattened per-word counts for the report row.
  wordcountresult <- unlist(level_counts)

  # Per-stage totals.
  stagesvec <- vapply(level_counts, sum, numeric(1))
  names(stagesvec) <- paste0("Stage", 1:5)

  companyvec <- c(company = company)

  # One row: company name, every per-word count, total word count, and
  # the five stage totals.
  reportresult <- c(companyvec, wordcountresult, wcount, stagesvec)
  rrdf <- data.frame(t(reportresult))

  out_path <- "dataGRI/WordCount.csv"
  if (file.exists(out_path)) {
    # Working file exists: append this row to it. No temp-file round
    # trip is needed; rbind() coerces the column types consistently.
    existing <- read.csv(out_path)
    write.csv(rbind(existing, rrdf), out_path, row.names = FALSE)
  } else {
    # No working file yet: create it from this first row.
    write.csv(rrdf, out_path, row.names = FALSE)
  }

  invisible(rrdf)
}

 purrr::map(list.files("dataGRI"), ~clean_text())
jamesey
  • 1
  • 3
  • Have a look at this: https://stackoverflow.com/questions/13441204/using-lapply-and-read-csv-on-multiple-files-in-r. Once you have a vector of files, you can use `lapply` to loop across each file – Jonny Phelps Jul 28 '20 at 07:56
  • 4
    That is way too much code to review. Please provide a minimal example; this increases the chance of getting help. I would say you need a for loop. – MKR Jul 28 '20 at 07:57
  • there seem to be a syntax error on the last line, should be : `purrr::map(list.files("dataGRI"), clean_text)` – py_b Jul 28 '20 at 08:16

0 Answers