I have 600 txt files. Each one needs to be cleaned and analyzed, and a one-line summary of it appended to a csv file. I am able to do this for one file at a time by copying and pasting the name of each txt file into my code. I would like to automate this so that my code finds all 600 txt files in the directory and then runs the same steps on each of them. Ultimately I should get a csv report with 600 lines, one summarizing the data of each txt file.
I'm definitely a newbie, so it's cool if you talk down if you have a suggestion. I think the help I need comes in the first 3 lines, and the final line. I've posted all of my code just in case.
# Full paths of every report to process. The dot is escaped so that only names
# ending in a literal ".txt" match (an unescaped "." matches any character).
files <- list.files(path = "dataGRI", pattern = "\\.txt$", full.names = TRUE)
#' Clean one report, count the stage words in it, and append a summary row
#' to the CSV report.
#'
#' @param file Path to a single .txt report.
#' @param wordlist_dir Directory holding stage1r.txt ... stage5r.txt, each with
#'   one search pattern per line (patterns are passed to stringr::str_count,
#'   so they are interpreted as regular expressions).
#' @param out_file CSV file that receives one row per processed report. It is
#'   created, with a header, if it does not exist yet.
#' @return The one-row data frame that was written, invisibly.
clean_text <- function(file,
                       wordlist_dir = "wordlistGRI",
                       out_file = "dataGRI/WordCount.csv") {
  txt_raw <- readLines(file)

  # Drop one- and two-letter words, then trim each line and squeeze runs of
  # spaces down to a single space.
  no_short <- gsub(" *\\b[[:alpha:]]{1,2}\\b *", " ", txt_raw)
  trimmed <- gsub("^ +| +$|( ) +", "\\1", no_short)

  # Flatten the whole document into one vector of words.
  words <- strsplit(paste(trimmed, collapse = " "), " ", fixed = TRUE)[[1]]
  # Keep only words longer than three characters (this also drops empty
  # strings).
  # NOTE(review): the original comment said "shorter than 3", but the filter
  # has always been `> 3`; confirm which threshold is intended.
  words <- words[nchar(words) > 3]
  cleaned <- paste(words, collapse = " ")

  # For each stage, count how often every pattern occurs in the cleaned text.
  # vapply() keeps the patterns as names and always returns an integer vector,
  # even for an empty word list.
  stage_ids <- 1:5
  stage_counts <- lapply(stage_ids, function(i) {
    patterns <- readLines(file.path(wordlist_dir, paste0("stage", i, "r.txt")))
    vapply(
      patterns,
      function(p) stringr::str_count(cleaned, p),
      integer(1)
    )
  })
  word_counts <- unlist(stage_counts)

  stage_totals <- vapply(stage_counts, sum, numeric(1))
  names(stage_totals) <- paste0("Stage", stage_ids)

  # The report is identified by its file name without directory or extension.
  # NOTE(review): assumes the file name is the company name; adjust if the
  # company should come from somewhere else.
  company <- tools::file_path_sans_ext(basename(file))

  # One row: company, per-pattern counts, total word count, per-stage totals.
  # "Length" is kept as the word-count column name so the header matches
  # reports produced by the earlier version of this script.
  counts <- c(word_counts, Length = length(words), stage_totals)
  report_row <- data.frame(
    company = company,
    t(counts),
    stringsAsFactors = FALSE
  )

  # Append in CSV format; write the header only when creating the file.
  # Rows are appended by position, so the word lists must not change between
  # runs that share the same out_file.
  is_new <- !file.exists(out_file)
  utils::write.table(
    report_row,
    out_file,
    sep = ",",
    qmethod = "double",
    row.names = FALSE,
    col.names = is_new,
    append = !is_new
  )

  invisible(report_row)
}

# Run the analysis on every report found above. walk() is used instead of
# map() because clean_text() is called for its side effect (writing the CSV).
purrr::walk(files, clean_text)