I have a list of PDFs and a list of keywords. Each keyword should be searched for in each PDF, returning TRUE if the keyword exists in that PDF.
I tried two nested for loops, but the result contains only the last keyword and is also incorrect:
every PDF is reported as TRUE, even though only one PDF actually contains that word.
library(pdftools)
library(stringr)
library(tm)
library(filesstrings)
library(RODBC)
setwd("C:/RProject/ReadPDF/InputFiles/")
SelectFirstKeyword <- list("new formula" , "new research", "morning up")
ID <- list.files("C:/RProject/ReadPDF/InputFiles/", full.names = T)
ID_ <- ID[ID != ""]
files <- ID_
for(i in 1:length(files)){
for(j in 1:length(SelectFirstKeyword)){
filename <- files[i]
read <- readPDF(control = list(text = "-layout"))
mystring <- Corpus(URISource(filename), readerControl = list(reader = read))
lower_string <- tolower(mystring)
CleanData <- gsub("\n", " ",lower_string)
second_string <- tolower(SelectFirstKeyword[j])
print(second_string)
mystring <- paste(CleanData,sep=" ")
mystring_vector <- str_split(CleanData, "!")[[1]]
FirstMatch <- second_string
Match1 <- grepl(FirstMatch,mystring_vector[1])
mystring <- paste(CleanData,sep=" ")
mystring_vector <- str_split(CleanData, "!")[[1]]
FirstMatch <- second_string
Match1 <- grepl(FirstMatch,mystring_vector[1])
FinalOutput <- paste(pdf_list,Outid, Match1)
View(FinalOutput)
if (Match1 == TRUE)
{
DATA1 <- paste(ID = pdf_list, Outid = FirstMatch , Keywordinnote = Match1)
}
}
}
View(DATA1)
### I shall email you the pdf files
Expected output is:
ID outid keywordinnote
1 news TRUE
2 new formula TRUE