I was able to highlight keywords in a PDF with the following code. There are four steps:
Save the Wikipedia page as a PDF;
Convert the PDF to a Word document using Microsoft Word (which applies its own OCR);
Highlight the keywords in the Word document;
Save the Word document as a PDF.
library(RDCOMClient)
library(DescTools)
library(pagedown)
# ---- Step 1: Save the Wikipedia page as a PDF ----
# Both file locations are declared up front so the PDF path is written once
# and reused as the output of chrome_print() (from the pagedown package).
path_PDF <- "C:\\Text_PDF_Cat.pdf"
path_Word <- "C:\\Text_PDF_Cat.docx"
chrome_print(
  input = "https://en.wikipedia.org/wiki/Cat",
  output = path_PDF
)
# ---- Step 2: Convert the PDF to a Word document (Word does the conversion) ----
# Start Word through COM, keep its window visible and switch alerts off.
wordApp <- COMCreate("Word.Application")
wordApp[["Visible"]] <- TRUE
wordApp[["DisplayAlerts"]] <- FALSE

# Open the PDF in Word with ConfirmConversions = FALSE, then save the opened
# document to the .docx path.
pdf_Full_Path <- normalizePath(path_PDF)
word_Documents <- wordApp[["Documents"]]
doc <- word_Documents$Open(pdf_Full_Path, ConfirmConversions = FALSE)
doc$SaveAs2(path_Word)

# Selection object of the Word application; the highlighting step drives it.
doc_Selection <- wordApp$Selection()
######################################################
#### Step 3 : Highlight keywords in word document ####
######################################################
# Send the selection to the beginning of the document.
#
# doc_Selection : selection object exposing a HomeKey() method.
#
# Returns whatever HomeKey() returns.
move_To_Beginning_Doc <- function(doc_Selection) {
  story_Unit <- wdConst$wdStory # wdConst is supplied by DescTools
  doc_Selection$HomeKey(Unit = story_Unit)
}
# Highlight occurrences of each keyword in the Word document.
#
# doc                : Word document object. Kept for interface compatibility;
#                      the search itself only uses doc_Selection.
# doc_Selection      : selection object that is moved and searched.
# words_To_Highlight : vector of keywords to look for (case-insensitive).
# colorIndex         : value assigned to HighlightColorIndex on each match.
# nb_Max_Word        : upper bound on the occurrences highlighted per keyword.
#
# Called for its side effects; returns NULL invisibly.
highlight_Text_Regex_Word <- function(doc,
                                      doc_Selection,
                                      words_To_Highlight,
                                      colorIndex = 7,
                                      nb_Max_Word = 100) {
  for (keyword in words_To_Highlight) {
    # Each keyword is searched from the top of the document.
    move_To_Beginning_Doc(doc_Selection)

    # seq_len() gives an empty loop when nb_Max_Word is 0; 1:0 would run twice.
    for (occurrence in seq_len(nb_Max_Word)) {
      found <- doc_Selection$Find()$Execute(FindText = keyword,
                                            MatchCase = FALSE)

      # Stop as soon as a search fails: without this check the loop keeps
      # issuing COM calls up to nb_Max_Word times and re-applies the highlight
      # to whatever is still selected.
      if (!isTRUE(as.logical(found))) {
        break
      }

      match_Range <- doc_Selection$Range()
      match_Range[["HighlightColorIndex"]] <- colorIndex
    }
  }

  invisible(NULL)
}
# Highlight the chosen keywords in the opened document.
keywords_To_Highlight <- c("cat", "domestic", "quick")
highlight_Text_Regex_Word(
  doc = doc,
  doc_Selection = doc_Selection,
  words_To_Highlight = keywords_To_Highlight,
  colorIndex = 7,
  nb_Max_Word = 100
)
# ---- Step 4: Convert the Word document to PDF ----
path_PDF_Highlighted <- "C:\\Text_PDF_Cat_Highlighted.pdf"

# Save through the `doc` handle rather than wordApp[["ActiveDocument"]]: Word
# is visible, so the active document can change, and `doc` is the document
# that was actually highlighted.
doc$SaveAs(path_PDF_Highlighted, FileFormat = 17) # FileFormat = 17 saves as .PDF
doc$Close()
wordApp$Quit() # quit wordApp