The following solution works only on Windows. Starting from the image above, the code below extracts the table:
library(RDCOMClient)
library(magick)
################################################
#### Step 1 : We convert the image to a PDF ####
################################################
# File locations used by the whole script: the PDF produced in this step,
# the source image, and the Word document written in Step 2.
path_PDF <- "C:\\temp.pdf"
path_PNG <- "C:\\lP3hw.png"
path_Word <- "C:\\temp.docx"

# Read the image BEFORE opening the PDF device: if the PNG is missing or
# unreadable, the script stops here without leaving a graphics device open
# or an empty PDF on disk.
im <- image_read(path_PNG)

# Render the image onto a 16 x 6 inch PDF page.
pdf(path_PDF, width = 16, height = 6)
plot(im)
# Horizontal black rules drawn over the image at fixed y positions
# (presumably to help Word's converter detect the table rows -- the
# positions are specific to this image; adjust for other inputs).
abline(h = c(50, 100, 130, 260), col = "black")
dev.off()
####################################################################
#### Step 2 : We use the OCR of Word to convert the PDF to Word ####
####################################################################
# Start a Word instance through COM, keep its window visible and turn
# alert dialogs off.
wordApp <- COMCreate("Word.Application")
wordApp[["DisplayAlerts"]] <- FALSE
wordApp[["Visible"]] <- TRUE

# Open the PDF through Word's document collection; ConfirmConversions = FALSE
# skips the file-conversion confirmation dialog.
pdf_full_path <- normalizePath(path_PDF)
word_documents <- wordApp[["Documents"]]
doc <- word_documents$Open(pdf_full_path, ConfirmConversions = FALSE)

# Save the converted document to the Word path defined in Step 1.
doc$SaveAs2(path_Word)
##############################################################
#### Step 3 : We extract the table from the Word document ####
##############################################################
# Copy every cell of the first table of the converted document into a
# character matrix, one matrix entry per (row, column) position.
# The table object is resolved once instead of once per cell, which avoids
# repeated COM round-trips inside the loop.
first_table <- doc$tables(1)
nb_Row <- first_table$Rows()$Count()
nb_Col <- first_table$Columns()$Count()

# NA_character_ keeps the matrix of type character even when no cell can
# be read.
mat_Temp <- matrix(NA_character_, nrow = nb_Row, ncol = nb_Col)

# seq_len() yields an empty sequence for a zero count, whereas `1 : 0`
# would iterate over 1 and 0 and index outside the matrix.
for (i in seq_len(nb_Row)) {
  for (j in seq_len(nb_Col)) {
    # Some (row, column) positions cannot be accessed (e.g. merged cells);
    # those are left as NA instead of aborting the extraction.
    mat_Temp[i, j] <- tryCatch(
      first_table$cell(i, j)$range()$text(),
      error = function(e) NA_character_
    )
  }
}

# Release Word once the data has been copied into R: close the document
# without saving again and quit the application, so no WINWORD process
# keeps the output files locked and the script can be rerun.
invisible(doc$Close(FALSE))
invisible(wordApp$Quit())

mat_Temp
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "\r\a" "/\r\a" NA NA NA NA
[2,] "\r\a" "/\r\a" "1 sand, 2\tmg/kg\tmglkg\tpH\tpH\tdS/m sandy loam, 3 loam, 4 loamy clay, 5 clay\r\a" NA NA NA
[3,] "30 May 2018\r\a" "0-10\r\a" "\t520\t23.00\r\a" "Colwell\r\a" "7_09\r\a" "6_70\r\a"
[4,] "30 May 2018\r\a" "\t10-60\t50\r\a" "9.0\r\a" "\r\a" "8_50\r\a" "7_80\r\a"
[,7] [,8]
[1,] NA NA
[2,] NA NA
[3,] "0.1\r\a" "0.93\r\a"
[4,] "0.1\r\a" "3.3\r\a"