I was able to extract the tables from the document with the following code:
library(RDCOMClient)
###################################################
#### Step 1 : Define the input and output paths ####
###################################################
# Source PDF to convert and the Word document that will be written.
path_PDF <- "C:\\Stackoverflow83.pdf"
path_Word <- "C:\\Stackoverflow83.docx"

################################################################
#### Step 2 : Convert PDF to word document with OCR of Word ####
################################################################
# Start Word through COM, keep its window visible and turn alerts off.
wordApp <- COMCreate("Word.Application")
wordApp[["Visible"]] <- TRUE
wordApp[["DisplayAlerts"]] <- FALSE

# Opening the PDF lets Word convert it to an editable document;
# ConfirmConversions = FALSE avoids the interactive conversion prompt.
doc <- wordApp[["Documents"]]$Open(normalizePath(path_PDF),
                                   ConfirmConversions = FALSE)

# Save the converted document under the .docx path.
doc$SaveAs2(path_Word)
########################################################
#### Step 3 : Extract the tables of the word document ####
########################################################
# Read every table of `doc` into a character matrix; `list_Table` holds one
# matrix per table, in document order. The cell text is stored exactly as
# Word returns it (each cell ends with the "\r\a" end-of-cell marker).
nb_Tables <- doc$tables()$count()
list_Table <- vector("list", nb_Tables)

# seq_len() instead of 1:n so that a document without tables (or an empty
# table) produces no iterations rather than the sequence c(1, 0).
for (l in seq_len(nb_Tables)) {
  print(l) # progress indicator: index of the table being read

  # Resolve the table once instead of once per cell (each access is a COM call).
  tbl <- doc$tables(l)
  nb_Row <- tbl$Rows()$Count()
  nb_Col <- tbl$Columns()$Count()
  mat_Temp <- matrix(NA, nrow = nb_Row, ncol = nb_Col)

  for (i in seq_len(nb_Row)) {
    for (j in seq_len(nb_Col)) {
      mat_Temp[i, j] <- tbl$cell(i, j)$range()$text()
    }
  }

  list_Table[[l]] <- mat_Temp
}

list_Table
[[1]]
[,1] [,2] [,3]
[1,] "Subjects \r\a" "Body Height Range \r\a" "Arm span Range\r\a"
[2,] " \r\a" "(Mean±SD) \r\a" "(Mean±SD)\r\a"
[3,] "Male \r\a" "161.6-201.5 \r\a" "156.0-206.0\r\a"
[4,] " \r\a" "(183.21±7.06) \r\a" "(185.71±8.17)\r\a"
[5,] "Female \r\a" " 156.9-182.2 \r\a" "152.0-184.7\r\a"
[6,] " \r\a" "(168.37±5.27) \r\a" "(168.13±6.58)\r\a"
[[2]]
[,1] [,2] [,3] [,4]
[1,] "Subjects\t\r\a" "Correlation\t\r\a" "95%\tconfidence\t\r\a" "Significance\r\a"
[2,] "\t\r\a" "Coefficient\t\r\a" "interval\t\r\a" "p-value\r\a"
[3,] "Male \r\a" "0.861 \r\a" "0.817–0.900 \r\a" "<0.000\r\a"
[4,] "Female \r\a" "0.809 \r\a" "0.735–0.866 \r\a" "<0.000\r\a"
[[3]]
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "Subjects \r\a" "Regression \r\a" "Standard \r\a" "R-square \r\a" "t-value \r\a" "p-value \r\a"
[2,] " \r\a" "Coefficient \r\a" "Error (SE) \r\a" "(%) \r\a" "\r\a" "\r\a"
[3,] "Male \r\a" "0.861 \r\a" "0.033 \r\a" "74.2 \r\a" "22.499 \r\a" "0.000\r\a"
[4,] "Female \r\a" "0.809 \r\a" "0.046 \r\a" "65.4 \r\a" "14.079 \r\a" "0.000\r\a"
[[4]]
[,1] [,2] [,3] [,4]
[1,] " \r\a" "Male \r\a" "Female \r\a" "\r\a"
[2,] "Belgium \r\a" "179.5 \r\a" "166.3 \r\a" "DINBelg 2005\r\a"
[3,] "Czech Republic \r\a" "180.3 \r\a" "167.2 \r\a" "Vignerová et al. 2006\r\a"
[4,] "Croatia \r\a" "180.5 \r\a" "166.3 \r\a" "Juresa et al. 2012\r\a"
[5,] "England \r\a" "177.6 \r\a" "163.4 \r\a" "NHS 2009\r\a"
[6,] "Finland \r\a" "178.4 \r\a" "165.2 \r\a" "Peltonen et al. 2008\r\a"
[7,] "France \r\a" "177.8 \r\a" "164.2 \r\a" "InVS 2007\r\a"
[8,] "Hungary \r\a" "177.5 \r\a" "164.4 \r\a" "Bodzsár & Zsákai 2008\r\a"
[9,] "Ireland \r\a" "176.3 \r\a" "163.3 \r\a" "Sproston & Mindell 2006\r\a"
[10,] "Island \r\a" "180.6 \r\a" "167.2 \r\a" "Dagbjartsson et al. 2000\r\a"
[11,] "Italy \r\a" "176.5 \r\a" "162.6 \r\a" "Cacciari et al. 2006\r\a"
[12,] "Latvia \r\a" "177.6 \r\a" "167.1 \r\a" "Gerhards 2005\r\a"
[13,] "Lithuania \r\a" "181.3 \r\a" "167.5 \r\a" "Tutkuviene 2005\r\a"
[14,] "Montenegro \r\a" "183.2 \r\a" "168.3 \r\a" "Present study\r\a"
[15,] "Netherland \r\a" "183.8 \r\a" "170.7 \r\a" "TNO 2010\r\a"
[16,] "Poland \r\a" "178.5 \r\a" "165.1 \r\a" "Kulaga et al. 2010\r\a"
[17,] "Russia \r\a" "177.2 \r\a" "164.1 \r\a" "Brainerd 2006\r\a"
[18,] "Slovenia \r\a" "180.3 \r\a" "167.4 \r\a" "Starc & Strel 2011\r\a"
[19,] "Serbia \r\a" "180.9 \r\a" "167.3 \r\a" "J. Grozdanov, per. communication 2011\r\a"
[20,] "Spain \r\a" "177.3 \r\a" "164.0 \r\a" "Carrascosa Lezcano et al. 2008\r\a"
[21,] "Sweden \r\a" "180.4 \r\a" "167.0 \r\a" "Werner & Bodin 2006\r\a"
[22,] "Turkey \r\a" "173.6 \r\a" "161.9 \r\a" "Iseri & Arslan 2009\r\a"
[23,] "Wales \r\a" "177.0 \r\a" "162.0 \r\a" "Statistics for Wales 2010\r\a"
[[5]]
[,1] [,2] [,3] [,4]
[1,] "Australia \r\a" "174.8 \r\a" "163.4 \r\a" "ABS 1995\r\a"
[2,] "Argentina \r\a" "174.5 \r\a" "161.0 \r\a" "Del Pino et al. 2005\r\a"
[3,] "Bahrain \r\a" "171.0 \r\a" "156.6 \r\a" "Gharib & Shah 2009\r\a"
[4,] "Bolivia \r\a" "166.6 \r\a" "155.4 \r\a" "Baya Botti et al. 2009\r\a"
[5,] "Brazil \r\a" "170.7 \r\a" "158.8 \r\a" "IBGE 2010\r\a"
[6,] "Cameroon \r\a" "170.6 \r\a" "161.3 \r\a" "Kamadjeu et al. 2006\r\a"
[7,] "China \r\a" "173.4 \r\a" "161.2 \r\a" "Ji & Chen 2005\r\a"
[8,] "Egypt \r\a" "170.3 \r\a" "158.9 \r\a" "El-Zanaty & Way 2008\r\a"
[9,] "Ghana \r\a" "170.0 \r\a" "158.0 \r\a" "Schulz 2003\r\a"
[10,] "India \r\a" "165.2 \r\a" "152.0 \r\a" "Mamidi et al. 2011\r\a"
[11,] "Iran \r\a" "173.4 \r\a" "159.9 \r\a" "Haghdoost et al. 2008\r\a"
[12,] "Ivory Coast \r\a" "171.0 \r\a" "159.0 \r\a" "Schulz 2003\r\a"
[13,] "Malaysia \r\a" "166.3 \r\a" "154.7 \r\a" "Lim et al. 2000\r\a"
[14,] "Mexico \r\a" "168.0 \r\a" "155.3 \r\a" "Del Río Navarro et al. 2007\r\a"
[15,] "Mongolia \r\a" "168.4 \r\a" "157.7 \r\a" "WHO 2007\r\a"
[16,] "New Zealand \r\a" "177.0 \r\a" "165.0 \r\a" "OSHS 1997\r\a"
[17,] "Nigeria \r\a" "167.2 \r\a" "160.3 \r\a" "Ter Goon et al. 2011\r\a"
[18,] "Qatar \r\a" "170.8 \r\a" "161.1 \r\a" "Bener & Kamal 2005\r\a"
[19,] "Saudi Arabia \r\a" "168.9 \r\a" "156.3 \r\a" "El Mouzan et al. 2010\r\a"
[20,] "South Africa \r\a" "168.0 \r\a" "159.0 \r\a" "OrcMacro 2007\r\a"
[21,] "South Korea \r\a" "174.2 \r\a" "161.3 \r\a" "Kim et al. 2008\r\a"
[22,] "Sri Lanka \r\a" "165.6 \r\a" "154.0 \r\a" "Ranasinghe et al 2011\r\a"
[23,] "United Arab Emirates \r\a" "173.4 \r\a" "156.4 \r\a" "Abdulrazzaq et al. 2008\r\a"
[24,] "United States of America \r\a" "176.3 \r\a" "162.2 \r\a" "McDowell et al. 2008\r\a"
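The result is pretty good from my point of view. The trailing "\r\a" in each cell is the end-of-cell marker that Word appends to the cell text; it can be removed afterwards, for example with gsub("[\r\a]", "", x).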