1

I wrote some code in R that extracts the tables I am interested in from a PDF, but there must be a better way. Importing the data from the PDF is not the problem; I am looking for a BETTER way than the following to extract the tables I am interested in.

# URL of the PDF article whose tables are wanted.
df_st <- "http://www.drustvo-antropologov.si/AN/PDF/2012_2/Anthropological_Notebooks_XVIII_2_Bjelica.pdf"

# NOTE(review): extract_tables() is presumably tabulizer::extract_tables();
# the package has to be attached before this line runs.
df_st_table <- extract_tables(df_st)

# Flatten the extraction result and lay it out again as a 195-row data frame.
df_str <- data.frame(matrix(unlist(df_st_table), nrow = 195, byrow = TRUE))

# Rebuild a data frame from a run of rows of `df`.
#
# df:   data frame to take the rows from.
# rows: integer vector of row indices to keep.
#
# Returns a data frame with length(rows) rows, built by flattening the
# selected rows and refilling a matrix row by row. Deriving the row count
# from `rows` removes the hand-maintained nrow = 24 / nrow = 26 constants.
rebuild_rows <- function(df, rows) {
  data.frame(matrix(unlist(df[rows, ]), nrow = length(rows), byrow = TRUE))
}

# The four row ranges of interest inside df_str.
df_str_a <- rebuild_rows(df_str, 29:52)
df_str_b <- rebuild_rows(df_str, 53:76)
df_str_c <- rebuild_rows(df_str, 101:126)
df_str_d <- rebuild_rows(df_str, 127:152)

...and then I merge them all. Too long and inelegant.

Brian Tompsett - 汤莱恩
  • 5,753
  • 72
  • 57
  • 129
Helena
  • 87
  • 9
  • 1
    Possible duplicate of [Recognize PDF table using R](https://stackoverflow.com/questions/44141160/recognize-pdf-table-using-r) – Dror Bogin May 15 '18 at 12:58
  • 2
    @Dhiraj They are already using that package: `tabulizer::extract_tables` – zx8754 May 15 '18 at 13:05
  • @zx8754 yeah just realized, my bad! – Dhiraj May 15 '18 at 13:07
  • What is the expected output? – zx8754 May 15 '18 at 13:14
  • Have a look at this [post](https://medium.com/@CharlesBordet/how-to-extract-and-clean-data-from-pdf-files-in-r-da11964e252e). It is showing how to use 2 packages for pdf-extraction (pdftools and tm) – SeGa May 15 '18 at 13:24
  • Sorry, I forgot to mention that I am using both tabulizer and tm. I guess there is something I am missing, though. Anyway, the code I have copied here actually works; nevertheless, it is awful. I am trying to find something more agile. – Helena May 15 '18 at 13:31

1 Answers1

0

I was able to extract the tables from the document with the following code:

library(RDCOMClient)

#######################################################
#### Step 1 : Paths of the input PDF / output docx ####
#######################################################

path_PDF <- "C:\\Stackoverflow83.pdf"
path_Word <- "C:\\Stackoverflow83.docx"

################################################################
#### Step 2 : Convert PDF to word document with OCR of Word ####
################################################################
# Drive Word through COM: open the PDF in Word and save the converted
# document as .docx.
wordApp <- COMCreate("Word.Application")
wordApp[["Visible"]] <- TRUE
wordApp[["DisplayAlerts"]] <- FALSE

doc <- wordApp[["Documents"]]$Open(normalizePath(path_PDF),
                                   ConfirmConversions = FALSE)

doc$SaveAs2(path_Word)
# NOTE(review): the Word instance and the document are left open after this
# script finishes; close them once the tables have been read if that is not
# wanted.

###################################################################
#### Step 3 : Read every table of the word document into a list ####
###################################################################
nb_Tables <- doc$tables()$count()

# One character matrix per table, in document order.
list_Table <- vector("list", nb_Tables)

# seq_len() keeps the loops from running when a count is 0
# (1 : 0 would iterate over 1 and 0).
for (idx_Table in seq_len(nb_Tables)) {
  print(idx_Table)

  # Fetch the table object once instead of on every cell access.
  tbl <- doc$tables(idx_Table)
  nb_Row <- tbl$Rows()$Count()
  nb_Col <- tbl$Columns()$Count()
  mat_Temp <- matrix(NA, nrow = nb_Row, ncol = nb_Col)

  for (i in seq_len(nb_Row)) {
    for (j in seq_len(nb_Col)) {
      # The text is stored exactly as Word returns it, including the
      # trailing "\r\a" seen in the printed result.
      mat_Temp[i, j] <- tbl$cell(i, j)$range()$text()
    }
  }

  list_Table[[idx_Table]] <- mat_Temp
}

list_Table

[[1]]
     [,1]            [,2]                      [,3]                
[1,] "Subjects \r\a" "Body Height Range  \r\a" "Arm span Range\r\a"
[2,] " \r\a"         "(Mean±SD) \r\a"          "(Mean±SD)\r\a"     
[3,] "Male \r\a"     "161.6-201.5 \r\a"        "156.0-206.0\r\a"   
[4,] " \r\a"         "(183.21±7.06) \r\a"      "(185.71±8.17)\r\a" 
[5,] "Female \r\a"   " 156.9-182.2 \r\a"       "152.0-184.7\r\a"   
[6,] " \r\a"         "(168.37±5.27) \r\a"      "(168.13±6.58)\r\a" 

[[2]]
     [,1]            [,2]               [,3]                  [,4]              
[1,] "Subjects\t\r\a" "Correlation\t\r\a" "95%\tconfidence\t\r\a" "Significance\r\a"
[2,] "\t\r\a"         "Coefficient\t\r\a" "interval\t\r\a"       "p-value\r\a"     
[3,] "Male \r\a"     "0.861 \r\a"       "0.817–0.900 \r\a"    "<0.000\r\a"      
[4,] "Female \r\a"   "0.809 \r\a"       "0.735–0.866 \r\a"    "<0.000\r\a"      

[[3]]
     [,1]            [,2]               [,3]              [,4]             [,5]           [,6]          
[1,] "Subjects \r\a" "Regression \r\a"  "Standard \r\a"   "R-square  \r\a" "t-value \r\a" "p-value \r\a"
[2,] " \r\a"         "Coefficient \r\a" "Error (SE) \r\a" "(%) \r\a"       "\r\a"         "\r\a"        
[3,] "Male \r\a"     "0.861 \r\a"       "0.033 \r\a"      "74.2 \r\a"      "22.499 \r\a"  "0.000\r\a"   
[4,] "Female \r\a"   "0.809 \r\a"       "0.046 \r\a"      "65.4 \r\a"      "14.079 \r\a"  "0.000\r\a"   

[[4]]
      [,1]                  [,2]         [,3]          [,4]                                       
 [1,] " \r\a"               "Male \r\a"  "Female \r\a" "\r\a"                                     
 [2,] "Belgium \r\a"        "179.5 \r\a" "166.3 \r\a"  "DINBelg 2005\r\a"                         
 [3,] "Czech Republic \r\a" "180.3 \r\a" "167.2 \r\a"  "Vignerová et al. 2006\r\a"                
 [4,] "Croatia \r\a"        "180.5 \r\a" "166.3 \r\a"  "Juresa et al. 2012\r\a"                   
 [5,] "England \r\a"        "177.6 \r\a" "163.4 \r\a"  "NHS 2009\r\a"                             
 [6,] "Finland \r\a"        "178.4 \r\a" "165.2 \r\a"  "Peltonen et al. 2008\r\a"                 
 [7,] "France \r\a"         "177.8 \r\a" "164.2 \r\a"  "InVS 2007\r\a"                            
 [8,] "Hungary \r\a"        "177.5 \r\a" "164.4 \r\a"  "Bodzsár & Zsákai 2008\r\a"                
 [9,] "Ireland \r\a"        "176.3 \r\a" "163.3 \r\a"  "Sproston & Mindell 2006\r\a"              
[10,] "Island \r\a"         "180.6 \r\a" "167.2 \r\a"  "Dagbjartsson et al. 2000\r\a"             
[11,] "Italy \r\a"          "176.5 \r\a" "162.6 \r\a"  "Cacciari et al. 2006\r\a"                 
[12,] "Latvia \r\a"         "177.6 \r\a" "167.1 \r\a"  "Gerhards 2005\r\a"                        
[13,] "Lithuania \r\a"      "181.3 \r\a" "167.5 \r\a"  "Tutkuviene 2005\r\a"                      
[14,] "Montenegro \r\a"     "183.2 \r\a" "168.3 \r\a"  "Present study\r\a"                        
[15,] "Netherland \r\a"     "183.8 \r\a" "170.7 \r\a"  "TNO 2010\r\a"                             
[16,] "Poland \r\a"         "178.5 \r\a" "165.1 \r\a"  "Kulaga et al. 2010\r\a"                   
[17,] "Russia \r\a"         "177.2 \r\a" "164.1 \r\a"  "Brainerd 2006\r\a"                        
[18,] "Slovenia \r\a"       "180.3 \r\a" "167.4 \r\a"  "Starc & Strel 2011\r\a"                   
[19,] "Serbia \r\a"         "180.9 \r\a" "167.3 \r\a"  "J. Grozdanov, per. communication 2011\r\a"
[20,] "Spain \r\a"          "177.3 \r\a" "164.0 \r\a"  "Carrascosa Lezcano et al. 2008\r\a"       
[21,] "Sweden \r\a"         "180.4 \r\a" "167.0 \r\a"  "Werner & Bodin 2006\r\a"                  
[22,] "Turkey \r\a"         "173.6 \r\a" "161.9 \r\a"  "Iseri & Arslan 2009\r\a"                  
[23,] "Wales \r\a"          "177.0 \r\a" "162.0 \r\a"  "Statistics for Wales 2010\r\a"            

[[5]]
      [,1]                            [,2]          [,3]         [,4]                             
 [1,] "Australia \r\a"                "174.8 \r\a"  "163.4 \r\a" "ABS 1995\r\a"                   
 [2,] "Argentina \r\a"                "174.5 \r\a"  "161.0 \r\a" "Del Pino et al. 2005\r\a"       
 [3,] "Bahrain \r\a"                  "171.0 \r\a"  "156.6 \r\a" "Gharib & Shah 2009\r\a"         
 [4,] "Bolivia \r\a"                  "166.6 \r\a"  "155.4 \r\a" "Baya Botti et al. 2009\r\a"     
 [5,] "Brazil \r\a"                   "170.7 \r\a"  "158.8 \r\a" "IBGE 2010\r\a"                  
 [6,] "Cameroon \r\a"                 "170.6 \r\a"  "161.3 \r\a" "Kamadjeu et al. 2006\r\a"       
 [7,] "China \r\a"                    "173.4 \r\a"  "161.2 \r\a" "Ji & Chen 2005\r\a"             
 [8,] "Egypt \r\a"                    "170.3 \r\a"  "158.9 \r\a" "El-Zanaty & Way 2008\r\a"       
 [9,] "Ghana \r\a"                    "170.0 \r\a"  "158.0 \r\a" "Schulz 2003\r\a"                
[10,] "India \r\a"                    "165.2 \r\a"  "152.0 \r\a" "Mamidi et al. 2011\r\a"         
[11,] "Iran \r\a"                     "173.4 \r\a"  "159.9 \r\a" "Haghdoost et al. 2008\r\a"      
[12,] "Ivory Coast \r\a"              "171.0 \r\a"  "159.0 \r\a" "Schulz 2003\r\a"                
[13,] "Malaysia \r\a"                 "166.3 \r\a"  "154.7 \r\a" "Lim et al. 2000\r\a"            
[14,] "Mexico \r\a"                   "168.0 \r\a"  "155.3 \r\a" "Del Río Navarro et al. 2007\r\a"
[15,] "Mongolia \r\a"                 "168.4 \r\a"  "157.7 \r\a" "WHO 2007\r\a"                   
[16,] "New Zealand \r\a"              "177.0  \r\a" "165.0 \r\a" "OSHS 1997\r\a"                  
[17,] "Nigeria \r\a"                  "167.2 \r\a"  "160.3 \r\a" "Ter Goon et al. 2011\r\a"       
[18,] "Qatar \r\a"                    "170.8 \r\a"  "161.1 \r\a" "Bener & Kamal 2005\r\a"         
[19,] "Saudi Arabia \r\a"             "168.9 \r\a"  "156.3 \r\a" "El Mouzan et al. 2010\r\a"      
[20,] "South Africa \r\a"             "168.0 \r\a"  "159.0 \r\a" "OrcMacro 2007\r\a"              
[21,] "South Korea \r\a"              "174.2 \r\a"  "161.3 \r\a" "Kim et al. 2008\r\a"            
[22,] "Sri Lanka \r\a"                "165.6 \r\a"  "154.0 \r\a" "Ranasinghe et al 2011\r\a"      
[23,] "United Arab Emirates \r\a"     "173.4 \r\a"  "156.4 \r\a" "Abdulrazzaq et al. 2008\r\a"    
[24,] "United States of America \r\a" "176.3 \r\a"  "162.2 \r\a" "McDowell et al. 2008\r\a"   

The result is pretty good from my point of view.
Emmanuel Hamel
  • 1,769
  • 7
  • 19