0

I am using Rselenium to scrape following website: http://plovila.pomorstvo.hr/

Every time I have to enter 'NIB' field, execute and scrape all data. I am using Sys.time() function several time so my code is slow (cca 12 seconds for one NIB). I need to scrape around 200.000 NIB numbers which gives 30 days of scraping.

I am interested if I can open multiple browsers locally or somehow in the cloud and make my scraping script faster.

Is it possible to use parallel computing to overcome this issue?
Do you have any suggestions?

EDIT:
I am adding the code:

library(XML)
library(RCurl)
library(RSelenium)
library(png)
library(imager)
library(RMySQL)
library(htmltab)
library(jsonlite)
library(rvest)

# function for waiting instead Sys.sleep()
waitLoad <- function (xpath_check = "//input[@id = 'ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]",
                       iterations = 5){
  counter <- 0
  chk <- FALSE
  while(!chk & counter <= iterations){
    wait <- tryCatch(
      remDr$findElement(using = "xpath",
                        xpath_check)$getElementText(),
      # remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_Img1']")$clearElement(),
      error = function(e) print(paste0("Trazi dalje"))
    )
    if(wait == "Trazi dalje" ){
      Sys.sleep(1L)
      counter <- sum(counter, 1)
    }else{
      chk <- TRUE
    }
  }
}

# Start Selenium Server
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.0
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome")
remDr$open()

# Simulate browser session and fill out form
remDr$navigate("http://plovila.pomorstvo.hr/")
remDr$findElement(using = "xpath", "//select[@id = 'ctl00_Content_FormContent_uiTipObjektaDropDown']/option[@value = '1']")$clickElement()
remDr$screenshot(display = TRUE)

# Scrap !
df <- list()
Porivni_uredjaji <- list()
Clanovi_posade <- list()
Vlasnici <- list()
Korisnici <- list()
df_2 <- list()
Tereti <- list()
pocetak <- 100000
kraj <- 100003
system.time(
for (i in pocetak:kraj){
  remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$clearElement()
  Sys.sleep(1L)
  remDr$findElement(using = "xpath", 
                    "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$sendKeysToElement(list(as.character(i), 
                                                                                                         key = "enter"))
  waitLoad()
  remDr$screenshot(display = TRUE)
  doc <- htmlParse(remDr$getPageSource()[[1]])
  Sys.sleep(1L)
  Ime <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[1]", fun = xmlValue)
  Oznaka <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]", fun = xmlValue)
  NIB <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[3]", fun = xmlValue)
  Vlasnik <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[4]", fun = xmlValue)
  LK_LI <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[5]", fun = xmlValue)
  br1 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[6]", fun = xmlValue)
  br2 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[7]", fun = xmlValue)
  x <- i-pocetak + 1
  if (length(NIB)==0){
    Pozivni_znak <- NA
    df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Pozivni_znak)
    df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)
  }else{
    remDr$findElement(using = "xpath", "//input[@title = 'Detalji']")$clickElement()
    waitLoad("//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']", 5)
    doc <- htmlParse(remDr$getPageSource()[[1]], encoding = "UTF-8")
    Sys.sleep(1L)
    list_a <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/fieldset/h3[1]", fun = xmlValue)
    if (length(list_a) >= 1){
      
      Namjena <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']/@value")
      json <- paste0("[", '"', Namjena, '"', "]")
      Namjena <- fromJSON(json)
      Namjena <- as.data.frame(Namjena, stringsAsFactors = FALSE)
      colnames(Namjena) <- "Namjena"
      Vrsta_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVrstaPlovilaText']/@value")
      json <- paste0("[", '"', Vrsta_plovila, '"', "]")
      Vrsta_plovila <- fromJSON(json)
      Vrsta_plovila <- as.data.frame(Vrsta_plovila, stringsAsFactors = FALSE)
      colnames(Vrsta_plovila) <- "Vrsta_plovila"
      Model_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiModelPlovilaText']/@value")
      json <- paste0("[", '"', Model_plovila, '"', "]")
      Model_plovila <- fromJSON(json)
      Model_plovila <- as.data.frame(Model_plovila, stringsAsFactors = FALSE)
      colnames(Model_plovila) <- "Model_plovila"
      Duljina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiDuljinaTrupaText']/@value")
      json <- paste0("[", '"', Duljina_trupa, '"', "]")
      Duljina_trupa <- fromJSON(json)
      Duljina_trupa <- as.data.frame(Duljina_trupa, stringsAsFactors = FALSE)
      colnames(Duljina_trupa) <- "Duljina_trupa"
      Sirina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiSirinaText']/@value")
      json <- paste0("[", '"', Sirina_trupa, '"', "]")
      Sirina_trupa <- fromJSON(json)
      Sirina_trupa <- as.data.frame(Sirina_trupa, stringsAsFactors = FALSE)
      colnames(Sirina_trupa) <- "Sirina_trupa"
      Visina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVisinaText']/@value")
      json <- paste0("[", '"', Visina_trupa, '"', "]")
      Visina_trupa <- fromJSON(json)
      Visina_trupa <- as.data.frame(Visina_trupa, stringsAsFactors = FALSE)
      colnames(Visina_trupa) <- "Visina_trupa"
      Gaz <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGazText']/@value")
      json <- paste0("[", '"', Gaz, '"', "]")
      Gaz <- fromJSON(json)
      Gaz <- as.data.frame(Gaz, stringsAsFactors = FALSE)
      colnames(Gaz) <- "Gaz"
      Nosivost <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNosivostText']/@value")
      json <- paste0("[", '"', Nosivost, '"', "]")
      Nosivost <- fromJSON(json)
      Nosivost <- as.data.frame(Nosivost, stringsAsFactors = FALSE)
      colnames(Nosivost) <- "Nosivost"
      GT <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGtText']/@value")
      json <- paste0("[", '"', GT, '"', "]")
      GT <- fromJSON(json)
      GT <- as.data.frame(GT, stringsAsFactors = FALSE)
      colnames(GT) <- "GT"
      Snaga_motora <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiUkupnaSnagaText']/@value")
      json <- paste0("[", '"', Snaga_motora, '"', "]")
      Snaga_motora <- fromJSON(json)
      Snaga_motora <- as.data.frame(Snaga_motora, stringsAsFactors = FALSE)
      colnames(Snaga_motora) <- "Snaga_motora"
      Brodogradiliste <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiBrodogradilisteText']/@value")
      Brodogradiliste <- gsub("\"", "'", Brodogradiliste)
      json <- paste0("[", '"', Brodogradiliste, '"', "]")
      Brodogradiliste <- fromJSON(json)
      Brodogradiliste <- as.data.frame(Brodogradiliste, stringsAsFactors = FALSE)
      Encoding(Brodogradiliste[,c(1)]) <- "UTF-8"
      colnames(Brodogradiliste) <- "Brodogradiliste"
      Godina_gradnje <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGodGradnjeText']/@value")
      json <- paste0("[", '"', Godina_gradnje, '"', "]")
      Godina_gradnje <- fromJSON(json)
      Godina_gradnje <- as.data.frame(Godina_gradnje, stringsAsFactors = FALSE)
      colnames(Godina_gradnje) <- "Godina_gradnje"
      Materijal <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaterijalGradnjeText']/@value")
      json <- paste0("[", '"', Materijal, '"', "]")
      Materijal <- fromJSON(json)
      Materijal <- as.data.frame(Materijal, stringsAsFactors = FALSE)
      colnames(Materijal) <- "Materijal"
      Najveci_broj_osoba <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojOsobaText']/@value")
      json <- paste0("[", '"', Najveci_broj_osoba, '"', "]")
      Najveci_broj_osoba <- fromJSON(json)
      Najveci_broj_osoba <- as.data.frame(Najveci_broj_osoba, stringsAsFactors = FALSE)
      colnames(Najveci_broj_osoba) <- "Najveci_broj_osoba"
      Najveci_broj_putnika <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojPutnikaText']/@value")
      json <- paste0("[", '"', Najveci_broj_putnika, '"', "]")
      Najveci_broj_putnika <- fromJSON(json)
      Najveci_broj_putnika <- as.data.frame(Najveci_broj_putnika, stringsAsFactors = FALSE)
      colnames(Najveci_broj_putnika) <- "Najveci_broj_putnika"
      Najmanji_broj_posade <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMinBrojPosade']/@value")
      json <- paste0("[", '"', Najmanji_broj_posade, '"', "]")
      Najmanji_broj_posade <- fromJSON(json)
      Najmanji_broj_posade <- as.data.frame(Najmanji_broj_posade, stringsAsFactors = FALSE)
      colnames(Najmanji_broj_posade) <- "Najmanji_broj_posade"
      Prethodna_oznaka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaOznakaText']/@value")
      json <- paste0("[", '"', Prethodna_oznaka, '"', "]")
      Prethodna_oznaka <- fromJSON(json)
      Prethodna_oznaka <- as.data.frame(Prethodna_oznaka, stringsAsFactors = FALSE)
      colnames(Prethodna_oznaka) <- "Prethodna_oznaka"
      Prethodna_luka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaLukaUpisaText']/@value")
      Prethodna_luka <- gsub("\"", "'", Prethodna_luka)
      json <- paste0("[", '"', Prethodna_luka, '"', "]")
      Prethodna_luka <- fromJSON(json)
      Prethodna_luka <- as.data.frame(Prethodna_luka, stringsAsFactors = FALSE)
      colnames(Prethodna_luka) <- "Prethodna_luka"
      Prethodna_drĹľava <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaDrzavaUpisaText']/@value")
      json <- paste0("[", '"', Prethodna_drĹľava, '"', "]")
      Prethodna_drĹľava <- fromJSON(json)
      Prethodna_drĹľava <- as.data.frame(Prethodna_drĹľava, stringsAsFactors = FALSE)
      colnames(Prethodna_drĹľava) <- "Prethodna_drĹľava"
      
      df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Namjena, Vrsta_plovila, 
                       Model_plovila, Duljina_trupa, Sirina_trupa, Visina_trupa, Gaz, Nosivost, GT,
                       Snaga_motora, Brodogradiliste, Godina_gradnje, Materijal, Najveci_broj_osoba,
                       Najveci_broj_putnika, Najmanji_broj_posade, Prethodna_oznaka,
                       Prethodna_luka, Prethodna_drĹľava)
      df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)
      
      df_2 <- readHTMLTable(doc)
      Sys.sleep(2L)
      
      Porivni_uredjaji[[x]] <- tryCatch(as.data.frame(cbind(df_2[[2]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
      Clanovi_posade[[x]] <- tryCatch(as.data.frame(cbind(df_2[[3]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
      Vlasnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[4]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
      Korisnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[5]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
      Tereti[[x]] <- cbind(remDr$findElement(using = "xpath", "//*/span[@id='ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiTeretiLabel']")$getElementText(), NIB)
    }}
}
)

# manipulate data after scraping
for (i in 1:length(df)){
  if (length(df[[i]]) < 13){
    df[[i]] <- matrix(data = rep(NA, 26), nrow = 1, ncol = 26)
    df[[i]] <- as.data.frame(df[[i]])
    colnames(df[[i]]) <- c("Ime", "Oznaka", "NIB", "Vlasnik", "LK_LI", "br1", "br2","Namjena",
                           "Vrsta_plovila", "Model_plovila", "Duljina_trupa", "Sirina_trupa", "Visina_trupa",
                           "Gaz", "Nosivost", "GT", "Snaga_motora", "Brodogradiliste", "Godina_gradnje", 
                           "Materijal", "Najveci_broj_osoba", "Najveci_broj_putnika", "Najmanji_broj_posade", 
                           "Prethodna_oznaka", "Prethodna_luka", "Prethodna_drĹľava")
  }
}

df_final <- do.call(rbind, df)
df_final_1 <- df_final[!is.na(df_final$NIB), ]

EDIT 2 : I have a problem with above code you posted. If I run:

(cl <- (detectCores() - 1) %>%  makeCluster) %>% registerDoParallel
# open a remoteDriver for each node on the cluster
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3
clusterEvalQ(cl, {
  library(RSelenium)
  remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome")
  remDr$open()
})
myTitles <- c()
ws <- foreach(x = 1:length(urls), 
              .packages = c("rvest", "magrittr", "RSelenium", "jsonlite", "htmltab", "XML", "RCurl"))  %dopar%  {
  remDr$navigate(urls[x])
  Sys.sleep(3L)
  remDr$getTitle()[[1]]
              }

it returns an error

Error in { : task 1 failed - "   Summary: UnknownError
     Detail: An unknown server-side error occurred while processing the command.
     Further Details: run errorDetails method"
Dave2e
  • 22,192
  • 18
  • 42
  • 50
Mislav
  • 1,533
  • 16
  • 37
  • Simplest solution would to customize or your break your input into batches and run 8 parallel scripts in different terminal. This will mean least code changes. And do remember these are UI based browsers so you going beyond a specific number will start to deteriorate your system performance – Tarun Lalwani Sep 21 '17 at 09:22
  • @tarun Lalwani What do you mean by break input into baches? I have used oparallel and foreach package to parallel y compute for loop. But i am not sure how to do that with Rselenium. Or is it even possible. – Mislav Sep 21 '17 at 09:54
  • Post your code, can't comment like this – Tarun Lalwani Sep 21 '17 at 09:55
  • I have added the code. Lots of things happening, but important part i the loop. I would like to use this loops paralley somehow. For example I would like to run lopp from NIB number 100 000 to 120 000 on one session, fomr 120 001 to 140 000 on another etc. – Mislav Sep 21 '17 at 10:18
  • 1
    See if this helps you https://stackoverflow.com/questions/38950958/run-rselenium-in-parallel – Tarun Lalwani Sep 21 '17 at 10:28
  • It helps. But I don't know how to "tell" every every node in the cluster (every independent remote driver) to scrap different range of NIB numbers (that are inputs in form). – Mislav Sep 21 '17 at 12:12
  • S I have tried the code you linked. How to achieve this using docker. I have tried above code, Just inserting remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome"in parentheis of remDr, but it retunrs an error An unknown server-side error occurred while processing the command – Mislav Sep 21 '17 at 13:26

1 Answers1

0

Maybe an issue with chrome:3.5.0 docker image. The following runs for me on win 10 with docker toolbox:

library(RSelenium)
library(rvest)
library(magrittr)
library(foreach)
library(doParallel)

# using  docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3
# in windows
URLsPar <- c("https://stackoverflow.com/", "https://github.com/", 
             "http://www.bbc.com/", "http://www.google.com", 
             "https://www.r-project.org/", "https://cran.r-project.org",
             "https://twitter.com/", "https://www.facebook.com/")

appHTML <- c()

(cl <- (detectCores() - 1) %>%  makeCluster) %>% registerDoParallel
# open a remoteDriver for each node on the cluster
clusterEvalQ(cl, {
  library(RSelenium)
  remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, 
                        browserName = "chrome")
  remDr$open()
})
ws <- foreach(x = 1:length(URLsPar), 
              .packages = c("rvest", "magrittr", "RSelenium"))  %dopar%  {
                print(URLsPar[x])
                remDr$navigate(URLsPar[x])
                remDr$getTitle()[[1]]
              }
> ws
[[1]]
[1] "Stack Overflow - Where Developers Learn, Share, & Build Careers"

[[2]]
[1] "The world's leading software development platform · GitHub"

[[3]]
[1] "BBC - Homepage"

[[4]]
[1] "Google"

[[5]]
[1] "R: The R Project for Statistical Computing"

[[6]]
[1] "The Comprehensive R Archive Network"

[[7]]
[1] "Twitter. It's what's happening."

[[8]]
[1] "Facebook - Log In or Sign Up"     


# close browser on each node
clusterEvalQ(cl, {
  remDr$close()
})

stopImplicitCluster()
jdharrison
  • 30,085
  • 4
  • 77
  • 89
  • I put 7 same urls in `URLsPar` vector. Maybe that's the problem. also, I have driver 3.5.3., not 3.5.0. If I run the code in usual way, it woks perfectly (remoteDriver without paralell) – Mislav Sep 21 '17 at 20:51
  • I changed name `urls` instad of variable name `URLsPar ` in first place. Now I have changed to `URLsPar ` and it gives the output. Why is variable name important?? It gives following: [1]"Stack Overflow - Where Developers Learn, Share, & Build Careers" and for [2] and [3] `"chrome not reachable\n (Session info: chrome=61.0.3163.91)\n (Driver info: chromedriver=2.32.498513 (2c63aa53b2c658de596ed550eb5267ec5967b351),platform=Linux 4.4.86-boot2docker x86_64)"` So the question now is why chrome is not reachable. – Mislav Sep 21 '17 at 21:16
  • 1
    I have changed example to 8 urls. I don't see the issue you are having. It is possibly related to https://github.com/SeleniumHQ/docker-selenium/issues/547 and https://github.com/SeleniumHQ/docker-selenium/issues/554 – jdharrison Sep 21 '17 at 22:41
  • Your comment have helped. It works with chrome 3.4.0. But I have the same problem as before. Inside every node, scraping program should scrap same page but with different input scale for form. For example, if I have two cores (two active sessions), first session should scrap from input 1 to 100 and second core from 100 to 200. The goal is to decrease the time of scraping. – Mislav Sep 22 '17 at 12:54
  • The session scrap urls as they become available. In the example given above the work is distributed between the available cores. – jdharrison Sep 22 '17 at 13:52
  • Do you have any suggestion how to approach to my problem. Maybe I should open new question. Maybe I need to use different cluster function (clusterSpplit?) – Mislav Sep 22 '17 at 18:29