0

I am using the crrri package to scrape some dynamic sites. However, when I run it across multiple pages in a for loop, there is a delay (I presume because of the promises), which means that sometimes, when I read the temporary file html_file, it has not been written yet, and as a result I retrieve the previous web page instead.

Is there a way to ensure the file has been written before progressing to the next line within the for loop?

# Temporary file that dump_DOM() writes each page's HTML into and that the
# loop reads back with read_html().
html_file <- tempfile(fileext = ".html")

# Chrome comes from crrri; presumably this starts a headless Chrome process
# (confirm against the crrri documentation).
chrome <- Chrome$new()
# NOTE(review): the loop below reassigns `client` before using it, so this
# initial connection is not the one the loop scrapes with.
client <- chrome$connect()

#' Load a page in Chrome and write its rendered DOM to an HTML file.
#'
#' The steps run as a chain of promises: enable the Network and Page domains,
#' disable the cache, navigate to `url`, wait for the page's load event, then
#' evaluate `document.documentElement.outerHTML` and write the result to
#' `path`. The client is disconnected whether or not the chain succeeds.
#'
#' @param client A connected client exposing `Network`, `Page`, `Runtime` and
#'   `disconnect()`.
#' @param url Address of the page to load.
#' @param path File that receives the HTML. Defaults to the global `html_file`
#'   so existing calls keep working.
#' @return The promise at the end of the chain. The work is asynchronous, so
#'   `path` has NOT been written when this function returns; callers must wait
#'   for the promise to settle before reading the file. Errors are printed
#'   with `cat()` by the final handler rather than propagated.
dump_DOM <- function(client, url, path = html_file) {
  # Fix the destination now, so a later change to the global cannot redirect
  # a write that is already in flight.
  force(path)
  Network <- client$Network
  Page <- client$Page
  Runtime <- client$Runtime
  Network$enable() %...>%
    { Page$enable() } %...>%
    { Network$setCacheDisabled(cacheDisabled = TRUE) } %...>%
    { Page$navigate(url = url) } %...>%
    { Page$loadEventFired() } %...>% {
      Runtime$evaluate(
        expression = 'document.documentElement.outerHTML'
      )
    } %...>% {
      # `.` is the result of the evaluate step; the HTML is in .$result$value.
      writeLines(c(.$result$value, "\n"), con = path)
    } %>%
    finally(
      ~ client$disconnect()
    ) %...!% {
      cat("Error:", .$message, "\n")
    }
}

url_list <- c(
  "https://facebook.com",
  "https://twitter.com",
  "https://google.com"
)

# dump_DOM() returns a promise, so the HTML file is written asynchronously.
# hold() (from crrri) blocks the R session until that promise settles, which
# ensures html_file holds the current page before read_html() runs.
for (i in seq_along(url_list)) {
  client <- chrome$connect()
  hold(client %...>% dump_DOM(url = url_list[i]))
  # NOTE: hlink is overwritten on every iteration; collect the results in a
  # list if every page is needed after the loop.
  hlink <- read_html(html_file)
}
timnus
  • 197
  • 10

0 Answers0