I am using the `crrri` package to web-scrape some dynamic sites. However, when I run it across multiple pages in a for loop, there is a delay (I presume because of the promises), which means that sometimes, when I access the `html_file` file, it has not been written yet, and as a result the previous web page is retrieved instead.
Is there a way to ensure the file has been written before progressing to the next line within the for loop?
# Path of a temporary ".html" file; dump_DOM() writes each page's HTML here
# and the loop at the end of the script reads it back.
html_file <- tempfile(fileext = ".html")
# Chrome handle from crrri (presumably starts a headless browser; confirm
# against the crrri documentation).
chrome <- Chrome$new()
# NOTE(review): the loop further down reassigns `client` with a fresh
# chrome$connect() on every iteration, so this initial connection looks
# unused -- confirm before removing it.
client <- chrome$connect()
# Load `url` in the browser tab behind `client` and write the rendered DOM
# to the script-level `html_file`.
#
# Arguments:
#   client  connected client object exposing `Network`, `Page`, `Runtime`
#           and `disconnect()`.
#   url     address of the page to navigate to.
#
# Value:
#   The promise at the end of the chain. Because the work is asynchronous,
#   `html_file` is only up to date once that promise has settled.
dump_DOM <- function(client, url) {
  network_domain <- client$Network
  page_domain <- client$Page
  runtime_domain <- client$Runtime

  # Enable the Network and Page domains, then switch the cache off.
  domains_ready <- network_domain$enable() %...>% {
    page_domain$enable()
  } %...>% {
    network_domain$setCacheDisabled(cacheDisabled = TRUE)
  }

  # Start the navigation and wait for the page's load event.
  page_loaded <- domains_ready %...>% {
    page_domain$navigate(url = url)
  } %...>% {
    page_domain$loadEventFired()
  }

  # Serialise the whole document in the page and save it to disk.
  dom_saved <- page_loaded %...>% {
    runtime_domain$evaluate(
      expression = 'document.documentElement.outerHTML'
    )
  } %...>% {
    writeLines(c(.$result$value, "\n"), con = html_file)
  }

  # Disconnect whether the chain succeeded or failed; only afterwards report
  # any error on the console (the error is printed, not re-raised).
  finally(dom_saved, ~ client$disconnect()) %...!% {
    cat("Error:", .$message, "\n")
  }
}
# Pages to scrape, processed one at a time.
url_list <- c("https://facebook.com", "https://twitter.com", "https://google.com")

for (i in seq_along(url_list)) {
  # Remove the previous page's dump so that a failed navigation can never be
  # mistaken for fresh content.
  unlink(html_file)

  client <- chrome$connect()

  # dump_DOM() only *schedules* the work and returns a promise. hold() blocks
  # until that promise has settled, so the file has been written (or the
  # error reported) before the next statement runs. It also guarantees that
  # `url_list[i]`, which is evaluated lazily inside the promise pipe, is read
  # while `i` still refers to the current page.
  hold(client %...>% dump_DOM(url = url_list[i]))

  # dump_DOM() prints errors instead of raising them, so check explicitly
  # that this iteration produced a file.
  if (!file.exists(html_file)) {
    warning("No HTML was written for ", url_list[i], call. = FALSE)
    next
  }

  hlink <- read_html(html_file)
}