stop web scraper from showing 404 error in R

Question

Trying to webscrape journals from website Oxford articles.

        library(rvest)
        library(RCurl)
        library(XML)
        library(stringr)
    
    
    #Getting the number of Page
    getPageNumber <- function(URL) {
      print(URL)
      parsedDocument <- read_html(URL)
      results_per_page <- length(parsedDocument %>% html_nodes(".sr-list"))
      total_results <- parsedDocument %>%
        toString() %>%
        str_match(., 'num_results":"(.*?)"') %>% 
        .[,2] %>%
        as.integer()
      pageNumber <- tryCatch(ceiling(total_results / results_per_page), error = function(e) {1})
      return(pageNumber)
    }
    
    
    #Getting all articles based off of their DOI
    getAllArticles <-function(URL){
      parsedDocument = read_html(URL)
      findLocationDiv <- html_nodes(parsedDocument,'div')
      foundClass <-  findLocationDiv[which(html_attr(findLocationDiv, "class") == "al-citation-list")]
      ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(foundClass)))
      DOImain <- "https://doi.org/10.1093/dnares/"
      fullDOI <- paste(DOImain, ArticleDOInumber, sep = "")
      return(fullDOI)
    }
    
    
    #Get Title of journals
    Title <- function(parsedDocument) {
      Title <- parsedDocument %>%
        html_node(".article-title-main") %>%
        html_text() %>%
        gsub("\\r\\n\\s+", "", .) %>%
        trimws(.)
      Title <- ifelse(is.na(Title), "No", Title)
      return(Title)
    }
    
    #Getting Authors of Journals
    Authors <- function(parsedDocument){
      Authors <- parsedDocument %>%
        html_node("a.linked-name") %>%
        html_text() %>%
      return(Authors)
    }
#main function with input as parameter year
findURL <- function(year_chosen){
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    firstPage <- getPageNumber(URL)
    if (firstPage == 5) {
      nextPage <- 0
      while (firstPage < nextPage | firstPage != nextPage) {
        firstPage <- nextPage
        URLwithPageNum <- paste(URL, firstPage-1, sep = "")
        nextPage <- getPageNumber(URLwithPageNum)
      }
    }
  DNAresearch <- data.frame()
    for (i in 1:firstPage) {
      URLallArticles <- getAllArticles(paste(URL, i, sep = ""))
      print(URLallArticles)
      for (j in 1:(length(URLallArticles))) {
        parsedDocument <- read_html(URLallArticles[j])
        paste(parsedDocument)
        #need work on getiing Full Text
        #allData <- data.frame("Full text"=FullText(parsedDocument),stringsAsFactors = FALSE)
        
        #scraped items that are good
        #"Authors" = Authors(parsedDocument),"Author Affiliations" = AuthorAffil(parsedDocument),"Corresponding  Authors" = CorrespondingAuthors(parsedDocument),"CoAuthor Email" = CoAuthorEmail(parsedDocument),"Publish Date" = PublicationDate(parsedDocument),"Abstract" = Abstract(parsedDocument),"Keywords" = Keywords(parsedDocument)
        
        allData <- data.frame("Title" = Title(parsedDocument),stringsAsFactors = FALSE)
        DNAresearch <- rbind(DNAresearch, allData)
      }
    }
    write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
  } else {
    print("The Year you provide is out of range, this journal only contain articles from 1994 to present")
  }
}

##################### Main function test
findURL(2015)

Code is showing error 404.

I believe it is a problem with getAllArticles, the last output has a bad url. I've tried using a try catch to stop the error from outputting but haven't been successful. It may also be my logic.

the output for the year 2015 is:

[1] "https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F2015%20TO%2012%2F31%2F2015&fl_SiteID=5275&page="
[1] "https://doi.org/10.1093/dnares/dsv028"
[2] "https://doi.org/10.1093/dnares/dsv027"
[3] "https://doi.org/10.1093/dnares/dsv029"
[4] "https://doi.org/10.1093/dnares/dsv030"
[1] "https://doi.org/10.1093/dnares/dsv022"
[1] "https://doi.org/10.1093/dnares/dsv024"
[2] "https://doi.org/10.1093/dnares/dsv025"
[3] "https://doi.org/10.1093/dnares/dsv026"
[4] "https://doi.org/10.1093/dnares/dsv021"
[5] "https://doi.org/10.1093/dnares/dsv023"
[1] "https://doi.org/10.1093/dnares/dsv020"
[2] "https://doi.org/10.1093/dnares/dsv019"
[3] "https://doi.org/10.1093/dnares/dsv017"
[1] "https://doi.org/10.1093/dnares/dsv018"
[2] "https://doi.org/10.1093/dnares/dsv015"
[1] "https://doi.org/10.1093/dnares/dsv013"
[2] "https://doi.org/10.1093/dnares/dsv016"
[3] "https://doi.org/10.1093/dnares/dsv014"
[1] "https://doi.org/10.1093/dnares/dsv012"
[2] "https://doi.org/10.1093/dnares/dsv010"
[1] "https://doi.org/10.1093/dnares/dsv011"
[2] "https://doi.org/10.1093/dnares/dsv009"
[3] "https://doi.org/10.1093/dnares/dsv005"
[1] "https://doi.org/10.1093/dnares/dsv008"
[2] "https://doi.org/10.1093/dnares/dsv007"
[3] "https://doi.org/10.1093/dnares/dsv004"
[1] "https://doi.org/10.1093/dnares/dsv006"
[2] "https://doi.org/10.1093/dnares/dsv002"
[3] "https://doi.org/10.1093/dnares/dsv003"
[4] "https://doi.org/10.1093/dnares/dsv001"
[1] "https://doi.org/10.1093/dnares/dsu047"
[2] "https://doi.org/10.1093/dnares/dsu045"
[3] "https://doi.org/10.1093/dnares/dsu046"
[1] "https://doi.org/10.1093/dnares/dsu044"
[2] "https://doi.org/10.1093/dnares/dsu041"
[3] "https://doi.org/10.1093/dnares/dsu038"
[4] "https://doi.org/10.1093/dnares/dsu040"
[5] "https://doi.org/10.1093/dnares/dsu042"
[6] "https://doi.org/10.1093/dnares/dsu043"
[1] "https://doi.org/10.1093/dnares/"
Error in open.connection(x, "rb") : HTTP error 404.
In addition: Warning message:
In for (i in seq_along(specs)) { :
 
 Error in open.connection(x, "rb") : HTTP error 404.

a year like 1994 for example runs without an error, but years like 2015 and 2016 has this error.

Maybe this prior post will help: https://stackoverflow.com/questions/19111956/suppress-error-message-in-r — SteveM, Mar 27 '21 at 19:14
So I've tried this try(is.url(fullDOI)) to validate if the url is valid, but it returns a boolean. I need to still use the url for later scraping — bkush98, Mar 27 '21 at 19:20

score 2 · Accepted Answer · answered Mar 27 '21 at 19:46

You can check for valid URL and add exception -

        if (url.exists(URLallArticles[j])){
              
    parsedDocument <- read_html(URLallArticles[j])
    paste(parsedDocument)
    #need work on getiing Full Text
    #allData <- data.frame("Full text"=FullText(parsedDocument),stringsAsFactors = FALSE)
    
    #scraped items that are good
    #"Authors" = Authors(parsedDocument),"Author Affiliations" = AuthorAffil(parsedDocument),"Corresponding  Authors" = CorrespondingAuthors(parsedDocument),"CoAuthor Email" = CoAuthorEmail(parsedDocument),"Publish Date" = PublicationDate(parsedDocument),"Abstract" = Abstract(parsedDocument),"Keywords" = Keywords(parsedDocument)
    
    allData <- data.frame("Title" = Title(parsedDocument),stringsAsFactors = FALSE)
    DNAresearch <- rbind(DNAresearch, allData)
    }

stop web scraper from showing 404 error in R

1 Answers1