```r
library(rvest)
library(RCurl)
library(XML)
library(stringr)

# Get the number of result pages for a given search URL
getPageNumber <- function(URL){
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument, 'div')
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pagination al-pagination")]
  P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
  return(ifelse(length(P) == 0, 0, max(P)))
}

# Get all articles on a results page based off of their DOIs
getAllArticles <- function(URL){
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument, 'div')
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
  ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(Sort2)))
  URL3 <- "https://doi.org/10.1093/dnares/"
  URL4 <- paste(URL3, ArticleDOInumber, sep = "")
  return(URL4)
}

# Extract the article title from a parsed article page
Title <- function(parsedDocument){
  Sort1 <- html_nodes(parsedDocument, 'h4')
  Title <- gsub("<a>\\n|\\n</a>", "", Sort1)
  return(Title)
}

# Main function with the publication year as parameter
findURL <- function(year_chosen){
  if(year_chosen >= 1994){
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    Page <- getPageNumber(URL)

    # When the pagination shows 5 (its maximum), re-query later pages
    # until the reported page count stops changing
    if(Page == 5){
      Page2 <- 0
      while(Page < Page2 | Page != Page2){
        Page <- Page2
        URL3 <- paste(URL, Page - 1, sep = "")
        Page2 <- getPageNumber(URL3)
      }
    }

    R_Data <- data.frame()
    for(i in 0:ifelse((Page - 1) > 0, (Page - 1), 0)){
      URL2 <- getAllArticles(paste(URL, i, sep = ""))
      for(j in 1:length(URL2)){
        parsedDocument <- read_html(URL2[j])
        print(URL2[j])
        R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
        R_Data <- rbind(R_Data, R)
      }
    }
    write.csv(R_Data, "Group4.csv", row.names = FALSE, sep = "\t")
  } else {
    print("The year you provided is out of range; this journal only contains articles from 2005 to present")
  }
}

findURL(2000)
```

So I am trying to scrape a website for a given year, and inside my main function I loop through the different result pages, extracting just the title of each article.

I keep getting this error: `Error in open.connection(x, "rb") : HTTP error 404`

Some years have only 3 pages, so I can see why that might cause an error, but most years have at least 5 pages of articles.
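
Here is a minimal sketch of the kind of guard I could put around `read_html()` so that a page that 404s is skipped instead of stopping the whole run (`safe_read_html` is just an illustrative name, it is not part of my code above):

```r
library(rvest)

# Illustrative helper (not part of the code above): return NULL instead of
# raising an error when a URL cannot be opened, e.g. on an HTTP 404.
safe_read_html <- function(url){
  tryCatch(
    read_html(url),
    error = function(e){
      message("Skipping ", url, ": ", conditionMessage(e))
      NULL
    }
  )
}

doc <- safe_read_html("https://academic.oup.com/dnaresearch/search-results?page=999")
if(!is.null(doc)){
  # parse the page as usual
}
```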

After scraping the journal by year, I want to write the scraped titles out to a csv file.

Thank you in advance for the help!


1 Answer


Haven't fully checked, i.e. not tested with lots of different years, but as page = 0 yields no results, do you perhaps want:

for(i in 1:Page)

instead of

for(i in 0:ifelse((Page-1) > 0, (Page-1), 0))

This possibly has implications for logic later in findURL.
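
In context, the suggested change to the loop in findURL would look roughly like this (a sketch only; it assumes the site's `&page=` numbering starts at 1, which is what the lack of results for page 0 suggests):

```r
# Sketch of the article loop in findURL with the suggested 1:Page range;
# assumes the "&page=" parameter on the site is 1-based.
R_Data <- data.frame()
for(i in 1:Page){
  URL2 <- getAllArticles(paste(URL, i, sep = ""))
  for(j in 1:length(URL2)){
    parsedDocument <- read_html(URL2[j])
    print(URL2[j])
    R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
    R_Data <- rbind(R_Data, R)
  }
}
```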

Also, your function Title is currently returning html. Don't know if that was intentional.

With my suggested change your code now produces a populated csv, but with a lot of duplicated info, suggesting you may want to revisit your Title function.
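
If you want plain text back from Title rather than node markup, one option is to let `html_text()` do the stripping instead of `gsub()`. A minimal sketch (the `"h1"` selector here is an assumption; check the article page source for the element that actually holds the title):

```r
# Sketch: return the article title as plain text.
# The "h1" selector is an assumption -- verify it against the page source.
Title <- function(parsedDocument){
  node <- html_node(parsedDocument, "h1")
  html_text(node, trim = TRUE)
}
```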

  • Thank you for the suggestion, that does do the same thing. I posted another question about my output, if you could take a look: https://stackoverflow.com/questions/66757114/r-function-is-looping-over-the-same-data-in-webscraper – bkush98 Mar 23 '21 at 04:06
  • It doesn't produce a 404 for me like your original code did. Are you saying it does for you? – QHarr Mar 23 '21 at 04:06
  • I restarted RStudio and the error went away, thank you for the help! – bkush98 Mar 23 '21 at 04:09
  • QHarr if you can I would appreciate assistance on this question I posted https://stackoverflow.com/questions/66757114/r-function-is-looping-over-the-same-data-in-webscraper – bkush98 Mar 23 '21 at 04:12
  • What are you hoping to return from Title function? – QHarr Mar 23 '21 at 04:13
  • The title function is supposed to scrape the title of a journal entry. I believe my Title function is currently returning the title along with some html. That has to be fixed – bkush98 Mar 23 '21 at 04:16
  • No it is not. I am just testing an adaptation to your code. – QHarr Mar 23 '21 at 04:16