Trying to webscrape journals from website Oxford articles.
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#Getting the number of Page
getPageNumber <- function(URL) {
print(URL)
parsedDocument <- read_html(URL)
results_per_page <- length(parsedDocument %>% html_nodes(".sr-list"))
total_results <- parsedDocument %>%
toString() %>%
str_match(., 'num_results":"(.*?)"') %>%
.[,2] %>%
as.integer()
pageNumber <- tryCatch(ceiling(total_results / results_per_page), error = function(e) {1})
return(pageNumber)
}
#Getting all articles based off of their DOI
getAllArticles <-function(URL){
parsedDocument = read_html(URL)
findLocationDiv <- html_nodes(parsedDocument,'div')
foundClass <- findLocationDiv[which(html_attr(findLocationDiv, "class") == "al-citation-list")]
ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(foundClass)))
DOImain <- "https://doi.org/10.1093/dnares/"
fullDOI <- paste(DOImain, ArticleDOInumber, sep = "")
return(fullDOI)
}
#Get Title of journals
Title <- function(parsedDocument) {
Title <- parsedDocument %>%
html_node(".article-title-main") %>%
html_text() %>%
gsub("\\r\\n\\s+", "", .) %>%
trimws(.)
Title <- ifelse(is.na(Title), "No", Title)
return(Title)
}
#Getting Authors of Journals
Authors <- function(parsedDocument){
Authors <- parsedDocument %>%
html_node("a.linked-name") %>%
html_text() %>%
return(Authors)
}
#main function with input as parameter year
findURL <- function(year_chosen){
if (year_chosen >= 1994) {
noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
pagesURl <- "&fl_SiteID=5275&page="
URL <- paste(noYearURL, pagesURl, sep = "")
# URL is working with parameter year_chosen
firstPage <- getPageNumber(URL)
if (firstPage == 5) {
nextPage <- 0
while (firstPage < nextPage | firstPage != nextPage) {
firstPage <- nextPage
URLwithPageNum <- paste(URL, firstPage-1, sep = "")
nextPage <- getPageNumber(URLwithPageNum)
}
}
DNAresearch <- data.frame()
for (i in 1:firstPage) {
URLallArticles <- getAllArticles(paste(URL, i, sep = ""))
print(URLallArticles)
for (j in 1:(length(URLallArticles))) {
parsedDocument <- read_html(URLallArticles[j])
paste(parsedDocument)
#need work on getiing Full Text
#allData <- data.frame("Full text"=FullText(parsedDocument),stringsAsFactors = FALSE)
#scraped items that are good
#"Authors" = Authors(parsedDocument),"Author Affiliations" = AuthorAffil(parsedDocument),"Corresponding Authors" = CorrespondingAuthors(parsedDocument),"CoAuthor Email" = CoAuthorEmail(parsedDocument),"Publish Date" = PublicationDate(parsedDocument),"Abstract" = Abstract(parsedDocument),"Keywords" = Keywords(parsedDocument)
allData <- data.frame("Title" = Title(parsedDocument),stringsAsFactors = FALSE)
DNAresearch <- rbind(DNAresearch, allData)
}
}
write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
} else {
print("The Year you provide is out of range, this journal only contain articles from 1994 to present")
}
}
##################### Main function test
findURL(2015)
Code is showing error 404.
I believe it is a problem with getAllArticles, the last output has a bad url. I've tried using a try catch to stop the error from outputting but haven't been successful. It may also be my logic.
the output for the year 2015 is:
[1] "https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F2015%20TO%2012%2F31%2F2015&fl_SiteID=5275&page="
[1] "https://doi.org/10.1093/dnares/dsv028"
[2] "https://doi.org/10.1093/dnares/dsv027"
[3] "https://doi.org/10.1093/dnares/dsv029"
[4] "https://doi.org/10.1093/dnares/dsv030"
[1] "https://doi.org/10.1093/dnares/dsv022"
[1] "https://doi.org/10.1093/dnares/dsv024"
[2] "https://doi.org/10.1093/dnares/dsv025"
[3] "https://doi.org/10.1093/dnares/dsv026"
[4] "https://doi.org/10.1093/dnares/dsv021"
[5] "https://doi.org/10.1093/dnares/dsv023"
[1] "https://doi.org/10.1093/dnares/dsv020"
[2] "https://doi.org/10.1093/dnares/dsv019"
[3] "https://doi.org/10.1093/dnares/dsv017"
[1] "https://doi.org/10.1093/dnares/dsv018"
[2] "https://doi.org/10.1093/dnares/dsv015"
[1] "https://doi.org/10.1093/dnares/dsv013"
[2] "https://doi.org/10.1093/dnares/dsv016"
[3] "https://doi.org/10.1093/dnares/dsv014"
[1] "https://doi.org/10.1093/dnares/dsv012"
[2] "https://doi.org/10.1093/dnares/dsv010"
[1] "https://doi.org/10.1093/dnares/dsv011"
[2] "https://doi.org/10.1093/dnares/dsv009"
[3] "https://doi.org/10.1093/dnares/dsv005"
[1] "https://doi.org/10.1093/dnares/dsv008"
[2] "https://doi.org/10.1093/dnares/dsv007"
[3] "https://doi.org/10.1093/dnares/dsv004"
[1] "https://doi.org/10.1093/dnares/dsv006"
[2] "https://doi.org/10.1093/dnares/dsv002"
[3] "https://doi.org/10.1093/dnares/dsv003"
[4] "https://doi.org/10.1093/dnares/dsv001"
[1] "https://doi.org/10.1093/dnares/dsu047"
[2] "https://doi.org/10.1093/dnares/dsu045"
[3] "https://doi.org/10.1093/dnares/dsu046"
[1] "https://doi.org/10.1093/dnares/dsu044"
[2] "https://doi.org/10.1093/dnares/dsu041"
[3] "https://doi.org/10.1093/dnares/dsu038"
[4] "https://doi.org/10.1093/dnares/dsu040"
[5] "https://doi.org/10.1093/dnares/dsu042"
[6] "https://doi.org/10.1093/dnares/dsu043"
[1] "https://doi.org/10.1093/dnares/"
Error in open.connection(x, "rb") : HTTP error 404.
In addition: Warning message:
In for (i in seq_along(specs)) { :
Error in open.connection(x, "rb") : HTTP error 404.
a year like 1994 for example runs without an error, but years like 2015 and 2016 has this error.