This is my program that I've written
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#Getting the number of Page
getPageNumber <- function(URL){
parsedDocument = read_html(URL)
Sort1 <- html_nodes(parsedDocument, 'div')
Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pageNumbers al-pageNumbers")]
P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
return(ifelse(length(P) == 0, 0, max(P)))
}
#Getting all articles based off of their DOI
getAllArticles <-function(URL){
parsedDocument = read_html(URL)
Sort1 <- html_nodes(parsedDocument,'div')
Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(Sort2)))
URL3 <- "https://doi.org/10.1093/dnares/"
URL4 <- paste(URL3, ArticleDOInumber, sep = "")
return(URL4)
}
Title <- function(parsedDocument){
Sort1 <- html_nodes(parsedDocument, 'h1')
Title <- gsub("<h1>\\n|\\n</h1>","",Sort1)
return(Title)
}
#main function with input as parameter year
findURL <- function(year_chosen){
if(year_chosen >= 1994){
noYearURL = glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
pagesURl = "&fl_SiteID=5275&startpage="
URL = paste(noYearURL, pagesURl, sep = "")
#URL is working with parameter year_chosen
Page <- getPageNumber(URL)
Page2 <- 0
while(Page < Page2 | Page != Page2){
Page <- Page2
URL3 <- paste(URL, Page-1, sep = "")
Page2 <- getPageNumber(URL3)
}
R_Data <- data.frame()
for(i in 1:Page){ #0:Page-1
URL2 <- getAllArticles(paste(URL, i, sep = ""))
for(j in 1:(length(URL2))){
parsedDocument <- read_html(URL2[j])
print(URL2[j])
R <- data.frame("Title" = Title(parsedDocument),stringsAsFactors = FALSE)
#R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
R_Data <- rbind(R_Data, R)
}
}
paste(URL2)
suppressWarnings(write.csv(R_Data, "DNAresearch.csv", row.names = FALSE, sep = "\t"))
#return(R_Data)
} else {
print("The Year you provide is out of range, this journal only contain articles from 2005 to present")
}
}
findURL(2003)
The output for my code goes as follows:
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
[1] "https://doi.org/10.1093/dnares/10.6.277"
[1] "https://doi.org/10.1093/dnares/10.6.229"
[1] "https://doi.org/10.1093/dnares/10.6.239"
[1] "https://doi.org/10.1093/dnares/10.6.287"
[1] "https://doi.org/10.1093/dnares/10.5.221"
[1] "https://doi.org/10.1093/dnares/10.5.203"
[1] "https://doi.org/10.1093/dnares/10.5.213"
[1] "https://doi.org/10.1093/dnares/10.4.137"
[1] "https://doi.org/10.1093/dnares/10.4.147"
[1] "https://doi.org/10.1093/dnares/10.4.167"
[1] "https://doi.org/10.1093/dnares/10.4.181"
[1] "https://doi.org/10.1093/dnares/10.4.155"
[1] "https://doi.org/10.1093/dnares/10.3.115"
[1] "https://doi.org/10.1093/dnares/10.3.85"
[1] "https://doi.org/10.1093/dnares/10.3.123"
[1] "https://doi.org/10.1093/dnares/10.3.129"
[1] "https://doi.org/10.1093/dnares/10.3.97"
[1] "https://doi.org/10.1093/dnares/10.2.59"
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
I'm trying to scrape a journal with years as a parameter. I've scraped one page, but when I'm supposed to change pages my loop just goes back to the top of the page and loops over the same data. My code should be right and I don't understand why this is happening. Thank you in advance