I want to pull a few basic information from Google Scholar such as Title_name, Author_Names, Year_Publication, Title_URL, and cited_by across all Google Scholar pages but as a test wanted to extract information from 2 pages.
The purpose of this webscraping is to generate a list of studies for literature review leading to a meta-analysis study.
I have been trying to edit the following code but no luck:
# Install and load the necessary packages
#install.packages("RSelenium")
##install.packages("rvest")
#install.packages("stringr")
library(RSelenium)
library(rvest)
library(stringr)
# Start a Selenium server and open Chrome browser
rD <- rsDriver(browser = "chrome", chromever = "latest", geckover = "latest",
IEDriverVersion = NULL, verbose = FALSE, check = TRUE,
extraCapabilities = NULL, verboseInfo = FALSE, checkInterval = 1000,
timeout = 20000, whitelist = NULL, checkPath = TRUE, port = 4445L,
phantomver = NULL,
chromepath = NULL, firefoxpath = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe")
remDr <- rD$client
# Define your search terms
search_terms <- "((COVID OR COVID-19))"
# Function to extract data from a page
extract_data <- function(page_source) {
page <- read_html(page_source)
titles <- page %>% html_nodes(".gs_rt") %>% html_text()
authors <- page %>% html_nodes(".gs_a") %>% html_text()
years <- str_extract(authors, "\\d{4}")
authors <- str_replace(authors, "\\d{4}", "")
urls <- page %>% html_nodes(".gs_rt a") %>% html_attr("href")
cited_by <- page %>% html_nodes(".gs_fl a:nth-child(3)") %>% html_text()
cited_by <- as.integer(str_extract(cited_by, "\\d+"))
data.frame(Title_name = titles, Author_Names = authors, Year_Publication = years, Title_URL = urls, cited_by = cited_by)
}
# Function to search for a specific term on Google Scholar
search_google_scholar <- function(term) {
tryCatch({
remDr$navigate("https://scholar.google.com/")
search_box <- remDr$findElement("css", "#gs_hdr_tsi")
search_box$sendKeysToElement(list(term, key="enter"))
Sys.sleep(5) # Allow time for page to load
pages <- 2 # Number of pages to scrape
results <- data.frame()
for (page in 1:pages) {
page_source <- remDr$getPageSource()[[1]]
page_data <- extract_data(page_source)
results <- rbind(results, page_data)
next_button <- remDr$findElement("css", "#gs_n a")
if (length(next_button) == 0) {
break
} else {
next_button$clickElement()
Sys.sleep(5) # Allow time for page to load
}
}
return(results)
}, error = function(e) {
message("An error occurred: ", conditionMessage(e))
NULL
})
}
# Execute the search and scrape the data
search_results <- search_google_scholar(search_terms)
# Close the browser
remDr$close()
rD$server$stop()
Can anyone help me modify the above code or suggest a simple workaround?