I tweaked the scrapy_stocks function to accommodate the Yahoo page update. I haven't thoroughly vetted this solution, but it seems to work well in all my trials thus far. Please be aware of two things:
- I don't think this would work if you have Yahoo Premium. I don't have it, so I can't test it. But if you do, it shouldn't be too difficult to update.
- I don't have a lot of experience with rvest, and because of the way the page is structured, I had to write the function so that if a single value in a row is missing, the entire row is returned as missing (NA).
Try this:
#' Scrape Yahoo Finance financial statements for one or more tickers
#'
#' For every symbol in `stock`, requests the "financials", "balance-sheet"
#' and "cash-flow" pages under https://finance.yahoo.com/quote/, collects the
#' `<span>` elements of the statement table and turns them into data frames.
#' When all three statements of a symbol were parsed, a list with elements
#' `IS`, `BS` and `CF` is assigned to `<symbol>.f` in the environment the
#' function was called from. Errors for one symbol are reported with
#' `message()` and do not stop the remaining symbols.
#'
#' The packages rvest and xml2 are installed on demand (if missing) and
#' attached, as the function has always done.
#'
#' @param stock Vector of ticker symbols. An empty vector is a no-op.
#' @return `NULL`, invisibly. The function is called for its side effects.
scrapy_stocks2 <- function(stock) {
  # Nothing to scrape: return before touching packages or the network.
  # (`1:length(stock)` would have iterated over the indices 1 and 0 here.)
  if (length(stock) == 0L) {
    return(invisible(NULL))
  }

  # Resolve the caller's environment once, up front, instead of relying on
  # `parent.frame()` being evaluated lazily deep inside `tryCatch()`.
  target_env <- parent.frame()

  # `requireNamespace()` is the cheap way to test for a single package;
  # `installed.packages()` builds a matrix describing every installed package.
  for (pkg in c("rvest", "xml2")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }

  # Newer rvest releases export `session()` and deprecate `html_session()`;
  # use whichever the installed version provides.
  open_session <- if ("session" %in% getNamespaceExports("rvest")) {
    rvest::session
  } else {
    rvest::html_session
  }

  # Turn the flat vector of span texts into a data frame.
  #   texts    - text of every span, thousands separators already removed
  #   classes  - class attribute of every span (NA when absent)
  #   n_header - number of leading spans that form the header row; the first
  #              one is the corner cell, the rest are the column names
  # Spans whose class is exactly "Va(m)" are row labels; the spans between
  # one label and the next are that row's values. A row is filled only when
  # it has exactly one value per column, otherwise it stays NA.
  parse_statement <- function(texts, classes, n_header) {
    if (length(texts) <= n_header) {
      stop("no statement data found on the page", call. = FALSE)
    }
    header_idx <- seq_len(n_header)
    col_names <- texts[header_idx][-1]
    body <- texts[-header_idx]
    is_label <- (!is.na(classes) & classes == "Va(m)")[-header_idx]

    label_pos <- which(is_label)
    if (length(label_pos) == 0L) {
      stop("no row labels found on the page", call. = FALSE)
    }

    n_vals <- n_header - 1L
    # Last span belonging to each row: the one before the next label, or the
    # end of the body for the final row.
    row_end <- c(label_pos[-1] - 1L, length(body))

    out <- matrix(
      NA_real_,
      nrow = length(label_pos),
      ncol = n_vals,
      dimnames = list(body[label_pos], col_names)
    )
    for (r in seq_along(label_pos)) {
      if (row_end[r] - label_pos[r] == n_vals) {
        out[r, ] <- as.numeric(body[(label_pos[r] + 1L):row_end[r]])
      }
    }
    as.data.frame(out)
  }

  # Download one statement page for `symbol` and parse its table spans.
  fetch_statement <- function(symbol, page, xpath, n_header) {
    url <- paste0(
      "https://finance.yahoo.com/quote/", symbol, "/", page, "?p=", symbol
    )
    spans <- rvest::html_nodes(open_session(url), xpath = xpath)
    texts <- gsub(",", "", xml2::xml_text(spans), fixed = TRUE)
    parse_statement(texts, rvest::html_attr(spans, name = "class"), n_header)
  }

  section_xpath <- '//*[@id="Col1-1-Financials-Proxy"]/section/div[4]'
  wide_xpath <- paste0(section_xpath, "//span")
  table_xpath <- paste0(section_xpath, "/div[1]/div[1]//span")

  for (stocknum in seq_along(stock)) {
    symbol <- stock[stocknum]
    tryCatch(
      {
        # Income Statement (6 header spans -> 5 value columns)
        income <- fetch_statement(symbol, "financials", wide_xpath, 6L)
        print(paste(symbol, " Income Statement Success"))

        # Balance Sheet (5 header spans -> 4 value columns)
        balance <- fetch_statement(symbol, "balance-sheet", table_xpath, 5L)
        print(paste(symbol, " Balance Sheet Success"))

        # Cash Flow (6 header spans -> 5 value columns)
        cash_flow <- fetch_statement(symbol, "cash-flow", table_xpath, 6L)
        print(paste(symbol, " Cash Flow Statement Success"))

        # Publish only when all three statements were parsed.
        assign(
          paste0(symbol, ".f"),
          value = list(IS = income, BS = balance, CF = cash_flow),
          envir = target_env
        )
      },
      error = function(cond) {
        # Report the readable message rather than pasting the raw condition
        # object, and keep the ticker separated from the text.
        message(symbol, " Give error: ", conditionMessage(cond))
      }
    )
  }
  invisible(NULL)
}