I am trying to read multiple result pages using rvest to collect all the data on a page.
I have tried adapting code from this :
R web scraping across multiple pages
but I only get the first page for the results.
library(httr)
library(rvest)
library(dplyr)
library(magrittr)
library(stringr)
library(lubridate)
library(purrr)
library(jsonlite)
library(rjson)
library(tidyverse)
body_tags_1 <- lapply(paste0('https://www.eventbrite.com/d/ny--new-york/conference/?page=', 1:49),
function(url){
url %>% read_html() %>%
html_nodes("body") %>%
html_text() %>%
toString() # to produce a single character string describing an R object.
})
tmp <- str_match_all(body_tags_1,'window.__SERVER_DATA__ = (.*);')
# Convert R objects from JSON - output - list and flatten the JSON object
json <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)
# get Event name from json
Event_Name <- json$suggestions$events$name
# convert to data frame
Event_Name <- as.data.frame(Event_Name)
# convert column variable to character from factor
Event_Name <- data.frame(lapply(Event_Name, as.character), stringsAsFactors=FALSE)
# get event date from dates
Event_Date <- json$suggestions$events$start_date
# convert dates to data frame
Event_Date <- as.data.frame(Event_Date)
# Convert dates to character
Event_Date <- data.frame(lapply(Event_Date, as.character), stringsAsFactors=FALSE)
#`## convert Date from chr format to Date format using lubridate
Event_Date$Event_Date <- ymd(Event_Date$Event_Date)
# get Location from json
Location <- json$suggestions$events$primary_venue.address.city
# convert to data frame
Location <- as.data.frame(Location)
# convert column variable to character from factor
Location <- data.frame(lapply(Location, as.character), stringsAsFactors=FALSE)
Tickets <- json$suggestions$events$ticket_availability.minimum_ticket_price.major_value
# transform a vector into data frame with fixed dimension
# by converting to a matrix, specify the ncol
m1 <- matrix(Tickets, ncol=1, byrow=TRUE)
# convert to data frame
Tickets <- as.data.frame(m1, stringsAsFactors=FALSE)
Tickets <- as.data.frame(Tickets)
# get Currency from json
Currency <- json$suggestions$events$ticket_availability.minimum_ticket_price.currency
Currency
# convert to data frame
Currency <- as.data.frame(Currency)
str(Currency)
# convert column variable to character from factor
Currency <- data.frame(lapply(Currency, as.character), stringsAsFactors=FALSE)
# bind all the data together by columns
all_data_bind <- cbind.data.frame(Event_Name, Event_Date, Location, Tickets, Currency)
# rename V1 as Mininum Price
all_data <- all_data_bind %>%
rename(Min_Price = V1)
all_data$Min_Price <- as.numeric(all_data$Min_Price)
# remove rows with na
all_data_1 <- all_data %>% drop_na()
all_data_1
str(all_data_1)
# keep rows with price > 200
all_data_filter_Price <- filter(all_data_1, Min_Price > 200)
all_data_filter_Price
I expected all pages but these are the results I get: dput(all_data_1)
structure(list(Event_Name = c("AFROPUNK FEST BROOKLYN 2019",
"New York: The Wizard's Brunch & Dinner ", "ROOFTOP PARTY | SATURDAY NIGHT | Sky Room NYC Tallest Rooftop Bar Lounge Times Square ",
"2019 Tunnel to Towers 5K Run & Walk - NEW YORK CITY", "CIRCLE OF SISTERS 2019",
"RuPaul's DragCon NYC 2019", "Caribbean Concerts at Six Flags 2019",
"NYC Ravel Penthouse 808 Rooftop Saturdays Everyone FREE onlist (Gametight)",
"Comic Con For Kids (Philadelphia, PA)", "AFROBEATS & BRUNCH "
), Event_Date = structure(c(18132, 18124, 18111, 18168, 18146,
18145, 18126, 18111, 18181, 18112), class = "Date"), Location = c("Brooklyn",
"New York City", "New York", "Brooklyn", "New York", "New York",
"Jackson", "Queens", "Oaks", "New York"), Min_Price = c(60, 45,
0, 0, 22.99, 0, 0, 0, 14.99, 0), Currency = c("USD", "USD", "USD",
"USD", "USD", "USD", "USD", "USD", "USD", "USD")), row.names = c(NA,
10L), class = "data.frame")