0

I am trying to read multiple result pages using rvest to collect all the data on a page.

I have tried adapting code from this :

R web scraping across multiple pages

but I only get the first page for the results.

library(httr)
library(rvest)
library(dplyr)
library(magrittr)
library(stringr)
library(lubridate)
library(purrr)
library(jsonlite)
library(rjson)
library(tidyverse)

body_tags_1 <- lapply(paste0('https://www.eventbrite.com/d/ny--new-york/conference/?page=', 1:49),
                function(url){
                  url %>% read_html() %>% 
                    html_nodes("body") %>% 
                    html_text() %>% 
                    toString() # to produce a single character string describing an R object.
                })

tmp <- str_match_all(body_tags_1,'window.__SERVER_DATA__ = (.*);')  


# Convert R objects from JSON - output - list and flatten the JSON object
json <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)

# get Event name from json
Event_Name <- json$suggestions$events$name
# convert to data frame
Event_Name <- as.data.frame(Event_Name)
# convert column variable to character from factor
Event_Name <- data.frame(lapply(Event_Name, as.character), stringsAsFactors=FALSE)

# get event date from dates 
Event_Date <- json$suggestions$events$start_date

# convert dates to data frame
Event_Date <- as.data.frame(Event_Date)

# Convert dates to character
Event_Date <- data.frame(lapply(Event_Date, as.character), stringsAsFactors=FALSE)

#`## convert Date from chr format to Date format using lubridate
Event_Date$Event_Date <- ymd(Event_Date$Event_Date)
# get Location from json
Location <- json$suggestions$events$primary_venue.address.city 

# convert to data frame
Location <- as.data.frame(Location)
# convert column variable to character from factor
Location <- data.frame(lapply(Location, as.character), stringsAsFactors=FALSE)

Tickets <- json$suggestions$events$ticket_availability.minimum_ticket_price.major_value  

# transform a vector into data frame with fixed dimension
# by converting to a matrix, specify the ncol
m1 <- matrix(Tickets, ncol=1, byrow=TRUE)

# convert to data frame
Tickets <- as.data.frame(m1, stringsAsFactors=FALSE)
Tickets <- as.data.frame(Tickets)

# get Currency from json
Currency <- json$suggestions$events$ticket_availability.minimum_ticket_price.currency
Currency
# convert to data frame
Currency <- as.data.frame(Currency)
str(Currency)
# convert column variable to character from factor
Currency <- data.frame(lapply(Currency, as.character), stringsAsFactors=FALSE)

# bind all the data together by columns
all_data_bind <- cbind.data.frame(Event_Name, Event_Date, Location, Tickets, Currency)

# rename V1 as Mininum Price
all_data <- all_data_bind %>% 
  rename(Min_Price = V1)

all_data$Min_Price <- as.numeric(all_data$Min_Price)

# remove rows with na
all_data_1 <- all_data %>% drop_na()
all_data_1
str(all_data_1)
# keep rows with price > 200
all_data_filter_Price <- filter(all_data_1, Min_Price > 200)
all_data_filter_Price

I expected all pages but these are the results I get: dput(all_data_1)

structure(list(Event_Name = c("AFROPUNK FEST BROOKLYN 2019", 
"New York: The Wizard's Brunch & Dinner ", "ROOFTOP PARTY | SATURDAY NIGHT | Sky Room NYC Tallest Rooftop Bar Lounge  Times Square ", 
"2019 Tunnel to Towers 5K Run & Walk - NEW YORK CITY", "CIRCLE OF SISTERS 2019", 
"RuPaul's DragCon NYC 2019", "Caribbean Concerts at Six Flags 2019", 
"NYC Ravel Penthouse 808 Rooftop Saturdays Everyone FREE onlist (Gametight)", 
"Comic Con For Kids (Philadelphia, PA)", "AFROBEATS & BRUNCH "
), Event_Date = structure(c(18132, 18124, 18111, 18168, 18146, 
18145, 18126, 18111, 18181, 18112), class = "Date"), Location = c("Brooklyn", 
"New York City", "New York", "Brooklyn", "New York", "New York", 
"Jackson", "Queens", "Oaks", "New York"), Min_Price = c(60, 45, 
0, 0, 22.99, 0, 0, 0, 14.99, 0), Currency = c("USD", "USD", "USD", 
"USD", "USD", "USD", "USD", "USD", "USD", "USD")), row.names = c(NA, 
10L), class = "data.frame")
EJG_27
  • 111
  • 10

1 Answers1

0

You are getting all pages but you are not processing all the returned items in the regex.

The problem is this line I think:

json <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)

You are only working with the first regex match group from all the ones you are interested in. You need to work with all 49 i.e. length(tmp) .You can check this with:

> json1 <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)
> json1$page_number
[1] 1
> json2 <- jsonlite::fromJSON(tmp[[2]][,2], flatten=TRUE)
> json2$page_number
[1] 2
> 

You get the idea. You could write a function which extracts from any given page returning the info of interest and apply that to all returned regex groups of interest.

QHarr
  • 83,427
  • 12
  • 54
  • 101
  • for (k in 1:49){ json[k] <- jsonlite::fromJSON(tmp[[k]][,2], flatten=TRUE) # Something like this? – EJG_27 Aug 03 '19 at 13:18
  • there is probably a more efficient way using tools in tidyverse to be honest but yes that is one way to access each item – QHarr Aug 03 '19 at 13:37
  • Please explain what you mean by doesn't work? If you run the pastebin code and compare the two output files they should match with the json file links I gave. If you inspect you should see different content in index1 versus index2 - when examining the results attribute and the page number – QHarr Aug 05 '19 at 20:48
  • Did you work this out so you got same results as I did? – QHarr Aug 08 '19 at 07:29