We are building a Shiny app that scrapes and downloads data frames from a website (Reddit). Our problem: the app works on some computers but not on others. We use the same package versions everywhere, and we are not making many requests. It is not a Windows vs. Mac issue either, since it works on some Windows machines and some Macs but not on others. Do you have any idea what could cause this? Could it be something in the connection settings? It is not the Wi-Fi network either; we tried the failing machines on the same Wi-Fi.
As requested, here are the code and the error messages.
This function is the one we call directly:
scraping_function <- function(search_terms, subreddit,
                              sort_by, time_frame) {
  # 1. collect the permalinks of the matching threads
  exctracted_link <- reddit_urls_mod(search_terms, subreddit,
                                     sort_by, time_frame)
  # 2. download the post and comment content (column 5 holds the permalinks)
  exctracted_data <- reddit_content(exctracted_link[, 5])
  # 3. clean the comment text (column 13)
  exctracted_data[, 13] <- cleaning_text_function(exctracted_data[, 13])
  return(exctracted_data)
}
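For context, the app calls it roughly as in the sketch below. The argument values are the ones from the failing query in the error message further down; in the real app the call sits inside a reactive (the traceback shows eventReactiveHandler / df1), and it cannot run outside a Shiny session because reddit_content() uses withProgress().
# Simplified version of the call made in the app:
# search "Greta" in r/news, sorted by number of comments, over the last week.
extracted <- scraping_function(search_terms = "Greta",
                               subreddit    = "news",
                               sort_by      = "comments",
                               time_frame   = "week")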
These are the functions it calls:
- Extracting the URLs:
reddit_urls_mod<- function (search_terms = "", subreddit = "",
sort_by = "", time_frame= "")
{
if (subreddit == ""){
subreddit <- NA
}
if (search_terms == ""){
search_terms <- NA
}
if (!grepl("^[0-9A-Za-z]*$", subreddit) & !is.na(subreddit) ) {
stop("subreddit must be a sequence of letter and number without special characters and spaces")
}
regex_filter = ""
cn_threshold = 0
page_threshold = 15
wait_time = 1
cached_links = data.frame(date = as.Date(character()),
num_comments = numeric(),
title = character(),
subreddit = character(),
URL = character(),
link = character())
if (sort_by != "front_page"){
if (!grepl("^comments$|^new$|^relevance$|^top$|^front_page$", sort_by)) {
stop("sort_by must be either 'new', 'comments', 'top', 'relevance' or 'front_page'")
}
if (!grepl("^hour$|^day$|^week$|^month$|^year$|^all$", time_frame)) {
stop("time_frame must be either 'hour', 'day', 'week', 'month', 'year or 'all'")
}
sterms = ifelse(is.na(search_terms), NA, gsub("\\s", "+",search_terms))
subreddit = ifelse(is.na(subreddit), "", paste0("r/", gsub("\\s+","+", subreddit), "/"))
sterms = ifelse(is.na(sterms), "", paste0("q=", sterms, "&restrict_sr=on&"))
sterms_prefix = ifelse(sterms == "", "new", "search")
time_frame_in = ifelse(is.na(search_terms), "", paste0("t=",time_frame,"&"))
search_address = search_query = paste0("https://www.reddit.com/",
subreddit, sterms_prefix,
".json?",
sterms,time_frame_in,
"sort=",
sort_by)
} else {
if (is.na(subreddit)) {
stop("if you choose sort_by = front_page please enter a subreddit")
}
search_address = search_query = paste0("https://www.reddit.com/r/",
subreddit,
".json?")
}
next_page = index = ""
page_counter = 0
comm_filter = 10000
while (is.null(next_page) == FALSE & page_counter < page_threshold &
comm_filter >= cn_threshold & length(index) > 0) {
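# Each results page is fetched as raw JSON here; the
# "cannot open URL ... HTTP status was '429'" warning in the log below
# comes from this readLines() call.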
search_JSON = tryCatch(RJSONIO::fromJSON(readLines(search_query,
warn = FALSE)), error = function(e) NULL)
if (is.null(search_JSON)) {
stop(paste("Unable to connect to reddit website or invalid subreddit entered"))
} else if (length(search_JSON$data$children)==0){
stop(paste("This search term returned no results or invalid subreddit entered"))
} else {
contents = search_JSON[[2]]$children
search_permalink = paste0("http://www.reddit.com",
sapply(seq(contents), function(x) contents[[x]]$data$permalink))
search_num_comments = sapply(seq(contents), function(x) contents[[x]]$data$num_comments)
search_title = sapply(seq(contents), function(x) contents[[x]]$data$title)
search_score = sapply(seq(contents), function(x) contents[[x]]$data$score)
search_subreddit = sapply(seq(contents), function(x) contents[[x]]$data$subreddit)
search_link = sapply(seq(contents), function(x) contents[[x]]$data$url)
index = which(search_num_comments >= cn_threshold &
grepl(regex_filter, search_title, ignore.case = T,
perl = T))
if (length(index) > 0) {
search_date = format(as.Date(as.POSIXct(unlist(lapply(seq(contents), function(x) contents[[x]]$data$created_utc)),
origin = "1970-01-01")), "%d-%m-%y")
temp_dat = data.frame(date = search_date,
num_comments = search_num_comments,
title = search_title,
subreddit = search_subreddit,
URL = search_permalink,
link = search_link,
stringsAsFactors = FALSE)[index,]
cached_links = as.data.frame(rbind(cached_links,
temp_dat))
next_page = search_JSON$data$after
comm_filter = utils::tail(search_num_comments,
1)
search_query = paste0(search_address, "&after=",
next_page)
page_counter = page_counter + 1
}
Sys.sleep(min(2, wait_time))
}
}
final_table = cached_links[!duplicated(cached_links), ]
if (dim(final_table)[1] == 0) {
cat(paste("\nNo results retrieved, should be invalid subreddit entered, down server or simply unsuccessful search query :("))
}
else {
remove_row = which(final_table[, 1] == "")
if (length(remove_row) > 0) {
final_table = final_table[-remove_row, ]
}
return(final_table)
}
}
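Called on its own, reddit_urls_mod() returns a data frame with the columns date, num_comments, title, subreddit, URL and link; column 5 (URL) holds the permalinks that are passed on to reddit_content(). A minimal sketch of a direct call, reusing the same arguments as our failing query:
# RJSONIO only needs to be installed (it is called as RJSONIO::fromJSON above).
links <- reddit_urls_mod(search_terms = "Greta",
                         subreddit    = "news",
                         sort_by      = "comments",
                         time_frame   = "week")
head(links$URL)   # column 5: permalinks handed to reddit_content()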
- Extracting the content:
reddit_content <- function (URL, wait_time = 1) {
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else
NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else
NULL
return(list(
paste0(filter, " ", depth),
lapply(1:length(reply.nodes),
function(x)
get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))
))
}
data_extract = data.frame(
id = numeric(),
structure = character(),
post_date = as.Date(character()),
comm_date = as.Date(character()),
num_comments = numeric(),
subreddit = character(),
upvote_prop = numeric(),
post_score = numeric(),
author = character(),
user = character(),
comment_score = numeric(),
controversiality = numeric(),
comment = character(),
title = character(),
post_text = character(),
link = character(),
domain = character(),
URL = character()
)
withProgress(message = 'Work in progress', value = 0, min=0,max=1, {
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(
RJSONIO::fromJSON(readLines(X, warn = FALSE)),
error = function(e)
NULL
)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(
RJSONIO::fromJSON(readLines(X,
warn = FALSE)),
error = function(e)
NULL
)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x)
get.structure(main.node[[x]], x)))
TEMP = data.frame(
id = NA,
structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(
as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")
), "%d-%m-%y"),
comm_date = format(as.Date(
as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")
), "%d-%m-%y"),
num_comments = meta.node$num_comments,
subreddit = ifelse(
is.null(meta.node$subreddit),
"UNKNOWN",
meta.node$subreddit
),
upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score,
author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})),
comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})),
controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})),
comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})),
title = meta.node$title,
post_text = meta.node$selftext,
link = meta.node$url,
domain = meta.node$domain,
URL = URL[i],
stringsAsFactors = FALSE
)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else
print(paste("missed", i, ":", URL[i]))
}
}
incProgress(amount = 1/length(URL))
Sys.sleep(min(2, wait_time))
}
# data_extract[,13] <-
# cleaning_text_function(data_extract[,13])
})
return(data_extract)
}
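reddit_content() takes the character vector of permalinks (column 5 of the table returned above) and builds one row per comment; because it calls withProgress()/incProgress(), it only runs inside an active Shiny session. Roughly (using "links" from the sketch above):
# Must be run from within the Shiny app (withProgress/incProgress need a session).
content <- reddit_content(links[, 5], wait_time = 1)
head(content$comment)   # column 13, the text later passed to cleaning_text_function()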
- Cleaning the text:
cleaning_text_function <- function(x,stopwords=stopwords_vec) {
stopwords_vec <- c(stopwords::stopwords("en"),
                   "don", "isn", "gt", "i", "re", "removed", "deleted", "m",
                   "you re", "we ll", "ve", "hasn", "they re", "id", "tl dr",
                   "didn", "wh", "oh", "tl", "dr", "shes", "hes", "aren",
                   "edit", "ok", "ll", "wasn", "shouldn", "t", "doesn",
                   "youre", "going", "still", "much", "many", "also")
if (is.character(x)) {
#put accented characters back in place of HTML codes (only needed for French)
Encoding(x) <- 'latin1'
#remove the accents
x <- stri_trans_general(x, 'latin-ascii')
x <- unlist(lapply(x, function(x, stopwords = stopwords_vec) {
#separate words
x <- unlist(strsplit(x, " "))
#take out internet links
x <- x[!grepl("\\S+www\\S+|\\S+https://\\S+|https://\\S+", x)]
#take out codes ASCII and ponctuation
x <-gsub("\n|[[:punct:]]|[\x01-\x09\x11-\x12\x14-\x1F\x7F]|gt"," ",x)
#take out simple alone numbers
x <-gsub("(^[0-9]{1}\\s|^[0-9]{1}$|\\s{1}[0-9]{1}$|\\s{1}[0-9]{1}\\s{1})"," ",x)
#take out spaces at the beginning and end of the string
x <-gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
#lowercase
x <- tolower(x)
#take out alone letters
x <-gsub("(^[a-z]{1}\\s+|^[a-z]{1}$|\\s+[a-z]{1}$|\\s+[a-z]{1}\\s+)", "", x)
#take out words in stopwords list
x <-paste(x[!x %in% stopwords], collapse = " ")
#rerun the stopwords filter to get rid of stopwords left inside compound strings
x <- unlist(strsplit(x, " "))
x <-gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
x <-paste(x[!x %in% stopwords], collapse = " ")
return(x)
}))
} else{
stop("please enter a character vector")
}
return(x)
}
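cleaning_text_function() needs the stringi package attached (stri_trans_general() is called without a namespace prefix) and the stopwords package installed. A small, made-up example of what it is meant to do (lower-casing, removal of links, punctuation, lone letters/digits and stopwords):
library(stringi)   # stri_trans_general() is called without the stringi:: prefix

cleaning_text_function(c("I don't think https://www.example.com is relevant",
                         "EDIT: this is GREAT news"))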
And the messages we get:
Listening on http://127.0.0.1:7745
Warning in file(con, "r") :
cannot open URL 'https://www.reddit.com/r/news/search.json?q=Greta&restrict_sr=on&t=week&sort=comments': HTTP status was '429 Unknown Error'
Warning: Error in reddit_urls_mod: Unable to connect to reddit website or invalid subreddit entered
126: stop
125: reddit_urls_mod
124: scraping_function
123: eventReactiveHandler
79: df1
72: observeEventHandler
1: runApp
We get the 429 error even on computers that have never made a request before...
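To narrow it down, we can reproduce the request outside the app with exactly the calls the function uses (readLines() + RJSONIO::fromJSON()); if this already fails with 429 on an affected machine, the block happens at the HTTP level and not in our Shiny code:
# Same request reddit_urls_mod() makes, run outside Shiny on an affected machine.
test_url <- "https://www.reddit.com/r/news/search.json?q=Greta&restrict_sr=on&t=week&sort=comments"
out <- tryCatch(RJSONIO::fromJSON(readLines(test_url, warn = FALSE)),
                error = function(e) conditionMessage(e))
str(out, max.level = 1)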
Thank you