I'm newish to R (and programming in general) and am automating myself out of a job ;)
I have written a script that (1) takes a CSV file of "API numbers," (2) finds and downloads an HTML table for each API number, and (3) saves the info as a CSV table. It works - it's just not pretty. One of the problems is that the website I'm downloading the data from sometimes returns a 500 Internal Server Error. To deal with the site's sporadic availability, I have built a really ugly chain of if statements that delays the script for increasing amounts of time (2, 4, 8, ... up to 1024 seconds, so in the worst case a single well waits about 34 minutes before the script gives up on it). It's overkill, but I don't want the download to fail when I leave it running overnight.
I'm looking for feedback on this download-retry workaround. Is there a better way to do this? Is there a way to tell R to keep retrying the download until it succeeds?
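For what it's worth, the kind of replacement I have in mind is a single retry loop instead of the chain of ifs, something like this sketch (untested; max_tries is just a placeholder, and dataurl, tempdata, and the 300000-byte size check are the same names and test used in the script below):

    max_tries = 10
    for (attempt in 1:max_tries) {
        err = try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
        # stop retrying once the download succeeded and the file looks big enough to be real data
        if (!inherits(err, "try-error") && file.size(tempdata) >= 300000) break
        Sys.sleep(2^attempt)  # wait 2, 4, 8, ... seconds before the next attempt
    }

but I don't know whether that is considered idiomatic R, or whether a package already handles this better.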
This script downloads the data and saves each API number's table as a separate CSV. The example list contains 60 API numbers; you can find it here: https://www.dropbox.com/s/fwvcxun8hr0xy4n/API%20List.csv?dl=0
Thanks in advance!
######################### User-Defined Parameters ##########################################
### Specify where the API list is and where to download temp data
welllist = ".../API List.csv" # each API will have a seperate CSV in this directory as well
tempdata = ".../tempdata.txt"
######################### Get API List and Parse API ##########################################
wells = read.csv(file = welllist, header = TRUE, sep = ",")
colnum = 1
rownum = nrow(wells)
API = data.frame(1:rownum,"A","B","C",stringsAsFactors = F)
colnames(API) = c("number", "type","county","sequence")
for (i in 1:rownum) {
    # split each API number at its two dashes into type, county, and sequence
    current = toString(wells[i, colnum])
    dashloc = as.data.frame(gregexpr(pattern = "-", text = current))
    type = substr(x = current, start = 1, stop = dashloc[1, 1] - 1)
    if (type != "05") {print(paste("WARNING! API DOES NOT BEGIN WITH 05", "- WELL", i, wells[i, 2]))}
    county = substr(x = current, start = dashloc[1, 1] + 1, stop = dashloc[2, 1] - 1)
    sequence = substr(x = current, start = dashloc[2, 1] + 1, stop = nchar(current))
    API$type[i] = type
    API$county[i] = county
    API$sequence[i] = sequence
}
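### (Side note: I suspect this whole loop could be replaced by something like
### do.call(rbind, strsplit(as.character(wells[[1]]), "-")), but I've kept the
### explicit version because it lets me warn about APIs that don't start with "05".)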
######################### Download the Data ##########################################
end = nrow(API)
for (i in 1:end) {
    county = API$county[i]
    sequence = API$sequence[i]
    # build the query URL, e.g. http://cogcc.state.co.us/production/?&apiCounty=123&apiSequence=45678
    dataurl = paste("http://cogcc.state.co.us/production/?&apiCounty=", county, "&apiSequence=", sequence, sep = "")
    ### ***** U-G-L-Y Retry Data Download if Server Error or if File Size is Too Small ***** ###
    err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(2)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(4)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(8)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(16)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(32)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(64)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(128)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(256)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(512)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        Sys.sleep(1024)
        err <- try(download.file(url = dataurl, destfile = tempdata, quiet = F, mode = "wb"))
    }
    if (class(err) == "try-error" || file.size(tempdata) < 300000) {
        # still failing after all the retries: log it next to the API list and skip this well
        write.csv(x = paste("Error downloading", sequence, "at", Sys.time()), file = paste(dirname(welllist), "errorlog.txt", sep = "/"))
        next
    }
    ### Save the CSV ###
    write.csv(x = tempdata, file = paste(dirname(welllist), "/", sequence, "_production.csv", sep = ""))
}
Periodically the website breaks and download.file fails with: HTTP status was '500 Internal Server Error'
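I have also been wondering whether a package could do the retrying for me. From what I can tell from the httr documentation, httr::RETRY backs off between attempts on its own, so something along these lines might replace the whole chain of ifs (untested sketch - I haven't actually used httr, and the times/pause values are just guesses):

    library(httr)  # assumes the httr package is installed

    resp <- RETRY("GET", dataurl,
                  write_disk(tempdata, overwrite = TRUE),
                  times = 10,        # give up after 10 attempts
                  pause_base = 2,    # roughly 2, 4, 8, ... seconds between attempts
                  pause_cap = 1024)  # never wait longer than ~17 minutes
    if (http_error(resp) || file.size(tempdata) < 300000) {
        # still bad after all the retries - log the failure and skip, as in my errorlog step
    }

Is that the sort of thing people normally do, or is there a simpler base-R approach?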