This may look like a site that uses XHR via javascript to retrieve the URL contents, but it's not. It's just a plain web site that can easily be scraped via standard rvest & xml2 calls like html_session and read_html. It does keep the Location: URL the same, so it kinda looks like XHR even though it's not.
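To see that for yourself, here's a minimal sketch that just fetches and parses the search page directly:

library(xml2)

# the page is rendered server-side, so a plain read_html() of the
# URL is all it takes; no javascript/XHR emulation is required
pg <- read_html("https://www.fbo.gov/index?s=opportunity&mode=list&tab=list")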
However, this is a <form>-based site, which means you could be generous to the community and write an R wrapper for the "hidden" API and possibly donate it to rOpenSci.

To that end, I used the curlconverter package on the "Copy as cURL" content from the POST request and it provided all the form fields (which seem to map to most, if not all, of the fields on the advanced search page):
library(curlconverter)

make_req(straighten())[[1]] -> req

httr::VERB(verb = "POST",
           url = "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list",
           httr::add_headers(Pragma = "no-cache",
                             Origin = "https://www.fbo.gov",
                             `Accept-Encoding` = "gzip, deflate, br",
                             `Accept-Language` = "en-US,en;q=0.8",
                             `Upgrade-Insecure-Requests` = "1",
                             `User-Agent` = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.41 Safari/537.36",
                             Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                             `Cache-Control` = "no-cache",
                             Referer = "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list",
                             Connection = "keep-alive",
                             DNT = "1"),
           httr::set_cookies(PHPSESSID = "32efd3be67d43758adcc891c6f6814c4",
                             sympcsm_cookies_enabled = "1",
                             BALANCEID = "balancer.172.16.121.7"),
           body = list(`dnf_class_values[procurement_notice][keywords]` = "machine+learning",
                       `dnf_class_values[procurement_notice][_posted_date]` = "365",
                       search_filters = "search",
                       `_____dummy` = "dnf_",
                       so_form_prefix = "dnf_",
                       dnf_opt_action = "search",
                       dnf_opt_template = "VVY2VDwtojnPpnGoobtUdzXxVYcDLoQW1MDkvvEnorFrm5k54q2OU09aaqzsSe6m",
                       dnf_opt_template_dir = "Pje8OihulaLVPaQ+C+xSxrG6WrxuiBuGRpBBjyvqt1KAkN/anUTlMWIUZ8ga9kY+",
                       dnf_opt_subform_template = "qNIkz4cr9hY8zJ01/MDSEGF719zd85B9",
                       dnf_opt_finalize = "0",
                       dnf_opt_mode = "update",
                       dnf_opt_target = "",
                       dnf_opt_validate = "1",
                       `dnf_class_values[procurement_notice][dnf_class_name]` = "procurement_notice",
                       `dnf_class_values[procurement_notice][notice_id]` = "63ae1a97e9a5a9618fd541d900762e32",
                       `dnf_class_values[procurement_notice][posted]` = "",
                       `autocomplete_input_dnf_class_values[procurement_notice][agency]` = "",
                       `dnf_class_values[procurement_notice][agency]` = "",
                       `dnf_class_values[procurement_notice][zipstate]` = "",
                       `dnf_class_values[procurement_notice][procurement_type][]` = "",
                       `dnf_class_values[procurement_notice][set_aside][]` = "",
                       mode = "list"),
           encode = "form")
curlconverter adds the httr:: prefixes to the various functions since you can actually use req() to make the request; it's a bona fide R function.
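For example, a quick sanity check with the generated function (assuming the req object created above):

req_res <- req()            # issues the POST just as the browser did
httr::status_code(req_res)  # 200 if the search request went through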
However, most of the data being passed in is browser "cruft" and can be trimmed down a bit and moved into a POST request:
library(httr)
library(rvest)

POST(url = "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list",
     add_headers(Origin = "https://www.fbo.gov",
                 Referer = "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list"),
     set_cookies(PHPSESSID = "32efd3be67d43758adcc891c6f6814c4",
                 sympcsm_cookies_enabled = "1",
                 BALANCEID = "balancer.172.16.121.7"),
     body = list(`dnf_class_values[procurement_notice][keywords]` = "machine+learning",
                 `dnf_class_values[procurement_notice][_posted_date]` = "365",
                 search_filters = "search",
                 `_____dummy` = "dnf_",
                 so_form_prefix = "dnf_",
                 dnf_opt_action = "search",
                 dnf_opt_template = "VVY2VDwtojnPpnGoobtUdzXxVYcDLoQW1MDkvvEnorFrm5k54q2OU09aaqzsSe6m",
                 dnf_opt_template_dir = "Pje8OihulaLVPaQ+C+xSxrG6WrxuiBuGRpBBjyvqt1KAkN/anUTlMWIUZ8ga9kY+",
                 dnf_opt_subform_template = "qNIkz4cr9hY8zJ01/MDSEGF719zd85B9",
                 dnf_opt_finalize = "0",
                 dnf_opt_mode = "update",
                 dnf_opt_target = "",
                 dnf_opt_validate = "1",
                 `dnf_class_values[procurement_notice][dnf_class_name]` = "procurement_notice",
                 `dnf_class_values[procurement_notice][notice_id]` = "63ae1a97e9a5a9618fd541d900762e32",
                 `dnf_class_values[procurement_notice][posted]` = "",
                 `autocomplete_input_dnf_class_values[procurement_notice][agency]` = "",
                 `dnf_class_values[procurement_notice][agency]` = "",
                 `dnf_class_values[procurement_notice][zipstate]` = "",
                 `dnf_class_values[procurement_notice][procurement_type][]` = "",
                 `dnf_class_values[procurement_notice][set_aside][]` = "",
                 mode = "list"),
     encode = "form") -> res
This portion:

set_cookies(PHPSESSID = "32efd3be67d43758adcc891c6f6814c4",
            sympcsm_cookies_enabled = "1",
            BALANCEID = "balancer.172.16.121.7")
makes me think you should use html_session or GET at least once on the main URL to establish those cookies in the cached curl handler (which will be created & maintained automagically for you). The add_headers() bit may also not be necessary, but that's an exercise left for the reader.
You can find the table you're looking for via:
content(res, as = "text", encoding = "UTF-8") %>%
  read_html() %>%
  html_node("table.list") %>%
  html_table() %>%
  dplyr::glimpse()
## Observations: 20
## Variables: 4
## $ Opportunity <chr> "NSN: 1650-01-074-1054; FILTER ELEMENT, FLUID; WSIC: L SP...
## $ Agency/Office/Location <chr> "Defense Logistics Agency DLA Acquisition LocationsDLA Av...
## $ Type / Set-aside <chr> "Presolicitation", "Presolicitation", "Award", "Award", "...
## $ Posted On <chr> "Sep 28, 2016", "Sep 28, 2016", "Sep 28, 2016", "Sep 28, ...
There's an indicator on the page saying these are results "1 - 20 of 2008". You'll need to scrape that as well and deal with the paginated results. That, too, is left as an exercise for the reader.
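A hedged sketch of the counting half (the CSS selector here is a guess; inspect the results page to find the element that actually holds the "1 - 20 of 2008" text):

# ".lst-cnt" is hypothetical; substitute the real selector from the page
pg  <- content(res, as = "text", encoding = "UTF-8") %>% read_html()
cnt <- pg %>% html_node(".lst-cnt") %>% html_text(trim = TRUE)  # e.g. "1 - 20 of 2008"

total   <- as.numeric(sub(".*of\\s+([0-9]+).*", "\\1", gsub(",", "", cnt)))
n_pages <- ceiling(total / 20)  # 20 results per page

From there you'd re-issue the POST once per page with whatever paging field the form uses (check the browser's network tab for the parameter name).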