0

I am trying to create a Python script that will complete a search at the DOE website with specific search parameters (Institution Name like: and Most Recent Award Date:) so that I can then parse the data later in my script. If this returns multiple pages, I will need to get the data from each page. I cannot figure out how to get the site to return any search results at all.

I found this StackOverflow response, which seems like exactly what I need, but when I run the following code:

import requests
from lxml import etree

URL = 'https://pamspublic.science.energy.gov/WebPAMSExternal/Interface/Awards/AwardSearchExternal.aspx'


def get_fields():
    """Fetch the DOE award-search page and map its form inputs to values.

    Returns a dict of ``{input-name: input-value}`` for every ``<input>``
    inside the ASP.NET form, or ``None`` when the HTTP request fails.

    Inputs rendered without a ``name`` attribute are skipped — indexing
    ``e.attrib['name']`` unconditionally is what raised the ``KeyError``
    in the original traceback.
    """
    res = requests.get(URL)
    if res.ok:
        page = etree.HTML(res.text)
        fields = page.xpath('//form[@id="aspnetForm"]//input')
        return {
            e.attrib['name']: e.attrib.get('value', '')
            for e in fields
            if 'name' in e.attrib  # guard: not every input carries a name
        }
    return None  # explicit: request failed, no fields available


get_fields()

I get this error:

Traceback (most recent call last):
  File "/home/austin/repos/funding-scraper/doe_scraper.py", line 15, in <module>
    get_fields()
  File "/home/austin/repos/funding-scraper/doe_scraper.py", line 13, in get_fields
    return { e.attrib['name']: e.attrib.get('value', '') for e in fields }
  File "/home/austin/repos/funding-scraper/doe_scraper.py", line 13, in <dictcomp>
    return { e.attrib['name']: e.attrib.get('value', '') for e in fields }
  File "src/lxml/etree.pyx", line 2497, in lxml.etree._Attrib.__getitem__
KeyError: 'name'

EDIT1:

An example query with specific search parameters:

Institution name like: University of Texas

Most Recent Award Date: Between: 1/1/2023 and: 1/31/2023

I don't know what the exact response would look like, but it should include results from this search that contain multiple html/json/xml fields for each result entry (e.g. Award Number, Title, Institution, Amount Awarded to Date, etc.)

EDIT2:

After much trial and error, I pieced together a half-solution:

import requests
from bs4 import BeautifulSoup

URL = 'https://pamspublic.science.energy.gov/WebPAMSExternal/Interface/Awards/AwardSearchExternal.aspx'


def get_fields():
    """Fetch the search page and build the POST form data for the search.

    Scrapes the two hidden ASP.NET state fields the server requires
    (Telerik script-manager token and ``__VIEWSTATE``) and combines them
    with the visible search parameters.

    Returns the form-data dict, or ``None`` when the HTTP request fails
    (the original fell off the end of the function and returned ``None``
    implicitly, which ``query`` then posted without noticing).
    """
    res = requests.get(URL)
    if not res.ok:
        return None
    soup = BeautifulSoup(res.content, 'html.parser')
    script_manager = soup.find(attrs={"name": "ctl00_REIRadScriptManager1_TSM"})['value']
    viewstate = soup.find(attrs={"name": "__VIEWSTATE"})['value']
    return {
        "ctl00_REIRadScriptManager1_TSM": script_manager,
        "__VIEWSTATE": viewstate,
        "ctl00$MainContent$pnlSearch$txtInstitutionName": "University of Texas",
        "ctl00$MainContent$pnlSearch$dpAwardDateFrom$dateInput": "1/1/2023",
        "ctl00$MainContent$pnlSearch$dpAwardDateTo$dateInput": "1/31/2023"
    }


def query():
    """Run the search POST and save the response HTML to results.html."""
    formdata = get_fields()
    if formdata is None:
        # Don't silently post an empty body when the form fetch failed.
        raise RuntimeError('could not retrieve search form fields from ' + URL)
    res = requests.post(URL, formdata)
    if res.ok:
        soup = BeautifulSoup(res.content, 'html.parser')
        # Explicit encoding so the write doesn't depend on the platform default.
        with open('results.html', 'w', encoding='utf-8') as results:
            results.write(str(soup))


query()

This creates a parse-able document (which I will figure out later) that includes search results. However, it is not applying the DateFrom or DateTo inputs, and it's returning only the first 15 results. Any help on adding these parameters to my post request would be appreciated!

darrowboat
  • 33
  • 4
  • Can you edit your question with an example of a specific set of search parameters and the exact expected output from that search? – Jack Fleeting Feb 21 '23 at 13:18
  • I edited my question to include the exact search parameters I would be using (specific dates would change each month). I don't know exactly what to expect from the output, but as long as it returns the same results one would get from a manual browser search in any format that I can parse, I can work with it! @JackFleeting – darrowboat Feb 21 '23 at 14:05

1 Answer

0

I wanted to post a working solution I finally discovered, in case it helps anyone in the future:

import requests
from bs4 import BeautifulSoup
import argparse
import datetime

URL = 'https://pamspublic.science.energy.gov/WebPAMSExternal/Interface/Awards/AwardSearchExternal.aspx'
# CLI input format = YYYYMMDD
# The site expects M/D/YYYY dates plus a Y-m-d-H-M-S "ClientState"
# validation string for each Telerik date picker.

parser = argparse.ArgumentParser(description='Scrape DOE funded awards')
parser.add_argument('-s', '--start', dest='start_date', help='range start date, format = YYYYMMDD', required=True)
parser.add_argument('-e', '--end', dest='end_date', help='range end date, format = YYYYMMDD', required=True)
args = parser.parse_args()


def _mdy(dt):
    """Format a datetime as M/D/YYYY without zero padding.

    strftime's ``%-m``/``%-d`` modifiers are a glibc extension and raise
    on Windows, so build the string from the date components instead.
    """
    return f"{dt.month}/{dt.day}/{dt.year}"


_start_dt = datetime.datetime.strptime(args.start_date, '%Y%m%d')
_end_dt = datetime.datetime.strptime(args.end_date, '%Y%m%d')
START = _mdy(_start_dt)
START_VALIDATION = _start_dt.strftime('%Y-%m-%d-00-00-00')
END = _mdy(_end_dt)
END_VALIDATION = _end_dt.strftime('%Y-%m-%d-23-59-59')

with requests.Session() as session:
    # Start a session with a post request to the URL
    res = session.post(URL)
    if not res.ok:
        # Fail loudly: the original only built `payload` inside `if res.ok`
        # but posted it unconditionally, so a failed first request died
        # later with a confusing NameError instead of a clear message.
        raise SystemExit(f'initial request to {URL} failed: HTTP {res.status_code}')

    # Use response to grab hidden fields necessary for a valid search
    soup = BeautifulSoup(res.content, 'html.parser')
    script_manager = soup.find(attrs={"name": "ctl00_REIRadScriptManager1_TSM"})['value']
    viewstate = soup.find(attrs={"name": "__VIEWSTATE"})['value']
    viewstategenerator = soup.find(attrs={"name": "__VIEWSTATEGENERATOR"})['value']
    payload = {
        "ctl00_REIRadScriptManager1_TSM": script_manager,
        "__EVENTTARGET": "ctl00$MainContent$pnlSearch",
        "__EVENTARGUMENT": "CustomSortSelected=False SearchPanelExpanded=True Search",
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstategenerator,
        # Institution name like:
        "ctl00$MainContent$pnlSearch$txtInstitutionName": "University of Texas",
        # Award start date:
        "ctl00$MainContent$pnlSearch$dpPPSDFrom$dateInput": f"{START}",
        "ctl00_MainContent_pnlSearch_dpPPSDFrom_dateInput_ClientState": f"{{'enabled':true,'emptyMessage':'','validationText':'{START_VALIDATION}','valueAsString':'{START_VALIDATION}','minDateStr':'1980-00-01-00-01-00','maxDateStr':'2099-00-31-00-12-00','lastSetTextBoxValue':'{START}'}}",
        "ctl00$MainContent$pnlSearch$dpPPSDTo$dateInput": f"{END}",
        "ctl00_MainContent_pnlSearch_dpPPSDTo_dateInput_ClientState": f"{{'enabled':true,'emptyMessage':'','validationText':'{END_VALIDATION}','valueAsString':'{END_VALIDATION}','minDateStr':'1980-00-01-00-01-00','maxDateStr':'2099-00-31-00-12-00','lastSetTextBoxValue':'{END}'}}"
    }

    # Make updated post request to perform actual search
    res = session.post(URL, data=payload)
    if res.ok:
        soup = BeautifulSoup(res.content, 'html.parser')
        # Explicit encoding so the write doesn't depend on the platform default.
        with open('results.html', 'w', encoding='utf-8') as results:
            results.write(str(soup))

Big thanks to the author of this post for leading me in the right direction!

darrowboat
  • 33
  • 4