I'm trying to automate downloading the holdings of Vanguard funds from the web. The links resolve through JavaScript, so I'm using Pyppeteer, but I'm not getting the file. Note: the link says CSV, but it actually provides an Excel file.
From my browser it works like this:
- Go to the fund URL, eg https://www.vanguard.com.au/personal/products/en/detail/8225/portfolio
- Follow the link, "See count total holdings"
- Click the link, "Export to CSV"
My attempt to replicate this in Python follows. Following the first link seems to work, because I get different HTML, but clicking the second link gives me the same page rather than a download.
import asyncio
from pyppeteer import launch
import os
async def get_page(browser, url):
    """Open a new tab in *browser*, navigate it to *url*, and return the tab."""
    tab = await browser.newPage()
    await tab.goto(url)
    return tab
async def _save_html(page, path):
    """Write the page's current HTML to *path* (UTF-8) for debugging."""
    html = await page.content()
    with open(path, 'w', encoding="utf-8") as ef:
        ef.write(html)


async def fetch(url):
    """Open a fund page and trigger its "Export to CSV" download(s).

    Loads *url*, clicks the "See count total holdings" link, then clicks
    every "Export to CSV" link that is found. HTML snapshots of each stage
    are written to the ``vanguard`` directory for debugging, and the browser
    is instructed to save downloaded files into that same directory.

    Args:
        url: Address of the fund's portfolio page.

    Returns:
        False if the "See count total holdings" link is not found,
        True otherwise.
    """
    wkg_dir = 'vanguard'
    # open() does not create missing directories, so make sure it exists.
    os.makedirs(wkg_dir, exist_ok=True)

    browser = await launch(options={'args': ['--no-sandbox']})
    try:
        page = await get_page(browser, url)

        # Headless Chromium does not save downloads by default, so clicking
        # the export link appears to do nothing. Explicitly allow downloads
        # and point them at the working directory (absolute path required).
        cdp = await page.target.createCDPSession()
        await cdp.send('Page.setDownloadBehavior', {
            'behavior': 'allow',
            'downloadPath': os.path.abspath(wkg_dir),
        })

        await page.waitFor(2000)
        # save the page so we can see the source
        await _save_html(page, os.path.join(wkg_dir, '8225.htm'))

        accept = await page.xpath('//a[contains(., "See count total holdings")]')
        print(f'Found {len(accept)} "See count total holdings" links')
        if not accept:
            print('DID NOT FIND THE LINK')
            return False
        await accept[0].click()
        await page.waitFor(2000)

        # save the pop-up page for debug
        await _save_html(page, os.path.join(wkg_dir, 'first_page.htm'))

        links = await page.xpath('//a[contains(., "Export to CSV")]')
        print(f'Found {len(links)} "Export to CSV" links')  # 3 of these

        seen = set(os.listdir(wkg_dir))
        for i, link in enumerate(links):
            print(f'Trying link {i}')
            await link.click()
            await page.waitFor(2000)

            # The download arrives as a separate file in wkg_dir; report
            # anything that appeared since the previous click.
            current = set(os.listdir(wkg_dir))
            print(f'New files after link {i}: {sorted(current - seen)}')

            # page.content() is still the page's HTML, not the exported
            # data, so keep it as a .htm debug snapshot rather than a .csv
            # that could be mistaken for the real download.
            await _save_html(page, os.path.join(wkg_dir, f'csv_page{i}.htm'))
            seen = set(os.listdir(wkg_dir))
        return True
    finally:
        # Always shut Chromium down, including on early return or error.
        await browser.close()
#---------- Main ------------
# Set constants and global variables
url = 'https://www.vanguard.com.au/personal/products/en/detail/8225/portfolio'
# asyncio.get_event_loop() is deprecated when no loop is running and fails on
# newer Python versions, so create and register the loop explicitly.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Block until fetch() finishes; status holds its True/False result.
status = loop.run_until_complete(fetch(url))
Would love to hear suggestions from anyone that knows Puppeteer / Pyppeteer well.