The idea is to load the page with PhantomJS
, wait for the contents of the workbook to load, get all the necessary parameters for the download file handler endpoint request which we can do with requests
package.
Full working solution:
import json
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
WORKBOOK_TYPE = "PublishedItemsSnapshot"
driver = webdriver.PhantomJS()
driver.maximize_window()
driver.get('http://www.cbe.org.eg/en/EconomicResearch/Publications/_layouts/xlviewer.aspx?id=/MonthlyStatisticaclBulletinDL/External%20Sector%20Data%20235.xlsx&DefaultItemOpen=1#')
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.ID, "ctl00_PlaceHolderMain_m_excelWebRenderer_ewaCtl_rowHeadersDiv")))
# get workbook uri
hidden_input = wait.until(EC.presence_of_element_located((By.ID, "ctl00_PlaceHolderMain_m_excelWebRenderer_ewaCtl_m_workbookContextJson")))
workbook_uri = json.loads(hidden_input.get_attribute('value'))['EncryptedWorkbookUri']
# get session id
session_id = driver.find_element_by_id("ctl00_PlaceHolderMain_m_excelWebRenderer_ewaCtl_m_workbookId").get_attribute("value")
# get workbook filename
workbook_filename = driver.find_element_by_xpath("//h2[contains(@class, 's4-mini-header')]/span[contains(., '.xlsx')]").text
driver.close()
print("Downloading workbook '%s'..." % workbook_filename)
response = requests.get("http://www.cbe.org.eg/en/EconomicResearch/Publications/_layouts/XlFileHandler.aspx", params={
'id': workbook_uri,
'sessionId': session_id,
'workbookFileName': workbook_filename,
'workbookType': WORKBOOK_TYPE
})
with open(workbook_filename, 'wb') as f:
for chunk in response.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)