Download the File which has stream-url is the chrome extension in the embed tag using selenium webdriver in python

Question

In my code I click the View button, which reveals a hidden document, and I need to download that document using Selenium WebDriver in Python. When I inspect the page, the embed tag has stream-url = chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai/85967fa5-7853-412e-bbe5-c96406308ec6. I don't understand how to download the document from it.

enter code here
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import urllib.request
from bs4 import BeautifulSoup
import os
from selenium.webdriver.support.select import Select
import time
import pandas as pd
# Search the MahaRERA portal for one registered project, open its detail
# page, store the "Uploaded Documents" title and table in uploads.csv, and
# finally click the View button of one uploaded document.
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)

# Open the "Search Project Details" form.  The XPath is kept in one logical
# string: a line break inside the literal is a syntax error and would also
# put whitespace inside the class name being matched.
WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable(
        (By.XPATH,
         "//div[@class='search-pro-details']"
         "//a[contains(.,'Search Project Details')]"))).click()

# Select the radio button with id "Promoter" (clicked through JavaScript).
Registered_Project_radio = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)

# Enter the certificate number and run the search.
Application = driver.find_element(By.ID, "CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)

# The result links are rendered after the search click returns; without a
# pause the anchor list below can come back empty.
time.sleep(5)

# Collect every link target on the result page (one attribute read per
# anchor) and take the first one as the project detail page.
hrefs = [item.get_attribute('href')
         for item in driver.find_elements(By.TAG_NAME, "a")]
View = [href for href in hrefs if href is not None]
if not View:
    # Fail with a clear message instead of a bare IndexError on View[0].
    raise RuntimeError("no links found on the search result page")
View = View[0]

# Load the detail page in the browser and also fetch its HTML directly.
request = urllib.request.Request(View)
driver.get(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')

# Title of the panel inside the "DivDocument" section.
divPInfo = soup.find("div", {"id": "DivDocument"})
title = (divPInfo.find("div", {'class': 'x_panel'}, recursive=False)
         .find("div", {'class': 'x_title'})
         .find("h2").text.strip())
print(title)

# The `with` block closes the file itself; no explicit close() is needed.
with open("uploads.csv", "a") as csv_file:
    csv_file.write(title + "\n")

# Table index 11 of the rendered page is written below the title.
# mode="a" appends; the default write mode would erase the title line that
# was just written.
table = pd.read_html(driver.page_source)[11]
print(table)
table.to_csv("uploads.csv", mode="a", sep=',', index=False)

# Click the View button of the document row with id "btnShow_10".
btn = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable(
        (By.XPATH,
         "//button[@class='btn btn-info btn-xs' and @id='btnShow_10']")))
driver.execute_script("arguments[0].click();", btn)

On my computer it always gives an error on `View = View[0]` because it can't find any links — furas, Jul 23 '19 at 11:31
I checked this `View` URL and it returns `HTML`, not `PDF`, so you would have to download the `HTML`, search for all `<img>` tags and download all the images. — furas, Jul 23 '19 at 13:28
@furas - Actually when I clicked the View button it redirect me to another window of pdf which has stream-url containing the chrome-extension doesn't contain the — A.D, Jul 24 '19 at 05:24
so it uses chrome extension to display this page. And this extension may have hidden url. But as I know Selenium may not have access to extensions. — furas, Jul 24 '19 at 10:24
@furas - Yes, it uses a Chrome extension, so I am not able to download the scanned docs embedded in `<embed>`. Can you please suggest a solution? — A.D, Jul 24 '19 at 10:27
@furas- Can we have another method to download that scan documents. — A.D, Jul 24 '19 at 12:49
This page displays the scan as a PDF in the same way as in the previous question. It uses `<object data="...">`. — furas, Jul 25 '19 at 10:15

furas · Accepted Answer · 2020-05-12T06:12:37.223

In Firefox page uses <object data="..."> to display PDF with scan. There are buttons in section "Uploaded Documents" to display other scans.

This code uses these buttons to display scans, get data from <object> and save in files document-0.pdf, document-1.pdf, etc.

I use the same code you could see in my answer to your previous question:
Save the pdf using the selenium webdriver in python

import base64
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

# Open one MahaRERA project page in Firefox, click every button in the
# "Uploaded Documents" section, read the scan from the <object> element's
# `data` attribute and save it as document-0.pdf, document-1.pdf, ...
url = 'https://maharerait.mahaonline.gov.in'

#chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
#driver = webdriver.Chrome(executable_path=chrome_path)

driver = webdriver.Firefox()

# try/finally guarantees the browser is shut down even when a wait times out
# or an element is missing.
try:
    driver.get(url)

    # Open the search form, choose the "Promoter" radio button, enter the
    # certificate number and run the search.
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH,"//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
    registered_project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,"Promoter")))
    driver.execute_script("arguments[0].click();", registered_project_radio)

    application = driver.find_element(By.ID, "CertiNo")
    application.send_keys("P50500000005")

    search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,"btnSearch")))
    driver.execute_script("arguments[0].click();", search)

    # Fixed pause so the search results can render before links are read.
    time.sleep(5)

    # Collect all link targets on the page, reading each attribute once.
    hrefs = [item.get_attribute('href')
             for item in driver.find_elements(By.TAG_NAME, "a")]
    links = [href for href in hrefs if href is not None]

    # Stop with a clear message instead of handing an empty list to
    # driver.get() when the page has no links.
    if not links:
        raise RuntimeError('no links found on the search result page')

    # The first link is used as the project detail page.
    View = links[0]

    #-------------------------------------------------------------------------

    # load page
    driver.get(View)

    # find buttons in section `Uploaded Documents`
    buttons = driver.find_elements(By.XPATH, '//div[@id="DivDocument"]//button')

    # work with all buttons
    for i, button in enumerate(buttons):

        # click button
        button.click()

        # wait till page displays the scan; the wait returns the element,
        # so no second lookup is needed
        print('wait for object:', i)
        obj = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, "object")))

        # get data from object: everything after the first comma is treated
        # as the base64 payload
        print('get data:', i)
        data = obj.get_attribute('data') or ''
        _, comma, payload = data.partition(',')

        if comma:
            pdf_bytes = base64.b64decode(payload)

            # save scan in next PDF
            filename = 'document-{}.pdf'.format(i)
            print('save:', filename)
            with open(filename, 'wb') as fp:
                fp.write(pdf_bytes)
        else:
            # `data` holds no inline payload (e.g. a plain URL); report it
            # and continue with the remaining documents
            print('skip document (no inline data):', i)

        # close scan
        print('close document:', i)
        driver.find_element(By.XPATH, '//button[text()="Close"]').click()

    # --- end ---
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    driver.quit()

Probably you could get all PDFs with this method. Other documents may not use `<object>`. — furas, Jul 25 '19 at 14:17
This code checks only the one page which you had in your code, and it downloads 9 documents from that page - at least with Firefox. If you need to download from other pages then you have to add their URLs and rework the code. — furas, Jul 25 '19 at 14:35
@furas- Can you please suggest for the issue posted at https://stackoverflow.com/questions/58115962/how-to-split-the-sublist-from-the-list-and-add-the-data-in-another-list — A.D, Sep 26 '19 at 12:16

Download the File which has stream-url is the chrome extension in the embed tag using selenium webdriver in python

1 Answers1

Linked