0

I posted this on Stack Exchange earlier but have not received much of a response yet, so I am posting it here.

I am trying to scrape some data using the following code. When I run the code line by line, it works fine. However, when I run all of the code in one go, the dropdown options go blank and, as a result, the last line raises an error. Your help would be much appreciated. The code is below.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os 

# Scraper script as posted in the question: opens the state list page, picks
# BIHAR and a financial year, then walks every district / block / panchayat
# dropdown combination and requests the reports for each one.

# Directory passed to Chrome as its default download location (see `prefs`).
path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
path_to_chromedriver = 'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory' : path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options ,executable_path=path_to_chromedriver)    

url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
# NOTE(review): the page-load timeout is set after the first get() call.
browser.set_page_load_timeout(45)
browser.maximize_window()

browser.find_element_by_link_text("BIHAR").click()
# implicitly_wait() sets the driver's timeout for locating elements; it does
# not pause the script here.
browser.implicitly_wait(5)

year=['2016-2017', '2017-2018', '2018-2019', '2019-2020']
# Type the first financial year into the year dropdown.
elem2 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlFin")
elem2.send_keys(year[0])
browser.implicitly_wait(5)

select_dist = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist")
# Rebinds `options` (previously the ChromeOptions object) to the list of
# <option> elements of the district dropdown; only its length is used below.
options = [x for x in select_dist.find_elements_by_tag_name("option")]
dist=[]
# NOTE(review): every loop starts at index 0, so the first entry of each
# dropdown is selected too -- confirm whether that entry is a real choice.
for e in range(len(options)):
    # The dropdown is looked up again on every iteration before selecting.
    select_dist = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist"))
    select_dist.select_by_index(e)

    # The block dropdown is read immediately after the district selection;
    # nothing here waits for the page to finish updating.
    select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
    options1 = select_block.options
    for f in range(len(options1)):
        select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
        select_block.select_by_index(f)

        select_gp = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
        options2 = select_gp.options
        for g in range(len(options2)):
            select_gp =  Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
            select_gp.select_by_index(g)

            # Click the second login-level radio button (id suffix "_1").
            browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_rbLoginLevel_1").click()
            browser.implicitly_wait(10)

            # Period start and end dates are typed in as fixed strings.
            elem6 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodFrom")
            elem6.send_keys('01/04/2016')
            browser.implicitly_wait(10)

            elem7 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodTo")
            elem7.send_keys('31/03/2017')
            browser.implicitly_wait(10)

            browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_login").click()
            browser.implicitly_wait(10)

            browser.find_element_by_link_text("Download All Reports").click()
Bidhu
  • 3
  • 4
  • `options = webdriver.ChromeOptions()` then `for e in range(len(options)):`? That should be `{TypeError}'Options' object is not iterable` – Trapli Apr 17 '20 at 09:05
  • Thank you @Trapli for noticing that. I edited the code. However, it still does not work – Bidhu Apr 17 '20 at 11:08

1 Answer

0

Besides the fact that the target page is slower than an aged snail, and those 10-second waits are barely enough for anything, there are two things you missed, and those caused your troubles:

  • you did not take into account that the first element of each select's options is a "select an option" placeholder. So if you try to cycle through all of them, you must skip the option at the first index, else it will look like "nothing is selected"
  • wait for that spinner. After the spinner is gone, the page will be refreshed. Do not grab the elements before the page refresh is complete; wait until the spinner is gone.

With these two helper functions it is possible to press the "Get Reports" button without issues:

def is_spinner_gone(arg):
    """Report whether the page's loading spinner is absent or hidden.

    Meant to be passed to ``WebDriverWait(...).until``, which calls it with
    the driver as ``arg``. The lookup itself goes through the module-level
    ``browser``, so ``arg`` is not used.

    Returns:
        True when no matching element is found, or when the matched
        element's ``style`` attribute is exactly ``'display: none;'``;
        False otherwise.
    """
    # find_elements_* returns an empty list when nothing matches, whereas
    # find_element_* raises NoSuchElementException. With the singular call
    # the "no spinner on the page" fallback could never be reached.
    # NOTE(review): the inner `//div` in the predicate is document-rooted, so
    # the first <div> of the page is what gets inspected whenever a loader
    # exists anywhere -- confirm that this is the intended wrapper element.
    spinners = browser.find_elements_by_xpath('//div[//div[@class="loader"]]')
    if not spinners:
        return True
    return spinners[0].get_attribute('style') == 'display: none;'

def wait_for_element(xpath, timeout=500, settle_delay=1):
    """Wait for the spinner to disappear, then return the element at *xpath*.

    Args:
        xpath: XPath locator of the element to fetch.
        timeout: Maximum number of seconds to wait for each of the two
            conditions (spinner gone, element clickable).
        settle_delay: Seconds to sleep before checking the spinner.

    Returns:
        The element located by *xpath*, once it is clickable.

    Raises:
        selenium.common.exceptions.TimeoutException: if either condition is
            not met within *timeout* seconds.
    """
    # this is necessary because the spinner does not pop up instantly
    time.sleep(settle_delay)

    # Each until() call starts its own countdown, so one waiter serves both.
    waiter = WebDriverWait(browser, timeout)
    waiter.until(is_spinner_gone)
    return waiter.until(EC.element_to_be_clickable((By.XPATH, xpath)))

If you get your elements via the wait_for_element call then you'll be able to interact with them without error. I guess you know that pressing that button is not the end of the road yet, you'll have to choose the report format and who knows what later on.

Adjusted code:


from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os
import time

# Browser setup: start Chrome with a custom download directory, open the
# state list page and follow the BIHAR link.

# Directory passed to Chrome as its default download location.
path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
# Raw string: '\S' inside a normal literal is an invalid escape sequence and
# triggers a warning on current Python versions. The value is unchanged.
path_to_chromedriver = r'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory': path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)

start = time.time()
url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
# NOTE(review): the page-load timeout is set after the get() above, so it
# does not apply to that first load -- move it earlier if that was intended.
browser.set_page_load_timeout(45)
browser.maximize_window()

loaded = time.time()
print(f'PAGE LOADED IN {loaded-start} seconds')

browser.find_element_by_link_text("BIHAR").click()

def is_spinner_gone(arg):
    """Report whether the page's loading spinner is absent or hidden.

    Meant to be passed to ``WebDriverWait(...).until``, which calls it with
    the driver as ``arg``. The lookup itself goes through the module-level
    ``browser``, so ``arg`` is not used.

    Returns:
        True when no matching element is found, or when the matched
        element's ``style`` attribute is exactly ``'display: none;'``;
        False otherwise.
    """
    # find_elements_* returns an empty list when nothing matches, whereas
    # find_element_* raises NoSuchElementException. With the singular call
    # the "no spinner on the page" fallback could never be reached.
    # NOTE(review): the inner `//div` in the predicate is document-rooted, so
    # the first <div> of the page is what gets inspected whenever a loader
    # exists anywhere -- confirm that this is the intended wrapper element.
    spinners = browser.find_elements_by_xpath('//div[//div[@class="loader"]]')
    if not spinners:
        return True
    return spinners[0].get_attribute('style') == 'display: none;'

def wait_for_element(xpath, timeout=500, settle_delay=1):
    """Wait for the spinner to disappear, then return the element at *xpath*.

    Args:
        xpath: XPath locator of the element to fetch.
        timeout: Maximum number of seconds to wait for each of the two
            conditions (spinner gone, element clickable).
        settle_delay: Seconds to sleep before checking the spinner.

    Returns:
        The element located by *xpath*, once it is clickable.

    Raises:
        selenium.common.exceptions.TimeoutException: if either condition is
            not met within *timeout* seconds.
    """
    # this is necessary because the spinner does not pop up instantly
    time.sleep(settle_delay)

    # Each until() call starts its own countdown, so one waiter serves both.
    waiter = WebDriverWait(browser, timeout)
    waiter.until(is_spinner_gone)
    return waiter.until(EC.element_to_be_clickable((By.XPATH, xpath)))

# Walk every district / block / panchayat combination and press "Get Reports"
# for each one. Every control is fetched through wait_for_element() so that
# nothing is touched while the page is still refreshing.

# XPath locators of the form controls, named once so the loops stay readable.
FIN_YEAR_XPATH = '//*[@name="ctl00$ContentPlaceHolder1$ddlFin"]'
DISTRICT_XPATH = '//*[@name="ctl00$ContentPlaceHolder1$ddldist"]'
BLOCK_XPATH = '//*[@name="ctl00$ContentPlaceHolder1$ddlblock"]'
PANCHAYAT_XPATH = '//*[@name="ctl00$ContentPlaceHolder1$ddlpanchayat"]'
LOGIN_LEVEL_XPATH = '//*[@id="ctl00_ContentPlaceHolder1_rbLoginLevel_1"]'
PERIOD_FROM_XPATH = '//*[@name="ctl00$ContentPlaceHolder1$txtperiodFrom"]'
PERIOD_TO_XPATH = '//*[@name="ctl00$ContentPlaceHolder1$txtperiodTo"]'
GET_REPORTS_XPATH = '//*[@value="Get Reports"]'


def _fresh_select(xpath):
    """Wait for the page to settle, then wrap the dropdown at *xpath* in a Select."""
    return Select(wait_for_element(xpath))


year = ['2016-2017', '2017-2018', '2018-2019', '2019-2020']
elem2 = wait_for_element(FIN_YEAR_XPATH)
selector_page_loaded = time.time()
print(f'WORK AREA LOADED IN {selector_page_loaded-loaded} seconds')

elem2.send_keys(year[0])

# Only the option counts are needed up front. The dropdowns themselves are
# located again on every iteration, after the page has settled.
district_count = len(_fresh_select(DISTRICT_XPATH).options)
# ISSUE: default fields are included in the options, so index 0 is skipped
# in every loop.
for district_index in range(1, district_count):
    _fresh_select(DISTRICT_XPATH).select_by_index(district_index)

    block_count = len(_fresh_select(BLOCK_XPATH).options)
    for block_index in range(1, block_count):
        _fresh_select(BLOCK_XPATH).select_by_index(block_index)

        panchayat_count = len(_fresh_select(PANCHAYAT_XPATH).options)
        for panchayat_index in range(1, panchayat_count):
            _fresh_select(PANCHAYAT_XPATH).select_by_index(panchayat_index)

            wait_for_element(LOGIN_LEVEL_XPATH).click()

            # Period start and end dates are typed in as fixed strings.
            elem6 = wait_for_element(PERIOD_FROM_XPATH)
            elem6.send_keys('01/04/2016')

            elem7 = wait_for_element(PERIOD_TO_XPATH)
            elem7.send_keys('31/03/2017')

            wait_for_element(GET_REPORTS_XPATH).click()
            print(f'FIRST RUN IN {time.time()-selector_page_loaded}')

Trapli
  • 1,517
  • 2
  • 13
  • 19
  • Thank you @Trapli. This works fine. A follow-up question: when I run the "download all reports" step, it opens a new tab. As a result, the code is interrupted. I am wondering if there is a way to prevent the tab focus from changing away from the one with the dropdowns – Bidhu Apr 19 '20 at 15:58
  • Well, you could Ctrl+click the `Get Reports` button to make the report page open in a new tab: https://stackoverflow.com/questions/27775759 and switch between tabs by https://stackoverflow.com/questions/38948190 but this is just an idea :) – Trapli Apr 19 '20 at 19:21