0

I have a code.

It works at times; but works when i rerun it a couple of times at times and sometimes the next rerun.

I want to either:

  1. Wait till the process is completely loaded like add a wait()
  2. Restart the process/rerun this code

Code:

import os
import re
import threading
from math import nan
from multiprocessing.pool import ThreadPool

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to supress logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')


threadLocal = threading.local()


def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver


class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []


def generate_matches(table):
    tr_tags = table.findAll('tr')
    for tr_tag in tr_tags:
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        else:
            td_tags = tr_tag.findAll('td')
            if len(td_tags) > 5:  # or just if td_tags
                yield [td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text,
                       td_tags[4].text, td_tags[5].text, country, league]


def parse_data(url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    print(h1)
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        # Score present?
        if ':' not in row[2]:
            # No, shift a few columns right:
            row[5], row[4], row[3], row[2] = row[4], row[3], row[2], nan
        game_data.score.append(row[2])
        game_data.home_odds.append(nan if row[3] == '-' else row[3])
        game_data.draw_odds.append(nan if row[4] == '-' else row[4])
        game_data.away_odds.append(nan if row[5] == '-' else row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])

    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.findAll('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data


if __name__ == '__main__':
    results = None
    pool = ThreadPool(5)  # We will be getting, however, 7 URLs
    # Get today's data and the Urls for the other days:
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
    urls.pop(1)  # Remove url for today: We already have the data for that
    game_data_results = pool.imap(parse_data, urls)
    for i in range(8):
        try:
            game_data = game_data_today if i == 1 else next(game_data_results)
            result = pd.DataFrame(game_data.__dict__)
            if results is None:
                results = result
            else:
                results = results.append(result, ignore_index=True)
        except ValueError:
            game_data = game_data_today if i == 1 else next(game_data_results)
            result = pd.DataFrame(game_data.__dict__)
            if results is None:
                results = result
            else:
                results = results.append(result, ignore_index=True)
    print(results)
    # print(results.head())
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc

    gc.collect()  # a little extra insurance

print(results)

# noinspection PyTypeChecker because for some reason pycharm shows a warning here.
results.to_csv()

error:

Next Soccer Matches: Sunday, 27 Mar 2022
Next Soccer Matches: Tuesday, 29 Mar 2022
Next Soccer Matches: Saturday, 26 Mar 2022
Next Soccer Matches: Yesterday, 23 Mar 2022
Next Soccer Matches: Tomorrow, 25 Mar 2022
Traceback (most recent call last):
  File "C:/Users/harsh/AppData/Roaming/JetBrains/PyCharmCE2021.2/scratches/scratch_4.py", line 114, in <module>
    game_data = game_data_today if i == 1 else next(game_data_results)
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python37\lib\multiprocessing\pool.py", line 748, in next
    raise value
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python37\lib\multiprocessing\pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "C:/Users/harsh/AppData/Roaming/JetBrains/PyCharmCE2021.2/scratches/scratch_4.py", line 80, in parse_data
    game_date = m[0]
TypeError: 'NoneType' object is not subscriptable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:/Users/harsh/AppData/Roaming/JetBrains/PyCharmCE2021.2/scratches/scratch_4.py", line 128, in <module>
    game_data = game_data_today if i == 1 else next(game_data_results)
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python37\lib\multiprocessing\pool.py", line 748, in next
    raise value
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python37\lib\multiprocessing\pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "C:/Users/harsh/AppData/Roaming/JetBrains/PyCharmCE2021.2/scratches/scratch_4.py", line 80, in parse_data
    game_date = m[0]
TypeError: 'NoneType' object is not subscriptable
Process finished with exit code 1

I have observed that when the process is completed, the

Process finished with exit code 1

switches to 0.

How can I restart a loop before this in my code?

How can I do this job better?

But in SO's case, how can I add either wait() or restart the process

leonardo
  • 140
  • 10

1 Answers1

0

The problem is in your parse_data function where you are attempting to access the first result of a regex search, when there may have been none found.

To make this more robust, you should add a check, fall back if there are no results and handle nothing being returned.

m = re.search(r'\d+ \w+ \d{4}$', h1)
if m == None: 
 return
game_date = m[0]

if it is expected there should always be a result, it may be that the element has not yet loaded and you should implement a wait using WebDriverWait

BlueTalon
  • 27
  • 9