I have a code.
It works at times; but works when i rerun it a couple of times at times and sometimes the next rerun.
I want to either:
- Wait till the process is completely loaded like add a wait()
- Restart the process/rerun this code
Code:
import os
import re
import threading
from math import nan
from multiprocessing.pool import ThreadPool
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
class Driver:
def __init__(self):
options = webdriver.ChromeOptions()
options.add_argument("--headless")
# Un-comment next line to supress logging:
options.add_experimental_option('excludeSwitches', ['enable-logging'])
self.driver = webdriver.Chrome(options=options)
def __del__(self):
self.driver.quit() # clean up driver when we are cleaned up
# print('The driver has been "quitted".')
threadLocal = threading.local()
def create_driver():
the_driver = getattr(threadLocal, 'the_driver', None)
if the_driver is None:
the_driver = Driver()
setattr(threadLocal, 'the_driver', the_driver)
return the_driver.driver
class GameData:
def __init__(self):
self.date = []
self.time = []
self.game = []
self.score = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
self.country = []
self.league = []
def generate_matches(table):
tr_tags = table.findAll('tr')
for tr_tag in tr_tags:
if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
th_tag = tr_tag.find('th', {'class': 'first2 tl'})
a_tags = th_tag.findAll('a')
country = a_tags[0].text
league = a_tags[1].text
else:
td_tags = tr_tag.findAll('td')
if len(td_tags) > 5: # or just if td_tags
yield [td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text,
td_tags[4].text, td_tags[5].text, country, league]
def parse_data(url, return_urls=False):
browser = create_driver()
browser.get(url)
soup = bs(browser.page_source, "lxml")
div = soup.find('div', {'id': 'col-content'})
table = div.find('table', {'class': 'table-main'})
h1 = soup.find('h1').text
print(h1)
m = re.search(r'\d+ \w+ \d{4}$', h1)
game_date = m[0]
game_data = GameData()
for row in generate_matches(table):
game_data.date.append(game_date)
game_data.time.append(row[0])
game_data.game.append(row[1])
# Score present?
if ':' not in row[2]:
# No, shift a few columns right:
row[5], row[4], row[3], row[2] = row[4], row[3], row[2], nan
game_data.score.append(row[2])
game_data.home_odds.append(nan if row[3] == '-' else row[3])
game_data.draw_odds.append(nan if row[4] == '-' else row[4])
game_data.away_odds.append(nan if row[5] == '-' else row[5])
game_data.country.append(row[6])
game_data.league.append(row[7])
if return_urls:
span = soup.find('span', {'class': 'next-games-date'})
a_tags = span.findAll('a')
urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
return game_data, urls
return game_data
if __name__ == '__main__':
results = None
pool = ThreadPool(5) # We will be getting, however, 7 URLs
# Get today's data and the Urls for the other days:
game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
urls.pop(1) # Remove url for today: We already have the data for that
game_data_results = pool.imap(parse_data, urls)
for i in range(8):
try:
game_data = game_data_today if i == 1 else next(game_data_results)
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
except ValueError:
game_data = game_data_today if i == 1 else next(game_data_results)
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
print(results)
# print(results.head())
# ensure all the drivers are "quitted":
del threadLocal
import gc
gc.collect() # a little extra insurance
print(results)
# noinspection PyTypeChecker because for some reason pycharm shows a warning here.
results.to_csv()
error:
Next Soccer Matches: Sunday, 27 Mar 2022
Next Soccer Matches: Tuesday, 29 Mar 2022
Next Soccer Matches: Saturday, 26 Mar 2022
Next Soccer Matches: Yesterday, 23 Mar 2022
Next Soccer Matches: Tomorrow, 25 Mar 2022
Traceback (most recent call last):
File "C:/Users/harsh/AppData/Roaming/JetBrains/PyCharmCE2021.2/scratches/scratch_4.py", line 114, in <module>
game_data = game_data_today if i == 1 else next(game_data_results)
File "C:\Users\harsh\AppData\Local\Programs\Python\Python37\lib\multiprocessing\pool.py", line 748, in next
raise value
File "C:\Users\harsh\AppData\Local\Programs\Python\Python37\lib\multiprocessing\pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "C:/Users/harsh/AppData/Roaming/JetBrains/PyCharmCE2021.2/scratches/scratch_4.py", line 80, in parse_data
game_date = m[0]
TypeError: 'NoneType' object is not subscriptable
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/harsh/AppData/Roaming/JetBrains/PyCharmCE2021.2/scratches/scratch_4.py", line 128, in <module>
game_data = game_data_today if i == 1 else next(game_data_results)
File "C:\Users\harsh\AppData\Local\Programs\Python\Python37\lib\multiprocessing\pool.py", line 748, in next
raise value
File "C:\Users\harsh\AppData\Local\Programs\Python\Python37\lib\multiprocessing\pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "C:/Users/harsh/AppData/Roaming/JetBrains/PyCharmCE2021.2/scratches/scratch_4.py", line 80, in parse_data
game_date = m[0]
TypeError: 'NoneType' object is not subscriptable
Process finished with exit code 1
I have observed that when the process is completed, the
Process finished with exit code 1
switches to 0
.
How can I restart a loop before this in my code?
How can I do this job better?
But in SO's case, how can I add either wait() or restart the process