
This is the code I used to scrape Oddsportal; however, I am now getting an error with it.

import os
import re
import threading
from datetime import datetime
from math import nan
from multiprocessing.pool import ThreadPool
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Suppress ChromeDriver console logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        print('The driver has been "quitted".')


threadLocal = threading.local()


def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver


class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []


def generate_matches(table):
    tr_tags = table.findAll('tr')
    for tr_tag in tr_tags:
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        else:
            td_tags = tr_tag.findAll('td')
            if len(td_tags) > 5:  # or just if td_tags
                yield [td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text,
                       td_tags[4].text, td_tags[5].text, country, league]


def parse_data(url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    browser.implicitly_wait(25)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    print(h1)
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        # Score present?
        if ':' not in row[2]:
            # No, shift a few columns right:
            row[5], row[4], row[3], row[2] = row[4], row[3], row[2], nan
        game_data.score.append(row[2])
        game_data.home_odds.append(nan if row[3] == '-' else row[3])
        game_data.draw_odds.append(nan if row[4] == '-' else row[4])
        game_data.away_odds.append(nan if row[5] == '-' else row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])

    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.findAll('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data


if __name__ == '__main__':
    games = None
    pool = ThreadPool(5)  # We will be getting, however, 7 URLs
    # Get today's data and the Urls for the other days:
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
    urls.pop(1)  # Remove url for today: We already have the data for that
    game_data_results = pool.imap(parse_data, urls)
    for i in range(8):
        try:
            game_data = game_data_today if i == 1 else next(game_data_results)
            result = pd.DataFrame(game_data.__dict__)
            if games is None:
                games = result
            else:
                games = games.append(result, ignore_index=True)
        except ValueError:
            game_data = game_data_today if i == 1 else next(game_data_results)
            result = pd.DataFrame(game_data.__dict__)
            if games is None:
                games = result
            else:
                games = games.append(result, ignore_index=True)
        finally:
            pass
    print(games)
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc

    gc.collect()  # a little extra insurance

Sample output (from a previous, working run):

Unnamed: 0         date   time                                               game     score home_odds  draw_odds  away_odds                country                               league
0              0  08 Jan 2023  00:30                       Boca Juniors - Independiente       0:0      1.93       3.23       3.91              Argentina                    Torneos De Verano
1              1  08 Jan 2023  00:45                            CSP U20 - Sao Paulo U20       0:4     11.27       5.85       1.21                 Brazil           Copa Sao Paulo de juniores
2              2  08 Jan 2023  01:00               U. de Deportes (Per)  - Aucas (Ecu)        0:0      1.94       3.28       3.74                  World                        Club Friendly
3              3  08 Jan 2023  01:10                                     Atlas - Toluca       NaN    postp.       2.04       3.40                 Mexico                              Liga MX
4              4  08 Jan 2023  01:30            Inac Kobe Leonesa W - Albirex Niigata W       2:1      1.22       5.42      12.01                  Japan                      WE League Women
5              5  08 Jan 2023  02:00                        Tampico Madero - Lobos ULMX       1:0      1.41       4.69       6.25                 Mexico                 Liga Premier Serie A

However, I am currently getting this error:

Traceback (most recent call last):
  File "C:\Users\User\AppData\Roaming\JetBrains\PyCharmCE2022.2\scratches\Scraping_New.py", line 111, in <module>
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
  File "C:\Program Files\Python37\lib\multiprocessing\pool.py", line 261, in apply
    return self.apply_async(func, args, kwds).get()
  File "C:\Program Files\Python37\lib\multiprocessing\pool.py", line 657, in get
    raise self._value
  File "C:\Program Files\Python37\lib\multiprocessing\pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\User\AppData\Roaming\JetBrains\PyCharmCE2022.2\scratches\Scraping_New.py", line 78, in parse_data
    table = div.find('table', {'class': 'table-main'})
AttributeError: 'NoneType' object has no attribute 'find'

How do I resolve this?

Rander
  • @qharr can you please help me resolve the error? – Rander Jan 09 '23 at 14:56
  • if you just want to avoid the error, add a line with `if not div: return []` before the *`table = div.find....`* line. If you want to save some information to try to figure out what caused the `div` to be missing, you can use something like this [`logError_scrapes`](https://pastebin.com/cxGH50Mc) function [*`if not div: return logError_scrapes('', url, msg=f"table container div#col-content not found", rSoup=soup, conf={'reqUrl':browser.current_url}, returnVal=[])`*] – Driftr95 Jan 09 '23 at 21:21
  • I don't want to avoid the error. This code worked before, but since the website did a redesign I am unable to run it. When I run `rows = table.find_all("tr")` I get the error _rows = table.find_all("tr") AttributeError: 'NoneType' object has no attribute 'find_all'_ – Rander Jan 10 '23 at 08:21
  • The error happens because the fetched html doesn't contain the table you're looking for. If some sites just don't have the table, then you can just skip (that's what I meant by avoiding the error), if ALL sites raise the error then you need to rewrite your scraper, and if some sites raise the error and some don't *and you're not sure why* that's when you use something like the function I mentioned previously to log the sites that raised the error (so you can go back and try to figure out why) and the program can still continue collecting as much data as possible from the non-problematic sites – Driftr95 Jan 10 '23 at 09:22
  • How do I get values in the dataframe? If you could help me with one of the dataframe value positions, I will figure out the rest.. (I hope). E.g. game column value – Rander Jan 11 '23 at 16:47
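For reference, below is a minimal sketch of the guard suggested in the first comment, applied at the top of parse_data; returning an empty GameData (rather than a bare list, which is an assumption here) lets the caller still build an empty DataFrame. This only skips problem pages; if the redesign changed every page, the selectors themselves need rewriting, as the answer below does.

def parse_data(url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    browser.implicitly_wait(25)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    if div is None:
        # The page no longer contains the expected container: skip it instead of raising.
        print(f'div#col-content not found on {url}')
        return (GameData(), []) if return_urls else GameData()
    table = div.find('table', {'class': 'table-main'})
    if table is None:
        print(f'table.table-main not found on {url}')
        return (GameData(), []) if return_urls else GameData()
    # ... rest of parse_data unchanged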

1 Answer


NOTE: You can see a complete solution (compiled, tested, and pasted by @leonardo) that includes my answer from below (or my backup of it, cloned just in case).


How do I get values in the dataframe? If you could help me with one of the dataframe value positions,

If you use .select with CSS selectors, you can get very specific about the positions.

This example uses selectors to extract as much data as I could think of, but for just the positions in your DataFrame:

def generate_matches(pgSoup, defaultVal=None):
    evtSel = {
        'time': 'p.whitespace-nowrap',
        'game': 'a div:has(>a[title])', 
        'score': 'a:has(a[title])+div.hidden', 
        'home_odds': 'a:has(a[title])~div:not(.hidden)',
        'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)', 
    }

    events, current_group = [], {}
    pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]') ## EDIT
    if pgDate: pgDate = pgDate.get_text().split(',', 1)[-1].strip() ## EDIT
    for evt in pgSoup.select('div[set]>div:last-child'):
        if evt.parent.select(f':scope>div:first-child+div+div'):
            cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [
              evt.parent.select_one(s) for s in 
              [ ':scope>div:first-child+div>div:first-child',
                ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
                ':scope>div:first-child>a:nth-of-type(3):last-of-type' ]]]
            current_group = dict(zip(['date', 'country', 'league'], cgVals))
            if pgDate: current_group['date'] = pgDate ## EDIT
        
        evtRow = {'date': current_group.get('date', defaultVal)}

        for k, v in evtSel.items():
            v = evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal
            evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v
        evtTeams = evt.select('a div>a[title]') ## EDIT
        evtRow['game'] = ' – '.join(a['title'] for a in evtTeams) ## EDIT

        evtRow['country'] = current_group.get('country', defaultVal)
        evtRow['league'] = current_group.get('league', defaultVal)

        events.append(evtRow)
    return events

would collect:

[screenshot of the extracted rows]
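If you want a quick standalone check of these selectors before wiring them into parse_data, here is a minimal sketch, assuming soup has already been built from browser.page_source as in the original code:

# build a DataFrame straight from the list of per-event dicts
events = generate_matches(soup, defaultVal=nan)
print(pd.DataFrame(events).head())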


And change parse_data to

def parse_data(url, return_urls=False):
    browser = create_driver() ## as before
    browser.get(url) ## as before
    browser.implicitly_wait(25) ## as before
    soup = bs(browser.page_source, "lxml") ## as before

    game_data = GameData() ## as before
    game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
    # game_keys = ['date', 'time', 'game', 'score', 'home_odds', 'draw_odds', 'away_odds', 'country', 'league']
    for row in generate_matches(soup, defaultVal=nan):
        for k in game_keys: getattr(game_data, k).append(row.get(k, nan)) ## OR:
        # game_data.date.append(row.get('date'))
        # game_data.time.append(row.get('time'))
        # game_data.game.append(row.get('game'))
        # game_data.score.append(row.get('score'))
        # game_data.home_odds.append(row.get('home_odds'))
        # game_data.draw_odds.append(row.get('draw_odds'))
        # game_data.away_odds.append(row.get('away_odds'))
        # game_data.country.append(row.get('country'))
        # game_data.league.append(row.get('league'))

    if return_urls: ## as before
        span = soup.find('span', {'class': 'next-games-date'}) ## as before
        a_tags = span.findAll('a') ## as before
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags] ## as before
        return game_data, urls ## as before
    return game_data ## as before

EDIT: Suggested Changes to Main Block

[ Main change is how the DataFrame is built up, since DataFrame.append has been deprecated. ]

if __name__ == '__main__':
    games = None ## AS BEFORE
    pool = ThreadPool(5) ## AS BEFORE
    url_today = 'https://www.oddsportal.com/matches/soccer' # [just shortens line]
    game_data_today, urls = pool.apply(parse_data, args=(url_today, True)) 
    # urls.pop(1)  ## [ no longer needed ]
    game_data_results = pool.imap(parse_data, urls)  ## AS BEFORE

    ############################ BUILD  DATAFRAME ############################
    game_data_dfList, added_todayGame = [], False
    for game_data in game_data_results:
        try:
            game_data_dfList.append(pd.DataFrame(game_data.__dict__)) 
            if not added_todayGame:
                game_data_dfList += [pd.DataFrame(game_data_today.__dict__)]
                added_todayGame = True
        except Exception as e:
            game_n = len(game_data_dfList) + 1
            print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
        # finally: pass ## [ redundant ] 
    try: games = pd.concat(game_data_dfList, ignore_index=True)
    except Exception as e: print('Error concatenating DataFrames:', repr(e))
    ##########################################################################

    print('!?NO GAMES?!' if games is None else games) ## print(games)
    # ensure all the drivers are "quitted": ## AS BEFORE
    del threadLocal ## AS BEFORE
    import gc ## AS BEFORE

    gc.collect()  # a little extra insurance ## AS BEFORE

You could also add to the final DataFrame one by one (instead of concatenating them all at once in the end) by replacing the # BUILD DATAFRAME # block above with

    game_n, added_todayGame = 0, False
    for game_data in game_data_results:
        try:
            game_n += 1
            gd_df = pd.DataFrame(game_data.__dict__)
            games = gd_df if games is None else pd.concat([games, gd_df])
            if not added_todayGame:
                game_n += 1
                gdt_df = pd.DataFrame(game_data_today.__dict__)
                games, added_todayGame = pd.concat([games, gdt_df]), True
        except Exception as e:
            print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}') 
Driftr95
  • Thank you. Where in the above code would I replace yours to get the dataframe? – Rander Jan 13 '23 at 13:53
  • You could wrap my code in `def generate_matches(soup)` and `return events`, and then right after *`soup = bs(browser.page_source, "lxml")`* iterate through the events with *`for row in generate_matches(soup)...`* like `game_data.date.append(row.get('date'))` and `game_data.time.append(row.get('time'))` and so on – Driftr95 Jan 13 '23 at 17:48
  • Apologies for the late response; however, I don't quite follow your advice about wrapping. Can you please update the parent code with your answer? – Rander Jan 28 '23 at 10:19
  • Thank you! I edited my code to include your inputs [please find the entire code here](https://pastebin.com/nF47vSwE); however, I am getting an error at line 92: `a_tags = span.findAll('a') AttributeError: 'NoneType' object has no attribute 'findAll'` How do I correct this error? – Rander Jan 29 '23 at 03:19
  • @Rander it's because *`soup.find('span', {'class': 'next-games-date'})`* returned `None` - either test for that (something like `a_tags = [] if span is None else span.find_all('a')`) or use `.select` instead of `.find...` like **`a_tags = soup.select('span.next-games-date a[href]')`** – Driftr95 Jan 29 '23 at 05:11
  • Again, I highly recommend learning about CSS selectors - chaining multiple `.find...` calls often leads to this `AttributeError`. [ For single elements, you should also check against `None` before trying to extract text or any attribute - like I did with *`v.get_text(' ').strip() if v else defaultVal`* and *`evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal`* ] – Driftr95 Jan 29 '23 at 05:12
  • @Rander edited code lines are marked with `## EDIT` – Driftr95 Feb 02 '23 at 10:40
  • Thanks, that's great! I noticed a few things. Please find an [explanation for the points here](https://imgur.com/a/Krzewlx). Namely: 1. The extracts are for all the pages except the landing page. I used `pd.DataFrame(urls)` to check this. 2. For some reason, the site is resisting scraping. Different runs yield a different `count` of rows for `games`. Is there a better way than `browser.implicitly_wait(25)`? I tried `browser.implicitly_wait(50)` but the run time remains the same. Perhaps the statement could be placed more optimally? – Rander Feb 05 '23 at 10:33
  • I am sorry, I am not getting enough time to test all of this very diligently; however, I will open-source this solution on GitHub when I get it working correctly, as Oddsportal is one of the most scraped sites for betting odds and many of the packages (repos) are failing due to the redesign. Your work will go a long way to help everybody! – Rander Feb 05 '23 at 10:34
  • @Rander I guess you could try an [explicit wait](https://selenium-python.readthedocs.io/waits.html#explicit-waits) instead (like `WebDriverWait(browser, 30).until( EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))` is supposed to wait till all the odds are loaded) but I'm not really sure...have you checked on your browser that the site content itself doesn't vary? – Driftr95 Feb 05 '23 at 11:31
  • btw, in your `for i in range(8)...` loop, why is there a `try...except` with what looks like identical `try` and `except` blocks? – Driftr95 Feb 05 '23 at 11:32
  • Yes, I can try an `explicit wait`; where would I put this statement? Does it replace `implicitly_wait`? I will check the same. I had not, because of the `headless` option being enabled. Will test all your points and respond tomorrow. (Typing from phone.) As you can tell, my CSS and HTML skills are too abysmal to quite follow your questions. I had built this code from SO help. I am a data analyst myself, hence the incompetence. I honestly apologise. – Rander Feb 05 '23 at 12:22
  • Thank you @Driftr95 for the answer. I see the same issue, as the browser does not load fully. I am trying to replace `browser.implicitly_wait(25)` with `WebDriverWait(browser, 30).until( EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))`; however, what does the `EC` in `until( EC.presence_of_all_elements...` stand for? Also, how would I modify the code to include Rander's point about "Today's Matches" in the extract? I guess the `for i in range(8)` part was for the 8 `a_tags` collected, but that doesn't seem to be the case now. – leonardo Feb 06 '23 at 04:43
  • @leonardo please see the imports in the first example in the [explicit waits documentation](https://selenium-python.readthedocs.io/waits.html#explicit-waits) - both `EC` (*`selenium.webdriver.support.expected_conditions`*) and `WebDriverWait` (*`selenium.webdriver.support.wait.WebDriverWait`*) are imported classes/sub-packages (as is `By`). – Driftr95 Feb 06 '23 at 08:00
  • About the *`for i in range(8)`* I'm not entirely sure why they wouldn't apply any more, but I've suggested some alterations (see *`EDIT: Suggested Changes to Main Block`*) although I'm not confident in it at all as I have very little experience with thread pools – Driftr95 Feb 06 '23 at 08:02
  • You are doing great work! I have [compiled the code as per your suggestions here](https://pastebin.com/RYnbGC8Y). While I understand the [Explicit Wait](https://selenium-python.readthedocs.io/waits.html#explicit-waits) exists, I really don't know where to apply it in the code. Can you please help me insert that part into the code? Also, the dataframe is being constructed twice, i.e. the code loops twice. How can I correct that? – leonardo Feb 07 '23 at 23:28
  • @leonardo what happens when you replace *`browser.implicitly_wait(25)`* with `WebDriverWait(browser, 30).until( EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))`? Is there an error raised? – Driftr95 Feb 08 '23 at 22:01
  • No, there is no error raised; however, I noticed that when I disabled `self.driver.quit()` and `del threadLocal` the page does not load fully. The scraped data is just the visible part of the page, which is quite short. After scrolling the page manually, the page continues to load till the end (there is no infinite scrolling, which I noticed because the matches and odds are timestamped, so each new date is displayed on its respective page), which can explain why `EC` could not be helpful here. Can you help introduce scrolling using `webdriver` and `selenium` for `n` seconds and then build the dataframe? – leonardo Feb 09 '23 at 22:01
  • So I tried `browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")` [following this solution](https://stackoverflow.com/questions/63532066/fix-lazy-loading-in-selenium-python) for scrolling the page which helped get more data but the scrolling stopped at 1. Usually on weekends, there are thousands of events and hence maybe the scroll has to happen 30-50 times the `document.body.scrollHeight` so I tried `browser.execute_script("window.scrollTo(10, document.body.scrollHeight);")` but it stopped at 1 only. Can you help me understand this and help scroll the page more times? – leonardo Feb 09 '23 at 22:22
  • This is great progress! I cannot thank you enough for the progress so far! – leonardo Feb 09 '23 at 22:22
  • @leonardo are you scrolling in a loop? The first parameter of [`window.scrollTo`](https://developer.mozilla.org/en-US/docs/Web/API/Window/scrollTo) is just an x-coordinate, so manipulating it will make no difference to the number of times it scrolls - for that you'll need to loop, and you should add some delay inside the loop to allow new data to load, just like in the solution you mentioned – Driftr95 Feb 09 '23 at 23:02
  • Can you please help me write the loop? I really appreciate it! – leonardo Feb 09 '23 at 23:16
  • @leonardo this covers it quite well: https://stackoverflow.com/a/27760083/6146136 – Driftr95 Feb 09 '23 at 23:20
  • Super sweet, my man! Now I am getting a consistent result. I would not for the life of me have figured out scrolling and infinitely loading pages if it were not for you! You are literally a godsend! Bless you, man! Can you please post [this code as the final answer](https://pastebin.com/Sd0E1Hmm), considering that the code is as efficiently built as it can be? This should be accepted. @Rander the code works now. – leonardo Feb 10 '23 at 00:15
  • @leonardo I've only edited in the link to it [but it's right at the top] since this question is about a bs4-related error. I'm glad to hear that a working solution has been reached! – Driftr95 Feb 10 '23 at 02:22
  • @Driftr95 I guess the website changed a bit. Can you please help answer [this question](https://stackoverflow.com/questions/75740238/missing-values-while-scraping-oddsportal-using-beautifulsoup)? – Rander Mar 15 '23 at 03:22
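For completeness, here is a rough sketch of the explicit wait and scroll loop discussed in the comments above; the CSS selector is the one from the comments, while the function name, timeout, and pause values are assumptions that may need tuning:

import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def load_all_events(browser, timeout=30, pause=2):
    # wait until the odds cells are present instead of relying on implicitly_wait
    WebDriverWait(browser, timeout).until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))
    # keep scrolling until the page height stops growing, so lazy-loaded rows appear
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give newly loaded rows time to render
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

In parse_data, a call to load_all_events(browser) would replace the browser.implicitly_wait(25) line, just before soup is built from browser.page_source.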