0

I am trying to run this code and it returns IndexError

import pandas as pd
from bs4 import BeautifulSoup as bs
from math import nan

browser = webdriver.Chrome()


class GameData:
    def __init__(self):
        self.score = []
        self.country = []
        self.league = []
        self.game = []

    def append(self, score):
        pass


def get_urls(browser, landing_page):
    browser.get(landing_page)
    urls = [i.get_attribute('href') for i in
            browser.find_elements_by_css_selector(
                '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]

    return urls


def parse_data(html):
    global league
    df = pd.read_html(html, header=0)[0]
    html = browser.page_source
    soup = bs(html, "lxml")
    scores = [i.select_one('.table-score').text if i.select_one('.table-score') is not None else nan for i in
              soup.select('#table-matches tr:nth-of-type(n+2)')]
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
    main = content.find('th', {'class': 'first2 tl'})

    if main is None:
        return None

    count = main.findAll('a')
    country = count[0].text
    game_data = GameData()
    leagues = [i.text for i in soup.select('.first2 > a:last-child')]

    n = 0

    for number, row in enumerate(df.itertuples()):
        if n == 0 or '»' in row[1]:
            league = leagues[n]
            n += 1
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            country = row[1].split('»')[0]
            continue
        game_time = row[1]
        print(len(scores[number])) **This is where I find that the index has changed
        print(scores[number])

        game_data.country.append(country)
        game_data.league.append(league)
        game_data.game.append(row[2])
        game_data.score.append(scores[number])

    return game_data


if __name__ == '__main__':

    start_url = "https://www.oddsportal.com/matches/soccer/"
    urls = []
    browser = webdriver.Chrome()
    results = None
    urls = get_urls(browser, start_url)
    urls.insert(0, start_url)

    for number, url in enumerate(urls):
        if number > 0:
            browser.get(url)
        html = browser.page_source
        game_data = parse_data(html)

        if game_data is None:
            continue

        result = pd.DataFrame(game_data.__dict__)

        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)

I am not sure where I am going wrong

Traceback (most recent call last):
  File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_10.py", line 112, in <module>
    game_data = parse_data(html)
  File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_10.py", line 84, in parse_data
    print(scores[number])
IndexError: list index out of range

Verbose: Since SO asks for it.

I can see that the way scores is constructed, its not the best. How can I just extract this value from Xpath?

The Xpath for scores is /html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[7]/table/tbody/tr[9]/td[3]

How can I get rid of the scores function and use this Xpath?

If thats the case, Ideally, I should be using Xpath for every column value

  • Try refining your xpath; also, try using an existing scraper: https://github.com/gingeleski/odds-portal-scraper – Lior Pollak Sep 04 '21 at 13:12
  • Is there any library and/or a code structure that can allow me to input the xpath to get the value? The Xpath I have posted here is when you copy full xpath? –  Sep 04 '21 at 13:15
  • There's an excellent support for xpath in chrome: https://stackoverflow.com/questions/3030487/is-there-a-way-to-get-the-xpath-in-google-chrome – Lior Pollak Sep 04 '21 at 13:17
  • It helps in how to get xpath. It does not instruct how to get it in a dataframe. thanks. I was trying with the full Xpath. trurns out xpath can do a better job. –  Sep 04 '21 at 13:38

0 Answers0