I am trying to run this code and it returns IndexError
import pandas as pd
from bs4 import BeautifulSoup as bs
from math import nan
browser = webdriver.Chrome()
class GameData:
def __init__(self):
self.score = []
self.country = []
self.league = []
self.game = []
def append(self, score):
pass
def get_urls(browser, landing_page):
browser.get(landing_page)
urls = [i.get_attribute('href') for i in
browser.find_elements_by_css_selector(
'.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
return urls
def parse_data(html):
global league
df = pd.read_html(html, header=0)[0]
html = browser.page_source
soup = bs(html, "lxml")
scores = [i.select_one('.table-score').text if i.select_one('.table-score') is not None else nan for i in
soup.select('#table-matches tr:nth-of-type(n+2)')]
cont = soup.find('div', {'id': 'wrap'})
content = cont.find('div', {'id': 'col-content'})
content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
main = content.find('th', {'class': 'first2 tl'})
if main is None:
return None
count = main.findAll('a')
country = count[0].text
game_data = GameData()
leagues = [i.text for i in soup.select('.first2 > a:last-child')]
n = 0
for number, row in enumerate(df.itertuples()):
if n == 0 or '»' in row[1]:
league = leagues[n]
n += 1
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
country = row[1].split('»')[0]
continue
game_time = row[1]
print(len(scores[number])) **This is where I find that the index has changed
print(scores[number])
game_data.country.append(country)
game_data.league.append(league)
game_data.game.append(row[2])
game_data.score.append(scores[number])
return game_data
if __name__ == '__main__':
start_url = "https://www.oddsportal.com/matches/soccer/"
urls = []
browser = webdriver.Chrome()
results = None
urls = get_urls(browser, start_url)
urls.insert(0, start_url)
for number, url in enumerate(urls):
if number > 0:
browser.get(url)
html = browser.page_source
game_data = parse_data(html)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
I am not sure where I am going wrong
Traceback (most recent call last):
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_10.py", line 112, in <module>
game_data = parse_data(html)
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_10.py", line 84, in parse_data
print(scores[number])
IndexError: list index out of range
Verbose: Since SO asks for it.
I can see that the way scores
is constructed, its not the best. How can I just extract this value from Xpath
?
The Xpath
for scores
is /html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[7]/table/tbody/tr[9]/td[3]
How can I get rid of the scores function and use this Xpath
?
If thats the case, Ideally, I should be using Xpath for every column value