Using DevTools in Chrome/Firefox (tab: Network, filters: JS, XHR) I found the URLs which the page uses to get data from the server with AJAX:
https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/
https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/
etc.
The URLs are similar; the only difference is the code `xbNfvuAM` / `l8FEjeUE`, which I found in the page source as `PageTournament({"id":"l8FEjeUE", ...`, so I can generate these URLs myself.
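For example, the id can be pulled out with a regex and plugged into the AJAX URL. A minimal sketch of that idea (it assumes the `PageTournament({"id":"..."` snippet still appears in the page source):

import re
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'referer': 'https://www.oddsportal.com/',
}

page = requests.get('https://www.oddsportal.com/soccer/romania/superliga-women/results/#/',
                    headers=headers).text

# the id sits in an inline <script> as `PageTournament({"id":"xbNfvuAM", ...`
tournament_id = re.search(r'PageTournament\(\{"id":"(\w+)"', page).group(1)

ajax_url = f'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/{tournament_id}/X0/1/0/1/'
print(ajax_url)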
This way I could create code which gets the HTML without Selenium, using only `requests`. The original code needed ~20s; with `requests` it needs only ~6s.
BTW: I also reduced the code in `parse_data` and use only a `DataFrame`, without the class `GameData`.
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs
import time
from multiprocessing import Pool
# --- functions ---
def get_html(url):
    r = requests.get(url, headers=headers)
    text = r.text

    # extract the tournament id from `PageTournament({"id":"...`
    start = text.find('PageTournament({"id":"') + len('PageTournament({"id":"')
    end = text.find('"', start)
    code = text[start:end]
    print(f'code: {code}')

    url = f'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/{code}/X0/1/0/1/'
    r = requests.get(url, headers=headers)
    text = r.text

    # remove `globals.jsonpCallback('...',` at the start
    text = text.split(',', 1)[1]
    text = text[:-2]  # remove `);` at the end

    # print as a single f-string so it displays in one piece -
    # a multi-argument `print()` may interleave with output from other processes
    print(f'json: {text[:25]} ... {text[-25:]}')

    data = json.loads(text)
    html = data['d']['html']
    print(f'html: {html[:25]} ... {html[-25:]}')

    return html
def parse_data(html):
    try:
        df = pd.read_html(html)[0]
    except KeyError:
        print('KeyError')
        return

    soup = bs(html, "lxml")
    header = soup.select('table th.first2.tl a')
    if not header:
        return

    df['country'] = header[1].text
    df['league'] = header[2].text

    return df
def process(url):
    return parse_data(get_html(url))
# --- main ---
# needed headers - on some systems they have to be defined outside `__main__`
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'referer': 'https://www.oddsportal.com/',
}
if __name__ == '__main__':

    # URLs for the AJAX requests
    # ajax_urls = {
    #     # for 'view-source:https://www.oddsportal.com/soccer/romania/superliga-women/results/#/'
    #     'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/',
    #     # for 'https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/'
    #     'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/',
    # }
    # you can find `l8FEjeUE` in the original page as `PageTournament({"id":"l8FEjeUE", ...`

    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }
    time_start = time.time()

    # empty `DataFrame` so I don't have to check `if results is None`
    results = pd.DataFrame()

    with Pool(10) as p:
        all_game_data = p.map(process, urls)

    for game_data in all_game_data:
        if game_data is None:
            continue
        # `DataFrame.append` was removed in newer pandas, so use `pd.concat` instead
        results = pd.concat([results, game_data], ignore_index=True)

    time_end = time.time()
    time_diff = time_end - time_start
    print(f'time: {time_diff:.2f} s')

    print('--- results ---')
    print(results)
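The `split(',', 1)` / `[:-2]` trick in `get_html` works because the response always looks like `globals.jsonpCallback('...', {...});`. If you want something a bit more defensive, a regex can unwrap the payload in one step. A sketch, assuming the same wrapper format (`extract_jsonp_payload` is my own helper name):

import json
import re

def extract_jsonp_payload(text):
    # matches `globals.jsonpCallback('<url>', <payload>);` and captures <payload>
    m = re.match(r"^[\w.$]+\(\s*'[^']*'\s*,\s*(.*)\)\s*;?\s*$", text, re.S)
    if m is None:
        raise ValueError('unexpected JSONP wrapper')
    return json.loads(m.group(1))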
EDIT:

As @αԋɱҽԃαмєяιcαη figured out, `headers` has to be defined outside `__main__`, because on some systems it may raise an error otherwise. Doc: multiprocessing
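The reason: on systems where multiprocessing uses the `spawn` start method (Windows, and macOS since Python 3.8), workers re-import the module instead of inheriting the parent's memory, so nothing inside `if __name__ == '__main__':` exists for them. A minimal sketch of the rule:

from multiprocessing import Pool

# correct: module-level, so it exists again when a worker re-imports this file
headers = {'user-agent': 'Mozilla/5.0'}

def work(url):
    # would raise NameError under `spawn` if `headers` were defined inside `__main__`
    return url, headers['user-agent']

if __name__ == '__main__':
    # this block runs only in the parent process, never in spawned workers
    with Pool(2) as p:
        print(p.map(work, ['a', 'b']))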
EDIT:

I also created code which uses multiprocessing to run the original Selenium code. The problem is that a browser object can't be sent to the worker processes, so every process has to run its own Selenium, which displays 5 browsers at the same time. Starting all those browsers also needs more time, and the whole run took ~40s.

Maybe if the processes were run with a queue to receive URLs and send back HTML, then one browser could be reused (or a few browsers running at the same time), but that would need more complex code; I sketch that idea after the listing below.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
from multiprocessing import Pool
# --- classes ---
class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []
# --- functions ---
def parse_data(url):
    browser = webdriver.Chrome()

    # retry until the results table is present in the page
    while True:
        try:
            browser.get(url)
            df = pd.read_html(browser.page_source)[0]
            break
        except KeyError:
            print('KeyError:', url)
            continue

    html = browser.page_source
    browser.quit()

    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None

    count = main.findAll('a')
    country = count[1].text
    league = count[2].text

    game_data = GameData()
    game_date = None

    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)

    return game_data
# --- main ---
if __name__ == '__main__':

    # URLs go here
    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }
    time_start = time.time()

    results = None

    with Pool(5) as p:
        all_game_data = p.map(parse_data, urls)

    for game_data in all_game_data:
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            # `DataFrame.append` was removed in newer pandas, so use `pd.concat` instead
            results = pd.concat([results, result], ignore_index=True)

    time_end = time.time()
    time_diff = time_end - time_start
    print(f'time: {time_diff:.2f} s')

    print('--- results ---')
    print(results)
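And here is a rough sketch of that queue idea (my own untested variant, not part of the code above): every worker starts one browser and reuses it for all URLs it takes from a shared queue, with `None` as a sentinel telling it to stop.

from multiprocessing import Process, Queue

from selenium import webdriver

def worker(url_queue, html_queue):
    browser = webdriver.Chrome()   # one browser per worker, reused for many URLs
    while True:
        url = url_queue.get()
        if url is None:            # sentinel: no more work for this worker
            break
        browser.get(url)
        html_queue.put((url, browser.page_source))
    browser.quit()

if __name__ == '__main__':
    url_queue, html_queue = Queue(), Queue()

    workers = [Process(target=worker, args=(url_queue, html_queue)) for _ in range(2)]
    for w in workers:
        w.start()

    urls = [
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
    ]
    for url in urls:
        url_queue.put(url)
    for _ in workers:
        url_queue.put(None)        # one sentinel per worker

    pages = dict(html_queue.get() for _ in urls)

    for w in workers:
        w.join()

    print(list(pages))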