What I have so far is code that works for one single page, but I want it to work for several pages (7*29 in a loop), e.g. http://www.oddsportal.com/basketball/usa/nba-2013-2014/results/#/page/1. I'm guessing that the browser simulation somehow has to be restarted every time, but I'm not exactly sure how. This is the console output I get when running the code (Python 3.5):
content-type missing in HTTP POST, defaulting to application/x-www-form-urlencoded. Use QNetworkRequest::setHeader() to fix this problem.
done
QObject::connect: Cannot connect (null)::configurationAdded(QNetworkConfiguration) to QNetworkConfigurationManager::configurationAdded(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::configurationRemoved(QNetworkConfiguration) to QNetworkConfigurationManager::configurationRemoved(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::configurationChanged(QNetworkConfiguration) to QNetworkConfigurationManager::configurationChanged(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::onlineStateChanged(bool) to QNetworkConfigurationManager::onlineStateChanged(bool)
QObject::connect: Cannot connect (null)::configurationUpdateComplete() to QNetworkConfigurationManager::updateCompleted()
I'm also not sure what the "content-type missing" warning means, but everything works fine for a single page, so I ignored it. To test what I want to do with the data, I went ahead and changed the URL manually to the 2014 season, and that turned out to work fine as well, so I'm somewhat lost. The code consists of a generic JavaScript-scraping part that I pretty much copy-pasted and an HTML-interpreting part I wrote myself. Since I'm not sure where the problem is, here is the complete code.
from lxml import html
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
import pickle
# javascript scraping according to google: first render the page, then load it into html
class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()
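# note: every Render(...) call creates its own fresh QApplication; I suspect
# this is related to the crash on the second page, but I don't know the
# proper way to restructure it (see my untested sketch at the end of the post)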
def getHtml(str_url):
    r_html = Render(str_url)
    html = r_html.frame.toHtml()
    return html
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
# this method combines the standard lxml procedure with the javascript rendering
# to get the games and odds of a particular page into a list for further processing
def scrape_js(url):
    str_html = getHtml(url)
    result = str(str_html.encode("utf-8"))
    tree = html.fromstring(result)
    content = tree.xpath('//table[@class=" table-main"]//tr[(@class=" deactivate") or (@class="odd deactivate")]//td[position()>1]//text()')
    # group the flat text list into one sublist per game: a new game starts
    # whenever the three preceding entries were numbers (the odds)
    liste = [[]]
    i = 0
    k = 0
    n = len(content)
    while i < n:
        if is_number(content[i-1]) and is_number(content[i-2]) and is_number(content[i-3]):
            liste.append([content[i]])
            i += 1
            k += 1
        else:
            liste[k].append(content[i])
            i += 1
    liste = liste[1:]
    # trim one entry off the front of liste for every line whose third field is numeric
    for line in liste:
        if is_number(line[2]):
            liste = liste[1:]
    return liste
complete_liste = []
file_name = 'odds_2009'
# for the 2008/09 season I'd like to get all 29 pages, but after completing
# the first page the javascript part crashes
for page in range(1, 30):  # pages are numbered 1 to 29 in the URL, so start at 1 rather than 0
    url = ''.join(['http://www.oddsportal.com/basketball/usa/nba-2008-2009/results/#/page/', str(page)])
    liste = scrape_js(url)
    for line in liste:
        complete_liste.append(line)
    print('done')

fileObject = open(file_name, 'wb')
pickle.dump(complete_liste, fileObject)
fileObject.close()
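In case it helps to show what I mean by "restarting the browser simulation": my current guess is that the crash comes from Render creating a fresh QApplication on every call, since as far as I know Qt only allows one QApplication per process. Below is an untested sketch of the alternative I have in mind, creating the application once and reusing it across pages; the module-level app object and the name RenderOnce are just my invention, not working code.

import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage

app = QApplication(sys.argv)  # created exactly once for the whole run

class RenderOnce(QWebPage):
    def __init__(self, url):
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        app.exec_()  # block until _loadFinished stops the event loop

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        app.quit()  # stop the event loop, but keep the application object alive

If that idea is right, getHtml would just construct a RenderOnce instead of a Render and the page loop could stay as it is, but I haven't verified this.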