I am trying to scrape a website that uses javascript. I am using the following code:
import os
import sys
import re
import requests
import mechanize
import cookielib
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
import pandas as pd
import time
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def read_page(url):
r = Render(url)
result = r.frame.toHtml()
text = str(result.toAscii())
html_source = html.fromstring(text)
return text, html_source
for url in urls:
text, html_source = read_page(url)
After reading successfully the first url, on the second url it displays the following message and python.exe crashes.
content-type missing in HTTP POST, defaulting to application/x-www-form-urlencoded. Use QNetworkRequest::setHeader() to fix this problem.
QObject::connect: Cannot connect (null)::configurationAdded(QNetworkConfiguration) to QNetworkConfigurationManager::configurationAdded(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::configurationRemoved(QNetworkConfiguration) to QNetworkConfigurationManager::configurationRemoved(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::configurationChanged(QNetworkConfiguration) to QNetworkConfigurationManager::configurationChanged(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::onlineStateChanged(bool) to QNetworkConfigurationManager::onlineStateChanged(bool)
QObject::connect: Cannot connect (null)::configurationUpdateComplete() to QNetworkConfigurationManager::updateCompleted()