import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
# Take this class for granted. Just use the result of the rendering.
class Render(QWebPage):
    """Load a URL in a QtWebKit page and keep a reference to its main frame.

    Constructing an instance starts the Qt event loop and blocks until the
    page's ``loadFinished`` signal fires. Afterwards ``self.frame`` holds
    the page's main frame; call ``self.frame.toHtml()`` to get the markup
    of the loaded page.
    """

    def __init__(self, url):
        """Start loading *url* and block until loading has finished."""
        # The application object is stored so _loadFinished() can quit it.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Runs the event loop; returns once _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Record the main frame and stop the event loop.

        ``result`` is the value delivered by the ``loadFinished`` signal
        (presumably the load-success flag -- confirm against the Qt docs);
        it is not used here.
        """
        self.frame = self.mainFrame()
        self.app.quit()
url = 'http://pycoders.com/archive/'
r = Render(url)
result = r.frame.toHtml()
# Hand the rendered markup to lxml as text.
# Do not wrap the encoded bytes in str(): on Python 3 that yields the
# literal "b'...'" repr, which puts stray text in front of the document.
# NOTE(review): assumes toHtml() returns a Python str (PyQt4 on Python 3).
archive_links = html.fromstring(result)
print(archive_links)
I got that script from this page. I had to change `result.toAscii()` to `result.encode('utf-8')`.
When I run this script, it returns:
> <Element div at 0x7f98226af458>
I'm not a pro in Python and I don't know exactly what that means. Is it some kind of stored information? It is supposed to return the links in the web page.
After that, I added a for loop like this:
# Iterating over an lxml element visits its direct children one by one.
for child in archive_links:
    print(child)
It returns:
> <Element p at 0x7f09a14d3408> <Element meta at 0x7f09a14d34a8>
> <Element meta at 0x7f09a14d34f8> <Element meta at 0x7f09a14d3408>
> <Element meta at 0x7f09a14d34a8> <Element meta at 0x7f09a14d34f8>
> <Element link at 0x7f09a14d3408> <Element link at 0x7f09a14d34a8>
> <Element title at 0x7f09a14d34f8> <Element style at 0x7f09a14d3408>
> <!-- Bootstrap core CSS --> <Element link at 0x7f09a14d34f8> <!--
> Custom styles for this template --> <Element link at 0x7f09a14d3408>
> <!-- Fonts from Google Fonts --> <Element link at 0x7f09a14d34a8> <!--
> HTML5 shim and Respond.js IE8 support of HTML5 elements and media
> queries --> <!--[if lt IE 9]>\n\t <script
> src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>\n\t
> <script
> src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script>\n\t<![endif]-->
> <!-- Fixed navbar --> <Element div at 0x7f09a14d34a8> <Element div at
> 0x7f09a14d3408> <Element div at 0x7f09a14d34f8> <!-- /container -->
> <!-- Bootstrap core
> JavaScript\n\t================================================== -->
> <!-- Placed at the end of the document so the pages load faster -->
> <Element script at 0x7f09a14d34f8> <Element script at 0x7f09a14d3408>
> <Element script at 0x7f09a14d34a8> <Element script at 0x7f09a14d34f8>
Also, can we do this easily with PyQt5? I'm open to any kind of suggestion. Thank you.
Edit: This edit is relevant to the JS scraping topic, but it is about scraping another site with a different technique. I checked this page before creating this topic. I don't understand what this code does:
>>> BeautifulSoup(html, 'lxml').find("div",{"id":"cntPos"}).find("table",{"class":"cntTb"}).tbody.find_all("tr")[1].find("td",{"class":"cntBoxGreyLnk"}) is None
> True
It returns True. What I did with this code is:
import urllib.request
from bs4 import BeautifulSoup

# Use the response as a context manager so the HTTP connection is closed
# as soon as the body has been read.
with urllib.request.urlopen('http://oil-price.net') as response:
    html = response.read()

# Walk down to the target cell step by step:
# div#cntPos -> table.cntTb -> second <tr> of its tbody -> td.cntBoxGreyLnk.
# Only the downloaded markup is parsed here; nothing in this script
# executes the page's <script> tags.
table = BeautifulSoup(html, 'lxml').find("div", {"id": "cntPos"}).find("table", {"class": "cntTb"})
second_row = table.tbody.find_all("tr")[1]
soup = second_row.find("td", {"class": "cntBoxGreyLnk"})
print(soup)
But this code doesn't return the dynamic oil prices. It returns this:
> /usr/bin/python3.5
> /home/dogus/PycharmProjects/Recipes/bsdoesntfindjs.py <td
> class="cntBoxGreyLnk" rowspan="2" valign="top"> <script
> src="http://www.oil-price.net/COMMODITIES/gen.php?lang=en"
> type="text/javascript"> </script> <noscript> To get live <a
> href="http://www.oil-price.net/dashboard.php?lang=en#COMMODITIES">gold,
> oil and commodity price</a>, please enable Javascript. </noscript>
> <br/> <table cellpadding="0" cellspacing="0" class="b11"> <tbody> <tr>
> <td colspan="3" height="15"> <a
> href="http://feeds.feedburner.com/Oil-pricenet-OilPriceTodayAndTomorrow"
> style="font-size: 16px; font-weight: normal; "><img border="0"
> src="/pics/feed-icon.gif"/> Subscribe to RSS</a><br/> <hr/> <br/>
> <form action="http://www.feedburner.com/fb/a/emailverify"
> method="post"
> onsubmit="window.open('http://www.feedburner.com/fb/a/emailverifySubmit?feedId=1678900',
> 'popupwindow', 'scrollbars=yes,width=550,height=520');return true"
> style="border:0px solid #ccc;padding:8px;text-align:center; font-size:
> 12px; font-weight: normal;" target="popupwindow"> <p style="font-size:
> 14px; font-weight: bold;"><img border="0" height="40"
> src="/index_files/email_40.png" style="vertical-align:text-bottom;"
> widht="40"/> Receive our FREE<br/>Oil Intelligence Newsletter:
> <br/><span style="font-size:11px; font-weight:normal;">(We don't
> spam)</span> </p><p><input name="email" style="width:140px"
> type="text"/></p><input name="url" type="hidden"
> value="http://feeds.feedburner.com/~e?ffid=1678900"/><input
> name="title" type="hidden" value="Oil-price.net - Oil Price, Today and
> Tomorrow"/><input name="loc" type="hidden" value="en_US"/><input
> type="submit" value="Subscribe"/></form> </td> </tr><tr><td
> colspan="3"> <hr/><br/> <div class="b11"> <strong style="white-space:
> nowrap;">oil-price.net is available in</strong> <br/> <br/> </div>
> </td></tr> <tr> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=fr"><img border="0" height="21"
> src="index_files/lng_fr.png" width="54"/><br/> Français </a> </td>
> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=en"><img border="0" height="21"
> src="index_files/lng_en.png" width="54"/><br/> English </a> </td>
> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=zh"><img border="0" height="21"
> src="index_files/lng_zh.png" width="54"/><br/> 中国 </a> </td> </tr>
> <tr> <td colspan="3" height="15"></td> </tr> <tr> <td
> style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=it"><img border="0" height="21"
> src="index_files/lng_it.png" width="54"/><br/> Italiano </a> </td>
> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=th"><img border="0" height="21"
> src="index_files/lng_th.png" width="54"/><br/> ภาษาไทย </a> </td>
> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=ar"><img border="0" height="21"
> src="index_files/lng_ar.png" width="54"/><br/> العربيه </a> </td>
> </tr> <tr> <td colspan="3" height="15"></td> </tr> <tr> <td
> style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=nl"><img border="0" height="21"
> src="index_files/lng_nl.png" width="54"/><br/> Nederland </a> </td>
> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=pt"><img border="0" height="21"
> src="index_files/lng_pt.png" width="54"/><br/> Português </a> </td>
> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=ko"><img border="0" height="21"
> src="index_files/lng_ko.png" width="54"/><br/> 한국어 </a> </td> </tr>
> <tr> <td colspan="3" height="15"></td> </tr> <tr> <td
> style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=ja"><img border="0" height="21"
> src="index_files/lng_ja.png" width="54"/><br/> 日本語 </a> </td> <td
> style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=ru"><img border="0" height="21"
> src="index_files/lng_ru.png" width="54"/><br/> Русскийязык </a>
> </td> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=id"><img border="0" height="21"
> src="index_files/lng_id.png" width="54"/><br/> Bahasa
> <br/>Indonesia </a> </td> </tr> <tr> <td colspan="3" height="15"></td>
> </tr> <tr> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=es"><img border="0" height="21"
> src="index_files/lng_es.png" width="54"/><br/> Espanol </a> </td>
> <td style="padding-right: 15px;" valign="middle"> <a
> href="/index.php?lang=de"><img border="0" height="21"
> src="index_files/lng_de.png" width="54"/><br/> Deutsch </a> </td>
> </tr></tbody></table> </td>
Another user recommends using PyV8, but it seems to me that it is too old a library to use; its last update was in 2012. Should I learn it? Or should I stick with PyQt4? Or PyQt5?