I want to get the whole list of PDF links on the page at the following URL: 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
The problem is that the web page uses JavaScript internally to show the links, so I could not get the PDF links.
I tried parsing the page in various ways that I found by googling, but all of them failed. Can you suggest a proper way to solve the problem?
Below is the code I tried, which did not work:
def crawle_kiwoom_mletter():
    """Fetch the Kiwoom newsletter frameset page and print its parsed HTML.

    Makes sure the ``dir_output_mletter`` directory exists, downloads the
    page with ``urllib.request``, parses it with BeautifulSoup's ``lxml``
    parser and prints the prettified markup.  Only the markup returned by
    the server is parsed; no scripts are executed by this function.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_output_mletter, exist_ok=True)

    # Alternative per-item URL kept for reference:
    # 'https://www.kiwoom.com/nkw.template.do?m=m0601010101&s_menu=ML&s_sqno=4784'
    urlformat = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'

    index = -1
    while True:
        index = index + 1
        # urlformat has no '{}' placeholder, so format() returns it unchanged.
        url = urlformat.format(index)
        print('processing {}...'.format(url))
        # Close the HTTP response deterministically once it has been parsed.
        with urllib.request.urlopen(url) as page:
            soup = BeautifulSoup(page, 'lxml')
        # print_anchors(soup)
        print(soup.prettify())
        # Paging logic is currently disabled, so the loop runs exactly once:
        #     if browse_mbriefing_linkpages(soup) == False:
        #         break
        break
'''
https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/
'''
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
class Render(QWebPage):
    """Load a URL in a QtWebKit page and block until loading has finished.

    Constructing an instance starts the Qt event loop and returns only after
    the ``loadFinished`` signal has fired (or the loop is quit otherwise).
    Afterwards ``self.frame`` holds the page's main frame, whose ``toHtml()``
    gives the resulting markup; it stays ``None`` if the signal never fired.

    Args:
        url: Address of the page to load, as a string.
    """

    def __init__(self, url):
        # A QApplication must exist before the QWebPage is created, but Qt
        # allows only one per process: reuse it when Render is used again.
        existing_app = QApplication.instance()
        self.app = existing_app if existing_app is not None else QApplication(sys.argv)
        QWebPage.__init__(self)
        # Defined up front so the attribute exists even if loading never ends.
        self.frame = None
        # Connect before load() so the completion signal cannot be missed.
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() asks the application to quit.
        self.app.exec_()

    def _loadFinished(self, result):
        """Store the main frame and stop the event loop.

        ``result`` is the value delivered by the ``loadFinished`` signal; it
        is not inspected here.
        """
        self.frame = self.mainFrame()
        self.app.quit()
def crawl_kiwoom_mletter2():
    """Load the Kiwoom newsletter page through ``Render`` and print its HTML.

    ``Render`` blocks until the page has finished loading; the markup of its
    main frame is then printed to stdout.
    """
    # Plain form of the address, without the trailing empty query parameters:
    # 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000&source=&xdr='
    # Constructing Render loads the page and waits for it to finish.
    r = Render(url)
    # toHtml() returns the frame's markup (a QString under PyQt4's default API).
    result = r.frame.toHtml()
    print(result)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
'''
http://stackoverflow.com/questions/28289699/python-web-scraping-for-javascript-generated-content
'''
def crawl_kiwoom_mletter3():
    """Open the Kiwoom newsletter page in Firefox via Selenium and print it.

    Starts a Firefox WebDriver session, navigates to the page, prints the
    page source reported by the browser, and always shuts the browser down
    afterwards, even when navigation fails.
    """
    browser = webdriver.Firefox()
    try:
        url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
        browser.get(url)
        res = browser.page_source
        print(res)
    finally:
        # The original called close() on an undefined name `driver`; quit()
        # on the real object ends the session and the driver process.
        browser.quit()