I'm trying to retrieve all fields from this web page:
https://www.jobsbank.gov.sg/ICMSPortal/portlets/JobBankHandler/SearchDetail.do?id=JOB-2016-0008786
However, when I run my code, it does not even seem to retrieve the HTML. What is it about this page that prevents the HTML from being retrieved? (I use the headless PhantomJS browser, phantomjs.exe, but that shouldn't make a difference.) Does this have anything to do with the iframe? If so, how do I use Selenium to deal with it?
I've used Selenium with PhantomJS to scrape http://www.indeed.com/resumes/-/in-Singapore successfully, but the same method doesn't seem to work for the website mentioned above. Please refer to my code below:
import re
import math
import time
import requests
from lxml import html
import selenium
from selenium import webdriver
def getPageEnd(url):
    """Return the number of result pages derived from the search summary.

    POSTs the quick-search form to *url*, reads the summary text from the
    response HTML, takes the last integer found in that text and converts
    it into a page count.

    Args:
        url: Address of the search-result handler to POST the form to.

    Returns:
        int: The last number in the summary text divided by 100, rounded up.

    Raises:
        ValueError: If the summary element is missing from the response or
            its text contains no digits.
    """
    payload = {
        "{actionForm.checkValidRequest}": "YES",
        "{actionForm.recordsPerPage}": "20",
        "{actionForm.sortBy}": "1",
        "{actionForm.searchType}": "Quick Search",
        "{actionForm.currentPageNumber}": "1"
    }
    r = requests.post(url, data=payload)
    tree = html.fromstring(r.text)
    summary = tree.xpath('//div[@class="searchetails"]/p/'
                         'span[@style="color: #b41b84;"]/text()')
    if not summary:
        # Fail with a readable message instead of an IndexError on summary[0].
        raise ValueError("search summary not found in response from %s" % url)

    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    numbers = re.findall(r'\d+', summary[0])
    if not numbers:
        raise ValueError("no numbers found in search summary: %r" % summary[0])

    # NOTE(review): the divisor is 100 although the form asks for 20 records
    # per page -- confirm which page size the count should be based on.
    page_end = int(numbers[-1]) / 100.0
    return int(math.ceil(page_end))  # round up so a partial page still counts
def jobScrape(url, pagenum):
    """Collect the job-detail links listed on one search-result page.

    POSTs the quick-search form for page *pagenum* to *url*, retrying every
    five seconds for as long as the request fails with a connection error,
    then extracts the ``href`` of every link inside a ``jobDesActive`` cell.

    Args:
        url: Address of the search-result handler to POST the form to.
        pagenum: Page number to request; it is formatted into the form as text.

    Returns:
        list: One absolute URL per job link found on the page.
    """
    # A plain "https://" scheme: the previous "__https://" prefix produced
    # links that no HTTP client or browser can open.
    # Presumably the hrefs on the page are site-relative paths -- verify.
    job_link_url_prepend = "https://www.jobsbank.gov.sg"
    payload = {
        "{actionForm.checkValidRequest}": "YES",
        "{actionForm.recordsPerPage}": "20",
        "{actionForm.sortBy}": "1",
        "{actionForm.searchType}": "Quick Search",
        "{actionForm.currentPageNumber}": "%s" % pagenum,
    }
    # Retry without limit on connection errors; any other exception propagates.
    while True:
        try:
            r = requests.post(url, data=payload)
        except requests.exceptions.ConnectionError:
            print("Exception ConnectionError was caught, retrying requests...")
            time.sleep(5)
        else:
            break
    tree = html.fromstring(r.text)
    cur_page_job_links = [job_link_url_prepend + i for i in
                          tree.xpath('//td[@class="jobDesActive"]/a/@href')]
    print("Done scraping page %s" % pagenum)
    return cur_page_job_links
def main():
    """Scrape job links from the search results and dump each job page's HTML.

    Starts a PhantomJS driver, fetches the first search-result page, appends
    the job links found there to ``link.txt`` and prints the page source of
    every linked job page. The driver is shut down when the run ends, whether
    or not an error occurred.
    """
    driver = webdriver.PhantomJS(executable_path=r'E:\desktop\phantomjs.exe')
    try:
        driver.set_window_size(1120, 550)
        url = "https://www.jobsbank.gov.sg/ICMSPortal/portlets/JobBankHandler/SearchResult3.do"
        page_start = 1
        # The page count is fetched but the loop below is still hard-coded to
        # the first page only.
        page_end = getPageEnd(url)
        for pagenum in range(page_start, 2):
            cur_page_job_links = jobScrape(url, str(pagenum))
            with open("link.txt", 'a') as f:
                for link in cur_page_job_links:
                    f.write("%s \n" % link)
            for link in cur_page_job_links:
                # The links already carry a scheme and host, so prefixing them
                # again yields an address the browser cannot load. Strip any
                # stray leading underscores and add a scheme only if missing.
                # A separate name keeps the search `url` intact for later pages.
                job_url = link.lstrip("_")
                if not job_url.startswith(("http://", "https://")):
                    job_url = "https://" + job_url
                driver.get(job_url)
                htmltext = driver.page_source
                print(htmltext)
    finally:
        # Always terminate the PhantomJS process, even after an exception.
        driver.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
I think it might have something to do with needing to retrieve the JobDescription frame. How do I use a command like window.document.getElementById("frameJobDescription") to get what I want?