BeautifulSoup Text Mining - Variable String

Question

I am trying to extract text from indeed.com, but for some reason I am unable to extract text from the jobmap variable shown below. I have included my code for your review and guidance. The original link I am using is http://www.indeed.co.uk/jobs?q=python&l=.

I am hoping to create a dataframe from jobmap[0] to jobmap[9]:

<script type="text/javascript">

function rclk(el,jobdata,oc,sal) { var ocstr = oc ? '&onclick=1' : ''; document.cookie='RCLK="jk='+jobdata.jk+'&tk=19i8hio29173i4q9&rd='+jobdata.rd+'&qd=7tdTJLF8oc4dPpT7T_zGvNMUkEhdsofXi_d_0hd2X6v0K0UAGbvReB0EpTyqsy6mfwDp0dterPbHZubI-Ho6fr5IbNHuDaBjQT6u6eGSKV6XZjJx0CQssKb7HhgrPx5f&ts=1428363501641&sal='+sal+ocstr+'"; path=/'; return true;}
function zrprclk(el,jobdata,oc) { var ocstr = oc ? '&onclick=1' : ''; document.cookie='RCLK="jk='+jobdata.jk+'&tk=19i8hio29173i4q9&from=reconzrp&rd='+jobdata.rd+'&qd=7tdTJLF8oc4dPpT7T_zGvNMUkEhdsofXi_d_0hd2X6v0K0UAGbvReB0EpTyqsy6mfwDp0dterPbHZubI-Ho6fr5IbNHuDaBjQT6u6eGSKV6XZjJx0CQssKb7HhgrPx5f&ts=1428363501641'+ocstr+'"; path=/'; return true;}
function prjbottomclk(el,jobdata,oc) { var ocstr = oc ? '&onclick=1' : ''; document.cookie='RCLK="jk='+jobdata.jk+'&tk=19i8hio29173i4q9&from=reconserp&rd='+jobdata.rd+'&qd=7tdTJLF8oc4dPpT7T_zGvNMUkEhdsofXi_d_0hd2X6v0K0UAGbvReB0EpTyqsy6mfwDp0dterPbHZubI-Ho6fr5IbNHuDaBjQT6u6eGSKV6XZjJx0CQssKb7HhgrPx5f&ts=1428363501641'+ocstr+'"; path=/'; return true;}

var jobmap = {};

jobmap[0]= {jk:'833b3b546fa19a15',efccid: 'ba27a1a49bded3ca',srcid:'bd5b1a0b89fdc77a',cmpid:'1c61cbd342c70437',num:'0',srcname:'ustwo studio Ltd',cmp:'ustwo studio Ltd',cmpesc:'ustwo studio Ltd',cmplnk:'/ustwo-studio-jobs',loc:'London',country:'GB',zip:'',city:'London',title:'Data Scientist',locid:'833c779eabe84c9f',rd:'2G0bcbLxcAqiHB9MMTYN9Q'};

jobmap[1]= {jk:'bf6df27f1d3b90fb',efccid: '98f3e203ab7d8e01',srcid:'b0a70c53f51e95a6',cmpid:'fe8b4fdb8a17a513',num:'1',srcname:'Reed Business Information',cmp:'Reed Business Information',cmpesc:'Reed Business Information',cmplnk:'/Reed-Business-Information-jobs',loc:'Heathrow',country:'GB',zip:'',city:'Heathrow',title:'Data Analytics Manager - Flightglobal - Heathrow, Middlesex',locid:'4296d6706ebc67b5',rd:'4ZrZ-vtiYwdobVTLuwlSBHEwqdD0vnOb9P51Phyha6c'};

jobmap[2]= {jk:'146969d233b25b49',efccid: '2a58d847c3011c18',srcid:'b4a49235193125a8',cmpid:'1544766d4c2915b0',num:'2',srcname:'EY',cmp:'EY',cmpesc:'EY',cmplnk:'/EY-jobs',loc:'London',country:'GB',zip:'',city:'London',title:'Analytics Manager - People Data',locid:'833c779eabe84c9f',rd:'WPdCTYq1ZBHM1poxVAfv11_MKnaSAFGAsD6kfERFt3g'};
...
...
...
...
...
...
jobmap[9]=

</script>

My Code is:

from bs4 import BeautifulSoup
import urllib2
import csv
import os
import re
import requests

# Download the search-results page and parse the HTML with BeautifulSoup.
page1 = urllib2.urlopen('http://www.indeed.co.uk/jobs?q=%22data+science%22')
soup = BeautifulSoup(page1)


# Print the text of every <h2 class="jobtitle"> element.
for title in soup.findAll('h2',{'class' : 'jobtitle'}):
    print title.text


# Print the text of every <span class="company"> element.
# NOTE: both loops read HTML elements only; nothing here looks inside the
# inline <script> where the jobmap entries are assigned.
for company in soup.findAll('span',{'class' : 'company'}):
    print company.text

score 1 · Accepted Answer · edited May 23 '17 at 12:13

The idea here is to locate the script element that contains the desired object definitions, use a regular expression to find all of the jobmap object definitions, and then use the demjson module to convert them to dictionaries. As a result, you get a list of dictionaries:

from pprint import pprint
import re
import urllib2

from bs4 import BeautifulSoup
import demjson


# Download the search-results page and parse it. The parser is named
# explicitly so the result does not depend on which optional parsers
# happen to be installed.
page1 = urllib2.urlopen('http://www.indeed.co.uk/jobs?q=%22data+science%22')
soup = BeautifulSoup(page1, 'html.parser')

# Capture the whole `{...}` object literal assigned to each jobmap[N].
# Anchoring on the closing brace keeps a ';' inside a value (for example an
# HTML entity in a job title) from cutting the match short.
pattern = re.compile(r"jobmap\[\d+\]\s*=\s*(\{.*?\})\s*;")

# Locate the inline <script> whose content declares `jobmap = {}`.
script_tag = soup.find('script', text=lambda text: text and "jobmap = {}" in text)
if script_tag is None:
    raise ValueError(
        "no <script> element defining jobmap was found; "
        "the page layout may have changed"
    )
# .string is the value the filter above was matched against, so it is
# guaranteed to be present once the tag has been found.
script = script_tag.string

# The captured literals use unquoted keys and single-quoted strings, so they
# are decoded with demjson rather than the standard json module.
data = [demjson.decode(item) for item in pattern.findall(script)]
pprint(data)

Prints a list of dictionaries:

[{u'city': u'London',
  u'cmp': u'ustwo studio Ltd',
  u'cmpesc': u'ustwo studio Ltd',
  u'cmpid': u'1c61cbd342c70437',
  u'cmplnk': u'/ustwo-studio-jobs',
  u'country': u'GB',
  u'efccid': u'ba27a1a49bded3ca',
  u'jk': u'833b3b546fa19a15',
  u'loc': u'London',
  u'locid': u'833c779eabe84c9f',
  u'num': u'0',
  u'rd': u'2G0bcbLxcAqiHB9MMTYN9Q',
  u'srcid': u'bd5b1a0b89fdc77a',
  u'srcname': u'ustwo studio Ltd',
  u'title': u'Data Scientist',
  u'zip': ''},
  ...
 {u'city': u'Belfast',
  u'cmp': u'Allstate Northern Ireland',
  u'cmpesc': u'Allstate Northern Ireland',
  u'cmpid': u'bd6c20d6c99988f6',
  u'cmplnk': u'/Allstate-Northern-Ireland-jobs',
  u'country': u'GB',
  u'efccid': u'521645e5cd22988a',
  u'jk': u'9b517e0b09e09ca0',
  u'loc': u'Belfast',
  u'locid': u'e6523dbdeffe6c9b',
  u'num': u'9',
  u'rd': u'hW5WLDedIUk_fnMJS2cPmngDVkFzbh8-xI2u2vwcbH0',
  u'srcid': u'eb73601b9a76cd58',
  u'srcname': u'Allstate Northern Ireland',
  u'title': u'Big Data Analytics',
  u'zip': ''}]

This is perfect. It suits my needs exactly. I was hoping you could help me with another task that I am finding challenging. Details here: http://stackoverflow.com/questions/29500367/ascii-codec-cant-encode-character-u-u2013-in-position-19-ordinal-not-in-ra — Deepayan, Apr 07 '15 at 20:09

BeautifulSoup Text Mining - Variable String

1 Answer