I am trying to produce columns of data from the JSON-LD metadata found in the HTML of a job listings site. I've used Scrapy Item Loaders to clean the HTML string and convert this metadata into a JSON object. I would then like to use the information contained in that JSON object to populate further fields in my crawler.
Here is the spider so far, which crawls the 100 most recent job listings:
import scrapy, json
from ..items import EthjobsScrapyItem, EthJobsLoader
class EthioJobsSpider(scrapy.Spider):
    """Crawl the latest-jobs search page and scrape each listing's JSON-LD block.

    The start URL requests a single results page with 100 listings per page.
    Every link found in the results table is followed, and one item is
    yielded per listing page.
    """

    name = "EthioJobs"
    allowed_domains = ["ethiojobs.net"]
    start_urls = ["http://www.ethiojobs.net/latest-jobs-in-ethiopia/?searchId=1573067457.6526&action=search&page=1&listings_per_page=100&view=list"]

    def parse(self, response):
        """Yield a request for every href found in the results table."""
        listing_urls = response.xpath('/html/body/div[4]/section/div/div/div/div[4]/div/div[1]/div[4]/div/div/div/table/tbody//@href').getall()
        for listing_url in listing_urls:
            yield response.follow(listing_url, callback=self.parse_listing)

    def parse_listing(self, response):
        """Yield one item holding the listing's JSON-LD data and its datePosted.

        ``JSON_LD`` is filled by the loader.  When the loaded value is a
        dict that carries a ``datePosted`` key, that value is copied into
        the item's own ``datePosted`` field; otherwise the field is left
        unset.
        """
        loader = EthJobsLoader(item=EthjobsScrapyItem(), response=response)
        loader.add_xpath('JSON_LD', '//script[@type="application/ld+json"]/text()')
        item = loader.load_item()

        # The loader's output processor has already parsed the JSON text, so
        # further fields can be read straight from the resulting mapping.
        # The isinstance guard covers pages where no JSON-LD was found or the
        # value is not a JSON object.
        job_posting = item.get('JSON_LD')
        if isinstance(job_posting, dict) and 'datePosted' in job_posting:
            item['datePosted'] = job_posting['datePosted']

        yield item
where `items.py` is:
import scrapy, re, json
from scrapy.loader import ItemLoader
class EthjobsScrapyItem(scrapy.Item):
    """Fields collected for a single job listing."""

    # JSON-LD metadata scraped from the listing page.
    JSON_LD = scrapy.Field()

    # An example of a field that would be populated from the JSON data.
    datePosted = scrapy.Field()
# Patterns used by cleanJsonVar, compiled once at import time.
#
# The tag alternative uses ``[^<>\n]`` rather than ``.`` so that a match can
# never run past the closing ``>`` of one tag into the next one; with ``.``
# a short element such as ``<b>Hi</b>`` was removed together with its text.
#
# NOTE(review): the two single-space alternatives replace a space with a
# space (a no-op).  They may be what is left of an entity or non-breaking
# space lost in copy/paste -- confirm against the original file.
_MARKUP_RE = re.compile(r"\r+|\n+|\t+| | |amp;|</?[^<>\n]{,6}>")
_JOB_DESCRIPTION_RE = re.compile(r"Job\sDescription")
_LEADING_WHITESPACE_RE = re.compile(r"\A\s+")
_SPACE_RUN_RE = re.compile(r"( ){2,}")


def _clean_json_text(text):
    """Return *text* with markup and noise removed (TypeError for non-strings)."""
    text = _MARKUP_RE.sub(" ", text).strip()
    text = _JOB_DESCRIPTION_RE.sub("", text)
    text = _LEADING_WHITESPACE_RE.sub("", text)
    text = _SPACE_RUN_RE.sub(" ", text)
    text = text.replace("\u2019", " '")   # right single quotation mark
    text = text.replace("\u25cf", " -")   # black-circle bullet
    return text.replace("\\", "/")


def cleanJsonVar(self, jsonvar):
    """Clean HTML markup out of the scraped JSON-LD text.

    Args:
        self: Unused by the function itself; kept so the function can be
            assigned as a class attribute of the loader.
        jsonvar: An iterable of strings, or a single string.

    Returns:
        The cleaned string.  When several strings are supplied, the cleaned
        form of the last one is returned.  ``None`` is returned for empty
        input or when a value is not a string.
    """
    if not jsonvar:
        return None
    if isinstance(jsonvar, str):
        # Treat a bare string as one document instead of iterating over
        # its characters.
        jsonvar = [jsonvar]

    cleaned = None
    for text in jsonvar:
        try:
            cleaned = _clean_json_text(text)
        except TypeError as e:
            # Raised by ``re`` when the value is not a string.
            print("ERROR: ", str(e))
            return None
    return cleaned
def intoJsonVar(self, jsonvar):
    """Convert the first JSON string in *jsonvar* into a Python object.

    Args:
        self: Unused by the function itself; kept so the function can be
            assigned as a class attribute of the loader.
        jsonvar: An iterable of JSON strings, or a single JSON string.

    Returns:
        The decoded first document, or ``None`` when there is nothing to
        decode.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON.
    """
    if not jsonvar:
        return None
    if isinstance(jsonvar, (str, bytes, bytearray)):
        # A bare string is one document; iterating it would hand single
        # characters to json.loads.
        return json.loads(jsonvar)

    missing = object()
    first = next(iter(jsonvar), missing)
    if first is missing:
        return None
    return json.loads(first)
class EthJobsLoader(ItemLoader):
    """Item loader that cleans the JSON_LD text and then parses it as JSON."""

    # NOTE(review): both processors are plain functions whose first parameter
    # is ``self``; confirm that the installed Scrapy version passes the loader
    # as that first argument when it calls them.

    # Input processor for the JSON_LD field.
    JSON_LD_in = cleanJsonVar

    # Output processor for the JSON_LD field.
    JSON_LD_out = intoJsonVar
`JSON_LD` is output by the crawler like so:
{'JSON_LD': ["{
'@context': 'http://schema.org/',
'@type': 'JobPosting',
'title': 'Terms of Reference',
'description': ' Terms of Reference for developing General Management Plan...,'
'identifier': {
'@type': 'PropertyValue',
'name': 'Population Health and Environment – Ethiopia Consortium (PHE EC)',
'value': '65264'
},
'datePosted': '2019-12-10 04:13:31',
'validThrough': '2019-12-20 23:59:59',
'employmentType': 'Full Time',
'hiringOrganization': {
'@type': 'Organization',
'name': 'Population Health and Envir...'
},
'jobLocation': {
'@type': 'Place',
'address': {
'@type': 'PostalAddress',
'addressLocality': 'ETH Region',
'addressRegion': ' Addis Ababa ',
'addressCountry': 'ETH'
}
}
}"]
}
My question is this: how would I take information from the above JSON and use it to populate new fields in my crawler?
Any and all input/critique is beyond welcome!