I've written some Scrapy code that loops through a series of cities, goes to the listing page for each city, grabs all the data in the table on that page, and then iterates through every page of tables for that city. My code runs, but after a while it seems to time out or something, and I start getting this in my logs:
2020-12-16 18:47:47 [yjs] INFO: Parsing table and getting job data for page url http://www.yingjiesheng.com/other-morejob-1372.html
2020-12-16 18:48:27 [scrapy.extensions.logstats] INFO: Crawled 113 pages (at 2 pages/min), scraped 111 items (at 2 items/min)
2020-12-16 18:49:27 [scrapy.extensions.logstats] INFO: Crawled 113 pages (at 0 pages/min), scraped 111 items (at 0 items/min)
2020-12-16 18:50:27 [scrapy.extensions.logstats] INFO: Crawled 113 pages (at 0 pages/min), scraped 111 items (at 0 items/min)
2020-12-16 18:51:27 [scrapy.extensions.logstats] INFO: Crawled 113 pages (at 0 pages/min), scraped 111 items (at 0 items/min)
2020-12-16 18:52:27 [scrapy.extensions.logstats] INFO: Crawled 113 pages (at 0 pages/min), scraped 111 items (at 0 items/min)
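For what it's worth, I haven't overridden any download or retry settings, so I believe these Scrapy defaults are the ones in play (a sketch of my assumptions, pulled from the docs rather than from my actual config):

    # relevant Scrapy defaults (not values I set myself)
    DOWNLOAD_TIMEOUT = 180      # seconds before the downloader gives up on a request
    RETRY_ENABLED = True
    RETRY_TIMES = 2             # retries on top of the first attempt
    RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
    REDIRECT_MAX_TIMES = 20     # redirects followed before a request is discarded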
This seems to occur at random points in time. The first time I ran it, I started getting this after 66 pages. Below is my spider code:
import datetime
import re

import scrapy

# my item definition (a sketch is shown after the spider code);
# the import path assumes the standard Scrapy project layout
from ..items import YjsTable

URLROOT = "https://www.yingjiesheng.com/"
CITIES = {"beijing": "北京"}
class YjsSpider(scrapy.Spider):
    name = "yjs"

    def start_requests(self):
        # loop through cities and pass info along in meta
        for key, value in CITIES.items():
            self.logger.info('Starting requests for %s', key)
            url = URLROOT + str(key)
            yield scrapy.Request(
                url=url, callback=self.retrieve_tabsuffix,
                meta={'city': key, 'city_ch': value},
                encoding='gb18030')
    def retrieve_tabsuffix(self, response):
        city = response.meta['city']
        city_ch = response.meta['city_ch']
        morepages = response.xpath(
            '//*[contains(concat(" ", @class, " "), " mbth ")]')
        morepage_html = morepages.css("a::attr(href)").get()
        if morepage_html and "-morejob-" in morepage_html:
            jobpage_one = f"{URLROOT}{city}-morejob-1.html"
        elif morepage_html and "list_" in morepage_html:
            jobpage_one = f"{URLROOT}{city}/list_1.html"
        else:
            # .get() can return None, and without this branch jobpage_one
            # would be unbound when neither pattern matches
            self.logger.warning('No pagination link found for %s', city)
            return
        yield response.follow(
            url=jobpage_one,
            callback=self.retrieve_tabhtmls,
            meta={'city': city, 'city_ch': city_ch},
            encoding='gb18030')
    def retrieve_tabhtmls(self, response):
        city = response.meta['city']
        city_ch = response.meta['city_ch']
        self.logger.info('Response encoding is %s', response.encoding)
        # htmls
        listhtmls = response.xpath(
            '//*[contains(concat(" ", @class, " "), " clear ")]').get()
        totalrecords = response.xpath(
            '//*[contains(concat(" ", @class, " "), " act ")]').get()
        self.logger.info("totalrecords: %s", totalrecords)
        # identify the last page number
        morejobpages = []  # default so nothing is yielded if no "last page" link is found
        for listhtml in listhtmls.split('a href="'):
            if "last page" in listhtml:
                lastpagenum = re.findall(r"\d+", listhtml)[0]
                morejobpages = list(range(1, int(lastpagenum) + 1))
                self.logger.info("total number of table pages: %s", lastpagenum)
        self.logger.info('Getting all table page URLs for %s', city)
        morejobpages_urls = [
            "http://www.yingjiesheng.com/{}/list_{}.html".format(city, i)
            for i in morejobpages]
        self.logger.info(morejobpages)
        yield from response.follow_all(
            urls=morejobpages_urls,
            callback=self.parse_tab,
            meta={'city': city, 'city_ch': city_ch,
                  'totalrecords': totalrecords},
            encoding='gb18030')
    def parse_tab(self, response):
        city = response.meta['city']
        city_ch = response.meta['city_ch']
        totalrecords = response.meta['totalrecords']
        self.logger.info('Parsing table and getting job data for page url %s',
                         response.url)
        # table content
        tabcontent = response.xpath('//*[@id="tb_job_list"]')
        # list of rows
        tabrows = tabcontent.css("tr.jobli").getall()
        item = YjsTable()
        item['table'] = tabrows
        item['time_scraped'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        item['city'] = city
        item['city_ch'] = city_ch
        item['totalrecords'] = totalrecords
        item['pageurl'] = response.url
        yield item
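For completeness, YjsTable is just a plain Item with one field per key that parse_tab fills in (a sketch reconstructed from those assignments):

    # items.py
    import scrapy

    class YjsTable(scrapy.Item):
        table = scrapy.Field()
        time_scraped = scrapy.Field()
        city = scrapy.Field()
        city_ch = scrapy.Field()
        totalrecords = scrapy.Field()
        pageurl = scrapy.Field()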
This is the only other post I found that seems to describe the same issue, but that asker was pulling from a SQL database and I am not.
Does anyone know why Scrapy would work for a while and then suddenly stop requesting pages and grabbing data while continuing to run?
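In the meantime I'm planning to wire an errback onto my requests so failures at least get logged; a minimal sketch of what I mean (this method isn't in the spider above, and the name is mine):

    def log_failure(self, failure):
        # hypothetical errback -- just logs whatever Failure Scrapy hands back
        self.logger.error('Request for %s failed: %r',
                          failure.request.url, failure.value)

and then passing errback=self.log_failure in each scrapy.Request / response.follow call.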
EDIT: I re-ran with the log level set to DEBUG and got this:
2020-12-17 10:35:47 [scrapy.extensions.logstats] INFO: Crawled 41 pages (at 0 pages/min), scraped 39 items (at 0 items/min)
2020-12-17 10:35:49 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET http://www.yingjiesheng.com/app/job.php?Action=FullTimeMore&Location=guangzhou&Source=Other&Page=86> from <GET http://www.yingjiesheng.com/guangzhou-morejob-86.html>
2020-12-17 10:36:06 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET http://www.yingjiesheng.com/guangzhou-morejob-86.html> from <GET http://www.yingjiesheng.com/app/job.php?Action=FullTimeMore&Location=guangzhou&Source=Other&Page=86>
2020-12-17 10:36:24 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET http://www.yingjiesheng.com/app/job.php?Action=FullTimeMore&Location=guangzhou&Source=Other&Page=85> from <GET http://www.yingjiesheng.com/guangzhou-morejob-85.html>
So it seems like I'm stuck in a redirect loop: the -morejob- URL 301s to job.php, which 302s straight back to the -morejob- URL, and the information never gets scraped before the crawler moves on to the next page. Does anyone have an idea of how you would make Scrapy keep trying a page until it succeeds? Or is there a better way to deal with this?
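To make "keep trying" concrete, the kind of thing I have in mind is re-yielding the failed request from an errback with dont_filter=True so the dupe filter doesn't drop it; a rough, untested sketch (the 'retry_count' meta key and the cap of 5 are invented for this example):

    def errback_retry(self, failure):
        # sketch of a manual retry; 'retry_count' is a meta key I made up
        request = failure.request
        retries = request.meta.get('retry_count', 0)
        if retries < 5:  # arbitrary cap so a truly dead page can't loop forever
            self.logger.info('Retrying %s (attempt %d)', request.url, retries + 1)
            retryreq = request.replace(dont_filter=True)
            retryreq.meta['retry_count'] = retries + 1
            yield retryreq
        else:
            self.logger.error('Giving up on %s after %d attempts', request.url, retries)

I also wondered whether raising REDIRECT_MAX_TIMES, or passing meta={'dont_redirect': True} and handling the 3xx responses myself, would be a saner way to break the loop.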