From Scrapy results, one unwanted nonASCII code \u2013
(aka character(150)
or en dash
) was in the title, such as u'Director/Senior Director \u2013 Pathology'
. I am trying to use pipeline to remove \u2013
with a regular ,
. But the following code didn't work. No error message be reported neither.
from datetime import datetime
from hashlib import md5
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
import re
import string
class ReplaceASC2InTitlePipeline(object):
"""replace unwanted ASCII characters in titles"""
ascii_to_filter = ["\u2013",]
def process_item(self, item, spider):
for word in self.ascii_to_filter:
desc = item.get('title')
if (desc) and word in desc:
spider.log("\u2013 in '%s' was replace" % (item['title']) )
item['title']=item['title'].replace("\u2013", ",")
return item
else:
return item