I am having trouble in checking existing data in scrapy. i have used elasticsearch as my database below code i am trying to execute ??
def checkIfURLExistsInCrawler(single_url):
elastic_query = json.dumps({
"query": {
"match_phrase": {
"url": single_url
}
}
})
result = es.search(index='test', doc_type='test', body=elastic_query)['hits']['hits']
return result
def start_requests(self):
urls = [
# here i have some url there might be chance
# that some urls are duplicate so i have to put
# validation but in for loop it doesn't working
]
for request_url in urls:
checkExists = self.checkIfURLExistsInCrawler(request_url)
if not checkExists :
beingCrawledUrl = {}
beingCrawledUrl['url'] = single_url
beingCrawledUrl['added_on'] = now.strftime("%Y-%m-%d %H:%M:%S")
json_data = json.dumps(beingCrawledUrl)
InsertData = es.index(index='test', doc_type='test', body=json.loads(json_data))
yield scrapy.Request();
if i execute this code all record inside urls = [ ] are inserted into "test" index even if its duplicated because of validation i put above is not working .
but if i run this again with same data validation works .so please can any one help this out.