I'm building a web crawler. Some of the data I put into the datastore gets saved; the rest doesn't, and I have no idea what the problem is.

Here is my crawler class:
import datetime
import re
import urllib2

# (Listings, Index and Graph are the ndb models shown further down)

class Crawler(object):

    def get_page(self, url):
        try:
            # the User-Agent header is what finally let me download pages
            req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
            response = urllib2.urlopen(req)
            return response.read()
        except urllib2.HTTPError as e:
            return e.reason

    def get_all_links(self, page):
        return re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', page)

    def union(self, lyst1, lyst2):
        # append the elements of lyst2 that lyst1 doesn't already contain
        for elmt in lyst2:
            if elmt not in lyst1:
                lyst1.append(elmt)
        return lyst1

    # crawls the web for links starting from the seed and
    # persists the index and the link graph to the datastore
    def crawl_web(self, seed="http://tonaton.com/"):
        objListing = Listings.query().get()  # load the stored listing, if any
        if objListing is None:
            objListing = Listings()
            objListing.toCrawl = [seed]
            objListing.Crawled = []
        start_time = datetime.datetime.now()
        # stop after five seconds; crawling everything can take forever
        while datetime.datetime.now() - start_time < datetime.timedelta(0, 5):
            try:
                page = objListing.toCrawl.pop()
                if page not in objListing.Crawled:
                    content = self.get_page(page)
                    add_page_to_index(page, content)  # helper sketched below
                    outlinks = self.get_all_links(content)
                    graph = Graph()  # one Graph entity per crawled url
                    graph.url = page
                    graph.links = outlinks  # save all outlinks under the url
                    graph.put()
                    self.union(objListing.toCrawl, outlinks)
                    objListing.Crawled.append(page)
            except:
                return False  # bail out on any error
        objListing.put()  # save the listing to the datastore
        return True  # return True if it works
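The add_page_to_index helper isn't shown above; it's essentially this (a simplified sketch of what I actually run, using the Index model defined below):

def add_page_to_index(page, content):
    # naive keyword index: one Index entity per keyword, holding the
    # list of urls where that keyword appears
    for keyword in content.split():
        entry = Index.query(Index.keyword == keyword).get()
        if entry is None:
            entry = Index(keyword=keyword, url=[])
        if page not in entry.url:
            entry.url.append(page)
        entry.put()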
The classes that define the various ndb models are in this Python module:
import os
import urllib

from google.appengine.ext import ndb
import webapp2


class Listings(ndb.Model):
    toCrawl = ndb.StringProperty(repeated=True)
    Crawled = ndb.StringProperty(repeated=True)


class Index(ndb.Model):
    keyword = ndb.StringProperty()           # keyword part of the index
    url = ndb.StringProperty(repeated=True)  # urls where the keyword occurs


class Graph(ndb.Model):
    url = ndb.StringProperty()
    links = ndb.StringProperty(repeated=True)
It used to work fine when I had a JsonProperty in place of StringProperty(repeated=True), but an indexed property value is capped at 1500 bytes, so I eventually hit an error once the list of links grew too long.
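For reference, the old model looked roughly like this (reconstructed from memory; OldGraph is just a placeholder name for this sketch, the class was actually called Graph):

class OldGraph(ndb.Model):
    url = ndb.StringProperty()
    links = ndb.JsonProperty(indexed=True)  # indexed values are capped at 1500 bytes

# this is the kind of put() that started failing for me once the
# JSON-encoded list of links went past the 1500-byte limit:
graph = OldGraph(url="http://tonaton.com/",
                 links=["http://tonaton.com/page%d" % i for i in range(200)])
graph.put()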
Now, when I run the crawl_web member function, it does crawl, but when I check the datastore only the Index entities have been created. No Graph, no Listings.
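For completeness, this is roughly how I trigger the crawl (the handler and the '/crawl' route here are illustrative, not my exact code):

import webapp2

class CrawlHandler(webapp2.RequestHandler):
    def get(self):
        ok = Crawler().crawl_web("http://tonaton.com/")
        self.response.write('crawl_web returned %s' % ok)

app = webapp2.WSGIApplication([('/crawl', CrawlHandler)], debug=True)

Please help. Thanks.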