I am very new to Python and am trying to write a very simple web crawler. It mostly works, but it sometimes gets stuck on one link for a long time. How can I set a timeout on the fetch?
Also, how should I deal with urllib2.HTTPError? Is my except statement correct?
def get_link(page):
    # find the first <a href= ...> on the page and return (url, end position)
    start = page.find('<a href=')
    if start == -1:
        return None, 0
    startp = page.find('"', start)
    endp = page.find('"', startp + 1)
    url = page[startp + 1:endp]
    return url, endp

def get_all_link(page):
    # collect every link found on the page
    allurl = []
    while True:
        url, endp = get_link(page)
        if url:
            page = page[endp:]
            allurl.append(url)
        else:
            return allurl
            break

def get_page(page, tocrawl):
    import urllib2
    try:
        page_source = urllib2.urlopen(page)
        return page_source.read()
    except:
        page = tocrawl.pop()
        raise

def validate(page):
    # only crawl absolute http links
    valid = page.find('http')
    if valid == -1:
        return 0
    return 1

def crawler(seed):
    tocrawl = [seed]
    crawled = []
    i = 0
    while tocrawl:
        page = tocrawl.pop()
        valid = validate(page)
        if valid:
            if page not in crawled:
                tocrawl = set(tocrawl) | set(get_all_link(get_page(page, tocrawl)))
                crawled.append(page)
                i = i + 1
                f = open("crawled.txt", "a")
                f.write(repr(i) + " : " + repr(page) + "\n")
                f.close()
    return crawled

crawler("http://google.com")
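For the timeout, this is roughly the direction I was thinking of taking get_page in (assuming the timeout argument to urlopen and the urllib2.HTTPError / URLError classes behave the way I think they do), but I am not sure it is the right approach:

import socket
import urllib2

def get_page(page, tocrawl):
    try:
        # give up on a slow link after 10 seconds instead of blocking forever
        page_source = urllib2.urlopen(page, timeout=10)
        return page_source.read()
    except urllib2.HTTPError as e:
        # the server answered, but with an error status (404, 500, ...)
        print "HTTPError", e.code, "while fetching", page
        return ""
    except (urllib2.URLError, socket.timeout) as e:
        # the server could not be reached at all, or the timeout fired
        print "failed to fetch", page, ":", e
        return ""

My idea was that returning an empty string would make get_all_link simply find no links on a failed page, so the crawler keeps going instead of crashing. Is that reasonable?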