
I am very new to Python and am trying to develop a very simple web crawler. It works, but it gets stuck on one link for a long time. How can I set a timeout (see the sketch after the code below)?

Also, how do I deal with urllib2.HTTPError? Is my except statement correct?

def get_link(page):
    start = page.find('<a href=')
    if start==-1:
        return None,0
    startp=page.find('"',start)
    endp=page.find('"',startp+1)
    url=page[startp+1:endp]
    return url,endp

def get_all_link(page):
    allurl = []
    while True:
        url,endp=get_link(page)
        if url:
            page=page[endp:]
            allurl.append(url)
        else:
            return allurl

def get_page(page, tocrawl):
    import urllib2
    try:
        page_source = urllib2.urlopen(page)
        return page_source.read()
    except:
        page = tocrawl.pop()
        raise

def validate(page):
    valid = page.find('http')

    if valid == -1:
       return 0
    return 1


def crawler(seed):
    tocrawl = [seed]
    crawled = []
    i=0

    while tocrawl:
        page=tocrawl.pop()
        valid = validate(page)
        if valid:
            if page not in crawled:
                tocrawl = set(tocrawl) | set(get_all_link(get_page(page,tocrawl)))
                crawled.append(page)
                i=i+1
                f = open("crawled.txt","a")
                f.write(repr(i)+" : "+repr(page)+"\n")
                f.close()
    return crawled

crawler("http://google.com")
  • A bare `except:` catches all exceptions. If you want to catch only `urllib2.HTTPError`, write `except urllib2.HTTPError:`. – Nilesh Oct 28 '14 at 06:04
  • Explanation on how to handle urllib2 timeouts - http://stackoverflow.com/questions/2712524/handling-urllib2s-timeout-python – Yaron Tal Oct 28 '14 at 09:42
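
Following the first comment, a short sketch of what catching only urllib2.HTTPError could look like (the URL here is just a placeholder); the exception's code attribute holds the HTTP status:

import urllib2

try:
    html = urllib2.urlopen("http://example.com/missing").read()
except urllib2.HTTPError as e:
    # e.code is the HTTP status, e.g. 404 or 500
    print "HTTP error", e.code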
