I use the following pattern to extract the latitude, longitude, URL, and so on:
# Regex for one listing <article>. Capture groups, in order: latitude,
# longitude, link href, <figcaption> text, listing-type text.
# Presumably used with re.S (the flag getHouseInfo compiles with) - confirm.
#
# Written as a raw, single-quoted literal so the embedded double quotes do
# not terminate the string. Attribute values use [^"]* and tag interiors use
# [^>]* instead of .*?, so those parts can no longer stretch across the rest
# of the page when a later piece fails to match; that unbounded stretching is
# what makes the all-".*?" form backtrack so heavily.
# NOTE: the remaining inter-tag ".*?" can still be slow on pages where an
# <article> lacks the trailing <dt>; an HTML parser is the robust alternative.
pattern = r'<article[^>]*?latitude="([^"]*)"[^>]*?longitude="([^"]*)">.*?<a href="([^"]*)".*?<figcaption[^>]*>(.*?)</figcaption>.*?</a>.*?<dt class="listing-type zsg-content_collapsed"><span[^>]*></span>(.*?)</dt>'
Sometimes it runs fine, but sometimes it hangs in this call:
re.findall(pattern, page)
Here is the code snippet:
def getPage(strUrl):
    """Download ``strUrl`` and return its body decoded as UTF-8.

    The request carries a browser-like User-Agent header. If opening the
    URL raises ``urllib2.URLError`` or ``socket.timeout``, the error is
    printed and an empty string is returned instead of raising.

    :param strUrl: URL to fetch.
    :returns: the decoded page text, or ``''`` when the request failed.
    """
    # NOTE: this sets the process-wide default socket timeout (seconds).
    socket.setdefaulttimeout(60)
    page = ''
    try:
        request = urllib2.Request(strUrl)
        # mock browser
        request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0")
        response = urllib2.urlopen(request)
    except urllib2.URLError as e:
        print("Bad Url or timeout")
        print(type(e))
        print(e)
    except socket.timeout as e:
        print("socket timeout")
        print(type(e))
        print(e)
    else:
        try:
            page = response.read().decode('utf8')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()
        print("Get page contents successfully")
    # The original returned the undefined name ``m_page`` (NameError).
    return page
def getHouseInfo(self, strRegularExpr, page):
    """Return every match of ``strRegularExpr`` found in ``page``.

    The expression is compiled with ``re.S``. When either argument is the
    empty string, a message is printed and ``False`` is returned;
    otherwise the result of ``findall`` is returned.
    """
    has_input = strRegularExpr != '' and page != ''
    if not has_input:
        print("regular expression is null or page is null.")
        return False
    return re.compile(strRegularExpr, re.S).findall(page)
def getHomeDetailLinks(page):
    """Run the ``mapRe['homeDetailLinks']`` pattern over ``page``.

    Prints the result between start/end banner lines and returns whatever
    ``getHouseInfo`` returned.
    """
    # NOTE(review): getHouseInfo as shown in this file declares a leading
    # ``self`` parameter, while this call passes only two arguments -
    # confirm whether both functions are meant to be methods.
    print("<<<<<<Get links starts>>>>>>")
    links = getHouseInfo(mapRe['homeDetailLinks'], page)
    print(links)
    print("<<<<<<Get links ends>>>>>>")
    return links
# Driver: download the listing page at the URL below, extract the link
# matches from it, and print them.
page = getPage("http://www.zillow.com/homes/for_rent/02138_rb/1_p")
# getHomeDetailLinks also prints the matches itself before returning them.
temp = getHomeDetailLinks(page)
print temp
When it hangs, I have to press Ctrl+C to stop it, and it then throws the exception below:
^CTraceback (most recent call last):
...
items = self.getHouseInfo(mapRe['homeDetailLinks'],page)
File ".../crawlbase.py", line 59, in getHouseInfo
items = re.findall(pattern,page)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/re.py", line 177, in findall
return _compile(pattern, flags).findall(string)
KeyboardInterrupt
Could anyone give me a clue as to how to fix this issue?