I have the following code to parse pages from a website:
    import os
    import json
    import urllib2
    from bs4 import BeautifulSoup

    # Load previously scraped data, if any.
    if os.path.isfile(data_content_file):
        try:
            with open(data_content_file) as data_file:
                question_answer = json.load(data_file)
        except Exception:
            question_answer = {}
    else:
        question_answer = {}

    # Resume from the last saved counter, defaulting to 1.
    if os.path.isfile(count_file):
        f = open(count_file, 'r')
        try:
            start = int(f.read())
        except Exception:
            start = 1
        f.close()
    else:
        start = 1

    f = open(count_file, 'w+')
    for x in xrange(start, 500000):
        try:
            print(x)
            # Persist the current position so the script can resume later.
            f.seek(0)
            f.truncate()
            f.write(str(x))
            req = urllib2.Request("https://islamqa.info/en/" + str(x),
                                  headers={'User-Agent': "Magic Browser"})
            con = urllib2.urlopen(req)
            soup = BeautifulSoup(con.read(), "lxml")
I don't know why it freezes at some x values. If I stop the script and run it again for the same x value, it runs fine.
I tried using a timeout, but then it doesn't load any page at all, even with the timeout set to 1000:
req = urllib2.Request("https://islamqa.info/en/"+str(x), headers={'User-Agent' : "Magic Browser"},timeout=10000)
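From the urllib2 docs, it looks like timeout is an argument to urlopen() rather than to the Request constructor, so I suspect the line above never actually applied a timeout. Here is a minimal sketch of what I think was intended (the 10-second value is just a guess on my part):

    import socket
    import urllib2

    req = urllib2.Request("https://islamqa.info/en/" + str(x),
                          headers={'User-Agent': "Magic Browser"})
    try:
        # timeout is a keyword of urlopen(), not of Request()
        con = urllib2.urlopen(req, timeout=10)
    except (urllib2.URLError, socket.timeout):
        con = None  # the request failed or timed out instead of hanging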
What's the best way to avoid this, or to continue the loop even if the site freezes?
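Would something like the sketch below be the right direction? The fetch() helper is hypothetical, and the timeout and retry values are made up, not tested:

    import socket
    import urllib2
    from bs4 import BeautifulSoup

    def fetch(url, timeout=10, retries=3):
        # Return the page body, or None if every attempt fails or times out.
        for _ in xrange(retries):
            try:
                req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
                return urllib2.urlopen(req, timeout=timeout).read()
            except (urllib2.URLError, socket.timeout):
                pass  # retry instead of hanging
        return None

    for x in xrange(start, 500000):
        html = fetch("https://islamqa.info/en/" + str(x))
        if html is None:
            continue  # give up on this x and move on
        soup = BeautifulSoup(html, "lxml")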