I am struggling to get my code, that scrapes HTML table info from web, to work through a list of websites held in ShipURL.txt file. The code reads in the web page addresses from ShipURL and then goes to the link and downloads the table data and saves it to csv. But my problem is that the program cannot finish, as the error "A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond" occurs in the middle and the program stops. Now as I understand I need to increase the request time, use a proxy or make a try statement. I have scanned through a few answers concerning the same problem, but as an novice I am finding it hard to understand. Any help would be appreciated.
ShipURL.txt https://dl.dropboxusercontent.com/u/110612863/ShipURL.txt
# -*- coding: utf-8 -*-
fm = open('ShipURL.txt', 'r')
Shiplinks = fm.readlines()
import csv
from urllib import urlopen
from bs4 import BeautifulSoup
import re
for line in Shiplinks:
website = re.findall(r'(https?://\S+)', line)
website = "".join(str(x) for x in website)
if website != "":
with open('ShipData.csv','wb')as f: #Creates an empty csv file to which assign values.
writer = csv.writer(f)
shipUrl = website
shipPage = urlopen(shipUrl)
soup = BeautifulSoup(shipPage, "html.parser") #Read the web page HTML
table = soup.find_all("table", { "class" : "table1" }) #Finds table with class table1
List = []
columnRow = ""
valueRow = ""
Values = []
for mytable in table: #Loops tables with class table1
table_body = mytable.find('tbody') #Finds tbody section in table
try: #If tbody exists
rows = table_body.find_all('tr') #Finds all rows
for tr in rows: #Loops rows
cols = tr.find_all('td') #Finds the columns
i = 1 #Variable to control the lines
for td in cols: #Loops the columns
## print td.text #Displays the output
co = td.text #Saves the column to a variable
## writer.writerow([co]) Writes the variable in CSV file row
if i == 1: #Checks the control variable, if it equals to 1
if td.text[ -1] == ":":
# võtab kooloni maha ja lisab koma järele
columnRow += td.text.strip(":") + "," # Tekkis mõte, et vb oleks lihtsam kohe ühte string panna
List.append(td.text) #.. takes the column value and assigns it to a list called 'List' and..
i+=1 #..Increments i by one
else:
# võtab reavahetused maha ja lisab koma stringile
valueRow += td.text.strip("\n") + ","
Values.append(td.text) #Takes the second columns value and assigns it to a list called Values
#print List #Checking stuff
#print Values #Checking stuff
except:
print "no tbody"
# Prindime pealkirjad ja väärtused koos reavahetusega välja ka :)
print columnRow.strip(",")
print "\n"
print valueRow.strip(",")
# encode'ing hakkas jälle kiusama
# Kirjutab esimeseks reaks veeru pealkirjad ja teiseks väärtused
writer.writerow([columnRow.encode('utf-8')])
writer.writerow([valueRow.encode('utf-8')])