0

I'm trying to parse more than 200 links, but BS4 just stuck with out processing. I saw that Beautifulsoup findall get stuck without processing, but that's different. Stuck in the random places.

import os
import urllib.request
from bs4 import BeautifulSoup
def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse(html, url):
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find_all('tr', title = "Допущено до конкурсу")
    if os.path.exists('base/%s.txt' % url[27:]):
        pass
    else:
        abitbase = open('base/%s.txt' % (url[27:]), 'w')
        for unit in table:
            collection = unit.find_all('td')
            position = collection[0].text
            name = collection[1].text
            priority = collection[2].text
            score = collection[3].text
            abitbase.write('%s %s %s %s \n' % (position, name, priority, score))
        abitbase.close()

def main():
    global applicants
    url_list = open('clist.txt', 'r')
    for count in range(1, 241):
        url_s = url_list.readline()
        if url_s[-1] == '\n':
            url = url_s[:-1]
        else:
            url = url_s
        parse(get_html(url), url)
        print('base [%s] saved | %s%s' %(url[27:], (round((count/2.41), 2)), '%'))

if __name__ == '__main__':
    applicants = {}
    main()

And TimeoutError:

Traceback (most recent call last):
  File "/usr/lib/python3.4/urllib/request.py", line 1182, in do_open
    h.request(req.get_method(), req.selector, req.data, headers)
  File "/usr/lib/python3.4/http/client.py", line 1088, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.4/http/client.py", line 1126, in _send_request
    self.endheaders(body)
  File "/usr/lib/python3.4/http/client.py", line 1084, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python3.4/http/client.py", line 922, in _send_output
    self.send(msg)
  File "/usr/lib/python3.4/http/client.py", line 857, in send
    self.connect()
  File "/usr/lib/python3.4/http/client.py", line 834, in connect
    self.timeout, self.source_address)
  File "/usr/lib/python3.4/socket.py", line 512, in create_connection
    raise err
  File "/usr/lib/python3.4/socket.py", line 503, in create_connection
    sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 58, in <module>
    main()
  File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 53, in main
    parse(get_html(url), url)
  File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 22, in get_html
    response = urllib.request.urlopen(url)
  File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python3.4/urllib/request.py", line 463, in open
    response = self._open(req, data)
  File "/usr/lib/python3.4/urllib/request.py", line 481, in _open
    '_open', req)
  File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.4/urllib/request.py", line 1210, in http_open
    return self.do_open(http.client.HTTPConnection, req)
  File "/usr/lib/python3.4/urllib/request.py", line 1184, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 110] Connection timed out>
Community
  • 1
  • 1
Mr.Xyzed
  • 43
  • 9

1 Answers1

1

BS4 works fine, that's my fault.

I just put os.path.exists before parse(get_html(url), url) and it works good.

Sorry.

Mr.Xyzed
  • 43
  • 9