I don't think there is a single approach that will always work. In your case, there are clear blank lines between the different sections, so I would write a manual parser that focuses on extracting those sections.
Some example code I came up with is shown below, but I'll first mention the steps that I took:
1. Get the data set from the website and store it in a local file (saves repeated requests).
2. Manually find the point where the summary data (the header on top) ends and the body with the vote counts begins.
3. Parse the header line by line manually; this will break if something changes, but hey, you probably only have to do it once (cross your fingers).
4. Parse the body by partitioning it into sections, where every section sits between two blank lines. An example section would be
AURORA MAYOR
VOTE FOR 1
(WITH 3 OF 3 PRECINCTS COUNTED)
RICHARD C. IRVIN . . . . . . . . 237 62.20 207 30 0
JUDD LOFCHIE . . . . . . . . . 59 15.49 56 3 0
JOHN LAESCH. . . . . . . . . . 85 22.31 63 22 0
5. Then manually parse each section, leaving room to parse every candidate.
Now, I didn't fully finalize the scraping, but that is part of the fun for you. This should set you up with a framework for handling an arbitrarily large number of sections and candidates.
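The workhorse of the body splitting is itertools.groupby: grouping consecutive non-blank lines turns the body into one list of lines per contest. Here is a minimal standalone sketch of just that step (the same trick used in generate_body in the full code below), fed with the example section from above plus the header of the next contest:

import itertools

raw = """AURORA MAYOR
VOTE FOR 1
(WITH 3 OF 3 PRECINCTS COUNTED)
RICHARD C. IRVIN . . . . . . . . 237 62.20 207 30 0

AURORA ALDERMAN AT LARGE
VOTE FOR 1
(WITH 3 OF 3 PRECINCTS COUNTED)"""

lines = [line.strip() for line in raw.split('\n')]
# bool('') is False, so every run of non-empty lines becomes one group (= one contest).
sections = [list(group) for key, group in itertools.groupby(lines, key=bool) if key]
print(len(sections))   # 2
print(sections[0][0])  # AURORA MAYOR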
Code
import itertools
import urllib.request
from argparse import Namespace
from pprint import pprint

from bs4 import BeautifulSoup


def get_data(url, file='data.txt'):
    """ Retrieve the bare-bones data from the web page and store it in the provided file. """
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page.read(), 'html.parser')
        data = soup.find('pre').text.split('\n')
    with open(file, 'w') as file:
        file.write('\n'.join(data))  # keep the line breaks; writelines would glue everything together
def clean_data(file='data.txt', header=15, ignore=False):
    """
    Clean the data, where the first n lines are for the header or ignored.

    :param file: (str) Name of the file to load.
    :param header: (int) Number of lines used for header or skipped when ignore is True.
    :param ignore: (bool) If True, skips the lines indicated by header.
    :return:
    """
    with open(file, 'r') as file:
        data = file.readlines()

    header, body = data[:header], data[header:]
    data_header = generate_header(header)
    data_body = generate_body(body, columns=data_header.columns)

    # pprint(vars(data_header))
    pprint(vars(data_body))
def parse_numbers(line: str, columns, missing: list = None, fill_value='-') -> dict:
    # The counts are aligned with runs of whitespace, so split on double spaces
    # and drop the empty leftovers.
    values = list(filter(str.strip, line.split('  ')))
    if len(values) == len(columns):
        return dict(zip(columns, values))
    # If the counts don't line up with the columns but every value is zero,
    # simply fill all columns with '0'.
    if all(int(value) == 0 for value in values):
        return dict(zip(columns, ['0'] * len(columns)))
    raise ValueError(f"Unknown handling of missing values."
                     f"\nColumns: {columns}\nLine: {line}\nValues: {values}")
def generate_header(header: list[str]):
    """ Manually parse the header (hopefully only once). """
    clean_data = list(filter(bool, ''.join(header).split('\n')))

    # The header fields are aligned with runs of whitespace, hence the double-space splits.
    name, description, status = list(filter(str.strip, clean_data[0].split('  ')))
    date = clean_data[1].strip()
    country, state = list(map(str.strip, clean_data[2].split(',')))
    election_date = clean_data[3].strip()
    columns = list(filter(str.strip, clean_data[4].split('  ')))

    # Parsing the summary block is left as an exercise.
    summary = {}
    for row in clean_data[5:11]:
        pass

    return Namespace(
        name=name,
        description=description,
        status=status,
        date=date,
        country=country,
        state=state,
        election_date=election_date,
        columns=columns,
        summary=summary
    )
def generate_body(body: list[str], columns=None):
    clean_body = list(map(str.strip, ''.join(body).split('\n')))

    # Group consecutive non-empty lines into sections,
    # see https://stackoverflow.com/a/52943710/10961342
    sections = [list(group) for key, group in itertools.groupby(clean_body, key=bool) if key]

    metadata = []
    for section in sections:
        function = section[0]
        vote = [row.startswith('VOTE FOR') for row in section].index(True)  # locate the `VOTE FOR` line
        info = ' '.join(map(str.strip, section[1:vote + 2]))

        candidates = []
        for candidate in section[vote + 2:]:
            name = candidate.split('.')[0].strip()
            numbers = candidate.rsplit('. .')[-1]
            data = parse_numbers(numbers, columns)
            candidates.append({"name": name, "data": data})

        metadata.append({"function": function, "info": info, "candidates": candidates})

    pprint(metadata, sort_dicts=False)
    return Namespace(body=metadata)
if __name__ == '__main__':
    # Retrieve the original data set (only needed once).
    # get_data('https://results.co.kendall.il.us/')
    clean_data()
Output
[{'function': 'AURORA MAYOR',
  'info': 'VOTE FOR 1 (WITH 3 OF 3 PRECINCTS COUNTED)',
  'candidates': [{'name': 'RICHARD C',
                  'data': {'TOTAL VOTES': '237',
                           ' %': ' 62.20',
                           'ELECTION DAY': ' 207',
                           ' EV, VBM': '30',
                           'PROV, POST': ' 0'}},
                 {'name': 'JUDD LOFCHIE',
                  'data': {'TOTAL VOTES': ' 59',
                           ' %': ' 15.49',
                           'ELECTION DAY': '56',
                           ' EV, VBM': ' 3',
                           'PROV, POST': ' 0'}},
                 {'name': 'JOHN LAESCH',
                  'data': {'TOTAL VOTES': ' 85',
                           ' %': ' 22.31',
                           'ELECTION DAY': '63',
                           ' EV, VBM': '22',
                           'PROV, POST': ' 0'}}]},
 {'function': 'AURORA ALDERMAN AT LARGE',
  'info': 'VOTE FOR 1 (WITH 3 OF 3 PRECINCTS COUNTED)',
  'candidates': [{'name': 'RON WOERMAN',
                  'data': {'TOTAL VOTES': '117',
                           ' %': ' 34.01',
                           'ELECTION DAY': ' 106',
                           ' EV, VBM': '11',
                           'PROV, POST': ' 0'}},
                 {'name': 'BROOKE SHANLEY',
                  'data': {'TOTAL VOTES': '168',
                           ' %': ' 48.84',
                           'ELECTION DAY': ' 136',
                           ' EV, VBM': '32',
                           'PROV, POST': ' 0'}},
                 {'name': 'RAYMOND HULL',
                  'data': {'TOTAL VOTES': ' 59',
                           ' %': ' 17.15',
                           'ELECTION DAY': '52',
                           ' EV, VBM': ' 7',
                           'PROV, POST': ' 0'}}]},
 {'function': 'AURORA ALDERMAN WARD 9',
  'info': 'VOTE FOR 1 (WITH 3 OF 3 PRECINCTS COUNTED)',
  'candidates': [{'name': 'EDWARD J',
                  'data': {'TOTAL VOTES': '339',
                           ' %': '100.00',
                           'ELECTION DAY': ' 285',
                           ' EV, VBM': '54',
                           'PROV, POST': ' 0'}}]},
 {'function': 'JOLIET COUNCILMAN AT LARGE',
  'info': 'VOTE FOR 3 (WITH 7 OF 7 PRECINCTS COUNTED)',
  'candidates': [{'name': 'GLENDA WRIGHT-McCULLUM',
                  'data': {'TOTAL VOTES': ' 96',
                           ' %': '7.81',
                           'ELECTION DAY': '91',
                           ' EV, VBM': ' 5',
                           'PROV, POST': ' 0'}},
                 {'name': 'NICOLE LURRY',
                  'data': {'TOTAL VOTES': ' 77',
                           ' %': '6.27',
                           'ELECTION DAY': '70',
                           ' EV, VBM': ' 7',
                           'PROV, POST': ' 0'}},
                 {'name': 'JEREMY BRZYCKI',
                  'data': {'TOTAL VOTES': ' 90',
                           ' %': '7.32',
                           'ELECTION DAY': '78',
                           ' EV, VBM': '12',
                           'PROV, POST': ' 0'}},
                 {'name': 'CESAR GUERRERO',
                  'data': {'TOTAL VOTES': '106',
                           ' %': '8.62',
                           'ELECTION DAY': '95',
                           ' EV, VBM': '11',
                           'PROV, POST': ' 0'}},
                 {'name': 'ISIAH WILLIAMS JR',
                  'data': {'TOTAL VOTES': ' 47',
                           ' %': '3.82',
                           'ELECTION DAY': '45',
                           ' EV, VBM': ' 2',
                           'PROV, POST': ' 0'}},
                 {'name': 'HUDSON HOLLISTER',
                  'data': {'TOTAL VOTES': ' 84',
                           ' %': '6.83',
                           'ELECTION DAY': '72',
                           ' EV, VBM': '12',
                           'PROV, POST': ' 0'}},
                 {'name': 'JAMES LANHAM',
                  'data': {'TOTAL VOTES': ' 32',
                           ' %': '2.60',
                           'ELECTION DAY': '29',
                           ' EV, VBM': ' 3',
                           'PROV, POST': ' 0'}},
                 {'name': 'ROGER POWELL',
                  'data': {'TOTAL VOTES': ' 56',
                           ' %': '4.56',
                           'ELECTION DAY': '55',
                           ' EV, VBM': ' 1',
                           'PROV, POST': ' 0'}},
                 {'name': 'WARREN C',
                  'data': {'TOTAL VOTES': ' 76',
                           ' %': '6.18',
                           'ELECTION DAY': '66',
                           ' EV, VBM': '10',
                           'PROV, POST': ' 0'}},
                 {'name': 'ROBERT WUNDERLICH',
                  'data': {'TOTAL VOTES': '166',
                           ' %': ' 13.51',
                           'ELECTION DAY': ' 149',
                           ' EV, VBM': '17',
                           'PROV, POST': ' 0'}},
                 {'name': 'JOE CLEMENT',
                  'data': {'TOTAL VOTES': '203',
                           ' %': ' 16.52',
                           'ELECTION DAY': ' 190',
                           ' EV, VBM': '13',
                           'PROV, POST': ' 0'}},
                 {'name': 'JAN QUILLMAN',
                  'data': {'TOTAL VOTES': '196',
                           ' %': ' 15.95',
                           'ELECTION DAY': ' 184',
                           ' EV, VBM': '12',
                           'PROV, POST': ' 0'}}]},
 {'function': 'PLANO MAYOR',
  'info': 'VOTE FOR 1 (WITH 11 OF 11 PRECINCTS COUNTED)',
  'candidates': [{'name': 'ROBERT "BOB" HAUSLER (IND)',
                  'data': {'TOTAL VOTES': '388',
                           ' %': ' 48.50',
                           'ELECTION DAY': ' 336',
                           ' EV, VBM': '52',
                           'PROV, POST': ' 0'}},
                 {'name': 'MIKE RENNELS (IND)',
                  'data': {'TOTAL VOTES': '412',
                           ' %': ' 51.50',
                           'ELECTION DAY': ' 352',
                           ' EV, VBM': '60',
                           'PROV, POST': ' 0'}}]},
...
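If you eventually want the results in tabular form rather than nested dictionaries, the metadata list built in generate_body flattens naturally into one row per candidate. A rough sketch with the standard csv module, assuming you change clean_data to return that list (the to_csv helper below is my own addition, not part of the code above):

import csv

def to_csv(metadata, file='results.csv'):
    """ Write one row per candidate; column names are stripped of the alignment spaces. """
    rows = [
        {'function': section['function'], 'name': candidate['name'],
         **{key.strip(): value.strip() for key, value in candidate['data'].items()}}
        for section in metadata
        for candidate in section['candidates']
    ]
    with open(file, 'w', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)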