I have a Flask app that parses a CSV of public election data and inserts the results into a Postgres database. It's a port of an old, not-Flask, Python 2 app that uses various libraries that no longer work. I'm mostly trying to base the application's structure on this tutorial. I've been using Flask-SQLAlchemy to construct some models for the database tables and populate the data from the CSV.
In this case I'm working with an Area model, which corresponds to a geographic area that might have an election (house district, school board district, etc). Here's what I've got in my basic blueprint route:
election = None
@bp.route('/areas')
def scrape_areas():
area = Area()
sources = area.read_sources()
election = area.set_election()
if election not in sources:
return
# Get metadata about election
election_meta = sources[election]['meta'] if 'meta' in sources[election] else {}
for i in sources[election]:
source = sources[election][i]
if 'type' in source and source['type'] == 'areas':
rows = area.parse_election(source, election_meta)
count = 0
for row in rows:
parsed = area.parser(row, i)
area = Area()
area.from_dict(parsed, new=True)
# this shows the generated string of area_id
# which is a UNIQUE key in the database
print(area)
db.session.add(area)
db.session.commit()
count = count + 1
return count
And here's the models.py:
import logging
import os
import json
import re
import csv
import urllib.request
import calendar
import datetime
from flask import current_app
from app import db
LOG = logging.getLogger(__name__)
scraper_sources_inline = None
class ScraperModel(object):
nonpartisan_parties = ['NP', 'WI', 'N P']
def __init__(self, group_type = None):
"""
Constructor
"""
# this is where scraperwiki was creating and connecting to its database
# we do this in the imported sql file instead
self.read_sources()
def read_sources(self):
"""
Read the scraper_sources.json file.
"""
if scraper_sources_inline is not None:
self.sources = json.loads(scraper_sources_inline)
else:
#sources_file = current_app.config['SOURCES_FILE']
sources_file = os.path.join(current_app.root_path, '../scraper_sources.json')
data = open(sources_file)
self.sources = json.load(data)
return self.sources
def set_election(self):
# Get the newest set
newest = 0
for s in self.sources:
newest = int(s) if int(s) > newest else newest
newest_election = str(newest)
election = newest_election
# Usually we just want the newest election but allow for other situations
election = election if election is not None and election != '' else newest_election
return election
def parse_election(self, source, election_meta = {}):
# Ensure we have a valid parser for this type
parser_method = getattr(self, "parser", None)
if callable(parser_method):
# Check if election has base_url
source['url'] = election_meta['base_url'] + source['url'] if 'base_url' in election_meta else source['url']
# Get data from URL
try:
response = urllib.request.urlopen(source['url'])
lines = [l.decode('latin-1') for l in response.readlines()]
rows = csv.reader(lines, delimiter=';')
return rows
except Exception as err:
LOG.exception('[%s] Error when trying to read URL and parse CSV: %s' % (source['type'], source['url']))
raise
def from_dict(self, data, new=False):
for field in data:
setattr(self, field, data[field])
class Area(ScraperModel, db.Model):
__tablename__ = "areas"
id = db.Column(db.Integer, primary_key=True, autoincrement=True)
area_id = db.Column(db.String(255), unique=True, nullable=False)
areas_group = db.Column(db.String(255))
county_id = db.Column(db.String(255))
county_name = db.Column(db.String(255))
ward_id = db.Column(db.String(255))
precinct_id = db.Column(db.String(255))
precinct_name = db.Column(db.String(255))
state_senate_id = db.Column(db.String(255))
state_house_id = db.Column(db.String(255))
county_commissioner_id = db.Column(db.String(255))
district_court_id = db.Column(db.String(255))
soil_water_id = db.Column(db.String(255))
school_district_id = db.Column(db.String(255))
school_district_name = db.Column(db.String(255))
mcd_id = db.Column(db.String(255))
precincts = db.Column(db.String(255))
name = db.Column(db.String(255))
updated = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp())
def __repr__(self):
return '<Area {}>'.format(self.area_id)
def parser(self, row, group):
# General data
parsed = {
'area_id': group + '-',
'areas_group': group,
'county_id': None,
'county_name': None,
'ward_id': None,
'precinct_id': None,
'precinct_name': '',
'state_senate_id': None,
'state_house_id': None,
'county_commissioner_id': None,
'district_court_id': None,
'soil_water_id': None,
'school_district_id': None,
'school_district_name': '',
'mcd_id': None,
'precincts': None,
'name': ''
}
if group == 'municipalities':
parsed['area_id'] = parsed['area_id'] + row[0] + '-' + row[2]
parsed['county_id'] = row[0]
parsed['county_name'] = row[1]
parsed['mcd_id'] = "{0:05d}".format(int(row[2])) #enforce 5 digit
parsed['name'] = row[1]
if group == 'counties':
parsed['area_id'] = parsed['area_id'] + row[0]
parsed['county_id'] = row[0]
parsed['county_name'] = row[1]
parsed['precincts'] = row[2]
if group == 'precincts':
parsed['area_id'] = parsed['area_id'] + row[0] + '-' + row[1]
parsed['county_id'] = row[0]
parsed['precinct_id'] = row[1]
parsed['precinct_name'] = row[2]
parsed['state_senate_id'] = row[3]
parsed['state_house_id'] = row[4]
parsed['county_commissioner_id'] = row[5]
parsed['district_court_id'] = row[6]
parsed['soil_water_id'] = row[7]
parsed['mcd_id'] = row[8]
if group == 'school_districts':
parsed['area_id'] = parsed['area_id'] + row[0]
parsed['school_district_id'] = row[0]
parsed['school_district_name'] = row[1]
parsed['county_id'] = row[2]
parsed['county_name'] = row[3]
return parsed
So Areas is an extension of my default model class because it allows me to set up the fields that are specific to a given area based on the CSV.
Where this code fails is a (relatively) rare case in the CSV data where there might be multiple rows that, in the old application, correspond to the same row in the table. That old application had an array (usually with just one item, representing a UNIQUE column in the database) to instruct the code to run an UPDATE on those rows.
It returns an error like this:
sqlalchemy.exc.IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "areas_area_id_key"
DETAIL: Key (area_id)=(counties-01) already exists.
An example of how it runs when I'm just logging my UNIQUE key value from the model instead of inserting it:
<Area precincts-87-0140>
<Area precincts-87-0145>
<Area precincts-87-0150>
<Area precincts-87-0155>
<Area precincts-87-0160>
<Area precincts-87-0165>
<Area school_districts-0001>
<Area school_districts-0001>
<Area school_districts-0001>
<Area school_districts-0002>
<Area school_districts-0004>
<Area school_districts-0006>
<Area school_districts-0012>
<Area school_districts-0013>
<Area school_districts-0014>
So I've been looking at different methods that Flask can use to run an UPSERT statement because I'd need to update all of the fields, and they'd be different based on both which type of area it is, and also in the other models (election contests or results, for example). Most of what I'm finding uses SQLAlchemy rather than Flask-SQLAlchemy.
- I found this answer that looked promising. Here's what I added to my model:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert
Then I modified the ScraperModel
class like this:
class ScraperModel(object):
@compiles(Insert)
def compile_upsert(insert_stmt, compiler, **kwargs):
"""
converts every SQL insert to an upsert i.e;
INSERT INTO test (foo, bar) VALUES (1, 'a')
becomes:
INSERT INTO test (foo, bar) VALUES (1, 'a') ON CONFLICT(foo) DO UPDATE SET (bar = EXCLUDED.bar)
(assuming foo is a primary key)
:param insert_stmt: Original insert statement
:param compiler: SQL Compiler
:param kwargs: optional arguments
:return: upsert statement
"""
pk = insert_stmt.table.primary_key
insert = compiler.visit_insert(insert_stmt, **kwargs)
ondup = f'ON CONFLICT ({",".join(c.name for c in pk)}) DO UPDATE SET'
updates = ', '.join(f"{c.name}=EXCLUDED.{c.name}" for c in insert_stmt.table.columns)
upsert = ' '.join((insert, ondup, updates))
return upsert
I'm clearly misunderstanding how the insert_stmt
works because of how the query comes out, but here's the error that it generates:
sqlalchemy.exc.ProgrammingError: (psycopg2.errors.SyntaxError) syntax error at or near "ON"
LINE 1: ..., '54', '', CURRENT_TIMESTAMP) RETURNING areas.id ON CONFLIC...
^
[SQL: INSERT INTO areas (area_id, areas_group, county_id, county_name, ward_id, precinct_id, precinct_name, state_senate_id, state_house_id, county_commissioner_id, district_court_id, soil_water_id, school_district_id, school_district_name, mcd_id, precincts, name, updated) VALUES (%(area_id)s, %(areas_group)s, %(county_id)s, %(county_name)s, %(ward_id)s, %(precinct_id)s, %(precinct_name)s, %(state_senate_id)s, %(state_house_id)s, %(county_commissioner_id)s, %(district_court_id)s, %(soil_water_id)s, %(school_district_id)s, %(school_district_name)s, %(mcd_id)s, %(precincts)s, %(name)s, CURRENT_TIMESTAMP) RETURNING areas.id ON CONFLICT (id) DO UPDATE SET id=EXCLUDED.id, area_id=EXCLUDED.area_id, areas_group=EXCLUDED.areas_group, county_id=EXCLUDED.county_id, county_name=EXCLUDED.county_name, ward_id=EXCLUDED.ward_id, precinct_id=EXCLUDED.precinct_id, precinct_name=EXCLUDED.precinct_name, state_senate_id=EXCLUDED.state_senate_id, state_house_id=EXCLUDED.state_house_id, county_commissioner_id=EXCLUDED.county_commissioner_id, district_court_id=EXCLUDED.district_court_id, soil_water_id=EXCLUDED.soil_water_id, school_district_id=EXCLUDED.school_district_id, school_district_name=EXCLUDED.school_district_name, mcd_id=EXCLUDED.mcd_id, precincts=EXCLUDED.precincts, name=EXCLUDED.name, updated=EXCLUDED.updated]
[parameters: {'area_id': 'counties-01', 'areas_group': 'counties', 'county_id': '01', 'county_name': 'Aitkin', 'ward_id': None, 'precinct_id': None, 'precinct_name': '', 'state_senate_id': None, 'state_house_id': None, 'county_commissioner_id': None, 'district_court_id': None, 'soil_water_id': None, 'school_district_id': None, 'school_district_name': '', 'mcd_id': None, 'precincts': '54', 'name': ''}]
(Background on this error at: https://sqlalche.me/e/14/f405)
I'm hoping I didn't paste too much to be helpful there.
- I also found this answer that I read as creating its own insert statement instead of compiling the built in one. Here's what I changed. In the blueprint's imports:
from sqlalchemy.dialects.postgresql import insert
And in the blueprint's loop:
for i in sources[election]:
source = sources[election][i]
if 'type' in source and source['type'] == 'areas':
rows = area.parse_election(source, election_meta)
count = 0
for row in rows:
parsed = area.parser(row, i)
area = Area()
area.from_dict(parsed, new=True)
stmt = insert(Area.__table__).values(parsed)
stmt = stmt.on_conflict_do_update(
# Let's use the constraint name which was visible in the original posts error msg
constraint="['area_id']",
# The columns that should be updated on conflict
set_={
parsed
}
)
db.session.execute(stmt)
count = count + 1
return count
It results in a different error:
TypeError: unhashable type: 'dict'
All that to say, I'm currently at a loss. It's clear to me that I need to modify the INSERT statement, but it's not clear to me which route I should take to modify it, how to make sure that it matches on the correct field (which is called area_id
and the key is called areas_id_unique
), or how to make sure it updates the correct fields when it does find a match.