0

I have a Flask app that parses a CSV of public election data and inserts the results into a Postgres database. It's a port of an old, not-Flask, Python 2 app that uses various libraries that no longer work. I'm mostly trying to base the application's structure on this tutorial. I've been using Flask-SQLAlchemy to construct some models for the database tables and populate the data from the CSV.

In this case I'm working with an Area model, which corresponds to a geographic area that might have an election (house district, school board district, etc). Here's what I've got in my basic blueprint route:

election = None

@bp.route('/areas')
def scrape_areas():
    """
    Scrape the area CSVs for the newest election and insert one row per
    parsed area. Returns the number of inserted rows as a string.
    """
    # Keep the scraper driver separate from the per-row model instances;
    # the original rebound `area` inside the loop, clobbering the object
    # whose parse_election/parser methods drive the iteration.
    scraper = Area()
    sources = scraper.read_sources()
    election = scraper.set_election()

    if election not in sources:
        # A Flask view must not return None; report the miss instead.
        return 'Election %s not found in sources' % election

    # Get metadata about election (e.g. base_url for relative CSV paths)
    election_meta = sources[election].get('meta', {})

    # Initialize before the loop so the final return works even when no
    # source of type 'areas' exists (original left `count` unbound).
    count = 0

    for group_name, source in sources[election].items():
        if source.get('type') != 'areas':
            continue

        rows = scraper.parse_election(source, election_meta)

        for row in rows:
            parsed = scraper.parser(row, group_name)

            area = Area()
            area.from_dict(parsed, new=True)
            # this shows the generated string of area_id
            # which is a UNIQUE key in the database
            print(area)

            db.session.add(area)
            count = count + 1

        # One commit per source instead of one per row.
        # NOTE(review): duplicate area_id rows in the CSV will still raise
        # IntegrityError here until the insert is made an upsert.
        db.session.commit()

    # Flask cannot serialize a bare int as a response body.
    return str(count)

And here's the models.py:

import logging
import os
import json
import re
import csv
import urllib.request

import calendar
import datetime
from flask import current_app
from app import db

LOG = logging.getLogger(__name__)
scraper_sources_inline = None

class ScraperModel(object):
    """
    Base class for scraper-backed models.

    Shared behaviour for all scraped tables: locating the JSON source
    manifest, picking which election to scrape, fetching and parsing the
    remote CSV, and copying parsed dicts onto model attributes.
    """

    nonpartisan_parties = ['NP', 'WI', 'N P']

    def __init__(self, group_type=None):
        """
        Constructor.

        :param group_type: unused here; kept for subclass compatibility.
        """
        # this is where scraperwiki was creating and connecting to its database
        # we do this in the imported sql file instead
        self.read_sources()

    def read_sources(self):
        """
        Read the scraper_sources.json file into self.sources.

        Prefers the module-level scraper_sources_inline override when set;
        otherwise loads the JSON file relative to the app root.
        """
        if scraper_sources_inline is not None:
            self.sources = json.loads(scraper_sources_inline)
        else:
            #sources_file = current_app.config['SOURCES_FILE']
            sources_file = os.path.join(current_app.root_path, '../scraper_sources.json')
            # Use a context manager so the file handle is closed
            # (the original opened the file and never closed it).
            with open(sources_file) as data:
                self.sources = json.load(data)

        return self.sources

    def set_election(self):
        """
        Return the newest election key in self.sources, as a string.
        """
        # Keys are year-like numeric strings; the largest integer wins.
        # The original also carried a dead "allow for other situations"
        # branch that could never select anything but the newest.
        newest = max((int(s) for s in self.sources), default=0)
        return str(newest)

    def parse_election(self, source, election_meta=None):
        """
        Fetch source['url'] and return a csv.reader over its rows.

        :param source: dict with at least 'url' and 'type' keys.
        :param election_meta: optional dict; when it has 'base_url' that is
            prepended to the source URL. (Was a mutable default argument.)
        :return: csv.reader of latin-1 decoded, semicolon-delimited rows,
            or None when the subclass defines no parser.
        :raises: re-raises any URL/CSV error after logging it.
        """
        if election_meta is None:
            election_meta = {}

        # Guard clause: only fetch when this model can parse the rows.
        parser_method = getattr(self, "parser", None)
        if not callable(parser_method):
            return None

        # Check if election has base_url
        if 'base_url' in election_meta:
            source['url'] = election_meta['base_url'] + source['url']

        # Get data from URL
        try:
            response = urllib.request.urlopen(source['url'])
            lines = [l.decode('latin-1') for l in response.readlines()]
            return csv.reader(lines, delimiter=';')
        except Exception:
            LOG.exception('[%s] Error when trying to read URL and parse CSV: %s' % (source['type'], source['url']))
            raise

    def from_dict(self, data, new=False):
        """
        Copy each key/value pair in `data` onto self as attributes.

        :param new: currently unused; kept for call-site compatibility.
        """
        for field in data:
            setattr(self, field, data[field])


class Area(ScraperModel, db.Model):
    """
    A geographic area that can hold an election: a county, precinct,
    municipality, or school district, keyed by the generated area_id.
    """

    __tablename__ = "areas"

    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    area_id = db.Column(db.String(255), unique=True, nullable=False)
    areas_group = db.Column(db.String(255))
    county_id = db.Column(db.String(255))
    county_name = db.Column(db.String(255))
    ward_id = db.Column(db.String(255))
    precinct_id = db.Column(db.String(255))
    precinct_name = db.Column(db.String(255))
    state_senate_id = db.Column(db.String(255))
    state_house_id = db.Column(db.String(255))
    county_commissioner_id = db.Column(db.String(255))
    district_court_id = db.Column(db.String(255))
    soil_water_id = db.Column(db.String(255))
    school_district_id = db.Column(db.String(255))
    school_district_name = db.Column(db.String(255))
    mcd_id = db.Column(db.String(255))
    precincts = db.Column(db.String(255))
    name = db.Column(db.String(255))
    updated = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp())

    def __repr__(self):
        return f'<Area {self.area_id}>'

    def parser(self, row, group):
        """
        Translate one CSV row into a dict of Area column values for the
        given area group.
        """
        # Columns that default to None; the rest default to strings below.
        nullable_fields = (
            'county_id', 'county_name', 'ward_id', 'precinct_id',
            'state_senate_id', 'state_house_id', 'county_commissioner_id',
            'district_court_id', 'soil_water_id', 'school_district_id',
            'mcd_id', 'precincts',
        )
        parsed = {field: None for field in nullable_fields}
        parsed.update(
            area_id=group + '-',
            areas_group=group,
            precinct_name='',
            school_district_name='',
            name='',
        )

        # Each group's CSV has its own column layout.
        if group == 'municipalities':
            parsed['area_id'] += row[0] + '-' + row[2]
            parsed['county_id'] = row[0]
            parsed['county_name'] = row[1]
            parsed['mcd_id'] = "{0:05d}".format(int(row[2]))  # enforce 5 digit
            parsed['name'] = row[1]
        elif group == 'counties':
            parsed['area_id'] += row[0]
            parsed['county_id'] = row[0]
            parsed['county_name'] = row[1]
            parsed['precincts'] = row[2]
        elif group == 'precincts':
            parsed['area_id'] += row[0] + '-' + row[1]
            parsed['county_id'] = row[0]
            parsed['precinct_id'] = row[1]
            parsed['precinct_name'] = row[2]
            parsed['state_senate_id'] = row[3]
            parsed['state_house_id'] = row[4]
            parsed['county_commissioner_id'] = row[5]
            parsed['district_court_id'] = row[6]
            parsed['soil_water_id'] = row[7]
            parsed['mcd_id'] = row[8]
        elif group == 'school_districts':
            parsed['area_id'] += row[0]
            parsed['school_district_id'] = row[0]
            parsed['school_district_name'] = row[1]
            parsed['county_id'] = row[2]
            parsed['county_name'] = row[3]

        return parsed

So Areas is an extension of my default model class because it allows me to set up the fields that are specific to a given area based on the CSV.

Where this code fails is a (relatively) rare case in the CSV data where there might be multiple rows that, in the old application, correspond to the same row in the table. That old application had an array (usually with just one item, representing a UNIQUE column in the database) to instruct the code to run an UPDATE on those rows.

It returns an error like this:

sqlalchemy.exc.IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "areas_area_id_key"
DETAIL:  Key (area_id)=(counties-01) already exists.

An example of how it runs when I'm just logging my UNIQUE key value from the model instead of inserting it:

<Area precincts-87-0140>
<Area precincts-87-0145>
<Area precincts-87-0150>
<Area precincts-87-0155>
<Area precincts-87-0160>
<Area precincts-87-0165>
<Area school_districts-0001>
<Area school_districts-0001>
<Area school_districts-0001>
<Area school_districts-0002>
<Area school_districts-0004>
<Area school_districts-0006>
<Area school_districts-0012>
<Area school_districts-0013>
<Area school_districts-0014>

So I've been looking at different methods that Flask can use to run an UPSERT statement because I'd need to update all of the fields, and they'd be different based on both which type of area it is, and also in the other models (election contests or results, for example). Most of what I'm finding uses SQLAlchemy rather than Flask-SQLAlchemy.

  1. I found this answer that looked promising. Here's what I added to my model:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert

Then I modified the ScraperModel class like this:

class ScraperModel(object):

    # NOTE(review): @compiles(Insert) registers globally - it rewrites EVERY
    # INSERT the app compiles, not only this model's, even though it is
    # declared inside the class body. Confirm that scope is intended.
    @compiles(Insert)
    def compile_upsert(insert_stmt, compiler, **kwargs):
        """
        converts every SQL insert to an upsert  i.e;
        INSERT INTO test (foo, bar) VALUES (1, 'a')
        becomes:
        INSERT INTO test (foo, bar) VALUES (1, 'a') ON CONFLICT(foo) DO UPDATE SET (bar = EXCLUDED.bar)
        (assuming foo is a primary key)
        :param insert_stmt: Original insert statement
        :param compiler: SQL Compiler
        :param kwargs: optional arguments
        :return: upsert statement
        """
        pk = insert_stmt.table.primary_key
        # NOTE(review): the compiled string already ends with
        # "RETURNING areas.id" (visible in the traceback), so appending
        # ON CONFLICT after it puts the clause on the wrong side of
        # RETURNING - that is the "syntax error at or near ON".
        insert = compiler.visit_insert(insert_stmt, **kwargs)
        # NOTE(review): this targets the table's primary key (id), but the
        # duplicate violation is on the UNIQUE area_id column, so even a
        # syntactically valid statement would not resolve that conflict.
        ondup = f'ON CONFLICT ({",".join(c.name for c in pk)}) DO UPDATE SET'
        updates = ', '.join(f"{c.name}=EXCLUDED.{c.name}" for c in insert_stmt.table.columns)
        upsert = ' '.join((insert, ondup, updates))
        return upsert

I'm clearly misunderstanding how the insert_stmt works because of how the query comes out, but here's the error that it generates:

sqlalchemy.exc.ProgrammingError: (psycopg2.errors.SyntaxError) syntax error at or near "ON"
LINE 1: ..., '54', '', CURRENT_TIMESTAMP) RETURNING areas.id ON CONFLIC...
                                                             ^

[SQL: INSERT INTO areas (area_id, areas_group, county_id, county_name, ward_id, precinct_id, precinct_name, state_senate_id, state_house_id, county_commissioner_id, district_court_id, soil_water_id, school_district_id, school_district_name, mcd_id, precincts, name, updated) VALUES (%(area_id)s, %(areas_group)s, %(county_id)s, %(county_name)s, %(ward_id)s, %(precinct_id)s, %(precinct_name)s, %(state_senate_id)s, %(state_house_id)s, %(county_commissioner_id)s, %(district_court_id)s, %(soil_water_id)s, %(school_district_id)s, %(school_district_name)s, %(mcd_id)s, %(precincts)s, %(name)s, CURRENT_TIMESTAMP) RETURNING areas.id ON CONFLICT (id) DO UPDATE SET id=EXCLUDED.id, area_id=EXCLUDED.area_id, areas_group=EXCLUDED.areas_group, county_id=EXCLUDED.county_id, county_name=EXCLUDED.county_name, ward_id=EXCLUDED.ward_id, precinct_id=EXCLUDED.precinct_id, precinct_name=EXCLUDED.precinct_name, state_senate_id=EXCLUDED.state_senate_id, state_house_id=EXCLUDED.state_house_id, county_commissioner_id=EXCLUDED.county_commissioner_id, district_court_id=EXCLUDED.district_court_id, soil_water_id=EXCLUDED.soil_water_id, school_district_id=EXCLUDED.school_district_id, school_district_name=EXCLUDED.school_district_name, mcd_id=EXCLUDED.mcd_id, precincts=EXCLUDED.precincts, name=EXCLUDED.name, updated=EXCLUDED.updated]
[parameters: {'area_id': 'counties-01', 'areas_group': 'counties', 'county_id': '01', 'county_name': 'Aitkin', 'ward_id': None, 'precinct_id': None, 'precinct_name': '', 'state_senate_id': None, 'state_house_id': None, 'county_commissioner_id': None, 'district_court_id': None, 'soil_water_id': None, 'school_district_id': None, 'school_district_name': '', 'mcd_id': None, 'precincts': '54', 'name': ''}]
(Background on this error at: https://sqlalche.me/e/14/f405)

I'm hoping I didn't paste too much to be helpful there.

  1. I also found this answer that I read as creating its own insert statement instead of compiling the built in one. Here's what I changed. In the blueprint's imports:
from sqlalchemy.dialects.postgresql import insert

And in the blueprint's loop:

for i in sources[election]:
        source = sources[election][i]

        if 'type' in source and source['type'] == 'areas':

            rows = area.parse_election(source, election_meta)

            count = 0

            for row in rows:
                parsed = area.parser(row, i)

                area = Area()
                area.from_dict(parsed, new=True)

                stmt = insert(Area.__table__).values(parsed)
                stmt = stmt.on_conflict_do_update(
                    # Let's use the constraint name which was visible in the original posts error msg
                    # NOTE(review): constraint= expects the real constraint
                    # name from the error message ("areas_area_id_key"), not
                    # the literal string "['area_id']". Alternatively use
                    # index_elements=['area_id'] to target the column.
                    constraint="['area_id']",

                    # The columns that should be updated on conflict
                    # NOTE(review): {parsed} builds a one-element SET holding
                    # the dict; a dict is unhashable, hence
                    # "TypeError: unhashable type: 'dict'". set_ wants the
                    # mapping itself: set_=parsed.
                    set_={
                        parsed
                    }
                )
                db.session.execute(stmt)
                # NOTE(review): nothing commits these executed statements -
                # a db.session.commit() is missing from this loop.
                count = count + 1

    return count

It results in a different error:

TypeError: unhashable type: 'dict'

highlighted error message in the Flask URL


All that to say, I'm currently at a loss. It's clear to me that I need to modify the INSERT statement, but it's not clear to me which route I should take to modify it, how to make sure that it matches on the correct field (which is called area_id and the key is called areas_id_unique), or how to make sure it updates the correct fields when it does find a match.

Jonathan Stegall
  • 530
  • 1
  • 6
  • 23
  • In the second variation, `parsed` is already a `dict`, so you don't need to wrap it in curly brackets: `set_=parsed` should work. "Most of what I'm finding uses SQLAlchemy rather than Flask-SQLAlchemy": flask-sqlalchemy is a fairly thin wrapper around SQLAlchemy itself: generally you can reference things as `db.sqlalchemy_thing` instead of `sqlalchemy.sqlalchemy_thing` and it will work. – snakecharmerb Oct 08 '21 at 06:25
  • It doesn't work to use `set_=parsed`; it has the same error. – Jonathan Stegall Oct 08 '21 at 13:06

1 Answer

1

What I think I'm finding is that none of this would work because I wasn't matching on a primary key, but on a unique key. What I've done is change the unique key area_id to a primary key. Then, I can use the upsert statement from above.

# NOTE(review): with area_id promoted to the table's primary key, the
# generated "ON CONFLICT (area_id)" target finally matches the column that
# actually collides, which is why this now behaves for the author.
# Still blunt: @compiles(Insert) turns EVERY insert in the app into an
# upsert, and the clause is concatenated after the compiled statement -
# confirm the insert paths used here emit no RETURNING clause, since
# ON CONFLICT must precede RETURNING in Postgres.
@compiles(Insert)
    def compile_upsert(insert_stmt, compiler, **kwargs):
        """
        converts every SQL insert to an upsert  i.e;
        INSERT INTO test (foo, bar) VALUES (1, 'a')
        becomes:
        INSERT INTO test (foo, bar) VALUES (1, 'a') ON CONFLICT(foo) DO UPDATE SET (bar = EXCLUDED.bar)
        (assuming foo is a primary key)
        :param insert_stmt: Original insert statement
        :param compiler: SQL Compiler
        :param kwargs: optional arguments
        :return: upsert statement
        """
        pk = insert_stmt.table.primary_key
        insert = compiler.visit_insert(insert_stmt, **kwargs)
        ondup = f'ON CONFLICT ({",".join(c.name for c in pk)}) DO UPDATE SET'
        updates = ', '.join(f"{c.name}=EXCLUDED.{c.name}" for c in insert_stmt.table.columns)
        upsert = ' '.join((insert, ondup, updates))
        return upsert

I had been trying to change the pk = insert_stmt.table.primary_key line to check for the unique key instead, with no success; but once area_id is made the primary key, the code works exactly as written above.

Changing the primary key also fixed the other solution I was trying:

def upsert_rows(session, model, rows):
    """
    Bulk-insert `rows` (dicts of column values) into `model`'s table,
    updating every non-primary-key column when a row conflicts on the
    primary key, then commit.

    Renamed from `insert`: the original `def insert(session, model, rows)`
    shadowed the imported sqlalchemy `insert()` construct it needed, so its
    first line called itself with the wrong number of arguments.

    Requires: from sqlalchemy.dialects.postgresql import insert
              from sqlalchemy import inspect
    """
    table = model.__table__
    # Attach the rows - the original built insert(table) with no values
    # and never used its `rows` parameter.
    stmt = insert(table).values(rows)
    primary_keys = [key.name for key in inspect(table).primary_key]
    # EXCLUDED carries the values this INSERT tried to write.
    update_dict = {c.name: c for c in stmt.excluded if not c.primary_key}

    if not update_dict:
        raise ValueError("insert_or_update resulted in an empty update_dict")

    stmt = stmt.on_conflict_do_update(
        index_elements=primary_keys,
        set_=update_dict
    )
    # The original never executed or committed the statement.
    session.execute(stmt)
    session.commit()


group = []
for row in rows:
    parsed = area.parser(row, i)
    # Collect plain dicts: insert().values() takes column mappings, not
    # ORM instances (the per-row Area() objects were never used anyway).
    group.append(parsed)

upsert_rows(db.session, Area, group)

So both solutions were (relatively) workable, but only with a primary key instead of a unique key and that just hadn't been clear to me.

Jonathan Stegall
  • 530
  • 1
  • 6
  • 23