I'm trying to bulk load the data from the UK Companies House csv files into PostgreSQL using python.
I'm converting each row of data into a list of dicts and then using an unnest statement to unpack the data into a single high volume sql statement, here's a sample of what I'm doing, (there are many more fields in the source)...
def buildDict(row)
clean_name = row[0].decode('utf-8').upper()
country_code = lookups.getCountryCodeFromName(row[14])
if len(country_code) > 2:
country_code = None
insert_dict = {
'companyname': row[0],
'companynumber': row[1],
'regaddress_careof': row[2],
'regaddress_pobox': row[3],
'dissolutiondate': row[13],
}
# convert 'None' and '' strings to None
for k, v in six.iteritems(insert_dict):
insert_dict[k] = set_to_null(v)
def fastInsert(data):
sql='''
INSERT INTO uk_data.companies_house(
companyname,
companynumber,
regaddress_careof,
regaddress_pobox,
dissolutiondate
)
SELECT
unnest( %(companyname)s ),
unnest( %(companynumber)s ),
unnest( %(regaddress_careof)s ),
unnest( %(regaddress_pobox)s ),
unnest( %(dissolutiondate)s )
;
'''
companyname=[str(r['companyname']) for r in data]
companynumber=[str(r['companynumber']) for r in data]
regaddress_careof=[str(r['regaddress_careof']) for r in data]
regaddress_pobox=[str(r['regaddress_pobox']) for r in data]
dissolutiondate=[datetime.strptime(r['dissolutiondate'], "%d/%m/%Y") if r['dissolutiondate'] else None for r in data]
execute(sql,locals())
def execute(sql,params={}):
with connect() as connection:
with connection.cursor() as cursor:
if params:
cursor.execute(sql,params)
else:
cursor.execute(sql)
This code works fine as long as everything is cast to a string, but when I try and cast the data to dates I get the following error every time the a date record has no value (NB this value been set to None
by the conditional, so should load into PostgreSQL).
Error could not determine polymorphic type because input has type "unknown"
I've tried casting the type to ::DATE
in the unnest statement as follows:
sql='''
INSERT INTO uk_data.companies_house(
companyname,
companynumber,
regaddress_careof,
regaddress_pobox,
dissolutiondate
)
SELECT
unnest( %(companyname)s ),
unnest( %(companynumber)s ),
unnest( %(regaddress_careof)s ),
unnest( %(regaddress_pobox)s ),
unnest( %(dissolutiondate)s )::DATE
;
'''
but this doesn't help. A print of my locals shows the following for a single record:
('these are my locals: ', {'regaddress_posttown': ['LEEDS'], 'regaddress_addressline1': ['METROHOUSE 57 PEPPER ROAD'], 'regaddress_addressline2': ['HUNSLET'], 'regaddress_careof': ['None'], 'companystatus': ['Active'], 'companycategory': ['Private Limited Company'], 'companyname': ['! LTD'], 'countryoforigin': ['None'], 'regaddress_pobox': ['None'], 'regaddress_country': ['None'], 'dissolutiondate': None, 'regaddress_postcode': ['LS10 2RU'], 'regaddress_county': ['YORKSHIRE'], 'sql': '
INSERT INTO uk_data.companies_house(
companyname,
companynumber,
regaddress_careof,
regaddress_pobox,
regaddress_addressline1,
regaddress_addressline2,
regaddress_posttown,
regaddress_county,
regaddress_country,
regaddress_postcode,
companycategory,
companystatus,
countryoforigin,
dissolutiondate
)
SELECT
unnest( %(companyname)s ),
unnest( %(companynumber)s ),
unnest( %(regaddress_careof)s ),
unnest( %(regaddress_pobox)s ),
unnest( %(regaddress_addressline1)s ),
unnest( %(regaddress_addressline2)s ),
unnest( %(regaddress_posttown)s ),
unnest( %(regaddress_county)s ),
unnest( %(regaddress_country)s ),
unnest( %(regaddress_postcode)s ),
unnest( %(companycategory)s ),
unnest( %(companystatus)s ),
unnest( %(countryoforigin)s ),
unnest( %(dissolutiondate)s )
;
', 'r': {'regaddress_posttown': 'LEEDS', 'regaddress_careof': None, 'companystatus': 'Active', 'companynumber': '08209948', 'regaddress_addressline1': 'METROHOUSE 57 PEPPER ROAD', 'regaddress_addressline2': 'HUNSLET', 'companycategory': 'Private Limited Company', 'companyname': '! LTD', 'countryoforigin': None, 'regaddress_pobox': None, 'regaddress_country': None, 'dissolutiondate': None, 'regaddress_postcode': 'LS10 2RU', 'regaddress_county': 'YORKSHIRE'}, 'data': [{'regaddress_posttown': 'LEEDS', 'regaddress_careof': None, 'companystatus': 'Active', 'companynumber': '08209948', 'regaddress_addressline1': 'METROHOUSE 57 PEPPER ROAD', 'regaddress_addressline2': 'HUNSLET', 'companycategory': 'Private Limited Company', 'companyname': '! LTD', 'countryoforigin': None, 'regaddress_pobox': None, 'regaddress_country': None, 'dissolutiondate': None, 'regaddress_postcode': 'LS10 2RU', 'regaddress_county': 'YORKSHIRE'}], 'companynumber': ['08209948']})
I'm not sure that this is relevant, but I've noticed that the local variables, once taken out of the dict, are all placed in a list as so: ['None']
but the date variable that causes the problems (dissolutiondate
) is given as a true None
value.