I am not sure what is happening under the hood with regards to the Python object model for the code below.
You can download the data for the ctabus.csv file from this link
import csv
def read_as_dicts(filename):
records = []
with open(filename) as f:
rows = csv.reader(f)
headers = next(rows)
for row in rows:
route = row[0]
date = row[1]
daytype = row[2]
rides = int(row[3])
records.append({
'route': route,
'date': date,
'daytype': daytype,
'rides': rides})
return records
# read data from csv
rows = read_as_dicts('ctabus.csv')
print(len(rows)) #736461
# record route ids (object ids)
route_ids = set()
for row in rows:
route_ids.add(id(row['route']))
print(len(route_ids)) #690072
# unique_routes
unique_routes = set()
for row in rows:
unique_routes.add(row['route'])
print(len(unique_routes)) #185
When I call print(len(route_ids))
it prints "690072"
. Why did Python end up creating these many objects?
I expect this count to be either 185 or 736461. 185 because, when I count the unique routes in set the length of that set comes out to be 185. 736461 because, this is the total number of records in csv file.
What is this weird number "690072"?
I am trying to understand why this partial caching? Why python can't perform a full caching something like below.
import csv
route_cache = {}
#some hack to cache
def cached_route(routename):
if routename not in route_cache:
route_cache[routename] = routename
return route_cache[routename]
def read_as_dicts(filename):
records = []
with open(filename) as f:
rows = csv.reader(f)
headers = next(rows)
for row in rows:
row[0] = cached_route(row[0]) #cache trick
route = row[0]
date = row[1]
daytype = row[2]
rides = int(row[3])
records.append({
'route': route,
'date': date,
'daytype': daytype,
'rides': rides})
return records
# read data from csv
rows = read_as_dicts('ctabus.csv')
print(len(rows)) #736461
# unique_routes
unique_routes = set()
for row in rows:
unique_routes.add(row['route'])
print(len(unique_routes)) #185
# record route ids (object ids)
route_ids = set()
for row in rows:
route_ids.add(id(row['route']))
print(len(route_ids)) #185