I have 1000 dictionaries (grid_1, grid_2, ... grid_1000) stored as pickle objects (generated by a previous process) and one reference dictionary. I need to compare each of the pickled dictionaries to the reference and then combine the results.
The input dictionaries might look like:
grid_1 = {'143741': {'467457':1,'501089':2,'903718':1,'999216':5,'1040952':2},'281092':{'1434': 67,'3345': 345}, '33123': {'4566':5,'56788':45}}
grid_2 = {'143741': {'467457':5,'501089':7,'1040952':9},'281092':{'1434': 67,'3345': 20}, '33123': {'4566':7,'56788':38}}
and the reference dictionary might look like:
grid_density_original = {'143741': {'467457':1,'501089':2,'903718':1,'999216':5,'9990':4},'281092':{'1434': 60,'3345': 3,'9991': 43}, '33123': {'56788':4}}
In the first step, we should intersect the individual grid_n dicts like:
# intersection of grid_1 and grid_density_original
assert intersect_1 == {'143741': {'467457':1,'501089':2,'903718':1,'999216':5},'281092':{'1434': 67,'3345': 345}, '33123': {'56788':45}}
# intersection of grid_2 and grid_density_original
assert intersect_2 == {'143741': {'467457':5,'501089':7},'281092':{'1434': 67,'3345': 20}, '33123': {'56788':38}}
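For clarity, the intersection step behind these asserts can be expressed as a small helper. This is only a minimal sketch of the intended semantics; the name intersect_with_reference is illustrative and not part of my code:

def intersect_with_reference(grid, reference):
    # For every outer key of the reference, keep the inner keys that also
    # appear in the grid, taking the density values from the grid.
    result = {}
    for outer_key, ref_inner in reference.items():
        grid_inner = grid.get(outer_key, {})
        result[outer_key] = {k: grid_inner[k] for k in grid_inner.keys() & ref_inner.keys()}
    return result

# e.g. intersect_1 = intersect_with_reference(grid_1, grid_density_original)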
Then these results should be combined, as follows:
assert combine12 == {'143741': {'467457':[1,5],'501089':[2,7],'903718':[1,99999],'999216':[5,99999]},'281092':{'1434': [67,67],'3345': [345,20]}, '33123': {'56788':[45,38]}}
This appears to be the slow part, as the inner list size increases each time a new intersect_n is added.
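Conceptually, the combine step appends each new intersect_n onto lists held in an accumulator, using 99999 as the placeholder when a key is missing from the current intersect. The sketch below only illustrates that semantics with in-place appends; combine_into, step and missing are illustrative names, not part of my actual code:

def combine_into(accumulator, new_intersect, step, missing=99999):
    # accumulator maps outer_key -> inner_key -> list of densities seen so far;
    # new_intersect maps outer_key -> inner_key -> a single density.
    # step is the number of intersects already combined, so a key first seen
    # now is padded with the placeholder for the earlier steps.
    for outer_key in accumulator.keys() | new_intersect.keys():
        acc_inner = accumulator.setdefault(outer_key, {})
        new_inner = new_intersect.get(outer_key, {})
        for inner_key in acc_inner.keys() | new_inner.keys():
            acc_inner.setdefault(inner_key, [missing] * step).append(new_inner.get(inner_key, missing))
    return accumulator

# e.g.
# combined = {}
# for i, intersect_n in enumerate([intersect_1, intersect_2]):
#     combine_into(combined, intersect_n, step=i)
# afterwards combined matches combine12 from the assert above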
This is the code I have currently. My actual dictionaries have on the order of 10,000 keys, and this code takes about 4 days to run.
from collections import defaultdict, Counter
import pickle
import gc
import copy
import scipy.stats as st

# grid_density_orignal is the original nested dictionary we compare each of the 1000 grids to:
with open('path\grid_density_original_intercountry.pickle','rb') as handle:
    grid_density_orignal = pickle.load(handle, encoding='latin-1')

# A previous process generated 1000 grids and dumped them as .pickle files: grid_1, grid_2 ... grid_1000
for iteration in range(1, 1001):
    # load each grid (grid_1, grid_2 ... grid_1000) into memory sequentially
    filename = 'path\grid_%s' % iteration
    with open(filename, 'rb') as handle:
        globals()['dictt_%s' % iteration] = pickle.load(handle, encoding='latin-1')
    # Counter to store grid-grid densities: same dictionary structure as grid_density_orignal
    globals()['g_den_%s' % iteration] = defaultdict(list)
    for k, v in globals()['dictt_%s' % iteration].items():
        globals()['g_den_%s' % iteration][k] = Counter(v)
    # here we find the common grid-grid connections between grid_density_orignal and each of the 1000 grids
    globals()['intersect_%s' % iteration] = defaultdict(list)
    for k, v in grid_density_orignal.items():
        pergrid = defaultdict(list)
        common_grid_ids = v.keys() & globals()['g_den_%s' % iteration][k].keys()
        for gridid in common_grid_ids:
            pergrid[gridid] = globals()['g_den_%s' % iteration][k][gridid]
        globals()['intersect_%s' % iteration][k] = pergrid

print('All 1000 grids intersection done')

# From the previous code we now have 1000 intersection grids: intersect_1, intersect_2 ... intersect_1000
for iteration in range(1, 1000):
    itnext = iteration + 1  # to get the next intersect out of the 1000
    # dictionary to store the intermediate combine-step results between two intersects: intersect_x and intersect_x+1
    globals()['combine_%s%s' % (iteration, itnext)] = defaultdict(list)
    for k, v in globals()['intersect_%s' % iteration].items():
        innr = []
        combine = defaultdict(list)
        # iterate over the union of keys of intersect_x and intersect_x+1
        for key in set(list(globals()['intersect_%s' % iteration][k].keys()) + list(globals()['intersect_%s' % itnext][k].keys())):
            # get the key's value if it exists; if a grid doesn't exist in intersect_x or intersect_x+1 we use 99999 as a placeholder.
            # Also check whether the value is an int or a list: in the initial step it is an int, but after combining two intersects we get lists.
            if (isinstance(globals()['intersect_%s' % iteration][k].get(key, 99999), int) and isinstance(globals()['intersect_%s' % itnext][k].get(key, 99999), int)):
                # combine intersect_x and intersect_x+1 into a list
                combine[key] = [globals()['intersect_%s' % iteration][k].get(key, 99999)] + [globals()['intersect_%s' % itnext][k].get(key, 99999)]
            if (isinstance(globals()['intersect_%s' % iteration][k].get(key, 99999), list) and isinstance(globals()['intersect_%s' % itnext][k].get(key, 99999), int)):
                # this condition is reached after the initial step of combining intersect_1 and intersect_2
                combine[key] = globals()['intersect_%s' % iteration][k].get(key, 99999) + [globals()['intersect_%s' % itnext][k].get(key, 99999)]
        globals()['combine_%s%s' % (iteration, itnext)][k] = combine
    # copy the combine dict onto the next intersect dict so we can continue combining it in the next iteration
    globals()['intersect_%s' % itnext] = copy.copy(globals()['combine_%s%s' % (iteration, itnext)])
    print('2 combined: ', iteration, itnext)
    # delete the older intersect and combine dicts as we don't need them and they may cause memory overflow
    del globals()['intersect_%s' % iteration]
    del globals()['combine_%s%s' % (iteration, itnext)]
    gc.collect()  # explicitly call the garbage collector as the dicts are too big for RAM

# at the end we have intersect_1000, which is a dict with all grid ids as keys and a list of densities
# (list size is 1000, corresponding to the 1000 grids)
print('intersection and combine done')
How can I improve the performance of the code?