I'm trying to replicate a Perl script in Python, and I am running into some major performance problems.
Essentially, I have a tuple of lists and process one list at a time. The list includes a TYPE (string), ID (string), OVERALL_COUNT (int), TYPE_ID_COUNT (int), and a DROP_KEEP_FLAG (string).
My goal is to read the data into a memory efficient data structure that is fast to access. I want to trim the structure every time I process a new record with the trimming criteria being that no TYPE_ID_COUNT can be included if they are more than 2 being the current TYPE_ID_COUNT (i.e., remove if historical TYPE_ID_COUNT < current TYPE_ID_COUNT - 2)
When I compare this to my Perl code, it is orders of magnitude slower. I have included 2 versions (I replace the dict.keys() with dict). I'm relatively new to Python so I'm sure there are more optimal ways for my code to be written. Would Numpy arrays be better?
import timeit
from collections import defaultdict
from copy import deepcopy
# FIELDS: TYPE, ID, OVERALL_COUNT, TYPE_ID_COUNT, DROP_KEEP_FLAG
a = (['TYPE_1','000000001',1,1,'K'],['TYPE_2','000000002',2,1,'K'],['TYPE_3','000000001',3,1,'K'],
['TYPE_1','000000002',4,1,'K'],['TYPE_1','000000002',5,2,'K'],['TYPE_3','000000002',6,1,'K'],
['TYPE_1','000000002',7,3,'K'],['TYPE_1','000000002',8,4,'D'],['TYPE_1','000000002',9,5,'K'],
['TYPE_1','000000001',10,2,'K'],['TYPE_2','000000001',11,1,'K'],['TYPE_2','000000001',12,2,'K'],
['TYPE_2','000000001',13,3,'K'],['TYPE_3','000000001',14,2,'K'],['TYPE_3','000000001',15,3,'K'],
['TYPE_3','000000002',16,2,'K'],['TYPE_3','000000002',17,3,'K'],['TYPE_3','000000002',18,4,'K'])
expand = a
for x in range(0, 250):
expand = expand + a
window = 2
def version_1(data,window):
result = defaultdict(lambda: defaultdict(list))
output = {}
for i_idx,i_val in enumerate(data,start=1):
#ADD NEW ELEMENT IF IT IS A 'K'
if i_val[4] == 'K':
result[i_val[0]][i_val[1]].append(i_val[2])
# TRIM OLD ELEMENTS AND COMPUTE LENGTHS
for j_idx, j_key in enumerate(result.keys(),start=1):
j_val = result.get(j_key)
j_val_cp = deepcopy(j_val)
output[j_key] = 0
for k_idx, k_key in enumerate(j_val_cp.keys(),start=1):
k_val = j_val.get(k_key)
for item in (x for x in k_val if x < i_val[2] - window):
k_val.remove(item)
if not k_val:
del j_val[k_key]
if k_key == i_val[1]:
output[j_key] = len(k_val)
#print('Output ' + str(i_idx) + ': ID: ' + i_val[1] + ' , values: ' + str(output.items()))
return output
def version_2(data,window):
result = defaultdict(lambda: defaultdict(list))
output = {}
for i_idx,i_val in enumerate(data,start=1):
#ADD NEW ELEMENT IF IT IS A 'K'
if i_val[4] == 'K':
result[i_val[0]][i_val[1]].append(i_val[2])
# TRIM OLD ELEMENTS AND COMPUTE LENGTHS
for j_idx, j_key in enumerate(result,start=1):
j_val = result.get(j_key)
j_val_cp = deepcopy(j_val)
output[j_key] = 0
for k_idx, k_key in enumerate(j_val_cp,start=1):
k_val = j_val.get(k_key)
for item in (x for x in k_val if x < i_val[2] - window):
k_val.remove(item)
if not k_val:
del j_val[k_key]
if k_key == i_val[1]:
output[j_key] = len(k_val)
#print('Output ' + str(i_idx) + ': ID: ' + i_val[1] + ' , values: ' + str(output.items()))
return output
# timeit.timeit(version_1(2))
start_time = timeit.default_timer()
version_1(expand,2)
print(timeit.default_timer() - start_time)
start_time = timeit.default_timer()
version_2(expand,2)
print(timeit.default_timer() - start_time)
Any help would be greatly appreciated!