I've a list which has approximately 177071007 items. and i'm trying to perform the following operations a) get the first and last occurance of a unique item in the list. b) the number of occurances.
def parse_data(file, op_file_test):
ins = csv.reader(open(file, 'rb'), delimiter = '\t')
pc = list()
rd = list()
deltas = list()
reoccurance = list()
try:
for row in ins:
pc.append(int(row[0]))
rd.append(int(row[1]))
except:
print row
pass
unique_pc = set(pc)
unique_pc = list(unique_pc)
print "closing file"
#takes a long time from here!
for a in range(0, len(unique_pc)):
index_first_occurance = pc.index(unique_pc[a])
index_last_occurance = len(pc) - 1 - pc[::-1].index(unique_pc[a])
delta_rd = rd[index_last_occurance] - rd[index_first_occurance]
deltas.append(int(delta_rd))
reoccurance.append(pc.count(unique_pc[a]))
print unique_pc[a] , delta_rd, reoccurance[a]
print "printing to file"
map_file = open(op_file_test,'a')
for a in range(0, len(unique_pc)):
print >>map_file, "%d, %d, %d" % (unique_pc[a], deltas[a], reoccurance)
map_file.close()
However the complexity is in the order of O(n). Would there be a possibility to make the for loop 'run fast', by that i mean, do you think yielding would make it fast? or is there any other way? unfortunately, i don't have numpy