As a follow-up to Large list generation optimization, I wanted to tackle the problem the other way around.
Given a list of strings in the form of:
seq = ['A.p[11]','A.p[1:10]','B.p[1:2]','B.p[2]','B.p[3]','B.p[0]','A.p[0]']
I want a function which would return a list of "merged" elements, with order preserved, like so:
merged = ['A.p[1:11]', 'B.p[0:3]', 'A.p[0]']
To achieve my goal I wrote this function:
def merge_seq(seq):
    ''' Merges a given sequence of strings.

    Each string consists of word[index] or word[bottom index:top index].
    Consecutive entries that share the same word and whose index ranges
    overlap or touch are collapsed into one word[bottom:top] entry, with
    the original order of first appearance preserved.

    ex: given ['A.p[11]','A.p[1:10]','B.p[1:2]','B.p[2]','B.p[3]','B.p[0]','A.p[0]']
    return ['A.p[1:11]','B.p[0:3]','A.p[0]']
    '''
    merged_list = []
    for item in seq:
        # First item? Add to list.
        if not merged_list:
            merged_list.append(item)
            continue
        previous = merged_list[-1]
        # Skip exact duplicates of the previous entry.
        if item == previous:
            continue
        cur = item.split('[')
        prev = previous.split('[')
        # Different word than the previous entry: start a new entry.
        if cur[0] != prev[0]:
            merged_list.append(item)
            continue
        # Parse '[b:t]' (or '[i]') into a 1- or 2-element int list.
        range0 = [int(x) for x in cur[-1][:-1].split(':')]
        range1 = [int(x) for x in prev[-1][:-1].split(':')]
        bottom = max(range0[0], range1[0])
        top = min(range0[-1], range1[-1])
        # The ranges overlap when bottom <= top, and are adjacent when
        # bottom == top + 1; in either case widen the previous entry
        # instead of appending a new one.
        if bottom <= top + 1:
            bottom = min(range0[0], range1[0])
            top = max(range0[-1], range1[-1])
            merged_list[-1] = '%s[%s:%s]' % (prev[0], bottom, top)
        # No overlap: add to list.
        else:
            merged_list.append(item)
    return merged_list
The problem with this function is that it gets very slow when dealing with large sequences.
UPDATE
After doing some research, I stumbled upon this answer to detect consecutive integers in a list. This inspired me to write a new function that leverages itertools and operator:
from itertools import groupby
from operator import itemgetter
import re
def merge_seq2(seq):
    ''' Merges a given sequence of strings.

    Each string consists of word[index] or word[bottom index:top index].
    The index values of consecutive entries sharing the same word are
    accumulated in a set, then one word[bottom:top] (or word[index])
    entry is emitted per run of consecutive integers.

    ex: given ['A.p[11]','A.p[1:10]','B.p[1:2]','B.p[2]','B.p[3]','B.p[0]','A.p[0]']
    becomes ['A.p[1:11]','B.p[0:3]','A.p[0]']
    '''
    pattern = re.compile(r"([0-9a-zA-Z._]+)\[(\d+)(?::(\d+))?\]")
    current = ''
    values = set()
    result = []

    def flush():
        # Emit one entry per run of consecutive integers in 'values'.
        # Grouping key (index - value) is constant within a consecutive run.
        for _, group in groupby(enumerate(sorted(values)), lambda p: p[0] - p[1]):
            run = [v for _, v in group]
            if len(run) == 1:
                result.append('%s[%s]' % (current, run[0]))
            else:
                result.append('%s[%s:%s]' % (current, run[0], run[-1]))

    for item in seq:
        m = pattern.match(item)
        name, start, end = m.group(1), int(m.group(2)), m.group(3)
        # Single index when no ':top' part was captured.
        rng = range(start, int(end) + 1) if end else (start,)
        # New word: flush what we accumulated, then start over.
        if name != current:
            if values:
                flush()
            current = name
            values = set(rng)
        # Same word: keep accumulating.
        else:
            values.update(rng)
    # Flush the trailing group and return.
    if values:
        flush()
    return result
Now let's compare the two functions, using this solution to generate large lists:
def flatten_seq(seq):
    ''' Expands word[bottom:top] entries into individual word[index] strings.

    ex: ['A[1:3]','B[2]'] -> ['A[1]','A[2]','A[3]','B[2]']

    Answered by JuniorCompressor:
    https://stackoverflow.com/questions/29089435/large-list-generation-optimization/29089675#29089675
    Faster than: ['%s[%s]'%(item.split('[')[0],i) for item in seq for i in xrange(int(item.split('[')[-1][:-1].split(':')[0]),int(item.split('[')[-1][:-1].split(':')[-1])+1)]
    '''
    pattern = re.compile(r"([0-9a-zA-Z._]+)\[(\d+)(?::(\d+))?\]")
    result = []
    for item in seq:
        m = pattern.match(item)
        name, start, end = m.group(1), int(m.group(2)), m.group(3)
        # Single index when no ':top' part was captured.
        rng = range(start, int(end) + 1) if end else (start,)
        # Pre-build the constant prefix once per item, not once per index.
        prefix = name + "["
        result.extend(prefix + str(i) + "]" for i in rng)
    return result
# Benchmark: expand two half-million-element ranges into a 1,000,000-entry
# flat list, then time the old and new merge implementations on it.
# NOTE(review): assumes `import time` appears earlier in the file — confirm.
# NOTE(review): this is Python 2 code (print statements); time.clock() was
# removed in Python 3.8 — time.perf_counter() is the modern equivalent.
# Lets make 1 million entries
seq = ['A[1:500000]','B[1:500000]']
t1 = time.clock()
flat = flatten_seq(seq)
t2 = time.clock()
print '# Flattened in %s secs'%round(t2-t1, 3)
merged = merge_seq(flat)
t3 = time.clock()
print '# Old Merge %s secs'%round(t3-t2, 3)
merged = merge_seq2(flat)
t4 = time.clock()
print '# New Merge %s secs'%round(t4-t3, 3)
# Recorded output from one run:
# Flattened in 0.265 secs
# Old Merge 6.76 secs
# New Merge 2.613 secs
That's ~2.5 times faster!
The only minor issue with merge_seq2 is that, in some cases when given unflattened lists, it can be slower than the original merge_seq function.
If anyone has suggestions to speed this up even more, I would love to hear them!