Before I post my main code I'll try to explain how it works. Let the source string be 'a123b'. The valid subsequences consist of all the subsets of '123' prefixed with 'a' and suffixed with 'b'. The set of all subsets is called the powerset, and the itertools
docs have code showing how to produce the powerset using combinations
in the Itertools Recipes section.
# Print all subsequences of '123', prefixed with 'a' and suffixed with 'b'
from itertools import combinations
src = '123'
for i in range(len(src) + 1):
for s in combinations(src, i):
print('a' + ''.join(s) + 'b')
output
ab
a1b
a2b
a3b
a12b
a13b
a23b
a123b
Here's a brute-force solution which uses that recipe.
from itertools import combinations
def count_bruteforce(src, targets):
c0, c1 = targets
count = 0
for i in range(2, len(src) + 1):
for t in combinations(src, i):
if t[0] == c0 and t[-1] == c1:
count += 1
return count
It can be easily shown that the number of subsets of a set of n
items is 2**n
. So rather than producing the subsets one by one we can speed up the process by using that formula, which is what my count_fast
function does.
from itertools import combinations
def count_bruteforce(src, targets):
c0, c1 = targets
count = 0
for i in range(2, len(src) + 1):
for t in combinations(src, i):
if t[0] == c0 and t[-1] == c1:
count += 1
return count
def count_fast(src, targets):
c0, c1 = targets
# Find indices of the target chars
idx = {c: [] for c in targets}
for i, c in enumerate(src):
if c in targets:
idx[c].append(i)
idx0, idx1 = idx[c0], idx[c1]
count = 0
for u in idx0:
for v in idx1:
if v < u:
continue
# Calculate the number of valid subsequences
# which start at u+1 and end at v-1.
n = v - u - 1
count += 2 ** n
return count
# Test
funcs = (
count_bruteforce,
count_fast,
)
targets = 'ab'
data = (
'ab', 'aabb', 'a123b', 'aacbb', 'aabbb',
'zababcaabb', 'aabbaaabbb',
)
for src in data:
print(src)
for f in funcs:
print(f.__name__, f(src, targets))
print()
output
ab
count_bruteforce 1
count_fast 1
aabb
count_bruteforce 9
count_fast 9
a123b
count_bruteforce 8
count_fast 8
aacbb
count_bruteforce 18
count_fast 18
aabbb
count_bruteforce 21
count_fast 21
zababcaabb
count_bruteforce 255
count_fast 255
aabbaaabbb
count_bruteforce 730
count_fast 730
There may be a way to make this even faster by starting the inner loop at the correct place rather than using continue
to skip unwanted indices.