So I played around a bit with a trie-based approach to your problem (now that I understand what you want). Maybe you can find something useful in it. There is an initial idea, an abstract interface slapped around that idea, to help looking for more efficient solutions, and a few pytest tests to help understand whether/how it works. There are trie modules out there, but this seemed like more fun, for now.
from collections import defaultdict
# Faking an infinite stream of characters
from itertools import cycle
stream = cycle('carracecowtenhihellocohiwcar')
# Just exploring the idea of a trie. If it works, we can think about a
# more efficient implementation later.
def new_trie_branch():
return defaultdict(new_trie_branch)
# A symbol used to indicate leaves in the trie
END_OF_WORD = object()
# The trie is implemented as a dictionary mapping letters to
# sub-tries. The pseudo-letter END_OF_WORD marks the end of a path in
# the trie which corresponds to a valid whole word.
def make_trie(words):
trie = new_trie_branch()
for word in words:
branch = trie
for letter in word:
branch = branch[letter]
branch[END_OF_WORD] = True
return trie
# As each letter comes out of the stream, it is fed into a collection
# of 'listeners'. Each listener is a stateful function which
# corresponds to some location in the trie and is aware of the word
# prefix which describes the path from the trie's root to the current
# node. When such a listener is given a letter, it checks (in the trie)
# whether the prefix plus the new letter form a complete word: if so,
# it bumps the word count for that word. It also checks whether the
# prefix plus the new letter form a valid longer prefix: if so, it
# adds a new listener (corresponding to the next node in the trie)
# into the collection of listeners that will be applied to the next letter to
# come out of the stream.
def count_words_in_stream(words, stream, word_count = None):
word_count = defaultdict(int) if word_count is None else word_count
def make_listener(branch, prefix):
def listener(next_letter):
if next_letter in branch:
next_branch = branch[next_letter]
word = prefix + next_letter
if END_OF_WORD in next_branch:
word_count[word] += 1
next_listeners.append(make_listener(next_branch, word))
return listener
start_of_word_listener = make_listener(make_trie(words), '')
listeners = [start_of_word_listener]
for letter in stream:
next_listeners = [start_of_word_listener]
for listen in listeners:
listen(letter)
listeners = next_listeners
return word_count
# Now we try to come up with an implementation-independent interface
# for the trie to allow us to refactor more easily in search of a more
# efficient implementation, if necessary.
class Trie(object):
def __init__(self, words):
self._trie = make_trie(words)
# Checks whether the given WORD is present in the trie
def __contains__(self, word):
trie = self._trie
for letter in word:
if letter not in trie:
return False
trie = trie[letter]
else:
return END_OF_WORD in trie
# The 'in' operator (__contains__) checks for the presence of a
# whole word in the trie, so we need a different interface for
# checking whether a given branch exists at this node.
def has_branch(self, branch_id):
return branch_id in self._trie
# Picks one branch of the trie
def __getitem__(self, branch_id):
branch = Trie('')
branch._trie = self._trie[branch_id]
return branch
# Iterates over the branches of this trie
def __iter__(self):
return iter(self._trie)
# Same as count_words_in_stream above, but uses the abstract interface
# we just invented.
def abstract_count_words_in_stream(words, stream, word_count = None):
word_count = defaultdict(int) if word_count is None else word_count
def make_listener(branch, prefix):
def listener(next_letter):
if branch.has_branch(next_letter):
next_branch = branch[next_letter]
word = prefix + next_letter
if next_branch.has_branch(END_OF_WORD):
word_count[word] += 1
next_listeners.append(make_listener(next_branch, word))
return listener
start_of_word_listener = make_listener(Trie(words), '')
listeners = [start_of_word_listener]
for letter in stream:
next_listeners = [start_of_word_listener]
for listen in listeners:
listen(letter)
listeners = next_listeners
return word_count
################################################################################
# Some tests of the implementation. These are written in the pytest
# framework.
################################################################################
from pytest import mark
# Testing the specific implementation details. Just to get us going.
@mark.parametrize('words, trie', (
(['one'],
{'o': {'n': {'e': {END_OF_WORD: True}}}}),
('one two'.split(),
{'o': {'n': {'e': {END_OF_WORD: True}}},
't': {'w': {'o': {END_OF_WORD: True}}}}),
('abc abd'.split(),
{'a': {'b': {'c': {END_OF_WORD: True},
'd': {END_OF_WORD: True}}}})
))
def test_make_trie(words, trie):
assert make_trie(words) == trie
count_words_test_data = ('words, stream, expected', (
(['cow'] ,'abcdefg', {}),
(['cow'] ,'cowcowcow', {'cow':3}),
('cow car fish'.split(), 'cowcarfishcarcarfishcow',
{'cow':2, 'car':3, 'fish':2}),
('and hand handy'.split(), 'handyandhand',
{'and':3, 'hand':2, 'handy':1}),
))
@mark.parametrize(*count_words_test_data)
def test_count_words_in_stream(words, stream, expected):
assert count_words_in_stream(words, stream) == expected
################################################################################
# Testing the abstract Trie interface. This will help if we want to
# refactor the implementation in search of something more efficient.
################################################################################
@mark.parametrize('words, absents', (
('one'.split(), 'o on ono'.split()),
('o on one'.split(), []),
('abc abd'.split(), ['ab'])
))
def test_Trie_word_presence(words, absents):
trie = Trie(words)
for word in words:
assert word in trie
for absent in absents:
assert absent not in trie
@mark.parametrize(*count_words_test_data)
def test_abstract_count_words_in_stream(words, stream, expected):
assert abstract_count_words_in_stream(words, stream) == expected