More involved but much faster: pre-process your list of strings into a prefix trie.
Then, for each file line, starting at each character position, see how far you can walk into the trie.
If you kept a queue of all active tries, you only have to look at each character-position once as you scan through the line. You could also include a "minimum terminal depth" counter at each trie-node to allow you to truncate comparison early once you get near the end of the string.
A simpler half-step would be to reduce your big list of strings to a dict of lists of strings, indexed by the first three chars of each string you are looking for.
from itertools import count, tee, izip
def triwise(iterable):
    """Yield overlapping triples from *iterable*.

    Based on the ``pairwise`` recipe from the itertools documentation:
    s -> (s0,s1,s2), (s1,s2,s3), (s2,s3,s4), ...
    Yields nothing when the iterable has fewer than three items.
    """
    a, b, c = tee(iterable, 3)
    next(b, None)      # b trails one position ahead of a
    next(c, None)
    next(c, None)      # c trails two positions ahead of a
    # builtin zip replaces izip: izip no longer exists in Python 3,
    # and zip gives identical results for finite iterables in Python 2.
    return zip(a, b, c)
class Searcher:
    """Multi-string substring search via a prefix index.

    Seek strings are bucketed by their first (up to) ``PREFIX_LEN``
    characters; while scanning a target, each offset needs only a dict
    lookup to reject most positions before any ``startswith`` test.
    """

    # number of leading characters used as the index key
    PREFIX_LEN = 3

    def __init__(self):
        # maps prefix -> list of seek strings sharing that prefix
        self.index = {}

    def add_seek_strings(self, strings):
        """Register *strings* to be searched for.

        Empty strings are ignored (they would trivially match at every
        offset, and the original index could never match them anyway).
        """
        for s in strings:
            if not s:
                continue
            # Key on *at most* PREFIX_LEN chars.  Bug fix: the old code
            # indexed short strings under their full (short) key but only
            # ever looked up exact 3-char windows, so 1- and 2-character
            # seek strings were silently never matched.
            self.index.setdefault(s[:self.PREFIX_LEN], []).append(s)

    def find_matches(self, target):
        """Yield each registered string at every offset of *target* where
        it occurs (a string may be yielded once per occurrence)."""
        index = self.index
        plen = self.PREFIX_LEN
        for offset in range(len(target)):
            rest = target[offset:]
            # Try every possible key width.  Each seek string lives under
            # exactly one key, so no duplicates arise at a given offset;
            # for all-length>=3 seek strings the output is identical to
            # the original implementation.
            for width in range(1, plen + 1):
                if width > len(rest):
                    break  # ran off the end; wider keys cannot occur here
                for seek in index.get(rest[:width], ()):
                    if rest.startswith(seek):
                        yield seek

    def is_match(self, target):
        """Return True as soon as any registered string occurs in *target*."""
        for _ in self.find_matches(target):
            return True
        return False
def main(path="myfile.txt"):
    """Return the lines of *path* that contain any of the seek strings.

    The filename is a parameter (defaulting to the previously hard-coded
    value) and the matched lines are returned rather than discarded, so
    the function is usable from other code as well as from the CLI.
    """
    srch = Searcher()
    srch.add_seek_strings(["the", "words", "you", "want"])
    with open(path) as inf:
        return [line for line in inf if srch.is_match(line)]

if __name__ == "__main__":
    main()