Fast algorithm to find longest repeating substring

Question

I'm looking for a fast algorithm that searchs for the longest repeating substring (repeated at least 1 time) in a given string, with the lowest time complexity possible and (if possible) memory wise (RAM).

I've seen some implementations but most of them aren't designed for a ton of characters (let's say 4k, 400k, 4m... length). One example is this one:

from collections import deque

def largest_substring_algo1(string):
    l = list(string)
    d = deque(string[1:])
    match = []
    longest_match = []
    while d:
        for i, item in enumerate(d):
            if l[i]==item:
                match.append(item)
            else:
                if len(longest_match) < len(match):
                    longest_match = match
                match = []
        d.popleft()
    return ''.join(longest_match)

I've been trying with a string containing 103440816326530612244897959183673469387755102040816326530612244897959183673469387755 100 times.

It works well for small strings (<1k length) but it behaves strange for those sizes mentioned before.

EDIT: is there any way to do it without loading a (let's say 20GB) file in memory?

Does this answer your question? [Find longest repetitive sequence in a string](https://stackoverflow.com/questions/11090289/find-longest-repetitive-sequence-in-a-string) — Will Da Silva, Jun 13 '21 at 19:07

Michael H · Answer 1 · 2021-06-13T19:19:29.973

def main():
    from time import time
    data = '103440816326530612244897959183673469387755102040816326530612244897959183673469387755'*100

    start_time = time()
    ans1 = largest_substring_algo1(data)
    print(f'{time()-start_time}')
    # 3.889688014984131

    start_time = time()
    ans2 = longestDupSubstring(data)
    print(f'{time()-start_time}')
    # 0.014296770095825195

    print(ans1 == ans2)
    # True


def longestDupSubstring(S):
    '''
    I improved it from python2 to python3: https://leetcode.com/problems/longest-duplicate-substring/discuss/290871/Python-Binary-Search
    '''
    A = [ord(c) - ord('a') for c in S]
    mod = 2**63 - 1
    from functools import reduce

    def test(L):
        p = pow(26, L, mod)
        cur = reduce(lambda x, y: (x * 26 + y) % mod, A[:L], 0)
        seen = {cur}
        for i in range(L, len(S)):
            cur = (cur * 26 + A[i] - A[i - L] * p) % mod
            if cur in seen:
                return i - L + 1
            seen.add(cur)
    res, lo, hi = 0, 0, len(S)
    while lo < hi:
        mi = (lo + hi + 1) // 2
        pos = test(mi)
        if pos:
            lo = mi
            res = pos
        else:
            hi = mi - 1
    return S[res:res + lo]


def largest_substring_algo1(string):

    from collections import deque
    l = list(string)
    d = deque(string[1:])
    match = []
    longest_match = []
    while d:
        for i, item in enumerate(d):
            if l[i] == item:
                match.append(item)
            else:
                if len(longest_match) < len(match):
                    longest_match = match
                match = []
        d.popleft()
    return ''.join(longest_match)


if __name__ == '__main__':
    main()

That's fast but why does it return almost the entire string isntead of the repeated portion? — loko, Jun 13 '21 at 19:30

Fast algorithm to find longest repeating substring

1 Answers1