Check if keywords with same order exists in a text

Question

So we have two types of keywords:

Keywords which start with !, This keyword should contain in text
Keywords which start with @!, This keyword shouldn't be seen in text

We want to check if there is at least one valid pattern for given keywords. There might be many occurrences of keywords, finding any valid pattern is fine for us.

Example 1: keywords: ['!A', '!C'] Text:

Multiline text ...

A

Some other text

C

Expected result: True

Example 2: keywords: ['!A', '@!B' , '!C'] Text:

Multiline text ...

A

Some other text

B
C

Expected result: False, since B is placed between A and C

Example 3: keywords: ['!A', '@!B' , '!C'] Text:

Multiline text ...

A

Some other text

B

A

C

Expected result: True, since there is no B after second A, and C is after that.

What I've tried so far(without luck):

Use regex ( i wasn't able to use negative lookahead as well)
Try to write a recursive function for it

Sample function for method 1:

def contain_keywords(content, keywords):
    content = str(content)
    regex_builder = []
    or_keyword = False
    for keyword in keywords:
        if keyword.startswith("@!"):
            reg = '[^%s]' % re.escape(keyword[2:])
            regex_builder.append(reg)
        elif keyword.startswith("!"):
            reg = '(%s)' % re.escape(keyword[1:])
            regex_builder.append(reg)

    pattern = r'.*%s.*' % ('([\s\S])*'.join(regex_builder))
    res = re.search(pattern, content)
    return res is not None

Sample function for method 2:

def contain_keywords2(content, keywords, offset=0, keyword_index=0):
    content = str(content)
    valid_pattern = True
    or_keyword = False

    if keyword_index >= len(keywords) or offset >= len(content):
        return True

    for keyword_index, keyword in enumerate(keywords[keyword_index:]):
        keyword = keyword.strip()
        if keyword.startswith("@!"):
            reg = keyword[2:]
            location = content[offset:].find(reg)
            if location != -1:
                return False
            valid_pattern = contain_keywords2(content, keywords, offset=offset, keyword_index=keyword_index+1)
            if not valid_pattern:
                return False
        elif keyword.startswith("!"):
            reg = keyword[1:]
            location = content[offset:].find(reg)
            print(location + offset)
            if location == -1:
                return False
            if keyword_index + 1 >= len(keywords):
                break
            valid_pattern = contain_keywords2(content, keywords, offset=offset + location + len(reg), keyword_index=keyword_index+1)
            if not valid_pattern:
                return False
    return valid_pattern

score 0 · Accepted Answer · answered Jan 14 '19 at 15:34

Since nobody answered it, I'll post my solution:

def contain_keywords2(content_text, keywords, offset=0, keyword_index=0):

    or_keyword = False

    if keyword_index >= len(keywords) or offset >= len(content_text):
        return True

    for loop_keyword_index, keyword in enumerate(keywords[keyword_index:]):
        keyword = keyword.lstrip()
        if keyword.startswith("@!"):
            reg = keyword[2:]
            # Don't look whole file, just search it in at last 10 lines:
            # If you want to search all over the file, remove these 3 lines (including pos_of_tens_line at location ...)
            pos_of_tens_line = findnth(content_text[offset:], '\n', 10)
            if pos_of_tens_line == -1:
                pos_of_tens_line = len(content_text)

            location = content_text.find(reg, offset, offset + pos_of_tens_line)

            if location != -1:
                return False
            return contain_keywords2(content_text, keywords, offset=offset,
                                     keyword_index=keyword_index + loop_keyword_index + 1)
        elif keyword.startswith("!"):
            reg = keyword[1:].strip()

            for keyword_positions in list(find_all(content_text[offset:], reg)):
                valid_pattern = contain_keywords2(content_text, keywords,
                                                  offset=(offset + keyword_positions + len(reg)),
                                                  keyword_index=(keyword_index + loop_keyword_index + 1))
                if valid_pattern:
                    return True
            return False
        elif keyword.startswith('|!'):
            or_keyword = True
            break

    if or_keyword:
        for keyword in keywords:
            if keyword.startswith('|!'):
                if keyword[2:] in content_text:
                    return True
    return False

Check if keywords with same order exists in a text

1 Answers1