So we have two types of keywords:
- Keywords which start with
!
, This keyword should contain in text - Keywords which start with
@!
, This keyword shouldn't be seen in text
We want to check if there is at least one valid pattern for given keywords. There might be many occurrences of keywords, finding any valid pattern is fine for us.
Example 1: keywords: ['!A', '!C'] Text:
Multiline text ...
A
Some other text
C
Expected result: True
Example 2: keywords: ['!A', '@!B' , '!C'] Text:
Multiline text ...
A
Some other text
B
C
Expected result: False, since B is placed between A and C
Example 3: keywords: ['!A', '@!B' , '!C'] Text:
Multiline text ...
A
Some other text
B
A
C
Expected result: True, since there is no B after second A, and C is after that.
What I've tried so far(without luck):
- Use regex ( i wasn't able to use negative lookahead as well)
- Try to write a recursive function for it
Sample function for method 1:
def contain_keywords(content, keywords):
content = str(content)
regex_builder = []
or_keyword = False
for keyword in keywords:
if keyword.startswith("@!"):
reg = '[^%s]' % re.escape(keyword[2:])
regex_builder.append(reg)
elif keyword.startswith("!"):
reg = '(%s)' % re.escape(keyword[1:])
regex_builder.append(reg)
pattern = r'.*%s.*' % ('([\s\S])*'.join(regex_builder))
res = re.search(pattern, content)
return res is not None
Sample function for method 2:
def contain_keywords2(content, keywords, offset=0, keyword_index=0):
content = str(content)
valid_pattern = True
or_keyword = False
if keyword_index >= len(keywords) or offset >= len(content):
return True
for keyword_index, keyword in enumerate(keywords[keyword_index:]):
keyword = keyword.strip()
if keyword.startswith("@!"):
reg = keyword[2:]
location = content[offset:].find(reg)
if location != -1:
return False
valid_pattern = contain_keywords2(content, keywords, offset=offset, keyword_index=keyword_index+1)
if not valid_pattern:
return False
elif keyword.startswith("!"):
reg = keyword[1:]
location = content[offset:].find(reg)
print(location + offset)
if location == -1:
return False
if keyword_index + 1 >= len(keywords):
break
valid_pattern = contain_keywords2(content, keywords, offset=offset + location + len(reg), keyword_index=keyword_index+1)
if not valid_pattern:
return False
return valid_pattern