If you did want to write a rough parser for this, it would look something like this.
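This uses the `scanner` method of compiled pattern objects: it iterates through the tokens one by one and adds to the result list whenever it is at level 0, where the level is the bracket nesting depth (incremented for each left bracket encountered and decremented for each right bracket).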
import re
# Token specification: every token is one named group, so that after a match
# ``m.lastgroup`` reports which kind of token was recognised.
TEST = r'(?P<TEST>test[0-9]*)'
LEFT_BRACKET = r'(?P<LEFT_BRACKET>\()'
RIGHT_BRACKET = r'(?P<RIGHT_BRACKET>\))'
AND = r'(?P<AND> and )'
OR = r'(?P<OR> or )'

# The master pattern is the alternation of all token patterns, in this order.
_token_patterns = (TEST, LEFT_BRACKET, RIGHT_BRACKET, AND, OR)
master_pat = re.compile('|'.join(_token_patterns))

# Sample expression used in the examples.
s = (
    "(test1 or (test2 or test3)) and (test4 and (test6))"
    " and (test7 or test8) and test9"
)
def generate_list(pat, text):
    """Split ``text`` into its top-level operands.

    Tokens are read one after another with ``pat.scanner(text).match`` while
    the bracket nesting depth is tracked in ``level``.  Everything inside an
    outermost pair of brackets is accumulated into one element (without that
    outermost pair); nested brackets are kept verbatim.  At level 0 a
    ``LEFT_BRACKET`` or ``AND`` token is skipped, and any other token is
    emitted as an element of its own.

    Scanning stops silently at the first position where no token matches.
    Brackets are not validated for balance.

    Args:
        pat: Compiled pattern whose alternatives are named groups.  The group
            names ``LEFT_BRACKET``, ``RIGHT_BRACKET`` and ``AND`` are the
            ones treated specially here.
        text: The expression string to split.

    Returns:
        A list of operand strings in order of appearance.
    """
    ans = []
    elem = ''
    level = 0
    scanner = pat.scanner(text)
    # iter(callable, sentinel): keep calling scanner.match() until it returns
    # None, i.e. until no token matches at the current position.
    for m in iter(scanner.match, None):
        kind = m.lastgroup
        # Keep building elem when nested; at level 1 drop only the bracket
        # that closes the group; at level 0 drop the opening bracket and the
        # separating "and".
        if (level > 1
                or (level == 1 and kind != 'RIGHT_BRACKET')
                or (level == 0 and kind not in ('LEFT_BRACKET', 'AND'))):
            elem += m.group()
        # Back at level 0 with something collected: the element is complete.
        if level == 0 and elem != '':
            ans.append(elem)
            elem = ''
        # Update the depth only after the token has been handled, so the
        # outermost brackets themselves are seen at level 0 / level 1.
        if kind == 'LEFT_BRACKET':
            level += 1
        elif kind == 'RIGHT_BRACKET':
            level -= 1
    # A group closed by the very last token is still pending here, because
    # its closing bracket was processed while level was 1.  Flush it so a
    # trailing bracketed operand is not lost.
    if elem != '':
        ans.append(elem)
    return ans
# Example: split the sample expression into its top-level operands.
# The value shown below is what the call returns for the sample string `s`.
generate_list(master_pat, s)
# ['test1 or (test2 or test3)', 'test4 and (test6)', 'test7 or test8', 'test9']
To see how `scanner` behaves, step through it by hand in an interactive session:
# Interactive-interpreter walkthrough of the scanner.
# Each scanner.match() call returns the match object for the next token,
# starting where the previous match ended (compare the spans shown below).
# `_` is the interactive interpreter's name for the last displayed value, so
# the `_.lastgroup, _.group()` lines only work when typed at the prompt.
master_pat = re.compile('|'.join([TEST, LEFT_BRACKET, RIGHT_BRACKET, AND, OR]))
s = "(test1 or (test2 or test3)) and (test4 and (test6)) and (test7 or test8) and test9"
scanner = master_pat.scanner(s)
# First token: the opening bracket at position 0.
scanner.match()
# <re.Match object; span=(0, 1), match='('>
# lastgroup is the name of the group that matched, i.e. the token kind.
_.lastgroup, _.group()
# ('LEFT_BRACKET', '(')
scanner.match()
# <re.Match object; span=(1, 6), match='test1'>
_.lastgroup, _.group()
# ('TEST', 'test1')
# The OR token includes its surrounding spaces.
scanner.match()
# <re.Match object; span=(6, 10), match=' or '>
_.lastgroup, _.group()
# ('OR', ' or ')
scanner.match()
# <re.Match object; span=(10, 11), match='('>
_.lastgroup, _.group()
# ('LEFT_BRACKET', '(')
scanner.match()
# <re.Match object; span=(11, 16), match='test2'>
_.lastgroup, _.group()
# ('TEST', 'test2')