If all the conditions hold as I understood them (for example: no character precedes '<tag>' or '</tag>' on its line; right?), then I think the following code does the job:
import re
# Alternatives of the delimiter pattern.  Each alternative is exactly one
# capturing group, so the number of the group that matched (1-7) tells the
# substitution callback which delimiter was found.  Order matters: regex
# alternation tries the alternatives left to right, so the more specific
# forms must come before the generic ones.
# Raw strings are used so that \A and \Z reach the regex engine untouched
# instead of being treated as (invalid) string escape sequences.
RE = (
    r'(\A\n*<tag>\n+)',            # 1: start of text directly followed by <tag>
    r'(\A\n*)',                    # 2: start of text, optional leading newlines
    r'(!\n*</tag>(?!\n*\Z)\n*)',   # 3: "!" + </tag> with more text after it
    r'(!\n*</tag>\n*\Z)',          # 4: "!" + </tag> ending the text
    r'(!\n*<tag>\n+)',             # 5: "!" followed by <tag>
    r'(!\n*\Z)',                   # 6: "!" ending the text
    r'(!\n+)',                     # 7: "!" followed by at least one newline
)
pat = re.compile('|'.join(RE))
def repl(mat, d={1: "[['", 2: "['", 3: "'],'", 4: "']]",
                 5: "',['", 6: "']", 7: "','"}):
    """Return the replacement text for the pattern alternative that matched.

    Meant to be used as the callback of ``pat.sub``.  Every alternative of
    the pattern is a single capturing group, so ``mat.lastindex`` is the
    number of the alternative that produced the match.

    mat -- match object (only its ``lastindex`` attribute is read).
    d   -- lookup table from group number to replacement text.  The default
           table is created once at definition time and is only read here,
           never modified.

    Raises KeyError if ``mat.lastindex`` is not a key of ``d``.
    """
    return d[mat.lastindex]
# Usage sketch: `ch` stands for the text to parse (the `....` is a
# placeholder, not runnable code).  pat.sub(repl, ch) replaces every matched
# delimiter with list-literal punctuation, and eval turns the resulting text
# into the actual Python object.
# NOTE(review): eval evaluates whatever expression the rewritten text forms;
# if `ch` can come from an untrusted source, ast.literal_eval is presumably
# the safer choice here -- TODO confirm.
ch = .... # a string to parse
dh = eval(pat.sub(repl,ch))
Applying it to four sample strings:
# Sample 1: starts with a newline and ends with a closing tag followed by a
# final newline.  One literal per input line keeps every newline explicit.
ch1 = (
    '\n'
    'A!\n'
    'B!\n'
    'C!\n'
    '<tag>\n'
    'D!\n'
    'E!\n'
    '</tag>\n'
    'F!\n'
    '<tag>\n'
    'G!\n'
    '</tag>\n'
)
# Sample 2: no leading newline; ends with a plain item followed by a final
# newline.  One literal per input line keeps every newline explicit.
ch2 = (
    'A!\n'
    'B!\n'
    'C!\n'
    '<tag>\n'
    'D!\n'
    'E!\n'
    '</tag>\n'
    'F!\n'
    '<tag>\n'
    'G!\n'
    '</tag>\n'
    'H!\n'
)
# Sample 3: starts with a newline, contains an item with "!" characters
# inside it, and ends on "H!" with no trailing newline.
ch3 = (
    '\n'
    'A!\n'
    'B!\n'
    'C!\n'
    '<tag>\n'
    'D!\n'
    'E!\n'
    '</tag>\n'
    'Fududu!gutuyu!!\n'
    '<tag>\n'
    'G!\n'
    '</tag>\n'
    'H!'
)
# Sample 4: the text opens directly with a tag and ends on "H!" with no
# trailing newline.
ch4 = (
    '<tag>\n'
    'A!\n'
    'B!\n'
    '</tag>\n'
    'C!\n'
    '<tag>\n'
    'D!\n'
    'E!\n'
    '</tag>\n'
    'F!\n'
    '<tag>\n'
    'G!\n'
    '</tag>\n'
    'H!'
)
import re
# Alternatives of the delimiter pattern.  Each alternative is exactly one
# capturing group, so the number of the group that matched (1-7) tells the
# substitution callback which delimiter was found.  Order matters: regex
# alternation tries the alternatives left to right, so the more specific
# forms must come before the generic ones.
# Raw strings are used so that \A and \Z reach the regex engine untouched
# instead of being treated as (invalid) string escape sequences.
RE = (
    r'(\A\n*<tag>\n+)',            # 1: start of text directly followed by <tag>
    r'(\A\n*)',                    # 2: start of text, optional leading newlines
    r'(!\n*</tag>(?!\n*\Z)\n*)',   # 3: "!" + </tag> with more text after it
    r'(!\n*</tag>\n*\Z)',          # 4: "!" + </tag> ending the text
    r'(!\n*<tag>\n+)',             # 5: "!" followed by <tag>
    r'(!\n*\Z)',                   # 6: "!" ending the text
    r'(!\n+)',                     # 7: "!" followed by at least one newline
)
pat = re.compile('|'.join(RE))
def repl(mat, d={1: "[['", 2: "['", 3: "'],'", 4: "']]",
                 5: "',['", 6: "']", 7: "','"}):
    """Return the replacement text for the pattern alternative that matched.

    Meant to be used as the callback of ``pat.sub``.  Every alternative of
    the pattern is a single capturing group, so ``mat.lastindex`` is the
    number of the alternative that produced the match.

    mat -- match object (only its ``lastindex`` attribute is read).
    d   -- lookup table from group number to replacement text.  The default
           table is created once at definition time and is only read here,
           never modified.

    Raises KeyError if ``mat.lastindex`` is not a key of ``d``.
    """
    return d[mat.lastindex]
# Run the parser over every sample and show the input text, the parsed
# object and its type.  Only the single-argument print(...) form is used, so
# the loop runs under both Python 2 (print statement) and Python 3 (print
# function).
for ch in (ch1, ch2, ch3, ch4):
    print(ch)
    # pat.sub(repl, ch) rewrites the delimiters into list-literal
    # punctuation; eval then builds the object from that text.
    # NOTE(review): eval evaluates whatever expression the rewritten text
    # forms; for input that is not fully trusted, ast.literal_eval is
    # presumably the safer choice -- TODO confirm.
    dh = eval(pat.sub(repl, ch))
    # Same text the Python 2 statement `print dh,'\n',type(dh)` emits:
    # the object, a space, a newline, then the type.
    print('{0} \n{1}'.format(dh, type(dh)))
    print('\n\n=============================')
Result:
>>>
A!
B!
C!
<tag>
D!
E!
</tag>
F!
<tag>
G!
</tag>
['A', 'B', 'C', ['D', 'E'], 'F', ['G']]
<type 'list'>
=============================
A!
B!
C!
<tag>
D!
E!
</tag>
F!
<tag>
G!
</tag>
H!
['A', 'B', 'C', ['D', 'E'], 'F', ['G'], 'H']
<type 'list'>
=============================
A!
B!
C!
<tag>
D!
E!
</tag>
Fududu!gutuyu!!
<tag>
G!
</tag>
H!
['A', 'B', 'C', ['D', 'E'], 'Fududu!gutuyu!', ['G'], 'H']
<type 'list'>
=============================
<tag>
A!
B!
</tag>
C!
<tag>
D!
E!
</tag>
F!
<tag>
G!
</tag>
H!
[['A', 'B'], 'C', ['D', 'E'], 'F', ['G'], 'H']
<type 'list'>
=============================
>>>