You infamously can't use regexps to parse HTML, but you can use re.split
to tokenize a tag-structured string, and parse it into a tree of expressions:
import re
from pprint import pprint
def parse_tag_string(tag_string: str) -> list:
    """Parse a tag-structured string into a nested tree.

    The input is tokenized with ``re.split`` on ``<...>`` tokens. Runs of
    plain text become strings; each ``<name>...</name>`` pair becomes a
    ``(name, children)`` tuple whose ``children`` is a list of the same
    shape. Everything between the angle brackets is taken as the tag name;
    attributes and self-closing tags are not interpreted.

    Args:
        tag_string: Text containing ``<name>`` / ``</name>`` tags.

    Returns:
        The list of top-level items (strings and ``(name, children)``
        tuples). Empty text runs are omitted.

    Raises:
        ValueError: If a closing tag does not match the innermost open tag
            (including a closing tag with nothing open), or if any tag is
            still open when the input ends.
    """
    # Stack of (tag-name, children) nodes. The bottom entry is a nameless
    # root that collects the top-level items.
    stack = [(None, [])]
    for i, bit in enumerate(re.split(r"(<[^>]+>)", tag_string)):
        # With one capturing group, re.split keeps the separators in the
        # result: even indices are text, odd indices are tags.
        if i % 2 == 0:
            # Bare string: attach to the current node unless it is empty.
            if bit:
                stack[-1][1].append(bit)
            continue

        name = bit.strip("<>")
        if name.startswith("/"):
            # Closing tag: it must match the innermost open tag.
            name = name[1:]
            if stack[-1][0] != name:
                raise ValueError(
                    f"Mismatched closing tag: {bit!r}; " f"expected {stack[-1][0]!r}"
                )
            stack.pop()
        else:
            # Opening tag: create an empty child, attach it to the current
            # node, then descend into it.
            child = (name, [])
            stack[-1][1].append(child)
            stack.append(child)

    # Anything left above the root was opened but never closed.
    if len(stack) != 1:
        raise ValueError(f"Unclosed tags: {stack[1:]}")
    return stack.pop()[1]
# Demo: parse a small sample string and pretty-print the resulting tree.
parsed = parse_tag_string('foo <t>XXX<b>bar</b></t> bar')
pprint(parsed, width=40)
This prints out
['foo ',
 ('t', ['XXX', ('b', ['bar'])]),
 ' bar']
– a list that contains either strings or 2-tuples whose first member is the tag name and whose second member is another list of the same shape.
You can then e.g. pretty-print this:
def pretty_print_parsed(parsed, level=0):
    """Print a parsed tag tree, one item per line, indented by depth.

    Args:
        parsed: A list of strings and ``(name, children)`` tuples, where
            ``children`` is a list of the same shape.
        level: Current nesting depth; each level adds one space of indent.

    Strings are printed as-is after the indent. Each tuple is printed as an
    opening ``<name>`` line, its children one level deeper, and a closing
    ``</name>`` line.
    """
    indent = " " * level
    for item in parsed:
        if isinstance(item, str):
            print(f"{indent}{item}")
        else:
            # item is (tag name, children): wrap the recursively printed
            # children in matching open/close lines.
            print(f"{indent}<{item[0]}>")
            pretty_print_parsed(item[1], level + 1)
            print(f"{indent}</{item[0]}>")
Calling `pretty_print_parsed(parsed)` yields
foo
<t>
 XXX
 <b>
  bar
 </b>
</t>
 bar
or just rip out all of the tags by walking the tree and only outputting the text:
def detag(parsed: list) -> str:
    """Return the text of a parsed tag tree with all tags removed.

    Args:
        parsed: A list of strings and ``(name, children)`` tuples, where
            ``children`` is a list of the same shape.

    Returns:
        The strings of the tree concatenated in document order; tag names
        are dropped and their children are flattened recursively.
    """
    return ''.join(
        item if isinstance(item, str) else detag(item[1])
        for item in parsed
    )
# Flatten the tree parsed earlier back to plain text.
print(detag(parsed))  # prints: foo XXXbar bar