Method 1 (with a module):
As @iain-shelvington said, with an XML parsing/manipulation library you can do this simply and quickly.
Try this with the lxml module and XPath:
import lxml.etree as et
# Sample document: one recording that holds three named segments.
xml = """<?xml version='1.0' encoding='UTF-8'?>
<corpus name="corpus">
<recording audio="audio.wav" name="first audio">
<segment name="1" start="0" end="2">
<orth>some text 1</orth>
</segment>
<segment name="2" start="2" end="4">
<orth>some text 2</orth>
</segment>
<segment name="3" start="4" end="6">
<orth>some text 3</orth>
</segment>
</recording>
</corpus>"""

# The text carries an encoding declaration, so it is handed to lxml as bytes.
root = et.XML(xml.encode())

# you can add more segments here
unwanted = root.xpath("*//segment[@name='1' or @name='2']")
for segment in unwanted:
    # An element can only be detached through its parent.
    parent = segment.getparent()
    parent.remove(segment)

# tostring() returns bytes; decode them to get printable text.
serialized = et.tostring(root, pretty_print=True, xml_declaration=True)
clean_content = serialized.decode("utf-8")
print(clean_content)
Credit to @cédric-julien, @Sheena, @xyz, @josh-allemon and these questions:
- how to remove an element in lxml
- Using an OR condition in Xpath to identify the same element
- lxml.etree.XML ValueError for Unicode string
Method 2 (Hard Code):
import re

xml = """<?xml version='1.0' encoding='UTF-8'?>
<corpus name="corpus">
<recording audio="audio.wav" name="first audio">
<segment name="1" start="0" end="2">
<orth>some text 1</orth>
</segment>
<segment name="2" start="2" end="4">
<orth>some text 2</orth>
</segment>
<segment name="3" start="4" end="6">
<orth>some text 3</orth>
</segment>
</recording>
</corpus>"""

# Matches an opening <segment ...> tag and captures the value of its own
# ``name`` attribute.  The whitespace required before ``name=`` keeps
# attributes such as ``othername="1"`` from being mistaken for it.
_SEGMENT_OPEN = re.compile(r'<segment\b[^>]*?\sname="([^"]*)"')


def remove_segments(xml_text, names):
    """Return *xml_text* without the <segment> blocks whose name is in *names*.

    This is a line-based filter, not an XML parser: it expects every opening
    and closing segment tag to start its own line and does not handle
    segments nested inside other segments.

    Args:
        xml_text: The XML document as a string.
        names: Iterable of ``name`` attribute values to drop.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    names = set(names)
    kept = []
    skipping = False
    for line in xml_text.splitlines():
        stripped = line.strip()
        if skipping:
            # Inside a dropped segment: discard everything up to its end tag.
            if stripped.startswith("</segment>"):
                skipping = False
            continue
        opening = _SEGMENT_OPEN.match(stripped)
        if opening and opening.group(1) in names:
            # A self-closing or one-line segment has no separate closing
            # line, so do not start skipping the lines that follow it.
            skipping = not (stripped.endswith("/>") or "</segment>" in stripped)
            continue
        kept.append(line)
    return "\n".join(kept)


new_xml = remove_segments(xml, {"1", "2"})
print(new_xml)
If you want to read the names from a file, try this:
from lxml import etree
# Read the XML as bytes so lxml decodes it according to the document's own
# encoding declaration instead of the platform's default text encoding.
with open("xml.txt", "rb") as xml_file:
    tree = etree.XML(xml_file.read())

# One segment name per line; blank lines (e.g. a trailing newline) are skipped.
with open('nums.txt', 'r') as file:
    list_of_names = [each_name for each_name in file.read().splitlines() if each_name]

# Parse once and remove every requested segment from the same tree.
for each_name in list_of_names:
    # The name is passed as an XPath variable rather than formatted into the
    # expression, so a name containing a quote cannot break the query.
    find_segments = tree.xpath("*//segment[@name=$name]", name=each_name)
    for each_segment in find_segments:
        each_segment.getparent().remove(each_segment)

# tostring() returns bytes; decode them to get printable text.
new_xml = str(etree.tostring(tree, pretty_print=True, xml_declaration=True), encoding="utf-8")
print(new_xml)
A much shorter version:
from lxml import etree
# Read the XML as bytes so lxml decodes it according to the document's own
# encoding declaration instead of the platform's default text encoding.
with open("xml.txt", "rb") as xml_file:
    tree = etree.XML(xml_file.read())

# One segment name per line; a set removes duplicates and blank lines
# (e.g. a trailing newline) are skipped.
with open('nums.txt', 'r') as file:
    list_of_names = {each_name for each_name in file.read().splitlines() if each_name}

# Compare the attribute value in Python instead of splicing the names into an
# XPath string, so a name containing a quote cannot break the query.
for each_segment in tree.xpath("*//segment[@name]"):
    if each_segment.get("name") in list_of_names:
        each_segment.getparent().remove(each_segment)

# tostring() returns bytes; decode them to get printable text.
new_xml = str(etree.tostring(tree, pretty_print=True, xml_declaration=True), encoding="utf-8")
print(new_xml)