I have a xml file, some of it's tags has some characters like "ç". What I want is to extract this tag's text and write into .txt file.
Here is the xml file look's like:
<article mdate="2020-04-16" key="journals/tkdd/DuarteGB16">
<author>João Duarte</author>
<author orcid="0000-0003-3357-1195">João Gama</author>
<author orcid="0000-0002-8339-7773">Albert Bifet</author>
<title>Adaptive Model Rules From High-Speed Data Streams.</title>
<pages>30:1-30:22</pages>
<year>2016</year>
<volume>10</volume>
<journal>ACM Trans. Knowl. Discov. Data</journal>
<number>3</number>
<ee>https://doi.org/10.1145/2829955</ee>
<ee>https://www.wikidata.org/entity/Q58204491</ee>
<url>db/journals/tkdd/tkdd10.html#DuarteGB16</url>
</article>
and here is my code in python:
import lxml.etree as ET
import os
from pathlib import Path
test_file = "test_data.xml"
file_output_path = "results.txt"
def search_xxx(root, sth: str, file_writer):
"""
parameter: [sth] can be article、inproceedings、proceedings、book、incollection、phdthesis、mastersthesis、www
"""
# with open("result.txt", "w") as f:
item_counter = 0
for item in root.findall(sth):
message = []
invalid = False
item_counter += 1
# print("-----info---------")
# article dict["key"] -> key
key_str = item.attrib["key"]
key_str_list = key_str.split('/')
key = key_str_list[1]
# print("[Category]: ", key_str)
message.append(key)
# key_str = item.attrib["key"]
# message.append(key_str)
# title
if item.find("title") != None:
if item.find("title").text != None:
title = item.find("title").text
# print("[Title]: ", title)
message.append(title)
else:
# invalid = True
continue
else:
invalid = True
# author
# print("[Authors]:")
for item_son in item.iter("author"):
if item_son.text != None:
# author = item_son.text
author = item_son.text
# print("[author]: ", author)
message.append(author)
else:
invalid = True
break
## write to file
# print(message)
# print()
if invalid == True:
# message = ""
file_writer.write("")
else:
message = "+".join(message)
file_writer.write(message + '\n')
# print("---------------")
# write to txt file.
def main():
parser = ET.XMLParser(recover=True)
# tree = ET.parse(test_file, parser=parser)
tree = ET.parse(dblp_file_path, parser=parser)
root = tree.getroot()
with open(file_output_path, "w", encoding="UTF-8") as f:
print("Extracting...")
search_xxx(root, "article", f)
print("Extract done.")
if __name__ == "__main__":
main()
Question: when I run this code and it will get the expected result:
output: tkdd+Adaptive Model Rules From High-Speed Data Streams.+Jo+Jo+Albert Bifet
Here comes the question:
in XML file,
<author>João Duarte</author>
<author orcid="0000-0003-3357-1195">João Gama</author>
I want to get the first author which should be "Joo Duarte", the second should be "Joo Gama",