Initial version
import pandas as pd
import re
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
file = f.read()
file_list = file.split('\n')
file_list[13] = '<root>' ### 12 to 13 to overwrite deformed <creditBureau ...>
file_list.append('</root>')
start = file_list.index('<root>')
new_list = file_list[start:]
#Converting to String
str1 = ''.join(new_list)
f = open("tocng.xml","w")
f.write(str1)
f.close() ### close file handle so ET can read it
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
Couple of issues:
- You left a declaration element after
<root>
tag. Removed declarations.
- Declarations have
""
which deforms them. Works without the declarations.
- Write to tocng.xml and you do not close the file handle. Makes ET reading fail.
If you want to use the regex I offered in a previous post then try this as it will remove the header without any line count needed. <root>
then stats from index 1.
import pandas as pd
import re
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
file = f.read()
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
file_list = file.split('\n')
file_list[1] = '<root>'
file_list.append('</root>')
start = file_list.index('<root>')
new_list = file_list[start:]
#Converting to String
str1 = ''.join(new_list)
f = open("tocng.xml","w")
f.write(str1)
f.close() ### close file handle so ET can read it
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
Based on new single line xml
More variations in the xml files may need adjustment to the code.
import pandas as pd
import re
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
file = f.read()
# Replace up to <?xml tag.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
# Strip file and add \n at each instance of >.
file = file.strip()
file = file.replace('>', '>\n')
# Split file and make a list with no empty items.
file_list = file.split('\n')
file_list = [item for item in file_list if item != '']
# Remove known xml declarations.
if file_list[0][:5] == '<?xml':
del file_list[0]
if file_list[0][:13] == '<creditBureau':
del file_list[0]
# Add root tags.
file_list.insert(0, '<root>')
file_list.append('</root>')
#Converting to String
str1 = ''.join(file_list)
print(str1) ## See output in my answer
with open("tocng.xml","w") as f:
f.write(str1)
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
Output:
<root><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber></transactionControl></root>
Edit: If <creditBureau...>
is needed, then remove or comment these lines:
if file_list[0][:13] == '<creditBureau':
del file_list[0]
The last xml posted does not look malformed with the initial tags so no changes done to handle it.
Regex customization of xml string
Using regex with no use of lists to handle the xml string as the xml could
be multiline or single line.
In the customize_xml function, regex comments show group numbers and the
pattern modes which you can pass as arguments to the customize_xml function.
Valid mode arguments are one of None, -1, 0, 1, 2, 3, 4
.
import pandas as pd
import re
def customize_xml(content, mode=0):
'''Customizes xml tags in the content and may insert a <root> tag.'''
# No modification.
if mode in (-1, None):
return content
# Select a pattern (mode) that modifies the whole xml.
pattern = (r'\2\3\4', # 0. <cB>...</cB>
r'<root>\2\3\4</root>', # 1. <root><cB>...</cB><root>
r'\1<root>\2\3\4</root>', # 2. <?xml?><root><cB>...</cB><root>
r'<root>\3</root>', # 3. <root>...<root>
r'\1<root>\3</root>', # 4. <?xml?><root>...<root>
)
# Groups are marked as \1 \2 ... to use for pattern above.
content = re.sub(r'(<\?xml.+?\?>)' # \1
'(<creditBureau.*?>)' # \2
'(.+?)' # \3
'(</creditBureau>)' # \4
, pattern[mode], content, flags=re.S)
return content
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
with open("testfile.txt", "w") as f:
f.write(df.loc[0][2])
with open("testfile.txt") as f:
file = f.read()
# Remove characters up to <?xml tag.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
# Make xml one single line if not already.
file = file.replace('\n', '')
file = customize_xml(file, 3)
# Write customized xml.
with open("tocng.xml", "w") as f:
f.write(file)
# Parsing xml.
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
# Print pretty xml from xml string.
from xml.dom import minidom
pretty_xml = minidom.parseString(file).toprettyxml(indent=" ")
print(pretty_xml)
Added a pretty print at end. Optional, just for reviewing the result.