#!/usr/bin/env python3
import glob
import xml.etree.ElementTree as ET
filenames = glob.glob("C:\\Users\\####\\Desktop\\BNC2\\[A00-ZZZ]*.xml")
out_lines = []
for filename in filenames:
with open(filename, 'r', encoding="utf-8") as content:
tree = ET.parse(content)
root = tree.getroot()
for w in root.iter('w'):
lemma = w.get('hw')
pos = w.get('pos')
tag = w.get('c5')
out_lines.append(w.text + "," + lemma + "," + pos + "," + tag)
with open("C:\\Users\\####\\Desktop\\bnc.txt", "w") as out_file:
for line in out_lines:
line = bytes(line, 'utf-8').decode('utf-8', 'ignore')
out_file.write("{}\n".format(line))
Gives the error:
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 0: character maps to undefined
I thought this line would have solved that:
line = bytes(line, 'utf-8').decode('utf-8', 'ignore')