I have a project that searches PDFs for URLs and in the process extracts the PDF Metadata. It works perfectly around 99.6% of the time without any errors. But every once in a while, a file throws the old "invalid token error. Traceback Below:
Traceback (most recent call last):
File "c:\python38\lib\runpy.py", line 193, in _run_module_as_main
return run_code(code, main_globals, None,
File "c:\python38\lib\runpy.py", line 86, in run_code
exec(code, run_globals)
File "C:\Python38\Scripts\linkrot.exe_main.py", line 7, in
File "c:\python38\lib\site-packages\linkrot\cli.py", line 182, in main
pdf = linkrot.linkrot(args.pdf)
File "c:\python38\lib\site-packages\linkrot_init.py", line 131, in init
self.reader = PDFMinerBackend(self.stream)
File "c:\python38\lib\site-packages\linkrot\backends.py", line 213, in init
self.metadata.update(xmp_to_dict(metadata))
File "c:\python38\lib\site-packages\linkrot\libs\xmp.py", line 92, in xmp_to_dict
return XmpParser(xmp).meta
File "c:\python38\lib\site-packages\linkrot\libs\xmp.py", line 41, in init
self.tree = ET.XML(xmp)
File "c:\python38\lib\xml\etree\ElementTree.py", line 1320, in XML
parser.feed(text)
xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 55, column 10
My assumption is that there is some sort of issue with the XML extracted from the PDF, but I can't be sure. Is there a workaround? Some way the rest of the program could run when this error throws? The metadata is valuable to the process so I'd like to keep it if possible. I don't know etree that well, so I'd appreciate some help. The Code itself is below:
class XmpParser(object):
"""
Parses an XMP string into a dictionary.
Usage:
parser = XmpParser(xmpstring)
meta = parser.meta
"""
def __init__(self, xmp):
self.tree = ET.XML(xmp)
self.rdftree = self.tree.find(RDF_NS + "RDF")
@property
def meta(self):
""" A dictionary of all the parsed metadata. """
meta = defaultdict(dict)
if self.rdftree:
for desc in self.rdftree.findall(RDF_NS + "Description"):
for (
el
) in (
desc.iter()
):
ns, tag = self._parse_tag(el)
value = self._parse_value(el)
meta[ns][tag] = value
return dict(meta)
def _parse_tag(self, el):
""" Extract the namespace and tag from an element. """
ns = None
tag = el.tag
if tag[0] == "{":
ns, tag = tag[1:].split("}", 1)
if ns in NS_MAP:
ns = NS_MAP[ns]
return ns, tag
def _parse_value(self, el): # noqa: C901
""" Extract the metadata value from an element. """
if el.find(RDF_NS + "Bag") is not None:
value = []
for li in el.findall(RDF_NS + "Bag/" + RDF_NS + "li"):
value.append(li.text)
elif el.find(RDF_NS + "Seq") is not None:
value = []
for li in el.findall(RDF_NS + "Seq/" + RDF_NS + "li"):
value.append(li.text)
elif el.find(RDF_NS + "Alt") is not None:
value = {}
for li in el.findall(RDF_NS + "Alt/" + RDF_NS + "li"):
value[li.get(XML_NS + "lang")] = li.text
else:
value = el.text
return value
Any help or advice would be appreciated.