The reason is that the latest released Biopython version (1.79) does not recognise the `ali` namespace prefix with the URI http://www.niso.org/schemas/ali/1.0/. The GitHub version has the corrected parser, but it is not yet available from pip.
Compare the two versions of the method.
Current release (1.79):
def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration.

    For the "xsi" prefix the uri is stored in ``self.schema_namespace`` and
    ``self.schemaHandler`` is installed as the parser's StartElementHandler.
    For any other prefix the declaration is validated against the known
    namespaces ("mml" and "xlink" only in this version) and counted.

    Raises:
        ValueError: if the prefix is not "xsi", "mml" or "xlink" -- this
            is what happens for the "ali" prefix.
        AssertionError: if "mml" or "xlink" is declared with a uri other
            than the expected one.
    """
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        else:
            raise ValueError("Unknown prefix '%s' with uri '%s'" % (prefix, uri))
        # Only reached for a recognised, non-schema prefix.
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
GitHub version:
def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration.

    For the "xsi" prefix the uri is stored in ``self.schema_namespace`` and
    ``self.schemaHandler`` is installed as the parser's StartElementHandler.
    For any other prefix the declaration is validated against the known
    namespaces ("mml", "xlink" and "ali") and counted.

    Raises:
        ValueError: if the prefix is not "xsi", "mml", "xlink" or "ali".
        AssertionError: if "mml", "xlink" or "ali" is declared with a uri
            other than the expected one.
    """
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        elif prefix == "ali":
            assert uri == "http://www.niso.org/schemas/ali/1.0/"
        else:
            raise ValueError(f"Unknown prefix '{prefix}' with uri '{uri}'")
        # Only reached for a recognised, non-schema prefix.
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
So you can either replace or edit Biopython's Parser.py file, or use a third-party library to convert your handle into a built-in Python object.
If you just want to download the full text of the article, you could try to download the PDF through metapub and then extract the text via textract.
import metapub
from urllib.request import urlretrieve
import textract
# Fetch an article's metadata by PMC id, download its PDF and extract the
# text from it.
# NOTE: any_path (where the PDF is saved) and another_path (where the
# extracted text is written) are placeholders -- set them before running.
pmcid = 'PMC2837563'
fetch = metapub.PubMedFetcher()
article_metadata = fetch.article_by_pmcid(pmcid)

# Get just the abstract
abstract = article_metadata.abstract

# Download the full article text
pmid = article_metadata.pmid
source = metapub.FindIt(pmid)
url = source.url
if not url:
    # FindIt leaves url empty when no full-text link could be located;
    # fail here with its explanation instead of inside urlretrieve.
    raise RuntimeError(f"No full-text URL found for PMID {pmid}: {source.reason}")
urlretrieve(url, any_path)

# textract.process returns bytes already encoded as requested, so the
# output file is opened in binary mode (text mode would raise TypeError).
extracted = textract.process(
    any_path,
    extension='pdf',
    method='pdftotext',
    encoding="utf_8",
)
with open(another_path, "wb") as textfile:
    textfile.write(extracted)