I'm trying to parse PDF file in a directory using PDFMiner, and I'm starting out by replicating the first script from the documentation contained here. The code (repeated below) opens the file, and creates the parser object, but gives an 'Unexpected EOF' error when trying to create the document object. Any help in understanding why this is the case would be greatly appreciated. Specifically, are there types of PDFs that cannot be parsed in this way?
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
import os
import re
fp = open(os.getcwd() + '\\' + os.listdir(os.getcwd())[0])
parser = PDFParser(fp)
doc = PDFDocument(parser) #This is the problem, getting an "unexpected EOF" error
if not doc.is_extractable:
raise PDFTextExtractionNotAllowed
rsrcmgr = PDFResourceManager()
device = PDFDevice(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(doc):
interpreter.process_page(page)
This yields the following error:
%run scrape_psr.py
---------------------------------------------------------------------------
PSEOF Traceback (most recent call last)
C:\Users\Rob Lantz\Anaconda\lib\site-packages\IPython\utils\py3compat.pyc in execfile(fname, glob, loc)
195 else:
196 filename = fname
--> 197 exec compile(scripttext, filename, 'exec') in glob, loc
198 else:
199 def execfile(fname, *where):
22
23 parser = PDFParser(fp)
---> 24 doc = PDFDocument(parser) #This is the problem, getting an "unexpected EOF" error
25
26 if not doc.is_extractable:
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\pdfdocument.pyc in __init__(self, parser, password, caching, fallback)
313 parser.fallback = True
314 xref = PDFXRefFallback()
--> 315 xref.load(parser)
316 self.xrefs.append(xref)
317 for xref in self.xrefs:
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\pdfdocument.pyc in load(self, parser, debug)
173 # expand ObjStm.
174 parser.seek(pos)
--> 175 (_, obj) = parser.nextobject()
176 if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
177 stream = stream_value(obj)
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\psparser.pyc in nextobject(self)
555 """
556 while not self.results:
--> 557 (pos, token) = self.nexttoken()
558 #print (pos,token), (self.curtype, self.curstack)
559 if isinstance(token, (int, long, float, bool, str, PSLiteral)):
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\psparser.pyc in nexttoken(self)
480 def nexttoken(self):
481 while not self._tokens:
--> 482 self.fillbuf()
483 self.charpos = self._parse1(self.buf, self.charpos)
484 token = self._tokens.pop(0)
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\psparser.pyc in fillbuf(self)
213 self.buf = self.fp.read(self.BUFSIZ)
214 if not self.buf:
--> 215 raise PSEOF('Unexpected EOF')
216 self.charpos = 0
217 return
PSEOF: Unexpected EOF
Edit: Some have suggested that the problem is with xref table, related to the question here. I'm unconvinced since that error trace does not mention "Unexpected EOF", but rather "EOF marker not found". The solution, such that it is, may be valid in that it essentially says that Python packages as they exist right now are not well suited to parsing the mess that is PDF file standards.