Got it, more or less — that's something I can work with. If you guys have a better solution, please step forward! I appreciate any help with this.
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure
import re
def parse_layout(layout):
    """Recursively walk a pdfminer layout tree and print matching text boxes.

    Every ``LTTextBox`` whose text contains two uppercase ASCII letters
    immediately followed by four digits (e.g. ``AB1234``) is reported by
    printing its class name, its bounding box and its full text.
    ``LTFigure`` containers are descended into so that text boxes nested
    inside figures are reported too.

    Args:
        layout: An iterable of pdfminer layout objects, such as the page
            layout returned by ``PDFPageAggregator.get_result()``.
    """
    for lt_obj in layout:
        if isinstance(lt_obj, LTTextBox):
            text = lt_obj.get_text()
            # Only the presence of a match matters, so search() suffices and
            # stops at the first hit instead of collecting every match.
            if re.search(r"[A-Z][A-Z][0-9][0-9][0-9][0-9]", text):
                print(lt_obj.__class__.__name__)
                print(lt_obj.bbox)
                print(text)
        elif isinstance(lt_obj, LTFigure):
            # Figures are containers of further layout objects.
            # NOTE(review): pdfminer appears to group figure contents into
            # text boxes only with LAParams(all_texts=True) - confirm.
            parse_layout(lt_obj)
# Driver: run layout analysis on every page of the PDF and report the
# matching text boxes via parse_layout().
# The context manager guarantees the file handle is closed even if parsing
# fails. All page processing stays inside the block because pdfminer reads
# from the open file lazily while pages are being interpreted.
with open('M:/test.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)

    # The resource manager caches shared resources (fonts, images) across
    # pages; the aggregator device collects each page's layout objects.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # get_result() returns the layout of the page just processed.
        layout = device.get_result()
        parse_layout(layout)