I am trying to parse a pdf file. I want to get all the values in a list or dictionary of the checkbox values. But I am getting this error.
"return OrderedDict((k, v.get('/V', '')) for k, v in fields.items()) AttributeError: 'NoneType' object has no attribute 'items'"
The code I am trying is this
from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader
def _getFields(obj, tree=None, retval=None, fileobj=None):
fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name', '/TU': 'Alternate Field Name',
'/TM': 'Mapping Name', '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'}
if retval is None:
retval = OrderedDict()
catalog = obj.trailer["/Root"]
# get the AcroForm tree
if "/AcroForm" in catalog:
tree = catalog["/AcroForm"]
else:
return None
if tree is None:
return retval
obj._checkKids(tree, retval, fileobj)
for attr in fieldAttributes:
if attr in tree:
# Tree is a field
obj._buildField(tree, retval, fileobj, fieldAttributes)
break
if "/Fields" in tree:
fields = tree["/Fields"]
for f in fields:
field = f.getObject()
obj._buildField(field, retval, fileobj, fieldAttributes)
return retval
def get_form_fields(infile):
infile = PdfFileReader(open(infile, 'rb'))
fields = _getFields(infile)
return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())
if __name__ == '__main__':
from pprint import pprint
pdf_file_name = 'Guild.pdf'
pprint(get_form_fields(pdf_file_name))