I am using PyPDF4 to access to the "full" pdf-structures of a pdf and recursively store its values in a dictionary. The algorithm is supposed to work page-wise.
A pdf object for common data structure, IndirectObject
, needs to be cast to dictionary via the getObject()
method. Once the casting is performed then it is possible that some of its values are IndirectObject
s themselves. They may occurs in lists and dictionaries and standalone:
{..., '/Kids': [IndirectObject(32, 0), ...], '/Count': 4}
{'/ColorSpace': {'/CS0': IndirectObject(19023, 0), ...}
{..., '/Parent': IndirectObject(26, 0), '/Resources': ...}
The output of the program is incorrect, still some IndirectObject
are presents. I tried several different implementations but I cannot see what I am doing wrong. How can I fix it?
For debugging purposes to each such substitution the key is modified to have the following pattern: /OriginalKey_NumID
.
The output refers to a pdf that can be found here.
PyPDF4 should be back-compatible with PyPDF2, just change the import part with PyPDF2. Now, PyPDF2 seems to be in a reborn phase and some changes were made, see history.
from PyPDF4 import PdfFileReader # should be also compatible with PyPDF2
from PyPDF4.pdf import IndirectObject
def inspect_page(c_dict):
dg = {}
def __inspect_page(c_dict):
d = {}
for k, v in c_dict.items():
if isinstance(v, IndirectObject):
#d[f'{k}_{v.idnum}'] = __inspect_page(v.getObject()) # lead to oo-recursion
d[f'{k}_{v.idnum}'] = v.getObject()
elif isinstance(v, list):
loc_tmp = []
for vv in v:
if isinstance(vv, IndirectObject):
local_d_tmp = {f'{k}_{vv.idnum}': __inspect_page(vv.getObject())}
loc_tmp.append(local_d_tmp)
elif isinstance(vv, dict):
loc_tmp.append(__inspect_page(vv))
else:
loc_tmp.append(vv)
d[k] = loc_tmp
elif isinstance(v, dict):
v.update(__inspect_page(v))
d[k] = v
else:
d[k] = v
dg.update(d)
return d
__inspect_page(c_dict)
return dg
#process pdf
# sample src: https://www.adobe.com/support/products/enterprise/knowledgecenter/media/c4611_sample_explain.pdf
path = # path to pdf
objects_pages = {}
rpdf = PdfFileReader(open(path, 'rb'))
for i in range(rpdf.getNumPages()):
page = rpdf.getPage(i)
objects_pages[i] = inspect_page(page)
if i == 0:
break
# check output
page_num = 0
print(f'content of page {page_num}')
for k, v in objects_pages[page_num].items():
print(k, v)
Output (IndirectObject
are still presents, both in list and as dictionary values)
content of page 0
/StructParents 0
/Rotate 0
/CS0_19023 ['/Separation', '/Black', '/DeviceCMYK', IndirectObject(19029, 0)]
/ColorSpace {'/CS0': IndirectObject(19023, 0), '/CS0_19023': ['/Separation', '/Black', '/DeviceCMYK', IndirectObject(19029, 0)]}
/T1_0_19022 {'/Subtype': '/Type1', '/FontDescriptor': IndirectObject(19024, 0), '/LastChar': 255, '/Widths': [...], '/BaseFont': '/Times-Roman', '/FirstChar': 0, '/Encoding': '/MacRomanEncoding', '/Type': '/Font'}
/T1_1_19027 {'/Subtype': '/Type1', '/FontDescriptor': IndirectObject(19028, 0), '/LastChar': 255, '/Widths': [...], '/BaseFont': '/Times-Italic', '/FirstChar': 0, '/Encoding': '/WinAnsiEncoding', '/Type': '/Font'}
/Font {'/T1_0': IndirectObject(19022, 0), '/T1_1': IndirectObject(19027, 0), '/T1_0_19022': {'/Subtype': '/Type1', '/FontDescriptor': IndirectObject(19024, 0), '/LastChar': 255, '/Widths': [...], '/BaseFont': '/Times-Roman', '/FirstChar': 0, '/Encoding': '/MacRomanEncoding', '/Type': '/Font'}, '/T1_1_19027': {'/Subtype': '/Type1', '/FontDescriptor': IndirectObject(19028, 0), '/LastChar': 255, '/Widths': [...], '/BaseFont': '/Times-Italic', '/FirstChar': 0, '/Encoding': '/WinAnsiEncoding', '/Type': '/Font'}}
/GS0_19025 {'/OPM': 1, '/op': <PyPDF4.generic.BooleanObject object at 0x7f0264627280>, '/Type': '/ExtGState', '/SA': <PyPDF4.generic.BooleanObject object at 0x7f0264627310>, '/SM': 0.02}
/ExtGState {'/GS0': IndirectObject(19025, 0), '/GS0_19025': {'/OPM': 1, '/op': <PyPDF4.generic.BooleanObject object at 0x7f0264627280>, '/Type': '/ExtGState', '/SA': <PyPDF4.generic.BooleanObject object at 0x7f0264627310>, '/SM': 0.02}}
/ProcSet ['/PDF', '/Text']
/Resources {'/ColorSpace': {'/CS0': IndirectObject(19023, 0), '/CS0_19023': ['/Separation', '/Black', '/DeviceCMYK', IndirectObject(19029, 0)]}, '/Font': {'/T1_0': IndirectObject(19022, 0), '/T1_1': IndirectObject(19027, 0), '/T1_0_19022': {'/Subtype': '/Type1', '/FontDescriptor': IndirectObject(19024, 0), '/LastChar': 255, '/Widths': [...], '/BaseFont': '/Times-Roman', '/FirstChar': 0, '/Encoding': '/MacRomanEncoding', '/Type': '/Font'}, '/T1_1_19027': {'/Subtype': '/Type1', '/FontDescriptor': IndirectObject(19028, 0), '/LastChar': 255, '/Widths': [...], '/BaseFont': '/Times-Italic', '/FirstChar': 0, '/Encoding': '/WinAnsiEncoding', '/Type': '/Font'}}, '/ProcSet': ['/PDF', '/Text'], '/ExtGState': {'/GS0': IndirectObject(19025, 0), '/GS0_19025': {'/OPM': 1, '/op': <PyPDF4.generic.BooleanObject object at 0x7f0264627280>, '/Type': '/ExtGState', '/SA': <PyPDF4.generic.BooleanObject object at 0x7f0264627310>, '/SM': 0.02}}}
/Type /Page
/CropBox [0, 0, 441, 666.36]
/Parent_18919 {'/Parent': IndirectObject(18918, 0), '/Count': 7, '/Type': '/Pages', '/Kids': [IndirectObject(19021, 0), IndirectObject(1, 0), IndirectObject(10, 0), IndirectObject(13, 0), IndirectObject(15, 0), IndirectObject(19, 0), IndirectObject(23, 0)]}
/Contents_19026 {'/Filter': '/FlateDecode'}
/BleedBox [0, 0, 441, 666.36]
/MediaBox [0, 0, 441, 666.36]
/Thumb_4489 {'/Filter': '/FlateDecode', '/BitsPerComponent': 8, '/ColorSpace': IndirectObject(2517, 0), '/Width': 55, '/Height': 83}
/TrimBox [0, 0, 441, 666.36]
EDIT:
IndirectObject
s are uniquely identified by their idnum
but a are not unique inside the PDF's structure. To avoid oo-recursion a set to keep track of each idnum
should be created to avoid duplicates.