I use the following code to extract images from PDF files:
from PIL import Image
from PyPDF2 import PdfFileReader, generic
from io import BytesIO
from typing import List
import zlib
import sys
import struct
def tiff_header_for_CCITT(width:int, height:int, img_size:int, CCITT_group:int=4) -> bytes:
tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'l'
return struct.pack(tiff_header_struct,
b'II',
42,
8,
8,
256, 4, 1, width,
257, 4, 1, height,
258, 3, 1, 1,
259, 3, 1, CCITT_group,
262, 3, 1, 0,
273, 4, 1, struct.calcsize(tiff_header_struct),
278, 4, 1, height,
279, 4, 1, img_size,
0
)
def object_to_images(objects:generic.DictionaryObject) -> List:
images:List[Image] = []
for idx, key in enumerate(objects):
obj = objects[key] # object est un mot-clé réservé
if '/Resources' in obj and '/XObject' in obj['/Resources']: # l'objet est un conteneur
images += object_to_images(obj["/Resources"]["/XObject"].getObject()) # récursion
elif obj['/Subtype'] == '/Image':
print(obj)
if "/ImageMask" in obj: # masque d'image, ignoré
continue
img_modes = {'/DeviceRGB': 'RGB',
'/DefaultRGB': 'RGB',
'/DeviceCMYK': 'CMYK',
'/DefaultCMYK': 'CMYK',
'/DeviceGray': 'L',
'/DefaultGray': 'L',
'/Indexed': 'P'}
cspace = obj.get('/ColorSpace').getObject()
indexed = False
if cspace:
if isinstance(cspace, generic.ArrayObject):
if cspace[0] == '/ICCBased':
color_map = cspace[1].getObject()['/N']
if color_map == 1:
colorspace = "P"
elif color_map == 3:
colorspace = "RGB"
elif color_map == 4:
colorspace = "CMYK"
elif cspace[0] == "/Indexed":
c, b, h, l = [v.getObject() for v in cspace]
indexed = True
colorspace = img_modes[c]
else:
colorspace = img_modes[cspace]
width = obj['/Width']
height = obj['/Height']
if "/FlateDecode" in obj["/Filter"]: # Compressé avec zlib
data = zlib.decompress(obj._data)
else:
data = obj._data
if "/DCTDecode" in obj["/Filter"]: # JPEG, rien à faire
img = Image.open(BytesIO(data))
images.append(img)
elif "/JPXDecode" in obj["/Filter"]: # JPEG 2000, rien à faire
img = Image.open(BytesIO(data))
images.append(img)
elif "CCITTFaxDecode" in obj["/Filter"]: # TIFF, rajouter l'en-tête
if obj['/DecodeParms']['/K'] == -1:
CCITT_group = 4
else:
CCITT_group = 3
img_size = len(data)
tiff_header = tiff_header_for_CCITT(width, height, img_size, CCITT_group)
data = tiff_header + data
img = Image.open(BytesIO(data))
images.append(img)
else:
img = Image.frombytes(colorspace, (width, height), data)
if indexed:
img.putpalette(l.getData())
img = img.convert('RGB')
images.append(img)
return images
def pdf_to_images(filename:str) -> List:
images:List[Image] = []
file = PdfFileReader(open(filename, "rb"))
nPages = file.getNumPages()
for i in range(nPages):
page = file.getPage(i)
try:
root_objects = page["/Resources"]["/XObject"].getObject() # devrait s'appeler getObjects...
except KeyError:
continue
images += object_to_images(root_objects)
return images
if __name__ == "__main__":
fichier = sys.argv[1]
images = pdf_to_images(fichier)
#print("\n".join(map(str, images)))
for image in images:
image.show()
pass
It works for almost all PDF files, but one of them is acting weirdly. It's a sample PDF that can be found here. When I pass this PDF file to the code above, the result is skewed and the black and white colors are inverted.
The only thing I've noticed about the image object in this PDF is that it has a DecodeParms
entry, whose value is {'/Predictor': 15, '/Columns': 2550, '/Colors': 3}
. I don't know if it's relevant, but it's the only PDF I've tested that has those.
Thanks in advance