1. Extract the images using pdfimages
pdfimages mydoc.pdf images
2. Use the following extraction script:
./extractImages.py images*
Find your cut out images in a new images folder.
Look at what was done in the tracing folder to make sure no images were missed.
Operation
It will process all images and look for shapes inside them. If a shape is found and is larger than a configurable size, it will figure out the maximum bounding box, cut out the image, and save it in a new images folder. In addition, it will create a folder named tracing where it shows all the bounding boxes.
If you want to find smaller images, just decrease minimumWidth and minimumHeight; however, if you set them too low, it will find each character.
In my tests it works extremely well, it just finds a few too many images.
extractImages.py
#!/bin/env python
import cv2
import numpy as np
import os
from pathlib import Path
def extractImagesFromFile(inputFilename, outputDirectory, tracing=False, tracingDirectory="",
                          minimumWidth=100, minimumHeight=100):
    """Cut every sufficiently large shape out of one image file.

    The image is thresholded, its external contours are located, and the
    bounding box of every contour that is at least ``minimumWidth`` wide and
    ``minimumHeight`` high is saved as ``<stem>_<n>.png`` in
    ``outputDirectory`` (``n`` starts at 1).

    Args:
        inputFilename: Path of the image to process.
        outputDirectory: Directory the cut-out images are written to.
        tracing: When true, also write ``<stem>_trace.png`` showing every
            accepted bounding box drawn on top of the input image.
        tracingDirectory: Directory the trace image is written to.
        minimumWidth: Smallest bounding-box width (pixels) that is kept.
        minimumHeight: Smallest bounding-box height (pixels) that is kept.

    Returns:
        None. A file that OpenCV cannot load is reported on stdout and
        skipped instead of raising.
    """
    # Settings:
    greenColor = (36, 255, 12)
    traceWidth = 2

    # cv2.imread signals failure by returning None rather than raising, so an
    # unreadable or non-image file must be caught here before it is used.
    image = cv2.imread(inputFilename)
    if image is None:
        print("Could not read image: {}".format(inputFilename))
        return

    # Rectangles are drawn on a separate copy so the cut-outs stay clean.
    traceImage = image.copy() if tracing else None

    # Grayscale, then inverted binary threshold using Otsu's method
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # findContours returns either two or three values depending on the
    # OpenCV version; the contour list is the first of two or second of three.
    cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Obtain bounding boxes, extract and save each ROI
    stem = Path(inputFilename).stem
    ROI_number = 1
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        if w < minimumWidth or h < minimumHeight:
            continue
        if tracing:
            cv2.rectangle(traceImage, (x, y), (x + w, y + h), greenColor, traceWidth)
        ROI = image[y:y + h, x:x + w]
        outImage = os.path.join(outputDirectory, '{}_{}.png'.format(stem, ROI_number))
        cv2.imwrite(outImage, ROI)
        ROI_number += 1

    if tracing:
        outImage = os.path.join(tracingDirectory, stem + '_trace.png')
        cv2.imwrite(outImage, traceImage)
def main(files):
    """Extract the images from every file in ``files``.

    The ``images`` directory (and, because tracing is enabled, the
    ``tracing`` directory) is created in the current working directory if it
    does not exist yet. Each entry of ``files`` that is a regular file is
    processed; anything else is reported as invalid and skipped.

    Args:
        files: Iterable of paths to the image files to process.
    """
    tracingEnabled = True
    outputDirectory = 'images'
    tracingDirectory = 'tracing'

    # Create the output directories if they do not exist
    (Path.cwd() / outputDirectory).mkdir(exist_ok=True)
    if tracingEnabled:
        (Path.cwd() / tracingDirectory).mkdir(exist_ok=True)

    for f in files:
        print("Processing {}".format(f))
        # Directories (such as the output folders themselves) and missing
        # paths are reported rather than handed to the extractor.
        if Path(f).is_file():
            extractImagesFromFile(f, outputDirectory, tracingEnabled, tracingDirectory)
        else:
            print("Invalid file: {}".format(f))
if __name__ == "__main__":
    # Command-line entry point: collect the file names and hand them to main().
    import argparse
    from glob import glob

    parser = argparse.ArgumentParser(
        description="Cut the images found inside the given image files.")
    parser.add_argument("fileNames", nargs='*',
                        help="image files or glob patterns to process")
    args = parser.parse_args()

    fileNames = []
    for arg in args.fileNames:
        # An existing path is taken literally, so file names that contain
        # glob metacharacters such as '[' or '?' are not lost.
        if Path(arg).exists():
            fileNames.append(arg)
            continue
        # Otherwise expand the pattern ourselves (for shells that do not).
        # A pattern without matches is passed through unchanged so main()
        # reports it as invalid instead of it vanishing silently.
        matches = sorted(glob(arg))
        fileNames += matches if matches else [arg]
    main(fileNames)
Credit
The basic algorithm was provided by nathancy as an answer to this question:
Extract all bounding boxes using OpenCV Python