Converting a remote PDF's pages to temporary images for OCR

Question

I have a remote PDF file that I need to read page by page, passing each page to an OCR engine that returns the text of that page.

# NOTE: this is a Python 2 script (cStringIO, urllib2 and xrange are used below).
import pytesseract
from pyPdf import PdfFileWriter, PdfFileReader
import cStringIO
from wand.image import Image
import urllib2
import tempfile
# Duplicate of the pytesseract import at the top of this list.
import pytesseract
# NOTE(review): this rebinds the name ``Image``, hiding wand.image.Image
# imported a few lines earlier; from here on ``Image`` is PIL's module.
from PIL import Image

# Read the whole PDF into memory, then wrap the bytes in a file-like object
# that is handed to PdfFileReader.
remoteFile = urllib2.urlopen(urllib2.Request("file:///home/user/Documents/TestDocs/test.pdf")).read()
memoryFile = cStringIO.StringIO(remoteFile)

pdfFile = PdfFileReader(memoryFile)
# Visit every page of the document by index.
for pageNum in xrange(pdfFile.getNumPages()):
    currentPage = pdfFile.getPage(pageNum)

    # Unfinished part (this is what the question asks for): currentPage is
    # fetched above but never turned into an image or passed to the OCR call.
    ## somehow convert currentPage to wand type
    ## image and then pass to tesseract-api
    ##
    ## TEMP_IMAGE = some conversion to temp file
    ## pytesseract.image_to_string(Image.open(TEMP_IMAGE))

# Release the in-memory buffer once all pages have been visited.
memoryFile.close()

I thought of using cStringIO or tempfile but I cannot figure out how to use them for this purpose.

How can I solve this issue?

score 1 · Answer 1 · answered Nov 30 '17 at 08:01

There are a couple of options for doing this. The approach most compatible with the code you supplied is to store each image temporarily on disk and then delete it after reading the text with pytesseract. I create a wand image to extract each image from the PDF individually, then convert it to a PIL image for pytesseract. Here's the code I used for this, with the detected text being written to a list 'text' in which each element holds the text of one image (page) of the original PDF. I also updated some of your imports to make the code compatible with Python 3 (cStringIO -> io and urllib2 -> urllib.request).

import io
import os
import tempfile
import urllib.request

import PyPDF2
import pytesseract
from PIL import Image as PILImage
from wand.image import Image

# Location of the PDF to OCR. Defined once so the download and the
# per-page rasterisation below always refer to the same document.
PDF_URL = 'file:///home/user/Documents/TestDocs/test.pdf'

# Download the document once; the bytes are only needed to count the pages.
with urllib.request.urlopen(PDF_URL) as response:
    pdf_read = response.read()

# NOTE(review): recent PyPDF2 releases are believed to rename
# PdfFileReader/getNumPages to PdfReader/len(reader.pages) -- confirm
# against the installed version.
pdf_im = PyPDF2.PdfFileReader(io.BytesIO(pdf_read))

text = []  # text[p] is the OCR output for page p
# The temporary directory (and the page image inside it) is removed when the
# ``with`` block exits, even if rendering or OCR raises part-way through.
with tempfile.TemporaryDirectory() as temp_dir:
    tempFile_Location = os.path.join(temp_dir, 'page.png')
    for p in range(pdf_im.getNumPages()):
        # The "[p]" suffix selects a single page of the PDF for reading.
        with Image(filename=PDF_URL + '[' + str(p) + ']') as img:
            # Work on a copy of the page image, as the original answer did.
            with Image(image=img) as converted:
                # Write PNG explicitly instead of relying on the extension.
                converted.format = 'png'
                converted.save(filename=tempFile_Location)
        # Close the PIL handle before the file is overwritten by the next
        # page or deleted with the directory.
        with PILImage.open(tempFile_Location) as page_image:
            text.append(pytesseract.image_to_string(page_image))

Alternatively, if you want to avoid creating and deleting a temporary file for each image, you can use numpy and OpenCV: extract the image as a blob, convert it to a numpy array, and then turn it into a PIL image for pytesseract to perform OCR on (reference).

import io
import os
import urllib.request

import cv2
import numpy as np
import PyPDF2
import pytesseract
from PIL import Image as PILImage
from wand.image import Image

# Location of the PDF to OCR. Defined once so the download and the
# per-page rasterisation below always refer to the same document.
PDF_URL = 'file:///home/user/Documents/TestDocs/test.pdf'

# Download the document once; the bytes are only needed to count the pages.
with urllib.request.urlopen(PDF_URL) as response:
    pdf_read = response.read()

# NOTE(review): recent PyPDF2 releases are believed to rename
# PdfFileReader/getNumPages to PdfReader/len(reader.pages) -- confirm
# against the installed version.
pdf_im = PyPDF2.PdfFileReader(io.BytesIO(pdf_read))

text = []  # text[p] is the OCR output for page p
for p in range(pdf_im.getNumPages()):
    # The "[p]" suffix selects a single page of the PDF for reading.
    with Image(filename=PDF_URL + '[' + str(p) + ']') as img:
        # Ask for a PNG blob explicitly: without a format the blob is written
        # in the image's current format (PDF here), which OpenCV cannot decode.
        png_blob = img.make_blob(format='png')

    # View the encoded bytes as a uint8 array without copying, then decode.
    img_buffer = np.frombuffer(png_blob, dtype=np.uint8)
    retval = cv2.imdecode(img_buffer, cv2.IMREAD_GRAYSCALE)
    if retval is None:
        # imdecode signals failure by returning None; fail loudly here
        # instead of passing None on to PIL.
        raise ValueError('could not decode rendered image for page %d' % p)

    text.append(pytesseract.image_to_string(PILImage.fromarray(retval)))

Converting a remote PDF's pages to temporary images for OCR

1 Answer