There's a couple options for doing this, the more compatible way given the code you supplied is to store the images temporarily in that directory and then delete them after reading the text using pytesseract. I create a wand type image to extract each image from the PDF individually, then convert it to a PIL type image for pytesseract. Here's the code I used for this with the detected text bring written to an array 'text' where each element is an image in the original PDF, I also updated some of your imports to make it compatible with Python3 (cStringIO->io and urllib2->urllib.request).
import PyPDF2
import os
import pytesseract
from wand.image import Image
from PIL import Image as PILImage
import urllib.request
import io
with urllib.request.urlopen('file:///home/user/Documents/TestDocs/test.pdf') as response:
pdf_read = response.read()
pdf_im = PyPDF2.PdfFileReader(io.BytesIO(pdf_read))
text = []
for p in range(pdf_im.getNumPages()):
with Image(filename='file:///home/user/Documents/TestDocs/test.pdf' + '[' + str(p) + ']') as img:
with Image(image = img) as converted: #Need second with to convert SingleImage object from wand to Image
converted.save(filename=tempFile_Location)
text.append(pytesseract.image_to_string(PILImage.open(tempFile_Location)))
os.remove(tempFile_Location)
Alternatively, if you want to avoid creating and deleting a temporary file for each image you can use numpy and OpenCV to extract the image as a blob, convert it to a numpy array and then turn it into a PIL image for pytesseract to perform OCR on (reference)
import PyPDF2
import os
import pytesseract
from wand.image import Image
from PIL import Image as PILImage
import urllib.request
import io
import numpy as np
import cv2
with urllib.request.urlopen('file:///home/user/Documents/TestDocs/test.pdf') as response:
pdf_read = response.read()
pdf_im = PyPDF2.PdfFileReader(io.BytesIO(pdf_read))
text = []
for p in range(pdf_im.getNumPages()):
with Image(filename=('file:///home/user/Documents/TestDocs/test.pdf') + '[' + str(p) + ']') as img:
img_buffer=np.asarray(bytearray(img.make_blob()), dtype=np.uint8)
retval = cv2.imdecode(img_buffer, cv2.IMREAD_GRAYSCALE)
text.append(pytesseract.image_to_string(PILImage.fromarray(retval)))