I have code that extracts text from a PDF, but I also want to extract the text contained in images inside the PDF. While doing so, I want to preserve the relative order of the written text and the text found in the images. Here is my code for extracting the written text:
def convertPdfToText(self, outputTextFile):
    """Write the extracted text of every page of a PDF to a text file.

    The PDF is opened from ``fileToConvert`` and its pages are visited in
    index order; the result of ``extractText()`` for each page is written
    to ``outputTextFile`` back to back, with no separator added.

    Args:
        outputTextFile: Path of the text file to create or overwrite.

    Raises:
        SystemExit: If opening, parsing, or writing fails. The exit message
            starts with the original text and is followed by the type and
            message of the underlying error.
    """
    # NOTE(review): ``fileToConvert`` is not a parameter or a local here;
    # presumably a module-level name, or meant to be an attribute of
    # ``self`` -- confirm against the rest of the file.
    try:
        # UTF-8 is requested explicitly: with the platform default encoding
        # (e.g. cp1252) writing non-ASCII page text can raise
        # UnicodeEncodeError.
        with open(fileToConvert, 'rb') as pdf_file, \
                open(outputTextFile, 'w', encoding='utf-8') as text_file:
            read_pdf = PyPDF2.PdfFileReader(pdf_file)
            for page_number in range(read_pdf.getNumPages()):
                page = read_pdf.getPage(page_number)
                text_file.write(page.extractText())
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit propagate untouched, and the
        # real cause is reported instead of being discarded.
        sys.exit("Any error is occurred. {}: {}".format(type(err).__name__, err))