My problem is extracting the text from a multi-column PDF.
Common libraries such as PyPDF2 didn't work.
I wrote the code below to try to read the page with Pytesseract,
but that was also unsuccessful because the text of the two columns gets mixed together.
My idea now, using this code as a base, is to crop columns 1 and 2 separately and generate a new image by pasting column 1 on top and column 2 below it, so that I can read the result with Pytesseract or AWS Textract without problems.
How could I do this with OpenCV?
import os

import cv2
import fitz
import matplotlib.pyplot as plt
import numpy as np
import pytesseract
from PIL import Image
# Path of the PDF to process and the scale factors applied when each page
# is rasterised (2.0 doubles the rendered size in both directions).
SCANNED_FILE = "decreto_santos.pdf"
zoom_x = 2.0
zoom_y = 2.0
mat = fitz.Matrix(zoom_x, zoom_y)

# Render every page of the PDF to its own PNG file under output/.
# The directory is created up front so that pix.save() does not fail on a
# fresh checkout.
os.makedirs('output', exist_ok=True)
doc = fitz.open(SCANNED_FILE)
# File name of the PDF without directories and without anything after the
# first dot; it is used as the prefix of every generated PNG.
pdf_stem = SCANNED_FILE.split('/')[-1].split('.')[0]
print("Generated pages: ")
for page in doc:
    # The keyword argument is `matrix`; with any other spelling the zoom
    # matrix is not applied to the rendering.
    pix = page.get_pixmap(matrix=mat)
    # page.number is the page index reported by PyMuPDF
    # (0-based according to its documentation -- TODO confirm).
    png = 'output/' + pdf_stem + 'page-%i.png' % page.number
    print(png)
    pix.save(png)
# Load one of the rendered page images to work on.
page_image_path = 'output/decreto_santospage-1.png'
original_image = cv2.imread(page_image_path)
if original_image is None:
    # cv2.imread reports an unreadable or missing file by returning None
    # rather than raising, so fail here with a clear message instead of
    # letting cvtColor raise an opaque error further down.
    raise FileNotFoundError("Could not read page image: %s" % page_image_path)

# Convert the page to a single-channel grayscale image.
gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)

# Display the grayscale page for visual inspection.
plt.figure(figsize=(25, 15))
plt.imshow(gray_image, cmap='gray')
plt.show()
# Binarise the grayscale page. THRESH_OTSU lets OpenCV choose the threshold
# itself (the 0 passed as threshold is a placeholder) and THRESH_BINARY_INV
# inverts the output, so pixels darker than the threshold become 255.
# Presumably this turns dark text into white foreground -- assumes a light
# page background.
otsu_inverted_flags = cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV
ret, threshold_image = cv2.threshold(
    gray_image,
    0,
    255,
    otsu_inverted_flags,
)

# Display the binarised page for visual inspection.
plt.figure(figsize=(25, 15))
plt.imshow(threshold_image, cmap='gray')
plt.show()
# 5x5 rectangular structuring element used for the dilation below.
rectangular_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))

# Dilate the thresholded image once so that nearby foreground pixels merge
# into larger connected regions before contour detection.
dilated_image = cv2.dilate(threshold_image, rectangular_kernel, iterations=1)

# Display the dilated mask. cmap='gray' keeps this figure consistent with
# the grayscale and threshold figures; without it matplotlib renders a
# single-channel array with its default colour map.
plt.figure(figsize=(25, 15))
plt.imshow(dilated_image, cmap='gray')
plt.show()
# Find the contours of the dilated foreground regions.
contours, hierarchy = cv2.findContours(dilated_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

# Work on a copy so the crops never alias the original page image.
copied_image = original_image.copy()

# Black canvas of the same shape as the page; every contour is painted onto
# it in white so the detected regions can be inspected afterwards.
mask = np.zeros(original_image.shape, np.uint8)
# Defined up front so the final plot still works when no contour is found.
masked = mask

# For each contour: crop its bounding rectangle, run OCR on the crop, print
# the recognised text and append it to the output text file.
# The file is opened once in "w" mode, which truncates any previous content,
# and with an explicit UTF-8 encoding because the OCR language is Portuguese
# and the text may contain accented characters.
with open("output/recognized-kernel-66-66.txt", "w", encoding="utf-8") as f:
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)

        # Cropping the text block for giving input to OCR
        cropped = copied_image[y:y + h, x:x + w]

        # Apply OCR on the cropped image
        text = pytesseract.image_to_string(cropped, lang='por', config='--oem 1 --psm 1')
        print(text)
        # Persist the recognised text; previously it was only printed and
        # the output file stayed empty.
        f.write(text)

        # Paint the filled contour onto the mask (thickness -1 fills it).
        masked = cv2.drawContours(mask, [cnt], 0, (255, 255, 255), -1)

# Display the mask with all detected regions.
plt.figure(figsize=(25, 15))
plt.imshow(masked, cmap='gray')
plt.show()