0

I'm currently working on a small OCR bot. I got pretty much everything to work and am now trying to improve the OCR. Specifically, it has problems with two things: the orange/red-ish text on the same-colored gradient, and for some reason the first 1 of "1/1". Sadly I haven't found anything that worked in my case yet. I've made a small test image, which consists of multiple images, below:

Source Image

Results

Adaptive Threshold

As you can see the gradient results in a blob that is sometimes big enough to overlap with the first word (see "apprentice") resulting in garbage.

I've tried many variations and played around with thresholds, blurs, erosion, dilation, box detection with the dilation method, etc., but nothing worked well. The only way I got rid of the blob was using an adaptive threshold. But sadly I wasn't able to get good results using the output image.

If anyone knows how to make the OCR more robust, increase accuracy and get rid of the blob I'd appreciate your help. Thanks.

The following code is my 'playground' to figure out a better way:

import cv2
import pytesseract
import numpy as np

# Point pytesseract at the tesseract executable.
# NOTE(review): YOUR_PATH is an undefined placeholder and raises NameError as
# written — replace it with the actual path string,
# e.g. r"C:\Program Files\Tesseract-OCR\tesseract.exe".
pytesseract.pytesseract.tesseract_cmd = YOUR_PATH

def resize(img, scale_percent=300, interpolation=cv2.INTER_AREA):
    """Return *img* scaled by *scale_percent* (default 300%, i.e. 3x).

    NOTE(review): cv2.INTER_AREA is the recommended filter for *shrinking*;
    for the default enlargement cv2.INTER_CUBIC (or INTER_LINEAR) usually
    produces sharper glyph edges, which helps tesseract.  INTER_AREA is kept
    as the default so existing callers get byte-identical output; pass
    interpolation=cv2.INTER_CUBIC to experiment.

    :param img: source image (numpy array, any channel count)
    :param scale_percent: new size as a percentage of the original
    :param interpolation: any cv2.INTER_* flag
    :return: the resized image
    """
    width = int(img.shape[1] * scale_percent / 100)
    height = int(img.shape[0] * scale_percent / 100)
    return cv2.resize(img, (width, height), interpolation=interpolation)


def preprocessImage(img, scale=300, threshhold=127, *, debug=False):
    """Prepare a colour screenshot for tesseract.

    Pipeline: grayscale -> enlarge by *scale* percent -> Gaussian blur ->
    Otsu binarisation -> invert (tesseract prefers dark text on a light
    background).

    Fixes relative to the original playground version:
    - removed the dead cv2.medianBlur call (its result was immediately
      overwritten by GaussianBlur applied to *resized*),
    - removed the unused 5x5 kernel,
    - the blocking cv2.waitKey() and all debug windows are now gated behind
      *debug*, so the function no longer stalls mid-pipeline.

    :param img: BGR/RGB input image
    :param scale: enlargement percentage passed to resize()
    :param threshhold: unused — Otsu picks the threshold automatically;
        kept (with its original spelling) for backward compatibility
    :param debug: show the intermediate images and wait for a key press
    :return: inverted black-and-white image (white background, black text)
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    resized = resize(grayscale, scale)
    blurred = cv2.GaussianBlur(resized, (5, 5), 5)

    # Otsu derives the global threshold from the histogram, so the fixed
    # `threshhold` argument is ignored.
    blackAndWhite = cv2.threshold(
        blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    if debug:
        # Adaptive threshold shown only for visual comparison; it does not
        # feed into the returned image.
        th3 = cv2.adaptiveThreshold(
            blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV, 11, 2)
        cv2.floodFill(th3, None, (0, 0), 255)
        cv2.imshow('grayscale', grayscale)
        cv2.imshow('resized', resized)
        cv2.imshow('blurred', blurred)
        cv2.imshow('blackAndWhite', blackAndWhite)
        cv2.imshow('th3', th3)
        cv2.waitKey()

    # invert black and white colours (white background, black text)
    return cv2.bitwise_not(blackAndWhite)


# excerpt from https://www.youtube.com/watch?v=6DjFscX4I_c
def imageToText(img):
    # returns item name from image, preprocess if needed
    boxes = pytesseract.image_to_data(img)
    num = []
    for count, box in enumerate(boxes.splitlines()):
        if (count != 0):
            box = box.split()
            if (len(box) == 12):
                text = box[11].strip('@®')
                if (text != ''):
                    num.append(text)
    text = ' '.join(num)
    ## Alternate method
    # text = pytesseract.image_to_string(img)
    # print("Name:", text)
    return text


if __name__ == "__main__":
    # Preprocess the test screenshot and print the recognised text.
    processed = preprocessImage(cv2.imread("test.png"), scale=300)
    print(imageToText(processed))

    ##############################################
    ##### Detecting Words  ######
    ##############################################
    # image_to_data columns:
    #  0 level, 1 page_num, 2 block_num, 3 par_num, 4 line_num, 5 word_num,
    #  6 left, 7 top, 8 width, 9 height, 10 conf, 11 text

    data = pytesseract.image_to_data(processed)
    # back to three channels so the overlay can be drawn in colour
    img = cv2.cvtColor(processed, cv2.COLOR_GRAY2BGR)
    # draw a rectangle and the recognised word for every detected box
    for row_idx, row in enumerate(data.splitlines()):
            print(row)
            if row_idx == 0:
                continue  # header row
            fields = row.split()
            if len(fields) != 12:
                continue  # structural row, no recognised word
            x, y = int(fields[6]), int(fields[7])
            w, h = int(fields[8]), int(fields[9])
            cv2.putText(img, fields[11], (x, y - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (50, 50, 255), 2)
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255), 2)

    cv2.imshow('img', img)
    cv2.waitKey(0)
Alex
  • 3
  • 3

1 Answer

0

I couldn't get it perfect but almost...

I got a lot of benefit from CLAHE equalization. See tutorial here. But that wasn't enough. Still needed thresholding. Adaptive techniques didn't work well, but cv2.THRESH_TOZERO gives OK results. See thresholding tutorial here

import cv2
from pytesseract import image_to_string, image_to_data


# Load grayscale and upscale 2x before any processing.
img = cv2.imread('gradient.png', cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (0,0), fx=2.0, fy=2.0)

# CLAHE: local histogram equalisation flattens the orange gradient enough
# for a simple global threshold to separate the text afterwards.
equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
img = equalizer.apply(img)

# invert image. tesseract prefers black text on white background
# (bitwise_not is identical to 255 - img for uint8 images)
img = cv2.bitwise_not(img)

# THRESH_TOZERO keeps the grey detail above the cutoff instead of hard
# binarising — adaptive techniques worked worse here.
ret, img = cv2.threshold(img, 127, 255, cv2.THRESH_TOZERO)

cv2.imwrite('output.png', img)
print(image_to_string(img, config='--psm 6'))

which gives ocr output

Tool Crafting Part
Apprentice Craft Kit
Adept Craft Kit
Expert Craft Kit
=
Master Craft Kit
1/1
bfris
  • 5,272
  • 1
  • 20
  • 37