167

I am trying to find the bounding boxes of text in an image and am currently using this approach:

// calculate the local variances of the grayscale image
Mat t_mean, t_mean_2;
Mat grayF;
outImg_gray.convertTo(grayF, CV_32F);
int winSize = 35;
blur(grayF, t_mean, cv::Size(winSize,winSize));
blur(grayF.mul(grayF), t_mean_2, cv::Size(winSize,winSize));
Mat varMat = t_mean_2 - t_mean.mul(t_mean);
varMat.convertTo(varMat, CV_8U);

// threshold the high variance regions
Mat varMatRegions = varMat > 100;

When given an image like this:

enter image description here

Then when I show varMatRegions I get this image:

enter image description here

As you can see it somewhat combines the left block of text with the header of the card, for most cards this method works great but on busier cards it can cause problems.

The reason it is bad for those contours to connect is that it makes the bounding box of the contour nearly take up the entire card.

Can anyone suggest a different way I can find the text to ensure proper detection of text?

200 points to whoever can find the text in the card above the these two.

enter image description here enter image description here

herohuyongtao
  • 49,413
  • 29
  • 133
  • 174
Clip
  • 3,018
  • 8
  • 42
  • 77

10 Answers10

143

I used a gradient based method in the program below. Added the resulting images. Please note that I'm using a scaled down version of the image for processing.

c++ version

The MIT License (MIT)

Copyright (c) 2014 Dhanushka Dangampola

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

#include "stdafx.h"

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>

using namespace cv;
using namespace std;

#define INPUT_FILE              "1.jpg"
#define OUTPUT_FOLDER_PATH      string("")

int _tmain(int argc, _TCHAR* argv[])
{
    Mat large = imread(INPUT_FILE);
    Mat rgb;
    // downsample and use it for processing
    pyrDown(large, rgb);
    Mat small;
    cvtColor(rgb, small, CV_BGR2GRAY);
    // morphological gradient
    Mat grad;
    Mat morphKernel = getStructuringElement(MORPH_ELLIPSE, Size(3, 3));
    morphologyEx(small, grad, MORPH_GRADIENT, morphKernel);
    // binarize
    Mat bw;
    threshold(grad, bw, 0.0, 255.0, THRESH_BINARY | THRESH_OTSU);
    // connect horizontally oriented regions
    Mat connected;
    morphKernel = getStructuringElement(MORPH_RECT, Size(9, 1));
    morphologyEx(bw, connected, MORPH_CLOSE, morphKernel);
    // find contours
    Mat mask = Mat::zeros(bw.size(), CV_8UC1);
    vector<vector<Point>> contours;
    vector<Vec4i> hierarchy;
    findContours(connected, contours, hierarchy, CV_RETR_CCOMP, CV_CHAIN_APPROX_SIMPLE, Point(0, 0));
    // filter contours
    for(int idx = 0; idx >= 0; idx = hierarchy[idx][0])
    {
        Rect rect = boundingRect(contours[idx]);
        Mat maskROI(mask, rect);
        maskROI = Scalar(0, 0, 0);
        // fill the contour
        drawContours(mask, contours, idx, Scalar(255, 255, 255), CV_FILLED);
        // ratio of non-zero pixels in the filled region
        double r = (double)countNonZero(maskROI)/(rect.width*rect.height);

        if (r > .45 /* assume at least 45% of the area is filled if it contains text */
            && 
            (rect.height > 8 && rect.width > 8) /* constraints on region size */
            /* these two conditions alone are not very robust. better to use something 
            like the number of significant peaks in a horizontal projection as a third condition */
            )
        {
            rectangle(rgb, rect, Scalar(0, 255, 0), 2);
        }
    }
    imwrite(OUTPUT_FOLDER_PATH + string("rgb.jpg"), rgb);

    return 0;
}

python version

The MIT License (MIT)

Copyright (c) 2017 Dhanushka Dangampola

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

import cv2
import numpy as np

large = cv2.imread('1.jpg')
rgb = cv2.pyrDown(large)
small = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY)

kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
grad = cv2.morphologyEx(small, cv2.MORPH_GRADIENT, kernel)

_, bw = cv2.threshold(grad, 0.0, 255.0, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 1))
connected = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel)
# using RETR_EXTERNAL instead of RETR_CCOMP
contours, hierarchy = cv2.findContours(connected.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
#For opencv 3+ comment the previous line and uncomment the following line
#_, contours, hierarchy = cv2.findContours(connected.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

mask = np.zeros(bw.shape, dtype=np.uint8)

for idx in range(len(contours)):
    x, y, w, h = cv2.boundingRect(contours[idx])
    mask[y:y+h, x:x+w] = 0
    cv2.drawContours(mask, contours, idx, (255, 255, 255), -1)
    r = float(cv2.countNonZero(mask[y:y+h, x:x+w])) / (w * h)

    if r > 0.45 and w > 8 and h > 8:
        cv2.rectangle(rgb, (x, y), (x+w-1, y+h-1), (0, 255, 0), 2)

cv2.imshow('rects', rgb)

enter image description here enter image description here enter image description here

singrium
  • 2,746
  • 5
  • 32
  • 45
dhanushka
  • 10,492
  • 2
  • 37
  • 47
  • Is there a significant difference between your solution and William's except from image size? I'm trying to understand your approaches but the explanations are brief. – anana May 16 '14 at 05:21
  • 3
    I just had a look at his approach. Main difference I see is that he's using a Sobel filter whereas I'm using a morphological gradient filter. I think the morphological filter and downsampling flattens out much of the not-so-strong edges. Sobel might pick up more noise. – dhanushka May 16 '14 at 16:34
  • Very viable solution. Requires some improvements for my case but in general, Thanks! – Cynichniy Bandera Nov 05 '15 at 21:24
  • +1, A great solution.. In my case this could create bounding boxes around letters/letter parts by changing parameters. – Samitha Chathuranga Nov 23 '15 at 17:48
  • @dhanushka, I'm trying your solution (http://codepad.org/f8wMRmjM) but get this compile error: `tutorial.cpp:1:20: fatal error: stdafx.h: No such file or directory #include "stdafx.h"` I have located all the other includes but I can't find this file on my system – clarkk Dec 10 '15 at 00:48
  • @clarkk you don't actually need it. it's the precompiled header in MS Visual Studio. comment that line and try. – dhanushka Dec 10 '15 at 04:54
  • Could you please explain what `threshold(grad, bw, 0.0, 255.0, THRESH_BINARY | THRESH_OTSU);` does? I am familiar with thresholding, but I've got no idea what `THRESH_BINARY | THRESH_OTSU` does? – daniel451 Jul 28 '16 at 15:47
  • 2
    @ascenator When you combine OTSU with the threshold type, it uses the Otsu's threshold instead of the specified threshold value. See [here](http://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html?highlight=threshold#threshold). – dhanushka Jul 29 '16 at 01:06
  • @PiotrSobolewski I'm sorry, currently there's no python version. – dhanushka Aug 08 '16 at 15:47
  • @dhanushka I've tried you gradient method and it worked like magic with little tweaks here and there. But am wondering how I can ***Extract*** the _detected text_ for recognition (OCR) [preview](http://brianmatovu.com/wp-content/uploads/2017/03/container-code-image-processing.zip) – bmatovu Mar 12 '17 at 12:14
  • @PiotrSobolewski I translated dhanushka's code to Python, see my answer below. It doesn't detect quite as well but the major building blocks are there and hopefully someone can help me tinker the code to get it working better? – rtkaleta Apr 07 '17 at 17:32
  • Could you explain how to do it in Python after following line for(int idx = 0; idx >= 0; idx = hierarchy[idx][0]) It seems Python hierarchy is different from c++. Thanks in advance – zt1983811 Apr 17 '17 at 20:42
  • What is Mat maskROI(mask, rect); maskROI = Scalar(0, 0, 0); in Python – zt1983811 Apr 17 '17 at 21:34
  • @rtkaleta Could you show me how your Python code worked? I also try to translate it to Python, but I am not sure how to do maskROI(mask, rect); maskROI = Scalar(0, 0, 0) – zt1983811 Apr 18 '17 at 14:03
  • @dhanushka This code definitely creates great rects around words, but I would like to know does this also tells the words in that box, if not can you help with a way to detect letters in that box? – iYoung Jul 19 '17 at 10:37
  • @iYoung It does not tell you the word. You should use an OCR software for this. Check **tesseract** or the **gttext** mentioned in [herohuyongtao's answer](https://stackoverflow.com/a/23557702/2571705). I haven't used gttext, but it seems to work well for color images. Looks like you can feed it the text rectangles that you find. – dhanushka Jul 20 '17 at 03:27
  • @dhanushka I want to extract these blocks out of the original image for OCR. However, is there any method to enlarge the image and find the respective contours? – Vishnu Jayanand Mar 15 '18 at 12:39
  • 1
    @VishnuJayanand You just have to apply a scaling to the `rect`. There's one `pyrdown`, so multiply `x, y, width, height` of the `rect` by 4. – dhanushka Mar 16 '18 at 06:54
  • 1
    Could you please provide us with the third condition: the number of significant peaks in a horizontal projection or at least some lead. – ISlimani Jun 05 '18 at 12:03
  • 2
    @DforTye Take the horizontal projection of the filled contour (cv::reduce), then threshold it (say, using mean or median height). If you visualize this result, it'll look like a barcode. I think, at the time, I was thinking of counting the number of bars, and imposing a threshold on it. Now I think, if the region is clean enough, it may also help if we can feed it to an OCR and get a confidence level for each detected character to be sure that the region contains text. – dhanushka Jun 05 '18 at 12:39
  • @dhanushka i am trying python version of this code for text detection but i get following error ,rgb = cv2.pyrDown(large) cv2.error: OpenCV(3.4.3) C:\projects\opencv-python\opencv\modules\imgproc\src\pyramids.cpp:858: error: (-215:Assertion failed) !_src.empty() in function 'cv::pyrDown_' Process finished with exit code 1 – Azam Rafique Dec 24 '18 at 06:36
  • @AzamRafique May be your `imread` does not properly read the image. Please check the input image to the `pyrDown` to see if it is valid. – dhanushka Dec 24 '18 at 08:06
  • @dhanushka code is running successfully! but around the small size text it draws a combined bounding box , can you give me any suggestion. i have also changes the values of the r,w and h.. i want each text portion with seperate bounding box no matter what the size is of text – Azam Rafique Dec 26 '18 at 04:55
  • What would you suggest if I want to get the text, translate it to some other language and project the new text back to the image? (Looking to translate an image) – Suhail Gupta Dec 17 '19 at 16:47
  • @SuhailGupta If you want to project it back to the image, you may want to remove the existing text. The result wouldn't look very nice if you put the new text on top of the existing text. May be you can use image [inpainting](https://en.wikipedia.org/wiki/Inpainting) inside the detected bounding boxes to remove the existing text, then place the new text there. – dhanushka Dec 18 '19 at 07:05
136

You can detect text by finding close edge elements (inspired from a LPD):

#include "opencv2/opencv.hpp"

std::vector<cv::Rect> detectLetters(cv::Mat img)
{
    std::vector<cv::Rect> boundRect;
    cv::Mat img_gray, img_sobel, img_threshold, element;
    cvtColor(img, img_gray, CV_BGR2GRAY);
    cv::Sobel(img_gray, img_sobel, CV_8U, 1, 0, 3, 1, 0, cv::BORDER_DEFAULT);
    cv::threshold(img_sobel, img_threshold, 0, 255, CV_THRESH_OTSU+CV_THRESH_BINARY);
    element = getStructuringElement(cv::MORPH_RECT, cv::Size(17, 3) );
    cv::morphologyEx(img_threshold, img_threshold, CV_MOP_CLOSE, element); //Does the trick
    std::vector< std::vector< cv::Point> > contours;
    cv::findContours(img_threshold, contours, 0, 1); 
    std::vector<std::vector<cv::Point> > contours_poly( contours.size() );
    for( int i = 0; i < contours.size(); i++ )
        if (contours[i].size()>100)
        { 
            cv::approxPolyDP( cv::Mat(contours[i]), contours_poly[i], 3, true );
            cv::Rect appRect( boundingRect( cv::Mat(contours_poly[i]) ));
            if (appRect.width>appRect.height) 
                boundRect.push_back(appRect);
        }
    return boundRect;
}

Usage:

int main(int argc,char** argv)
{
    //Read
    cv::Mat img1=cv::imread("side_1.jpg");
    cv::Mat img2=cv::imread("side_2.jpg");
    //Detect
    std::vector<cv::Rect> letterBBoxes1=detectLetters(img1);
    std::vector<cv::Rect> letterBBoxes2=detectLetters(img2);
    //Display
    for(int i=0; i< letterBBoxes1.size(); i++)
        cv::rectangle(img1,letterBBoxes1[i],cv::Scalar(0,255,0),3,8,0);
    cv::imwrite( "imgOut1.jpg", img1);  
    for(int i=0; i< letterBBoxes2.size(); i++)
        cv::rectangle(img2,letterBBoxes2[i],cv::Scalar(0,255,0),3,8,0);
    cv::imwrite( "imgOut2.jpg", img2);  
    return 0;
}

Results:

a. element = getStructuringElement(cv::MORPH_RECT, cv::Size(17, 3) ); imgOut1 imgOut2

b. element = getStructuringElement(cv::MORPH_RECT, cv::Size(30, 30) ); imgOut1 imgOut2

Results are similar for the other image mentioned.

LovaBill
  • 5,107
  • 1
  • 24
  • 32
  • 7
    License Plate Detector. – LovaBill May 09 '14 at 13:01
  • 2
    For some cards the bounding box does not enclose all of the text, such as half a letter getting cut off. Such as this card: http://i.imgur.com/tX3XrwH.jpg How can I extend every bounding bounding boxes height and width by `n`? Thanks for the solution it works great! – Clip May 11 '14 at 16:22
  • 4
    Say `cv::Rect a;`. Enlarged by n: `a.x-=n/2;a.y-=n/2;a.width+=n;a.height+=n;`. – LovaBill May 12 '14 at 07:15
  • 2
    Hi, how do i achieve the same outcome with python cv2? – dnth Oct 28 '15 at 09:19
  • @William Can you tell why you have used 'Sobel' here ? – Bhushan Mar 01 '16 at 07:51
  • @BhushanPatil You may try without it and see if it suits you better. – LovaBill Mar 01 '16 at 09:01
  • 3
    [Book](http://www.amazon.com/Mastering-OpenCV-Practical-Computer-Projects/dp/1849517827). [Code](https://github.com/MasteringOpenCV/code/tree/master/Chapter5_NumberPlateRecognition). – LovaBill May 30 '16 at 07:16
  • @William New to programming. Can the same stuff be done for text in scripts other than English like Sanskrit/ Chinese? – Vamshi Krishna Aug 12 '16 at 11:03
  • I suppose it can. Try it out. – LovaBill Aug 22 '16 at 07:26
  • @William What if I want to Detect specific code and how could be that possible emgucv in IOS app.. Could you please guide or post a link that could help – Khan Engineer May 30 '17 at 16:19
  • @William Does the input image needs to be of dimension 1400x800 or it can be greater as well like (2448x3264)? – iYoung Jul 14 '17 at 09:56
  • What would you suggest if I want to get the text, translate it to some other language and project the new text back to the image? (Looking to translate an image) – Suhail Gupta Dec 17 '19 at 16:48
  • You need to apply OCR, to translate the text using a translation service, to remove the original text with Content Aware filling, and then rewrite the translated text (OCR can give you the styling). – LovaBill Dec 18 '19 at 10:03
  • Could you point me to a source with OpenCV that demonstrates Content Aware filling? I read about inpainting, not sure if it means the same thing? – Suhail Gupta Dec 18 '19 at 11:57
  • This is pretty good. Is there any similar solution available in JavaScript/TypeScript? – Satyam Jun 20 '20 at 10:09
57

Here is an alternative approach that I used to detect the text blocks:

  1. Converted the image to grayscale
  2. Applied threshold (simple binary threshold, with a handpicked value of 150 as the threshold value)
  3. Applied dilation to thicken lines in image, leading to more compact objects and less white space fragments. Used a high value for number of iterations, so dilation is very heavy (13 iterations, also handpicked for optimal results).
  4. Identified contours of objects in resulted image using opencv findContours function.
  5. Drew a bounding box (rectangle) circumscribing each contoured object - each of them frames a block of text.
  6. Optionally discarded areas that are unlikely to be the object you are searching for (e.g. text blocks) given their size, as the algorithm above can also find intersecting or nested objects (like the entire top area for the first card) some of which could be uninteresting for your purposes.

Below is the code written in python with pyopencv, it should easy to port to C++.

import cv2

image = cv2.imread("card.png")
gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY) # grayscale
_,thresh = cv2.threshold(gray,150,255,cv2.THRESH_BINARY_INV) # threshold
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,3))
dilated = cv2.dilate(thresh,kernel,iterations = 13) # dilate
_, contours, hierarchy = cv2.findContours(dilated,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE) # get contours

# for each contour found, draw a rectangle around it on original image
for contour in contours:
    # get rectangle bounding contour
    [x,y,w,h] = cv2.boundingRect(contour)

    # discard areas that are too large
    if h>300 and w>300:
        continue

    # discard areas that are too small
    if h<40 or w<40:
        continue

    # draw rectangle around contour on original image
    cv2.rectangle(image,(x,y),(x+w,y+h),(255,0,255),2)

# write original image with added contours to disk  
cv2.imwrite("contoured.jpg", image) 

The original image is the first image in your post.

After preprocessing (grayscale, threshold and dilate - so after step 3) the image looked like this:

Dilated image

Below is the resulted image ("contoured.jpg" in the last line); the final bounding boxes for the objects in the image look like this:

enter image description here

You can see the text block on the left is detected as a separate block, delimited from its surroundings.

Using the same script with the same parameters (except for thresholding type that was changed for the second image like described below), here are the results for the other 2 cards:

enter image description here

enter image description here

Tuning the parameters

The parameters (threshold value, dilation parameters) were optimized for this image and this task (finding text blocks) and can be adjusted, if needed, for other cards images or other types of objects to be found.

For thresholding (step 2), I used a black threshold. For images where text is lighter than the background, such as the second image in your post, a white threshold should be used, so replace thesholding type with cv2.THRESH_BINARY). For the second image I also used a slightly higher value for the threshold (180). Varying the parameters for the threshold value and the number of iterations for dilation will result in different degrees of sensitivity in delimiting objects in the image.

Finding other object types:

For example, decreasing the dilation to 5 iterations in the first image gives us a more fine delimitation of objects in the image, roughly finding all words in the image (rather than text blocks):

enter image description here

Knowing the rough size of a word, here I discarded areas that were too small (below 20 pixels width or height) or too large (above 100 pixels width or height) to ignore objects that are unlikely to be words, to get the results in the above image.

Community
  • 1
  • 1
anana
  • 1,461
  • 10
  • 11
  • 2
    You are amazing! I will try this in the morning. – Clip May 09 '14 at 05:54
  • I added another step for discarding uninteresting objects; also added example for identifying words or other types of objects (than blocks of text) – anana May 09 '14 at 17:09
  • Thanks for the detailed answer, however I am getting an error in `cv2.findContours`. It says `ValueError: too many values to unpack`. – Abhijith Mar 22 '17 at 09:59
  • 1
    The issue is the function `cv2.findContours` returns 3 arguments, and the original code captures only 2. – Abhijith Mar 23 '17 at 16:02
  • @Abhijith cv2 in version two returned two args, but now, in version three, it returns 3 – Tomasz Giba Feb 27 '18 at 20:42
  • @anana after step 3 i get dilated image but results are not good, how can i get correct results like your image? – Azam Rafique Dec 25 '18 at 04:25
  • @AzamRafique Did you try varying the number of iterations (and the parameters in general) depending on how big and far apart your letters are? If the bounding boxes turn out too big, you should probably decrease dilation, if they are too small - increase it. If the image you are using is very noisy, it's possible some words will be harder to extract anyway. – anana Jan 08 '19 at 13:12
30

@dhanushka's approach showed the most promise but I wanted to play around in Python so went ahead and translated it for fun:

import cv2
import numpy as np
from cv2 import boundingRect, countNonZero, cvtColor, drawContours, findContours, getStructuringElement, imread, morphologyEx, pyrDown, rectangle, threshold

large = imread(image_path)
# downsample and use it for processing
rgb = pyrDown(large)
# apply grayscale
small = cvtColor(rgb, cv2.COLOR_BGR2GRAY)
# morphological gradient
morph_kernel = getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
grad = morphologyEx(small, cv2.MORPH_GRADIENT, morph_kernel)
# binarize
_, bw = threshold(src=grad, thresh=0, maxval=255, type=cv2.THRESH_BINARY+cv2.THRESH_OTSU)
morph_kernel = getStructuringElement(cv2.MORPH_RECT, (9, 1))
# connect horizontally oriented regions
connected = morphologyEx(bw, cv2.MORPH_CLOSE, morph_kernel)
mask = np.zeros(bw.shape, np.uint8)
# find contours
im2, contours, hierarchy = findContours(connected, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
# filter contours
for idx in range(0, len(hierarchy[0])):
    rect = x, y, rect_width, rect_height = boundingRect(contours[idx])
    # fill the contour
    mask = drawContours(mask, contours, idx, (255, 255, 2555), cv2.FILLED)
    # ratio of non-zero pixels in the filled region
    r = float(countNonZero(mask)) / (rect_width * rect_height)
    if r > 0.45 and rect_height > 8 and rect_width > 8:
        rgb = rectangle(rgb, (x, y+rect_height), (x+rect_width, y), (0,255,0),3)

Now to display the image:

from PIL import Image
Image.fromarray(rgb).show()

Not the most Pythonic of scripts but I tried to resemble the original C++ code as closely as possible for readers to follow.

It works almost as well as the original. I'll be happy to read suggestions how it could be improved/fixed to resemble the original results fully.

enter image description here

enter image description here

enter image description here

rtkaleta
  • 681
  • 6
  • 14
  • 4
    Thank you for providing a python version. Many people will find this useful. +1 – dhanushka Apr 08 '17 at 05:11
  • what's the difference between filling the contour and drawing it? I found a code without the filling phase here: https://stackoverflow.com/a/23556997/6837132 – SarahData Sep 06 '17 at 09:46
  • @SarahM I don't know if you are asking about the generic diff between drawing and filling (fairly obvious I think?) or the OpenCV API specifically? If the latter then see the [docs](http://docs.opencv.org/2.4/modules/imgproc/doc/structural_analysis_and_shape_descriptors.html?highlight=drawcontours#drawcontours) for `drawContours` that state "The function draws contour outlines in the image if thickness > 0 or fills the area bounded by the contours if thickness < 0." It's done so we can check the ratio of non-zero pixels to decide if the box likely contains text. – rtkaleta Sep 12 '17 at 12:20
15

You can try this method that is developed by Chucai Yi and Yingli Tian.

They also share a software (which is based on Opencv-1.0 and it should run under Windows platform.) that you can use (though no source code available). It will generate all the text bounding boxes (shown in color shadows) in the image. By applying to your sample images, you will get the following results:

Note: to make the result more robust, you can further merge adjacent boxes together.


Update: If your ultimate goal is to recognize the texts in the image, you can further check out gttext, which is an OCR free software and Ground Truthing tool for Color Images with Text. Source code is also available.

With this, you can get recognized texts like:

herohuyongtao
  • 49,413
  • 29
  • 133
  • 174
6

Above Code JAVA version: Thanks @William

public static List<Rect> detectLetters(Mat img){    
    List<Rect> boundRect=new ArrayList<>();

    Mat img_gray =new Mat(), img_sobel=new Mat(), img_threshold=new Mat(), element=new Mat();
    Imgproc.cvtColor(img, img_gray, Imgproc.COLOR_RGB2GRAY);
    Imgproc.Sobel(img_gray, img_sobel, CvType.CV_8U, 1, 0, 3, 1, 0, Core.BORDER_DEFAULT);
    //at src, Mat dst, double thresh, double maxval, int type
    Imgproc.threshold(img_sobel, img_threshold, 0, 255, 8);
    element=Imgproc.getStructuringElement(Imgproc.MORPH_RECT, new Size(15,5));
    Imgproc.morphologyEx(img_threshold, img_threshold, Imgproc.MORPH_CLOSE, element);
    List<MatOfPoint> contours = new ArrayList<MatOfPoint>();
    Mat hierarchy = new Mat();
    Imgproc.findContours(img_threshold, contours,hierarchy, 0, 1);

    List<MatOfPoint> contours_poly = new ArrayList<MatOfPoint>(contours.size());

     for( int i = 0; i < contours.size(); i++ ){             

         MatOfPoint2f  mMOP2f1=new MatOfPoint2f();
         MatOfPoint2f  mMOP2f2=new MatOfPoint2f();

         contours.get(i).convertTo(mMOP2f1, CvType.CV_32FC2);
         Imgproc.approxPolyDP(mMOP2f1, mMOP2f2, 2, true); 
         mMOP2f2.convertTo(contours.get(i), CvType.CV_32S);


            Rect appRect = Imgproc.boundingRect(contours.get(i));
            if (appRect.width>appRect.height) {
                boundRect.add(appRect);
            }
     }

    return boundRect;
}

And use this code in practice :

        System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
        Mat img1=Imgcodecs.imread("abc.png");
        List<Rect> letterBBoxes1=Utils.detectLetters(img1);

        for(int i=0; i< letterBBoxes1.size(); i++)
            Imgproc.rectangle(img1,letterBBoxes1.get(i).br(), letterBBoxes1.get(i).tl(),new Scalar(0,255,0),3,8,0);         
        Imgcodecs.imwrite("abc1.png", img1);
Fariz Aghayev
  • 649
  • 5
  • 17
3

This is a C# version of the answer from dhanushka using OpenCVSharp

        Mat large = new Mat(INPUT_FILE);
        Mat rgb = new Mat(), small = new Mat(), grad = new Mat(), bw = new Mat(), connected = new Mat();

        // downsample and use it for processing
        Cv2.PyrDown(large, rgb);
        Cv2.CvtColor(rgb, small, ColorConversionCodes.BGR2GRAY);

        // morphological gradient
        var morphKernel = Cv2.GetStructuringElement(MorphShapes.Ellipse, new OpenCvSharp.Size(3, 3));
        Cv2.MorphologyEx(small, grad, MorphTypes.Gradient, morphKernel);

        // binarize
        Cv2.Threshold(grad, bw, 0, 255, ThresholdTypes.Binary | ThresholdTypes.Otsu);

        // connect horizontally oriented regions
        morphKernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(9, 1));
        Cv2.MorphologyEx(bw, connected, MorphTypes.Close, morphKernel);

        // find contours
        var mask = new Mat(Mat.Zeros(bw.Size(), MatType.CV_8UC1), Range.All);
        Cv2.FindContours(connected, out OpenCvSharp.Point[][] contours, out HierarchyIndex[] hierarchy, RetrievalModes.CComp, ContourApproximationModes.ApproxSimple, new OpenCvSharp.Point(0, 0));

        // filter contours
        var idx = 0;
        foreach (var hierarchyItem in hierarchy)
        {
            idx = hierarchyItem.Next;
            if (idx < 0)
                break;
            OpenCvSharp.Rect rect = Cv2.BoundingRect(contours[idx]);
            var maskROI = new Mat(mask, rect);
            maskROI.SetTo(new Scalar(0, 0, 0));

            // fill the contour
            Cv2.DrawContours(mask, contours, idx, Scalar.White, -1);

            // ratio of non-zero pixels in the filled region
            double r = (double)Cv2.CountNonZero(maskROI) / (rect.Width * rect.Height);
            if (r > .45 /* assume at least 45% of the area is filled if it contains text */
                 &&
            (rect.Height > 8 && rect.Width > 8) /* constraints on region size */
            /* these two conditions alone are not very robust. better to use something 
            like the number of significant peaks in a horizontal projection as a third condition */
            )
            {
                Cv2.Rectangle(rgb, rect, new Scalar(0, 255, 0), 2);
            }
        }

        rgb.SaveImage(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "rgb.jpg"));
Community
  • 1
  • 1
Diomedes Domínguez
  • 1,093
  • 12
  • 22
2

Python Implementation for @dhanushka's solution:

def process_rgb(rgb):
    hasText = False
    gray = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY)
    morphKernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3,3))
    grad = cv2.morphologyEx(gray, cv2.MORPH_GRADIENT, morphKernel)
    # binarize
    _, bw = cv2.threshold(grad, 0.0, 255.0, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # connect horizontally oriented regions
    morphKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 1))
    connected = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, morphKernel)
    # find contours
    mask = np.zeros(bw.shape[:2], dtype="uint8")
    _,contours, hierarchy = cv2.findContours(connected, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    # filter contours
    idx = 0
    while idx >= 0:
        x,y,w,h = cv2.boundingRect(contours[idx])
        # fill the contour
        cv2.drawContours(mask, contours, idx, (255, 255, 255), cv2.FILLED)
        # ratio of non-zero pixels in the filled region
        r = cv2.contourArea(contours[idx])/(w*h)
        if(r > 0.45 and h > 5 and w > 5 and w > h):
            cv2.rectangle(rgb, (x,y), (x+w,y+h), (0, 255, 0), 2)
            hasText = True
        idx = hierarchy[0][idx][0]
    return hasText, rgb
Hubert Gruniaux
  • 124
  • 2
  • 11
Akash Budhia
  • 446
  • 3
  • 11
1

You can utilize a python implementation SWTloc.

Full Disclosure : I am the author of this library

To do that :-

First and Second Image

Notice that the text_mode here is 'lb_df', which stands for Light Background Dark Foreground i.e the text in this image is going to be in darker color than the background

from swtloc import SWTLocalizer
from swtloc.utils import imgshowN, imgshow

swtl = SWTLocalizer()
# Stroke Width Transform
swtl.swttransform(imgpaths='img1.jpg', text_mode = 'lb_df',
                  save_results=True, save_rootpath = 'swtres/',
                  minrsw = 3, maxrsw = 20, max_angledev = np.pi/3)
imgshow(swtl.swtlabelled_pruned13C)

# Grouping
respacket=swtl.get_grouped(lookup_radii_multiplier=0.9, ht_ratio=3.0)
grouped_annot_bubble = respacket[2]
maskviz = respacket[4]
maskcomb  = respacket[5]

# Saving the results
_=cv2.imwrite('img1_processed.jpg', swtl.swtlabelled_pruned13C)
imgshowN([maskcomb, grouped_annot_bubble], savepath='grouped_img1.jpg')

enter image description here enter image description here


enter image description here enter image description here

Third Image

Notice that the text_mode here is 'db_lf', which stands for Dark Background Light Foreground i.e the text in this image is going to be in lighter color than the background

from swtloc import SWTLocalizer
from swtloc.utils import imgshowN, imgshow

swtl = SWTLocalizer()
# Stroke Width Transform
swtl.swttransform(imgpaths=imgpaths[1], text_mode = 'db_lf',
              save_results=True, save_rootpath = 'swtres/',
              minrsw = 3, maxrsw = 20, max_angledev = np.pi/3)
imgshow(swtl.swtlabelled_pruned13C)

# Grouping
respacket=swtl.get_grouped(lookup_radii_multiplier=0.9, ht_ratio=3.0)
grouped_annot_bubble = respacket[2]
maskviz = respacket[4]
maskcomb  = respacket[5]

# Saving the results
_=cv2.imwrite('img1_processed.jpg', swtl.swtlabelled_pruned13C)
imgshowN([maskcomb, grouped_annot_bubble], savepath='grouped_img1.jpg')

enter image description here enter image description here

You will also notice that the grouping done is not so accurate, to get the desired results as the images might vary, try to tune the grouping parameters in swtl.get_grouped() function.

Dravidian
  • 9,945
  • 3
  • 34
  • 74
0

this is a VB.NET version of the answer from dhanushka using EmguCV.

A few functions and structures in EmguCV need different consideration than the C# version with OpenCVSharp

Imports Emgu.CV
Imports Emgu.CV.Structure
Imports Emgu.CV.CvEnum
Imports Emgu.CV.Util

        Dim input_file As String = "C:\your_input_image.png"
        Dim large As Mat = New Mat(input_file)
        Dim rgb As New Mat
        Dim small As New Mat
        Dim grad As New Mat
        Dim bw As New Mat
        Dim connected As New Mat
        Dim morphanchor As New Point(0, 0)

        '//downsample and use it for processing
        CvInvoke.PyrDown(large, rgb)
        CvInvoke.CvtColor(rgb, small, ColorConversion.Bgr2Gray)

        '//morphological gradient
        Dim morphKernel As Mat = CvInvoke.GetStructuringElement(ElementShape.Ellipse, New Size(3, 3), morphanchor)
        CvInvoke.MorphologyEx(small, grad, MorphOp.Gradient, morphKernel, New Point(0, 0), 1, BorderType.Isolated, New MCvScalar(0))

        '// binarize
        CvInvoke.Threshold(grad, bw, 0, 255, ThresholdType.Binary Or ThresholdType.Otsu)

        '// connect horizontally oriented regions
        morphKernel = CvInvoke.GetStructuringElement(ElementShape.Rectangle, New Size(9, 1), morphanchor)
        CvInvoke.MorphologyEx(bw, connected, MorphOp.Close, morphKernel, morphanchor, 1, BorderType.Isolated, New MCvScalar(0))

        '// find contours
        Dim mask As Mat = Mat.Zeros(bw.Size.Height, bw.Size.Width, DepthType.Cv8U, 1)  '' MatType.CV_8UC1
        Dim contours As New VectorOfVectorOfPoint
        Dim hierarchy As New Mat

        CvInvoke.FindContours(connected, contours, hierarchy, RetrType.Ccomp, ChainApproxMethod.ChainApproxSimple, Nothing)

        '// filter contours
        Dim idx As Integer
        Dim rect As Rectangle
        Dim maskROI As Mat
        Dim r As Double
        For Each hierarchyItem In hierarchy.GetData
            rect = CvInvoke.BoundingRectangle(contours(idx))
            maskROI = New Mat(mask, rect)
            maskROI.SetTo(New MCvScalar(0, 0, 0))

            '// fill the contour
            CvInvoke.DrawContours(mask, contours, idx, New MCvScalar(255), -1)

            '// ratio of non-zero pixels in the filled region
            r = CvInvoke.CountNonZero(maskROI) / (rect.Width * rect.Height)

            '/* assume at least 45% of the area Is filled if it contains text */
            '/* constraints on region size */
            '/* these two conditions alone are Not very robust. better to use something 
            'Like the number of significant peaks in a horizontal projection as a third condition */
            If r > 0.45 AndAlso rect.Height > 8 AndAlso rect.Width > 8 Then
                'draw green rectangle
                CvInvoke.Rectangle(rgb, rect, New MCvScalar(0, 255, 0), 2)
            End If
            idx += 1
        Next
        rgb.Save(IO.Path.Combine(Application.StartupPath, "rgb.jpg"))
Hakan Usakli
  • 492
  • 5
  • 11