3

I'd like to convert document images to XML and also export the location where a certain word has been found within a page. In order to access bounding box information, tesseract's layout analysis can be used:

 tess.SetImage(...); 
 tess.SetPageSegMode( tesseract::PSM_AUTO_OSD); 
 tesseract::PageIterator* it = tess.AnalyseLayout(); 
 while(it->Next(tesseract::RIL_WORD)
 {
      int top, bottom, left, right; 
      it->BoundingBox(tesseract::RIL_WORD, &left, &top, &right, &bottom); 

 }

At that point, however, I don't know the actual content of a bounding box. Executing the following code performs OCR on the entire current image, so `text` contains the whole text of the page rather than the text of a single box.

 tess.Recognize(0); 
 std::string text = tess.GetUTF8Text(); 

Currently I temporarily store all bounding boxes in a vector. For each box I cut out a subimage from the original one and perform OCR for each bounding box. Basically this works, but when I compare the results to the Tesseract Command Line Tool, far more OCR errors occur.

Therefore, I'd like to know how I can iterate through the OCR result word by word and get the corresponding bounding box for each word.

Christoph Rackwitz
  • 11,317
  • 4
  • 27
  • 36
Pedro
  • 4,100
  • 10
  • 58
  • 96

2 Answers2

2
// Run recognition, then walk the page results one word at a time.
// NOTE(review): page_res_ is an internal member of the API object; this only
// works where that member is accessible (older Tesseract versions).
tess.Recognize(0);

PAGE_RES_IT resultIter(page_res_);

for (resultIter.start_page(false); resultIter.block() != NULL; resultIter.forward())
{
    WERD_RES* wordResult = resultIter.word();

    // Best recognition hypothesis for the current word.
    WERD_CHOICE* word = wordResult->best_choice;

    // bounding_box() returns the box by value, so keep a copy; binding the
    // temporary to a non-const reference is not valid standard C++.
    TBOX box = wordResult->word->bounding_box();
}
Michael Anderson
  • 70,661
  • 7
  • 134
  • 187
Vasant
  • 86
  • 4
  • 1
    you seem to be using an older version of Tesseract. In v3.01 `page_res_` is inaccessible, instead `tesseract::ResultIterator* it = tess.GetIterator()` has to be used. – Pedro Jul 11 '12 at 17:41
0
// Walks the recognition results word by word. For every word it prints the
// text, confidence and bounding box, appends the text to retText, and draws a
// blue outline of the box on the scroll view's layer.
NSString *retText = @"";
// The caller owns the iterator returned by GetIterator(); it is deleted below.
tesseract::ResultIterator *ri = tess.GetIterator();
tesseract::PageIteratorLevel level = tesseract::RIL_WORD;

if (ri != NULL) {
  do {
    // GetUTF8Text() returns a heap buffer that must be released with delete[].
    const char *word = ri->GetUTF8Text(level);
    float conf = ri->Confidence(level);
    int x1 = 0, y1 = 0, x2 = 0, y2 = 0;
    // BoundingBox() reports whether a box exists at the current position;
    // the coordinates are only meaningful when it returns true.
    BOOL hasBox = ri->BoundingBox(level, &x1, &y1, &x2, &y2) ? YES : NO;

    if (word) {
      printf("word: '%s';  \tconf: %.2f; BoundingBox: %d,%d,%d,%d;\n", word,
             conf, x1, y1, x2, y2);

      // stringWithCString: returns nil when the bytes cannot be decoded;
      // skip the append in that case instead of inserting "(null)".
      NSString *temp =
          [NSString stringWithCString:word encoding:NSUTF8StringEncoding];
      if (temp != nil) {
        retText = [NSString stringWithFormat:@"%@ %@", retText, temp];
        retText = [retText stringByReplacingOccurrencesOfString:@"[\\\""
                                                     withString:@""];
        retText = [retText stringByReplacingOccurrencesOfString:@"\n\n"
                                                     withString:@""];
      }

      if (hasBox) {
        // Trace the word's rectangle corner by corner, ending at the start.
        UIBezierPath *path = [UIBezierPath bezierPath];

        [path moveToPoint:CGPointMake(x1, y1)];
        [path addLineToPoint:CGPointMake(x2, y1)];
        [path addLineToPoint:CGPointMake(x2, y2)];
        [path addLineToPoint:CGPointMake(x1, y2)];
        [path addLineToPoint:CGPointMake(x1, y1)];

        CAShapeLayer *shapeLayer = [CAShapeLayer layer];
        shapeLayer.path = [path CGPath];
        shapeLayer.strokeColor = [[UIColor blueColor] CGColor];
        shapeLayer.lineWidth = 3.0;
        shapeLayer.fillColor = [[UIColor clearColor] CGColor];

        [self.scrollView.layer addSublayer:shapeLayer];
      }

      delete[] word;
    }
  } while (ri->Next(level));

  // Release the iterator now that iteration is finished.
  delete ri;
}
Can Ürek
  • 641
  • 12
  • 24