2

I am trying to use Apple's Vision framework to perform OCR and, at the same time, obtain the coordinates of the bounding box for each character.

Using VNDetectTextRectanglesRequest gives you that but doesn't give the OCRed text. Whereas VNRecognizeTextRequest gives you the OCRed text but doesn't give you the coordinates of the characters.

I could use both of them, but my concern is that the number of characters detected might not be equal to the number of characters recognized.

I have looked at this Stack Overflow post, but it didn't help me much: https://stackoverflow.com/questions/44533148/converting-a-vision-vntextobservation-to-a-string

Code with Detect Text:

    /// Completion handler for the VNDetectTextRectanglesRequest: collects every
    /// per-character bounding box from the detected text observations into
    /// `self.characterBoxes`.
    /// - Parameters:
    ///   - request: The completed Vision request; results are expected to be `[VNTextObservation]`.
    ///   - error: Any error reported by Vision; logged and treated as "no boxes found".
    func getCharacterBoxesHelper(request: VNRequest, error: Error?) {
        if let error = error {
            print("Text-rectangle detection failed: \(error)")
            return
        }
        // Avoid the force cast / force unwrap of the original: a failed request
        // or an observation without characterBoxes should yield no boxes, not a crash.
        guard let observations = request.results as? [VNTextObservation] else { return }
        for observation in observations {
            characterBoxes.append(contentsOf: observation.characterBoxes ?? [])
        }
    } // getCharacterBoxesHelper

    /// Runs a VNDetectTextRectanglesRequest on `croppedImage`; the resulting
    /// per-character boxes are appended to `self.characterBoxes` asynchronously
    /// on the main queue by `getCharacterBoxesHelper`.
    /// - Parameters:
    ///   - croppedImage: The image to analyze.
    ///   - findVin: Not used inside this method; kept for interface compatibility.
    func getCharacterBoxes(_ croppedImage: UIImage, findVin: Bool)  {
        self.characterBoxes = []
        // Guard instead of force-unwrapping: a UIImage backed by a CIImage has no cgImage.
        guard let cgImage = croppedImage.cgImage else {
            print("getCharacterBoxes: image has no CGImage backing")
            return
        }
        // BUG FIX: UIImage.Orientation raw values do NOT match
        // CGImagePropertyOrientation raw values (e.g. .up is 0 vs 1), so the
        // original rawValue round-trip returned nil for .up images and crashed
        // on the force unwrap. Convert case-by-case instead.
        let cgOrientation: CGImagePropertyOrientation
        switch croppedImage.imageOrientation {
        case .up:            cgOrientation = .up
        case .down:          cgOrientation = .down
        case .left:          cgOrientation = .left
        case .right:         cgOrientation = .right
        case .upMirrored:    cgOrientation = .upMirrored
        case .downMirrored:  cgOrientation = .downMirrored
        case .leftMirrored:  cgOrientation = .leftMirrored
        case .rightMirrored: cgOrientation = .rightMirrored
        @unknown default:    cgOrientation = .up
        }
        let requestHandler = VNImageRequestHandler(cgImage: cgImage, orientation: cgOrientation, options: [:])

        let request = VNDetectTextRectanglesRequest { [weak self] req, err in
            // Hop to the main queue: the handler mutates state read by the UI.
            DispatchQueue.main.async {
                self?.getCharacterBoxesHelper(request: req, error: err)
            }
        }
        request.reportCharacterBoxes = true // required to receive per-character boxes

        // Vision work is expensive; keep it off the main thread.
        DispatchQueue.global(qos: .userInitiated).async {
            do {
                try requestHandler.perform([request])
            } catch {
                print("Failed to perform image request: \(error)")
            }
        } // dispatch
    } // getCharacterBoxes
} // extension

code for recognize text:

    /// Runs a VNRecognizeTextRequest on `croppedImage`; recognized text is
    /// delivered asynchronously to `handle(request:error:)` on the main queue.
    /// - Parameters:
    ///   - croppedImage: The image to analyze (also stored in `self.croppedImage`).
    ///   - findX: Not used inside this method; kept for interface compatibility.
    func readImage(_ croppedImage: UIImage, findX: Bool) {
        self.croppedImage = croppedImage
        // Guard instead of force-unwrapping: a UIImage backed by a CIImage has no cgImage.
        guard let cgImage = croppedImage.cgImage else {
            print("readImage: image has no CGImage backing")
            return
        }
        // BUG FIX: UIImage.Orientation raw values do NOT match
        // CGImagePropertyOrientation raw values (e.g. .up is 0 vs 1), so the
        // original rawValue round-trip returned nil for .up images and crashed
        // on the force unwrap. Convert case-by-case instead.
        let cgOrientation: CGImagePropertyOrientation
        switch croppedImage.imageOrientation {
        case .up:            cgOrientation = .up
        case .down:          cgOrientation = .down
        case .left:          cgOrientation = .left
        case .right:         cgOrientation = .right
        case .upMirrored:    cgOrientation = .upMirrored
        case .downMirrored:  cgOrientation = .downMirrored
        case .leftMirrored:  cgOrientation = .leftMirrored
        case .rightMirrored: cgOrientation = .rightMirrored
        @unknown default:    cgOrientation = .up
        }
        let requestHandler = VNImageRequestHandler(cgImage: cgImage, orientation: cgOrientation, options: [:])

        // BUG FIX: the original had one extra closing brace here, which ended
        // the function early and left the configuration code outside any scope.
        let request = VNRecognizeTextRequest { [weak self] req, err in
            DispatchQueue.main.async {
                self?.handle(request: req, error: err)
            }
        }
        request.recognitionLevel = .accurate
        request.usesLanguageCorrection = true

        // Send the request to the request handler off the main thread.
        DispatchQueue.global(qos: .userInitiated).async {
            do {
                try requestHandler.perform([request])
            } catch {
                print("Failed to perform image request: \(error)")
            }
        }
    }

    /// Completion handler for the VNRecognizeTextRequest: gathers the top
    /// candidate string of every observation.
    /// NOTE(review): the collected array is local and is discarded, exactly as
    /// in the original — presumably the real code stores or displays it; confirm.
    func handle(request: VNRequest, error: Error?) {
        guard let observations = request.results as? [VNRecognizedTextObservation] else {
            print("No observation detected")
            return
        }
        // One top candidate per observation; observations without a candidate are skipped.
        let recognizedTextArray = observations.compactMap { $0.topCandidates(1).first?.string }
        _ = recognizedTextArray
    }
low_queue
  • 65
  • 3

1 Answer

1

VNRecognizedText has a boundingBox method that calculates the bounding box around the characters in the range of the string.

https://developer.apple.com/documentation/vision/vnrecognizedtext/3213755-boundingbox

// Extract each recognized string and the quadrilateral Vision reports for it.
guard let results = request.results as? [VNRecognizedTextObservation] else { return }

for result in results {
    if let text = result.topCandidates(1).first {
        // Use the string's full index range directly. The original
        // `text.string.range(of: text.string)!` crashes when the recognized
        // string is empty (range(of:) returns nil) and performs a needless
        // substring search otherwise.
        let fullRange = text.string.startIndex..<text.string.endIndex
        if let box = try? text.boundingBox(for: fullRange) {
            print("String coordinates for \"\(text.string)\":")
            print("\tTop left: \(box.topLeft)")
            print("\tTop right: \(box.topRight)")
            print("\tBottom left: \(box.bottomLeft)")
            print("\tBottom right: \(box.bottomRight)")
        }
    }
}

YulkyTulky
  • 886
  • 1
  • 6
  • 20
  • 1
    It's worth mentioning that the discussion for that method in the docs says not to use the box for any image processing since it's not guaranteed to be accurate. It's essentially just a helpful hint. Given the original question, it sounds like this might not give robust enough information if the desire is to have accurate per-character boxes. I recently ran into a similar issue where the bounding boxes for the first two characters in a 3 digit number string were the same as the full string's bounding box. – Adam Evans Jul 09 '22 at 23:51