I'm not sure whether there is a standardized way to do this, but the Vision API does give us everything we need to compose the text of each block, including the relevant breaks (see the Vision API break types). So we can enumerate the blocks and build each block's text from its symbols.
There are a couple of other break types I'm not accounting for (HYPHEN, SURE_SPACE), but I think it should be easy to add these.
For example:
const vision = require('@google-cloud/vision');
// Vision client configured with the key file 'APIKey.json'.
const client = new vision.ImageAnnotatorClient({ keyFilename: 'APIKey.json' });

// Success handler: print the per-block text extracted from the detection results.
const logTextBlocks = (results) => {
  console.log("Text blocks: ", getTextBlocks(results));
};

// Failure handler: report any error from the request or from the success handler.
const logDetectionError = (err) => {
  console.error("An error occurred: ", err);
};

// Run document text detection on 'image.jpg' and log the outcome.
client.documentTextDetection('image.jpg').then(logTextBlocks).catch(logDetectionError);
/**
 * Flattens document-text-detection results into a list of text blocks.
 *
 * Every block on every page of every result is visited in order and turned
 * into `{ blockIndex, text }`, where `blockIndex` is a running counter across
 * all results and pages, and `text` comes from `getBlockText(block)`.
 *
 * Entries that are missing, or that have no `fullTextAnnotation`, are skipped
 * instead of throwing; the same applies to pages without a `blocks` array.
 *
 * @param {Array<object>} visionResults - The array passed on by `documentTextDetection`.
 * @returns {Array<{blockIndex: number, text: string}>} One entry per block.
 */
function getTextBlocks(visionResults) {
  const textBlocks = [];
  for (const result of visionResults) {
    const annotation = result && result.fullTextAnnotation;
    if (!annotation) {
      // Nothing to extract from this entry.
      continue;
    }
    for (const page of annotation.pages || []) {
      for (const block of page.blocks || []) {
        // The current length is the running index of the block being added.
        textBlocks.push({ blockIndex: textBlocks.length, text: getBlockText(block) });
      }
    }
  }
  return textBlocks;
}
/**
 * Rebuilds the plain text of a single block from its symbols.
 *
 * Walks paragraphs -> words -> symbols, appending each symbol's text followed
 * by the separator implied by `symbol.property.detectedBreak.type`:
 *
 *   SPACE, SURE_SPACE  -> " "
 *   EOL_SURE_SPACE     -> " " then "\n"
 *   LINE_BREAK         -> "\n"
 *   HYPHEN             -> "-" then "\n" (assumes the wrap hyphen is not part of
 *                         the symbol text and should be restored; confirm
 *                         against the BreakType reference)
 *
 * Any other break type, or a symbol without a detected break, adds nothing.
 * Missing `paragraphs`, `words` or `symbols` arrays are treated as empty.
 *
 * @param {object} block - A block shaped as `paragraphs[].words[].symbols[]`.
 * @returns {string} The reconstructed text of the block.
 */
function getBlockText(block) {
  let result = '';
  for (const paragraph of (block && block.paragraphs) || []) {
    for (const word of paragraph.words || []) {
      for (const symbol of word.symbols || []) {
        result += symbol.text;

        const detectedBreak = symbol.property && symbol.property.detectedBreak;
        if (!detectedBreak) {
          continue;
        }

        switch (detectedBreak.type) {
          case 'SPACE':
          case 'SURE_SPACE':
            result += " ";
            break;
          case 'EOL_SURE_SPACE':
            result += " \n";
            break;
          case 'LINE_BREAK':
            result += "\n"; // Perhaps use os.EOL for correctness.
            break;
          case 'HYPHEN':
            result += "-\n";
            break;
          default:
            // UNKNOWN or unrecognised break types contribute no separator.
            break;
        }
      }
    }
  }
  return result;
}