I am running an OCR processing function in AWS Lambda using TesseractJS. I had to raise the Lambda function's memory to the maximum (1536 MB) to keep it from crashing because of memory issues. Even with this, the process comes close to the limit:
Duration: 54133.61 ms Billed Duration: 54200 ms Memory Size: 1536 MB Max Memory Used: 1220 MB
The strange thing, and the reason I am posting this question, is that I don't understand why this takes so much memory. If I run the same process in my development environment, which has merely 512 MB of memory, it still completes without any problems at all.
The images I am using for these tests are only around 350 KB.
Here is a snippet of my code:
Tesseract.recognize(img)
.catch(err => reject(err))
.then(function(result) {
Tesseract.terminate();
console.log(result);
}));
});
Here is a more complete version of my code:
lambda.js:
exports.handler = function(event, context, callback) {
let snsMessage = getSNSMessageObject(
JSON.stringify(event.Records[0].Sns.Message));
let bucket = snsMessage.Records[0].s3.bucket.name;
let key = snsMessage.Records[0].s3.object.key;
let bookId = key.split('.')[0].split('/')[0];
let pageNum = key.split('.')[0].split('/')[1];
s3.getImage(bucket, key)
.then(function(data) {
return ocr.recognizeImage(data.Body);
})
.then(function(result) {
return s3.uploadOCR(bucket, bookId, pageNum, result);
})
.then(fulfilled => callback(null))
.catch(error => callback(error, 'Error'));
};
Helper functions:
getImage: function getImage(bucket, key) {
// Obtener la imagen de S3
let params = {Bucket: bucket, Key: key};
return s3.getObject(params).promise();
},
uploadOCR: function uploadOCR(bucket, bookId, pageNum, ocr) {
// Subir el OCR JSON a S3
let params = {
Bucket: bucket,
Key: (bookId + '/' + pageNum + '.json'),
Body: ocr,
ContentType: 'application/json'
};
return s3.putObject(params).promise();
}
recognizeImage: function recognizeImage(img) {
return new Promise(function(resolve, reject) {
// Procesar con TesseractJS
Tesseract.recognize(img)
.catch(err => reject(err))
.then(function(result) {
Tesseract.terminate();
let ocr = {};
ocr['paragraphs'] = result.paragraphs.map(
p => ({'bbox': p.bbox, 'baseline': p.baseline,
'lines': p.lines.map(
l => ({'bbox': l.bbox, 'baseline': l.baseline,
'words': l.words.map(
w => ({'text': w.text, 'bbox': w.bbox,
'baseline': w.baseline}))
}))
}));
resolve(JSON.stringify(ocr));
});
});