I have imported a pytesseract layer into my AWS Lambda function and I am using it, but whenever Tesseract OCR detects a language other than English, the response contains escape codes like this:
{"statusCode": 200, "body": "\"\\u09ad\\u09bf\\u09a7\\u09cd\\u09be\\u09ac\\u09c1\\u09af\\u09bc\\u09c7\\u09b\\u09a4\\u09be\\u09b0\\u0995\\u09be ""}
I have included all of the code below. Can someone explain what is going wrong here? I supplied a Bengali image to the OCR, and it gave me the result above.
import json
import base64
from PIL import Image, ImageFilter
import pytesseract
def ocr(img):
    """Run Tesseract on *img* and return the recognized text.

    The image is passed straight to ``pytesseract.image_to_string`` with a
    fixed configuration: the ``eng``, ``equ`` and ``ben`` language packs
    combined, and page segmentation mode 6.
    """
    tesseract_options = r'-l eng+equ+ben --psm 6'
    return pytesseract.image_to_string(img, config=tesseract_options)
def lambda_handler(event, context):
    """AWS Lambda entry point: OCR a base64-encoded image and return its text.

    The image is decoded to ``/tmp/saved_img.png``, converted to grayscale,
    smoothed with a median filter and passed to ``ocr``.

    Args:
        event: Lambda invocation event (currently unused).
        context: Lambda runtime context (currently unused).

    Returns:
        A dict with ``statusCode`` 200 and a ``body`` holding the recognized
        text as a JSON string. Non-ASCII characters (e.g. Bengali) are
        written as-is rather than as ``\\uXXXX`` escape sequences.
    """
    # Placeholder payload: substitute the real base64-encoded image here.
    body_image64 = "Base 64 Image"

    # Decode & save image to /tmp
    with open("/tmp/saved_img.png", "wb") as f:
        f.write(base64.b64decode(body_image64))

    # Open the image in a context manager so the file handle is released
    # once preprocessing is done; convert()/filter() return new images that
    # stay usable after the source image is closed.
    with Image.open("/tmp/saved_img.png") as im:
        # preprocessing
        gr_image = im.convert('L')  # grayscale
        gr_image = gr_image.filter(ImageFilter.MedianFilter())
        print(im)

    # Ocr
    ocr_text = ocr(gr_image)

    return {
        'statusCode': 200,
        # json.dumps escapes every non-ASCII character as \uXXXX by default,
        # which is why Bengali output looked like "codes". Both forms are
        # valid JSON and parse to the same text; ensure_ascii=False just
        # keeps the characters readable in the raw response.
        'body': json.dumps(ocr_text, ensure_ascii=False)
    }