When deployed on lambda, AWS Textract doesn't return 'LINE' or 'WORD' blocks, only 'PAGE' blocks
My function works well when I run it locally and it invokes Textract OCR. Here is the output:
{'DocumentMetadata': {
'Pages': 1},
'JobStatus': 'SUCCEEDED',
'Blocks': [
{'BlockType': 'PAGE', 'Geometry': {'BoundingBox': {'Width': 1.0, 'Height': 1.0, 'Left': 0.0, 'Top': 0.0}, 'Polygon': [{'X': 1.956095047717099e-06, 'Y': 0.0}, {'X': 1.0, 'Y': 1.9269275526312413e-06}, {'X': 1.0, 'Y': 1.0}, {'X': 0.0, 'Y': 1.0}]}, 'Id': 'e3517e8d-3eb1-4c7a-8ffe-f4f4f40bd923', 'Relationships': [{'Type': 'CHILD', 'Ids': ['3261bf02-d63c-4016-a5b2-a437308de18d']}], 'Page': 1},
{'BlockType': 'LINE', 'Confidence': 99.86868286132812, 'Text': 'John Doe Is', 'Geometry': {'BoundingBox': {'Width': 0.14190037548542023, 'Height': 0.013485041446983814, 'Left': 0.09458228200674057, 'Top': 0.06803429871797562}, 'Polygon': [{'X': 0.0945969745516777, 'Y': 0.06803429871797562}, {'X': 0.2364826500415802, 'Y': 0.06828282028436661}, {'X': 0.2364688366651535, 'Y': 0.08151934295892715}, {'X': 0.09458228200674057, 'Y': 0.08127062022686005}]}, 'Id': '3261bf02-d63c-4016-a5b2-a437308de18d', 'Relationships': [{'Type': 'CHILD', 'Ids': ['d6c105b6-2e5b-425c-bc2d-fda9c4ed1815', '223fbbe4-9a0c-4a26-8947-04bb7e535139', '1bc8b80b-4e35-4815-9843-ac901852e720']}], 'Page': 1},
{'BlockType': 'WORD', 'Confidence': 99.84158325195312, 'Text': 'John', 'TextType': 'PRINTED', 'Geometry': {'BoundingBox': {'Width': 0.05469386279582977, 'Height': 0.013332170434296131, 'Left': 0.09458228200674057, 'Top': 0.06803429871797562}, 'Polygon': [{'X': 0.0945969745516777, 'Y': 0.06803429871797562}, {'X': 0.14927615225315094, 'Y': 0.06813007593154907}, {'X': 0.14926178753376007, 'Y': 0.08136647194623947}, {'X': 0.09458228200674057, 'Y': 0.08127062022686005}]}, 'Id': 'd6c105b6-2e5b-425c-bc2d-fda9c4ed1815', 'Page': 1},
{'BlockType': 'WORD', 'Confidence': 99.9034652709961, 'Text': 'Doe', 'TextType': 'PRINTED', 'Geometry': {'BoundingBox': {'Width': 0.02554963529109955, 'Height': 0.01002489123493433, 'Left': 0.15451456606388092, 'Top': 0.0688563659787178}, 'Polygon': [{'X': 0.15452536940574646, 'Y': 0.0688563659787178}, {'X': 0.18006420135498047, 'Y': 0.06890109926462173}, {'X': 0.18005351722240448, 'Y': 0.07888125628232956}, {'X': 0.15451456606388092, 'Y': 0.07883650064468384}]}, 'Id': '223fbbe4-9a0c-4a26-8947-04bb7e535139', 'Page': 1},
{'BlockType': 'WORD', 'Confidence': 99.86099243164062, 'Text': 'Is', 'TextType': 'PRINTED', 'Geometry': {'BoundingBox': {'Width': 0.05028943344950676, 'Height': 0.010319724678993225, 'Left': 0.1861930638551712, 'Top': 0.06834923475980759}, 'Polygon': [{'X': 0.1862039864063263, 'Y': 0.06834923475980759}, {'X': 0.23648250102996826, 'Y': 0.06843730062246323}, {'X': 0.23647181689739227, 'Y': 0.07866895943880081}, {'X': 0.1861930638551712, 'Y': 0.0785808339715004}]}, 'Id': '1bc8b80b-4e35-4815-9843-ac901852e720', 'Page': 1}],
'DetectDocumentTextModelVersion': '1.0', 'ResponseMetadata': {'RequestId': 'b6179e12-c603-4a2b-8a4f-171e5c722756', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'b6179e12-c603-4a2b-8a4f-171e5c722756', 'content-type': 'application/x-amz-json-1.1', 'content-length': '2571', 'date': 'Tue, 18 Oct 2022 13:34:52 GMT'}, 'RetryAttempts': 0}}
But when I deploy the calling function on AWS Lambda, the OCR job returns only:
{'DocumentMetadata':
{'Pages': 1},
'JobStatus': 'SUCCEEDED',
'Blocks': [
{'BlockType': 'PAGE', 'Geometry': {'BoundingBox': {'Width': 1.0, 'Height': 1.0, 'Left': 0.0, 'Top': 0.0}, 'Polygon': [{'X': 3.7998024282615006e-08, 'Y': 0.0}, {'X': 1.0, 'Y': 0.000705239363014698}, {'X': 1.0, 'Y': 1.0}, {'X': 0.0, 'Y': 0.9995841383934021}]}, 'Id': '6a2b1917-731e-49ae-8620-2b61c3e75cf8', 'Page': 1}],
'DetectDocumentTextModelVersion': '1.0', 'ResponseMetadata': {'RequestId': 'cbe6c3b8-c60f-4990-b771-0620a2d08bd4', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'cbe6c3b8-c60f-4990-b771-0620a2d08bd4', 'content-type': 'application/x-amz-json-1.1', 'content-length': '388', 'date': 'Tue, 18 Oct 2022 13:09:08 GMT'}, 'RetryAttempts': 0}}
Here is my OCR code:
def OCR_V2(pdf, bucket_name):
    """Upload a PDF to S3, run a text-detection job on it, and return its text.

    Args:
        pdf: File-like object handed to ``Bucket.upload_fileobj``.
        bucket_name: Name of the S3 bucket the PDF is uploaded to; also
            passed to ``InvokeTextDetectJob`` together with the object key.

    Returns:
        The ``Text`` of every block whose ``BlockType`` is ``"LINE"``, joined
        with single spaces. An empty string when the results contain no
        ``LINE`` blocks (for example, when only ``PAGE`` blocks come back).

    Raises:
        RuntimeError: If ``CheckJobComplete`` returns a falsy value, so there
            are no results to read.
    """
    s3 = boto3.resource("s3")
    # Object key is whatever get_random_string(10) yields plus a ".pdf" suffix.
    random_name = get_random_string(10) + ".pdf"
    s3.Bucket(bucket_name).upload_fileobj(pdf, random_name)

    jobId = InvokeTextDetectJob(bucket_name, random_name)
    print('JobId: {}'.format(jobId))

    # Fail loudly here: without this guard `response` would never be bound
    # and the failure would surface later as an unrelated-looking error.
    if not CheckJobComplete(jobId):
        raise RuntimeError(
            "Text detection job {} did not complete".format(jobId)
        )

    # NOTE(review): JobResults presumably yields one response dict per result
    # page -- confirm against its definition.
    response = JobResults(jobId)

    final_response = []
    for resultPage in response:
        if "Blocks" not in resultPage:
            continue
        for item in resultPage["Blocks"]:
            if item["BlockType"] == "LINE":
                final_response.append(item["Text"])
    return " ".join(final_response)
Comments
Post a Comment