There isn't a straightforward example, so here is one using the official segmenter/de-segmenter from AWS: https://github.com/aws-samples/amazon-comprehend-s3-object-lambda-functions/blob/main/src/processors.py#L172. I converted the code in the link into a package and then imported the Segmenter. I had to adjust the imports in all of the Python scripts.
Instructions to replicate code below:
- go to link and download entire src folder
- rename the folder "comprehend_utils"
- fix imports in all files until the below code runs
from comprehend_utils.processors import Segmenter
import boto3
import os
import pandas as pd
# AWS credentials for boto3 — fill in your own values before running.
# NOTE(review): hard-coding credentials in source is unsafe for anything
# beyond a throwaway demo; prefer a shared credentials file, an AWS
# profile, or environment variables set outside the script.
os.environ['AWS_SECRET_ACCESS_KEY'] = ''
os.environ['AWS_ACCESS_KEY_ID']=''
def get_results(text, region_name='us-east-1'):
    """Run Amazon Comprehend Medical entity detection on a text segment.

    Args:
        text: The text to analyze. Must fit within the Comprehend Medical
            per-request size limit (the caller segments the document first).
        region_name: AWS region to call; defaults to 'us-east-1' to match
            the original script's behavior.

    Returns:
        The list of entity dicts from the ``detect_entities`` response.
    """
    client = boto3.client(service_name='comprehendmedical',
                          region_name=region_name)
    result = client.detect_entities(Text=text)
    entities = result['Entities']
    # Echo each detected entity for visibility while the script runs.
    for entity in entities:
        print('Entity', entity)
    return entities
## Text file to process — segment, detect PII entities per segment,
## re-assemble, and dump the combined entities to CSV.
file_name = 'yourfile.txt'

# Read the whole document as UTF-8 text (text mode decodes for us,
# so there is no need for a separate bytes -> str step).
with open(file_name, encoding='utf-8') as f:
    text = f.read()

# 2000 = maximum characters per segment, kept safely under the
# Comprehend Medical per-request text size limit.
MAX_SEGMENT_CHARS = 2000
segmentor = Segmenter(MAX_SEGMENT_CHARS)
document_list = segmentor.segment(text)

# Run entity detection on each segment and attach the results so
# de_segment can merge them with corrected offsets.
for segment in document_list:
    segment.pii_entities = get_results(segment.text)

final_output = segmentor.de_segment(document_list)

# One row per detected entity; CSV lands next to the input file.
df = pd.DataFrame(final_output.pii_entities)
df.to_csv(f'{file_name}_output.csv')