Below is the response from a GCP Natural Language custom classification model. I have built a dictionary whose keys are the GCS paths (file names) and whose values are the prediction payloads returned by the model. I am trying to extract (parse) the gcs_path (key) of every entry whose highest-scoring class (display_name) is HIC. For the response below, the expected output is therefore gs://talem_mapfre_ocr/invoice-doc0.pdf. I have tried converting the response to JSON, but the payload object is not JSON serializable. Any leads would be appreciated.
{'gs://talem_mapfre_ocr/invoice-doc0.pdf': payload {
annotation_spec_id: "2184948407204839424"
classification {
score: 0.9999998807907104
}
display_name: "HIC"
}
payload {
annotation_spec_id: "6796634425632227328"
classification {
score: 1.0604375688672008e-07
}
display_name: "Not_HIC"
}
preprocessed_input {
document {
document_text {
}
page_count: 1
}
}
, 'gs://talem_mapfre_ocr/invoice-doc1.pdf': payload {
annotation_spec_id: "6796634425632227328"
classification {
score: 1.0
}
display_name: "Not_HIC"
}
payload {
annotation_spec_id: "2184948407204839424"
classification {
score: 2.7873853178801136e-16
}
display_name: "HIC"
}
preprocessed_input {
document {
document_text {
}
page_count: 1
}
}
, 'gs://talem_mapfre_ocr/invoice-doc2.pdf': payload {
annotation_spec_id: "6796634425632227328"
classification {
score: 1.0
}
display_name: "Not_HIC"
}
payload {
annotation_spec_id: "2184948407204839424"
classification {
score: 8.088715808263203e-10
}
display_name: "HIC"
}
preprocessed_input {
document {
document_text {
}
page_count: 1
}
}
, 'gs://talem_mapfre_ocr/invoice-doc3.pdf': payload {
annotation_spec_id: "6796634425632227328"
classification {
score: 1.0
}
display_name: "Not_HIC"
}
payload {
annotation_spec_id: "2184948407204839424"
classification {
score: 1.4505006917797658e-10
}
display_name: "HIC"
}
preprocessed_input {
document {
document_text {
}
page_count: 1
}
}
, 'gs://talem_mapfre_ocr/single_file_for_splitting.pdf': payload {
annotation_spec_id: "6796634425632227328"
classification {
score: 0.999998927116394
}
display_name: "Not_HIC"
}
payload {
annotation_spec_id: "2184948407204839424"
classification {
score: 1.0924828757197247e-06
}
display_name: "HIC"
}
preprocessed_input {
document {
document_text {
}
page_count: 4
}
}
}
Here is the code for getting the result (response):
def pdf_payload(file_path: str) -> dict:
    """Build the prediction payload for a single document stored in GCS.

    Args:
        file_path: URI of the document; it is placed as the only entry of
            ``input_uris``.

    Returns:
        A nested dict of the form
        ``{'document': {'input_config': {'gcs_source': {'input_uris': [...]}}}}``.
    """
    gcs_source = {'input_uris': [file_path]}
    return {'document': {'input_config': {'gcs_source': gcs_source}}}
def get_prediction(file_path, model_name):
    """Run one prediction on a PDF and return the service response.

    A new ``PredictionServiceClient`` pointed at ``automl.googleapis.com`` is
    created on every call.

    Args:
        file_path: URI of the PDF, forwarded to ``pdf_payload``.
        model_name: Passed as ``name`` to ``predict``; presumably the full
            resource name of the deployed model -- confirm against the caller.

    Returns:
        Whatever ``prediction_client.predict`` returns. The call is
        synchronous, so this function blocks until the service answers.
    """
    options = ClientOptions(api_endpoint='automl.googleapis.com')
    prediction_client = automl_v1.PredictionServiceClient(client_options=options)
    # NOTE: this builds a PDF (GCS document) payload. The original snippet
    # mentioned an `inline_text_payload` helper (not shown here) as the
    # alternative for predicting on inline text.
    payload = pdf_payload(file_path)
    response = prediction_client.predict(name=model_name, payload=payload)
    return response
# Find the PDFs in the current working directory.
pdf_single_pages = glob.glob("./*.pdf")

# Keep only the bare file name of each match and re-append a '.pdf' suffix.
list_of_pages = [
    PurePath(single_page).stem + '.pdf' for single_page in pdf_single_pages
]

# `file_path`, `model_name`, `gcs_paths` and `json_results` are expected to be
# defined before this point (they are not created in this snippet).
for page_name in list_of_pages:
    # Join with an explicit '/': os.path.join would insert a backslash on
    # Windows and produce an invalid URI. Assumes `file_path` is a string
    # prefix such as 'gs://bucket' -- confirm.
    new_path = file_path.rstrip('/') + '/' + page_name
    gcs_paths.append(new_path)
    result = get_prediction(new_path, model_name)
    json_results.append(result)

# Map each GCS path to its prediction response. Despite the name, the values
# are the raw objects returned by `get_prediction`, not JSON.
dict_json = dict(zip(gcs_paths, json_results))