I have been successful in sending a PDF file stored in GCS to the Document AI v1beta2 API. In the v1beta3 API, however, referencing the file is no longer supported; it requires me to send the file content inline in the JSON request body. Here is the documentation I am following: https://cloud.google.com/document-ai/docs/form-parser#v1beta3
Some questions:
What, if anything, do I have to do to the PDF content returned from a GET request? The PDF content appears to be a base64 string, which is what the API requires.
Looking at the API request, do you see anything incorrect?
REQUEST INFORMATION
ID: N/A
Method: POST
URL/Path: https://us-documentai.googleapis.com/v1beta3/projects/38072577434/locations/us/processors/cd8a06d0cd3cb045:process
Headers: Content-Type: application/json, Accept: application/json
Authorization: :censored:6:c2dc31949c: :censored:179:27504afa53:
Params: N/A
Data:
{"document":{"mimeType":"application/pdf","content":["%PDF-1.4\n1 0 obj\n<<\n/Title (��\u0000C\u0000r\u0000y\u0000s\u0000t\u0000a\u0000l\u0000 \u0000R\u0000e\u0000p\u0000o\u0000r\u0000t\u0000 \u0000V\u0000i\u0000e\u0000w\u0000e\u0000r)\n/Creator (��\u0000w\u0000k\u0000h\u0000t\u0000m\u0000l\u0000t\u0000o\u0000p\u0000d\u0000f\u0000 \u00000\u0000.\u00001\u00002\u0000.\u00005)\n/Producer (��\u0000Q\u0000t\u0000 \u00004\u0000.\u00008\u0000.\u00007)\n/CreationDate (D:20201219164504Z)\n>>\nendobj\n3 0 obj\n<<\n/Type /ExtGState\n/SA true\n/SM 0.02\n/ca 1.0\n/CA 1.0\n/AIS false\n/SMask /None>>\nendobj\n4 0 obj\n[/Pattern /DeviceRGB]\nendobj\n8 0 obj\n<<\n/Type /Annot\n/Subtype /Link\n/Rect [3.75000000 339.500000 102.750000 345.500000 ]\n/Border [0 0 0]\n/A <<\n/Type /Action\n/S /URI\n/URI (http://www.schooldude.com/)\n>>\n>>\nendobj\n9 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n5 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/Contents 10 0 R\n/Resources 12 0 R\n/Annots 13 0 R\n/MediaBox [0 0 595 842]\n>>\nendobj\n12 0 obj\n<<\n/ColorSpace <<\n/PCSp 4 0 R\n/CSp /DeviceRGB\n/CSpg /DeviceGray\n>>\n/ExtGState <<\n/GSa 3 0 R\n>>\n/Pattern <<\n>>\n/Font <<\n/F6 6 0 R\n/F7 7 0 R\n>>\n/XObject <<\n>>\n>>\nendobj\n13 0 obj\n[ 8 0 R ]\nendobj\n10 0 obj\n<<\n/Length 11 0 R\n/Filter 
/FlateDecode\n>>\nstream\nx��]M�ܸ\u0011����9���o\u0012\b\u0002x>6@\u000e\u0001\f\u001b�!�!�f\u0013,8{��\u000fI}t���\u001eq\u001e=�x�X���*=U�*V\u0015)��\u001f?����oͻ���n>�?\u001f>\u001eڣ3m�_���]~�l>9|m�\u001e>\u001c>Ŀ������\u000b)l����_���h�\u0012.~�NM_���/�k~�\u0002\u0007H\"����w\u001d�x�=�����Jex��#��#�x�nU�\u001c~J\b�j�||�s��\u001b��)��s��\u000f<\u0003�}�\u001c���û\u001f�M���O\u0011sV\\��S�z'T�����I\u0015h>�|x��AQ\u0015'�9J\u0019Z�\u001a���\u0007����C8*i������σ<�������r���rF��T�\u0003�8r��|9����7����,od�e���Y�}�gce��a�}�@'�\u0002#B�(M�g��J|�-���d\u001f��3v1�x��]4����E��d�\tc�\u0013�W���\\�\u001a�7w։:���\u000e��Vh�Ą1ZJx���dި���/�~��d�B�8x�\u00030�����|\u001f9TrI�\r�E}tM�\u0015�\u00006J�䉐\u0004�g\u0002o�BB6w��\n .�\u001e��5\u0018��[\u001a�\u0014;�\u0002%�s�D��\f y�c�ډ�Xe���P&+V�L�$f�sEF��\u0018;�ۉ��nkFO�*�,{\u0014V�Q3I܃��)b%S��]���>��ZDɍ�;!@\u000e��\u0018�M�e@\u0016�e���\u001b�w€\u001c��1@Iz��\"�\f\u0018Q�\u0018��n\u0006\u0001�j�\u0002:\u0016�d\u0006d�\\\u0006�(y\f���\u0001���\f�|�[@��[Aw�b��\\�)�xϚ*�f����P�(�\u0012}#\u0015��#\u0015���0r�ȕ\u0018\u0011Z�G��-Y����\\��[�c��\u0018��q1b$�E��h�`�\\\f�������,1�9\u0004�\u0016ۖ�ň��E�D�(���T�p��0r�ȕ\u0018\u0011\u001a6\u0017�es1a$k�*1�$ag:*�3����E�D�hz�������\\�]]@m�'%�'\u0015��ep�I�\\������?�\u0002H�+����O\"��\u001fQ2\u0019���{�@�\u0002W��PǓ�m�\u001c\u0018Q\u00129���v<\u0017T�\u0003]Ӗʁ$�́\u0011%�\u0003�\u0011�\u0013\u000e�Np5\u000etS7T\u000e$�d\u000e�(�\u001c��A;�@�\u000f�Ɓ��M�z)v����V�>�[]BAH_ک�k;5}q'[��vagw���e���ݸ\u001e\by\u0012y�1=�v��z�\u001d�\u0001Y$�\u0001#J\u001e\u0003Rj�\u0013\u0006�̮\u001a\u0003�4�ʀ,�ˀ\u0011%�\u0001)��\t\u0003r^W�\u0001]S�ʀ,�ˀ\u0011%�\u0001�ͳ\u0013\u0006�.O5\u0006�#\u0000����/v�����w9*�P���?#lbbγ|O\u0012o������m�Er�?���?��� 
@���b@�kh��]�\u0001d���.^\u0004H��>\u0018Ѝ��\u0018��5����+�4}\u0005\u0016W�[/���x��!\u0013�k�\u0012F�\u0016�\u0012#Bk�Z�\u0012�h-[�\\��1����y�\u0011�Č�\t�$љ,�=�x��bj�\u0019[�w���O\"��\u001fQ��/T�\u000b\u0001Dj�Uc@Ԭf�EF�Ӣ��\u001cD�\u0012�l�~�D��hD��#��^��H]я��3��\u0019`�\fp�a��ɀ�%%�\u0001I$�\u0001#J\u001e\u0003T?<�\u0001J��\fp��\u001f&\"���\n�X�a$o�E�\u0018)�-�$��b�0r�ȕ\u0018\u0011�\u001f~M\u0012�\u000f�v\u0018�Z�J4�\u0011/M�\u0016\"�t�:\u00171�l\u001ckcN��,�=\u0013 [...]
- Here is the error I am receiving:
{
"error": {
"code": 400,
"message": "Invalid JSON payload received. Unknown name \"content\" at 'document': Proto field is not repeating, cannot start list.",
"status": "INVALID_ARGUMENT",
"details": [
{
"@type": "type.googleapis.com/google.rpc.BadRequest",
"fieldViolations": [
{
"field": "document",
"description": "Invalid JSON payload received. Unknown name \"content\" at 'document': Proto field is not repeating, cannot start list."
}
]
}
]
}
}
2021-01-05: adding code to show how the encoding is performed:
//
// Download one URL and resolve with its body as a Buffer.
//
// `url` may be a URL string or a z.request options object. The request is
// forced into raw mode (`raw` is a request option, not a header) so the body
// is never decoded into a string: decoding a PDF as text replaces invalid
// byte sequences and corrupts the file before it can be base64 encoded.
//
const requestAsync = function(url) {
  const options = typeof url === 'string' ? { url: url } : url;
  return z.request(Object.assign({}, options, { raw: true })).then((response) => {
    // Surface HTTP errors here instead of passing an error page on as file data.
    response.throwForStatus();
    // Raw responses expose the untouched bytes through buffer().
    return response.buffer();
  });
};
//
// Create the array of request options, one entry per file to download
//
const urlArr = [];
const urls = {
  url: 'https://storage.googleapis.com/cloud-samples-data/documentai/loan_form.pdf',
  method: 'GET',
  headers: {
    // Only real HTTP headers belong here. `raw` is a z.request option, so
    // listing it under headers just sent a meaningless "raw" header.
    'Accept': 'application/pdf'
  }
};
urlArr.push(urls);
//
//Call the function for each item in the urlArr
//
return Promise.all(urlArr.map(requestAsync))
.then(function(values){
//
// Convert the file data to a Buffer and base64 encode it.
//
var fileContent = Buffer.from(values[0]).toString('base64');
const options = {
url: 'https://us-documentai.googleapis.com/v1beta3/projects/38072577434/locations/us/processors/cd8a06d0cd3cb045:process',
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Accept': 'application/json',
'Authorization': `Bearer ${bundle.authData.access_token}`
},
body: {
document: {
mimeType: 'application/pdf',
content: fileContent
}
}
};
return z.request(options)
.then((response) => {
response.throwForStatus();
const result = response.json;
// Get all of the document text as one big string
const {text} = result;
// Extract shards from the text field
const getText = textAnchor => {
// First shard in document doesn't have startIndex property
const startIndex = textAnchor.textSegments[0].startIndex || 0;
const endIndex = textAnchor.textSegments[0].endIndex;
return text.substring(startIndex, endIndex);
};
/* // Process the output
const [page1] = result.pages;
const {formFields} = page1;
var fieldList = "";
for (const field of formFields) {
var fieldName = getText(field.fieldName.textAnchor);
var fieldValue = getText(field.fieldValue.textAnchor);
fieldName = fieldName.replace(/\n/g,'');
fieldValue = fieldValue.replace(/\n/g,'');
fieldList += `"${fieldName}": "${fieldValue}"`;
z.console.log(`\t(${fieldName}, ${fieldValue})`);
}
*/
//z.console.log(fieldList)
return {getText};
});
});