I am using Amazon's Textract service to extract tables and forms from PDF documents. The example provided on GitHub here works for single-page documents only. But according to the demo provided by AWS, they are able to extract multi-page PDF documents as well.
According to the documentation, we have to call the same service for multi-page documents as well. But it is not working for me.
All the examples provided by them are either in python or java.
I am doing it in dotnet core.
Any help?
Here is my code.
/// <summary>
/// Runs a "FORMS" analysis and then a "TABLES" analysis on <paramref name="filename"/> and returns
/// the extracted lines, words, form fields and tables as strings on the JSON response object.
/// </summary>
/// <param name="filename">Name of the document handed to <c>PrepareDocument</c> for analysis.</param>
/// <returns>
/// A JSON result carrying <c>objJsonResponse</c>: <c>responsecode</c> is 1 on success, or -1 with
/// <c>error</c> set to "failed" when the file name is blank or any step throws.
/// </returns>
public IActionResult FileExtract(string filename)
{
    // A blank file name cannot be analysed; report the same failure the catch block would,
    // without starting an analysis job first.
    if (string.IsNullOrWhiteSpace(filename))
    {
        objJsonResponse.responsecode = -1;
        objJsonResponse.error = "failed";
        return Json(objJsonResponse);
    }

    try
    {
        // StringBuilder avoids re-copying the whole accumulated string on every append.
        var lineText = new System.Text.StringBuilder();
        var wordText = new System.Text.StringBuilder();
        var fieldsText = new System.Text.StringBuilder();
        var fieldsText2 = new System.Text.StringBuilder();
        var tableText = new System.Text.StringBuilder();

        var textractAnalysisClient = BuildTextractClient();

        // First pass: lines, words and form fields.
        var document = PrepareDocument(textractAnalysisClient, "FORMS", filename);
        foreach (var page in document.Pages)
        {
            foreach (var line in page.Lines)
            {
                lineText.Append("<button class='rawlabel'>").Append(Html(line.Text)).Append("</button>");
                foreach (var word in line.Words)
                {
                    // Plain text, no markup: appended as-is.
                    wordText.Append(word.Text);
                }
            }

            foreach (var f in page.Form.Fields)
            {
                fieldsText
                    .Append("<div><h5>").Append(Html(f.Key))
                    .Append("</h5><p style='background-color:lightgray;width: 200px;padding: 6px;'>")
                    .Append(Html(f.Value))
                    .Append("</p></div>");
            }

            var key = "Phone Number:";
            var field = page.Form.GetFieldByKey(key);
            if (field != null)
            {
                // NOTE(review): this string contains no markup, so it is left unencoded;
                // encode it on the client if it is ever inserted into HTML.
                fieldsText2.Append("Key: ").Append(field.Key).Append(" | Value: ").Append(field.Value);
            }
        }

        // Second pass: every table of every page is flattened into one HTML table.
        tableText.Append("<table id='customers'>");
        document = PrepareDocument(textractAnalysisClient, "TABLES", filename);
        foreach (var page in document.Pages)
        {
            foreach (var table in page.Tables)
            {
                foreach (var row in table.Rows)
                {
                    tableText.Append("<tr>");
                    foreach (var cell in row.Cells)
                    {
                        tableText.Append("<td>").Append(Html(cell.Text)).Append("</td>");
                    }
                    tableText.Append("</tr>");
                }
            }
        }
        tableText.Append("</table>");

        objJsonResponse.fieldsText = fieldsText.ToString();
        objJsonResponse.fieldsText2 = fieldsText2.ToString();
        objJsonResponse.lineText = lineText.ToString();
        objJsonResponse.tableText = tableText.ToString();
        objJsonResponse.wordText = wordText.ToString();
        objJsonResponse.responsecode = 1;
        return Json(objJsonResponse);
    }
    catch (Exception ex)
    {
        // Keep the generic message for the caller, but do not lose the real cause.
        System.Diagnostics.Trace.TraceError(ex.ToString());
        objJsonResponse.responsecode = -1;
        objJsonResponse.error = "failed";
        return Json(objJsonResponse);
    }

    // HTML-encodes a value that is about to be placed inside markup. The text comes from the
    // analysed document, so it must not be trusted to be free of '<', '>' or '&'.
    string Html(object value)
    {
        return System.Net.WebUtility.HtmlEncode(value?.ToString() ?? string.Empty);
    }
}
/// <summary>
/// Builds a <see cref="TextractTextAnalysisService"/> around an <see cref="IAmazonTextract"/> client
/// created from the AWS options found in <c>appsettings.json</c> (under the current directory)
/// and the environment variables.
/// </summary>
/// <returns>A new analysis service wrapping a newly created Textract client.</returns>
static TextractTextAnalysisService BuildTextractClient()
{
    // reloadOnChange is off on purpose: this configuration is read once and then discarded,
    // so a change watcher on appsettings.json would never be observed and would only
    // accumulate, one per call.
    var configuration = new ConfigurationBuilder()
        .SetBasePath(Environment.CurrentDirectory)
        .AddJsonFile("appsettings.json", optional: false, reloadOnChange: false)
        .AddEnvironmentVariables()
        .Build();

    var awsOptions = configuration.GetAWSOptions();
    return new TextractTextAnalysisService(awsOptions.CreateServiceClient<IAmazonTextract>());
}
/// <summary>
/// Starts a document analysis of the requested feature type for a file in <c>BucketName</c>,
/// calls <c>WaitForJobCompletion</c> for the resulting job, and wraps the job results in a
/// <see cref="TextractDocument"/>.
/// </summary>
/// <param name="textractAnalysisClient">Service used to start the job, wait for it and fetch its results.</param>
/// <param name="type">Feature type passed to <c>StartDocumentAnalysis</c>; callers in this file pass "FORMS" or "TABLES".</param>
/// <param name="FormFile">Name of the file to analyse.</param>
/// <returns>A <see cref="TextractDocument"/> constructed from the job results.</returns>
static TextractDocument PrepareDocument(TextractTextAnalysisService textractAnalysisClient, string type, string FormFile)
{
    // Reading .Result waits synchronously for the start call to hand back the job id.
    var analysisJobId = textractAnalysisClient.StartDocumentAnalysis(BucketName, FormFile, type).Result;

    textractAnalysisClient.WaitForJobCompletion(analysisJobId);

    return new TextractDocument(textractAnalysisClient.GetJobResults(analysisJobId));
}