Extract a multi-page PDF document

Question

I am using Amazon's Textract service to extract tables and forms from PDF documents. The example provided on GitHub here works for single-page documents only, but according to the demo provided by AWS, Textract can extract multi-page PDF documents as well.

According to the documentation, we have to call the same service for multi-page documents as well, but it is not working for me.

All the examples they provide are in either Python or Java.

I am doing this in .NET Core.

Any help?

Here is my code.

/// <summary>
/// Runs a FORMS analysis and a TABLES analysis on <paramref name="filename"/> via
/// <c>PrepareDocument</c> and returns the extracted lines, words, form fields and
/// tables as a JSON payload built on <c>objJsonResponse</c>.
/// </summary>
/// <param name="filename">Document name forwarded to <c>PrepareDocument</c>.</param>
/// <returns>
/// The serialized <c>objJsonResponse</c>. <c>responsecode</c> is 1 on success; on any
/// exception it is -1 and <c>error</c> is "failed".
/// </returns>
public IActionResult FileExtract(string filename)
    {
        // The extracted text originates from the analysed document, so it is untrusted:
        // HTML-encode it before embedding it in markup to prevent tag/script injection.
        string Html(object value) => System.Net.WebUtility.HtmlEncode(value?.ToString());

        const string phoneNumberKey = "Phone Number:";

        try
        {
            // StringBuilder avoids re-copying the whole string on every append.
            var lineHtml = new System.Text.StringBuilder();
            var wordText = new System.Text.StringBuilder();
            var fieldsHtml = new System.Text.StringBuilder();
            var phoneFieldText = new System.Text.StringBuilder();
            var tableHtml = new System.Text.StringBuilder();

            var textractAnalysisClient = BuildTextractClient();

            // First pass: lines, words and form fields of every page.
            var document = PrepareDocument(textractAnalysisClient, "FORMS", filename);
            foreach (var page in document.Pages)
            {
                foreach (var line in page.Lines)
                {
                    lineHtml.Append("<button class='rawlabel'>").Append(Html(line.Text)).Append("</button>");
                    foreach (var word in line.Words)
                    {
                        // Plain text, no markup: left unencoded exactly as before.
                        wordText.Append(word.Text);
                    }
                }

                foreach (var f in page.Form.Fields)
                {
                    fieldsHtml.Append("<div><h5>").Append(Html(f.Key))
                        .Append("</h5><p style='background-color:lightgray;width: 200px;padding: 6px;'>")
                        .Append(Html(f.Value)).Append("</p></div>");
                }

                var field = page.Form.GetFieldByKey(phoneNumberKey);
                if (field != null)
                {
                    // Plain text, no markup: left unencoded exactly as before.
                    phoneFieldText.Append("Key: ").Append(field.Key).Append("  | Value: ").Append(field.Value);
                }
            }

            // Second pass: every table of every page is flattened into a single HTML table.
            tableHtml.Append("<table id='customers'>");
            document = PrepareDocument(textractAnalysisClient, "TABLES", filename);
            foreach (var page in document.Pages)
            {
                foreach (var table in page.Tables)
                {
                    foreach (var row in table.Rows)
                    {
                        tableHtml.Append("<tr>");
                        foreach (var cell in row.Cells)
                        {
                            tableHtml.Append("<td>").Append(Html(cell.Text)).Append("</td>");
                        }
                        tableHtml.Append("</tr>");
                    }
                }
            }
            tableHtml.Append("</table>");

            this.objJsonResponse.fieldsText = fieldsHtml.ToString();
            this.objJsonResponse.fieldsText2 = phoneFieldText.ToString();
            this.objJsonResponse.lineText = lineHtml.ToString();
            this.objJsonResponse.tableText = tableHtml.ToString();
            this.objJsonResponse.wordText = wordText.ToString();
            this.objJsonResponse.responsecode = 1;
            return Json(this.objJsonResponse);
        }
        catch (Exception ex)
        {
            // Keep the generic client-facing error, but record the real cause so that
            // failures (e.g. a Textract job that never succeeds) can be diagnosed.
            System.Diagnostics.Trace.TraceError("FileExtract failed for '{0}': {1}", filename, ex);
            this.objJsonResponse.responsecode = -1;
            this.objJsonResponse.error = "failed";
            return Json(this.objJsonResponse);
        }
    }

    /// <summary>
    /// Builds a <c>TextractTextAnalysisService</c> around an <c>IAmazonTextract</c> client
    /// whose AWS options are read from appsettings.json (required) and then from
    /// environment variables, resolved relative to the current directory.
    /// </summary>
    /// <returns>A new service wrapper; a fresh configuration is built on every call.</returns>
    static TextractTextAnalysisService BuildTextractClient()
    {
        // reloadOnChange is off on purpose: this configuration is rebuilt on every call
        // and never disposed, so enabling reload would register one more file watcher on
        // appsettings.json per call for a configuration that is read exactly once.
        var configuration = new ConfigurationBuilder()
            .SetBasePath(Environment.CurrentDirectory)
            .AddJsonFile("appsettings.json", optional: false, reloadOnChange: false)
            .AddEnvironmentVariables()
            .Build();

        var awsOptions = configuration.GetAWSOptions();
        return new TextractTextAnalysisService(awsOptions.CreateServiceClient<IAmazonTextract>());
    }

    /// <summary>
    /// Starts a document analysis for <paramref name="FormFile"/> in <c>BucketName</c>,
    /// blocks until the job identifier is available, waits for the job to complete and
    /// wraps the job results in a <c>TextractDocument</c>.
    /// </summary>
    /// <param name="textractAnalysisClient">Service wrapper used for every call.</param>
    /// <param name="type">Analysis type forwarded to <c>StartDocumentAnalysis</c> (callers pass "FORMS" or "TABLES").</param>
    /// <param name="FormFile">Document name forwarded to <c>StartDocumentAnalysis</c>.</param>
    /// <returns>A <c>TextractDocument</c> built from the job results.</returns>
    static TextractDocument PrepareDocument(TextractTextAnalysisService textractAnalysisClient, string type, string FormFile)
    {
        // Synchronously block on the start call to obtain the job identifier.
        var analysisJobId = textractAnalysisClient.StartDocumentAnalysis(BucketName, FormFile, type).Result;

        textractAnalysisClient.WaitForJobCompletion(analysisJobId);

        return new TextractDocument(textractAnalysisClient.GetJobResults(analysisJobId));
    }

check details document https://docs.aws.amazon.com/textract/latest/dg/textract-dg.pdf — GRVPrasad, Feb 22 '20 at 10:26
I am following this and using the same function they have listed there, but with no success. — Varinder, Feb 22 '20 at 10:31
I have updated the question with code. Can you also tell me where I have to provide my secret keys for using the Textract service? — Varinder, Feb 22 '20 at 10:44
Hi @Varinder you can have a look at this [link](https://stackoverflow.com/questions/59038306/how-to-use-the-amazon-textract-with-pdf-files) — Rikus, Jun 10 '20 at 07:01

Extract pdf document with multi page

0 Answers