Extract a table from PDF

Question

I need help with iText in C#. I'm trying to extract a table from a PDF file and save it to a new CSV file, keeping the values in the correct places. For this, I thought the solution was to create a two-dimensional array to organize the data.

While extracting all the information from the PDF with iText, I saw it was possible to get some numerical data that seemed to be the position of a piece of text on the page, and I organized my array based on these indexes. It didn't work: the text ended up scattered across many different cells. Now I want to know what these values mean, because they don't follow a "correct" order, and whether it is possible to organize the resulting table with them.

I'm using ";" as the cell delimiter.

For testing, I'm using this PDF http://www.americana.sp.gov.br/americanaV5/download/contasPublicas/Despesa_Categoria_Economica_2014.pdf

Here's my code:

/// <summary>
/// Downloads the PDF whose URL is in <c>TextBox1</c>, collects the positioned text found in
/// each page's raw content stream, arranges it in a grid keyed by the two numeric operands
/// of every text line, and sends the grid to the browser as a ';'-delimited CSV attachment.
/// </summary>
/// <remarks>
/// Only content-stream lines that start with "BT", carry two numeric operands and contain a
/// parenthesised string are used; every other line is skipped. Row 0 and column 0 of the
/// output hold the coordinate values themselves so the resulting layout can be inspected.
/// Any exception is reported in <c>TextBox2</c> together with the last line index parsed.
/// </remarks>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button2_Click(object sender, EventArgs e)
{
    try
    {
        TextBox2.Text = "";
        byte[] conteudo = download(TextBox1.Text);

        if (conteudo == null)
        {
            return;
        }

        // PDF numbers always use '.' as the decimal separator, so they must be parsed with
        // the invariant culture; the server culture (e.g. pt-BR) would read "12.5" as 125.
        System.Globalization.CultureInfo invariante = System.Globalization.CultureInfo.InvariantCulture;
        System.Globalization.NumberStyles estilo = System.Globalization.NumberStyles.Float;

        List<Celula> celulas = new List<Celula>();

        PdfReader leitorp = new PdfReader(conteudo);
        try
        {
            for (int pagina = 1; pagina <= leitorp.NumberOfPages; pagina++)
            {
                // Raw content stream of the page, decoded with the system ANSI code page
                // and split into lines.
                string[] linhas = Encoding.Default.GetString(leitorp.GetPageContent(pagina)).Split('\n');

                // Start at 0: the first line of the stream is a candidate like any other.
                for (int j = 0; j < linhas.Length; j++)
                {
                    string linha = linhas[j];

                    if (linha.Length <= 2 || !linha.StartsWith("BT", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    // Recorded before parsing so the error message names the offending line.
                    linhadodebug = j;

                    string[] campos = linha.Split(' ');
                    int abre = linha.IndexOf('(');
                    int fecha = linha.LastIndexOf(')');
                    float coluna;
                    float linhaTexto;

                    // Skip "BT" lines that do not look like "BT <x> <y> ... (text) ..."
                    // instead of aborting the whole export on the first odd line.
                    if (campos.Length < 3
                        || abre < 0
                        || fecha < abre
                        || !float.TryParse(campos[1], estilo, invariante, out coluna)
                        || !float.TryParse(campos[2], estilo, invariante, out linhaTexto))
                    {
                        continue;
                    }

                    // Text between the first '(' and the last ')', with runs of two or more
                    // whitespace characters removed.
                    string bruto = linha.Substring(abre + 1, fecha - abre - 1);
                    string textoCelula = System.Text.RegularExpressions.Regex.Replace(bruto, @"\s\s+", "");

                    celulas.Add(new Celula(coluna, linhaTexto, textoCelula));
                }
            }
        }
        finally
        {
            // Release the reader even when a page fails to parse.
            leitorp.Close();
        }

        // Distinct coordinates, ordered numerically (a string sort would put "100" before "20").
        // Assuming the operands are x/y in default PDF user space (x grows to the right,
        // y grows upward), rows go from the highest y down and columns from the lowest x up.
        List<float> ordenadas = celulas.Select(c => c.linha).Distinct().OrderByDescending(v => v).ToList();
        List<float> abscissas = celulas.Select(c => c.coluna).Distinct().OrderBy(v => v).ToList();

        // One extra row and column for the coordinate headers.
        string[,] matriz = new string[ordenadas.Count + 1, abscissas.Count + 1];
        Dictionary<float, int> indiceLinha = new Dictionary<float, int>();
        Dictionary<float, int> indiceColuna = new Dictionary<float, int>();

        for (int r = 0; r < ordenadas.Count; r++)
        {
            matriz[r + 1, 0] = ordenadas[r].ToString();
            indiceLinha[ordenadas[r]] = r + 1;
        }

        for (int c = 0; c < abscissas.Count; c++)
        {
            matriz[0, c + 1] = abscissas[c].ToString();
            indiceColuna[abscissas[c]] = c + 1;
        }

        // Direct lookup per cell; when two cells share a position the later one wins.
        foreach (Celula celula in celulas)
        {
            matriz[indiceLinha[celula.linha], indiceColuna[celula.coluna]] = celula.conteudo;
        }

        StringWriter texto = new StringWriter();

        for (int r = 0; r < matriz.GetLength(0); r++)
        {
            for (int c = 0; c < matriz.GetLength(1); c++)
            {
                texto.Write(matriz[r, c] + ";");
            }
            texto.WriteLine();
        }

        Response.Clear();
        Response.ContentType = "text/plain";
        Response.AddHeader("content-disposition", string.Format("attachment;filename=teste-{0:ddMMyyyy}.csv", DateTime.Today));

        using (StreamWriter writer = new StreamWriter(Response.OutputStream, Encoding.UTF8))
        {
            writer.Write(texto.ToString());
        }
        Response.End();
    }
    catch (Exception E)
    {
        TextBox2.Text = "Erro Button2_Click: " + E.Message + " # " + linhadodebug.ToString();
    }
}

And here, the struct of celula (cell) and method to download the file:

/// <summary>
/// One extracted piece of text together with the two numbers used to place it in the grid.
/// </summary>
public struct Celula
{
    /// <summary>Value that selects the grid column (presumably the x position).</summary>
    public float coluna;

    /// <summary>Value that selects the grid row (presumably the y position).</summary>
    public float linha;

    /// <summary>Text of the cell.</summary>
    public string conteudo;

    /// <summary>Creates a cell from its column value, row value and text.</summary>
    public Celula(float coluna, float linha, string conteudo)
    {
        this.conteudo = conteudo;
        this.linha = linha;
        this.coluna = coluna;
    }

    /// <summary>Creates a field-by-field copy of <paramref name="celula"/>.</summary>
    public Celula(Celula celula)
        : this(celula.coluna, celula.linha, celula.conteudo)
    {
    }
}

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its bytes.
/// </summary>
/// <param name="url">Address of the file to fetch.</param>
/// <returns>
/// The downloaded bytes, or <c>null</c> when the request fails or yields no data; in both
/// cases a message is written to <c>TextBox2</c>.
/// </returns>
protected byte[] download(string url)
{
    try
    {
        WebRequest endereco = WebRequest.Create(url);

        // The response, its stream and the buffer stream are disposed even if reading fails.
        using (WebResponse resposta = endereco.GetResponse())
        using (Stream leitor = resposta.GetResponseStream())
        using (MemoryStream memoria = new MemoryStream())
        {
            // One buffer reused for every read instead of a new array per iteration.
            byte[] buffer = new byte[1024];
            int count;

            while ((count = leitor.Read(buffer, 0, buffer.Length)) > 0)
            {
                memoria.Write(buffer, 0, count);
            }

            // ToArray() never returns null, so an empty result is what signals
            // "nothing was downloaded".
            if (memoria.Length == 0)
            {
                TextBox2.Text = "Error: download null.";
                return null;
            }

            return memoria.ToArray();
        }
    }
    catch (Exception E)
    {
        TextBox2.Text = "Error download: " + E.Message;
        return null;
    }
}

This is a non-profit project. I hope you can help me. Thank you!

Unless your PDF is a Tagged PDF, you are making the wrong assumption that what your eyes perceive as a table is actually stored as a table inside the PDF. Your question is unanswerable. This is a possible duplicate of [How to Detect table start in itextSharp?](http://stackoverflow.com/questions/15767952/how-to-detect-table-start-in-itextsharp) — Bruno Lowagie, Jul 15 '14 at 05:45
When I look at your PDF and I open File > Document Properties, I clearly see: **Tagged PDF: No**. In other words: your allegation that there's a table structure *inside* the PDF is wrong. Think of the painting captioned "Ceci n'est pas une pipe" by René Magritte. You see a pipe, but there's no real pipe. The same goes for your PDF: you see a table, but there's no real table. — Bruno Lowagie, Jul 15 '14 at 06:46
*I saw it was possible to get some numerical data that seemed to be the position of a piece of text on the page* - If (instead of merely guessing) you had looked into the specification [ISO 32000-1](http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf), you would have known better. — mkl, Jul 15 '14 at 13:35

Extract a table from PDF

0 Answers