I need a help with iText in C#. I'm trying to extract a table from a PDF file and save this into a new CSV file, keeping the values in the correct places. For this, I thought the solution was to create a two-dimensional array to organize the data.
Extracting all information from PDF with iText, I saw it was possible to get some numerical data that seemed to be the position of a piece of text on the page and I organized my array based these indexes. It didn’t work, the text was completely dispersed in various different cells. Now, I want to know what this values means, because they don't follow a "correct" order and I want to know if is possible to organize the future table with this.
I'm using ";" as delimiter cell.
For testing, I'm using this PDF http://www.americana.sp.gov.br/americanaV5/download/contasPublicas/Despesa_Categoria_Economica_2014.pdf
Here's my code:
protected void Button2_Click(object sender, EventArgs e)
{
try
{
TextBox2.Text = "";
byte[] conteudo = download(TextBox1.Text);
if (conteudo != null)
{
PdfReader leitorp = new PdfReader(conteudo);
ITextExtractionStrategy estrategia = new SimpleTextExtractionStrategy();
List<Celula> celulas = new List<Celula>();
int i, j;
for (i = 1; i <= leitorp.NumberOfPages; i++)
{
//Total and crude extraction of all information from text in PDF via iText, separate lines in an array of strings.
string[] linhas = (Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, leitorp.GetPageContent(i)))).Split('\n');
for (j = 1; j < linhas.Length; j++)
{
if (linhas[j].Length > 2)
{
if (linhas[j].Substring(0, 2).Equals("BT"))
{
string[] campos = linhas[j].Split(' ');
Celula umacelula = new Celula();
umacelula.coluna = float.Parse(campos[1]);
umacelula.linha = float.Parse(campos[2]);
linhadodebug = j;
int t1 = linhas[j].IndexOf('(');
int t2 = linhas[j].LastIndexOf(')');
umacelula.conteudo = System.Text.RegularExpressions.Regex.Replace((linhas[j].Substring(linhas[j].IndexOf('(') + 1, (linhas[j].LastIndexOf(')') - 1 - linhas[j].IndexOf('(')))), @"\s\s+", "");
celulas.Add(umacelula);
}
}
}
}
leitorp.Close();
string[] totallinhas = new string[celulas.Count];
string[] totalcolunas = new string[celulas.Count];
for (i = 0; i < celulas.Count; i++)
{
totallinhas[i] = celulas[i].linha.ToString();
totalcolunas[i] = celulas[i].coluna.ToString();
}
totallinhas = totallinhas.Distinct().ToArray();
totalcolunas = totalcolunas.Distinct().ToArray();
Array.Sort(totallinhas);
Array.Reverse(totallinhas);
Array.Sort(totalcolunas);
Array.Reverse(totalcolunas);
string[,] matriz = new string[totallinhas.Length + 1, totalcolunas.Length + 1];
for (i = 1; i < totallinhas.Length; i++)
{
matriz[i, 0] = totallinhas[i - 1].ToString();
}
for (i = 1; i < totalcolunas.Length; i++)
{
matriz[0, i] = totalcolunas[i - 1].ToString();
}
int z;
for (i = 0; i < celulas.Count(); i++)
{
for (j = 1; j < matriz.GetLength(0); j++)
{
for (z = 1; z < matriz.GetLength(1); z++)
{
if ((celulas[i].linha.ToString().Equals(matriz[j, 0])) && (celulas[i].coluna.ToString().Equals(matriz[0, z])))
{
matriz[j, z] = celulas[i].conteudo.ToString();
}
}
}
}
StringWriter texto = new StringWriter();
for (i = 0; i < matriz.GetLength(0); i++)
{
for (j = 0; j < matriz.GetLength(1); j++)
{
texto.Write(matriz[i, j] + ";");
}
texto.WriteLine();
}
Response.ContentType = "text/plain";
Response.AddHeader("content-disposition", "attachment;filename=" + string.Format("teste-{0}.csv", string.Format("{0:ddMMyyyy}", DateTime.Today)));
Response.Clear();
using (StreamWriter writer = new StreamWriter(Response.OutputStream, Encoding.UTF8))
{
writer.Write(texto.ToString());
}
Response.End();
}
}
catch (Exception E)
{
TextBox2.Text = "Erro Button2_Click: " + E.Message + " # " + linhadodebug.ToString();
}
}
And here, the struct of celula (cell) and method to download the file:
public struct Celula
{
public float coluna;
public float linha;
public string conteudo;
public Celula(float coluna, float linha, string conteudo)
{
this.coluna = coluna;
this.linha = linha;
this.conteudo = conteudo;
}
public Celula(Celula celula)
{
this.coluna = celula.coluna;
this.linha = celula.linha;
this.conteudo = celula.conteudo;
}
}
protected byte[] download(string url)
{
try
{
WebRequest endereco = HttpWebRequest.Create(url);
Stream leitor = endereco.GetResponse().GetResponseStream();
MemoryStream memoria = new MemoryStream();
byte[] conteudo = null;
int count = 0;
do
{
byte[] buffer = new byte[1024];
count = leitor.Read(buffer, 0, 1024);
memoria.Write(buffer, 0, count);
}
while (leitor.CanRead && count > 0);
// Converte da memória direto para bytes
conteudo = memoria.ToArray();
if (conteudo != null)
{
return conteudo;
}
else
{
TextBox2.Text = "Error: download null.";
return null;
}
}
catch (Exception E)
{
TextBox2.Text = "Error download: " + E.Message;
return null;
}
}
This is a non-profit project. I hope you can help me. Thank you!