Problems to extract text from PDF for certain pdfs only C#

Question

I need to extract some data from a PDF file. I'm using iTextSharp to do that.

I'm using this code, which I found on the net:

using System;
using System.IO;
using iTextSharp.text.pdf;

namespace PdfToText
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
    /// BT = Beginning of a text object operator 
    /// ET = End of a text object operator
    /// Td move to the start of next line
    ///  5 Ts = superscript
    /// -5 Ts = subscript

    #region Fields

    #region _numberOfCharsToKeep
    /// <summary>
    /// The number of characters to keep, when extracting text.
    /// </summary>
    private static int _numberOfCharsToKeep = 15;
    #endregion

    #endregion

    #region ExtractText
    /// <summary>
    /// Extracts the text of every page of a PDF file and appends it to a UTF-8 text file,
    /// drawing a 68-character progress bar on the console while it works.
    /// </summary>
    /// <param name="inFileName">the full path to the pdf file.</param>
    /// <param name="outFileName">
    /// the output file name. When null or empty, "test.txt" in the application's
    /// base directory is used instead.
    /// </param>
    /// <returns>true if every page was processed; false if any exception occurred.</returns>
    public bool ExtractText(string inFileName, string outFileName)
    {
        // Width of the console progress bar, in '#' characters.
        const int totalLen = 68;

        PdfReader reader = null;
        StreamWriter outFile = null;
        try
        {
            // Honor the caller's output file; fall back to the previous hard-coded
            // location only when no name was supplied.
            if (String.IsNullOrEmpty(outFileName))
            {
                outFileName = Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, "test.txt");
            }

            // Create a reader for the given PDF file
            reader = new PdfReader(inFileName);

            // Second argument 'true' = append: existing content of the output file is kept.
            outFile = new StreamWriter(outFileName, true, System.Text.Encoding.UTF8);

            Console.Write("Processing: ");

            int pageCount = reader.NumberOfPages;
            int totalWritten = 0;

            for (int page = 1; page <= pageCount; page++)
            {
                outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                // Advance the bar to this page's share of the total width. Working from
                // the absolute target avoids losing fractional progress between pages.
                int target = (int)((long)page * totalLen / pageCount);
                if (target > totalWritten)
                {
                    Console.Write(new string('#', target - totalWritten));
                    totalWritten = target;
                }
            }

            // Complete the bar; this only has work to do when the page loop never ran.
            if (totalWritten < totalLen)
            {
                Console.Write(new string('#', totalLen - totalWritten));
            }
            return true;
        }
        catch (Exception ex)
        {
            // The bool return value is the method's contract, so do not rethrow,
            // but leave a trace of what went wrong instead of failing silently.
            Console.Error.WriteLine("ExtractText failed for '" + inFileName + "': " + ex.Message);
            return false;
        }
        finally
        {
            try
            {
                if (outFile != null)
                {
                    outFile.Close();
                }
            }
            finally
            {
                // Release the PDF file even if flushing the output failed.
                if (reader != null)
                {
                    reader.Close();
                }
            }
        }
    }
    #endregion

    #region ExtractTextFromPDFBytes
    /// <summary>
    /// Scans the bytes of a page content stream and collects the characters of
    /// literal strings, i.e. text enclosed in parentheses, that appear between
    /// the BT and ET operators. Text positioning operators are turned into
    /// line breaks or spaces. Strings not written in parentheses are not captured.
    /// </summary>
    /// <param name="input">the uncompressed content stream bytes; may be null or empty</param>
    /// <returns>
    /// the extracted text; an empty string for null/empty input. If scanning fails
    /// part-way, the text collected up to that point is returned.
    /// </returns>
    private string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0) return "";

        // Operator groups, created once instead of once per input byte.
        string[] moveOperators = { "TD", "Td" };
        string[] newLineOperators = { "'", "T*", "\"" };
        string[] showOperators = { "Tj" };
        string[] endTextOperators = { "ET" };
        string[] beginTextOperators = { "BT" };

        // StringBuilder keeps appending linear; repeated string += is quadratic.
        System.Text.StringBuilder result = new System.Text.StringBuilder();

        try
        {
            // Flag showing if we are currently inside a text object
            bool inTextObject = false;

            // Flag showing if the next character is literal
            // e.g. '\\' to get a '\' character or '\(' to get '('
            bool nextLiteral = false;

            // () Bracket nesting level. Text appears inside ()
            int bracketDepth = 0;

            // Sliding window of the most recent characters, used to recognize operators.
            char[] previousCharacters = new char[_numberOfCharsToKeep];
            for (int j = 0; j < _numberOfCharsToKeep; j++)
            {
                previousCharacters[j] = ' ';
            }

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];

                if (inTextObject)
                {
                    // Outside of a string: translate positioning operators into whitespace.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(moveOperators, previousCharacters))
                        {
                            // CR+LF (the previous "\n\r" had the two characters reversed).
                            result.Append("\r\n");
                        }
                        else if (CheckToken(newLineOperators, previousCharacters))
                        {
                            result.Append("\n");
                        }
                        else if (CheckToken(showOperators, previousCharacters))
                        {
                            result.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(endTextOperators, previousCharacters))
                    {
                        // End of a text object
                        inTextObject = false;
                        result.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Start outputting text
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Stop outputting text
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Unescaped backslash: take the following character as-is.
                            nextLiteral = true;
                        }
                        else
                        {
                            // Keep printable ASCII and the 128..254 range only.
                            if (((c >= ' ') && (c <= '~')) ||
                                ((c >= 128) && (c < 255)))
                            {
                                result.Append(c);
                            }

                            nextLiteral = false;
                        }
                    }
                }

                // Shift the window left by one and store the current character at the end.
                Array.Copy(previousCharacters, 1, previousCharacters, 0, _numberOfCharsToKeep - 1);
                previousCharacters[_numberOfCharsToKeep - 1] = c;

                // Start of a text object
                if (!inTextObject && CheckToken(beginTextOperators, previousCharacters))
                {
                    inTextObject = true;
                }
            }
        }
        catch (Exception)
        {
            // Best effort: keep whatever was extracted before the failure rather
            // than discarding the whole page.
        }

        return result.ToString();
    }
    #endregion

    #region CheckToken
    /// <summary>
    /// Checks whether one of the given operator tokens (e.g. BT, Tj, ') has just been
    /// read: the token must sit immediately before the newest character in
    /// <paramref name="recent"/> and be delimited by a space, CR or LF on both sides.
    /// Tokens of any length are supported; previously a one-character token could
    /// raise an IndexOutOfRangeException because its second character was indexed.
    /// </summary>
    /// <param name="tokens">the tokens to look for</param>
    /// <param name="recent">the recent character array; its last element is the newest character</param>
    /// <returns>true if any of the tokens matches; otherwise false</returns>
    private bool CheckToken(string[] tokens, char[] recent)
    {
        int last = _numberOfCharsToKeep - 1;

        // The newest character must be the delimiter that terminates the token.
        if (!IsTokenDelimiter(recent[last]))
        {
            return false;
        }

        foreach (string token in tokens)
        {
            // Index of the token's first character inside the window.
            int start = last - token.Length;

            // Skip tokens that are empty or too long to fit together with
            // their leading delimiter.
            if (token.Length == 0 || start < 1)
            {
                continue;
            }

            // The character in front of the token must be a delimiter as well.
            if (!IsTokenDelimiter(recent[start - 1]))
            {
                continue;
            }

            bool matches = true;
            for (int k = 0; k < token.Length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }

            if (matches)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Tells whether a character separates tokens: space, carriage return or line feed.
    /// </summary>
    private static bool IsTokenDelimiter(char c)
    {
        return c == ' ' || c == '\r' || c == '\n';
    }
    #endregion
}

}

I'm using this way:

 PDFParser pdfParser = new PDFParser();
 // ExtractText reports failure through its return value instead of throwing,
 // so the result has to be checked or errors go unnoticed.
 if (!pdfParser.ExtractText(pdfFile, Path.GetFileNameWithoutExtension(pdfFile) + ".txt"))
 {
     System.Console.Error.WriteLine("Text extraction failed for " + pdfFile);
 }

So the PDF content is written to a txt file. It works well for certain PDFs, but for the PDF file that I really need to use, the txt file always remains empty. I don't get any errors, but for some reason nothing is written, although, as you can see in this screenshot, it recognizes the PDF and that it has 2 pages...

This is the PDF that I need, but the txt file always remains empty. (The black lines were added by me, so they are not present in the file I'm trying to extract text from.)

And this is another PDF. For this one the program works fine, and the text is written to a txt file. It is much bigger than the other PDF, and still I can extract the text from this one but not from the other.

Do you have any idea what can be the problem?

Patrick Artner · Accepted Answer · 2018-02-15T11:43:01.853

Too long for comment and maybe an answer that you do not like to get:

In PDFs, the "text you see" (how a font looks) and "what the glyphs mean" (which glyph is mapped to which Unicode character) are separate things.

They are stored in different parts of the PDF - it is entirely possible that a PDF looks totally fine, but if you try to extract text you get nothing from it, because it only contains the shapes of your text glyphs but not their "meaning".

Try to open the PDF and select + copy the text you are after; if you paste it into an editor and nothing is there, your PDF lacks the information "which character is displayed by this glyph".

OR:

It also might be that your PDF only contains the image of a text - a photo, so to speak. You can read it, but iTextSharp sees only a "picture" - no text.

Those are possible 'why's that would answer your question. As to how to fix it:

There are several questions about corrupt PDFs on SO:

How to repair a PDF file and embed missing fonts

Embedded fonts in PDF: copy and paste problems (this answer)

Copy and paste relates to text parsing, so they might help you figure out how to fix it.

Your edit shows details about your parsing, why don't you leverage iTextSharp for that?

using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

/// <summary>
/// Reads a PDF file and returns the text of all its pages, concatenated in
/// page order, using iTextSharp's own text extractor.
/// </summary>
/// <param name="path">path of the PDF file to read</param>
/// <returns>the extracted text of the whole document</returns>
public static string ExtractTextFromPdf(string path)
{
    using (PdfReader reader = new PdfReader(path))
    {
        StringBuilder text = new StringBuilder();

        // Page numbers are 1-based.
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            text.Append(PdfTextExtractor.GetTextFromPage(reader, i));
        }

        return text.ToString();
    }
}

from: http://www.squarepdf.net/parsing-pdf-files-using-itextsharp

or like here: parse-pdf-with-itextsharp-and-then-extract-specific-text-to-the-screen ?

I've tried what you have proposed to copy the text from PDF to a text file, and I could copy. So this is not the problem. I will read the links what you just pasted in the answer, — Orsi, Feb 15 '18 at 10:29
@Orsi both are related to "not being able to copy" so in your case they do not seem to help. — Patrick Artner, Feb 15 '18 at 11:35
@Orsi Why aren't you using the tools that iTextSharp comes with for text extraction? See edit. — Patrick Artner, Feb 15 '18 at 11:39

Problems to extract text from PDF for certain pdfs only C#

1 Answer