27

I am using iTextSharp to read the text content of a PDF, and that part works. However, I am losing the text formatting, such as the font, color, etc. Is there any way to get that formatting as well?

Below is the code segment I am using to extract the text:

// Open the PDF, pass the raw content stream of page 1 to the extractor and show the result.
PdfReader reader = new PdfReader("F:\\EBooks\\AspectsOfAjax.pdf");
try
{
    textBox1.Text = ExtractTextFromPDFBytes(reader.GetPageContent(1));
}
finally
{
    // Release whatever the reader still holds, even if extraction throws.
    reader.Close();
}

/// <summary>
/// Scans the raw bytes of a PDF page content stream and collects the characters found
/// inside parenthesised string literals of text objects (the regions following a "BT" token
/// up to an "ET" token).
/// </summary>
/// <param name="input">Content-stream bytes, e.g. the result of <c>PdfReader.GetPageContent</c>.</param>
/// <returns>
/// The collected text, with a separator appended whenever one of the operators
/// TD/Td, ', T*, ", Tj or ET is recognised; an empty string when <paramref name="input"/>
/// is null or empty, or when any exception occurs while scanning.
/// </returns>
private string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0)
    {
        return "";
    }

    try
    {
        // A StringBuilder avoids the quadratic cost of repeated string concatenation,
        // since text is appended one character at a time.
        System.Text.StringBuilder resultString = new System.Text.StringBuilder();

        // Flag showing whether we are currently inside a text object.
        bool inTextObject = false;
        // Flag showing that the next character is escaped, e.g. '\\' to get a '\' character or '\(' to get '('.
        bool nextLiteral = false;
        // () bracket nesting level. Text appears inside ().
        int bracketDepth = 0;

        // Operator groups are loop-invariant, so allocate them once instead of per byte.
        string[] moveTextTokens = new string[] { "TD", "Td" };
        string[] nextLineTokens = new string[] { "'", "T*", "\"" };
        string[] showTextTokens = new string[] { "Tj" };
        string[] endTextTokens = new string[] { "ET" };
        string[] beginTextTokens = new string[] { "BT" };

        // Sliding window over the most recent characters, used to recognise operators.
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < _numberOfCharsToKeep; j++)
        {
            previousCharacters[j] = ' ';
        }

        for (int i = 0; i < input.Length; i++)
        {
            // Each byte is widened to a char as-is; no encoding-aware decoding is performed.
            char c = (char)input[i];

            if (inTextObject)
            {
                // Outside a string literal: a just-completed operator decides which separator to emit.
                if (bracketDepth == 0)
                {
                    if (CheckToken(moveTextTokens, previousCharacters))
                    {
                        // NOTE(review): "\n\r" is the reverse of the Windows "\r\n" sequence;
                        // kept unchanged because callers may rely on the exact output.
                        resultString.Append("\n\r");
                    }
                    else if (CheckToken(nextLineTokens, previousCharacters))
                    {
                        resultString.Append("\n");
                    }
                    else if (CheckToken(showTextTokens, previousCharacters))
                    {
                        resultString.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(endTextTokens, previousCharacters))
                {
                    // End of a text object: leave text mode and separate it from what follows with a space.
                    inTextObject = false;
                    resultString.Append(" ");
                }
                else if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                {
                    // Start of a string literal: start outputting text.
                    bracketDepth = 1;
                }
                else if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                {
                    // Unescaped closing bracket: stop outputting text.
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // Backslash escapes the following character; do not output the backslash itself.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Keep printable ASCII and the 128..254 range; everything else is dropped.
                        if (((c >= ' ') && (c <= '~')) || ((c >= 128) && (c < 255)))
                        {
                            resultString.Append(c);
                        }
                        nextLiteral = false;
                    }
                }
            }

            // Shift the window left by one and store the current character at the end.
            for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
            {
                previousCharacters[j] = previousCharacters[j + 1];
            }
            previousCharacters[_numberOfCharsToKeep - 1] = c;

            // Start of a text object (checked after the window update, so it includes the current character).
            if (!inTextObject && CheckToken(beginTextTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return resultString.ToString();
    }
    catch
    {
        // Best-effort contract preserved from the original: any failure yields an empty result.
        return "";
    }
}

/// <summary>
/// Tests whether the tail of the recent-character window ends with one of the given tokens,
/// where the token is immediately preceded by a delimiter and immediately followed by a
/// delimiter (the last character of the window). Delimiters are space, CR (0x0d) and LF (0x0a).
/// </summary>
/// <param name="tokens">Candidate operator names; may have any length (e.g. "Td", "T*", "'").</param>
/// <param name="recent">Window of the most recently read characters, oldest first.</param>
/// <returns>True if any token matches at the end of the window; otherwise false.</returns>
/// <remarks>
/// The previous implementation always read <c>token[1]</c>, so one-character operators such as
/// ' and " raised an IndexOutOfRangeException as soon as their first character lined up, and
/// could never match. Positions are now derived from each token's length. For two-character
/// tokens the checked positions are the same as before.
/// </remarks>
private bool CheckToken(string[] tokens, char[] recent)
{
    int last = recent.Length - 1;

    foreach (string token in tokens)
    {
        if (string.IsNullOrEmpty(token))
        {
            continue;
        }

        // Layout at the end of the window: [delimiter][token chars...][delimiter]
        int start = last - token.Length;
        int before = start - 1;
        if (before < 0)
        {
            // Window is too short to hold this token plus both delimiters.
            continue;
        }

        if (!IsTokenDelimiter(recent[last]) || !IsTokenDelimiter(recent[before]))
        {
            continue;
        }

        bool matches = true;
        for (int k = 0; k < token.Length; k++)
        {
            if (recent[start + k] != token[k])
            {
                matches = false;
                break;
            }
        }

        if (matches)
        {
            return true;
        }
    }

    return false;
}

/// <summary>Returns true for the characters treated as token separators: space, CR and LF.</summary>
private static bool IsTokenDelimiter(char c)
{
    return c == ' ' || c == 0x0d || c == 0x0a;
}
John Saunders
  • 160,644
  • 26
  • 247
  • 397
IrfanRaza
  • 3,030
  • 17
  • 64
  • 89

2 Answers2

43

Let me try pointing you in a different direction. iTextSharp has a really beautiful and simple text extraction system that handles some of the basic tokens. Unfortunately, it doesn't handle color information, but according to @Mark Storer it might not be too hard to implement yourself.

BEGIN EDIT

I started work on implementing color information. See my blog post here for more details. (Sorry for the bad formatting, heading off to dinner now.)

END EDIT

The code below combines several questions and answers here, including this one to get the font height (although it's not exact), as well as another one (that for the life of me I can't seem to find anymore) that shows how to detect faux bold.

The PostscriptFontName returns some additional characters in front of the font name, I think it has to do with when you embed font subsets.

Below is a complete WinForms application that targets iTextSharp 5.1.1.0 and extracts text as HTML.

Screenshot of sample PDF

Screenshot of sample PDF

Sample text extracted as HTML

<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Hello </span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">w</span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:37.87201">o</span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">rl</span>
<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">d </span>
<br />
<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Test </span>

Code

using System;
using System.Collections.Generic;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text.pdf.parser;
using iTextSharp.text.pdf;

namespace WindowsFormsApplication2
{
    /// <summary>
    /// Demo form: on load it extracts page 1 of "Document.pdf" from the user's desktop as HTML,
    /// writes the HTML to the console and closes itself.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        private void Form1_Load(object sender, EventArgs e)
        {
            PdfReader reader = new PdfReader(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Document.pdf"));
            try
            {
                TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
                string F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
                Console.WriteLine(F);
            }
            finally
            {
                // Release the reader even when extraction throws.
                reader.Close();
            }

            this.Close();
        }

        /// <summary>
        /// Extraction strategy that renders text as HTML, wrapping each run that shares a
        /// baseline, font name and font size in a &lt;span&gt; carrying font-family/font-size styles.
        /// </summary>
        public class TextWithFontExtractionStategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
        {
            //HTML buffer
            private StringBuilder result = new StringBuilder();

            //Store last used properties
            private Vector lastBaseLine;
            private string lastFont;
            private float lastFontSize;

            //http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/TextRenderInfo.html
            private enum TextRenderMode
            {
                FillText = 0,
                StrokeText = 1,
                FillThenStrokeText = 2,
                Invisible = 3,
                FillTextAndAddToPathForClipping = 4,
                StrokeTextAndAddToPathForClipping = 5,
                FillThenStrokeTextAndAddToPathForClipping = 6,
                AddTextToPathForClipping = 7
            }

            /// <summary>
            /// Appends one rendered text chunk to the HTML buffer, opening a new span (and closing
            /// the previous one) when the baseline, font name or font size differs from the last chunk.
            /// </summary>
            /// <param name="renderInfo">Chunk information supplied by the iTextSharp parser.</param>
            public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
            {
                string curFont = renderInfo.GetFont().PostscriptFontName;
                //Check if faux bold is used (text that is filled and then stroked)
                if ((renderInfo.GetTextRenderMode() == (int)TextRenderMode.FillThenStrokeText))
                {
                    curFont += "-Bold";
                }

                //This code assumes that if the baseline changes then we're on a newline
                Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
                Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
                //The font size is approximated by the distance from the baseline to the ascent line
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
                Single curFontSize = rect.Height;

                //See if something has changed, either the baseline, the font or the font size
                if ((this.lastBaseLine == null) || (curBaseline[Vector.I2] != lastBaseLine[Vector.I2]) || (curFontSize != lastFontSize) || (curFont != lastFont))
                {
                    //if we've put down at least one span tag close it
                    if ((this.lastBaseLine != null))
                    {
                        this.result.AppendLine("</span>");
                    }
                    //If the baseline has changed then insert a line break
                    if ((this.lastBaseLine != null) && curBaseline[Vector.I2] != lastBaseLine[Vector.I2])
                    {
                        this.result.AppendLine("<br />");
                    }
                    //Create an HTML tag with appropriate styles.
                    //Invariant culture keeps '.' as the decimal separator so the CSS stays valid on any machine;
                    //the font name is encoded because it ends up inside an attribute value.
                    this.result.AppendFormat(
                        System.Globalization.CultureInfo.InvariantCulture,
                        "<span style=\"font-family:{0};font-size:{1}\">",
                        System.Net.WebUtility.HtmlEncode(curFont),
                        curFontSize);
                }

                //Append the current text, encoded so characters such as '<' and '&' cannot break the markup
                this.result.Append(System.Net.WebUtility.HtmlEncode(renderInfo.GetText()));

                //Set currently used properties
                this.lastBaseLine = curBaseline;
                this.lastFontSize = curFontSize;
                this.lastFont = curFont;
            }

            /// <summary>
            /// Returns the HTML produced so far. The buffer itself is left untouched, so calling
            /// this more than once does not accumulate extra closing tags.
            /// </summary>
            public string GetResultantText()
            {
                //If we wrote anything then we'll always have a missing closing tag so close it here
                if (result.Length > 0)
                {
                    return result.ToString() + "</span>";
                }
                return result.ToString();
            }

            //Not needed
            public void BeginTextBlock() { }
            public void EndTextBlock() { }
            public void RenderImage(ImageRenderInfo renderInfo) { }
        }
    }
}
Community
  • 1
  • 1
Chris Haas
  • 53,986
  • 12
  • 141
  • 274
  • Thanks Chris for your valuable solution and useful links. I will try to implement this. – IrfanRaza Aug 01 '11 at 06:02
  • Thanks a lot, Chris. Your solution has helped us a lot, but we are stuck at one place: how can we find out whether text is underlined or not? – Deepesh Aug 27 '12 at 06:42
  • 3
    Hi @Deepesh, unfortunately the PDF spec doesn't support underlining at the raw text level. Instead it is done in one of two ways, either through annotations (rare) or by just drawing a very thin rectangle under some text (most common). In both cases you would have to calculate the position of the text relative to the rectangle. While technically possible, I think it would be a giant headache. If your documents are uniform, however, you might be able to write some arbitrary rules. – Chris Haas Aug 27 '12 at 15:09
  • Awesomeness. Thanks for a new class. :) – Zain Shaikh Sep 02 '12 at 08:59
  • Good example! However, I'm trying to fetch coordinates of every word in the PDF file. Sometimes it tries to render several words at once, so I can't get the precise coordinate for each word. Any ideas how I could accomplish this? – Jón Trausti Arason Apr 27 '14 at 22:28
  • PDFs don't have "words", and to the best of my knowledge iText has no helper extraction strategies based solely upon words, but you should be able to repurpose the `SimpleTextExtractionStrategy` class. – Chris Haas Apr 28 '14 at 01:09
  • @raRaRa Calculate the Wordsize via 'var adjust = info.GetFont().GetWidth(_builder)' then take the current lower right pos and calculate the new Left: 'rect.Left = rect.Right - adjust / 100'. – Venson Oct 09 '15 at 07:00
  • Blog post here -> https://cjhaas.com/2011/07/31/getting-color-information-from-itextsharps-textrenderinfo-and-itextextractionstrategy/ – William Humphries Jun 18 '20 at 18:18
0

I converted @Chris's code to Java, in case anyone is looking for it:

import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import com.itextpdf.text.pdf.parser.Vector;

/**
 * Extraction strategy that renders text as HTML, wrapping each run that shares a baseline,
 * font name and font size in a {@code <span>} carrying font-family/font-size styles.
 */
public class TextWithFontExtractionStategy implements TextExtractionStrategy {
    //HTML buffer
    private StringBuilder result = new StringBuilder();

    //Store last used properties
    private Vector lastBaseLine;
    private String lastFont;
    private float lastFontSize;

    //http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/TextRenderInfo.html
    private enum TextRenderMode
    {
        FillText(0),
        StrokeText(1),
        FillThenStrokeText(2),
        Invisible(3),
        FillTextAndAddToPathForClipping(4),
        StrokeTextAndAddToPathForClipping(5),
        FillThenStrokeTextAndAddToPathForClipping(6),
        AddTextToPathForClipping(7);

        private final int value;

        TextRenderMode(int value) {
            this.value = value;
        }

        public int getValue() {
            return value;
        }
    }

    /**
     * Appends one rendered text chunk to the HTML buffer, opening a new span (and closing the
     * previous one) when the baseline, font name or font size differs from the last chunk.
     *
     * @param renderInfo chunk information supplied by the iText parser
     */
    public void renderText(TextRenderInfo renderInfo)
    {
        String curFont = renderInfo.getFont().getPostscriptFontName();
        //Check if faux bold is used (text that is filled and then stroked)
        if ((renderInfo.getTextRenderMode() == TextRenderMode.FillThenStrokeText.getValue()))
        {
            curFont += "-Bold";
        }

        //This code assumes that if the baseline changes then we're on a newline
        Vector curBaseline = renderInfo.getBaseline().getStartPoint();
        Vector topRight = renderInfo.getAscentLine().getEndPoint();
        //The font size is approximated by the distance from the baseline to the ascent line
        Rectangle rect = new Rectangle(curBaseline.get(Vector.I1), curBaseline.get(Vector.I2), topRight.get(Vector.I1), topRight.get(Vector.I2));
        float curFontSize = rect.getHeight();

        //Strings must be compared by content; '!=' on String compares object identity
        //and would report a font change for almost every chunk.
        boolean fontChanged = (curFont == null) ? (lastFont != null) : !curFont.equals(lastFont);

        //See if something has changed, either the baseline, the font or the font size
        if ((this.lastBaseLine == null) || (curBaseline.get(Vector.I2) != lastBaseLine.get(Vector.I2)) || (curFontSize != lastFontSize) || fontChanged)
        {
            //if we've put down at least one span tag close it
            if ((this.lastBaseLine != null))
            {
                this.result.append("</span>").append("\n");
            }
            //If the baseline has changed then insert a line break
            if ((this.lastBaseLine != null) && curBaseline.get(Vector.I2) != lastBaseLine.get(Vector.I2))
            {
                this.result.append("<br />").append("\n");
            }
            //Create an HTML tag with appropriate styles.
            //Plain %s placeholders: the C#-style braces were being emitted literally into the CSS.
            this.result.append(String.format("<span style=\"font-family:%s;font-size:%s\">", escapeHtml(curFont), curFontSize));
        }

        //Append the current text, escaped so characters such as '<' and '&' cannot break the markup.
        //NOTE(review): the trailing space after every chunk is kept from the original port;
        //it may split words that are rendered in several chunks - confirm it is intended.
        this.result.append(escapeHtml(renderInfo.getText())).append(" ");

        //Set currently used properties
        this.lastBaseLine = curBaseline;
        this.lastFontSize = curFontSize;
        this.lastFont = curFont;
    }

    /**
     * Returns the HTML produced so far. The buffer itself is left untouched, so calling this
     * more than once does not accumulate extra closing tags.
     *
     * @return the generated HTML, with the last open span closed
     */
    public String getResultantText()
    {
        //If we wrote anything then we'll always have a missing closing tag so close it here
        if (result.length() > 0)
        {
            return result.toString() + "</span>";
        }
        return result.toString();
    }

    //Not needed
    public void beginTextBlock() { }
    public void endTextBlock() { }
    public void renderImage(ImageRenderInfo renderInfo) { }

    /** Escapes the characters that are significant in HTML text and attribute values; null becomes "". */
    private static String escapeHtml(String text)
    {
        if (text == null)
        {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++)
        {
            char ch = text.charAt(i);
            switch (ch)
            {
                case '&': escaped.append("&amp;"); break;
                case '<': escaped.append("&lt;"); break;
                case '>': escaped.append("&gt;"); break;
                case '"': escaped.append("&quot;"); break;
                default: escaped.append(ch);
            }
        }
        return escaped.toString();
    }

}