1

I am new to work on ITextSharp. I am able to extract the text (with correct word separation) when I implement SimpleTextExtractionStrategy, but I need the font information (like font family and font size) and I implemented the solution given in following link. It works great in terms of font information extraction but problem raised is that in output text spaces between words is lost and all characters are displayed without spaces. For example the text "Hello World" is correctly displayed by SimpleTextExtractionStrategy but it is displayed as "HelloWorld" when the given solution is implemented. Any help?. Thanks in advance. A portion of code is copied

private void Form1_Load(object sender, EventArgs e)
    {
        String filePath;
        filePath = "C:\\paper1.pdf";

        if (File.Exists(filePath))
        {
            PdfReader reader = new PdfReader(filePath);

            TextWithFontExtractionStrategy S = new TextWithFontExtractionStrategy();
            SimpleTextExtractionStrategy st = new SimpleTextExtractionStrategy();

            string F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
            string r = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, st);

            MessageBox.Show(F);

            this.Close();
        }
        else
        {
            MessageBox.Show("Could not locate the file");
        }
    }
}

public class TextWithFontExtractionStrategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
{
    private XmlHandeler xmlHandeler = new XmlHandeler();

    private string textToWrite;

    private StringBuilder result = new StringBuilder();
    private Vector lastBaseLine;
    private string lastFont;
    private float lastFontSize;
    private bool isBold = false;
    private enum TextRenderMode
    {
        FillText = 0,
        StrokeText = 1,
        FillThenStrokeText = 2,
        Invisible = 3,
        FillTextAndAddToPathForClipping = 4,
        StrokeTextAndAddToPathForClipping = 5,
        FillThenStrokeTextAndAddToPathForClipping = 6,
        AddTextToPaddForClipping = 7
    }

    public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
    {
        string curFont = renderInfo.GetFont().PostscriptFontName;

        if ((renderInfo.GetTextRenderMode() == (int)TextRenderMode.FillThenStrokeText))
        {
            isBold = true;
        }

        Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
        Vector topRight = renderInfo.GetAscentLine().GetEndPoint();

        iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
        Single curFontSize = rect.Height;

        if (curFont != lastFont || curFontSize != lastFontSize)
        {
            xmlHandeler.createNode(curFont, curFontSize, isBold, result.ToString());
            result.Clear();
        }
            this.result1.Append(renderInfo.GetText());

            //Set currently used properties
            this.lastBaseLine = curBaseline;
            this.lastFontSize = curFontSize;
            this.lastFont = curFont;
        }
        public string GetResultantText()
        {
            if (result.Length > 0)
            {
                xmlHandeler.writeEndString();
            }
            xmlHandeler.createNode("abc", 12f, isBold, result.ToString());
            return result.ToString();
        }



        //Not needed
        public void BeginTextBlock() { }
        public void EndTextBlock() { }
        public void RenderImage(ImageRenderInfo renderInfo) { }
    }
Community
  • 1
  • 1
  • Post some code to explain further. Maybe use LocationTextExtractionStrategy and split string str.Split('\n'); – misha130 Mar 04 '16 at 18:01
  • I am searching where it might've stripped the \n or special characters. What about result1? What does it contain – misha130 Mar 04 '16 at 18:22
  • result1 is just a typo, actually it is result. The problem is that it do not give spaces between the words. I don't know how to get the spaces between words. – Shahbaz Ahmad Sahi Mar 04 '16 at 18:31
  • `ITextExtractionStrategy` does contain code in its `RenderText` method to insert missing space characters. Your code ignores this. Thus, spaces are not added where needed. – mkl Mar 05 '16 at 00:36
  • @mkl can you please give me code hints to add missing spaces. The code I ignores. I mean what is code that I am ignoring and how to add it? – Shahbaz Ahmad Sahi Mar 05 '16 at 03:49
  • Ah, sorry, I got the classes mixed up. `ITextExtractionStrategy` is merely declaring methods, not implementing them. I had the `SimpleTextExtractionStrategy` on my mind you also mentioned in your question. But you may still want to look at the source of that strategy to understand how to add missing spaces. – mkl Mar 05 '16 at 11:06

1 Answers1

0

I got the solution by using the code given in SimpleTextExtractionStrategy's renderText method. Some code

Vector start = segment.GetStartPoint();
        Vector end = segment.GetEndPoint();
        bool firstRender = result.Length == 0;
        bool hardReturn = false;

        if (!firstRender)
        {
            Vector x0 = start;
            Vector x1 = lastStart;
            Vector x2 = lastEnd;

            float dist = (x2.Subtract(x1)).Cross((x1.Subtract(x0))).LengthSquared / x2.Subtract(x1).LengthSquared;
            float sameLineThreshold = 1f; // we should probably base this on the current font metrics, but 1 pt seems to be sufficient for the time being
            if (dist > sameLineThreshold)
                hardReturn = true;
        }

        if (hardReturn)
        {
            //AppendTextChunk('\n');
            AppendTextChunk(' ');
        }
        else if (!firstRender)
        {
            if (result[result.Length - 1] != ' ' && renderInfo.GetText().Length > 0 && renderInfo.GetText()[0] != ' ')
            {
                // we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space
                float spacing = lastEnd.Subtract(start).Length;
                if (spacing > renderInfo.GetSingleSpaceWidth() / 2f)
                {
                    AppendTextChunk(' ');
                }
            }
        }