0

I want to highlight several keywords in a set of PDF files. First, I have to identify the individual words and match them against my keywords. I found an example:

class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy
{
    //Hold each coordinate
    public List<RectAndText> myPoints = new List<RectAndText>();

    List<string> topicTerms;
    /// <summary>
    /// Creates the strategy and stores the supplied term list in the
    /// <c>topicTerms</c> field. The list reference is kept as-is (no copy is made).
    /// </summary>
    /// <param name="topicTerms">List of terms to keep on this instance.</param>
    public MyLocationTextExtractionStrategy(List<string> topicTerms)
    {
        this.topicTerms = topicTerms;
    }

    //Automatically called for each chunk of text in the PDF
    public override void RenderText(TextRenderInfo renderInfo)
    {
        base.RenderText(renderInfo);


        //Get the bounding box for the chunk of text
        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
        var topRight = renderInfo.GetAscentLine().GetEndPoint();

        //Create a rectangle from it
        var rect = new iTextSharp.text.Rectangle(
                                                bottomLeft[Vector.I1],
                                                bottomLeft[Vector.I2],
                                                topRight[Vector.I1],
                                                topRight[Vector.I2]
                                                );

        //Add this to our main collection
        //filter the meaingless words
        string text = renderInfo.GetText();
        this.myPoints.Add(new RectAndText(rect, renderInfo.GetText()));

However, I found that many words are broken apart. For example, "stop" is split into "st" and "op". Is there another way to identify a single word and its position?

juily lian
  • 31
  • 3
  • 1
    I can always spot my old [code](http://stackoverflow.com/a/23915452/231316)! Anyway, see [mkl's answer here](http://stackoverflow.com/a/20049810/231316) about using `IsChunkAtWordBoundary()` to figure out if two "chunks" should probably be one "word". – Chris Haas Dec 16 '15 at 19:46
  • Thanks for your old code. It indeed helps a lot. I will try your suggestions later. Thanks again. – juily lian Dec 18 '15 at 03:10
  • Anyway, I find a better way of collecting single words is in GetResultantText() but not RenderText(). – juily lian Dec 18 '15 at 20:06

1 Answers1

0

When you want to collect single words and their coordinates, the better way is to override the existing LocationTextExtractionStrategy. Here is my code:

/// <summary>
/// Builds the extracted text from the filtered, sorted text chunks and, as a
/// side effect, groups the chunks into words: every completed word is handed
/// to <c>mergeAndStoreChunk</c>, which appends it to <c>myPoints</c>.
/// </summary>
/// <param name="chunkFilter">Filter passed to <c>filterTextChunks</c> together with <c>locationalResult</c>.</param>
/// <returns>
/// The chunk texts concatenated in sorted order, with a single space inserted
/// at detected word boundaries on the same line and a newline between chunks
/// that are not on the same line.
/// </returns>
public virtual String GetResultantText(ITextChunkFilter chunkFilter){
        if (DUMP_STATE) {
            DumpState();
        }

        List<TextChunk> filteredTextChunks = filterTextChunks(locationalResult, chunkFilter);
        filteredTextChunks.Sort();

        // Chunks that belong to the word currently being assembled.
        List<RectAndText> tmpList = new List<RectAndText>();

        StringBuilder sb = new StringBuilder();
        TextChunk lastChunk = null;
        foreach (TextChunk chunk in filteredTextChunks) {

            if (lastChunk != null) {
                bool wordEnded;
                if (chunk.SameLine(lastChunk)) {
                    bool atBoundary = IsChunkAtWordBoundary(chunk, lastChunk);
                    bool spaceAtJoin = StartsWithSpace(chunk.Text) || EndsWithSpace(lastChunk.Text);

                    // we only insert a blank space if the trailing character of the previous string
                    // wasn't a space, and the leading character of the current string isn't a space
                    if (atBoundary && !spaceAtJoin) {
                        sb.Append(' ');
                    }

                    // A join that already contains whitespace separates two words as well,
                    // even though no extra blank has to be written to the output.
                    wordEnded = atBoundary || spaceAtJoin;
                } else {
                    sb.Append('\n');
                    // A line change always terminates the word in progress.
                    wordEnded = true;
                }

                if (wordEnded && tmpList.Count > 0) {
                    mergeAndStoreChunk(tmpList);
                    tmpList.Clear();
                }
            }

            sb.Append(chunk.Text);

            // Every chunk, including the first chunk of each line, joins the current word buffer.
            // NOTE(review): the rectangle is built only from the chunk's start and end locations;
            // if both lie on the text baseline the rectangle has no height - confirm.
            var startLocation = chunk.StartLocation;
            var endLocation = chunk.EndLocation;
            var rect = new iTextSharp.text.Rectangle(startLocation[0], startLocation[1], endLocation[0], endLocation[1]);
            tmpList.Add(new RectAndText(rect, chunk.Text));

            lastChunk = chunk;
        }

        // Flush the final word; without this the last word is never stored.
        if (tmpList.Count > 0) {
            mergeAndStoreChunk(tmpList);
            tmpList.Clear();
        }

        return sb.ToString();
    }

    /// <summary>
    /// Merges the chunks of one word into a single entry and appends it to
    /// <c>myPoints</c>. The first element of the list is reused as the merged
    /// entry: its text becomes the concatenation of all chunk texts and its
    /// rectangle is grown to enclose every chunk rectangle.
    /// </summary>
    /// <param name="tmpList">Chunks of one word, in reading order. A null or empty list is ignored.</param>
    private void mergeAndStoreChunk(List<RectAndText> tmpList)
    {
        // Nothing to merge; indexing element 0 below would throw otherwise.
        if (tmpList == null || tmpList.Count == 0)
        {
            return;
        }

        RectAndText mergedChunk = tmpList[0];
        for (int i = 1; i < tmpList.Count; i++)
        {
            RectAndText nowChunk = tmpList[i];

            // Grow the box on all four sides so chunks of differing height or
            // position are fully covered, not just extended to the right.
            mergedChunk.Rect.Left = System.Math.Min(mergedChunk.Rect.Left, nowChunk.Rect.Left);
            mergedChunk.Rect.Bottom = System.Math.Min(mergedChunk.Rect.Bottom, nowChunk.Rect.Bottom);
            mergedChunk.Rect.Right = System.Math.Max(mergedChunk.Rect.Right, nowChunk.Rect.Right);
            mergedChunk.Rect.Top = System.Math.Max(mergedChunk.Rect.Top, nowChunk.Rect.Top);

            mergedChunk.Text += nowChunk.Text;
        }
        this.myPoints.Add(mergedChunk);
    }

myPoints is the list that collects the results: each merged word together with its rectangle.

juily lian
  • 31
  • 3