0

I would like to find the position on the page of a line (or paragraph) in a PDF that contains a given pattern.

For example, I can have this problem:

  • As input, I have a regex (for example "Test.*") and a PDF containing a line (or a paragraph) that matches this regex.
  • As output, I want the list of Y positions of the lines that match this regex.

Does anyone have an idea how I can detect those positions?

Thank you very much.

Eliott

Eliott Roynette
  • 716
  • 8
  • 21
  • 3
    we all do think positive about this, so please do it, and when done show us your code. – Ashkan Mobayen Khiabani Sep 20 '18 at 09:50
  • No, it is a duplicate of https://stackoverflow.com/questions/23909893/getting-coordinates-of-string-using-itextextractionstrategy-and-locationtextextr – Eliott Roynette Sep 20 '18 at 12:00
  • Should i close that question ? – Eliott Roynette Sep 20 '18 at 12:00
  • Which iText version do you target? The reference you gave is based on iText 5.x. If that's your target, too, we can gladly close your question as duplicate thereof. If you target iText 7.x, though, you still may be interested in new answers. – mkl Sep 20 '18 at 14:07

1 Answers1

1

I have something that may help you, although it is not complete: I started writing it but never finished. With it you will be able to determine the position of the text. The program goes through each item in the PDF and returns its coordinates.

I use iText 7 and .NET Core.

// Example call: search every listed PDF for the text "test".
var pdfPaths = new string[] { "1.pdf" };
FindTextInPdf("test", pdfPaths);

 /// <summary>
 /// Scans every page of each existing PDF in <paramref name="sources"/>, reports the pages whose
 /// extracted text contains <paramref name="SearchStr"/>, and prints the position and font data
 /// of every character chunk collected for that page.
 /// </summary>
 /// <param name="SearchStr">Plain text to look for in the extracted page text.</param>
 /// <param name="sources">Paths of the PDF files to scan; paths that do not exist are skipped.</param>
 public void FindTextInPdf(string SearchStr, string[] sources)
 {
     foreach (var item in sources)
     {
         if (!File.Exists(item))
         {
             // Missing files are skipped silently, exactly as before.
             continue;
         }

         using (PdfReader reader = new PdfReader(item))
         using (var doc = new PdfDocument(reader))
         {
             var pageCount = doc.GetNumberOfPages();

             // iText page numbers are 1-based.
             for (int i = 1; i <= pageCount; i++)
             {
                 PdfPage page = doc.GetPage(i);
                 var box = page.GetCropBox();
                 var rect = new Rectangle(box.GetX(), box.GetY(), box.GetWidth(), box.GetHeight());

                 // Restrict extraction to the page's crop box.
                 var filter = new IEventFilter[] { new TextRegionEventFilter(rect) };

                 // The chunk list is static and shared by all strategy instances; reset it so the
                 // output below only contains chunks of the current page, not of earlier pages/files.
                 TextLocationStrategy.objectResult.Clear();

                 ITextExtractionStrategy strategy = new FilteredTextEventListener(new TextLocationStrategy(), filter);
                 var str = PdfTextExtractor.GetTextFromPage(page, strategy);
                 if (str.Contains(SearchStr))
                 {
                     Console.WriteLine("Searched text found in file:[ " + item + " ] page : [ " + i + " ]");
                 }

                 // objectResult lives on TextLocationStrategy, so it must be qualified here.
                 foreach (var d in TextLocationStrategy.objectResult)
                 {
                     Console.WriteLine("Char >" + d.Text + " X >" + d.Rect.GetX() + " Y >" + d.Rect.GetY() + " font >" + d.FontFamily + " font size >" + d.FontSize.ToString() + " space >" + d.SpaceWidth);
                 }
             }
         }
     }
 }


/// <summary>
/// Text extraction strategy that, in addition to the normal location-based text extraction,
/// records one <see cref="TextMyChunk"/> per rendered character in <see cref="objectResult"/>.
/// </summary>
class TextLocationStrategy : LocationTextExtractionStrategy
{
    /// <summary>
    /// Character chunks collected so far. The list is static, so it is shared by every instance
    /// and keeps growing until a caller clears it.
    /// </summary>
    public static List<TextMyChunk> objectResult = new List<TextMyChunk>();

    /// <summary>Text, bounding rectangle and font data of a single rendered character.</summary>
    public class TextMyChunk
    {
        public string Text { get; set; }
        public Rectangle Rect { get; set; }
        public string FontFamily { get; set; }
        public float FontSize { get; set; }
        public float SpaceWidth { get; set; }

    }

    /// <summary>
    /// Handles a content-stream event: forwards it to the base strategy and, for text rendering
    /// events, appends one chunk per character to <see cref="objectResult"/>.
    /// </summary>
    /// <param name="data">Event payload; a <c>TextRenderInfo</c> for RENDER_TEXT events.</param>
    /// <param name="type">Kind of event being reported.</param>
    public override void EventOccurred(IEventData data, EventType type)
    {
        // Forward the event to the base strategy as well; without this the base class never
        // collects any text and GetResultantText() (used by PdfTextExtractor) stays empty.
        base.EventOccurred(data, type);

        if (!type.Equals(EventType.RENDER_TEXT)) return;

        TextRenderInfo renderInfo = (TextRenderInfo)data;

        IList<TextRenderInfo> text = renderInfo.GetCharacterRenderInfos();
        foreach (TextRenderInfo t in text)
        {
            string letter = t.GetText();

            // Rectangle spans from the start of the baseline to the end of the ascent line.
            Vector letterStart = t.GetBaseline().GetStartPoint();
            Vector letterEnd = t.GetAscentLine().GetEndPoint();
            Rectangle letterRect = new Rectangle(letterStart.Get(0), letterStart.Get(1), letterEnd.Get(0) - letterStart.Get(0), letterEnd.Get(1) - letterStart.Get(1));

            TextMyChunk chunk = new TextMyChunk();
            chunk.Text = letter;
            chunk.Rect = letterRect;
            chunk.FontFamily = t.GetFont().GetFontProgram().ToString();
            chunk.FontSize = t.GetFontSize();
            chunk.SpaceWidth = t.GetSingleSpaceWidth();

            objectResult.Add(chunk);
        }
    }
}
siasty
  • 167
  • 1
  • 2
  • 10