1

I have a few questions related to bookmarks and links.

BookMarks

  1. Once we get the bookmarks, is there any option to locate a bookmark in the page content, modify the text of that bookmark, and also modify the bookmark present in the tree structure of the PDF file?

  2. Do we have an option to search for a bookmark in the page content, irrespective of differences such as spacing, spelling, etc.?

  3. Is there an option to edit a bookmark and the corresponding bookmark text present in the PDF page content?

  4. Is there any option to find H1, H2 headings in the page content?

  5. Is there any option to check whether a bookmark is pointing to the correct page or not?

Links

  1. Is there any option to find links, and to check whether each link is pointing to the correct URL?

This is my code

/// <summary>
/// Walks the bookmark (outline) tree of a PDF file and, for every bookmark whose
/// title text occurs on the page the bookmark points to, appends a row to the
/// result table. The table is stored in <c>ViewState["Append"]</c> and bound to
/// <c>GVResult</c>.
/// </summary>
/// <param name="fileName">Path of the PDF file to inspect. Nothing is read when the file does not exist.</param>
/// <param name="CompareText">Not used by this method; kept so existing callers keep compiling.</param>
public void ReadPdfFile(string fileName, string CompareText)
{
    // Reuse the table from a previous call when one is stored in ViewState.
    // "as" yields null when the slot is empty or holds another type, so fall
    // back to a fresh table instead of dereferencing null below.
    System.Data.DataTable dtResult = ViewState["Append"] as System.Data.DataTable;
    if (dtResult == null)
    {
        dtResult = new System.Data.DataTable();
    }

    if (!dtResult.Columns.Contains("BookMarks"))
    {
        dtResult.Columns.Add(new DataColumn("BookMarks"));
    }
    if (!dtResult.Columns.Contains("Exists"))
    {
        dtResult.Columns.Add(new DataColumn("Exists"));
    }

    if (File.Exists(fileName))
    {
        PdfReader pdfReader = new PdfReader(fileName);
        try
        {
            IList<Dictionary<string, object>> bookmarks = SimpleBookmark.GetBookmark(pdfReader);

            // Several bookmarks commonly point at the same page; extract each page's text once.
            Dictionary<int, string> pageTextCache = new Dictionary<int, string>();

            AppendBookmarkMatches(pdfReader, bookmarks, dtResult, pageTextCache);
            dtResult.AcceptChanges();
        }
        finally
        {
            // Always release the reader, even when extraction throws.
            pdfReader.Close();
        }
    }

    ViewState["Append"] = dtResult;
    GVResult.DataSource = dtResult;
    GVResult.DataBind();
}

/// <summary>
/// Visits <paramref name="bookmarks"/> and all nested "Kids" lists (parent before
/// children) and adds a result row for each bookmark whose title is found in the
/// text of its target page.
/// </summary>
private static void AppendBookmarkMatches(
    PdfReader pdfReader,
    IList<Dictionary<string, object>> bookmarks,
    System.Data.DataTable dtResult,
    Dictionary<int, string> pageTextCache)
{
    // A missing list (no outlines, or a node without children) simply ends the walk.
    if (bookmarks == null)
    {
        return;
    }

    foreach (Dictionary<string, object> bookmark in bookmarks)
    {
        if (bookmark == null)
        {
            continue;
        }

        object titleValue;
        int pageNumber;
        if (bookmark.TryGetValue("Title", out titleValue)
            && titleValue != null
            && TryGetBookmarkPage(bookmark, pdfReader.NumberOfPages, out pageNumber))
        {
            string title = titleValue.ToString();
            string pageText = GetPageText(pdfReader, pageNumber, pageTextCache);

            // The extracted text is compared as-is. Round-tripping it through
            // Encoding.Default would replace characters outside the ANSI code
            // page with '?', so titles containing them could never match.
            if (pageText != null && pageText.Contains(title))
            {
                DataRow dr = dtResult.NewRow();
                dr["BookMarks"] = title;
                // NOTE(review): a row is only added when the title WAS found on the
                // page, yet "Exists" is set to "No" - looks inverted; confirm intent.
                dr["Exists"] = "No";
                dtResult.Rows.Add(dr);
            }
        }

        // Recurse by key rather than by entry count so every nesting depth is
        // covered and no sibling is skipped.
        object kidsValue;
        if (bookmark.TryGetValue("Kids", out kidsValue))
        {
            AppendBookmarkMatches(
                pdfReader,
                kidsValue as IList<Dictionary<string, object>>,
                dtResult,
                pageTextCache);
        }
    }
}

/// <summary>
/// Reads the target page number of a bookmark from the first space-separated
/// token of its "Page" entry.
/// </summary>
/// <returns>
/// <c>true</c> when the entry exists, parses as an integer and lies within
/// 1..<paramref name="pageCount"/>; otherwise <c>false</c>.
/// </returns>
private static bool TryGetBookmarkPage(Dictionary<string, object> bookmark, int pageCount, out int pageNumber)
{
    pageNumber = 0;

    object pageValue;
    if (!bookmark.TryGetValue("Page", out pageValue) || pageValue == null)
    {
        // Bookmark has no page destination recorded under "Page".
        return false;
    }

    string[] parts = pageValue.ToString().Split(' ');
    if (!int.TryParse(
            parts[0],
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out pageNumber))
    {
        return false;
    }

    return pageNumber >= 1 && pageNumber <= pageCount;
}

/// <summary>
/// Returns the extracted text of a page, extracting it on first use and serving
/// later requests for the same page from <paramref name="pageTextCache"/>.
/// </summary>
private static string GetPageText(PdfReader pdfReader, int pageNumber, Dictionary<int, string> pageTextCache)
{
    string pageText;
    if (!pageTextCache.TryGetValue(pageNumber, out pageText))
    {
        pageText = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber);
        pageTextCache[pageNumber] = pageText;
    }
    return pageText;
}
Chris Haas
  • 53,986
  • 12
  • 141
  • 274
  • 2
    I can answer your fourth question. `H1`, `H2`, etc. are HTML concepts and don't exist in PDF. PDF doesn't have "headers", "footers", "titles", "tables" and even what we think of as "hyperlinks" aren't really comparable to HTML's version. PDF has text, lines, images and annotations, that's about it. So the direct answer to your fourth question is "no". However, see this answer for extracting text and getting some formatting from which you can apply your own logic to determine the header levels. http://stackoverflow.com/a/6884297/231316 – Chris Haas Jan 16 '14 at 14:03
  • See this for extracting links: http://stackoverflow.com/a/8141831/231316 – Chris Haas Jan 16 '14 at 15:23
  • 1
    To be honest, the remaining questions are just too vague to answer. You talk about "bookmarks" but I don't see any PDF-specific "bookmark" things, I only see your `DataTable` which has a `BookMarks` column in it. We don't know anything about that table nor do we know anything about your specific PDFs. Please try to limit your question to a single specific thing that we can reproduce and help you with. – Chris Haas Jan 16 '14 at 15:24

0 Answers0