4

I want to extract an image from a PDF file. I tried the following code, and it extracted a JPEG image from the PDF perfectly. The problem is how to extract an image from a particular page, e.g. page 1 or some other page. I don't want to read the whole PDF to search for the image.

Any suggestions?

Code to extract Image:

/// <summary>
/// Scans every object in the PDF's cross-reference table and returns each image
/// XObject that GDI+ is able to decode from the stream's raw bytes.
/// </summary>
/// <remarks>
/// The stream bytes are read raw (no PDF filter is applied), so in practice only
/// images whose stored bytes are already a format GDI+ understands (e.g. JPEG)
/// are returned; everything else is skipped. The scan is document-wide, not
/// page-aware. As in the original code, <c>pictureBox1</c> is updated with each
/// decoded image, so it ends up showing the last one.
/// </remarks>
/// <param name="PDFSourcePath">Path of the PDF file to scan.</param>
/// <returns>
/// The decoded images in cross-reference order (possibly empty). The caller owns
/// the returned images and is responsible for disposing them.
/// </returns>
private List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
        {
            List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();

            iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
            iTextSharp.text.pdf.PdfReader PDFReaderObj = null;

            try
            {
                RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath);
                PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);

                for (int i = 0; i < PDFReaderObj.XrefSize; i++)
                {
                    iTextSharp.text.pdf.PdfObject PDFObj = PDFReaderObj.GetPdfObject(i);
                    if (PDFObj == null || !PDFObj.IsStream())
                    {
                        continue;
                    }

                    iTextSharp.text.pdf.PdfStream PDFStremObj = (iTextSharp.text.pdf.PdfStream)PDFObj;
                    iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE);
                    if (!iTextSharp.text.pdf.PdfName.IMAGE.Equals(subtype))
                    {
                        continue;
                    }

                    byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw((iTextSharp.text.pdf.PRStream)PDFStremObj);
                    if (bytes == null)
                    {
                        continue;
                    }

                    try
                    {
                        using (System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes))
                        using (System.Drawing.Image decoded = System.Drawing.Image.FromStream(MS))
                        {
                            // Image.FromStream requires its stream to stay open for the
                            // lifetime of the image, so copy the pixels into a Bitmap
                            // that does not depend on the stream before disposing it.
                            System.Drawing.Image ImgPDF = new System.Drawing.Bitmap(decoded);
                            ImgList.Add(ImgPDF);
                            pictureBox1.Image = ImgPDF;
                        }
                    }
                    catch (ArgumentException)
                    {
                        // Best effort: the raw bytes are not an image format GDI+ can
                        // decode (e.g. a non-JPEG image stream); skip this object.
                    }
                    catch (System.Runtime.InteropServices.ExternalException)
                    {
                        // Best effort: GDI+ failed on this particular image; skip it.
                    }
                }
            }
            finally
            {
                // Release the file even when reading fails. No catch/rethrow here, so
                // the original exception type and stack trace reach the caller intact.
                if (PDFReaderObj != null)
                {
                    PDFReaderObj.Close();
                }
                else if (RAFObj != null)
                {
                    // The reader was never constructed, so nothing else will close the file.
                    RAFObj.Close();
                }
            }

            return ImgList;
        }
Bruno Lowagie
  • 75,994
  • 9
  • 109
  • 165
Aaraadhana
  • 145
  • 1
  • 3
  • 14
  • 4
    It has nothing to do with the problem at hand, but I thought I would point it out: that `throw new Exception(ex.Message);` won't rethrow the exception, creating a new one instead and causing the loss of stack trace information. Simply use `throw;`, unless that's the effect you're explicitly trying to achieve. – s.m. May 21 '12 at 17:07
  • @harriyott: You are not getting my point. I want to specify the page number for which it should search for the Image. – Aaraadhana May 21 '12 at 17:18
  • possible duplicate of [itextsharp extract images](http://stackoverflow.com/questions/802269/itextsharp-extract-images) – Chris Haas May 21 '12 at 18:23
  • @ChrisHaas: Sir i only want to specify the page number to Read. Now its reading the full PDF and search images from it. – Aaraadhana May 21 '12 at 18:34
  • @ChrisHaas: RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath); PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null); In the above you can see its reading i only want to know some way to specify the page so that it only read a particular page – Aaraadhana May 21 '12 at 18:36
  • @Aaraadhana, your code bypasses the entire concept of pages and instead looks at the entire PDF root objects. If you want to go by pages, see the link that I posted where you call `PdfReader.GetPageN(int)` that gives you a `PdfDictionary` for a given page and from that you can enumerate the `PdfName.RESOURCES` objects. – Chris Haas May 21 '12 at 19:08
  • @ChrisHaas: Sir i tried but its still showing other page pictures. I only want to view Page 1 image. But its is still checking for the image on the other pages – Aaraadhana May 21 '12 at 19:51

3 Answers

9

I don't have iTextSharp 4.0 available currently so this code targets 5.2 but it should work just fine for the older one, too. This code is an almost direct lift from this post here, so please see that post as well as responses for further questions. As I said in the comments above, your code is looking at all of the images from the document-perspective while the code that I linked to goes page-by-page.

Please read all of the comments in the other post, especially this one which explains that this ONLY works for JPG images. There's a lot of different types of images that PDF supports so unless you know that you're only dealing with JPGs you'll need to add a bunch of more code. See this post and this post for some hints.

        // Extracts the image XObjects referenced by a single page's /Resources
        // dictionary and saves each one as a JPEG on the desktop.
        // Only images whose raw stream bytes GDI+ can decode directly (in practice
        // JPEG) are supported; Image.FromStream throws for anything else.
        string testFile = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Doc1.pdf");
        string outputPath = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
        int pageNum = 1;

        PdfReader pdf = new PdfReader(testFile);
        try {
            PdfDictionary pg = pdf.GetPageN(pageNum);
            if (pg == null) { return; }
            PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
            if (res == null) { return; }
            PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
            if (xobj == null) { return; }

            // Loop-invariant: look the JPEG encoder up once rather than per image.
            var jpegEncoder = Array.Find(ImageCodecInfo.GetImageEncoders(), x => x.FormatID == ImageFormat.Jpeg.Guid);
            int savedCount = 0;

            foreach (PdfName name in xobj.Keys) {
                PdfObject obj = xobj.Get(name);
                if (!obj.IsIndirect()) { continue; }
                PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
                if (tg == null) { continue; }
                // Skip non-image XObjects (and entries with no /Subtype at all).
                if (!PdfName.IMAGE.Equals(PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)))) { continue; }
                // The resolved XObject is itself the stream; no need to look it up
                // again through its cross-reference number.
                PRStream imgStream = tg as PRStream;
                if (imgStream == null) { continue; }
                byte[] bytes = PdfReader.GetStreamBytesRaw(imgStream);
                if (bytes == null) { continue; }
                using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
                using (System.Drawing.Image img = System.Drawing.Image.FromStream(memStream))
                using (System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1)) {
                    if (!Directory.Exists(outputPath)) {
                        Directory.CreateDirectory(outputPath);
                    }

                    // The first image keeps the original "<page>.jpg" name; later images
                    // on the same page get an index suffix so they no longer overwrite it.
                    string fileName = savedCount == 0
                        ? String.Format(@"{0}.jpg", pageNum)
                        : String.Format(@"{0}_{1}.jpg", pageNum, savedCount);
                    string path = Path.Combine(outputPath, fileName);
                    parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
                    img.Save(path, jpegEncoder, parms);
                    savedCount++;
                }
            }
        }
        finally {
            // Always release the reader, including on the early returns above.
            pdf.Close();
        }
Community
  • 1
  • 1
Chris Haas
  • 53,986
  • 12
  • 141
  • 274
  • 1
    Hi, thanks, your solution help me but in one pdf xobj.Keys return all images of all pages for each page. Do you have any idea why ? – Dragouf Jul 30 '13 at 16:13
2

The following is the code which I am using to extract images from PDF. It works completely fine for me.

//   Required: iTextSharp.dll

using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using iTextSharp.text.pdf.parser;
using Dotnet = System.Drawing.Image;
using iTextSharp.text.pdf;

namespace PDF_Parsing {
    /// <summary>
    /// Extracts the image XObjects referenced by page 1 of a PDF and saves them
    /// to <see cref="imgPath"/>.
    /// </summary>
    partial class ExtractPdfImage
    {
        // Every extracted image is written to this one path, so when the page
        // has several images only the last one written remains on disk.
        string imgPath = @"c:\extractedImg.png";

        /// <summary>
        /// Walks the /XObject resources of the page and renders every indirect
        /// entry whose /Subtype is /Image.
        /// </summary>
        /// <param name="pdfFile">Path of the PDF file to read.</param>
        private void ExtractImage(string pdfFile)
        {
            const int pageNumber = 1;
            PdfReader pdf = new PdfReader(pdfFile);
            try
            {
                PdfDictionary pg = pdf.GetPageN(pageNumber);
                if (pg == null)
                {
                    return;
                }

                PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
                if (res == null)
                {
                    return;
                }

                // A page without any XObjects simply has nothing to extract.
                PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
                if (xobj == null)
                {
                    return;
                }

                foreach (PdfName name in xobj.Keys)
                {
                    PdfObject obj = xobj.Get(name);
                    if (!obj.IsIndirect())
                    {
                        continue;
                    }

                    PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
                    // Only image XObjects carry the /Width and /Height read below;
                    // other XObject kinds (e.g. forms) are skipped.
                    if (tg == null || !PdfName.IMAGE.Equals(PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE))))
                    {
                        continue;
                    }

                    string width = tg.Get(PdfName.WIDTH).ToString();
                    string height = tg.Get(PdfName.HEIGHT).ToString();
                    // Parse culture-invariantly: the values come from the PDF, not from
                    // the user's locale.
                    ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(
                        new Matrix(
                            float.Parse(width, System.Globalization.CultureInfo.InvariantCulture),
                            float.Parse(height, System.Globalization.CultureInfo.InvariantCulture)),
                        (PRIndirectReference)obj, tg);
                    RenderImage(imgRI);
                }
            }
            finally
            {
                // Release the reader on every path, including early returns and errors.
                pdf.Close();
            }
        }

        /// <summary>
        /// Converts the rendered PDF image to a System.Drawing image and saves a
        /// copy of it to <see cref="imgPath"/>.
        /// </summary>
        /// <param name="renderInfo">Render info describing the image XObject.</param>
        private void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject image = renderInfo.GetImage();
            using (Dotnet dotnetImg = image.GetDrawingImage())
            {
                if (dotnetImg == null)
                {
                    return;
                }

                // Save a fresh Bitmap copy rather than the decoded image itself
                // (presumably so the file is not written in the source image's own
                // raw format -- confirm). The copy is disposed once it is written.
                using (Bitmap copy = new Bitmap(dotnetImg))
                {
                    copy.Save(imgPath);
                }
            }
        }
    }
}
Rüdiger Hanke
  • 6,215
  • 2
  • 38
  • 45
Dhivya X.P
  • 71
  • 1
  • 1
  • I've tried this approach and i get an image, however it's distorted for some reason (as if byte stream for the bitmap got output to a different width bitmap). Any ideas? – Muxa May 23 '14 at 09:10
0

The following code works fine to extract an image from a particular page.

using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using iTextSharp.text.pdf.parser;
using Dotnet = System.Drawing.Image;
using iTextSharp.text.pdf;
namespace PDF_Parsing
{
  /// <summary>
  /// Extracts the image XObjects referenced by page 1 of a PDF and saves them
  /// to <see cref="imgPath"/>.
  /// </summary>
  partial class PDF_ImgExtraction
  {
    // No initializer is visible in this part of the class: imgPath must be
    // assigned before ExtractImage runs, otherwise Save receives a null path.
    // Every image is written to this one path, so when the page has several
    // images only the last one written remains on disk.
    string imgPath;

    /// <summary>
    /// Walks the /XObject resources of the page and renders every indirect
    /// entry whose /Subtype is /Image.
    /// </summary>
    /// <param name="pdfFile">Path of the PDF file to read.</param>
    private void ExtractImage(string pdfFile)
    {
      const int pageNumber = 1;//Page number to extract the image from
      PdfReader pdf = new PdfReader(pdfFile);
      try
      {
        PdfDictionary pg = pdf.GetPageN(pageNumber);
        if (pg == null)
        {
          return;
        }

        PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
        if (res == null)
        {
          return;
        }

        // A page without any XObjects simply has nothing to extract.
        PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
        if (xobj == null)
        {
          return;
        }

        foreach (PdfName name in xobj.Keys)
        {
          PdfObject obj = xobj.Get(name);
          if (!obj.IsIndirect())
          {
            continue;
          }

          PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
          // Only image XObjects carry the /Width and /Height read below;
          // other XObject kinds (e.g. forms) are skipped.
          if (tg == null || !PdfName.IMAGE.Equals(PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE))))
          {
            continue;
          }

          string width = tg.Get(PdfName.WIDTH).ToString();
          string height = tg.Get(PdfName.HEIGHT).ToString();
          // Parse culture-invariantly: the values come from the PDF, not from
          // the user's locale.
          float w = float.Parse(width, System.Globalization.CultureInfo.InvariantCulture);
          float h = float.Parse(height, System.Globalization.CultureInfo.InvariantCulture);
          ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(w, h), (PRIndirectReference)obj, tg);
          RenderImage(imgRI);
        }
      }
      finally
      {
        // Release the reader on every path, including early returns and errors.
        pdf.Close();
      }
    }

    /// <summary>
    /// Converts the rendered PDF image to a System.Drawing image and saves a
    /// copy of it to <see cref="imgPath"/>.
    /// </summary>
    /// <param name="renderInfo">Render info describing the image XObject.</param>
    private void RenderImage(ImageRenderInfo renderInfo)
    {
      PdfImageObject image = renderInfo.GetImage();
      using (Dotnet dotnetImg = image.GetDrawingImage())
      {
        if (dotnetImg == null)
        {
          return;
        }

        // Save a fresh Bitmap copy rather than the decoded image itself
        // (presumably so the file is not written in the source image's own
        // raw format -- confirm). The copy is disposed once it is written.
        using (Bitmap copy = new Bitmap(dotnetImg))
        {
          copy.Save(imgPath);
        }
      }
    }
  }
}
Andrew Barber
  • 39,603
  • 20
  • 94
  • 123
Dhivya X.P
  • 71
  • 1
  • 1