11

I am trying to extract all the images from a pdf using itextsharp but can't seem to overcome this one hurdle.

The error occures on the line System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS); giving an error of "Parameter is not valid".

I think it works when the image is a bitmap but not of any other format.

I have this following code - sorry for the length;

    private void Form1_Load(object sender, EventArgs e)
    {
        FileStream fs = File.OpenRead(@"reader.pdf");
        byte[] data = new byte[fs.Length];
        fs.Read(data, 0, (int)fs.Length);

        List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();

        iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
        iTextSharp.text.pdf.PdfReader PDFReaderObj = null;
        iTextSharp.text.pdf.PdfObject PDFObj = null;
        iTextSharp.text.pdf.PdfStream PDFStremObj = null;

        try
        {
            RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(data);
            PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);

            for (int i = 0; i <= PDFReaderObj.XrefSize - 1; i++)
            {
                PDFObj = PDFReaderObj.GetPdfObject(i);

                if ((PDFObj != null) && PDFObj.IsStream())
                {
                    PDFStremObj = (iTextSharp.text.pdf.PdfStream)PDFObj;
                    iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE);

                    if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.IMAGE.ToString())
                    {
                        byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw((iTextSharp.text.pdf.PRStream)PDFStremObj);

                        if ((bytes != null))
                        {
                            try
                            {
                                System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes);

                                MS.Position = 0;
                                System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS);

                                ImgList.Add(ImgPDF);

                            }
                            catch (Exception)
                            {
                            }
                        }
                    }
                }
            }
            PDFReaderObj.Close();
        }
        catch (Exception ex)
        {
            throw new Exception(ex.Message);
        }



    } //Form1_Load
Dulini Atapattu
  • 2,735
  • 8
  • 33
  • 47
griegs
  • 22,624
  • 33
  • 128
  • 205

8 Answers8

5

Resolved...

Even I got the same exception of "Parameter is not valid" and after so much of work with the help of the link provided by der_chirurg (http://kuujinbo.info/iTextSharp/CCITTFaxDecodeExtract.aspx ) I resolved it and following is the code:

using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using iTextSharp.text.pdf.parser;
using Dotnet = System.Drawing.Image;
using iTextSharp.text.pdf;

namespace PDF_Parsing
{
    partial class PDF_ImgExtraction
    {
        string imgPath;
        private void ExtractImage(string pdfFile)
        {
            PdfReader pdfReader = new PdfReader(files[fileIndex]);
            for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
            {
                PdfReader pdf = new PdfReader(pdfFile);
                PdfDictionary pg = pdf.GetPageN(pageNumber);
                PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
                PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
                foreach (PdfName name in xobj.Keys)
                {
                    PdfObject obj = xobj.Get(name);
                    if (obj.IsIndirect())
                    {
                        PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
                        string width = tg.Get(PdfName.WIDTH).ToString();
                        string height = tg.Get(PdfName.HEIGHT).ToString();
                        ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(float.Parse(width), float.Parse(height)), (PRIndirectReference)obj, tg);
                        RenderImage(imgRI);
                    }
                }
            }
        }
        private void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject image = renderInfo.GetImage();
            using (Dotnet dotnetImg = image.GetDrawingImage())
            {
                if (dotnetImg != null)
                {
                    using (MemoryStream ms = new MemoryStream())
                    {
                        dotnetImg.Save(ms, ImageFormat.Tiff);
                        Bitmap d = new Bitmap(dotnetImg);
                        d.Save(imgPath);
                    }
                }
            }
        }
    }
}
Andrew Barber
  • 39,603
  • 20
  • 94
  • 123
Dhivya X.P
  • 71
  • 1
  • 1
  • 1
    Hi, thanks, your solution help me but in one pdf xobj.Keys return all images of all pages for each page. Do you have any idea why ? – Dragouf Jul 30 '13 at 16:12
4

Works for me like this, using these two methods:

    public static List<System.Drawing.Image> ExtractImagesFromPDF(byte[] bytes)
    {
        var imgs = new List<System.Drawing.Image>();
        var pdf = new PdfReader(bytes);

        try
        {
            for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
            {
                PdfDictionary pg = pdf.GetPageN(pageNumber);
                List<PdfObject> objs = FindImageInPDFDictionary(pg);

                foreach (var obj in objs)
                {
                    if (obj != null)
                    {
                        int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
                        PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
                        PdfStream pdfStrem = (PdfStream)pdfObj;
                        var pdfImage = new PdfImageObject((PRStream)pdfStrem);
                        var img = pdfImage.GetDrawingImage();

                        imgs.Add(img);
                    }
                }
            }
        }
        finally
        {
            pdf.Close();
        }

        return imgs;
    }

    private static List<PdfObject> FindImageInPDFDictionary(PdfDictionary pg)
    {
        var res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
        var xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
        var pdfObgs = new List<PdfObject>();

        if (xobj != null)
        {
            foreach (PdfName name in xobj.Keys)
            {
                PdfObject obj = xobj.Get(name);
                if (obj.IsIndirect())
                {
                    var tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
                    var type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));

                    if (PdfName.IMAGE.Equals(type)) // image at the root of the pdf
                    {
                        pdfObgs.Add(obj);
                    }
                    else if (PdfName.FORM.Equals(type)) // image inside a form
                    {
                        FindImageInPDFDictionary(tg).ForEach(o => pdfObgs.Add(o));
                    }
                    else if (PdfName.GROUP.Equals(type)) // image inside a group
                    {
                        FindImageInPDFDictionary(tg).ForEach(o => pdfObgs.Add(o));
                    }
                }
            }
        }

        return pdfObgs;
    }
4

You need to check the stream's /Filter to see what image format a given image uses. It may be a standard image format:

  • DCTDecode (jpeg)
  • JPXDecode (jpeg 2000)
  • JBIG2Decode (jbig is a B&W only format)
  • CCITTFaxDecode (fax format, PDF supports group 3 and 4)

Other than that, you'll need to get the raw bytes (as you are), and build an image using the image stream's width, height, bits per component, number of color components (could be CMYK, indexed, RGB, or Something Weird), and a few others, as defined in section 8.9 of the ISO PDF SPECIFICATION (available for free).

So in some cases your code will work, but in others, it'll fail with the exception you mentioned.

PS: When you have an exception, PLEASE include the stack trace every single time. Pretty please with sugar on top?

Mark Storer
  • 15,672
  • 3
  • 42
  • 80
  • +1 @Mark Storer, sorry about the StackTrace. Will do it next time. Sounds like a lot of faffen around to get images when a simple $500 library will do the job huh? I'm not a graphics person so I think I'll get the company to just buy something. Thanks for the answer too. – griegs May 10 '11 at 22:59
3

In newer version of iTextSharp, the 1st parameter of ImageRenderInfo.CreateForXObject is not Matrix anymore but GraphicsState. @der_chirurg's approach should work. I tested myself with the information from the following link and it worked beautifully:

http://www.thevalvepage.com/swmonkey/2014/11/26/extract-images-from-pdf-files-using-itextsharp/

gustavohenke
  • 40,997
  • 14
  • 121
  • 129
asanga15
  • 445
  • 5
  • 6
  • 1
    Hi there, welcome to SO and thank you for your answer. As links can change over time, could you edit your answer to include the pertinent information here? You can still provide the link for context. Thanks! – Tim Malone Aug 02 '16 at 22:57
1

To extract all Images on all Pages, it is not necessary to implement different filters. iTextSharp has an Image Renderer, which saves all Images in their original image type.

Just do the following found here: http://kuujinbo.info/iTextSharp/CCITTFaxDecodeExtract.aspx You don't need to implement HttpHandler...

der_chirurg
  • 1,475
  • 2
  • 16
  • 26
1

I added library on github which, extract images in PDF and compress them.

Could be useful, when you are going to start play with very powerful library ITextSharp.

Here the link: https://github.com/rock-walker/PdfCompression

rock_walker
  • 453
  • 5
  • 14
  • 2
    While this link may answer the question, it is better to include the essential parts of the answer here and provide the link for reference. Link-only answers can become invalid if the linked page changes. - [From Review](/review/low-quality-posts/17792322) – Jolta Oct 31 '17 at 11:54
  • I was elaborated on task about PDF compression (mainly extracting images an compressing them) for 2 months. And all over the SO, and other resources there were no consolidated example, how to deal with the things, like itextsharp, extracting images, compressing in PDF. I assembled a lot of small pieces of data in SO to get final working library. I really believe, that my end-to-end example could be considered as starting playground for those, who going to do the same things. – rock_walker Nov 01 '17 at 12:24
  • I don't think there is anything wrong with your library! It's just that Stack Overflow's policy expressly discourages answers that are only a link to an external site. The purpose of SO is that people should learn; not only that they will find a solution to their immediate problem. – Jolta Nov 01 '17 at 14:59
  • ah, I wouldn't like to argue with SO policy. From my experience I learned a lot from external links, just only few of them were broken. It's good to tackle the problem in the deep sometimes. Ok, it's my point of view. Just up to me :) – rock_walker Nov 01 '17 at 15:35
0

This works for me and I think it's a simple solution:

Write a custom RenderListener and implement its RenderImage method, something like this

    public void RenderImage(ImageRenderInfo info)
    {
        PdfImageObject image = info.GetImage();
        Parser.Matrix matrix = info.GetImageCTM();
        var fileType = image.GetFileType();
        ImageFormat format;
        switch (fileType)
        {//you may add more types here
            case "jpg":
            case "jpeg":
                format = ImageFormat.Jpeg;
                break;
            case "pnt":
                format = ImageFormat.Png;
                break;
            case "bmp":
                format = ImageFormat.Bmp;
                break;
            case "tiff":
                format = ImageFormat.Tiff;
                break;
            case "gif":
                format = ImageFormat.Gif;
                break;
            default:
                format = ImageFormat.Jpeg;
                break;
        }

        var pic = image.GetDrawingImage();
        var x = matrix[Parser.Matrix.I31];
        var y = matrix[Parser.Matrix.I32];
        var width = matrix[Parser.Matrix.I11];
        var height = matrix[Parser.Matrix.I22];
        if (x < <some value> && y < <some value>)
        {
            return;//ignore these images
        }

        pic.Save(<path and name>, format);
}
-1

I have used this library in the past without any problems.

http://www.winnovative-software.com/PdfImgExtractor.aspx

private void btnExtractImages_Click(object sender, EventArgs e)
{
    if (pdfFileTextBox.Text.Trim().Equals(String.Empty))
    {
        MessageBox.Show("Please choose a source PDF file", "Choose PDF file", MessageBoxButtons.OK);
        return;
    }

    // the source pdf file
    string pdfFileName = pdfFileTextBox.Text.Trim();

    // start page number
    int startPageNumber = int.Parse(textBoxStartPage.Text.Trim());
    // end page number
    // when it is 0 the extraction will continue up to the end of document
    int endPageNumber = 0;
    if (textBoxEndPage.Text.Trim() != String.Empty)
        endPageNumber = int.Parse(textBoxEndPage.Text.Trim());

    // create the PDF images extractor object
    PdfImagesExtractor pdfImagesExtractor = new PdfImagesExtractor();

    pdfImagesExtractor.LicenseKey = "31FAUEJHUEBQRl5AUENBXkFCXklJSUlQQA==";

    // the demo output directory
    string outputDirectory = Path.Combine(Application.StartupPath, @"DemoFiles\Output");

    Cursor = Cursors.WaitCursor;

    // set the handler to be called when an image was extracted
    pdfImagesExtractor.ImageExtractedEvent += pdfImagesExtractor_ImageExtractedEvent;

    try
    {
        // start images counting
        imageIndex = 0;

        // call the images extractor to raise the ImageExtractedEvent event when an images is extracted from a PDF page
        // the pdfImagesExtractor_ImageExtractedEvent handler below will be executed for each extracted image
        pdfImagesExtractor.ExtractImagesInEvent(pdfFileName, startPageNumber, endPageNumber);

        // Alternatively you can use the ExtractImages() and ExtractImagesToFile() methods
        // to extracted the images from a PDF document in memory or to image files in a directory

        // uncomment the line below to extract the images to an array of ExtractedImage objects
        //ExtractedImage[] pdfPageImages = pdfImagesExtractor.ExtractImages(pdfFileName, startPageNumber, endPageNumber);

        // uncomment the lines below to extract the images to image files in a directory
        //string outputDirectory = System.IO.Path.Combine(Application.StartupPath, @"DemoFiles\Output");
        //pdfImagesExtractor.ExtractImagesToFile(pdfFileName, startPageNumber, endPageNumber, outputDirectory, "pdfimage");
    }
    catch (Exception ex)
    {
        // The extraction failed
        MessageBox.Show(String.Format("An error occurred. {0}", ex.Message), "Error");
        return;
    }
    finally
    {
        // uninstall the event handler
        pdfImagesExtractor.ImageExtractedEvent -= pdfImagesExtractor_ImageExtractedEvent;

        Cursor = Cursors.Arrow;
    }

    try
    {
        System.Diagnostics.Process.Start(outputDirectory);
    }
    catch (Exception ex)
    {
        MessageBox.Show(string.Format("Cannot open output folder. {0}", ex.Message));
        return;
    }
}

/// <summary>
/// The ImageExtractedEvent event handler called after an image was extracted from a PDF page.
/// The event is raised when the ExtractImagesInEvent() method is used
/// </summary>
/// <param name="args">The handler argument containing the extracted image and the PDF page number</param>
void pdfImagesExtractor_ImageExtractedEvent(ImageExtractedEventArgs args)
{
    // get the image object and page number from even handler argument
    Image pdfPageImageObj = args.ExtractedImage.ImageObject;
    int pageNumber = args.ExtractedImage.PageNumber;

    // save the extracted image to a PNG file
    string outputPageImage = Path.Combine(Application.StartupPath, @"DemoFiles\Output", 
        "pdfimage_" + pageNumber.ToString() + "_" + imageIndex++ + ".png");
    pdfPageImageObj.Save(outputPageImage, ImageFormat.Png);

    args.ExtractedImage.Dispose();
}
gotnull
  • 26,454
  • 22
  • 137
  • 203