I am trying to create code to export out the images within a PDF using iText Version 7.19. I'm having some issues with Flate encoded images. All the Flate encoded images from the Microsoft free book I'm using as an example (see Moving to Microsoft Visual Studio 2010) always coming out pink and depending upon how I try to copy the bytes they can come out distorted.
If I attempt to copy all the image bytes at once (see the SaveFlateEncodedImage2 method in the code below), they come out distorted like this one:
If I attempt to copy them row by row (see the SaveFlateEncodedImage method in the code below), they are pink like this one
Here is the code that I'm using to export them:
using iText.Kernel;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Filters;
using System;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Runtime.InteropServices;
namespace ITextPdfStuff
{
public class MyPdfImageExtractor
{
private readonly string _pdfFileName;
public MyPdfImageExtractor(string pdfFileName)
{
_pdfFileName = pdfFileName;
}
public void ExtractToDirectory(string directoryName)
{
using (var reader = new PdfReader(_pdfFileName))
{
// Avoid iText.Kernel.Crypto.BadPasswordException: https://stackoverflow.com/a/48065052/97803
reader.SetUnethicalReading(true);
using (var pdfDoc = new PdfDocument(reader))
{
ExtractImagesOnAllPages(pdfDoc, directoryName);
}
}
}
private void ExtractImagesOnAllPages(PdfDocument pdfDoc, string directoryName)
{
Console.WriteLine($"Number of pdf {pdfDoc.GetNumberOfPdfObjects()} objects");
// Extract objects https://itextpdf.com/en/resources/examples/itext-7/extracting-objects-pdf
for (int objNumber = 1; objNumber <= pdfDoc.GetNumberOfPdfObjects(); objNumber++)
{
PdfObject currentObject = pdfDoc.GetPdfObject(objNumber);
if (currentObject != null && currentObject.IsStream())
{
try
{
ExtractImagesOneImage(currentObject as PdfStream, Path.Combine(directoryName, $"image{objNumber}.png"));
}
catch (Exception ex)
{
Console.WriteLine($"Object number {objNumber} is NOT an image! -- error: {ex.Message}");
}
}
}
}
private void ExtractImagesOneImage(PdfStream someStream, string fileName)
{
var pdfDict = (PdfDictionary)someStream;
string subType = pdfDict.Get(PdfName.Subtype)?.ToString() ?? string.Empty;
bool isImage = subType == "/Image";
if (isImage == false)
return;
bool decoded = false;
string filter = pdfDict.Get(PdfName.Filter).ToString();
if (filter == "/FlateDecode")
{
SaveFlateEncodedImage(fileName, pdfDict, someStream.GetBytes(false));
}
else
{
byte[] imgData;
try
{
imgData = someStream.GetBytes(decoded);
}
catch (PdfException ex)
{
imgData = someStream.GetBytes(!decoded);
}
SaveNormalImage(fileName, imgData);
}
}
private void SaveNormalImage(string fileName, byte[] imgData)
{
using (var memStream = new System.IO.MemoryStream(imgData))
using (var image = System.Drawing.Image.FromStream(memStream))
{
image.Save(fileName, ImageFormat.Png);
Console.WriteLine($"{Path.GetFileName(fileName)}");
}
}
private void SaveFlateEncodedImage(string fileName, PdfDictionary pdfDict, byte[] imgData)
{
int width = int.Parse(pdfDict.Get(PdfName.Width).ToString());
int height = int.Parse(pdfDict.Get(PdfName.Height).ToString());
int bpp = int.Parse(pdfDict.Get(PdfName.BitsPerComponent).ToString());
// Example that helped: https://stackoverflow.com/a/8517377/97803
PixelFormat pixelFormat;
switch (bpp)
{
case 1:
pixelFormat = PixelFormat.Format1bppIndexed;
break;
case 8:
pixelFormat = PixelFormat.Format8bppIndexed;
break;
case 24:
pixelFormat = PixelFormat.Format24bppRgb;
break;
default:
throw new Exception("Unknown pixel format " + bpp);
}
// .NET docs https://api.itextpdf.com/iText7/dotnet/7.1.9/classi_text_1_1_kernel_1_1_pdf_1_1_filters_1_1_flate_decode_strict_filter.html
// Java docs have more detail: https://api.itextpdf.com/iText7/java/7.1.7/com/itextpdf/kernel/pdf/filters/FlateDecodeFilter.html
imgData = FlateDecodeStrictFilter.FlateDecode(imgData, true);
// byte[] streamBytes = FlateDecodeStrictFilter.DecodePredictor(imgData, pdfDict);
// Copy the image one row at a time
using (var bmp = new Bitmap(width, height, pixelFormat))
{
BitmapData bmpData = bmp.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.WriteOnly, pixelFormat);
int length = (int)Math.Ceiling(width * bpp / 8.0);
for (int i = 0; i < height; i++)
{
int offset = i * length;
int scanOffset = i * bmpData.Stride;
Marshal.Copy(imgData, offset, new IntPtr(bmpData.Scan0.ToInt64() + scanOffset), length);
}
bmp.UnlockBits(bmpData);
bmp.Save(fileName, ImageFormat.Png);
}
Console.WriteLine($"FlateDecode! {Path.GetFileName(fileName)}");
}
/// <summary>This method distorts the image badly</summary>
private void SaveFlateEncodedImage2(string fileName, PdfDictionary pdfDict, byte[] imgData)
{
int width = int.Parse(pdfDict.Get(PdfName.Width).ToString());
int height = int.Parse(pdfDict.Get(PdfName.Height).ToString());
int bpp = int.Parse(pdfDict.Get(PdfName.BitsPerComponent).ToString());
// Example that helped: https://stackoverflow.com/a/8517377/97803
PixelFormat pixelFormat;
switch (bpp)
{
case 1:
pixelFormat = PixelFormat.Format1bppIndexed;
break;
case 8:
pixelFormat = PixelFormat.Format8bppIndexed;
break;
case 24:
pixelFormat = PixelFormat.Format24bppRgb;
break;
default:
throw new Exception("Unknown pixel format " + bpp);
}
// .NET docs https://api.itextpdf.com/iText7/dotnet/7.1.9/classi_text_1_1_kernel_1_1_pdf_1_1_filters_1_1_flate_decode_strict_filter.html
// Java docs have more detail: https://api.itextpdf.com/iText7/java/7.1.7/com/itextpdf/kernel/pdf/filters/FlateDecodeFilter.html
imgData = FlateDecodeStrictFilter.FlateDecode(imgData, true);
// byte[] streamBytes = FlateDecodeStrictFilter.DecodePredictor(imgData, pdfDict);
// Copy the entire image in one go
using (var bmp = new Bitmap(width, height, pixelFormat))
{
BitmapData bmpData = bmp.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.WriteOnly, pixelFormat);
Marshal.Copy(imgData, 0, bmpData.Scan0, imgData.Length);
bmp.UnlockBits(bmpData);
bmp.Save(fileName, ImageFormat.Png);
}
Console.WriteLine($"FlateDecode! {Path.GetFileName(fileName)}");
}
}
}
The code can be instantiated and called like this from within a .NET Core console application:
string existingFileName = @"c:\temp\ReallyLongBook1.pdf";
var imageExtractor = new MyPdfImageExtractor(existingFileName);
imageExtractor.ExtractToDirectory(@"c:\temp\images");
I'm running the following free Microsoft book through this code: Moving to Microsoft Visual Studio 2010
The image in question is on page 10 and it's black and white (not pink).
I'm no PDF expert and I've been banging on this code for a couple of days now picking apart a number of examples to try to piece this together. Any help that would get me past my pink images, would be greatly appreciated.
-------Update Feb 4, 2020------
Here is the revised version after MKL's suggested changes. His change extracted more images than mine and produced proper looking images that appear in the book I mentioned above:
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using iText.Kernel.Pdf.Xobject;
using System;
using System.Collections.Generic;
using System.IO;
namespace ITextPdfStuff
{
public class MyPdfImageExtractor
{
private readonly string _pdfFileName;
public MyPdfImageExtractor(string pdfFileName)
{
_pdfFileName = pdfFileName;
}
public void ExtractToDirectory(string directoryName)
{
using (var reader = new PdfReader(_pdfFileName))
{
// Avoid iText.Kernel.Crypto.BadPasswordException: https://stackoverflow.com/a/48065052/97803
reader.SetUnethicalReading(true);
using (var pdfDoc = new PdfDocument(reader))
{
ExtractImagesOnAllPages(pdfDoc, directoryName);
}
}
}
private void ExtractImagesOnAllPages(PdfDocument pdfDoc, string directoryName)
{
Console.WriteLine($"Number of pdf {pdfDoc.GetNumberOfPdfObjects()} objects");
IEventListener strategy = new ImageRenderListener(Path.Combine(directoryName, @"image{0}.{1}"));
PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
for (var i = 1; i <= pdfDoc.GetNumberOfPages(); i++)
{
parser.ProcessPageContent(pdfDoc.GetPage(i));
}
}
}
public class ImageRenderListener : IEventListener
{
public ImageRenderListener(string format)
{
this.format = format;
}
public void EventOccurred(IEventData data, EventType type)
{
if (data is ImageRenderInfo imageData)
{
try
{
PdfImageXObject imageObject = imageData.GetImage();
if (imageObject == null)
{
Console.WriteLine("Image could not be read.");
}
else
{
File.WriteAllBytes(string.Format(format, index++, imageObject.IdentifyImageFileExtension()), imageObject.GetImageBytes());
}
}
catch (Exception ex)
{
Console.WriteLine("Image could not be read: {0}.", ex.Message);
}
}
}
public ICollection<EventType> GetSupportedEvents()
{
return null;
}
string format;
int index = 0;
}
}