I m trying to extract images from a pdf file using itextsharp
an example pdf i m using here
The code i m using is:-
static void Main(string[] args)
{
try
{
WriteImageFile(); // write image file
System.Console.WriteLine(AppDomain.CurrentDomain.BaseDirectory);
System.Console.ReadLine();
}
catch (Exception ex)
{
System.Console.WriteLine(ex.Message);
}
}
private static List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
{
List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();
iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
iTextSharp.text.pdf.PdfReader PDFReaderObj = null;
iTextSharp.text.pdf.PdfObject PDFObj = null;
iTextSharp.text.pdf.PdfStream PDFStremObj = null;
try
{
RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath);
PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);
if (PDFReaderObj.IsOpenedWithFullPermissions)
{
Debug.Print("this is a test");
}
for (int i = 0; i <= PDFReaderObj.XrefSize - 1; i++)
{
PDFObj = PDFReaderObj.GetPdfObject(i);
if ((PDFObj != null) && PDFObj.IsStream())
{
PDFStremObj = (iTextSharp.text.pdf.PdfStream)PDFObj;
iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE);
if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.IMAGE.ToString())
{
byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw((iTextSharp.text.pdf.PRStream)PDFStremObj);
if ((bytes != null))
{
try
{
System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes);
MS.Position = 0;
System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS);
ImgList.Add(ImgPDF);
}
catch (Exception e)
{
Console.WriteLine ("Exception in extract: " + e);
}
}
}
}
}
PDFReaderObj.Close();
}
catch (Exception ex)
{
throw new Exception(ex.Message);
}
return ImgList;
}
private static void WriteImageFile()
{
try
{
System.Console.WriteLine("Wait for extracting image from PDF file....");
// Get a List of Image
List<System.Drawing.Image> ListImage = ExtractImages(@"C:\Users\pradyut.bhattacharya\Documents\CEVA PDF\more\CS_75.pdf");
for (int i = 0; i < ListImage.Count; i++)
{
try
{
// Write Image File
ListImage[i].Save(@"C:\Users\pradyut.bhattacharya\Documents\CEVA PDF\more\Image" + i + ".jpeg", System.Drawing.Imaging.ImageFormat.Jpeg);
System.Console.WriteLine("Image" + i + ".jpeg write sucessfully");
}
catch (Exception)
{ }
}
}
catch (Exception ex)
{
throw new Exception(ex.Message);
}
}
Now in some cases i can get the images but for most of the PDFs which contains papers scanned i get the error:-
A first chance exception of type 'System.ArgumentException' occurred in System.Drawing.dll
Exception in extract: System.ArgumentException: Parameter is not valid.
at System.Drawing.Image.FromStream(Stream stream, Boolean useEmbeddedColorManagement, Boolean validateImageData)
at System.Drawing.Image.FromStream(Stream stream)
at ConsoleApplication1.Program.ExtractImages(String PDFSourcePath) in C:\Users\pradyut.bhattacharya\Documents\Visual Studio
2010\Projects\ConsoleApplication2\ConsoleApplication2\Program.cs:line 67
A first chance exception of type 'System.ArgumentException' occurred in System.Drawing.dll
Exception in extract: System.ArgumentException: Parameter is not valid.
at System.Drawing.Image.FromStream(Stream stream, Boolean useEmbeddedColorManagement, Boolean validateImageData)
at System.Drawing.Image.FromStream(Stream stream)
at ConsoleApplication1.Program.ExtractImages(String PDFSourcePath) in C:\Users\pradyut.bhattacharya\Documents\Visual Studio
2010\Projects\ConsoleApplication2\ConsoleApplication2\Program.cs:line 67
Any help
Thanks