I am using itextsharp for extracting content from PDF using c# as follow
public static string GetTextFromAllPages(String pdfPath)
{
PdfReader reader = new PdfReader(pdfPath);
StringWriter output = new StringWriter();
for (int i = 1; i <= reader.NumberOfPages; i++)
output.WriteLine(PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
return output.ToString();
}
Now change I want in this code whenever there are images in PDF it should include an image tag (<img>
) in the content.
I tried with the extracting images alone and I am able to do it but not sure how to merge these two codes together to make extracted content consist with img tag also .
Extraction code of image as follow :
private static List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
{
//string res = GetTextFromAllPages(PDFSourcePath);
//File.WriteAllText(@"d:\blobfile\blobfileresult.txt", res);
List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();
iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
iTextSharp.text.pdf.PdfReader PDFReaderObj = null;
iTextSharp.text.pdf.PdfObject PDFObj = null;
iTextSharp.text.pdf.PdfStream PDFStremObj = null;
try
{
RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath);
PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);
if (PDFReaderObj.IsOpenedWithFullPermissions)
{
Console.WriteLine("this is a test");
}
for (int i = 0; i <= PDFReaderObj.XrefSize - 1; i++)
{
PDFObj = PDFReaderObj.GetPdfObject(i);
if ((PDFObj != null) && PDFObj.IsStream())
{
PDFStremObj = (iTextSharp.text.pdf.PdfStream)PDFObj;
iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE);
if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.IMAGE.ToString())
// if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.CCITTFAXDECODE.ToString())
{
byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw((iTextSharp.text.pdf.PRStream)PDFStremObj);
if ((bytes != null))
{
try
{
System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes);
MS.Position = 0;
System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS);
ImgList.Add(ImgPDF);
}
catch (Exception e)
{
Console.WriteLine("Exception in extract: " + e);
}
}
}
}
}
PDFReaderObj.Close();
}
catch (Exception ex)
{
throw new Exception(ex.Message);
}
return ImgList;
}