So we have some really inefficient code that splits a PDF into smaller chunks based on a maximum allowed size. For example, if the max size is 10 MB, an 8 MB file would be skipped, while a 16 MB file would be split into multiple documents based on the number of pages.
This is code that I inherited, and I feel like there has to be a more efficient way to do this, ideally requiring only one method and less instantiation of objects (there's a rough sketch of what I'm imagining after the methods below).
We use the following code to call the methods:
List<int> splitPoints = this.GetPDFSplitPoints(currentDocument, maxSize);
List<byte[]> documents = this.SplitPDF(currentDocument, maxSize, splitPoints);
Methods:
private List<int> GetPDFSplitPoints(IClaimDocument currentDocument, int maxSize)
{
    List<int> splitPoints = new List<int>();
    PdfReader reader = null;
    Document document = null;
    int pagesRemaining = currentDocument.Pages;
    while (pagesRemaining > 0)
    {
        reader = new PdfReader(currentDocument.Data);
        document = new Document(reader.GetPageSizeWithRotation(1));
        using (MemoryStream ms = new MemoryStream())
        {
            PdfCopy copy = new PdfCopy(document, ms);
            PdfImportedPage page = null;
            document.Open();
            //Add pages until we run out from the original
            for (int i = 0; i < currentDocument.Pages; i++)
            {
                int currentPage = currentDocument.Pages - (pagesRemaining - 1);
                if (pagesRemaining == 0)
                {
                    //The whole document has been traversed
                    break;
                }
                page = copy.GetImportedPage(reader, currentPage);
                copy.AddPage(page);
                //If the current collection of pages exceeds the maximum size, we save off the index and start again
                if (copy.CurrentDocumentSize > maxSize)
                {
                    if (i == 0)
                    {
                        //One page is greater than the maximum size
                        throw new Exception("one page is greater than the maximum size and cannot be processed");
                    }
                    //We have gone one page too far, save this split index
                    splitPoints.Add(currentDocument.Pages - (pagesRemaining - 1));
                    break;
                }
                else
                {
                    pagesRemaining--;
                }
            }
            page = null;
            document.Close();
            document.Dispose();
            copy.Close();
            copy.Dispose();
            copy = null;
        }
    }
    if (reader != null)
    {
        reader.Close();
        reader = null;
    }
    document = null;
    return splitPoints;
}
private List<byte[]> SplitPDF(IClaimDocument currentDocument, int maxSize, List<int> splitPoints)
{
    var documents = new List<byte[]>();
    PdfReader reader = null;
    Document document = null;
    MemoryStream fs = null;
    int pagesRemaining = currentDocument.Pages;
    while (pagesRemaining > 0)
    {
        reader = new PdfReader(currentDocument.Data);
        document = new Document(reader.GetPageSizeWithRotation(1));
        fs = new MemoryStream();
        PdfCopy copy = new PdfCopy(document, fs);
        PdfImportedPage page = null;
        document.Open();
        //Add pages until we run out from the original
        for (int i = 0; i <= currentDocument.Pages; i++)
        {
            int currentPage = currentDocument.Pages - (pagesRemaining - 1);
            if (pagesRemaining == 0)
            {
                //We have traversed all pages
                //The call to copy.Close() MUST come before using fs.ToArray() because copy.Close() finalizes the document
                fs.Flush();
                copy.Close();
                documents.Add(fs.ToArray());
                document.Close();
                fs.Dispose();
                break;
            }
            page = copy.GetImportedPage(reader, currentPage);
            copy.AddPage(page);
            pagesRemaining--;
            if (splitPoints.Contains(currentPage + 1))
            {
                //Need to start a new document
                //The call to copy.Close() MUST come before using fs.ToArray() because copy.Close() finalizes the document
                fs.Flush();
                copy.Close();
                documents.Add(fs.ToArray());
                document.Close();
                fs.Dispose();
                break;
            }
        }
        copy = null;
        page = null;
        fs.Dispose();
    }
    if (reader != null)
    {
        reader.Close();
        reader = null;
    }
    if (document != null)
    {
        document.Close();
        document.Dispose();
        document = null;
    }
    if (fs != null)
    {
        fs.Close();
        fs.Dispose();
        fs = null;
    }
    return documents;
}
As far as I can tell, the only example code I can find online is in VB and doesn't necessarily address the size issue.
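What I'm picturing is a single pass that measures and emits each chunk in one go, with one PdfReader per source document instead of one per chunk. This is only an untested sketch built on the same iTextSharp calls used above (the method and variable names are mine), and it behaves slightly differently from the original: since CurrentDocumentSize is checked after AddPage and PdfCopy can't remove a page, the page that pushes a chunk over the limit stays in that chunk rather than starting the next one.

private List<byte[]> SplitPDFSinglePass(IClaimDocument currentDocument, int maxSize)
{
    var documents = new List<byte[]>();
    //One PdfReader for the whole source document instead of one per chunk
    PdfReader reader = new PdfReader(currentDocument.Data);
    try
    {
        int pageCount = currentDocument.Pages;
        int currentPage = 1;
        while (currentPage <= pageCount)
        {
            using (MemoryStream ms = new MemoryStream())
            {
                Document document = new Document(reader.GetPageSizeWithRotation(currentPage));
                PdfCopy copy = new PdfCopy(document, ms);
                document.Open();
                int pagesInChunk = 0;
                while (currentPage <= pageCount)
                {
                    copy.AddPage(copy.GetImportedPage(reader, currentPage));
                    currentPage++;
                    pagesInChunk++;
                    //CurrentDocumentSize is only an estimate until Close();
                    //the page that tipped the chunk over the limit stays in this chunk
                    if (copy.CurrentDocumentSize > maxSize)
                    {
                        if (pagesInChunk == 1)
                        {
                            throw new Exception("one page is greater than the maximum size and cannot be processed");
                        }
                        break;
                    }
                }
                //As in the original code, Close() finalizes the document, so it must come before ToArray()
                copy.Close();
                documents.Add(ms.ToArray());
                document.Close();
            }
        }
    }
    finally
    {
        reader.Close();
    }
    return documents;
}

If the chunks have to stay strictly under maxSize, the separate measuring pass is probably unavoidable with PdfCopy, but even then the PdfReader could be created once and shared between the two passes.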
UPDATE:
We're experiencing OutOfMemory exceptions, and I believe it's an issue with the Large Object Heap. One thought was to reduce the code's footprint, which would possibly reduce the number of large objects on the heap.
Basically, this is part of a loop that goes through any number of PDFs, splits them, and stores them in the database. Right now we've had to change the process from handling all of them at once (the last run was 97 PDFs of various sizes) to running 5 PDFs through the system every 5 minutes. This is not ideal and won't scale well when we ramp the tool up to more clients.
(We're dealing with 50-100 MB PDFs, but they could be larger.)
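Since the chunks are around the 10 MB mark, every growing MemoryStream buffer and every fs.ToArray() copy is a Large Object Heap allocation, on top of the 50-100 MB source byte[]. One idea I'm toying with is to write each chunk straight to a temp file and return paths instead of byte arrays, so the only large managed allocation left is the source document itself; the caller could then stream each file into the database and delete it afterwards. This is the same shape as the sketch above with the MemoryStream swapped for a FileStream (untested, needs System.IO; the names are mine):

private List<string> SplitPDFToTempFiles(IClaimDocument currentDocument, int maxSize)
{
    var chunkFiles = new List<string>();
    PdfReader reader = new PdfReader(currentDocument.Data);
    try
    {
        int pageCount = currentDocument.Pages;
        int currentPage = 1;
        while (currentPage <= pageCount)
        {
            //Each chunk goes straight to disk, so no chunk-sized byte[] is ever allocated
            string chunkPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid() + ".pdf");
            using (FileStream fs = new FileStream(chunkPath, FileMode.Create, FileAccess.Write))
            {
                Document document = new Document(reader.GetPageSizeWithRotation(currentPage));
                PdfCopy copy = new PdfCopy(document, fs);
                document.Open();
                int pagesInChunk = 0;
                while (currentPage <= pageCount)
                {
                    copy.AddPage(copy.GetImportedPage(reader, currentPage));
                    currentPage++;
                    pagesInChunk++;
                    if (copy.CurrentDocumentSize > maxSize)
                    {
                        if (pagesInChunk == 1)
                        {
                            throw new Exception("one page is greater than the maximum size and cannot be processed");
                        }
                        break;
                    }
                }
                copy.Close();
                document.Close();
            }
            chunkFiles.Add(chunkPath);
        }
    }
    finally
    {
        reader.Close();
    }
    return chunkFiles;
}

Whether that actually helps depends on the database layer being able to read from a file or stream one chunk at a time, which I haven't verified yet.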