This answer shares a proof-of-concept for finding all occurrences of specific text in a PDF and inserting a page break above using iText and Java. It should not be too difficult to port it to iTextSharp and C#.
For production use, however, some extra code has to be added because the code currently makes some assumptions; e.g. it assumes non-rotated pages. Furthermore, it does not handle annotations at all.
The task is actually a combination of two tasks, finding the text and inserting the page breaks; thus, we need
- an extraction strategy for locations of some custom text and
- a tool cutting pages.
SearchTextLocationExtractionStrategy
To extract the locations of custom text, we extend the iText LocationTextExtractionStrategy
so that it can also extract the positions of a custom text string, or more precisely of the matches of a regular expression:
public class SearchTextLocationExtractionStrategy extends LocationTextExtractionStrategy {
public SearchTextLocationExtractionStrategy(Pattern pattern) {
super(new TextChunkLocationStrategy() {
public TextChunkLocation createLocation(TextRenderInfo renderInfo, LineSegment baseline) {
// while baseLine has been changed to not neutralize
// effects of rise, ascentLine and descentLine explicitly
// have not: We want the actual positions.
return new AscentDescentTextChunkLocation(baseline, renderInfo.getAscentLine(),
renderInfo.getDescentLine(), renderInfo.getSingleSpaceWidth());
}
});
this.pattern = pattern;
}
static Field locationalResultField = null;
static Method filterTextChunksMethod = null;
static Method startsWithSpaceMethod = null;
static Method endsWithSpaceMethod = null;
static Field textChunkTextField = null;
static Method textChunkSameLineMethod = null;
static {
try {
locationalResultField = LocationTextExtractionStrategy.class.getDeclaredField("locationalResult");
locationalResultField.setAccessible(true);
filterTextChunksMethod = LocationTextExtractionStrategy.class.getDeclaredMethod("filterTextChunks",
List.class, TextChunkFilter.class);
filterTextChunksMethod.setAccessible(true);
startsWithSpaceMethod = LocationTextExtractionStrategy.class.getDeclaredMethod("startsWithSpace",
String.class);
startsWithSpaceMethod.setAccessible(true);
endsWithSpaceMethod = LocationTextExtractionStrategy.class.getDeclaredMethod("endsWithSpace", String.class);
endsWithSpaceMethod.setAccessible(true);
textChunkTextField = TextChunk.class.getDeclaredField("text");
textChunkTextField.setAccessible(true);
textChunkSameLineMethod = TextChunk.class.getDeclaredMethod("sameLine", TextChunk.class);
textChunkSameLineMethod.setAccessible(true);
} catch (NoSuchFieldException | SecurityException | NoSuchMethodException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public Collection<TextRectangle> getLocations(TextChunkFilter chunkFilter) {
Collection<TextRectangle> result = new ArrayList<>();
try {
List<TextChunk> filteredTextChunks = (List<TextChunk>) filterTextChunksMethod.invoke(this,
locationalResultField.get(this), chunkFilter);
Collections.sort(filteredTextChunks);
StringBuilder sb = new StringBuilder();
List<AscentDescentTextChunkLocation> locations = new ArrayList<>();
TextChunk lastChunk = null;
for (TextChunk chunk : filteredTextChunks) {
String chunkText = (String) textChunkTextField.get(chunk);
if (lastChunk == null) {
// Nothing to compare with at the end
} else if ((boolean) textChunkSameLineMethod.invoke(chunk, lastChunk)) {
// we only insert a blank space if the trailing character of the previous string
// wasn't a space,
// and the leading character of the current string isn't a space
if (isChunkAtWordBoundary(chunk, lastChunk)
&& !((boolean) startsWithSpaceMethod.invoke(this, chunkText))
&& !((boolean) endsWithSpaceMethod.invoke(this, chunkText))) {
sb.append(' ');
LineSegment spaceBaseLine = new LineSegment(lastChunk.getEndLocation(),
chunk.getStartLocation());
locations.add(new AscentDescentTextChunkLocation(spaceBaseLine, spaceBaseLine, spaceBaseLine,
chunk.getCharSpaceWidth()));
}
} else {
assert sb.length() == locations.size();
Matcher matcher = pattern.matcher(sb);
while (matcher.find()) {
int i = matcher.start();
Vector baseStart = locations.get(i).getStartLocation();
TextRectangle textRectangle = new TextRectangle(matcher.group(), baseStart.get(Vector.I1),
baseStart.get(Vector.I2));
for (; i < matcher.end(); i++) {
AscentDescentTextChunkLocation location = locations.get(i);
textRectangle.add(location.getAscentLine().getBoundingRectange());
textRectangle.add(location.getDescentLine().getBoundingRectange());
}
result.add(textRectangle);
}
sb.setLength(0);
locations.clear();
}
sb.append(chunkText);
locations.add((AscentDescentTextChunkLocation) chunk.getLocation());
lastChunk = chunk;
}
} catch (IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return result;
}
@Override
public void renderText(TextRenderInfo renderInfo) {
for (TextRenderInfo info : renderInfo.getCharacterRenderInfos())
super.renderText(info);
}
public static class AscentDescentTextChunkLocation extends TextChunkLocationDefaultImp {
public AscentDescentTextChunkLocation(LineSegment baseLine, LineSegment ascentLine, LineSegment descentLine,
float charSpaceWidth) {
super(baseLine.getStartPoint(), baseLine.getEndPoint(), charSpaceWidth);
this.ascentLine = ascentLine;
this.descentLine = descentLine;
}
public LineSegment getAscentLine() {
return ascentLine;
}
public LineSegment getDescentLine() {
return descentLine;
}
final LineSegment ascentLine;
final LineSegment descentLine;
}
public class TextRectangle extends Rectangle2D.Float {
public TextRectangle(final String text, final float xStart, final float yStart) {
super(xStart, yStart, 0, 0);
this.text = text;
}
public String getText() {
return text;
}
final String text;
}
final Pattern pattern;
}
(SearchTextLocationExtractionStrategy.java)
As some necessary members of the base class are private or package private, we have to use reflection to extract them.
AbstractPdfPageSplittingTool
The page splitting functionality of this tool has been extracted from the PdfVeryDenseMergeTool
from this answer. Furthermore, it is abstract to allow custom positions for page breaks.
/**
 * Copies the pages of source PDFs into a new document, cutting every source
 * page into horizontal stripes and putting each stripe onto an output page of
 * its own. Where to cut is decided by subclasses in
 * {@link #determineSplitPositions(PdfReader, int)}.
 *
 * <p>Instances keep the current document, writer and y position as state, so
 * one instance must not run several {@code split} calls concurrently.</p>
 */
public abstract class AbstractPdfPageSplittingTool {
    /**
     * @param size the page size of the output document
     * @param top  the top margin of the output document; stripes following a
     *             page break start below this margin
     */
    public AbstractPdfPageSplittingTool(Rectangle size, float top) {
        this.pageSize = size;
        this.topMargin = top;
    }

    /**
     * Splits all pages of all given readers and writes the result as one
     * document to the output stream.
     *
     * @param outputStream the stream receiving the resulting PDF
     * @param inputs       the source documents, processed in the given order
     * @throws DocumentException if the output document cannot be set up
     * @throws IOException       if reading a source page fails
     */
    public void split(OutputStream outputStream, PdfReader... inputs) throws DocumentException, IOException {
        try {
            openDocument(outputStream);
            for (PdfReader reader : inputs) {
                split(reader);
            }
        } finally {
            closeDocument();
        }
    }

    /** Creates and opens the output document and starts its first page. */
    void openDocument(OutputStream outputStream) throws DocumentException {
        final Document document = new Document(pageSize, 36, 36, topMargin, 36);
        final PdfWriter writer = PdfWriter.getInstance(document, outputStream);
        document.open();
        this.document = document;
        this.writer = writer;
        newPage();
    }

    /** Closes the output document, if one is open, and resets the state. */
    void closeDocument() {
        try {
            // document is still null if openDocument failed; calling close()
            // then would raise a NullPointerException from the finally block
            // in split(...) and hide the actual failure.
            if (document != null) {
                document.close();
            }
        } finally {
            this.document = null;
            this.writer = null;
            this.yPosition = 0;
        }
    }

    /** Starts a new output page and moves the y position below the top margin. */
    void newPage() {
        document.newPage();
        yPosition = pageSize.getTop(topMargin);
    }

    /** Splits every page of the given reader. */
    void split(PdfReader reader) throws IOException {
        for (int page = 1; page <= reader.getNumberOfPages(); page++) {
            split(reader, page);
        }
    }

    /**
     * Splits a single source page. Each pair of neighbouring split positions
     * delimits one stripe; the stripe is drawn clipped onto the current
     * output page and a new output page is started afterwards.
     *
     * <p>The first stripe of a source page is placed at the very top of the
     * output page (no top margin applied); later stripes start at the
     * position set by {@link #newPage()}.</p>
     *
     * @param reader the source document
     * @param page   the 1-based number of the page to split
     */
    void split(PdfReader reader, int page) throws IOException {
        PdfImportedPage importedPage = writer.getImportedPage(reader, page);
        PdfContentByte directContent = writer.getDirectContent();
        yPosition = pageSize.getTop();

        Rectangle pageSizeToImport = reader.getPageSize(page);
        float[] borderPositions = determineSplitPositions(reader, page);
        if (borderPositions == null || borderPositions.length < 2) {
            return;
        }

        for (int borderIndex = 0; borderIndex + 1 < borderPositions.length; borderIndex++) {
            float upperBorder = borderPositions[borderIndex];
            float height = upperBorder - borderPositions[borderIndex + 1];
            if (height <= 0) {
                // Empty or inverted stripe: nothing to draw.
                continue;
            }

            directContent.saveState();
            // Restrict drawing to the stripe so the rest of the imported
            // page does not show up on this output page.
            directContent.rectangle(0, yPosition - height, pageSizeToImport.getWidth(), height);
            directContent.clip();
            directContent.newPath();

            directContent.addTemplate(importedPage, 0,
                    yPosition - (upperBorder - pageSizeToImport.getBottom()));

            directContent.restoreState();
            newPage();
        }
    }

    /**
     * Determines the y coordinates at which the given page is to be cut.
     *
     * @param reader the source document
     * @param page   the 1-based page number
     * @return the cut positions in source page coordinates, ordered from top
     *         to bottom; pairs which are not descending are skipped, and a
     *         {@code null} result or fewer than two values means the page is
     *         not copied at all
     */
    protected abstract float[] determineSplitPositions(PdfReader reader, int page);

    Document document = null;
    PdfWriter writer = null;
    /** The y coordinate on the output page at which the next stripe's top edge is placed. */
    float yPosition = 0;
    final Rectangle pageSize;
    final float topMargin;
}
(AbstractPdfPageSplittingTool.java)
Usage in concert
To implement the task of the OP:
I need to search my pdf for a specific string - Property Number:
Each time this is found, I need to add a page break ABOVE
one can use the classes above like this:
// Splitting tool which cuts each page directly above every occurrence of
// the text "Property Number".
AbstractPdfPageSplittingTool tool = new AbstractPdfPageSplittingTool(PageSize.A4, 36) {
    // Compiled once instead of once per page.
    private final Pattern searchPattern = Pattern.compile("Property Number");

    /**
     * Returns the top of the page, the upper edge of every match of the
     * search text, and the bottom of the page, sorted from top to bottom.
     */
    @Override
    protected float[] determineSplitPositions(PdfReader reader, int page) {
        Collection<TextRectangle> locations;
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            SearchTextLocationExtractionStrategy strategy = new SearchTextLocationExtractionStrategy(
                    searchPattern);
            parser.processContent(page, strategy, Collections.emptyMap());
            locations = strategy.getLocations(null);
        } catch (IOException e) {
            // Swallowing the error would silently produce an unsplit page.
            throw new java.io.UncheckedIOException("Cannot parse content of page " + page, e);
        }

        List<Float> borders = new ArrayList<>();
        for (TextRectangle rectangle : locations) {
            // The upper edge of the match: the break goes ABOVE the text.
            borders.add((float) rectangle.getMaxY());
        }

        Rectangle pageSize = reader.getPageSize(page);
        borders.add(pageSize.getTop());
        borders.add(pageSize.getBottom());
        Collections.sort(borders, Collections.reverseOrder());

        float[] result = new float[borders.size()];
        for (int i = 0; i < result.length; i++) {
            result[i] = borders.get(i);
        }
        return result;
    }
};

// Release the source document and the output file even if splitting fails.
PdfReader reader = new PdfReader(SOURCE);
try (FileOutputStream result = new FileOutputStream(RESULT)) {
    tool.split(result, reader);
} finally {
    reader.close();
}
(SplitPages.java test method testSplitDocumentAboveAngestellter)