Is it possible to write simple code using PDFBox (or another Java library) that processes a (large) PDF document and reports the text that appears in a specific color (e.g., red) or is struck through? The strike-through or the color could appear anywhere, including in the middle of a word (e.g., the text "lion" with the "l" colored and the "n" struck through).
Any idea towards a solution would be great. I need to find the parts where there is a specific color or strike-through and manually revising the whole (large) PDF document would be difficult and error-prone, so I'm trying to find an alternative and reliable solution...
Update: Based on the help provided and the referenced posts, I made some progress, but this is still causing me problems...
I'm trying to:
- Detect colored text different from black and write it to the standard output. This seems to work fine except that some newlines are not added in the output (apparently, when there is no explicit newline in the input PDF).
- Detect strikethrough text and write it to the standard output. I cannot get this to work at all. I've tried many options with no success.
Any idea is welcome!
I include below my current code:
import java.awt.geom.*;
import java.io.*;
import java.util.*;
import org.apache.pdfbox.contentstream.operator.*;
import org.apache.pdfbox.contentstream.operator.color.*;
import org.apache.pdfbox.cos.*;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.graphics.color.*;
import org.apache.pdfbox.pdmodel.graphics.state.*;
import org.apache.pdfbox.text.*;
import org.apache.pdfbox.util.*;
public class PDFStyledTextStripper extends PDFTextStripper
{
boolean globalDifferentFromBlack = false;
String currentGlobalLine = "";
public PDFStyledTextStripper() throws IOException
{
super();
registerOperatorProcessor("re", new AppendRectangleToPath());
addOperator(new SetStrokingColorSpace());
addOperator(new SetNonStrokingColorSpace());
addOperator(new SetStrokingDeviceCMYKColor());
addOperator(new SetNonStrokingDeviceCMYKColor());
addOperator(new SetNonStrokingDeviceRGBColor());
addOperator(new SetStrokingDeviceRGBColor());
addOperator(new SetNonStrokingDeviceGrayColor());
addOperator(new SetStrokingDeviceGrayColor());
addOperator(new SetStrokingColor());
addOperator(new SetStrokingColorN());
addOperator(new SetNonStrokingColor());
addOperator(new SetNonStrokingColorN());
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException
{
for (TextPosition textPosition : textPositions)
{
boolean st = determineIfStriketrhough(textPosition);
if (st == true)
{
System.out.println("Striketrhough detected in: " + text);
}
}
}
@Override
protected void processTextPosition(TextPosition text)
{
//System.out.println("(" + text.getX() + ", " + text.getY() + ")");
//System.out.println("Processing text position...");
super.processTextPosition(text);
PDColor color = getGraphicsState().getNonStrokingColor();
boolean differentFromBlack = determineIfColorDifferentFromBlack(color);
if (differentFromBlack == true)
{
globalDifferentFromBlack = true;
currentGlobalLine = currentGlobalLine + text;
}
else
{
globalDifferentFromBlack = false;
if (currentGlobalLine.equals("") == false)
{
System.out.println(currentGlobalLine);
currentGlobalLine = "";
}
}
}
/**
* Write the line separator value to the output stream.
* @throws IOException if there is a problem writing out the lineseparator to the document.
*/
protected void writeLineSeparator( ) throws IOException
{
//output.write(getLineSeparator());
}
boolean determineIfColorDifferentFromBlack(PDColor color)
{
try
{
PDGraphicsState graphicsState = getGraphicsState();
int rgb = color.toRGB();
if (rgb != 0)
{
//System.out.println("RGB value: " + rgb);
return true;
}
else
{
return false;
}
}
catch (Exception e) {e.printStackTrace();}
return false;
}
boolean determineIfStriketrhough(TextPosition textPosition)
{
//System.out.println("Font: " + textPosition.getFont().getName());
if (rectangles.stream().anyMatch(r -> r.strikesThrough(textPosition)))
{
System.out.println("StrikeThrough detected!");
return true;
}
return false;
}
class AppendRectangleToPath extends OperatorProcessor
{
public String getName()
{
return ("AppendRectangleToPath");
}
public void process(Operator operator, List<COSBase> arguments)
{
COSNumber x = (COSNumber) arguments.get(0);
COSNumber y = (COSNumber) arguments.get(1);
COSNumber w = (COSNumber) arguments.get(2);
COSNumber h = (COSNumber) arguments.get(3);
double x1 = x.doubleValue();
double y1 = y.doubleValue();
// create a pair of coordinates for the transformation
double x2 = w.doubleValue() + x1;
double y2 = h.doubleValue() + y1;
Point2D p0 = transformedPoint(x1, y1);
Point2D p1 = transformedPoint(x2, y1);
Point2D p2 = transformedPoint(x2, y2);
Point2D p3 = transformedPoint(x1, y2);
rectangles.add(new TransformedRectangle(p0, p1, p2, p3));
}
Point2D.Double transformedPoint(double x, double y)
{
double[] position = {x,y};
getGraphicsState().getCurrentTransformationMatrix().createAffineTransform().transform(
position, 0, position, 0, 1);
return new Point2D.Double(position[0],position[1]);
}
}
static class TransformedRectangle
{
final Point2D p0, p1, p2, p3;
public TransformedRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3)
{
this.p0 = p0;
this.p1 = p1;
this.p2 = p2;
this.p3 = p3;
}
boolean strikesThrough(TextPosition textPosition)
{
Matrix matrix = textPosition.getTextMatrix();
// TODO: This is a very simplistic implementation only working for horizontal text without page rotation
// and horizontal rectangular strikeThroughs with p0 at the left bottom and p2 at the right top
// Check if rectangle horizontally matches (at least) the text
if (p0.getX() > matrix.getXPosition() + textPosition.getWidth() * .1f || p2.getX() < matrix.getXPosition() + textPosition.getWidth() * .9f)
return false;
// Check whether rectangle vertically is at the right height to underline
double vertDiff = p0.getY() - matrix.getYPosition();
if (vertDiff < 0 || vertDiff > textPosition.getFont().getFontDescriptor().getAscent() * textPosition.getFontSizeInPt() / 1000.0)
return false;
// Check whether rectangle is small enough to be a line
return Math.abs(p2.getY() - p0.getY()) < 2;
}
}
final List<TransformedRectangle> rectangles = new ArrayList<>();
Set<String> currentStyle = Collections.singleton("Undefined");
}