Disclaimer: I am the author of borb, the library used in this answer.
Replacing text in a PDF is hard (as you have no doubt found out). The problem is that PDF contains (in the worst case) only the rendering instructions in order to put content on the page.
Your document might contain (in pseudo-code):
- go to 80, 700
- set the active font to Helvetica, size 12
- render the characters "Hell"
- move to 120, 700
- render the characters "o"
- move to 130, 700
- render the characters "World"
As you can see, there is no concept of "words". Letters can just be rendered wherever they happen to be needed. Spaces don't need to be included; the software responsible for creating a PDF can simply tell the renderer to move the cursor along the x-axis.
In order to replace text, you first need to find it.
from borb.pdf import PDF
from borb.toolkit import RegularExpressionTextExtraction

# RegularExpressionTextExtraction implements EventListener.
# An EventListener processes rendering events.
# You can pass a regular expression to RegularExpressionTextExtraction
# and it will keep track of where that content appears.
listener: RegularExpressionTextExtraction = RegularExpressionTextExtraction("ANA")

# Now we need to load the PDF, handing the listener to the parser
# so it can process the rendering events of the document.
with open("input.pdf", "rb") as fh:
    PDF.loads(fh, [listener])

# Now we can access the locations of the match(es).
# I am only going to use the first one, but feel free
# to update my code to take into account all matches.
#
# A match can have multiple bounding boxes,
# for instance if the regular expression could be matched over
# multiple lines of text.
print(listener.get_matches_for_page(0)[0].get_bounding_boxes()[0])
The next step is to remove the content at a given location.
For this we can use redaction. Redaction erases content in a PDF.
from decimal import Decimal
import typing

from borb.pdf import Document
from borb.pdf import Page
from borb.pdf import PDF
from borb.pdf.canvas.geometry.rectangle import Rectangle
from borb.pdf.canvas.layout.annotation.redact_annotation import RedactAnnotation

# open the PDF
doc: typing.Optional[Document] = None
with open("input.pdf", "rb") as fh:
    doc = PDF.loads(fh)

# get the first page
# maybe you'll need to modify this to apply it to all pages
# keep that in mind
page: Page = doc.get_page(0)

# add the redaction annotation
# The Rectangle arguments are x, y, width, height; replace these example
# values with the bounding box of the text you want to remove.
page.add_annotation(
    RedactAnnotation(
        Rectangle(Decimal(405), Decimal(721), Decimal(40), Decimal(8))
    )
)

# apply redaction annotations
page.apply_redact_annotations()

# now we can store the PDF again
with open("input_002.pdf", "wb") as out_file_handle:
    PDF.dumps(out_file_handle, doc)
Lastly, we need to put some content back in the PDF, at the location that we previously removed content from.
from decimal import Decimal
import typing

from borb.pdf import Document
from borb.pdf import Page
from borb.pdf import Paragraph
from borb.pdf import PDF
from borb.pdf.canvas.geometry.rectangle import Rectangle

# load the PDF
# NOTE: this example reads "input.pdf"; to continue from a redacted copy,
# load the file written by the redaction step instead.
doc: typing.Optional[Document] = None
with open("input.pdf", "rb") as fh:
    doc = PDF.loads(fh)

# add a Paragraph at an absolute location
# fmt: off
r: Rectangle = Rectangle(
    Decimal(59),                # x: 0 + page_margin
    Decimal(848 - 84 - 100),    # y: page_height - page_margin - height_of_textbox
    Decimal(595 - 59 * 2),      # width: page_width - 2 * page_margin
    Decimal(100),               # height
)
# fmt: on

# the next line of code uses absolute positioning
page: Page = doc.get_page(0)
Paragraph("Hello World!").paint(page, r)

# store the PDF
with open("output.pdf", "wb") as fh:
    PDF.dumps(fh, doc)
borb is an open-source, pure Python PDF library that creates, modifies and reads PDF documents. You can download it using:
pip install borb
Alternatively, you can build from source by forking/downloading the GitHub repository.