I am using the PyPDF2 Python library to open an existing PDF and replace a placeholder value such as {{name}}, but I am facing two problems:
- The words don't seem to be represented correctly (example line:
[(But {{name}} did no)-0.6 (t c)1.6 (ome!)]TJ
). This should probably be (But {{name}} did not come!)TJ
- Replacing {{name}} works well despite problem 1, but when it comes to Arabic, the characters seem to be read as numbers and as individual characters (example:
(\264)Tj
). This should probably be (اسم)Tj
I have tried setting the encoding to UTF-8, since I read that it supports Arabic in the following question: "What character encoding should I use for a web page containing mostly Arabic text? Is utf-8 okay?"
I also tried to make sense of the following related question: "How to read Arabic text from PDF using Python script".
Code Below:
import os
import codecs
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject, NameObject
def replace_text(content, replacements=None):
    """Apply literal substitutions to the text-showing lines of a content stream.

    The input is processed line by line. A line that is exactly ``BT`` opens
    a text section and a line that is exactly ``ET`` closes it. Inside a text
    section, any line whose last two characters are ``Tj`` or ``TJ``
    (compared case-insensitively) has every key of *replacements* replaced by
    its value using ``str.replace``. Every other line is copied unchanged.

    Args:
        content: Decoded content-stream text.
        replacements: Mapping of literal search strings to their replacement
            strings. ``None`` (the default) means "no replacements".

    Returns:
        The rewritten text. Lines are split with ``str.splitlines`` and each
        one is re-emitted followed by a single newline character, so the
        original line terminators are normalised.
    """
    # Avoid a mutable default argument; None stands for an empty mapping.
    if replacements is None:
        replacements = {}

    out_lines = []
    in_text = False
    for line in content.splitlines():
        if line == "BT":
            in_text = True
        elif line == "ET":
            in_text = False
        elif in_text and line[-2:].lower() == "tj":
            # Only text-showing operators inside BT ... ET are rewritten.
            for old, new in replacements.items():
                line = line.replace(old, new)
        # Every line (markers included) is kept in the output.
        out_lines.append(line + "\n")

    # Join once instead of growing a string with repeated "+=".
    return "".join(out_lines)
def process_data(object, replacements):
    """Rewrite the data of a stream object using ``replace_text``.

    The bytes returned by ``object.getData()`` are decoded as UTF-8, printed,
    passed through ``replace_text`` with *replacements*, and re-encoded as
    UTF-8. The result is stored with ``setData`` on ``object.decodedSelf``
    when that attribute is not ``None``, otherwise on *object* itself.

    Args:
        object: Stream-like object providing ``getData()``, ``setData()`` and
            a ``decodedSelf`` attribute.
        replacements: Mapping of search strings to replacement strings.
    """
    text = object.getData().decode('utf-8')
    print(text)

    new_bytes = replace_text(text, replacements).encode('utf-8')

    # Prefer the decoded counterpart when one is attached to the stream.
    target = object.decodedSelf
    if target is None:
        target = object
    target.setData(new_bytes)
if __name__ == "__main__":
    # Command-line entry point: read the PDF given with -i/--input, apply the
    # replacements to every page's content stream(s), and write the result
    # next to the input as "<input without extension>.result.pdf".
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--input", required=True, help="path to PDF document")
    args = vars(ap.parse_args())

    in_file = args["input"]
    # Take the root part from splitext so only the trailing extension is
    # dropped. Replacing the extension text with "" would also remove any
    # earlier occurrence of it in the path (e.g. "my.pdf.d/a.pdf").
    filename_base = os.path.splitext(in_file)[0]

    # Provide replacements list that you need here
    #english works
    #replacements = {'{{name}}': 'Yara'}
    #arabic doesn't
    str1 = 'اسم'
    str2 = 'يارا'
    replacements = {str1: str2}

    pdf = PdfFileReader(in_file)
    writer = PdfFileWriter()
    stream_types = (DecodedStreamObject, EncodedStreamObject)

    for page_number in range(pdf.getNumPages()):
        page = pdf.getPage(page_number)
        contents = page.getContents()

        if isinstance(contents, stream_types):
            # Page content is a single stream.
            process_data(contents, replacements)
        elif len(contents) > 0:
            # Page content is a sequence; process each stream element.
            for obj in contents:
                if isinstance(obj, stream_types):
                    streamObj = obj.getObject()
                    process_data(streamObj, replacements)

        # Force content replacement
        # NOTE(review): this relies on `contents` exposing `decodedSelf`;
        # confirm that this holds when the page content is a sequence of
        # streams rather than a single stream.
        page[NameObject("/Contents")] = contents.decodedSelf
        writer.addPage(page)

    with open(filename_base + ".result.pdf", 'wb') as out_file:
        writer.write(out_file)