2

I am using the PyPDF2 Python library to open an existing PDF and replace a placeholder such as {{name}}, but I am facing two problems:

  1. The words don't seem to be represented correctly (example line: [(But {{name}} did no)-0.6 (t c)1.6 (ome!)]TJ). This should probably be (But {{name}} did not come!)TJ
  2. Replacing {{name}} works well despite problem 1, but when it comes to Arabic, the characters seem to be stored as numeric codes and are read as individual characters (example: (\264)Tj). This should probably be (اسم)Tj

I have tried setting the encoding to UTF-8, as I read at the following link that it supports Arabic: What character encoding should I use for a web page containing mostly Arabic text? Is utf-8 okay?

I also tried to make sense of the following related question: How to read Arabic text from PDF using Python script

Code Below:

import os
import codecs
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject, NameObject


def replace_text(content, replacements=None):
    """Apply literal substitutions to text-showing lines of a content stream.

    The input is split into lines. Lines equal to ``BT`` and ``ET`` switch
    an "inside text" flag on and off. While the flag is on, any line whose
    last two characters are ``Tj`` or ``TJ`` (case-insensitive) has every
    key of *replacements* replaced by its value via ``str.replace``. All
    other lines are passed through unchanged.

    Args:
        content: The decoded content stream as a single string.
        replacements: Mapping of search string to replacement string.
            ``None`` (the default) means "no replacements".

    Returns:
        The rebuilt text, with every line (including the last one)
        terminated by ``"\\n"``.
    """
    # A None sentinel avoids sharing one dict instance across calls.
    if replacements is None:
        replacements = {}

    out_lines = []
    in_text = False

    for line in content.splitlines():
        if line == "BT":
            in_text = True
        elif line == "ET":
            in_text = False
        elif in_text and line[-2:].lower() == 'tj':
            # Only text-showing operators are rewritten; substitutions are
            # applied in the mapping's iteration order.
            for old, new in replacements.items():
                line = line.replace(old, new)
        out_lines.append(line + "\n")

    # Join once instead of growing a string inside the loop.
    return "".join(out_lines)


def process_data(object, replacements):
    """Rewrite a stream object's data with the given replacements applied.

    The stream's bytes are fetched with ``getData()``, decoded as UTF-8,
    printed, passed through ``replace_text`` and re-encoded as UTF-8. The
    result is stored with ``setData`` on ``object.decodedSelf`` when that
    attribute is not ``None``, otherwise on ``object`` itself.

    Args:
        object: A stream-like object exposing ``getData``, ``setData`` and
            a ``decodedSelf`` attribute.
        replacements: Mapping of search string to replacement string,
            forwarded to ``replace_text``.
    """
    text = object.getData().decode('utf-8')
    print(text)

    new_bytes = replace_text(text, replacements).encode('utf-8')

    # Write to the decoded counterpart when there is one.
    target = object if object.decodedSelf is None else object.decodedSelf
    target.setData(new_bytes)


if __name__ == "__main__":
    # Command-line driver: read the PDF given by -i/--input, run the
    # replacements over every page's content stream(s) and write the result
    # next to the input as "<name>.result.pdf".
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--input", required=True, help="path to PDF document")
    args = vars(ap.parse_args())

    in_file = args["input"]
    # Take the root from splitext directly: str.replace() on the extension
    # would also strip it from anywhere else it occurs in the path.
    filename_base = os.path.splitext(in_file)[0]

    # Provide replacements list that you need here

    # english works
    # replacements = {'{{name}}': 'Yara'}

    # arabic doesn't
    str1 = 'اسم'
    str2 = 'يارا'
    replacements = {str1: str2}

    stream_types = (DecodedStreamObject, EncodedStreamObject)

    pdf = PdfFileReader(in_file)
    writer = PdfFileWriter()

    for page_number in range(pdf.getNumPages()):

        page = pdf.getPage(page_number)
        contents = page.getContents()

        if isinstance(contents, stream_types):
            process_data(contents, replacements)
            # Force content replacement, but only when a decoded counterpart
            # exists; otherwise the stream was edited in place and assigning
            # None here would drop the page's contents.
            if contents.decodedSelf is not None:
                page[NameObject("/Contents")] = contents.decodedSelf
        elif contents:
            # Contents is a sequence of streams. Resolve each entry before
            # the type check (entries are presumably indirect references --
            # confirm against the PyPDF2 docs), then swap in the edited
            # decoded stream so the change reaches the written file.
            for index, obj in enumerate(contents):
                stream = obj.getObject()
                if isinstance(stream, stream_types):
                    process_data(stream, replacements)
                    if stream.decodedSelf is not None:
                        contents[index] = stream.decodedSelf

        writer.addPage(page)

    with open(filename_base + ".result.pdf", 'wb') as out_file:
        writer.write(out_file)
Sara Kat
  • 378
  • 2
  • 19

0 Answers0