Invalid character "ï¿½" when using Hummus (PDF) for Node JS

Question

I'm trying to use Hummus on Node JS for open pdf, replace string and save a new pdf. The problem is when i save a new pdf. It doesn't save characters like "ã", "õ", "ê" (UTF-8). I couldn't find any solution about this on the internet, there's so few topics about Hummus on Node JS.

Can anyone help me?

My code (took from here):

const hummus = require('hummus');
const fs = require('fs');

/**
 * Returns a byteArray string
 * 
 * @param {string} str - input string
 */
function strToByteArray(str) {
var utf8 = [];
    for (var i=0; i < str.length; i++) {
        var charcode = str.charCodeAt(i);
        if (charcode < 0x80) utf8.push(charcode);
        else if (charcode < 0x800) {
            utf8.push(0xc0 | (charcode >> 6), 
                      0x80 | (charcode & 0x3f));
        }
        else if (charcode < 0xd800 || charcode >= 0xe000) {
            utf8.push(0xe0 | (charcode >> 12), 
                      0x80 | ((charcode>>6) & 0x3f), 
                      0x80 | (charcode & 0x3f));
        }
        // surrogate pair
        else {
            i++;
            // UTF-16 encodes 0x10000-0x10FFFF by
            // subtracting 0x10000 and splitting the
            // 20 bits of 0x0-0xFFFFF into two halves
            charcode = 0x10000 + (((charcode & 0x3ff)<<10)
                      | (str.charCodeAt(i) & 0x3ff));
            utf8.push(0xf0 | (charcode >>18), 
                      0x80 | ((charcode>>12) & 0x3f), 
                      0x80 | ((charcode>>6) & 0x3f), 
                      0x80 | (charcode & 0x3f));
        }
    }
    return utf8;
}

function replaceText(sourceFile, targetFile, pageNumber, findText, replaceText) {  
    var writer = hummus.createWriterToModify(sourceFile, {
        modifiedFilePath: targetFile
    });
    var modifier = new hummus.PDFPageModifier(writer, pageNumber);
    var sourceParser = writer.createPDFCopyingContextForModifiedFile().getSourceDocumentParser();
    var pageObject = sourceParser.parsePage(pageNumber);
    var textObjectId = pageObject.getDictionary().toJSObject().Contents.getObjectID();
    var textStream = sourceParser.queryDictionaryObject(pageObject.getDictionary(), 'Contents');
    //read the original block of text data
    var data = [];
    var readStream = sourceParser.startReadingFromStream(textStream);
    while(readStream.notEnded()){
        Array.prototype.push.apply(data, readStream.read(10000));
    }
    var string = new Buffer(data).toString().replace(findText, replaceText);

    //Create and write our new text object
    var objectsContext = writer.getObjectsContext();
    objectsContext.startModifiedIndirectObject(textObjectId);

    var stream = objectsContext.startUnfilteredPDFStream();
    stream.getWriteStream().write(strToByteArray(string));
    objectsContext.endPDFStream(stream);

    objectsContext.endIndirectObject();

    writer.end();
}

replaceText('file.pdf', 'output.pdf', 0, /REPLACEME/g, 'My New Custom Text');

Sorry for my bad english.

Your code assumes PDF page content streams to be UTF-8 encoded and easily editable as such. This assumption is wrong. Instructions and numeric arguments in a content stream indeed are ASCII encoded but string parameters of text showing operations are encoded according to the encoding of the current font. Thus, each string parameter in such a content stream may have a different encoding! Simple PDFs in Western European languages often have fonts defined with **WinAnsiEncoding** which is similar to ISO 8859-1, so you may have more success with that encoding. It may not contain your chars, though — mkl, Dec 04 '19 at 17:36
You're right! The encoding is like WinAnsiEncoding but I don't know how to do it from now on ... Would it be when I convert the string to the byte array? Or before? — Kaleb Z., Dec 04 '19 at 19:26
I encoded as windows1252, now: `var string = windows1252.encode(new Buffer.from(data).toString('binary').replace(findText, replaceText));` But I'm getting this characters in pdf: " Ã‡Ã• ", " Ãƒ "... I think the problem is the string to byte array... Any sugestions? — Kaleb Z., Dec 04 '19 at 20:13

Invalid character "ï¿½" when using Hummus (PDF) for Node JS

0 Answers0