0

I am trying to make a web scraper that gets light novel text from a website and creates a PDF from the HTML on the site. To do this, I fetch the page's HTML, parse it into a DOM object, and then serialize the element I want back into a string of HTML, which I convert to a PDF. My issue is that once it is converted to a PDF, an encoding mismatch causes quotes and other non-ASCII characters to show up as â. How do I make a PDF file without the weird characters appearing? Thanks for the help in advance!

const request = require('request');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
var htmlToPdf = require('html-to-pdf');

// Chapter URLs are formed by appending the chapter number to this prefix.
const BaseURL = 'https://www.wuxiaworld.com/novel/overgeared/og-chapter-';

// Tell html-to-pdf to treat the HTML it is given, and the output it produces,
// as UTF-8. Without this the converter falls back to its own default encoding,
// which is presumably what turned curly quotes and other non-ASCII characters
// into "â" in the generated PDF.
htmlToPdf.setInputEncoding('UTF-8');
htmlToPdf.setOutputEncoding('UTF-8');

// Made to loop through many chapters; raise the upper bound to fetch more.
for (let chapNum = 1; chapNum < 2; chapNum++) {
    // Block-scoped per iteration, so there is no shared/global `url` to reset.
    const url = `${BaseURL}${chapNum}`;

    request(url, (error, response, html) => {
        // Check `error` first and on its own: the original used bitwise `&`,
        // which does not short-circuit, so `response.statusCode` was evaluated
        // even when the request had failed.
        if (error) {
            console.log('Request for ' + url + ' failed:');
            console.log(error);
            return;
        }
        if (response.statusCode !== 200) {
            console.log('Request for ' + url + ' returned status ' + response.statusCode);
            return;
        }

        const dom = new JSDOM(html);
        // The chapter text is expected in the second element with class "fr-view".
        const chapterContent = dom.window.document.getElementsByClassName('fr-view')[1];
        if (!chapterContent) {
            console.log('No chapter content found at ' + url);
            return;
        }

        const filename = 'Overgeared_Chapter_' + chapNum + '.pdf';

        // Takes the HTML string of the chapter element and writes the PDF file.
        htmlToPdf.convertHTMLString(chapterContent.outerHTML, filename, (conversionError, success) => {
            if (conversionError) {
                console.log('Worked...Not!');
                console.log(conversionError);
            } else {
                console.log('Actually Worked!');
                console.log(success);
            }
        });
    });
}
  • Make sure you are using the correct encoding, e.g., if the page you are requesting is `utf8`, make sure the parsers and converters are also using `utf8` – Patrick Evans Nov 18 '18 at 20:56
  • I'm kind of an amateur when it comes to this. How would I make sure they are using utf8? The site I'm getting the HTML from is utf8. – Omar Elhosseni Nov 18 '18 at 21:14

0 Answers0