8

I am using itextsharp dll to convert HTML to PDF.

The HTML has some Unicode characters like α, β... when I try to convert HTML to PDF, Unicode characters are not shown in PDF.

My function:

// Question sample: parses HTML held in stringBuilder (defined outside this snippet)
// and writes the result to Test.pdf as a LETTER-sized document.
Document doc = new Document(PageSize.LETTER);

using (FileStream fs = new FileStream(Path.Combine("Test.pdf"), FileMode.Create, FileAccess.Write, FileShare.Read))
{
    // Bind a PDF writer to the document and the output stream.
    PdfWriter.GetInstance(doc, fs);

    doc.Open();
    doc.NewPage();

    // Full path to ARIALUNI.TTF inside the system Fonts folder.
    string arialuniTff = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Fonts),
                                      "ARIALUNI.TTF");

    // Load the font with Identity-H encoding and embed it in the PDF.
    BaseFont bf = BaseFont.CreateFont(arialuniTff, BaseFont.IDENTITY_H, BaseFont.EMBEDDED);

    Font fontNormal = new Font(bf, 12, Font.NORMAL);

    // The stylesheet handed to the parser is empty; fontNormal is only assigned to the
    // wrapping Paragraph below.
    // NOTE(review): presumably the parsed elements keep the parser's own default font,
    // which would explain the missing Unicode glyphs - confirm.
    List<IElement> list = HTMLWorker.ParseToList(new StringReader(stringBuilder.ToString()),
                                                 new StyleSheet());
    Paragraph p = new Paragraph {Font = fontNormal};

    foreach (var element in list)
    {
        p.Add(element);
        // doc.Add(p) runs on every iteration, so the same, still-growing paragraph is
        // added to the document once per parsed element.
        doc.Add(p);
    }

    doc.Close();
}
GSerg
  • 76,472
  • 17
  • 159
  • 346
NIlesh Lanke
  • 1,213
  • 9
  • 26
  • 35

5 Answers5

16

You can also use the new XMLWorkerHelper (from the itextsharp.xmlworker library); however, you need to override the default FontFactory implementation.

/// <summary>
/// Converts the HTML report at .\Files\report.html into a PDF at .\Files\report.pdf
/// by streaming both files through <c>CreatePdf</c>.
/// </summary>
void GeneratePdfFromHtml()
{
  const string outputFilename = @".\Files\report.pdf";
  const string inputFilename = @".\Files\report.html";

  // The source is only read, so request read access explicitly: the two-argument
  // FileStream constructor asks for read/write access and fails on read-only files.
  using (var input = new FileStream(inputFilename, FileMode.Open, FileAccess.Read, FileShare.Read))
  using (var output = new FileStream(outputFilename, FileMode.Create))
  {
    CreatePdf(input, output);
  }
}

/// <summary>
/// Renders XHTML read from <paramref name="htmlInput"/> as an A4 PDF written to
/// <paramref name="pdfOutput"/>, resolving every font through UnicodeFontFactory.
/// </summary>
/// <param name="htmlInput">Stream containing the UTF-8 encoded XHTML source.</param>
/// <param name="pdfOutput">Stream that receives the generated PDF.</param>
void CreatePdf(Stream htmlInput, Stream pdfOutput)
{
  // Same margin on all four sides of the page.
  const float margin = 30;

  using (Document pdf = new Document(PageSize.A4, margin, margin, margin, margin))
  {
    PdfWriter pdfWriter = PdfWriter.GetInstance(pdf, pdfOutput);
    XMLWorkerHelper htmlParser = XMLWorkerHelper.GetInstance();

    pdf.Open();

    // No CSS stream is supplied (null); the input is decoded as UTF-8.
    UnicodeFontFactory unicodeFonts = new UnicodeFontFactory();
    htmlParser.ParseXHtml(pdfWriter, pdf, htmlInput, null, Encoding.UTF8, unicodeFonts);

    pdf.Close();
  }
}

/// <summary>
/// Font factory that answers every font request with a font built on arialuni.ttf
/// (loaded from the system Fonts folder with Identity-H encoding and embedded).
/// </summary>
public class UnicodeFontFactory : FontFactoryImp
{
    // Location of arialuni.ttf inside the system Fonts folder.
    private static readonly string ArialUnicodePath =
        Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Fonts), "arialuni.ttf");

    // Base font created once per factory instance and reused for every GetFont call.
    private readonly BaseFont _unicodeBase;

    /// <summary>Loads the base font from <c>ArialUnicodePath</c>.</summary>
    public UnicodeFontFactory()
    {
        _unicodeBase = BaseFont.CreateFont(ArialUnicodePath, BaseFont.IDENTITY_H, BaseFont.EMBEDDED);
    }

    /// <summary>
    /// Returns a font based on the shared base font. Only <paramref name="size"/>,
    /// <paramref name="style"/> and <paramref name="color"/> are used; the remaining
    /// arguments are ignored.
    /// </summary>
    public override Font GetFont(
        string fontname,
        string encoding,
        bool embedded,
        float size,
        int style,
        BaseColor color,
        bool cached)
    {
        Font unicodeFont = new Font(_unicodeBase, size, style, color);
        return unicodeFont;
    }
}
Gregor Slavec
  • 4,814
  • 1
  • 26
  • 24
14

When dealing with Unicode characters and iTextSharp, there are a couple of things you need to take care of. The first one you did already: getting a font that supports your characters. The second is that you need to actually register the font with iTextSharp so that it's aware of it.

//Full path to ARIALUNI.TTF inside the system Fonts folder
string arialuniTff = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Fonts), "ARIALUNI.TTF");
//Register the font file with iTextSharp's FontFactory
iTextSharp.text.FontFactory.Register(arialuniTff);

Now that we have a font we need to create a StyleSheet object that tells iTextSharp when and how to use it.

//Create a new, empty stylesheet for the HTML parser
iTextSharp.text.html.simpleparser.StyleSheet ST = new iTextSharp.text.html.simpleparser.StyleSheet();
//Set the default font face of the body tag to our registered font's internal name
ST.LoadTagStyle(HtmlTags.BODY, HtmlTags.FACE, "Arial Unicode MS");

The one non-HTML part that you also need to do is set a special encoding parameter. This encoding is specific to iTextSharp, and in your case you want it to be Identity-H. If you don't set it, the encoding defaults to Cp1252 (WINANSI).

//Set the body tag's encoding to Identity-H so that Unicode characters are supported
ST.LoadTagStyle(HtmlTags.BODY, HtmlTags.ENCODING, BaseFont.IDENTITY_H);

Lastly, we need to pass our stylesheet to the ParseToList method:

//Parse the HTML text held in stringBuilder into a list of elements, applying the stylesheet ST
List<IElement> list = HTMLWorker.ParseToList(new StringReader(stringBuilder.ToString()), ST);

Putting that all together, from open to close you'd have:

doc.Open();

//Sample HTML containing two Greek letters
StringBuilder htmlSource = new StringBuilder();
htmlSource.Append(@"<p>This is a test: <strong>α,β</strong></p>");

//Locate ARIALUNI.TTF in the system Fonts folder and register it with iTextSharp
string fontsFolder = Environment.GetFolderPath(Environment.SpecialFolder.Fonts);
string unicodeFontFile = Path.Combine(fontsFolder, "ARIALUNI.TTF");
iTextSharp.text.FontFactory.Register(unicodeFontFile);

//Stylesheet with the body defaults: the registered font's internal name as the face,
//and Identity-H as the encoding so Unicode characters are supported
var bodyStyles = new iTextSharp.text.html.simpleparser.StyleSheet();
bodyStyles.LoadTagStyle(HtmlTags.BODY, HtmlTags.FACE, "Arial Unicode MS");
bodyStyles.LoadTagStyle(HtmlTags.BODY, HtmlTags.ENCODING, BaseFont.IDENTITY_H);

//Parse the HTML with that stylesheet
StringReader htmlReader = new StringReader(htmlSource.ToString());
List<IElement> parsedElements = HTMLWorker.ParseToList(htmlReader, bodyStyles);

//Add every parsed element straight to the document, without wrapping it in a paragraph
for (int i = 0; i < parsedElements.Count; i++)
{
    doc.Add(parsedElements[i]);
}

doc.Close();

EDIT

In your comment you show HTML that specifies an override font. iTextSharp does not spider the system for fonts and its HTML parser doesn't use font fallback techniques. Any fonts specified in HTML/CSS must be manually registered.

//Full path to l_10646.ttf inside the system Fonts folder
string lucidaTff = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Fonts), "l_10646.ttf");
//Register the font file with iTextSharp's FontFactory
iTextSharp.text.FontFactory.Register(lucidaTff);
Chris Haas
  • 53,986
  • 12
  • 141
  • 274
  • If the html content are like
    α,β
    The above Function does not work.
    – NIlesh Lanke Apr 27 '12 at 06:37
  • This helped me out so much. I already had a stylesheet setup so i simply had to set the font in the css... body { font-family: 'Arial Unicode MS'!important; } – Kevbo Dec 10 '18 at 18:14
1
/// <summary>
/// Font factory that answers every font request with a font built on arialuni.ttf
/// (loaded from the system Fonts folder with Identity-H encoding and embedded).
/// </summary>
private class UnicodeFontFactory : FontFactoryImp
{
    // Assigned only in the constructor, so it is marked readonly.
    private readonly BaseFont _baseFont;

    /// <summary>Loads arialuni.ttf from the system Fonts folder.</summary>
    public UnicodeFontFactory()
    {
        string fontPath = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Fonts), "arialuni.ttf");
        _baseFont = BaseFont.CreateFont(fontPath, BaseFont.IDENTITY_H, BaseFont.EMBEDDED);
    }

    /// <summary>
    /// Returns a font based on the shared base font. Only <paramref name="size"/>,
    /// <paramref name="style"/> and <paramref name="color"/> are used; the remaining
    /// arguments are ignored.
    /// </summary>
    public override Font GetFont(string fontname, string encoding, bool embedded, float size, int style, BaseColor color, bool cached)
    {
        return new Font(_baseFont, size, style, color);
    }
}

//and Code

// Route every font lookup made through FontFactory to the Unicode-capable factory.
FontFactory.FontImp = new UnicodeFontFactory();

// Replace every character at or above code 127 with a decimal numeric character
// reference (&#NNN;). A StringBuilder avoids the quadratic cost of repeated string
// concatenation.
System.Text.StringBuilder encodedHtml = new System.Text.StringBuilder(htmlText.Length);
for (int i = 0; i < htmlText.Length; i++)
{
    char c = htmlText[i];
    if (c < 127)
    {
        encodedHtml.Append(c);
        continue;
    }

    int codePoint = c;

    // Characters outside the Basic Multilingual Plane are stored as a surrogate pair.
    // Combine the pair into one code point so a single valid reference is written
    // instead of two references to surrogate halves.
    if (char.IsHighSurrogate(c) && i + 1 < htmlText.Length && char.IsLowSurrogate(htmlText[i + 1]))
    {
        codePoint = char.ConvertToUtf32(c, htmlText[i + 1]);
        i++;
    }

    encodedHtml.Append("&#")
               .Append(codePoint.ToString(System.Globalization.CultureInfo.InvariantCulture))
               .Append(';');
}
string convertedHtml = encodedHtml.ToString();

// Parse the encoded HTML; no CSS is supplied (null).
List<IElement> htmlElements = XMLWorkerHelper.ParseToElementList(convertedHtml, null);

// add the IElements to the document
foreach (IElement htmlElement in htmlElements)
{
    document.Add(htmlElement);
}
  • 1
    Could you elaborate on your answer. Maybe explain why your solution works and what the problem with the original code was. – Noel Widmer Jul 24 '18 at 13:01
0

This has to be one of the most difficult problems that I've had to figure out to date. The answers on the web, including those on Stack Overflow, contain either poor or outdated information. The answer from Gregor is very close. I wanted to give back to this community because I spent many hours getting to this answer.

Here's a very simple program I wrote in c# as an example for my own notes.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.tool.xml;

namespace ExampleOfExportingPDF
{
    /// <summary>
    /// Console sample that renders a small HTML fragment containing Japanese text to
    /// "Test.pdf" on the current user's desktop.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds the sample HTML, parses it with XMLWorkerHelper using
        /// <see cref="UnicodeFontFactory"/> as the font provider, and writes the PDF.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            //Build HTML document
            StringBuilder sb = new StringBuilder();
            sb.Append("<body>");
            sb.Append("<h1 style=\"text-align:center;\">これは日本語のテキストの例です。</h1>");
            sb.Append("</body>");

            string pdfPath = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Test.pdf");

            //The output file, the document and the in-memory HTML are all wrapped in using
            //statements so they are released even when parsing throws
            using (FileStream fs = new FileStream(pdfPath, FileMode.Create, FileAccess.Write, FileShare.Read))
            using (Document doc = new Document(PageSize.A4))
            using (MemoryStream msHtml = new MemoryStream(Encoding.UTF8.GetBytes(sb.ToString())))
            {
                //Bind PDF writer to document and stream
                PdfWriter writer = PdfWriter.GetInstance(doc, fs);

                //Open document for writing
                doc.Open();

                //Kept from the original sample.
                //NOTE(review): likely a no-op on a freshly opened, empty document - confirm
                doc.NewPage();

                //Parse the UTF-8 HTML; no CSS stream is supplied (null)
                XMLWorkerHelper.GetInstance().ParseXHtml(writer, doc, msHtml, null, Encoding.UTF8, new UnicodeFontFactory());

                //Close the PDF
                doc.Close();
            }
        }

        /// <summary>
        /// Font factory that answers every font request with a font built on arialuni.ttf
        /// (loaded from the system Fonts folder with Identity-H encoding and embedded).
        /// </summary>
        public class UnicodeFontFactory : FontFactoryImp
        {
            private static readonly string FontPath = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Fonts),
                "arialuni.ttf");

            private readonly BaseFont _baseFont;

            /// <summary>Loads the base font from <c>FontPath</c>.</summary>
            public UnicodeFontFactory()
            {
                _baseFont = BaseFont.CreateFont(FontPath, BaseFont.IDENTITY_H, BaseFont.EMBEDDED);
            }

            /// <summary>
            /// Returns a font based on the shared base font. Only <paramref name="size"/>,
            /// <paramref name="style"/> and <paramref name="color"/> are used; the remaining
            /// arguments are ignored.
            /// </summary>
            public override Font GetFont(string fontname, string encoding, bool embedded, float size, int style, BaseColor color,
                bool cached)
            {
                return new Font(_baseFont, size, style, color);
            }
        }

    }
}

Hopefully this will save someone some time in the future.

Frank Thomas
  • 350
  • 5
  • 12
-2

Here are the steps to display Unicode characters when converting HTML to PDF:

  1. Create a HTMLWorker
  2. Register a unicode font and assign it
  3. Create a style sheet and set the encoding to Identity-H
  4. Assign the style sheet to the html parser

Check below link for more understanding....

Hindi, Turkish, and other special characters are also displayed when converting HTML to PDF with this method. See the demo image below.

enter image description here

Code Scratcher
  • 389
  • 3
  • 4
  • [Links to external resources are encouraged, but please add context around the link so your fellow users will have some idea what it is and why it’s there. Always quote the most relevant part of an important link, in case the target site is unreachable or goes permanently offline.](http://stackoverflow.com/help/how-to-answer) – Erik Philips May 20 '16 at 15:41