I am using iTextSharp 4.1.6 (LGPL). The text extraction logic is the same as described in this answer:
// Collects the string operands of the PDF text-showing operators (Tj, TJ, ', ")
// from every page of the document and concatenates them into `res`.
//
// NOTE(review): the strings are taken exactly as the tokenizer returns them from
// the content stream. If the page font uses a custom or multi-byte encoding
// (the output reported for this file suggests it does), they are character codes
// rather than Unicode text and would still have to be mapped through the font's
// /ToUnicode CMap or encoding. That mapping is not done here.
var path = @"D:\ru.pdf";
var reader = new PdfReader(path);
StringBuilder sb = new StringBuilder();
try
{
    for (int page = 1; page <= reader.NumberOfPages; page++)
    {
        // GetPageContent resolves /Contents itself, so it copes with a single
        // stream, an array of streams, or a page that has no content at all.
        // Casting /Contents to PRIndirectReference fails for the latter cases.
        byte[] contentBytes = reader.GetPageContent(page);
        if (contentBytes == null || contentBytes.Length == 0)
        {
            continue;
        }

        var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
        // Strings seen since the last operator; kept only if that operator shows text.
        var pendingOperands = new StringBuilder();
        try
        {
            while (tokenizer.NextToken())
            {
                if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                {
                    pendingOperands.Append(tokenizer.StringValue);
                }
                else if (tokenizer.TokenType == PRTokeniser.TK_OTHER)
                {
                    // Operands precede their operator in a content stream. Strings that
                    // belong to any other operator (for example a /Lang property on a
                    // marked-content operator) are not page text and are dropped.
                    string op = tokenizer.StringValue;
                    if (op == "Tj" || op == "TJ" || op == "'" || op == "\"")
                    {
                        sb.Append(pendingOperands.ToString());
                    }
                    pendingOperands.Length = 0;
                }
            }
        }
        finally
        {
            tokenizer.Close();
        }
    }
}
finally
{
    reader.Close();
}
var res = sb.ToString();
Input PDF file contains only one word: Слово
Actual result for extraction is: ru-RU\u0002Á\u0003#\u0003(\u0003\u000f\u0003(
I tried different Encoding tricks with no success.
Also, with the newest version of iTextSharp the output is correct when using PdfTextExtractor,
which is not available in 4.1.6.
Does anyone know how to get the correct output?