I am using itextsharp to extract text from a pdf document using this code:
public static bool does_document_text_have_keyword(string keyword,
string pdf_src, Report report_object) // TEST
{
try
{
PdfReader pdfReader = new PdfReader(pdf_src);
string currentText;
int count = pdfReader.NumberOfPages;
for (int page = 1; page <= count; page++)
{
ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
currentText = PdfTextExtractor.GetTextFromPage
(pdfReader, page, strategy);
currentText = Encoding.UTF8.GetString
(ASCIIEncoding.Convert
(Encoding.Default,
Encoding.UTF8,
Encoding.Default.GetBytes(currentText)));
report_object.log(currentText); // TEST
if (currentText.IndexOf
(keyword, StringComparison.OrdinalIgnoreCase) != -1) return true;
}
pdfReader.Close();
return false;
}
catch
{
return false;
}
}
But the problem is, when I extract text, the text has no white spaces, it's as if the white spaces has been replaced with an empty string. Yet in the pdf document, there are white spaces in it. Does anyone know whats happening here?