I'm using LocationTextExtractionStrategy to extract text from a PDF. The text is delivered, chunk by chunk, to a function called RenderText. So my question is: can one chunk contain more than 2 words? For example, given the text 'MKL is a helpfull person', can it be delivered in chunks like the following (the most important chunk is bolded): MK
L
is a h
elpfull
per son
?
Below is the code I use for word separation. I do the word separation while adding text (a chunk from the RenderText function) to the current line.
/// <summary>
/// One line of extracted text. Incoming text chunks are appended with
/// <see cref="AddText"/> and broken into words stored in <see cref="wordList"/>.
/// </summary>
public class TextLineLocation
{
    public float X { get; set; }
    public float Y { get; set; }
    public float Height { get; set; }
    public float Width { get; set; }

    private string Text;

    // Characters that terminate a word. They are never stored in a word's text.
    private static readonly char[] WordSeparators =
    {
        ' ', ',', '.', '/', '|', '\\', ';', '(', ')', '*', '&', '^', '!', '?'
    };

    // True when the previously processed text ended with a separator, so the
    // next non-empty piece must start a new word instead of extending the last one.
    private bool startNewWord;

    /// <summary>Words collected so far, in the order they were encountered.</summary>
    public List<WordLocation> wordList = new List<WordLocation>();

    /// <summary>
    /// Appends a chunk to the line and updates <see cref="wordList"/>.
    /// A chunk may contain a fragment of a word, exactly one word, or any number
    /// of words; the chunk is split on every separator character in one pass.
    /// </summary>
    /// <param name="text">The chunk to add; its text is read from <c>text.textChunk.Text</c>.</param>
    public void AddText(TextInfo text)
    {
        // NOTE(review): this appends text.ToString(), not text.textChunk.Text —
        // confirm that TextInfo overrides ToString() as intended.
        Text += text;

        // Split once on ALL separators. Splitting separately per separator (as the
        // previous code did) duplicates words when a chunk mixes several separators.
        string[] pieces = text.textChunk.Text.Split(WordSeparators);

        for (int i = 0; i < pieces.Length; i++)
        {
            // Every piece after the first is preceded by a separator inside this
            // chunk, so it can never continue the previous word.
            if (i > 0)
            {
                startNewWord = true;
            }

            string piece = pieces[i];

            // Empty pieces come from leading, trailing or consecutive separators.
            // They carry no text, but the pending word break set above is kept.
            if (piece.Length == 0)
            {
                continue;
            }

            if (startNewWord || wordList.Count == 0)
            {
                // Every word created from this chunk gets the chunk's start
                // location and font metrics, because the chunk exposes no
                // per-character positions here.
                wordList.Add(new WordLocation(
                    text.textChunk.StartLocation[1],
                    text.textChunk.StartLocation[0],
                    text.getFontWidth(),
                    text.getFontHeight(),
                    piece));
                startNewWord = false;
            }
            else
            {
                // The piece continues the word started by an earlier chunk.
                // NOTE(review): Height is accumulated per chunk exactly like Width,
                // as before — this looks unintended for a height; confirm.
                int last = wordList.Count - 1;
                wordList[last].Text += piece;
                wordList[last].Width += text.getFontWidth();
                wordList[last].Height += text.getFontHeight();
            }
        }
    }
}