Using a RegEx for HTML processing is usually more trouble than it's worth. Grab the HtmlAgilityPack and use that to walk through the HTML DOM extracting any content inside text nodes. You could use something similar to the class below to gather up all of the text blocks in an HTML string.
/// <summary>
/// Collects the human-readable text blocks of an HTML string by walking the
/// HtmlAgilityPack DOM and gathering the content of its text nodes.
/// </summary>
public sealed class HtmlTextExtractor
{
    private readonly string m_html;

    /// <summary>Creates an extractor for the given HTML markup.</summary>
    /// <param name="html">The HTML markup to extract text from.</param>
    public HtmlTextExtractor(string html)
    {
        m_html = html;
    }

    /// <summary>
    /// Parses the HTML and returns every non-blank text block in document order.
    /// </summary>
    /// <returns>
    /// The entity-decoded, trimmed content of each text node. Comments and the
    /// content of <c>script</c> / <c>style</c> elements are excluded.
    /// </returns>
    public IEnumerable<string> GetTextBlocks()
    {
        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(m_html);

        var text = new List<string>();
        WalkNode(htmlDocument.DocumentNode, text);
        return text;
    }

    /// <summary>
    /// Returns true for elements whose text content is code rather than
    /// readable text (JavaScript and CSS).
    /// </summary>
    private static bool IsNonContentElement(HtmlNode node)
    {
        // HtmlNode.Name is reported in lower case by HtmlAgilityPack.
        return node.NodeType == HtmlNodeType.Element
            && (node.Name == "script" || node.Name == "style");
    }

    /// <summary>
    /// Recursively visits <paramref name="node"/> and its descendants,
    /// appending each non-blank text block to <paramref name="text"/>.
    /// </summary>
    private void WalkNode(HtmlNode node, List<string> text)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // Comments are deliberately excluded from the output.
                break;

            case HtmlNodeType.Document:
            case HtmlNodeType.Element:
                // Script and style bodies are parsed as text nodes; skip the
                // whole element so their source never reaches the output.
                if (IsNonContentElement(node))
                {
                    break;
                }

                if (node.HasChildNodes)
                {
                    foreach (var childNode in node.ChildNodes)
                    {
                        WalkNode(childNode, text);
                    }
                }
                break;

            case HtmlNodeType.Text:
                // Decode entities (&amp;, &nbsp;, ...) before trimming so that
                // blocks consisting only of encoded whitespace are dropped too.
                var cleanHtml = HtmlEntity.DeEntitize(((HtmlTextNode)node).Text).Trim();
                if (!string.IsNullOrEmpty(cleanHtml))
                {
                    text.Add(cleanHtml);
                }
                break;
        }
    }
}
You can then focus on splitting/tokenizing the text after that.
// Pull the raw text blocks out of the markup.
var extractor = new HtmlTextExtractor(html);
var textBlocks = extractor.GetTextBlocks();

// Break every block into whitespace-separated tokens. Passing a null
// separator array makes Split treat all white-space characters (spaces,
// tabs, line breaks, ...) as delimiters; splitting on ' ' alone would leave
// tokens separated by tabs or newlines glued together.
var words = new List<string>();
foreach (var textBlock in textBlocks)
{
    words.AddRange(textBlock.Split((char[])null, StringSplitOptions.RemoveEmptyEntries));
}

// Clean each token, keep those between 3 and 19 characters long, then
// de-duplicate and sort. The length filter already rejects empty strings.
var distinctWords = words.Select(word => CleanWord(word))
    .Where(word => word.Length > 2 && word.Length < 20)
    .Distinct()
    .OrderBy(word => word);
Finally, clean up the individual words or tokens.
/// <summary>
/// Normalizes a single token: strips punctuation and symbols, replaces
/// digits with a space, collapses whitespace runs, and trims the result.
/// </summary>
/// <param name="word">The raw token to clean.</param>
/// <returns>
/// The cleaned token; an empty string when nothing but punctuation, digits
/// or whitespace was present.
/// </returns>
public string CleanWord(string word)
{
    // Remove everything except word characters (letters, digits, underscore)
    // and whitespace. This also strips hyphens.
    word = Regex.Replace(word, @"[^\w\s]", string.Empty);

    // Replace each digit with a space so the letters on either side of a
    // number stay separated.
    word = Regex.Replace(word, @"\d", " ");

    // Collapse whitespace runs last, so the spaces introduced by the digit
    // replacement above are folded into a single space as well.
    word = Regex.Replace(word, @"\s+", " ");

    return word.Trim();
}
Obviously this is about the simplest implementation imaginable. It is extremely primitive: it won't work well for languages that don't separate words with spaces, it doesn't handle punctuation well, and so on, but it should give you an idea of the individual parts. You can look at libraries such as Lucene.NET to improve your tokenization, and there are probably many more libraries available if you want to improve the implementation.