I am trying to extract text from HTML files, such as this page:
https://artkapakistan.wordpress.com/2013/01/08/debunking-the-myth-of-the-artist/
I use HtmlAgilityPack to get the inner HTML of the entry-content class and then remove the HTML tags. There seems to be a problem with encoding, because I am getting strange characters: â€™ and Â, to be exact. From what I found online, the first one is a curly single quote and the second one is a non-breaking space. I tried to use regex to replace the single and double quotes, with no success.
// Map curly single quotes (’ and ‘) to a straight apostrophe.
s1 = Regex.Replace(s1, "’|‘", "'");
// Map curly double quotes (“ and ”) to a straight double quote.
s1 = Regex.Replace(s1, "“|”", "\"");
But I am unable to get them replaced. There seems to be some issue with encoding. I am not very well versed in regex and string replacements. Can you help me solve this issue? I have searched for 'fixing unicode issues in c#' with no success. I would be very grateful for any help in this regard.
EDIT: The following is how I retrieve the inner HTML and text.
// Take the inner HTML of the node selected by postBodyClass
// (presumably an XPath for the entry-content element; confirm against its definition).
text = document.DocumentNode.SelectSingleNode(postBodyClass).InnerHtml;
// Strip the markup first, then decode entities and tidy up the punctuation.
text = RemoveHTMLTags(text);
text = RemoveHTMLPunctuation(text);
/// <summary>
/// Decodes the HTML entities in <paramref name="input"/> and cleans up the punctuation
/// artefacts that remain: UTF-8 punctuation that was decoded with a single-byte code page,
/// undecoded named entities, left guillemets and non-breaking spaces.
/// </summary>
/// <param name="input">Text that may still contain HTML entities (tags already removed).</param>
/// <returns>
/// The cleaned text, or an empty string when <paramref name="input"/> is null or empty.
/// </returns>
public static string RemoveHTMLPunctuation(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    // WebUtility.HtmlDecode resolves named and numeric entities (&amp;, &quot;, &#39;,
    // &rdquo;, &nbsp; ...), so no per-entity replacement is needed afterwards.
    string text = System.Net.WebUtility.HtmlDecode(input);

    // UTF-8 punctuation read as Windows-1252 shows up as "â€" plus a third character.
    // The leading "â" is optional where the third character already identifies the
    // sequence. The closing double quote (whose third byte has no printable glyph and
    // may appear as U+009D or be missing) keeps the "â" mandatory so that a genuine
    // euro sign is never rewritten.
    // NOTE(review): the lasting fix is to load the page with its real encoding (UTF-8);
    // these replacements only repair text that is already damaged.
    text = Regex.Replace(text, "\u00E2?\u20AC\u00A6", "\u2026");        // â€¦ -> …
    text = Regex.Replace(text, "\u00E2?\u20AC[\u2122\u02DC]", "'");     // â€™ / â€˜ -> '
    text = Regex.Replace(text, "\u00E2?\u20AC\u0153", "\"");            // â€œ -> "
    text = Regex.Replace(text, "\u00E2\u20AC\u009D?", "\"");            // â€(U+009D) -> "

    // A correctly decoded right single quotation mark becomes a straight apostrophe.
    text = text.Replace("\u2019", "'");

    // Drop any named entity that is still present after decoding (for example one that
    // was double-encoded in the page).
    text = Regex.Replace(text, "&[a-z]+;", string.Empty);

    // Remove left guillemets, non-breaking spaces (written as an explicit escape so an
    // ordinary space can never be mistaken for it) and the stray "Â" that precedes a
    // mis-decoded non-breaking space.
    text = text.Replace("\u00AB", string.Empty)
               .Replace("\u00A0", string.Empty)
               .Replace("\u00C2", string.Empty);

    // Re-inserting missing spaces after punctuation marks is left disabled:
    // text = Regex.Replace(text, "([\\.\\?,;:])(\\w+)", "$1 $2");
    return text;
}
/// <summary>
/// Reduces an HTML fragment to plain text. Script elements and hyperlinks are removed
/// together with their content, <c>&lt;br&gt;</c> and closing <c>div</c>/<c>p</c>/<c>li</c>
/// tags become line breaks, every remaining tag is stripped, and runs of blank lines are
/// collapsed.
/// </summary>
/// <param name="input">The HTML fragment to clean.</param>
/// <returns>
/// The trimmed plain text, or an empty string when <paramref name="input"/> is null or empty.
/// </returns>
public static string RemoveHTMLTags(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    string text = input;

    // Remove script elements and everything inside them. The attribute list is optional
    // and the body may contain '<' or '>' (e.g. comparisons), so match lazily up to the
    // first closing tag and let '.' span line breaks.
    text = Regex.Replace(
        text,
        @"<script\b[^>]*>.*?</\s*script\s*>",
        string.Empty,
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // <br>, <br/> and <br /> become a line break.
    text = Regex.Replace(text, @"<\s*br\s*/*\s*>", Environment.NewLine, RegexOptions.IgnoreCase);

    // A closing div, p or li tag ends a line.
    text = Regex.Replace(text, @"<\s*/(div|p|li)\s*>", Environment.NewLine, RegexOptions.IgnoreCase);

    // NOTE(review): removes every ">=" sequence; kept from the original, purpose unclear.
    text = text.Replace(">=", string.Empty);

    // Remove hyperlinks including their text. The match is lazy so that text between two
    // links on the same line survives, and \b stops <abbr>, <article>, ... from being
    // taken for an anchor.
    text = Regex.Replace(text, @"<[Aa]\b[^<>]*>.*?</\s*[Aa]\s*>", string.Empty);

    // Strip every remaining tag, then the pipe characters.
    text = Regex.Replace(text, @"<[^<>]+>", string.Empty);
    text = text.Replace("|", string.Empty);

    // Replace multiple line breaks with a single one.
    text = Regex.Replace(text, @"(\r\n|\r|\n){2,}", Environment.NewLine);

    // Any annoying boilerplate text can be removed from the post text here, e.g.:
    // text = Regex.Replace(text, "Copyright (c) 2008 Saadia Malik", "");
    return text.Trim();
}