I tried to convert html to plain text with the following function but still getting error while converting.
private static string HtmlToPlainText(string html)
{
const string tagWhiteSpace = @"(>|$)(\W|\n|\r)+<";//matches one or more (white space or line breaks) between '>' and '<'
const string stripFormatting = @"<[^>]*(>|$)";//match any character between '<' and '>', even when end tag is missing
const string lineBreak = @"<(br|BR)\s{0,1}\/{0,1}>";//matches: <br>,<br/>,<br />,<BR>,<BR/>,<BR />
var lineBreakRegex = new Regex(lineBreak, RegexOptions.Multiline);
var stripFormattingRegex = new Regex(stripFormatting, RegexOptions.Multiline);
var tagWhiteSpaceRegex = new Regex(tagWhiteSpace, RegexOptions.Multiline);
var text = html;
//Decode html specific characters
text = System.Net.WebUtility.HtmlDecode(text);
//Remove tag whitespace/line breaks
text = tagWhiteSpaceRegex.Replace(text, "><");
//Replace <br /> with line breaks
text = lineBreakRegex.Replace(text, Environment.NewLine);
//Strip formatting
text = stripFormattingRegex.Replace(text, string.Empty);
text = text.Replace(">", "");
return text;
}
when I tried to debug the code it display \r and \r\n also in plain text output.This function is not properly convert the html to plain text. Can anyone suggest me any other conversion function?
Thanks