1

i want to convert html to plain text using C# so far i have implemented this code :->

    public static string HTMLToText(string HTMLCode)
    {
        if (HTMLCode == null)
            return null;
        // Remove new lines since they are not visible in HTML
        HTMLCode = HTMLCode.Replace("\n", " ");

        // Remove tab spaces
        HTMLCode = HTMLCode.Replace("\t", " ");

        // Remove multiple white spaces from HTML
        HTMLCode = System.Text.RegularExpressions.Regex.Replace(HTMLCode, "\\s+", " ");

        // Remove HEAD tag
        HTMLCode = System.Text.RegularExpressions.Regex.Replace(HTMLCode, "<head.*?</head>", ""
            , System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Singleline);

        // Remove any JavaScript
        HTMLCode = System.Text.RegularExpressions.Regex.Replace(HTMLCode, "<script.*?</script>", ""
            , System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Singleline);

        // Replace special characters like &, <, >, " etc.
        StringBuilder sbHTML = new StringBuilder(HTMLCode);
        // Note: There are many more special characters, these are just
        // most common. You can add new characters in this arrays if needed
        string[] OldWords = { "&nbsp;", "&amp;", "&quot;", "&lt;", "&gt;", "&reg;", "&copy;", "&bull;", "&trade;" };
        string[] NewWords = { " ", "&", "\"", "<", ">", "®", "©", "•", "™" };
        for (int i = 0; i < OldWords.Length; i++)
        {
            sbHTML.Replace(OldWords[i], NewWords[i]);
        }

        // Check if there are line breaks (<br>) or paragraph (<p>)
        sbHTML.Replace("<br>", "\n");
        sbHTML.Replace("<br ", "\n ");
        sbHTML.Replace("<p> ", "\n ");
        sbHTML.Replace("<p ", "\n ");
        sbHTML.Replace("<span ", "\n ");
        sbHTML.Replace("style", " ");
        sbHTML.Replace("FONT", " ");
        sbHTML.Replace("FONT-SIZE", " ");
        sbHTML.Replace("</span> ", "\n");

        // Finally, remove all HTML tags and return plain text
        //Regex reg = new Regex("<[^>]+>", RegexOptions.IgnoreCase); 

        return System.Text.RegularExpressions.Regex.Replace(sbHTML.ToString(), @"</?[a-z][a-z0-9][^<>]>", "", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    }

But its not removing all the tags and also the recursive tags like <p>.....<p>...</p>..</p> so outer <p> is removed but inner tag is still their...

Is their any way i could get pure plain text ??

Uwe Keim
  • 39,551
  • 56
  • 175
  • 291
user2530619
  • 75
  • 2
  • 8
  • 6
    Look at using an HTML parser. Regex, despite popular belief, is **not** an HTML parser. – Brad Christie Jun 28 '13 at 13:16
  • 11
    I would consider using HTMLAgility pack. – Daniel A. White Jun 28 '13 at 13:16
  • I had a [similar question](http://stackoverflow.com/questions/8419517) and also [came up with an answer](http://pastebin.com/NswerNkQ). – Uwe Keim Jun 28 '13 at 13:19
  • 1
    another simple way you can use webbrowser control first navigate webbrowser to html code and when page load completed you can get plain text using body.innertext – KF2 Jun 28 '13 at 13:25

0 Answers0