17

I'm using the following method to extract text from HTML:

    /// <summary>
    /// Extracts the text of an HTML string, one line per leaf node, skipping
    /// everything that lives inside <c>script</c> or <c>style</c> elements.
    /// </summary>
    /// <param name="_html">The HTML markup to parse.</param>
    /// <returns>
    /// The collected text after HTML-decoding, or an empty string when parsing
    /// or traversal throws.
    /// </returns>
    public string getAllText(string _html)
    {
        string _allText = "";
        try
        {
            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
            document.LoadHtml(_html);

            var sb = new StringBuilder();
            foreach (var node in document.DocumentNode.DescendantNodesAndSelf())
            {
                // Only leaf nodes carry their own text; containers would repeat it.
                if (node.HasChildNodes)
                {
                    continue;
                }

                // Walk up from the leaf: script/style bodies are code, not page text.
                bool insideScriptOrStyle = false;
                for (var current = node; current != null; current = current.ParentNode)
                {
                    if (string.Equals(current.Name, "script", StringComparison.OrdinalIgnoreCase) ||
                        string.Equals(current.Name, "style", StringComparison.OrdinalIgnoreCase))
                    {
                        insideScriptOrStyle = true;
                        break;
                    }
                }

                if (insideScriptOrStyle)
                {
                    continue;
                }

                string text = node.InnerText;
                if (!string.IsNullOrEmpty(text))
                {
                    sb.AppendLine(text.Trim());
                }
            }

            _allText = sb.ToString();
        }
        catch (Exception)
        {
            // Deliberate best effort: on any failure the result stays empty
            // instead of propagating the error to the caller.
        }

        // Entities such as &amp; are decoded once, over the whole result.
        _allText = System.Web.HttpUtility.HtmlDecode(_allText);

        return _allText;
    }

The problem is that I also get the script and style tags.

How can I exclude them?

Jacqueline
  • 481
  • 2
  • 11
  • 20
  • What about an inline style, i.e. `<div style="...">`? I see it in OuterHtml but would like to strip out all inline styles too.

    – Jeremy Jun 03 '16 at 17:51
  • 1
    `if (childNode.Attributes.Contains("style")) { childNode.Attributes.Remove("style"); } if (childNode.Attributes.Contains("class")) { childNode.Attributes.Remove("class"); }` – Jeremy Jun 03 '16 at 18:48

4 Answers4

58
// Parse the markup into a DOM.
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);

// Snapshot the script/style elements first, so the removals below do not
// happen while the descendant sequence is still being enumerated.
var unwantedNodes = doc.DocumentNode
    .Descendants()
    .Where(candidate => candidate.Name == "script" || candidate.Name == "style")
    .ToList();

foreach (var unwanted in unwantedNodes)
{
    unwanted.Remove();
}
L.B
  • 114,136
  • 19
  • 178
  • 224
7

You can do so using the HtmlDocument class:

HtmlDocument doc = new HtmlDocument();

doc.LoadHtml(input);

// SelectNodes returns null (not an empty collection) when the XPath matches
// nothing, so guard before enumerating.
var unwantedNodes = doc.DocumentNode.SelectNodes("//style|//script");
if (unwantedNodes != null)
{
    foreach (var node in unwantedNodes.ToList())
    {
        node.Remove();
    }
}
johnw86
  • 121
  • 1
  • 6
  • Shouldn't it be `doc.DocumentNode.SelectNodes("//style|//script").ToList().ForEach(n => n.Remove());`? – MonkeyDreamzzz Nov 22 '17 at 11:04
  • @Rubanov Yeah it should be, I had an extension method so I didn't require the .ToList in my code. Answer updated, thanks. – johnw86 Jan 29 '18 at 14:18
2

Some excellent answers, System.Linq is handy!

For a non-LINQ-based approach:

/// <summary>
/// Removes every <c>script</c> element from the given document in place.
/// </summary>
/// <param name="webDocument">The parsed document to clean.</param>
/// <returns>The same document instance that was passed in.</returns>
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
    HtmlAgilityPack.HtmlNodeCollection scriptNodes = webDocument.DocumentNode.SelectNodes("//script");

    // The result is null-checked because there may be nothing to remove.
    if (scriptNodes != null)
    {
        foreach (var scriptNode in scriptNodes)
        {
            scriptNode.Remove();
        }
    }

    return webDocument;
}
Rusty Nail
  • 2,692
  • 3
  • 34
  • 55
0
/// <summary>
/// Parses the HTML and removes every attribute named <c>style</c> or
/// <c>script</c> from all nodes, then returns the resulting markup.
/// </summary>
/// <param name="html">The HTML markup to clean.</param>
/// <returns>The outer HTML of the document after the attributes are removed.</returns>
public static string StripStyles(this string html)
{
    var parsed = new HtmlDocument();
    parsed.LoadHtml(html);

    // Collect the matching attributes up front so no attribute collection
    // is modified while it is being enumerated.
    var attributesToDrop = parsed.DocumentNode
        .DescendantsAndSelf()
        .SelectMany(element => element.Attributes
            .Where(attr => attr.Name == "style" || attr.Name == "script"))
        .ToList();

    attributesToDrop.ForEach(attr => attr.Remove());

    return parsed.DocumentNode.OuterHtml;
}
Simon
  • 33,714
  • 21
  • 133
  • 202