0

I'm validating HTML input (from an RSS feed) that is to be displayed in an MVC view.

I'm using the following whitelist approach to sanitise my HTML:

// Finds anything that looks like a tag: a '<', any run of characters other than '>',
// then either the closing '>' or the end of the input (so an unterminated tag is matched too).
private static Regex _tags = new Regex("<[^>]*(>|$)",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);
// Accepts attribute-free opening/closing tags for the listed element names, plus <br>/<hr>
// with an optional whitespace character and optional self-closing slash.
// No IgnoreCase option is set, so the candidate must already be lower-case.
private static Regex _whitelist = new Regex(@"
^</?(b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3)|i|kbd|u|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$|
^<(b|h)r\s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
// Accepts an <a> tag whose href is either "#" followed by digits or an http/https/ftp URL,
// optionally followed by a title attribute; also accepts the closing </a>.
private static Regex _whitelist_a = new Regex(@"
^<a\s
href=""(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
(\stitle=""[^""<>]+"")?\s?>$|
^</a>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
// Accepts an <img> tag with an http/https src and, in this fixed order, optional
// width, height (1-3 digits each), alt and title attributes, optionally self-closed.
private static Regex _whitelist_img = new Regex(@"
^<img\s
src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
(\swidth=""\d{1,3}"")?
(\sheight=""\d{1,3}"")?
(\salt=""[^""<>]*"")?
(\stitle=""[^""<>]*"")?
\s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);


/// <summary>
/// Removes every tag from the raw HTML input that is not accepted by one of the
/// whitelist patterns (plain tags, anchors, images); accepted tags and all text
/// outside tags are left untouched. Null or empty input is returned as-is.
/// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937
/// </summary>
public static string Sanitize(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // Collect every tag-like span once, against the unmodified input.
    MatchCollection candidates = _tags.Matches(html);

    // Walk from the last match to the first so that removing a span never
    // shifts the indexes of the matches still to be processed.
    int position = candidates.Count;
    while (--position >= 0)
    {
        Match candidate = candidates[position];

        // The whitelist patterns are case-sensitive, so compare a lower-cased copy.
        string normalised = candidate.Value.ToLowerInvariant();

        bool permitted = _whitelist.IsMatch(normalised)
            || _whitelist_a.IsMatch(normalised)
            || _whitelist_img.IsMatch(normalised);
        if (permitted)
        {
            continue;
        }

        html = html.Remove(candidate.Index, candidate.Length);
    }

    return html;
}

I'd also like to allow video content from YouTube or Vimeo to be displayed using iframes or the HTML5 video tag.

Can anyone point me in the right direction for a regex that's a bit more flexible?

Here's my attempt for the iframe:

// Accepts an <iframe> tag whose src is an http/https URL on player.vimeo.com or
// www.youtube.com, followed (in this fixed order) by optional width, height,
// frameborder and allowfullscreen attributes; also accepts the closing </iframe>.
// The dots in the host names are escaped so they match a literal '.' only; left
// unescaped they match any character and let look-alike hosts through.
// No IgnoreCase option is set, so the candidate must already be lower-case.
private static readonly Regex _whitelist_iframe = new Regex(@"
             ^<iframe\s
            src=""https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+""
            (\swidth=""\d{1,3}"")?
            (\sheight=""\d{1,3}"")?
            (\sframeborder=""\d{1,3}"")?
            (\sallowfullscreen)?
            \s?>$|^</iframe>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
Axe
  • 764
  • 6
  • 29
  • 1
    This has been mentioned many times (this being the best http://stackoverflow.com/a/1732454/1191903) - don't use Regex for parsing HTML tags. – Kevin Main May 17 '12 at 12:40

1 Answers1

1

The regex approach above was too strict, not to mention Kevin's well-made point!

Here's what I did:

I used the Html Agility Pack to parse the HTML, and sanitised it as described in this Stack Overflow answer.

I also added some code to check the src attribute of images and iframes against a regex. (I'm pretty sure it could be done better.)

/// <summary>
/// Whitelist-based HTML sanitizer built on the Html Agility Pack parser.
/// Elements that are not whitelisted are unwrapped (the element is dropped, its
/// children are kept), attributes that are not whitelisted for their element are
/// removed, and <c>img</c>/<c>iframe</c> elements whose <c>src</c> fails the
/// corresponding URL pattern are dropped.
/// </summary>
/// <remarks>
/// NOTE(review): only <c>src</c> values are validated. <c>href</c> values on
/// <c>a</c> elements are passed through unchecked (so a <c>javascript:</c> URL
/// would survive), and non-element nodes such as comments are written back
/// unchanged - confirm whether callers need these restricted as well.
/// </remarks>
public class HtmlSanitizer
{
    // src values accepted on <img>: an absolute http/https URL made of the listed characters.
    private static readonly Regex _srcAttribute = new Regex(@"^https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
                         | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // src values accepted on <iframe>: http/https URLs on the two permitted video hosts.
    // Anchored with ^...$ so the whole value must match (an unanchored pattern accepts any
    // string that merely contains an allowed URL), and the host dots are escaped so they
    // match a literal '.' rather than any character.
    private static readonly Regex _iframeSrc = new Regex(@"^https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
                         | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Element name -> attribute names allowed on it; a null array means "no attributes allowed".
    private readonly IDictionary<string, string[]> _whitelist;

    /// <summary>
    /// Creates a sanitizer with the built-in element/attribute whitelist.
    /// </summary>
    public HtmlSanitizer()
    {
        _whitelist = new Dictionary<string, string[]>
                        {
                            {"a", new[] {"href", "target", "title"}},
                            {"img", new[] {"src", "alt", "width", "height"}},
                            {"iframe", new[] {"src", "width", "height", "frameborder", "allowfullscreen" }},
                            {"strong", null},
                            {"em", null},
                            {"blockquote", null},
                            {"b", null},
                            {"p", null},
                            {"ul", null},
                            {"ol", null},
                            {"li", null},
                            {"div", new[] {"align"}},
                            {"strike", null},
                            {"u", null},
                            {"sub", null},
                            {"sup", null},
                            {"table", null},
                            {"tr", null},
                            {"td", null},
                            {"th", null},
                            {"dd", null},
                            {"dt", null},
                            {"dl", null},
                            {"h1", null},
                            {"h2", null},
                            {"h3", null},
                        };
    }

    /// <summary>
    /// Sanitizes the supplied HTML fragment against the whitelist.
    /// </summary>
    /// <param name="input">Raw HTML; may be null, empty or whitespace.</param>
    /// <returns>
    /// The sanitized markup, or <see cref="string.Empty"/> when <paramref name="input"/>
    /// is null, empty or consists only of whitespace.
    /// </returns>
    public string Sanitize(string input)
    {
        // Also covers null, which previously failed on input.Trim().
        if (string.IsNullOrWhiteSpace(input))
        {
            return string.Empty;
        }

        // Names of the placeholder elements created during this call. Kept local so that
        // nothing is carried over from one Sanitize call to the next.
        var removableNames = new HashSet<string>();

        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(input);
        SanitizeNode(htmlDocument.DocumentNode, removableNames);

        string xPath = CreateXPath(removableNames);
        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), xPath);
    }

    // Sanitizes every child of parentNode, last child first.
    private void SanitizeChildren(HtmlNode parentNode, HashSet<string> removableNames)
    {
        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i], removableNames);
        }
    }

    // Applies the whitelist to a single node and then recurses into its children.
    // Only element nodes are inspected; other node types are just traversed.
    private void SanitizeNode(HtmlNode node, HashSet<string> removableNames)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            string[] allowedAttributes;
            if (!_whitelist.TryGetValue(node.Name, out allowedAttributes))
            {
                // Not whitelisted: tag the element for removal. Its attributes are left
                // alone because the element itself is dropped in StripHtml.
                MarkForRemoval(node, removableNames);
            }
            else if (node.HasAttributes)
            {
                SanitizeAttributes(node, allowedAttributes, removableNames);
            }
        }

        if (node.HasChildNodes)
        {
            SanitizeChildren(node, removableNames);
        }
    }

    // Removes attributes that are not whitelisted for the element and validates the src of
    // <img>/<iframe>; an element with an unacceptable src is tagged for removal.
    private static void SanitizeAttributes(HtmlNode node, string[] allowedAttributes, HashSet<string> removableNames)
    {
        // Capture the tag name up front: MarkForRemoval renames the node, and looking the
        // new name up in the whitelist is what used to throw KeyNotFoundException.
        string tagName = node.Name;

        for (int i = node.Attributes.Count - 1; i >= 0; i--)
        {
            HtmlAttribute currentAttribute = node.Attributes[i];

            if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
            {
                node.Attributes.Remove(currentAttribute);
                continue;
            }

            if (currentAttribute.Name != "src")
            {
                continue;
            }

            Regex srcPattern = null;
            if (tagName == "img")
            {
                srcPattern = _srcAttribute;
            }
            else if (tagName == "iframe")
            {
                srcPattern = _iframeSrc;
            }

            if (srcPattern != null && !srcPattern.IsMatch(currentAttribute.Value))
            {
                // The whole element is going to be dropped, so there is no point in
                // cleaning its remaining attributes.
                MarkForRemoval(node, removableNames);
                return;
            }
        }
    }

    // Renames the node to a placeholder element name and records that name (as the parser
    // reports it back) so the placeholder elements can be located again via XPath.
    private static void MarkForRemoval(HtmlNode node, HashSet<string> removableNames)
    {
        node.Name = "removeableNode";
        removableNames.Add(node.Name);
    }

    // Re-parses the serialized markup and unwraps every element selected by xPath:
    // the element is removed while its children are kept (RemoveChild(node, true)).
    private static string StripHtml(string html, string xPath)
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);
        if (xPath.Length > 0)
        {
            // Guard against a null result (Html Agility Pack documents SelectNodes as
            // returning null when nothing matches) instead of enumerating it blindly.
            HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(xPath);
            if (invalidNodes != null)
            {
                foreach (HtmlNode node in invalidNodes)
                {
                    node.ParentNode.RemoveChild(node, true);
                }
            }
        }
        return htmlDoc.DocumentNode.WriteContentTo();
    }

    // Builds "//name1|//name2|..." from the recorded placeholder names;
    // yields an empty string when nothing was tagged for removal.
    private static string CreateXPath(IEnumerable<string> removableNames)
    {
        return string.Join("|", removableNames.Select(name => "//" + name));
    }
}
Community
  • 1
  • 1
Axe
  • 764
  • 6
  • 29