HTML Agility Pack strip tags NOT IN whitelist

c# html-agility-pack html-parsing sanitize tags

Question

I'm trying to create a function which removes html tags and attributes which are not in a white list. I have the following HTML:

<b>first text </b>
<b>second text here
       <a>some text here</a>
 <a>some text here</a>

 </b>
<a>some twxt here</a>

I am using HTML agility pack and the code I have so far is:

// Tag names that are allowed to stay in the markup; only "b" is listed here.
static List<string> WhiteNodeList = new List<string> { "b" };
// Attribute names that are allowed to stay; the list is empty, so no attribute name passes the check.
static List<string> WhiteAttrList = new List<string> { };
// NOTE(review): not referenced by any of the code shown in this snippet - confirm whether it is still needed.
static HtmlNode htmlNode;
/// <summary>
/// Attempts to strip non-whitelisted attributes and tags from the direct children of
/// <paramref name="pNode"/>, replacing each non-whitelisted child with the parsed form of
/// its inner HTML, and returns the resulting markup through <paramref name="_output"/>.
/// </summary>
/// <param name="_output">Receives <c>pNode.OuterHtml</c> after processing.</param>
/// <param name="pNode">Node whose direct children are processed; it is modified in place.</param>
/// <param name="pWhiteList">Tag names that may remain.</param>
/// <param name="attrWhiteList">Attribute names that may remain.</param>
public static void RemoveNotInWhiteList(out string _output, HtmlNode pNode, List<string> pWhiteList, List<string> attrWhiteList)
{

 // remove all attributes not on white list
 // Only the direct children of pNode are visited here, and attributes are stripped
 // whether or not the child's own tag is whitelisted.
 foreach (var item in pNode.ChildNodes)
 {
  // ToList() snapshots the matches so that removing attributes does not disturb the enumeration.
  item.Attributes.Where(u => attrWhiteList.Contains(u.Name) == false).ToList().ForEach(u => RemoveAttribute(u));

 }

 // remove all html and their innerText and attributes if not on whitelist.
 //pNode.ChildNodes.Where(u => pWhiteList.Contains(u.Name) == false).ToList().ForEach(u => u.Remove());
 //pNode.ChildNodes.Where(u => pWhiteList.Contains(u.Name) == false).ToList().ForEach(u => u.ParentNode.ReplaceChild(ConvertHtmlToNode(u.InnerHtml),u));
 //pNode.ChildNodes.Where(u => pWhiteList.Contains(u.Name) == false).ToList().ForEach(u => u.Remove());

 // Index-based loop because children are replaced while iterating.
 for (int i = 0; i < pNode.ChildNodes.Count; i++)
 {
  if (!pWhiteList.Contains(pNode.ChildNodes[i].Name))
  {
   // Swap the non-whitelisted child for a node parsed from its inner HTML, i.e. its content without the tag itself.
   HtmlNode _newNode = ConvertHtmlToNode(pNode.ChildNodes[i].InnerHtml);
   pNode.ChildNodes[i].ParentNode.ReplaceChild(_newNode, pNode.ChildNodes[i]);
   // Presumably pNode.ChildNodes[i] now refers to the replacement node - confirm against ReplaceChild's behaviour.
   // Descend only when that node has children and its text is not blank.
   if (pNode.ChildNodes[i].HasChildNodes && !string.IsNullOrEmpty(pNode.ChildNodes[i].InnerText.Trim().Replace("\r\n", "")))
   {
    // NOTE(review): outputNode1 is assigned but never read.
    HtmlNode outputNode1 = pNode.ChildNodes[i];
    for (int j = 0; j < pNode.ChildNodes[i].ChildNodes.Count; j++)
    {
     string _childNodeOutput;
     // The recursive call receives the same node (pNode.ChildNodes[i]) on every pass and uses the
     // static WhiteNodeList / WhiteAttrList rather than the pWhiteList / attrWhiteList parameters.
     RemoveNotInWhiteList(out _childNodeOutput,
          pNode.ChildNodes[i], WhiteNodeList, WhiteAttrList);
     pNode.ChildNodes[i].ReplaceChild(ConvertHtmlToNode(_childNodeOutput), pNode.ChildNodes[i].ChildNodes[j]);
     // NOTE(review): this advances the OUTER index i inside the inner j loop - looks unintended; confirm.
     i++;
    }
   }
  }
 }

 // Console.WriteLine(pNode.OuterHtml);
 _output = pNode.OuterHtml;
}  

/// <summary>
/// Lower-cases the attribute value, deletes every occurrence of "javascript" from it,
/// and then removes the attribute from its owner.
/// </summary>
/// <param name="u">The attribute to scrub and remove.</param>
private static void RemoveAttribute(HtmlAttribute u)
{
 string loweredValue = u.Value.ToLower();
 u.Value = loweredValue.Replace("javascript", "");

 u.Remove();
}

/// <summary>
/// Parses an HTML fragment into a node.
/// </summary>
/// <param name="html">The markup to parse.</param>
/// <returns>
/// The only top-level node when the fragment has exactly one; otherwise the
/// document node of the freshly parsed document.
/// </returns>
public static HtmlNode ConvertHtmlToNode(string html)
{
 var parsed = new HtmlAgilityPack.HtmlDocument();
 parsed.LoadHtml(html);

 HtmlNode root = parsed.DocumentNode;
 return root.ChildNodes.Count == 1 ? root.ChildNodes[0] : root;
}

The output I am trying to achieve is:

<b>first text </b>
<b>second text here
       some text here
 some text here

 </b>
some twxt here

That means that I only want to keep the <b> tags.
The reason I'm doing this is that some of the users copy-paste from MS Word into my WYSIWYG HTML editor.

Thanks.!

Accepted Answer

heh, apparently I ALMOST found an answer in a blog post someone made....

using System.Collections.Generic;
using System.Linq;
using HtmlAgilityPack;

namespace Wayloop.Blog.Core.Markup
{
    /// <summary>
    /// Whitelist-based HTML sanitizer. Elements whose tag is not whitelisted are removed
    /// together with their content; on whitelisted elements every attribute that is not
    /// explicitly allowed for that tag is removed.
    /// </summary>
    public static class HtmlSanitizer
    {
        // Maps an allowed tag name to the attribute names allowed on it.
        // A null value means "no attributes allowed".
        private static readonly IDictionary<string, string[]> Whitelist;

        static HtmlSanitizer()
        {
            Whitelist = new Dictionary<string, string[]> {
                { "a", new[] { "href" } },
                { "strong", null },
                { "em", null },
                { "blockquote", null },
                };
        }

        /// <summary>
        /// Sanitizes an HTML fragment against the whitelist.
        /// </summary>
        /// <param name="input">The markup to sanitize.</param>
        /// <returns>The sanitized markup with surrounding whitespace trimmed.</returns>
        public static string Sanitize(string input)
        {
            var htmlDocument = new HtmlDocument();

            htmlDocument.LoadHtml(input);
            SanitizeNode(htmlDocument.DocumentNode);

            return htmlDocument.DocumentNode.WriteTo().Trim();
        }

        // Sanitizes every child of parentNode. Iterates backwards because
        // SanitizeNode may remove the child it is given from the collection.
        private static void SanitizeChildren(HtmlNode parentNode)
        {
            for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--) {
                SanitizeNode(parentNode.ChildNodes[i]);
            }
        }

        // Removes the node if it is a non-whitelisted element; otherwise strips its
        // disallowed attributes and recurses into its children.
        private static void SanitizeNode(HtmlNode node)
        {
            if (node.NodeType == HtmlNodeType.Element) {
                string[] allowedAttributes;
                if (!Whitelist.TryGetValue(node.Name, out allowedAttributes)) {
                    node.ParentNode.RemoveChild(node);
                    return;
                }

                if (node.HasAttributes) {
                    // Backwards, because attributes are removed while iterating.
                    for (int i = node.Attributes.Count - 1; i >= 0; i--) {
                        HtmlAttribute currentAttribute = node.Attributes[i];

                        // A null entry means no attribute is allowed. Calling Contains on the
                        // null array would throw ArgumentNullException, so test for it first.
                        if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name)) {
                            node.Attributes.Remove(currentAttribute);
                        }
                    }
                }
            }

            if (node.HasChildNodes) {
                SanitizeChildren(node);
            }
        }
    }
}

I got HtmlSanitizer from here. Apparently it does not strip the tags, but removes the element altogether.

OK, here is the solution for those who will need it later.

/// <summary>
/// Whitelist-based HTML sanitizer. Elements whose tag is not whitelisted are unwrapped
/// (the tag is dropped, its content is kept); on whitelisted elements every attribute
/// that is not explicitly allowed for that tag is removed.
/// </summary>
public static class HtmlSanitizer
{
    // Placeholder tag name given to every non-whitelisted element so that a single, simple
    // XPath can find them all (original tag names, e.g. XML-namespaced ones, may not be
    // valid in an XPath expression). Kept all-lowercase so it matches regardless of how
    // the parser normalizes tag-name case.
    private const string RemovableNodeName = "removeablenode";
    private const string RemovableNodeXPath = "//" + RemovableNodeName;

    // Maps an allowed tag name to the attribute names allowed on it.
    // A null value means "no attributes allowed".
    private static readonly IDictionary<string, string[]> Whitelist;

    static HtmlSanitizer()
    {
        Whitelist = new Dictionary<string, string[]> {
            { "a", new[] { "href" } },
            { "strong", null },
            { "em", null },
            { "blockquote", null },
            { "b", null},
            { "p", null},
            { "ul", null},
            { "ol", null},
            { "li", null},
            { "div", new[] { "align" } },
            { "strike", null},
            { "u", null},
            { "sub", null},
            { "sup", null},
            { "table", null },
            { "tr", null },
            { "td", null },
            { "th", null }
            };
    }

    /// <summary>
    /// Sanitizes an HTML fragment against the whitelist.
    /// </summary>
    /// <param name="input">The markup to sanitize; null or blank input yields an empty string.</param>
    /// <returns>The sanitized markup.</returns>
    public static string Sanitize(string input)
    {
        if (string.IsNullOrWhiteSpace(input))
            return string.Empty;

        var htmlDocument = new HtmlDocument();

        htmlDocument.LoadHtml(input);
        SanitizeNode(htmlDocument.DocumentNode);

        // No state is shared between calls: the placeholder XPath is a constant, so one
        // call can neither leak into nor break another.
        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), RemovableNodeXPath);
    }

    // Sanitizes every child of parentNode, last to first.
    private static void SanitizeChildren(HtmlNode parentNode)
    {
        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i]);
        }
    }

    // Renames a non-whitelisted element to the placeholder tag, or strips the disallowed
    // attributes from a whitelisted one; then recurses into the children either way.
    private static void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            string[] allowedAttributes;
            if (!Whitelist.TryGetValue(node.Name, out allowedAttributes))
            {
                // Mark for unwrapping; the actual removal happens in StripHtml.
                node.Name = RemovableNodeName;

                if (node.HasChildNodes)
                {
                    SanitizeChildren(node);
                }

                return;
            }

            if (node.HasAttributes)
            {
                // Backwards, because attributes are removed while iterating.
                for (int i = node.Attributes.Count - 1; i >= 0; i--)
                {
                    HtmlAttribute currentAttribute = node.Attributes[i];

                    // A null entry means no attribute is allowed on this tag.
                    if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
                    {
                        node.Attributes.Remove(currentAttribute);
                    }
                }
            }
        }

        if (node.HasChildNodes)
        {
            SanitizeChildren(node);
        }
    }

    // Re-parses the markup and unwraps every node matched by xPath, keeping its children.
    private static string StripHtml(string html, string xPath)
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        if (!string.IsNullOrEmpty(xPath))
        {
            HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(xPath);

            // SelectNodes yields null, not an empty collection, when nothing matches.
            if (invalidNodes != null)
            {
                foreach (HtmlNode node in invalidNodes)
                {
                    // true = keep the grandchildren, i.e. drop only the tag itself.
                    node.ParentNode.RemoveChild(node, true);
                }
            }
        }

        return htmlDoc.DocumentNode.WriteContentTo();
    }
}

I renamed the node because if I had to parse an XML namespace node it would crash on the xpath parsing.


Popular Answer

Thanks for the code - great thing!!!!

I made a few optimizations...

/// <summary>
/// Walks a node tree, strips attributes that are not allowed on whitelisted elements,
/// and unwraps (removes the tag of, but keeps the children of) every element whose tag
/// is not in <c>Config.TagsWhiteList</c>.
/// </summary>
class TagSanitizer
{
    // Elements found during the walk that must be unwrapped once the walk is finished.
    List<HtmlNode> _deleteNodes = new List<HtmlNode>();

    /// <summary>Sanitizes <paramref name="node"/> and its descendants in place.</summary>
    public static void Sanitize(HtmlNode node)
    {
        var sanitizer = new TagSanitizer();
        sanitizer.Clean(node);
    }

    // Runs the recursive walk, then unwraps the collected elements from last found to first.
    void Clean(HtmlNode node)
    {
        CleanRecursive(node);

        int index = _deleteNodes.Count;
        while (--index >= 0)
        {
            HtmlNode doomed = _deleteNodes[index];
            // true = keep the grandchildren in place of the removed element.
            doomed.ParentNode.RemoveChild(doomed, true);
        }
    }

    // Queues non-whitelisted elements for removal, strips disallowed attributes from
    // whitelisted ones, and recurses into all children.
    void CleanRecursive(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            if (!Config.TagsWhiteList.ContainsKey(node.Name))
            {
                _deleteNodes.Add(node);
            }
            else if (node.HasAttributes)
            {
                string[] allowedAttributes = Config.TagsWhiteList[node.Name];

                // Walk backwards because attributes are removed during the walk.
                int position = node.Attributes.Count;
                while (--position >= 0)
                {
                    HtmlAttribute candidate = node.Attributes[position];

                    // A null entry means no attribute is allowed on this tag.
                    bool isAllowed = allowedAttributes != null
                                     && allowedAttributes.Contains(candidate.Name);
                    if (!isAllowed)
                    {
                        node.Attributes.Remove(candidate);
                    }
                }
            }
        }

        if (!node.HasChildNodes)
        {
            return;
        }

        // Iterate over a snapshot of the children.
        foreach (HtmlNode child in node.ChildNodes.ToList())
        {
            CleanRecursive(child);
        }
    }
}



Licensed under: CC-BY-SA with attribution
Not affiliated with Stack Overflow
Is this KB legal? Yes, learn why
Licensed under: CC-BY-SA with attribution
Not affiliated with Stack Overflow
Is this KB legal? Yes, learn why