HTML-Agility-Pack-Strip-Tags NICHT IN der Whitelist

c# html-agility-pack html-parsing sanitize tags

Frage

Ich versuche, eine Funktion zu erstellen, die HTML-Tags und Attribute entfernt, die nicht in einer Whitelist sind. Ich habe folgendes HTML:

<b>first text </b>
<b>second text here
       <a>some text here</a>
 <a>some text here</a>

 </b>
<a>some twxt here</a>

Ich benutze HTML Agility Pack und der Code, den ich bisher habe ist:

static List<string> WhiteNodeList = new List<string> { "b" };
static List<string> WhiteAttrList = new List<string> { };
static HtmlNode htmlNode;
public static void RemoveNotInWhiteList(out string _output, HtmlNode pNode, List<string> pWhiteList, List<string> attrWhiteList)
{

 // remove all attributes not on white list
 foreach (var item in pNode.ChildNodes)
 {
  item.Attributes.Where(u => attrWhiteList.Contains(u.Name) == false).ToList().ForEach(u => RemoveAttribute(u));

 }

 // remove all html and their innerText and attributes if not on whitelist.
 //pNode.ChildNodes.Where(u => pWhiteList.Contains(u.Name) == false).ToList().ForEach(u => u.Remove());
 //pNode.ChildNodes.Where(u => pWhiteList.Contains(u.Name) == false).ToList().ForEach(u => u.ParentNode.ReplaceChild(ConvertHtmlToNode(u.InnerHtml),u));
 //pNode.ChildNodes.Where(u => pWhiteList.Contains(u.Name) == false).ToList().ForEach(u => u.Remove());

 for (int i = 0; i < pNode.ChildNodes.Count; i++)
 {
  if (!pWhiteList.Contains(pNode.ChildNodes[i].Name))
  {
   HtmlNode _newNode = ConvertHtmlToNode(pNode.ChildNodes[i].InnerHtml);
   pNode.ChildNodes[i].ParentNode.ReplaceChild(_newNode, pNode.ChildNodes[i]);
   if (pNode.ChildNodes[i].HasChildNodes && !string.IsNullOrEmpty(pNode.ChildNodes[i].InnerText.Trim().Replace("\r\n", "")))
   {
    HtmlNode outputNode1 = pNode.ChildNodes[i];
    for (int j = 0; j < pNode.ChildNodes[i].ChildNodes.Count; j++)
    {
     string _childNodeOutput;
     RemoveNotInWhiteList(out _childNodeOutput,
          pNode.ChildNodes[i], WhiteNodeList, WhiteAttrList);
     pNode.ChildNodes[i].ReplaceChild(ConvertHtmlToNode(_childNodeOutput), pNode.ChildNodes[i].ChildNodes[j]);
     i++;
    }
   }
  }
 }

 // Console.WriteLine(pNode.OuterHtml);
 _output = pNode.OuterHtml;
}  

private static void RemoveAttribute(HtmlAttribute u)
{
 u.Value = u.Value.ToLower().Replace("javascript", "");
 u.Remove();

}

public static HtmlNode ConvertHtmlToNode(string html)
{
 HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
 doc.LoadHtml(html);
 if (doc.DocumentNode.ChildNodes.Count == 1)
  return doc.DocumentNode.ChildNodes[0];
 else return doc.DocumentNode;
}

Die Leistung, die ich versuche zu erreichen, ist

<b>first text </b>
<b>second text here
       some text here
 some text here

 </b>
some twxt here

Das bedeutet, dass ich nur die <b> -Tags behalten möchte.
Der Grund, warum ich das tue, ist, weil einige der Benutzer cpoy-einfügen von MS Word in Ny WYSYWYG HTML-Editor.

Vielen Dank.!

Akzeptierte Antwort

heh, anscheinend habe ich FAST eine antwort gefunden in einem blog-post jemand gemacht ....

using System.Collections.Generic;
using System.Linq;
using HtmlAgilityPack;

namespace Wayloop.Blog.Core.Markup
{
    public static class HtmlSanitizer
    {
        private static readonly IDictionary<string, string[]> Whitelist;

        static HtmlSanitizer()
        {
            Whitelist = new Dictionary<string, string[]> {
                { "a", new[] { "href" } },
                { "strong", null },
                { "em", null },
                { "blockquote", null },
                };
        }

        public static string Sanitize(string input)
        {
            var htmlDocument = new HtmlDocument();

            htmlDocument.LoadHtml(input);
            SanitizeNode(htmlDocument.DocumentNode);

            return htmlDocument.DocumentNode.WriteTo().Trim();
        }

        private static void SanitizeChildren(HtmlNode parentNode)
        {
            for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--) {
                SanitizeNode(parentNode.ChildNodes[i]);
            }
        }

        private static void SanitizeNode(HtmlNode node)
        {
            if (node.NodeType == HtmlNodeType.Element) {
                if (!Whitelist.ContainsKey(node.Name)) {
                    node.ParentNode.RemoveChild(node);
                    return;
                }

                if (node.HasAttributes) {
                    for (int i = node.Attributes.Count - 1; i >= 0; i--) {
                        HtmlAttribute currentAttribute = node.Attributes[i];
                        string[] allowedAttributes = Whitelist[node.Name];
                        if (!allowedAttributes.Contains(currentAttribute.Name)) {
                            node.Attributes.Remove(currentAttribute);
                        }
                    }
                }
            }

            if (node.HasChildNodes) {
                SanitizeChildren(node);
            }
        }
    }
}

Ich habe HtmlSanitizer von hier Anscheinend entfernt es nicht th Tags, aber entfernt das Element altoghether.

OK, hier ist die Lösung für diejenigen, die es später brauchen werden.

public static class HtmlSanitizer
    {
        private static readonly IDictionary<string, string[]> Whitelist;
        private static List<string> DeletableNodesXpath = new List<string>();

        static HtmlSanitizer()
        {
            Whitelist = new Dictionary<string, string[]> {
                { "a", new[] { "href" } },
                { "strong", null },
                { "em", null },
                { "blockquote", null },
                { "b", null},
                { "p", null},
                { "ul", null},
                { "ol", null},
                { "li", null},
                { "div", new[] { "align" } },
                { "strike", null},
                { "u", null},                
                { "sub", null},
                { "sup", null},
                { "table", null },
                { "tr", null },
                { "td", null },
                { "th", null }
                };
        }

        public static string Sanitize(string input)
        {
            if (input.Trim().Length < 1)
                return string.Empty;
            var htmlDocument = new HtmlDocument();

            htmlDocument.LoadHtml(input);            
            SanitizeNode(htmlDocument.DocumentNode);
            string xPath = HtmlSanitizer.CreateXPath();

            return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), xPath);
        }

        private static void SanitizeChildren(HtmlNode parentNode)
        {
            for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
            {
                SanitizeNode(parentNode.ChildNodes[i]);
            }
        }

        private static void SanitizeNode(HtmlNode node)
        {
            if (node.NodeType == HtmlNodeType.Element)
            {
                if (!Whitelist.ContainsKey(node.Name))
                {
                    if (!DeletableNodesXpath.Contains(node.Name))
                    {                       
                        //DeletableNodesXpath.Add(node.Name.Replace("?",""));
                        node.Name = "removeableNode";
                        DeletableNodesXpath.Add(node.Name);
                    }
                    if (node.HasChildNodes)
                    {
                        SanitizeChildren(node);
                    }                  

                    return;
                }

                if (node.HasAttributes)
                {
                    for (int i = node.Attributes.Count - 1; i >= 0; i--)
                    {
                        HtmlAttribute currentAttribute = node.Attributes[i];
                        string[] allowedAttributes = Whitelist[node.Name];
                        if (allowedAttributes != null)
                        {
                            if (!allowedAttributes.Contains(currentAttribute.Name))
                            {
                                node.Attributes.Remove(currentAttribute);
                            }
                        }
                        else
                        {
                            node.Attributes.Remove(currentAttribute);
                        }
                    }
                }
            }

            if (node.HasChildNodes)
            {
                SanitizeChildren(node);
            }
        }

        private static string StripHtml(string html, string xPath)
        {
            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(html);
            if (xPath.Length > 0)
            {
                HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(@xPath);
                foreach (HtmlNode node in invalidNodes)
                {
                    node.ParentNode.RemoveChild(node, true);
                }
            }
            return htmlDoc.DocumentNode.WriteContentTo(); ;
        }

        private static string CreateXPath()
        {
            string _xPath = string.Empty;
            for (int i = 0; i < DeletableNodesXpath.Count; i++)
            {
                if (i != DeletableNodesXpath.Count - 1)
                {
                    _xPath += string.Format("//{0}|", DeletableNodesXpath[i].ToString());
                }
                else _xPath += string.Format("//{0}", DeletableNodesXpath[i].ToString());
            }
            return _xPath;
        }
    }

Ich habe den Knoten umbenannt, denn wenn ich einen XML-Namespace-Knoten analysieren müsste, würde er beim Xpath-Parsing abstürzen.


Beliebte Antwort

Danke für den Code - tolle Sache !!!!

Ich habe ein paar Optimierungen gemacht ...

class TagSanitizer
{
    List<HtmlNode> _deleteNodes = new List<HtmlNode>();

    public static void Sanitize(HtmlNode node)
    {
        new TagSanitizer().Clean(node);
    }

    void Clean(HtmlNode node)
    {
        CleanRecursive(node);
        for (int i = _deleteNodes.Count - 1; i >= 0; i--)
        {
            HtmlNode nodeToDelete = _deleteNodes[i];
            nodeToDelete.ParentNode.RemoveChild(nodeToDelete, true);
        }
    }

    void CleanRecursive(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            if (Config.TagsWhiteList.ContainsKey(node.Name) == false)
            {
                _deleteNodes.Add(node);
            }
            else if (node.HasAttributes)
            {
                for (int i = node.Attributes.Count - 1; i >= 0; i--)
                {
                    HtmlAttribute currentAttribute = node.Attributes[i];

                    string[] allowedAttributes = Config.TagsWhiteList[node.Name];
                    if (allowedAttributes != null)
                    {
                        if (allowedAttributes.Contains(currentAttribute.Name) == false)
                        {
                            node.Attributes.Remove(currentAttribute);
                        }
                    }
                    else
                    {
                        node.Attributes.Remove(currentAttribute);
                    }
                }
            }
        }

        if (node.HasChildNodes)
        {
            node.ChildNodes.ToList().ForEach(v => CleanRecursive(v));
        }
    }
}


Related

Lizenziert unter: CC-BY-SA with attribution
Nicht verbunden mit Stack Overflow
Lizenziert unter: CC-BY-SA with attribution
Nicht verbunden mit Stack Overflow