I've seen a few related questions out here, but they don’t exactly talk about the same problem I am facing.
I want to use the HTML Agility Pack to remove unwanted tags from my HTML without losing the content within the tags.
So for instance, in my scenario, I would like to preserve the tags "b
", "i
" and "u
".
And for an input like:
<p>my paragraph <div>and my <b>div</b></div> are <i>italic</i> and <b>bold</b></p>
The resulting HTML should be:
my paragraph and my <b>div</b> are <i>italic</i> and <b>bold</b>
I tried using HtmlNode
's Remove
method, but it removes my content too. Any suggestions?
I wrote an algorithm based on Oded's suggestions. Here it is. Works like a charm.
It removes all tags except strong
, em
, u
and raw text nodes.
internal static string RemoveUnwantedTags(string data)
{
if(string.IsNullOrEmpty(data)) return string.Empty;
var document = new HtmlDocument();
document.LoadHtml(data);
var acceptableTags = new String[] { "strong", "em", "u"};
var nodes = new Queue<HtmlNode>(document.DocumentNode.SelectNodes("./*|./text()"));
while(nodes.Count > 0)
{
var node = nodes.Dequeue();
var parentNode = node.ParentNode;
if(!acceptableTags.Contains(node.Name) && node.Name != "#text")
{
var childNodes = node.SelectNodes("./*|./text()");
if (childNodes != null)
{
foreach (var child in childNodes)
{
nodes.Enqueue(child);
parentNode.InsertBefore(child, node);
}
}
parentNode.RemoveChild(node);
}
}
return document.DocumentNode.InnerHtml;
}
I took @mathias answer and improved his extension method so that you can supply a list of tags to exclude as a List<string>
(e.g. {"a","p","hr"}
). I also fixed the logic so that it works recursively properly:
public static string RemoveUnwantedHtmlTags(this string html, List<string> unwantedTags)
{
if (String.IsNullOrEmpty(html))
{
return html;
}
var document = new HtmlDocument();
document.LoadHtml(html);
HtmlNodeCollection tryGetNodes = document.DocumentNode.SelectNodes("./*|./text()");
if (tryGetNodes == null || !tryGetNodes.Any())
{
return html;
}
var nodes = new Queue<HtmlNode>(tryGetNodes);
while (nodes.Count > 0)
{
var node = nodes.Dequeue();
var parentNode = node.ParentNode;
var childNodes = node.SelectNodes("./*|./text()");
if (childNodes != null)
{
foreach (var child in childNodes)
{
nodes.Enqueue(child);
}
}
if (unwantedTags.Any(tag => tag == node.Name))
{
if (childNodes != null)
{
foreach (var child in childNodes)
{
parentNode.InsertBefore(child, node);
}
}
parentNode.RemoveChild(node);
}
}
return document.DocumentNode.InnerHtml;
}