Stripping di tutti i tag html con Html Agility Pack

html html-agility-pack strip vb.net

Domanda

Ho una stringa html come questa:

<html><body><p>foo <a href='http://www.example.com'>bar</a> baz</p></body></html>

Desidero rimuovere tutti i tag html in modo che la stringa risultante diventi:

foo bar baz

Da un altro post qui su SO ho trovato questa funzione (che utilizza la libreria Html Agility Pack):

  ' Attempts to strip markup from an HTML string: parses it with HtmlAgilityPack,
  ' selects every html/body/p/a element via a hard-coded XPath, removes each one
  ' through its parent, and returns the document node's serialized content.
  ' Tags other than html, body, p and a are not selected by the XPath below.
  Public Shared Function stripTags(ByVal html As String) As String
    ' NOTE(review): 'plain' is declared but never read or assigned again in this function.
    Dim plain As String = String.Empty
    Dim htmldoc As New HtmlAgilityPack.HtmlDocument

    htmldoc.LoadHtml(html)
    Dim invalidNodes As HtmlAgilityPack.HtmlNodeCollection = htmldoc.DocumentNode.SelectNodes("//html|//body|//p|//a")

    ' NOTE(review): this guard tests htmldoc, which was constructed above and so is
    ' never Nothing here; invalidNodes itself is not checked before the loop.
    ' SelectNodes presumably returns Nothing when no node matches - confirm against
    ' the HtmlAgilityPack documentation.
    If Not htmldoc Is Nothing Then
      For Each node In invalidNodes
        ' The second argument (True) presumably asks the library to keep the removed
        ' node's children in the tree - TODO confirm. The order in which those
        ' children are re-attached is decided by the library, not by this code.
        node.ParentNode.RemoveChild(node, True)
      Next
    End If

    Return htmldoc.DocumentNode.WriteContentTo
  End Function

Purtroppo questo non restituisce quello che mi aspetto, invece dà:

bazbarfoo

Per favore, dove sbaglio? E questo è l'approccio migliore?

Saluti e buon codice!

AGGIORNAMENTO: partendo dalla risposta qui sotto sono arrivato a questa funzione, che potrebbe essere utile anche ad altri:

  ''' <summary>
  ''' Converts an HTML string to plain text by returning the InnerText of the parsed
  ''' document. Before parsing, every literal "&lt;/p&gt;" is followed by two line breaks
  ''' and every literal "&lt;br/&gt;" is replaced by one line break, so paragraph and
  ''' line boundaries survive as newlines in the result.
  ''' </summary>
  ''' <param name="html">The HTML markup to strip. May be Nothing or empty.</param>
  ''' <returns>
  ''' The text content of <paramref name="html"/>, or String.Empty when the input is
  ''' Nothing or empty.
  ''' </returns>
  Public Shared Function stripTags(ByVal html As String) As String
    ' Nothing would otherwise throw on html.Replace; there is no text to extract anyway.
    If String.IsNullOrEmpty(html) Then
      Return String.Empty
    End If

    ' Build the double line break by concatenation. New String(Environment.NewLine, 2)
    ' binds to the (Char, Integer) constructor, so the String is narrowed to its first
    ' character (a lone CR on Windows) or fails to compile under Option Strict On.
    Dim paragraphBreak As String = Environment.NewLine & Environment.NewLine

    ' Only the exact spellings "</p>" and "<br/>" are matched, as in the original.
    Dim prepared As String = html.Replace("</p>", "</p>" & paragraphBreak)
    prepared = prepared.Replace("<br/>", Environment.NewLine)

    Dim htmldoc As New HtmlAgilityPack.HtmlDocument
    htmldoc.LoadHtml(prepared)
    Return htmldoc.DocumentNode.InnerText
  End Function

Risposta accettata

Perché non restituire semplicemente htmldoc.DocumentNode.InnerText invece di rimuovere tutti i nodi non di testo? Dovrebbe darti quello che vuoi.


Risposta popolare

Rimuove i tag e gli attributi che non sono presenti nella whitelist.

''' <summary>
''' Whitelist-based HTML sanitizer built on HtmlAgilityPack.
''' Elements whose tag name is not in the whitelist are unwrapped (the tag is removed
''' through RemoveChild with the keep-children flag set), and attributes that are not
''' explicitly allowed for a whitelisted tag are removed.
''' </summary>
Public NotInheritable Class HtmlSanitizer
    ' Every non-whitelisted element is renamed to this placeholder in the first pass and
    ' then located by XPath in the second pass. It is all lowercase so the XPath matches
    ' regardless of any case normalisation the library applies to tag names.
    Private Const PlaceholderName As String = "removeablenode"
    Private Const PlaceholderXPath As String = "//" & PlaceholderName

    ' Maps an allowed tag name to the attribute names allowed on it.
    ' A value of Nothing means the tag is allowed but none of its attributes are.
    Private Shared ReadOnly Whitelist As IDictionary(Of String, String())

    ' Utility class: not meant to be instantiated.
    Private Sub New()
    End Sub

    Shared Sub New()
        Whitelist = New Dictionary(Of String, String())() From {
            {"a", New String() {"href"}},
            {"strong", Nothing},
            {"em", Nothing},
            {"blockquote", Nothing},
            {"b", Nothing},
            {"p", Nothing},
            {"ul", Nothing},
            {"ol", Nothing},
            {"li", Nothing},
            {"div", New String() {"align"}},
            {"strike", Nothing},
            {"u", Nothing},
            {"sub", Nothing},
            {"sup", Nothing},
            {"table", Nothing},
            {"tr", Nothing},
            {"td", Nothing},
            {"th", Nothing}
        }
    End Sub

    ''' <summary>
    ''' Sanitizes an HTML string against the whitelist.
    ''' </summary>
    ''' <param name="input">The HTML markup to sanitize. May be Nothing.</param>
    ''' <returns>
    ''' The sanitized markup, or String.Empty when <paramref name="input"/> is Nothing,
    ''' empty or whitespace only.
    ''' </returns>
    Public Shared Function Sanitize(input As String) As String
        ' Also covers Nothing, which previously threw on input.Trim().
        If String.IsNullOrWhiteSpace(input) Then
            Return String.Empty
        End If

        Dim document As New HtmlDocument()
        document.LoadHtml(input)

        ' Pass 1: strip disallowed attributes and rename disallowed elements.
        SanitizeNode(document.DocumentNode)

        ' Pass 2: re-parse the serialized markup and unwrap the renamed elements.
        Return StripHtml(document.DocumentNode.WriteTo().Trim(), PlaceholderXPath)
    End Function

    ''' <summary>
    ''' Sanitizes every child of <paramref name="parentNode"/>, iterating from the last
    ''' child to the first.
    ''' </summary>
    Private Shared Sub SanitizeChildren(parentNode As HtmlNode)
        For i As Integer = parentNode.ChildNodes.Count - 1 To 0 Step -1
            SanitizeNode(parentNode.ChildNodes(i))
        Next
    End Sub

    ''' <summary>
    ''' Sanitizes one node and, recursively, its descendants. A non-whitelisted element
    ''' is renamed to the placeholder; a whitelisted element keeps only its allowed
    ''' attributes. Non-element nodes are left untouched apart from the recursion.
    ''' </summary>
    Private Shared Sub SanitizeNode(node As HtmlNode)
        If node.NodeType = HtmlNodeType.Element Then
            Dim allowedAttributes As String() = Nothing

            If Not Whitelist.TryGetValue(node.Name, allowedAttributes) Then
                ' Mark for removal. No shared state is touched, so repeated or
                ' concurrent calls cannot influence each other.
                node.Name = PlaceholderName
            ElseIf node.HasAttributes Then
                ' Walk backwards because attributes are removed while iterating.
                For i As Integer = node.Attributes.Count - 1 To 0 Step -1
                    Dim currentAttribute As HtmlAttribute = node.Attributes(i)
                    Dim isAllowed As Boolean = allowedAttributes IsNot Nothing AndAlso
                        Array.IndexOf(allowedAttributes, currentAttribute.Name) >= 0
                    If Not isAllowed Then
                        node.Attributes.Remove(currentAttribute)
                    End If
                Next
            End If
        End If

        If node.HasChildNodes Then
            SanitizeChildren(node)
        End If
    End Sub

    ''' <summary>
    ''' Parses <paramref name="html"/>, removes every node matched by
    ''' <paramref name="xPath"/> while asking the library to keep the node's children,
    ''' and returns the resulting markup.
    ''' </summary>
    Private Shared Function StripHtml(html As String, xPath As String) As String
        Dim htmlDoc As New HtmlDocument()
        htmlDoc.LoadHtml(html)

        If Not String.IsNullOrEmpty(xPath) Then
            Dim invalidNodes As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes(xPath)
            ' Guard against a Nothing result when no node matches, which would make
            ' the For Each throw.
            If invalidNodes IsNot Nothing Then
                For Each node As HtmlNode In invalidNodes
                    node.ParentNode.RemoveChild(node, True)
                Next
            End If
        End If

        Return htmlDoc.DocumentNode.WriteContentTo()
    End Function
End Class



Autorizzato sotto: CC-BY-SA with attribution
Non affiliato con Stack Overflow
È legale questo KB? Sì, impara il perché
Autorizzato sotto: CC-BY-SA with attribution
Non affiliato con Stack Overflow
È legale questo KB? Sì, impara il perché