使用Html Agility Pack剝離所有html標籤

html html-agility-pack strip vb.net

我有一個像這樣的html字符串:

<html><body><p>foo <a href='http://www.example.com'>bar</a> baz</p></body></html>

我希望刪除所有html標記,以便生成的字符串變為:

foo bar baz

從SO的另一篇文章中我得出了這個函數(使用Html Agility Pack):

<html><body><p>foo <a href='http://www.example.com'>bar</a> baz</p></body></html>

不幸的是,這不會返回我的預期,而是給出:

<html><body><p>foo <a href='http://www.example.com'>bar</a> baz</p></body></html>

請問,我哪裡出錯 - 這是最好的方法嗎?

問候和快樂的編碼!

更新:根據下面的答案,我寫出了這個函數,可能對其他人有用:

<html><body><p>foo <a href='http://www.example.com'>bar</a> baz</p></body></html>

已採納的答案

為什麼不直接返回htmldoc.DocumentNode.InnerText而不是刪除所有非文本節點?它應該給你你想要的東西。


熱門答案

它會刪除白名單中未找到的標記和屬性。

''' <summary>
''' Whitelist-based HTML sanitizer built on Html Agility Pack.
''' Elements whose tag name is not in the whitelist are unwrapped (the tag is
''' removed, its children are kept); attributes that are not whitelisted for
''' their element are removed.
''' </summary>
''' <remarks>
''' The class holds no mutable shared state, so <see cref="Sanitize"/> does not
''' accumulate data between calls.
''' NOTE(review): attribute *values* are not inspected (e.g. an <c>href</c> with a
''' <c>javascript:</c> URL is kept), and the text content of unwrapped elements
''' such as <c>script</c> is preserved - confirm this is acceptable before using
''' the output in a security-sensitive context.
''' </remarks>
Public NotInheritable Class HtmlSanitizer
    ' Marker tag name assigned to every element that is not whitelisted.
    ' Kept lower-case so it matches regardless of how the parser normalizes names.
    Private Const RemovableNodeName As String = "removeablenode"

    ' XPath that selects every marked element anywhere in the document.
    Private Const RemovableNodeXPath As String = "//" & RemovableNodeName

    ' Allowed tag name -> allowed attribute names (Nothing = no attributes allowed).
    Private Shared ReadOnly Whitelist As IDictionary(Of String, String())

    ' Utility class: not meant to be instantiated.
    Private Sub New()
    End Sub

    Shared Sub New()
        Whitelist = New Dictionary(Of String, String())() From { _
            {"a", New String() {"href"}}, _
            {"strong", Nothing}, _
            {"em", Nothing}, _
            {"blockquote", Nothing}, _
            {"b", Nothing}, _
            {"p", Nothing}, _
            {"ul", Nothing}, _
            {"ol", Nothing}, _
            {"li", Nothing}, _
            {"div", New String() {"align"}}, _
            {"strike", Nothing}, _
            {"u", Nothing}, _
            {"sub", Nothing}, _
            {"sup", Nothing}, _
            {"table", Nothing}, _
            {"tr", Nothing}, _
            {"td", Nothing}, _
            {"th", Nothing} _
        }
    End Sub

    ''' <summary>
    ''' Removes every tag and attribute that is not whitelisted from an HTML fragment.
    ''' </summary>
    ''' <param name="input">The HTML to sanitize. May be Nothing or blank.</param>
    ''' <returns>
    ''' The sanitized HTML, or <see cref="String.Empty"/> when <paramref name="input"/>
    ''' is Nothing, empty or whitespace only.
    ''' </returns>
    Public Shared Function Sanitize(input As String) As String
        ' Also covers Nothing, which previously raised a NullReferenceException.
        If String.IsNullOrWhiteSpace(input) Then
            Return String.Empty
        End If

        Dim htmlDocument As New HtmlDocument()
        htmlDocument.LoadHtml(input)

        ' Pass 1: strip disallowed attributes and rename disallowed elements to the marker.
        SanitizeNode(htmlDocument.DocumentNode)

        ' Pass 2: re-parse the serialized markup and unwrap every marked element.
        Return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), RemovableNodeXPath)
    End Function

    ''' <summary>Sanitizes every direct child of <paramref name="parentNode"/>.</summary>
    Private Shared Sub SanitizeChildren(parentNode As HtmlNode)
        ' Iterate backwards so the loop stays valid if the child collection changes.
        For i As Integer = parentNode.ChildNodes.Count - 1 To 0 Step -1
            SanitizeNode(parentNode.ChildNodes(i))
        Next
    End Sub

    ''' <summary>
    ''' Sanitizes a single node and, recursively, its descendants.
    ''' Non-whitelisted elements are renamed to the marker name; whitelisted
    ''' elements keep only their allowed attributes.
    ''' </summary>
    Private Shared Sub SanitizeNode(node As HtmlNode)
        If node.NodeType = HtmlNodeType.Element Then
            Dim allowedAttributes As String() = Nothing

            If Not Whitelist.TryGetValue(node.Name, allowedAttributes) Then
                ' Not whitelisted: mark for removal. Its attributes are left alone
                ' because the whole tag is dropped in the second pass.
                node.Name = RemovableNodeName
            ElseIf node.HasAttributes Then
                ' Iterate backwards because attributes are removed while looping.
                For i As Integer = node.Attributes.Count - 1 To 0 Step -1
                    Dim currentAttribute As HtmlAttribute = node.Attributes(i)
                    If allowedAttributes Is Nothing OrElse
                       Not allowedAttributes.Contains(currentAttribute.Name) Then
                        node.Attributes.Remove(currentAttribute)
                    End If
                Next
            End If
        End If

        If node.HasChildNodes Then
            SanitizeChildren(node)
        End If
    End Sub

    ''' <summary>
    ''' Parses <paramref name="html"/> and unwraps every node selected by
    ''' <paramref name="xPath"/>, keeping the children of the removed nodes.
    ''' </summary>
    ''' <param name="html">Markup to process.</param>
    ''' <param name="xPath">XPath selecting the nodes to unwrap; empty means none.</param>
    ''' <returns>The serialized content of the resulting document.</returns>
    Private Shared Function StripHtml(html As String, xPath As String) As String
        Dim htmlDoc As New HtmlDocument()
        htmlDoc.LoadHtml(html)

        If Not String.IsNullOrEmpty(xPath) Then
            ' SelectNodes returns Nothing (not an empty collection) when nothing matches.
            Dim invalidNodes As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes(xPath)
            If invalidNodes IsNot Nothing Then
                For Each node As HtmlNode In invalidNodes
                    ' True = keep grandchildren: only the tag itself is dropped.
                    node.ParentNode.RemoveChild(node, True)
                Next
            End If
        End If

        Return htmlDoc.DocumentNode.WriteContentTo()
    End Function
End Class



許可下: CC-BY-SA with attribution
不隸屬於 Stack Overflow
這個KB合法嗎? 是的,了解原因
許可下: CC-BY-SA with attribution
不隸屬於 Stack Overflow
這個KB合法嗎? 是的,了解原因