나는 이런 HTML 문자열을 가졌다 :
<html><body><p>foo <a href='http://www.example.com'>bar</a> baz</p></body></html>
모든 html 태그를 제거하여 결과 문자열이 다음과 같이됩니다.
foo bar baz
여기에있는 다른 게시물에서 나는이 기능 (Html Agility Pack을 사용함)을 제안했습니다.
Public Shared Function stripTags(ByVal html As String) As String
Dim plain As String = String.Empty
Dim htmldoc As New HtmlAgilityPack.HtmlDocument
htmldoc.LoadHtml(html)
Dim invalidNodes As HtmlAgilityPack.HtmlNodeCollection = htmldoc.DocumentNode.SelectNodes("//html|//body|//p|//a")
If Not htmldoc Is Nothing Then
For Each node In invalidNodes
node.ParentNode.RemoveChild(node, True)
Next
End If
Return htmldoc.DocumentNode.WriteContentTo
End Function
불행히도 이것은 예상 한 것을 반환하지 않고, 대신 다음을 제공합니다.
bazbarfoo
제발, 어디가 잘못 될까요? 이것이 최선의 접근 방법입니까?
안부와 행복한 코딩!
업데이트 : 아래의 답변에 의해 나는이 함수를 생각해 냈고 다른 사람들에게도 유용 할 수 있습니다.
Public Shared Function stripTags(ByVal html As String) As String
Dim htmldoc As New HtmlAgilityPack.HtmlDocument
htmldoc.LoadHtml(html.Replace("</p>", "</p>" & New String(Environment.NewLine, 2)).Replace("<br/>", Environment.NewLine))
Return htmldoc.DocumentNode.InnerText
End Function
텍스트가 아닌 노드를 모두 제거하는 대신 htmldoc.DocumentNode.InnerText
를 반환 htmldoc.DocumentNode.InnerText
것이 어떻습니까? 그것은 당신이 원하는 것을 줄 것입니다.
화이트리스트에없는 태그와 속성을 제거합니다.
Public NotInheritable Class HtmlSanitizer
Private Sub New()
End Sub
Private Shared ReadOnly Whitelist As IDictionary(Of String, String())
Private Shared DeletableNodesXpath As New List(Of String)()
Shared Sub New()
Whitelist = New Dictionary(Of String, String())() From { _
{"a", New () {"href"}}, _
{"strong", Nothing}, _
{"em", Nothing}, _
{"blockquote", Nothing}, _
{"b", Nothing}, _
{"p", Nothing}, _
{"ul", Nothing}, _
{"ol", Nothing}, _
{"li", Nothing}, _
{"div", New () {"align"}}, _
{"strike", Nothing}, _
{"u", Nothing}, _
{"sub", Nothing}, _
{"sup", Nothing}, _
{"table", Nothing}, _
{"tr", Nothing}, _
{"td", Nothing}, _
{"th", Nothing} _
}
End Sub
Public Shared Function Sanitize(input As String) As String
If input.Trim().Length < 1 Then
Return String.Empty
End If
Dim htmlDocument = New HtmlDocument()
htmlDocument.LoadHtml(input)
SanitizeNode(htmlDocument.DocumentNode)
Dim xPath As String = HtmlSanitizer.CreateXPath()
Return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), xPath)
End Function
Private Shared Sub SanitizeChildren(parentNode As HtmlNode)
For i As Integer = parentNode.ChildNodes.Count - 1 To 0 Step -1
SanitizeNode(parentNode.ChildNodes(i))
Next
End Sub
Private Shared Sub SanitizeNode(node As HtmlNode)
If node.NodeType = HtmlNodeType.Element Then
If Not Whitelist.ContainsKey(node.Name) Then
If Not DeletableNodesXpath.Contains(node.Name) Then
'DeletableNodesXpath.Add(node.Name.Replace("?",""));
node.Name = "removeableNode"
DeletableNodesXpath.Add(node.Name)
End If
If node.HasChildNodes Then
SanitizeChildren(node)
End If
Return
End If
If node.HasAttributes Then
For i As Integer = node.Attributes.Count - 1 To 0 Step -1
Dim currentAttribute As HtmlAttribute = node.Attributes(i)
Dim allowedAttributes As String() = Whitelist(node.Name)
If allowedAttributes IsNot Nothing Then
If Not allowedAttributes.Contains(currentAttribute.Name) Then
node.Attributes.Remove(currentAttribute)
End If
Else
node.Attributes.Remove(currentAttribute)
End If
Next
End If
End If
If node.HasChildNodes Then
SanitizeChildren(node)
End If
End Sub
Private Shared Function StripHtml(html As String, xPath As String) As String
Dim htmlDoc As New HtmlDocument()
htmlDoc.LoadHtml(html)
If xPath.Length > 0 Then
Dim invalidNodes As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes(xPath)
For Each node As HtmlNode In invalidNodes
node.ParentNode.RemoveChild(node, True)
Next
End If
Return htmlDoc.DocumentNode.WriteContentTo()
End Function
Private Shared Function CreateXPath() As String
Dim _xPath As String = String.Empty
For i As Integer = 0 To DeletableNodesXpath.Count - 1
If i IsNot DeletableNodesXpath.Count - 1 Then
_xPath += String.Format("//{0}|", DeletableNodesXpath(i).ToString())
Else
_xPath += String.Format("//{0}", DeletableNodesXpath(i).ToString())
End If
Next
Return _xPath
End Function
End Class