我需要将HTML字符串转换为纯文本(最好使用HtmlAgilityPack),并且要正确处理空白字符,尤其是换行符。
通过“正确的换行符”我的意思是这段代码:
<div>
<div>
<div>
line1
</div>
</div>
</div>
<div>line2</div>
应转换为
line1
line2
即只有一个换行符。
我见过的大多数解决方案只是简单地把所有 <div>、<br>、<p> 标签转换为 \n,这种做法显然很糟糕。
对于用C#实现HTML到纯文本的渲染逻辑,有什么建议吗?不需要完整的代码,哪怕只是通用的逻辑思路也会很有帮助,例如“把所有结束的DIV替换为换行符,但仅当下一个兄弟节点不是DIV时”。
我试过的方法:直接读取 .InnerText 属性(显然不对);正则表达式(缓慢、痛苦、需要大量hack,而且正则表达式比HtmlAgilityPack慢12倍——我实际测量过);以及此解决方案和类似方案(返回的换行符比需要的多)。
下面的代码与提供的示例一起正常工作,甚至处理一些奇怪的东西,如<div><br></div>
,还有一些事情需要改进,但基本的想法就在那里。查看评论。
/// <summary>
/// Converts HTML markup to plain text. A line break is emitted around every
/// <c>p</c>/<c>div</c> that directly contains non-blank text, and in place of
/// every <c>br</c>; nested wrappers without direct text add no breaks.
/// </summary>
/// <param name="html">HTML markup to convert; <c>null</c> or empty yields an empty string.</param>
/// <returns>The trimmed text with "\r\n" at the detected line breaks and HTML entities decoded.</returns>
public static string FormatLineBreaks(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Line breaks in the markup mean nothing in HTML, but they would be
    // indistinguishable from the breaks inserted below, so flatten them first.
    html = html.Replace("\r", "").Replace("\n", " ");

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Remove comments, head, style and script: none of them renders as text.
    foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//comment() | //script | //style | //head"))
    {
        node.ParentNode.RemoveChild(node);
    }

    // Unwrap "meaningless" inline elements so their text becomes a direct child
    // of the enclosing block. The children are moved one by one instead of
    // re-parsing InnerHtml through HtmlNode.CreateNode, because CreateNode
    // yields a single node and therefore drops content whenever the wrapper
    // holds more than one child. Moving (rather than copying) also keeps nested
    // wrappers attached to the document, so they get unwrapped in their turn.
    foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//span | //label")) //add "b", "i" if required
    {
        HtmlNode parent = node.ParentNode;

        // Snapshot the children: re-parenting rewrites their sibling links.
        var children = new System.Collections.Generic.List<HtmlNode>(node.ChildNodes);
        foreach (HtmlNode child in children)
        {
            parent.InsertBefore(child, node);
        }

        parent.RemoveChild(node);
    }

    // Block elements - convert to line breaks.
    foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//p | //div")) //you could add more tags here
    {
        // Only blocks that hold text as a direct child get a break; pure
        // wrappers (div > div > div) must not multiply the line breaks.
        if (!HasDirectText(node))
        {
            continue;
        }

        // "Surround" the node with line breaks.
        node.ParentNode.InsertBefore(doc.CreateTextNode("\r\n"), node);
        node.ParentNode.InsertAfter(doc.CreateTextNode("\r\n"), node);
    }

    //todo: might need to replace multiple "\n\n" into one here, I'm still testing...

    // BR tags - simply replace with a line break.
    foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//br"))
    {
        node.ParentNode.ReplaceChild(doc.CreateTextNode("\r\n"), node);
    }

    // Trim first so that only literal whitespace is stripped, then decode
    // entities such as &amp; and &nbsp; into the characters they stand for.
    return HtmlEntity.DeEntitize(doc.DocumentNode.InnerText.Trim());
}

/// <summary>
/// Reports whether any direct child of <paramref name="node"/> is a text node
/// with non-whitespace content. Every text child is inspected, not just the
/// first one, so leading whitespace before other content is not mistaken for
/// "no text".
/// </summary>
private static bool HasDirectText(HtmlNode node)
{
    foreach (HtmlNode child in node.ChildNodes)
    {
        if (child.NodeType == HtmlNodeType.Text && child.InnerHtml.Trim() != "")
        {
            return true;
        }
    }

    return false;
}
//here's the extension method I use
/// <summary>
/// Runs <c>SelectNodes</c> with the given XPath selector and substitutes an
/// empty collection for a <c>null</c> result, so callers can iterate
/// unconditionally.
/// </summary>
private static HtmlNodeCollection SafeSelectNodes(this HtmlNode node, string selector)
{
    HtmlNodeCollection matches = node.SelectNodes(selector);
    if (matches == null)
    {
        return new HtmlNodeCollection(node);
    }

    return matches;
}
顾虑:
代数决策:
plain-text = Process(Plain(html))
Plain(node-s) => Plain(node-0), Plain(node-1), ..., Plain(node-N)
Plain(BR) => BR
Plain(not-visible-element(child-s)) => nil
Plain(block-element(child-s)) => BS, Plain(child-s), BE
Plain(inline-element(child-s)) => Plain(child-s)
Plain(text) => ch-0, ch-1, .., ch-N
Process(symbol-s) => Process(start-line, symbol-s)
Process(start-line, BR, symbol-s) => Print('\n'), Process(start-line, symbol-s)
Process(start-line, BS, symbol-s) => Process(start-line, symbol-s)
Process(start-line, BE, symbol-s) => Process(start-line, symbol-s)
Process(start-line, hard-space, symbol-s) => Print(' '), Process(not-ws, symbol-s)
Process(start-line, space, symbol-s) => Process(start-line, symbol-s)
Process(start-line, common-symbol, symbol-s) => Print(common-symbol),
Process(not-ws, symbol-s)
Process(not-ws, BR|BS|BE, symbol-s) => Print('\n'), Process(start-line, symbol-s)
Process(not-ws, hard-space, symbol-s) => Print(' '), Process(not-ws, symbol-s)
Process(not-ws, space, symbol-s) => Process(ws, symbol-s)
Process(not-ws, common-symbol, symbol-s) => Print(common-symbol), Process(not-ws, symbol-s)
Process(ws, BR|BS|BE, symbol-s) => Print('\n'), Process(start-line, symbol-s)
Process(ws, hard-space, symbol-s) => Print(' '), Print(' '),
Process(not-ws, symbol-s)
Process(ws, space, symbol-s) => Process(ws, symbol-s)
Process(ws, common-symbol, symbol-s) => Print(' '), Print(common-symbol),
Process(not-ws, symbol-s)
HtmlAgilityPack和System.Xml.Linq的C#决策:
//HtmlAgilityPack part
/// <summary>
/// Renders an HtmlAgilityPack document as plain text by walking it from the
/// document node, starting in the "start of line" state.
/// </summary>
public static string ToPlainText(this HtmlAgilityPack.HtmlDocument doc)
{
    var output = new System.Text.StringBuilder();
    ToPlainTextState lineState = ToPlainTextState.StartLine;
    HtmlAgilityPack.HtmlNode[] roots = { doc.DocumentNode };

    Plain(output, ref lineState, roots);

    return output.ToString();
}
/// <summary>
/// Recursively writes the plain-text rendering of <paramref name="nodes"/> into
/// <paramref name="builder"/>. Text nodes are entity-decoded and passed to
/// <c>Process</c>; <c>br</c> always ends the line; non-visible tags are skipped
/// with their content; inline tags are transparent; every other element is
/// treated as a block and is separated from its surroundings by a line break
/// unless the output is already at the start of a line.
/// </summary>
/// <param name="builder">Receives the generated text.</param>
/// <param name="state">Whitespace/line state carried across nodes.</param>
/// <param name="nodes">Nodes to render, in document order.</param>
static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<HtmlAgilityPack.HtmlNode> nodes)
{
    foreach (var node in nodes)
    {
        var text = node as HtmlAgilityPack.HtmlTextNode;
        if (text != null)
        {
            Process(builder, ref state, HtmlAgilityPack.HtmlEntity.DeEntitize(text.Text).ToCharArray());
            continue;
        }

        // Comments are nodes as well; without this guard they would reach the
        // block-element branch below and force a line break in mid-sentence.
        if (node.NodeType == HtmlAgilityPack.HtmlNodeType.Comment)
        {
            continue;
        }

        // Invariant lower-casing: the culture-sensitive ToLower() maps "I" to a
        // dotless "ı" under Turkish cultures, which breaks the tag lookups.
        var tag = node.Name.ToLowerInvariant();
        if (tag == "br")
        {
            builder.AppendLine();
            state = ToPlainTextState.StartLine;
        }
        else if (NonVisibleTags.Contains(tag))
        {
            // Neither the element nor anything inside it produces output.
        }
        else if (InlineTags.Contains(tag))
        {
            Plain(builder, ref state, node.ChildNodes);
        }
        else
        {
            // Block element: begin on a fresh line...
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }

            Plain(builder, ref state, node.ChildNodes);

            // ...and close the line if the block produced any output.
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }
        }
    }
}
//System.Xml.Linq part
/// <summary>
/// Renders a sequence of LINQ-to-XML nodes as plain text, starting in the
/// "start of line" state.
/// </summary>
public static string ToPlainText(this IEnumerable<XNode> nodes)
{
    var output = new System.Text.StringBuilder();
    ToPlainTextState lineState = ToPlainTextState.StartLine;

    Plain(output, ref lineState, nodes);

    return output.ToString();
}
/// <summary>
/// Recursively writes the plain-text rendering of <paramref name="nodes"/> into
/// <paramref name="builder"/>. Text nodes are passed to <c>Process</c>;
/// <c>br</c> always ends the line; non-visible tags are skipped with their
/// content; inline tags are transparent; every other element is treated as a
/// block and is separated from its surroundings by a line break unless the
/// output is already at the start of a line. Nodes that are neither elements
/// nor text are ignored.
/// </summary>
/// <param name="builder">Receives the generated text.</param>
/// <param name="state">Whitespace/line state carried across nodes.</param>
/// <param name="nodes">Nodes to render, in document order.</param>
static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<XNode> nodes)
{
    foreach (var node in nodes)
    {
        if (node is XElement)
        {
            var element = (XElement)node;

            // Invariant lower-casing: the culture-sensitive ToLower() maps "I"
            // to a dotless "ı" under Turkish cultures, so an upper-case <I> or
            // <IMG> would miss the lookup and be rendered as a block.
            var tag = element.Name.LocalName.ToLowerInvariant();
            if (tag == "br")
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }
            else if (NonVisibleTags.Contains(tag))
            {
                // Neither the element nor anything inside it produces output.
            }
            else if (InlineTags.Contains(tag))
            {
                Plain(builder, ref state, element.Nodes());
            }
            else
            {
                // Block element: begin on a fresh line...
                if (state != ToPlainTextState.StartLine)
                {
                    builder.AppendLine();
                    state = ToPlainTextState.StartLine;
                }

                Plain(builder, ref state, element.Nodes());

                // ...and close the line if the block produced any output.
                if (state != ToPlainTextState.StartLine)
                {
                    builder.AppendLine();
                    state = ToPlainTextState.StartLine;
                }
            }
        }
        else if (node is XText)
        {
            var text = (XText)node;
            Process(builder, ref state, text.Value.ToCharArray());
        }
    }
}
//common part
/// <summary>
/// Feeds characters through the whitespace-collapsing state machine and appends
/// the result to <paramref name="builder"/>.
/// Ordinary whitespace is never written directly: it is remembered in
/// <paramref name="state"/> and becomes a single space only when a later
/// character follows on the same line. Hard spaces are always written (as
/// ' '), preceded by the pending space if there is one.
/// </summary>
/// <param name="builder">Receives the generated text.</param>
/// <param name="state">Current state; updated as characters are consumed.</param>
/// <param name="chars">Characters to process, in order.</param>
public static void Process(System.Text.StringBuilder builder, ref ToPlainTextState state, params char[] chars)
{
    for (int i = 0; i < chars.Length; i++)
    {
        char current = chars[i];

        // Visible character: flush the pending space, then the character.
        if (!char.IsWhiteSpace(current))
        {
            if (state == ToPlainTextState.WhiteSpace)
            {
                builder.Append(' ');
            }

            builder.Append(current);
            state = ToPlainTextState.NotWhiteSpace;
            continue;
        }

        // Collapsible whitespace: only remember it, and only after content.
        if (!IsHardSpace(current))
        {
            if (state == ToPlainTextState.NotWhiteSpace)
            {
                state = ToPlainTextState.WhiteSpace;
            }

            continue;
        }

        // Hard space: behaves like a visible character that prints as ' '.
        if (state == ToPlainTextState.WhiteSpace)
        {
            builder.Append(' ');
        }

        builder.Append(' ');
        state = ToPlainTextState.NotWhiteSpace;
    }
}
/// <summary>
/// Tells whether <paramref name="ch"/> is one of the three space characters
/// this converter never collapses.
/// </summary>
static bool IsHardSpace(char ch)
{
    switch (ch)
    {
        case '\u00A0': // NO-BREAK SPACE
        case '\u2007': // FIGURE SPACE
        case '\u202F': // NARROW NO-BREAK SPACE
            return true;
        default:
            return false;
    }
}
// Lower-case names of elements rendered inline: they never start or end a line,
// only their children contribute text. Any tag that is neither here nor in the
// non-visible set is treated as a block element by the Plain(...) walkers.
private static readonly HashSet<string> InlineTags = new HashSet<string>
{
    //from https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
    "b", "big", "i", "small", "tt", "abbr", "acronym",
    "cite", "code", "dfn", "em", "kbd", "strong", "samp",
    "var", "a", "bdo", "br", "img", "map", "object", "q",
    "script", "span", "sub", "sup", "button", "input", "label",
    "select", "textarea",
    // Common inline elements missing from the list above; without them
    // markup such as "a<u>b</u>c" was split over three lines.
    "u", "s", "strike", "font", "ins", "del", "mark", "time", "bdi", "wbr"
};
// Lower-case names of elements whose content is never rendered; the Plain(...)
// walkers skip them together with everything inside.
private static readonly HashSet<string> NonVisibleTags = new HashSet<string>
{
    "script", "style",
    // Document metadata and inert content: without these entries converting a
    // complete document leaked the <title> text into the output.
    "head", "title", "template"
};
/// <summary>State of the whitespace-collapsing writer used by <c>Process</c> and <c>Plain</c>.</summary>
public enum ToPlainTextState
{
    /// <summary>Nothing has been written on the current line yet.</summary>
    StartLine = 0,

    /// <summary>The last character written was a visible character or a hard space.</summary>
    NotWhiteSpace = 1,

    /// <summary>Collapsible whitespace was seen after content and is pending output.</summary>
    WhiteSpace = 2,
}
}
例子:
// <div> 1 </div> 2 <div> 3 </div>
1
2
3
// <div>1 <br/><br/>  <b> 2 </b> <div> </div><div> </div>  3</div>
1
2
3
// <span>1<style> text </style><i>2</i></span>3
123
//<div>
// <div>
// <div>
// line1
// </div>
// </div>
//</div>
//<div>line2</div>
line1
line2