HTML 문자열을 일반 텍스트로 변환해야합니다 (HTML 민첩성 팩 사용이 바람직 함). 적절한 흰 공백과 특히 줄 바꿈이 적절한 경우 .
그리고 "적절한 줄 바꿈 (line-breaks)"에 의해이 코드가 의미하는 바는 다음과 같습니다.
<div>
<div>
<div>
line1
</div>
</div>
</div>
<div>line2</div>
다음으로 변환해야합니다.
line1
line2
즉 단 하나의 줄 바꿈.
내가 본 솔루션의 대부분은 모든 <div> <br> <p>
태그를 \n
분명히 s * cks)로 변환합니다.
C #을위한 html-to-plaintext 렌더링 논리에 대한 제안? 완전한 코드는 아니지만, 적어도 공통 논리는 "모든 닫는 DIV를 줄 바꿈으로 바꾸십시오.하지만 다음 형제가 DIV가 아닐지라도"정말 도움이 될 것입니다.
내가 시도한 것들 : 단순히 .InnerText
속성 (잘못 분명히), 정규식 (천천히, 고통스러운, 많은 해킹, 또한 정규식은 12 배 더 느린 HtmlAgilityPack - 나는 그것을 측정했다),이 솔루션 과 유사한 (더 줄 바꿈을 반환합니다. 필수)
아래의 코드는 제공된 예제에서 올바르게 작동하지만 <div><br></div>
와 같은 이상한 것을 다루기는하지만 여전히 개선해야 할 사항이 있지만 기본적인 아이디어가 있습니다. 의견을보십시오.
public static string FormatLineBreaks(string html)
{
//first - remove all the existing '\n' from HTML
//they mean nothing in HTML, but break our logic
html = html.Replace("\r", "").Replace("\n", " ");
//now create an Html Agile Doc object
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);
//remove comments, head, style and script tags
foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//comment() | //script | //style | //head"))
{
node.ParentNode.RemoveChild(node);
}
//now remove all "meaningless" inline elements like "span"
foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//span | //label")) //add "b", "i" if required
{
node.ParentNode.ReplaceChild(HtmlNode.CreateNode(node.InnerHtml), node);
}
//block-elements - convert to line-breaks
foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//p | //div")) //you could add more tags here
{
//we add a "\n" ONLY if the node contains some plain text as "direct" child
//meaning - text is not nested inside children, but only one-level deep
//use XPath to find direct "text" in element
var txtNode = node.SelectSingleNode("text()");
//no "direct" text - NOT ADDDING the \n !!!!
if (txtNode == null || txtNode.InnerHtml.Trim() == "") continue;
//"surround" the node with line breaks
node.ParentNode.InsertBefore(doc.CreateTextNode("\r\n"), node);
node.ParentNode.InsertAfter(doc.CreateTextNode("\r\n"), node);
}
//todo: might need to replace multiple "\n\n" into one here, I'm still testing...
//now BR tags - simply replace with "\n" and forget
foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//br"))
node.ParentNode.ReplaceChild(doc.CreateTextNode("\r\n"), node);
//finally - return the text which will have our inserted line-breaks in it
return doc.DocumentNode.InnerText.Trim();
//todo - you should probably add "&code;" processing, to decode all the and such
}
//here's the extension method I use
private static HtmlNodeCollection SafeSelectNodes(this HtmlNode node, string selector)
{
return (node.SelectNodes(selector) ?? new HtmlNodeCollection(node));
}
우려 사항 :
대수 결정 :
plain-text = Process(Plain(html))
Plain(node-s) => Plain(node-0), Plain(node-1), ..., Plain(node-N)
Plain(BR) => BR
Plain(not-visible-element(child-s)) => nil
Plain(block-element(child-s)) => BS, Plain(child-s), BE
Plain(inline-element(child-s)) => Plain(child-s)
Plain(text) => ch-0, ch-1, .., ch-N
Process(symbol-s) => Process(start-line, symbol-s)
Process(start-line, BR, symbol-s) => Print('\n'), Process(start-line, symbol-s)
Process(start-line, BS, symbol-s) => Process(start-line, symbol-s)
Process(start-line, BE, symbol-s) => Process(start-line, symbol-s)
Process(start-line, hard-space, symbol-s) => Print(' '), Process(not-ws, symbol-s)
Process(start-line, space, symbol-s) => Process(start-line, symbol-s)
Process(start-line, common-symbol, symbol-s) => Print(common-symbol),
Process(not-ws, symbol-s)
Process(not-ws, BR|BS|BE, symbol-s) => Print('\n'), Process(start-line, symbol-s)
Process(not-ws, hard-space, symbol-s) => Print(' '), Process(not-ws, symbol-s)
Process(not-ws, space, symbol-s) => Process(ws, symbol-s)
Process(not-ws, common-symbol, symbol-s) => Process(ws, symbol-s)
Process(ws, BR|BS|BE, symbol-s) => Print('\n'), Process(start-line, symbol-s)
Process(ws, hard-space, symbol-s) => Print(' '), Print(' '),
Process(not-ws, symbol-s)
Process(ws, space, symbol-s) => Process(ws, symbol-s)
Process(ws, common-symbol, symbol-s) => Print(' '), Print(common-symbol),
Process(not-ws, symbol-s)
HtmlAgilityPack 및 System.Xml.Linq에 대한 C # 결정 :
//HtmlAgilityPack part
public static string ToPlainText(this HtmlAgilityPack.HtmlDocument doc)
{
var builder = new System.Text.StringBuilder();
var state = ToPlainTextState.StartLine;
Plain(builder, ref state, new[]{doc.DocumentNode});
return builder.ToString();
}
static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<HtmlAgilityPack.HtmlNode> nodes)
{
foreach (var node in nodes)
{
if (node is HtmlAgilityPack.HtmlTextNode)
{
var text = (HtmlAgilityPack.HtmlTextNode)node;
Process(builder, ref state, HtmlAgilityPack.HtmlEntity.DeEntitize(text.Text).ToCharArray());
}
else
{
var tag = node.Name.ToLower();
if (tag == "br")
{
builder.AppendLine();
state = ToPlainTextState.StartLine;
}
else if (NonVisibleTags.Contains(tag))
{
}
else if (InlineTags.Contains(tag))
{
Plain(builder, ref state, node.ChildNodes);
}
else
{
if (state != ToPlainTextState.StartLine)
{
builder.AppendLine();
state = ToPlainTextState.StartLine;
}
Plain(builder, ref state, node.ChildNodes);
if (state != ToPlainTextState.StartLine)
{
builder.AppendLine();
state = ToPlainTextState.StartLine;
}
}
}
}
}
//System.Xml.Linq part
public static string ToPlainText(this IEnumerable<XNode> nodes)
{
var builder = new System.Text.StringBuilder();
var state = ToPlainTextState.StartLine;
Plain(builder, ref state, nodes);
return builder.ToString();
}
static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<XNode> nodes)
{
foreach (var node in nodes)
{
if (node is XElement)
{
var element = (XElement)node;
var tag = element.Name.LocalName.ToLower();
if (tag == "br")
{
builder.AppendLine();
state = ToPlainTextState.StartLine;
}
else if (NonVisibleTags.Contains(tag))
{
}
else if (InlineTags.Contains(tag))
{
Plain(builder, ref state, element.Nodes());
}
else
{
if (state != ToPlainTextState.StartLine)
{
builder.AppendLine();
state = ToPlainTextState.StartLine;
}
Plain(builder, ref state, element.Nodes());
if (state != ToPlainTextState.StartLine)
{
builder.AppendLine();
state = ToPlainTextState.StartLine;
}
}
}
else if (node is XText)
{
var text = (XText)node;
Process(builder, ref state, text.Value.ToCharArray());
}
}
}
//common part
public static void Process(System.Text.StringBuilder builder, ref ToPlainTextState state, params char[] chars)
{
foreach (var ch in chars)
{
if (char.IsWhiteSpace(ch))
{
if (IsHardSpace(ch))
{
if (state == ToPlainTextState.WhiteSpace)
builder.Append(' ');
builder.Append(' ');
state = ToPlainTextState.NotWhiteSpace;
}
else
{
if (state == ToPlainTextState.NotWhiteSpace)
state = ToPlainTextState.WhiteSpace;
}
}
else
{
if (state == ToPlainTextState.WhiteSpace)
builder.Append(' ');
builder.Append(ch);
state = ToPlainTextState.NotWhiteSpace;
}
}
}
static bool IsHardSpace(char ch)
{
return ch == 0xA0 || ch == 0x2007 || ch == 0x202F;
}
private static readonly HashSet<string> InlineTags = new HashSet<string>
{
//from https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elemente
"b", "big", "i", "small", "tt", "abbr", "acronym",
"cite", "code", "dfn", "em", "kbd", "strong", "samp",
"var", "a", "bdo", "br", "img", "map", "object", "q",
"script", "span", "sub", "sup", "button", "input", "label",
"select", "textarea"
};
private static readonly HashSet<string> NonVisibleTags = new HashSet<string>
{
"script", "style"
};
public enum ToPlainTextState
{
StartLine = 0,
NotWhiteSpace,
WhiteSpace,
}
}
예 :
// <div> 1 </div> 2 <div> 3 </div>
1
2
3
// <div>1 <br/><br/>  <b> 2 </b> <div> </div><div> </div>  3</div>
1
2
3
// <span>1<style> text </style><i>2</i></span>3
123
//<div>
// <div>
// <div>
// line1
// </div>
// </div>
//</div>
//<div>line2</div>
line1
line2