I need to convert HTML string to plain text (preferably using HTML Agility pack). With proper white-spaces and, especially, proper line-breaks.
And by "proper line-breaks" I mean that this code:
<div>
<div>
<div>
line1
</div>
</div>
</div>
<div>line2</div>
Should be converted as
line1
line2
I.e. only one line-break.
Most of the solutions I've seen simply convert all <div> <br> <p>
tags to \n
which, obviously, s*cks.
Any suggestions for html-to-plaintext rendering logic for C#? Not the complete code, at least common logic answers like "replace all closing DIVs with line-breaks, but only if the next sibling is not a DIV too" will really help.
Things I tried: simply getting the .InnerText
property (wrong obviously), regex (slow, painful, lots of hacks, also regexs are 12 times slower then HtmlAgilityPack - I measured it), this solution and similar (returns more line-breaks then required)
The code below works correctly with the example provided, even deals with some weird stuff like <div><br></div>
, there're still some things to improve, but the basic idea is there. See the comments.
/// <summary>
/// Renders an HTML string as plain text, inserting line breaks only around
/// block elements (<c>p</c>, <c>div</c>) that directly contain text, and
/// replacing every <c>br</c> with a line break.
/// </summary>
/// <param name="html">HTML markup to convert; null or empty yields an empty string.</param>
/// <returns>The trimmed inner text of the processed document, with "\r\n" line breaks.</returns>
public static string FormatLineBreaks(string html)
{
    // Nothing to render; also avoids a NullReferenceException on html.Replace below.
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Line breaks in the markup mean nothing in HTML but would be
    // indistinguishable from the breaks inserted below, so flatten them first.
    html = html.Replace("\r", "").Replace("\n", " ");

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Drop everything that never contributes visible text.
    foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//comment() | //script | //style | //head"))
    {
        node.ParentNode.RemoveChild(node);
    }

    // Unwrap "meaningless" inline elements so their text becomes a direct child
    // of the surrounding block. RemoveChild(node, true) keeps ALL grandchildren
    // in place; HtmlNode.CreateNode(node.InnerHtml) would keep only the first
    // child and cannot handle an empty element.
    foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//span | //label")) //add "b", "i" if required
    {
        node.ParentNode.RemoveChild(node, true);
    }

    // Block elements - surround with line breaks, but ONLY when the element has
    // non-blank text as a direct child (not nested deeper). This is what keeps
    // <div><div><div>line1</div></div></div> from producing three breaks.
    foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//p | //div")) //you could add more tags here
    {
        // Look at every direct text child, not just the first one: the first is
        // often pure whitespace while real text follows a nested element.
        bool hasDirectText = false;
        foreach (HtmlNode child in node.ChildNodes)
        {
            if (child.NodeType == HtmlNodeType.Text && child.InnerHtml.Trim() != "")
            {
                hasDirectText = true;
                break;
            }
        }

        if (!hasDirectText)
        {
            continue;
        }

        node.ParentNode.InsertBefore(doc.CreateTextNode("\r\n"), node);
        node.ParentNode.InsertAfter(doc.CreateTextNode("\r\n"), node);
    }

    //todo: might need to replace multiple "\n\n" into one here, I'm still testing...

    // BR tags always become a line break.
    foreach (HtmlNode node in doc.DocumentNode.SafeSelectNodes("//br"))
    {
        node.ParentNode.ReplaceChild(doc.CreateTextNode("\r\n"), node);
    }

    //todo - you should probably add "&code;" processing, to decode all the "&amp;" and such

    // The inserted text nodes are part of InnerText, so this carries our line breaks.
    return doc.DocumentNode.InnerText.Trim();
}
//here's the extension method I use
/// <summary>
/// Runs <c>SelectNodes</c> and substitutes a freshly constructed collection
/// (parented to <paramref name="node"/>) when it yields null, so callers can
/// iterate the result without a null check.
/// </summary>
/// <param name="node">Node the XPath query is evaluated against.</param>
/// <param name="selector">XPath expression passed to <c>SelectNodes</c>.</param>
/// <returns>The selected nodes, never null.</returns>
private static HtmlNodeCollection SafeSelectNodes(this HtmlNode node, string selector)
{
    HtmlNodeCollection matches = node.SelectNodes(selector);
    if (matches == null)
    {
        matches = new HtmlNodeCollection(node);
    }

    return matches;
}
Concerns:
Algebraic solution:
plain-text = Process(Plain(html))
Plain(node-s) => Plain(node-0), Plain(node-1), ..., Plain(node-N)
Plain(BR) => BR
Plain(not-visible-element(child-s)) => nil
Plain(block-element(child-s)) => BS, Plain(child-s), BE
Plain(inline-element(child-s)) => Plain(child-s)
Plain(text) => ch-0, ch-1, .., ch-N
Process(symbol-s) => Process(start-line, symbol-s)
Process(start-line, BR, symbol-s) => Print('\n'), Process(start-line, symbol-s)
Process(start-line, BS, symbol-s) => Process(start-line, symbol-s)
Process(start-line, BE, symbol-s) => Process(start-line, symbol-s)
Process(start-line, hard-space, symbol-s) => Print(' '), Process(not-ws, symbol-s)
Process(start-line, space, symbol-s) => Process(start-line, symbol-s)
Process(start-line, common-symbol, symbol-s) => Print(common-symbol),
Process(not-ws, symbol-s)
Process(not-ws, BR|BS|BE, symbol-s) => Print('\n'), Process(start-line, symbol-s)
Process(not-ws, hard-space, symbol-s) => Print(' '), Process(not-ws, symbol-s)
Process(not-ws, space, symbol-s) => Process(ws, symbol-s)
Process(not-ws, common-symbol, symbol-s) => Print(common-symbol),
Process(not-ws, symbol-s)
Process(ws, BR|BS|BE, symbol-s) => Print('\n'), Process(start-line, symbol-s)
Process(ws, hard-space, symbol-s) => Print(' '), Print(' '),
Process(not-ws, symbol-s)
Process(ws, space, symbol-s) => Process(ws, symbol-s)
Process(ws, common-symbol, symbol-s) => Print(' '), Print(common-symbol),
Process(not-ws, symbol-s)
C# solution for HtmlAgilityPack and System.Xml.Linq:
//HtmlAgilityPack part
/// <summary>
/// Renders an HtmlAgilityPack document as plain text by walking it from the
/// document node with the state machine starting at the beginning of a line.
/// </summary>
/// <param name="doc">Parsed document to render.</param>
/// <returns>The accumulated plain text.</returns>
public static string ToPlainText(this HtmlAgilityPack.HtmlDocument doc)
{
    var output = new System.Text.StringBuilder();
    var lineState = ToPlainTextState.StartLine;

    // The walker takes a node sequence, so wrap the single root node.
    var roots = new[] { doc.DocumentNode };
    Plain(output, ref lineState, roots);

    return output.ToString();
}
/// <summary>
/// Recursively writes the plain-text rendering of HtmlAgilityPack nodes.
/// Text nodes are entity-decoded and fed through <c>Process</c>; <c>br</c>
/// always emits a line break; tags in <c>NonVisibleTags</c> are skipped with
/// their content; tags in <c>InlineTags</c> are transparent; every other
/// element is treated as a block and is separated from surrounding text by a
/// line break, emitted only when the current line already has output.
/// </summary>
/// <param name="builder">Receives the rendered text.</param>
/// <param name="state">Whitespace/line state shared across the whole walk.</param>
/// <param name="nodes">Sibling nodes to render, in document order.</param>
static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<HtmlAgilityPack.HtmlNode> nodes)
{
    foreach (var node in nodes)
    {
        var text = node as HtmlAgilityPack.HtmlTextNode;
        if (text != null)
        {
            Process(builder, ref state, HtmlAgilityPack.HtmlEntity.DeEntitize(text.Text).ToCharArray());
            continue;
        }

        // Comments are never rendered. Without this guard they would fall into
        // the block-element branch below and inject a spurious line break.
        if (node.NodeType == HtmlAgilityPack.HtmlNodeType.Comment)
        {
            continue;
        }

        // Invariant lower-casing: a culture-sensitive ToLower() maps 'I' to a
        // dotless 'ı' under e.g. tr-TR, so "SCRIPT"/"IMG" would miss the tag sets.
        var tag = node.Name.ToLowerInvariant();
        if (tag == "br")
        {
            builder.AppendLine();
            state = ToPlainTextState.StartLine;
        }
        else if (NonVisibleTags.Contains(tag))
        {
            // Invisible element: drop it together with its children.
        }
        else if (InlineTags.Contains(tag))
        {
            Plain(builder, ref state, node.ChildNodes);
        }
        else
        {
            // Block start: break only if something was printed on this line.
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }

            Plain(builder, ref state, node.ChildNodes);

            // Block end: same rule, so empty or nested blocks add no extra breaks.
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }
        }
    }
}
//System.Xml.Linq part
/// <summary>
/// Renders a sequence of LINQ-to-XML nodes as plain text, starting the
/// state machine at the beginning of a line.
/// </summary>
/// <param name="nodes">Nodes to render, in document order.</param>
/// <returns>The accumulated plain text.</returns>
public static string ToPlainText(this IEnumerable<XNode> nodes)
{
    var lineState = ToPlainTextState.StartLine;
    var output = new System.Text.StringBuilder();

    Plain(output, ref lineState, nodes);

    return output.ToString();
}
/// <summary>
/// Recursively writes the plain-text rendering of LINQ-to-XML nodes.
/// <c>XText</c> values are fed through <c>Process</c>; <c>br</c> always emits
/// a line break; tags in <c>NonVisibleTags</c> are skipped with their content;
/// tags in <c>InlineTags</c> are transparent; every other element is treated
/// as a block and is separated from surrounding text by a line break, emitted
/// only when the current line already has output. Nodes that are neither
/// elements nor text are ignored.
/// </summary>
/// <param name="builder">Receives the rendered text.</param>
/// <param name="state">Whitespace/line state shared across the whole walk.</param>
/// <param name="nodes">Sibling nodes to render, in document order.</param>
static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<XNode> nodes)
{
    foreach (var node in nodes)
    {
        var element = node as XElement;
        if (element != null)
        {
            // Invariant lower-casing: a culture-sensitive ToLower() maps 'I' to a
            // dotless 'ı' under e.g. tr-TR, so "SCRIPT"/"IMG" would miss the tag sets.
            var tag = element.Name.LocalName.ToLowerInvariant();
            if (tag == "br")
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }
            else if (NonVisibleTags.Contains(tag))
            {
                // Invisible element: drop it together with its children.
            }
            else if (InlineTags.Contains(tag))
            {
                Plain(builder, ref state, element.Nodes());
            }
            else
            {
                // Block start: break only if something was printed on this line.
                if (state != ToPlainTextState.StartLine)
                {
                    builder.AppendLine();
                    state = ToPlainTextState.StartLine;
                }

                Plain(builder, ref state, element.Nodes());

                // Block end: same rule, so empty or nested blocks add no extra breaks.
                if (state != ToPlainTextState.StartLine)
                {
                    builder.AppendLine();
                    state = ToPlainTextState.StartLine;
                }
            }

            continue;
        }

        var text = node as XText;
        if (text != null)
        {
            Process(builder, ref state, text.Value.ToCharArray());
        }
    }
}
//common part
/// <summary>
/// Appends characters to <paramref name="builder"/> while collapsing ordinary
/// whitespace: leading whitespace on a line is dropped, a run of whitespace
/// between printed characters becomes a single space, and hard (non-breaking)
/// spaces are always printed, as a regular ' '.
/// </summary>
/// <param name="builder">Receives the output characters.</param>
/// <param name="state">Current state; updated as characters are consumed.</param>
/// <param name="chars">Characters to process, in order.</param>
public static void Process(System.Text.StringBuilder builder, ref ToPlainTextState state, params char[] chars)
{
    foreach (var current in chars)
    {
        var isWhite = char.IsWhiteSpace(current);

        if (isWhite && !IsHardSpace(current))
        {
            // Collapsible whitespace: only remember it if text precedes it on
            // this line; at line start or inside a whitespace run it is dropped.
            if (state == ToPlainTextState.NotWhiteSpace)
            {
                state = ToPlainTextState.WhiteSpace;
            }

            continue;
        }

        // A visible symbol or a hard space: flush the single pending separator first.
        if (state == ToPlainTextState.WhiteSpace)
        {
            builder.Append(' ');
        }

        // Hard spaces are emitted as a plain space; everything else verbatim.
        builder.Append(isWhite ? ' ' : current);
        state = ToPlainTextState.NotWhiteSpace;
    }
}
/// <summary>
/// Tells whether <paramref name="ch"/> is one of the three code points this
/// file treats as a hard space: U+00A0, U+2007 or U+202F.
/// </summary>
static bool IsHardSpace(char ch)
{
    switch (ch)
    {
        case '\u00A0':
        case '\u2007':
        case '\u202F':
            return true;
        default:
            return false;
    }
}
// Lower-case names of elements rendered without surrounding line breaks.
// List taken from https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
// Lookups use the set's default string comparer, so callers must pass lower-case names.
private static readonly HashSet<string> InlineTags = new HashSet<string>(new[]
{
    "b", "big", "i", "small", "tt",
    "abbr", "acronym", "cite", "code", "dfn", "em", "kbd", "strong", "samp", "var",
    "a", "bdo", "br", "img", "map", "object", "q", "script", "span", "sub", "sup",
    "button", "input", "label", "select", "textarea"
});
// Lower-case names of elements whose whole subtree is skipped when rendering.
// "head" is included so document metadata such as <title> text does not leak
// into the output; FormatLineBreaks strips the same element ("//head").
private static readonly HashSet<string> NonVisibleTags = new HashSet<string>
{
    "script", "style", "head"
};
/// <summary>
/// State of the whitespace-collapsing machine driven by <c>Process</c> and
/// the <c>Plain</c> walkers.
/// </summary>
public enum ToPlainTextState
{
    /// <summary>Nothing has been printed on the current line; the initial state and the state after every line break.</summary>
    StartLine = 0,

    /// <summary>The last thing printed was a visible symbol or a hard space.</summary>
    NotWhiteSpace = 1,

    /// <summary>Collapsible whitespace was seen after printed text; one space is pending before the next printed character.</summary>
    WhiteSpace = 2
}
}
Examples:
// <div> 1 </div> 2 <div> 3 </div>
1
2
3
// <div>1 <br/><br/>  <b> 2 </b> <div> </div><div> </div>  3</div>
1
2
3
// <span>1<style> text </style><i>2</i></span>3
123
//<div>
// <div>
// <div>
// line1
// </div>
// </div>
//</div>
//<div>line2</div>
line1
line2