I am trying to parse a table that looks like this:
<table><tbody>
<tr><th a href=""></th><th></th></tr>
<tr><td class="v"></td><td class="d"></td><td class="h"></td><td class="a"> </td><td class="o"></td><td class="o"></td><td class="o"></td><td class="p"><table class="p" title="ttt"></table></td></tr>
<tr><td class="v"></td><td class="d"></td><td class="h"></td><td class="a"> </td><td class="o"></td><td class="o"></td><td class="o"></td><td class="p"><table class="p" title="eee"></table></td></tr>
<tr><td class="v"></td><td class="d"></td><td class="h"></td><td class="a"> </td><td class="o"></td><td class="o"></td><td class="o"></td><td class="p"><table class="p" title="rtr"></table></td></tr>
<tr><th a href=""></th><th></th></tr>
<tr><td class="v"></td><td class="d"></td><td class="h"></td><td class="a"> </td><td class="o"></td><td class="o"></td><td class="o"></td><td class="p"><table class="p" title="ouu"></table></td></tr>
<tr><td class="v"></td><td class="d"></td><td class="h"></td><td class="a"> </td><td class="o"></td><td class="o"></td><td class="o"></td><td class="p"><table class="p" title="teee"></table></td></tr>
</tbody></table>
And I am using this code in ASP.net to get the cells in each row I want:
var getHtmlWeb = new HtmlWeb();
var document = getHtmlWeb.Load(txtbox.Text);
//get tables
foreach (HtmlNode table in document.DocumentNode.SelectNodes("//table"))
{
//get each table row
foreach (HtmlNode row in table.SelectNodes("tr"))
{
Outputlabel.Text += "row: <br />";
//get table head tags that have a link, get the Inner text
if((row.SelectSingleNode("//th//a").InnerText) != null)
{
Outputlabel.Text += row.SelectSingleNode("//th//a").InnerText + "<br />";
}
// get the cells with the classes I want
string d = row.SelectSingleNode("//td[@class='d']").InnerText;
Outputlabel.Text += row.SelectSingleNode("//td[@class='d']").InnerText + " ";
string h = row.SelectSingleNode("//td[@class='h']").InnerText;
Outputlabel.Text += row.SelectSingleNode("//td[@class='h']").InnerText + " ";
string a = row.SelectSingleNode("//td[@class='a']").InnerText;
Outputlabel.Text += row.SelectSingleNode("//td[@class='a']").InnerText + " ";
string op = "";
//there are 3 classes in each row to have the class="o"
if (row.SelectNodes("//td[@class='o']") != null)
{
foreach (HtmlNode o in row.SelectNodes("//td[@class='o']"))
{
op += o.InnerText;
}
Outputlabel.Text += op + " ";
}
var pr = row.SelectSingleNode("//td//table[@class='p']");
string pr = probability.Attributes["title"].Value;
Outputlabel.Text += pr + "<br />";
}
}
I get the first line of the first table only and it is repeated many times...and I do not get the class "o" and the title of the table with class "p" in the td tag with class "p"
It seems to work this way for the online html file:
HtmlWeb getHtmlWeb = new HtmlWeb();
HtmlDocument doc = getHtmlWeb.Load(txtbox.Text);
string d = "//td[@class='d']";
string h = "//td[@class='h']";
string a = "//td[@class='a']";
string p = "//table[@class='p']";
HtmlNodeCollection ds = doc.DocumentNode.SelectNodes(d);
HtmlNodeCollection hs = doc.DocumentNode.SelectNodes(h);
HtmlNodeCollection as = doc.DocumentNode.SelectNodes(a);
HtmlNodeCollection ps = doc.DocumentNode.SelectNodes(p);
foreach (HtmlNode n in ds)
{
Outputlabel.Text += n.InnerHtml + "<br />";
}
foreach (HtmlNode h in hs)
{
Outputlabel.Text += h.InnerHtml + "<br />";
}
foreach (HtmlNode a in as)
{
Outputlabel.Text += a.Attributes["href"].Value + "<br />";
}
foreach (HtmlNode p in ps)
{
Outputlabel.Text += p.Attributes["title"].Value + "<br />";
}