Parsing HTML tables and cells in C# with the HTML Agility Pack

c# html-agility-pack html-parsing xml-parsing

Question

I need to parse HTML code. More specifically, I need to parse every cell of every row in every table. Each row represents a single item, and each cell in that row represents a separate property of that item. I want to parse the cells so that I can create an XML file that contains all of the data (without the useless HTML markup). I have managed to parse each column out of the HTML correctly, but I'm not sure what my options are for writing the result to an XML file. I'm at a loss.

HTML:

<tr><tr> 
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF"> 
    1
</td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="left"> 
        <a href="/ice/player.htm?id=8471675">Sidney Crosby</a> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        PIT
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        C
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        39
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        32
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        33
    </td> 
    <td class="statBox sorted" style="border-width:0px 1px 1px 0px; background-color: #E0E0E0" align="right"> 
        <font color="#000000"> 
            65
        </font> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        29
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        10
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        1
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        3
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        0
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        154
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20.8
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        21:54
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        22.6
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
        55.7
    </td> 
</tr></tr>

C#:

using HtmlAgilityPack;

namespace Stats
{
    /// <summary>
    /// Loads an HTML string with HtmlAgilityPack and shows the text of every
    /// stat cell found in the first table of the document.
    /// </summary>
    class StatsParser
    {
        private string htmlCode;
        private static string fileName = "[" + DateTime.Now.ToShortDateString() + " NHL Stats].xml";

        /// <summary>
        /// Stores the HTML to parse and immediately runs <see cref="ParseHtml"/>.
        /// </summary>
        /// <param name="htmlCode">Raw HTML markup containing the stats table.</param>
        public StatsParser(string htmlCode)
        {
            this.htmlCode = htmlCode;
            this.ParseHtml();
        }

        /// <summary>
        /// Walks every row of the first table in the document and displays the
        /// text of each non-empty cell whose class attribute is exactly "statBox".
        /// </summary>
        public void ParseHtml()
        {
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlCode);

            // Get all tables in the document. SelectNodes returns null (not an
            // empty collection) when nothing matches, so test for null explicitly
            // instead of catching NullReferenceException.
            HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
            if (tables == null)
            {
                System.Windows.Forms.MessageBox.Show("Exception!!");
                return;
            }

            // Iterate all rows in the first table
            HtmlNodeCollection rows = tables[0].SelectNodes(".//tr");
            if (rows == null)
            {
                System.Windows.Forms.MessageBox.Show("Exception!!");
                return;
            }

            foreach (HtmlNode row in rows)
            {
                // Iterate all columns in this row. The predicate is an exact
                // match on the class attribute, so a cell with class
                // "statBox sorted" is not selected.
                HtmlNodeCollection cols = row.SelectNodes(".//td[@class='statBox']");
                if (cols == null)
                {
                    // A row without stat cells (e.g. a header row) is skipped
                    // rather than aborting the whole parse.
                    continue;
                }

                foreach (HtmlNode col in cols)
                {
                    // Get the value of the column and print it
                    string value = col.InnerText;
                    if (value != "")
                    {
                        System.Windows.MessageBox.Show(value);
                    }
                }
            }
        }
    }
}

XML:

<?xml version="1.0" encoding="utf-8" ?>

<Stats Date="2011-01-01">
  <Player Rank="1">
    <Name>Sidney Crosby</Name>
    <Team>PIT</Team>
    <Position>C</Position>
    <GamesPlayed>39</GamesPlayed>
    <Goals>32</Goals>
    <Assists>33</Assists>
  </Player>
</Stats>
1
1
1/2/2011 6:24:00 AM

Accepted Answer

I eventually discovered an implementation solution to my issue after searching MSDN:

    using System;
    using HtmlAgilityPack;
    using System.Xml;

    namespace HockeyStats
    {
        /// <summary>
        /// Parses player statistics out of an HTML string with HtmlAgilityPack
        /// and writes them to an XML file, one <c>Player</c> element per row.
        /// </summary>
        class StatsParser
        {
            // Element names for the cells that follow the rank cell, in column
            // order. Cell 0 of each row becomes the Rank attribute of <Player>.
            private static readonly string[] ElementNames =
            {
                "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP",
                "SH", "GW", "OT", "Shots", "ShotPctg", "TOIPerGame",
                "ShiftsPerGame", "FOWinPctg"
            };

            private string htmlCode;
            private static string fileName = "[" + TodayStamp() + " NHL Stats].xml";

            /// <summary>
            /// Stores the HTML to parse and immediately runs <see cref="ParseHtml"/>.
            /// </summary>
            /// <param name="htmlCode">Raw HTML markup containing the stats rows.</param>
            public StatsParser(string htmlCode)
            {
                this.htmlCode = htmlCode;

                this.ParseHtml();
            }

            /// <summary>
            /// Writes a <c>Stats</c> document to <c>..\..\[date NHL Stats].xml</c>
            /// containing one <c>Player</c> element for each <c>tr/tr</c> row that
            /// has at least one cell whose class attribute is exactly "statBox".
            /// </summary>
            public void ParseHtml()
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlCode);

                // Create an XmlWriterSettings object with the correct options.
                XmlWriterSettings settings = new XmlWriterSettings();
                settings.Indent = true;
                settings.IndentChars = "  ";
                settings.OmitXmlDeclaration = false;

                // The using statement closes the writer even if parsing throws.
                using (XmlWriter writer = XmlWriter.Create(@"..\..\" + fileName, settings))
                {
                    writer.WriteStartElement("Stats");
                    writer.WriteAttributeString("Date", TodayStamp());

                    // Iterate all rows nested within another row. SelectNodes
                    // returns null rather than an empty collection when nothing
                    // matches, so both lookups are null-checked.
                    HtmlNodeCollection rows = doc.DocumentNode.SelectNodes(".//tr/tr");
                    if (rows != null)
                    {
                        foreach (HtmlNode row in rows)
                        {
                            // Exact match on @class: a cell with class
                            // "statBox sorted" is not part of this collection.
                            HtmlNodeCollection cols = row.SelectNodes(".//td[@class='statBox']");
                            if (cols == null || cols.Count == 0)
                            {
                                continue;
                            }

                            WritePlayer(writer, cols);
                        }
                    }

                    writer.WriteEndElement();
                    writer.Flush();
                }
            }

            // Writes one <Player> element: cell 0 as the Rank attribute, then one
            // child element per remaining cell, named from ElementNames.
            private static void WritePlayer(XmlWriter writer, HtmlNodeCollection cols)
            {
                writer.WriteStartElement("Player");
                writer.WriteAttributeString("Rank", cols[0].InnerText.Trim());

                // Never read past the cells that are actually present, and ignore
                // any trailing cells that have no element name assigned.
                int count = Math.Min(cols.Count - 1, ElementNames.Length);
                for (int i = 0; i < count; ++i)
                {
                    // Trim every value: the cell text carries the surrounding
                    // newlines and indentation from the markup.
                    writer.WriteElementString(ElementNames[i], cols[i + 1].InnerText.Trim());
                }

                writer.WriteEndElement();
            }

            // Today's local date as yyyy-MM-dd. A culture-independent format keeps
            // '/' (an invalid file-name character) out of the file name and makes
            // the Date attribute machine-readable regardless of regional settings.
            private static string TodayStamp()
            {
                return DateTime.Now.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
            }
        }
    }

This produces the XML file shown below:

<?xml version="1.0" encoding="utf-8" ?> 
<Stats Date="2011-01-01">
 <Player Rank="1">
  <Name>Sidney Crosby</Name> 
  <Team>PIT</Team> 
  <Pos>C</Pos> 
  <GP>39</GP> 
  <G>32</G> 
  <A>33</A> 
  <PlusMinus>20</PlusMinus> 
  <PIM>29</PIM> 
  <PP>10</PP> 
  <SH>1</SH> 
  <GW>3</GW> 
  <Shots>0</Shots> 
  <ShotPctg>154</ShotPctg> 
  <TOIPerGame>20.8</TOIPerGame> 
  <ShiftsPerGame>21:54</ShiftsPerGame> 
  <FOWinPctg>22.6</FOWinPctg> 
 </Player>
</Stats>
1
1/2/2011 6:22:46 AM

Popular Answer

What I meant by my comment was that you are doing in code (the nested loops) what an appropriate XPath expression can do for you. Writing the output can also be made much easier by using LINQ to XML. Now that we know the XML format you want, we can offer our own solutions. I would write the ParseHtml() method like this:

/// <summary>
/// Parses the stat rows in <c>htmlCode</c> and saves them as a <c>Stats</c>
/// XML document containing one <c>Player</c> element per <c>tr/tr</c> row.
/// Cells whose trimmed text is empty produce no element.
/// </summary>
public void ParseHtml()
{
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlCode);

    // Names for the cells that follow the rank cell, in column order.
    var elementNames = new[] { "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP", "SH", "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg", "UnknownField" };

    // SelectNodes returns null rather than an empty collection when nothing
    // matches, so substitute an empty sequence before querying it.
    var rowNodes = htmlDoc.DocumentNode.SelectNodes("//tr/tr");

    var players =
        (rowNodes ?? Enumerable.Empty<HtmlNode>())
            // use the right XPath rather than looping over the cells manually;
            // the predicate is an exact match, so "statBox sorted" cells are skipped
            .Select(row => row.SelectNodes("./td[@class='statBox']"))
            .Where(cellNodes => cellNodes != null)
            .Select(cellNodes => cellNodes.Select(node => node.InnerText.Trim()).ToList())
            // one <Player> per row, so several rows are not merged into one player
            .Select(cells =>
                new XElement("Player", new XAttribute("Rank", cells.First()),
                    // generate the elements based on the parsed cells
                    cells.Skip(1)
                         .Zip(elementNames, (value, name) => new XElement(name, value))
                         .Where(element => !String.IsNullOrEmpty(element.Value))
                ));

    var xmlDoc =
        new XElement("Stats", new XAttribute("Date", DateTime.Now.ToShortDateString()),
            players
        );

    // save to your file
    xmlDoc.Save(filepath);
}

generates the output

<?xml version="1.0" encoding="utf-8"?>
<Stats Date="1/3/2011">
  <Player Rank="1">
    <Name>Sidney Crosby</Name>
    <Team>PIT</Team>
    <Pos>C</Pos>
    <GP>39</GP>
    <G>32</G>
    <A>33</A>
    <PlusMinus>20</PlusMinus>
    <PIM>29</PIM>
    <PP>10</PP>
    <SH>1</SH>
    <GW>3</GW>
    <Shots>0</Shots>
    <ShotPctg>154</ShotPctg>
    <TOIPerGame>20.8</TOIPerGame>
    <ShiftsPerGame>21:54</ShiftsPerGame>
    <FOWinPctg>22.6</FOWinPctg>
    <UnknownField>55.7</UnknownField>
  </Player>
</Stats>


Related Questions





Related

Licensed under: CC-BY-SA with attribution
Not affiliated with Stack Overflow
Licensed under: CC-BY-SA with attribution
Not affiliated with Stack Overflow