Tabellen und Zellen mit dem Html Agility Pack in C# parsen

c# html-agility-pack html-parsing xml-parsing

Frage

Ich muss HTML-Code parsen, genauer gesagt jede Zelle jeder Zeile in allen Tabellen. Jede Zeile repräsentiert ein einzelnes Objekt, und jede Zelle repräsentiert eine andere Eigenschaft. Ich möchte diese Daten auslesen, um eine XML-Datei mit allen Daten schreiben zu können (ohne den nutzlosen HTML-Code). Es ist mir bereits gelungen, jede Spalte aus der HTML-Datei auszulesen, aber jetzt weiß ich nicht, welche Möglichkeiten ich habe, das Ergebnis in eine XML-Datei zu schreiben. Ich bin ratlos.

HTML:

<tr><tr> 
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF"> 
    1
</td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="left"> 
        <a href="/ice/player.htm?id=8471675">Sidney Crosby</a> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        PIT
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        C
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        39
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        32
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        33
    </td> 
    <td class="statBox sorted" style="border-width:0px 1px 1px 0px; background-color: #E0E0E0" align="right"> 
        <font color="#000000"> 
            65
        </font> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        29
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        10
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        1
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        3
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        0
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        154
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20.8
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        21:54
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        22.6
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
        55.7
    </td> 
</tr></tr>

C#:

using HtmlAgilityPack;

namespace Stats
{
    class StatsParser
    {
        private string htmlCode;
        private static string fileName = "[" + DateTime.Now.ToShortDateString() + " NHL Stats].xml";

        /// <summary>
        /// Stores the HTML markup to parse and immediately runs <see cref="ParseHtml"/> on it.
        /// </summary>
        /// <param name="htmlCode">The HTML markup that <see cref="ParseHtml"/> will load.</param>
        public StatsParser(string htmlCode)
        {
            this.htmlCode = htmlCode;
            ParseHtml();
        }

        /// <summary>
        /// Loads <c>htmlCode</c> into an <see cref="HtmlDocument"/> and shows the text of every
        /// non-empty <c>td</c> cell with class <c>statBox</c> found in the rows of the first table.
        /// </summary>
        /// <remarks>
        /// When the document contains no table, or the first table contains no rows, a single
        /// "Exception!!" message box is shown and the method returns. Rows that contain no
        /// matching cells are skipped instead of aborting the whole run.
        /// </remarks>
        public void ParseHtml()
        {
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlCode);

            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so test for null explicitly instead of catching NullReferenceException.
            HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
            HtmlNodeCollection rows = tables == null ? null : tables[0].SelectNodes(".//tr");
            if (rows == null)
            {
                System.Windows.Forms.MessageBox.Show("Exception!!");
                return;
            }

            // Iterate all rows in the first table
            foreach (HtmlNode row in rows)
            {
                // Iterate all statBox cells in this row
                HtmlNodeCollection cols = row.SelectNodes(".//td[@class='statBox']");
                if (cols == null)
                {
                    // No matching cells in this row; continue with the next one.
                    continue;
                }

                foreach (HtmlNode col in cols)
                {
                    // Get the value of the cell and show it
                    // NOTE(review): this call uses System.Windows.MessageBox while the error path
                    // uses System.Windows.Forms.MessageBox - confirm both assemblies are referenced.
                    string value = col.InnerText;
                    if (value != "")
                    {
                        System.Windows.MessageBox.Show(value);
                    }
                }
            }
        }

XML:

<?xml version="1.0" encoding="utf-8" ?>

<Stats Date="2011-01-01">
  <Player Rank="1">
    <Name>Sidney Crosby</Name>
    <Team>PIT</Team>
    <Position>C</Position>
    <GamesPlayed>39</GamesPlayed>
    <Goals>32</Goals>
    <Assists>33</Assists>
  </Player>
</Stats>

Akzeptierte Antwort

Nachdem ich mich in MSDN umgesehen hatte, fand ich endlich eine Implementierungslösung für mein Problem:

    using System;
    using HtmlAgilityPack;
    using System.Xml;

    namespace HockeyStats
    {
        /// <summary>
        /// Parses player statistics out of an HTML document and writes them to an XML file.
        /// Every <c>tr</c> nested directly inside another <c>tr</c> becomes one <c>Player</c> element.
        /// </summary>
        class StatsParser
        {
            // Element names for the cells that follow the first one in a row, in column order.
            // The first cell of a row is written as the Rank attribute of the Player element.
            private static readonly string[] elementNames =
            {
                "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP", "SH",
                "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg"
            };

            // A fixed yyyy-MM-dd format is used instead of ToShortDateString(): the short date
            // pattern is culture dependent and contains '/' in many cultures, which is not a
            // valid file name character.
            private static string dateStamp = DateTime.Now.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);

            private string htmlCode;
            private static string fileName = "[" + dateStamp + " NHL Stats].xml";

            /// <summary>
            /// Stores the HTML markup and immediately parses it, writing the XML file.
            /// </summary>
            /// <param name="htmlCode">The HTML markup to parse.</param>
            public StatsParser(string htmlCode)
            {
                this.htmlCode = htmlCode;

                this.ParseHtml();
            }

            /// <summary>
            /// Loads <c>htmlCode</c>, selects every row matching <c>.//tr/tr</c> and writes one
            /// <c>Player</c> element per row into a <c>Stats</c> document at <c>..\..\</c> plus
            /// the generated file name.
            /// </summary>
            /// <remarks>
            /// When no row matches, an empty <c>Stats</c> element is written. Rows without any
            /// <c>statBox</c> cell are skipped.
            /// </remarks>
            public void ParseHtml()
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlCode);

                // Create an XmlWriterSettings object with the correct options.
                XmlWriterSettings settings = new XmlWriterSettings();
                settings.Indent = true;
                settings.IndentChars = "  ";
                settings.OmitXmlDeclaration = false;

                // The using block closes the writer even when writing throws.
                using (XmlWriter writer = XmlWriter.Create(@"..\..\" + fileName, settings))
                {
                    writer.WriteStartElement("Stats");
                    writer.WriteAttributeString("Date", dateStamp);

                    // Iterate all rows nested within another row.
                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection rows = doc.DocumentNode.SelectNodes(".//tr/tr");
                    if (rows != null)
                    {
                        foreach (HtmlNode row in rows)
                        {
                            HtmlNodeCollection cols = row.SelectNodes(".//td[@class='statBox']");
                            if (cols == null || cols.Count == 0)
                            {
                                continue;
                            }

                            WritePlayer(writer, cols);
                        }
                    }

                    writer.WriteEndElement();
                    writer.Flush();
                }
            }

            // Writes one Player element: cell 0 as the Rank attribute, the following cells as
            // child elements named after elementNames. Cells beyond the known names are ignored.
            private static void WritePlayer(XmlWriter writer, HtmlNodeCollection cols)
            {
                writer.WriteStartElement("Player");
                writer.WriteAttributeString("Rank", cols[0].InnerText.Trim());

                // Never read past the cells that are actually present in this row.
                int count = Math.Min(cols.Count - 1, elementNames.Length);
                for (int j = 0; j < count; ++j)
                {
                    // Trim every value; the cell text is surrounded by whitespace in the markup.
                    writer.WriteElementString(elementNames[j], cols[j + 1].InnerText.Trim());
                }

                writer.WriteEndElement();
            }
        }
    }

Dies erzeugt die folgende XML-Datei als Ausgabe:

<?xml version="1.0" encoding="utf-8" ?> 
<Stats Date="2011-01-01">
 <Player Rank="1">
  <Name>Sidney Crosby</Name> 
  <Team>PIT</Team> 
  <Pos>C</Pos> 
  <GP>39</GP> 
  <G>32</G> 
  <A>33</A> 
  <PlusMinus>20</PlusMinus> 
  <PIM>29</PIM> 
  <PP>10</PP> 
  <SH>1</SH> 
  <GW>3</GW> 
  <Shots>0</Shots> 
  <ShotPctg>154</ShotPctg> 
  <TOIPerGame>20.8</TOIPerGame> 
  <ShiftsPerGame>21:54</ShiftsPerGame> 
  <FOWinPctg>22.6</FOWinPctg> 
 </Player>
</Stats>

Beliebte Antwort

Was ich in meinem Kommentar gemeint hatte, war, dass Sie im Code (mit den verschachtelten Schleifen) von Hand erledigen, was Sie auch mit dem richtigen XPath erreichen können. Mit LINQ to XML wird das Schreiben noch einfacher. Und jetzt, wo wir sehen, wie Sie Ihre XML-Datei formatiert haben, können wir eigene Antworten anbieten. Ich würde die Methode ParseHtml() so schreiben:

/// <summary>
/// Loads <c>htmlCode</c>, turns every row matching <c>//tr/tr</c> into a <c>Player</c> element
/// and saves the resulting <c>Stats</c> document to <c>filepath</c>.
/// </summary>
/// <remarks>
/// When no row matches, an empty <c>Stats</c> element is saved. Rows without any
/// <c>statBox</c> cell are skipped. Cells whose trimmed text is empty produce no element.
/// </remarks>
public void ParseHtml()
{
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlCode);

    var elementNames = new[] { "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP", "SH", "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg", "UnknownField" };

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var rows = htmlDoc.DocumentNode.SelectNodes(@"//tr/tr");

    // Build one Player per row so that documents with several rows keep every player
    // instead of flattening all cells into a single sequence.
    var players = rows == null
        ? Enumerable.Empty<XElement>()
        : rows.Select(row => ToPlayerElement(row, elementNames))
              .Where(player => player != null);

    var xmlDoc =
        new XElement("Stats", new XAttribute("Date", DateTime.Now.ToShortDateString()),
            players
        );

    // save to your file
    xmlDoc.Save(filepath);
}

// Converts one table row into a Player element: the first statBox cell becomes the Rank
// attribute, the following cells are paired with elementNames in order.
// Returns null when the row has no statBox cells.
private static XElement ToPlayerElement(HtmlNode row, string[] elementNames)
{
    var nodes = row.SelectNodes(@"td[@class='statBox']");
    if (nodes == null)
    {
        return null;
    }

    var cells = nodes.Select(node => node.InnerText.Trim()).ToList();

    return new XElement("Player", new XAttribute("Rank", cells.First()),
        // generate the elements based on the parsed cells
        cells.Skip(1)
             .Zip(elementNames, (value, name) => new XElement(name, value))
             .Where(element => !String.IsNullOrEmpty(element.Value))
    );
}

Erzeugt die Ausgabe:

<?xml version="1.0" encoding="utf-8"?>
<Stats Date="1/3/2011">
  <Player Rank="1">
    <Name>Sidney Crosby</Name>
    <Team>PIT</Team>
    <Pos>C</Pos>
    <GP>39</GP>
    <G>32</G>
    <A>33</A>
    <PlusMinus>20</PlusMinus>
    <PIM>29</PIM>
    <PP>10</PP>
    <SH>1</SH>
    <GW>3</GW>
    <Shots>0</Shots>
    <ShotPctg>154</ShotPctg>
    <TOIPerGame>20.8</TOIPerGame>
    <ShiftsPerGame>21:54</ShiftsPerGame>
    <FOWinPctg>22.6</FOWinPctg>
    <UnknownField>55.7</UnknownField>
  </Player>
</Stats>



Lizenziert unter: CC-BY-SA with attribution
Nicht verbunden mit Stack Overflow
Ist diese KB legal? Ja, lerne warum
Lizenziert unter: CC-BY-SA with attribution
Nicht verbunden mit Stack Overflow
Ist diese KB legal? Ja, lerne warum