在C#中使用Html Agility Pack解析HTML表格的單元格


我需要解析Html代碼。更具體地說,是解析所有表格中每一行的每個單元格。每一行代表一個對象,每個單元格代表該對象的一個屬性。我想把這些內容解析出來,以便寫入一個只包含數據(不含無用HTML代碼)的XML文件。我已經能夠成功解析HTML文件中的每個單元格,但現在不知道有哪些方法可以把這些數據寫入XML文件,因此感到困惑。

HTML:

<tr><tr> 
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF"> 
    1
</td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="left"> 
        <a href="/ice/player.htm?id=8471675">Sidney Crosby</a> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        PIT
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        C
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        39
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        32
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        33
    </td> 
    <td class="statBox sorted" style="border-width:0px 1px 1px 0px; background-color: #E0E0E0" align="right"> 
        <font color="#000000"> 
            65
        </font> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        29
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        10
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        1
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        3
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        0
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        154
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20.8
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        21:54
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        22.6
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
        55.7
    </td> 
</tr></tr>

C#:

<tr><tr> 
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF"> 
    1
</td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="left"> 
        <a href="/ice/player.htm?id=8471675">Sidney Crosby</a> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        PIT
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        C
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        39
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        32
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        33
    </td> 
    <td class="statBox sorted" style="border-width:0px 1px 1px 0px; background-color: #E0E0E0" align="right"> 
        <font color="#000000"> 
            65
        </font> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        29
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        10
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        1
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        3
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        0
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        154
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20.8
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        21:54
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        22.6
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
        55.7
    </td> 
</tr></tr>

XML:

<tr><tr> 
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF"> 
    1
</td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="left"> 
        <a href="/ice/player.htm?id=8471675">Sidney Crosby</a> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        PIT
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center"> 
        C
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        39
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        32
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        33
    </td> 
    <td class="statBox sorted" style="border-width:0px 1px 1px 0px; background-color: #E0E0E0" align="right"> 
        <font color="#000000"> 
            65
        </font> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        29
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        10
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        1
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        3
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        0
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        154
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        20.8
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        21:54
    </td> 
    <td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right"> 
        22.6
    </td> 
    <td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right"> 
        55.7
    </td> 
</tr></tr>

已採納的答案

環顧MSDN後,我終於找到了解決問題的實現方案:

    using System;
    using HtmlAgilityPack;
    using System.Xml;

    namespace HockeyStats
    {
        /// <summary>
        /// Parses hockey statistics out of an HTML table and writes them to an XML file,
        /// one Player element per table row.
        /// </summary>
        class StatsParser
        {
            // Child element names for cells 1..N of a row, in column order.
            // Cell 0 is written as the Rank attribute of the Player element instead.
            private static readonly string[] elementNames =
            {
                "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP",
                "SH", "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg"
            };

            private string htmlCode;

            // NOTE(review): ToShortDateString() is culture-dependent; under cultures whose short
            // date contains '/' the file name would include a path separator. Confirm the
            // target culture or switch to a fixed format.
            private static string fileName = "[" + DateTime.Now.ToShortDateString() + " NHL Stats].xml";

            /// <summary>
            /// Stores the HTML and immediately parses it, writing the XML file as a side effect.
            /// </summary>
            /// <param name="htmlCode">Raw HTML containing the statistics table rows.</param>
            public StatsParser(string htmlCode)
            {
                this.htmlCode = htmlCode;

                this.ParseHtml();
            }

            /// <summary>
            /// Loads the stored HTML, selects every nested table row (tr inside tr) and writes a
            /// Stats document with one Player element per row that contains statBox cells.
            /// </summary>
            public void ParseHtml()
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlCode);

                // Create an XmlWriterSettings object with the correct options.
                XmlWriterSettings settings = new XmlWriterSettings();
                settings.Indent = true;
                settings.IndentChars = "  ";
                settings.OmitXmlDeclaration = false;

                // The using block disposes (flushes and closes) the writer even if parsing throws.
                using (XmlWriter writer = XmlWriter.Create(@"..\..\" + fileName, settings))
                {
                    writer.WriteStartElement("Stats");
                    writer.WriteAttributeString("Date", DateTime.Now.ToShortDateString());

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection rows = doc.DocumentNode.SelectNodes(".//tr/tr");
                    if (rows != null)
                    {
                        foreach (HtmlNode row in rows)
                        {
                            HtmlNodeCollection cols = row.SelectNodes(".//td[@class='statBox']");
                            if (cols == null || cols.Count == 0)
                            {
                                // Row without any stat cells: nothing to emit.
                                continue;
                            }

                            WritePlayer(writer, cols);
                        }
                    }

                    writer.WriteEndElement();
                    writer.Flush();
                }
            }

            // Writes one Player element: cell 0 as the Rank attribute, the following cells as
            // child elements named after elementNames.
            private static void WritePlayer(XmlWriter writer, HtmlNodeCollection cols)
            {
                writer.WriteStartElement("Player");
                writer.WriteAttributeString("Rank", cols[0].InnerText.Trim());

                // A row may hold more or fewer cells than there are known names; pair only what exists.
                int count = Math.Min(cols.Count - 1, elementNames.Length);
                for (int k = 0; k < count; ++k)
                {
                    // Trim every cell: the markup pads each value with newlines and indentation.
                    writer.WriteElementString(elementNames[k], cols[k + 1].InnerText.Trim());
                }

                writer.WriteEndElement();
            }
        }
    }

它提供以下XML文件作為輸出:

    using System;
    using HtmlAgilityPack;
    using System.Xml;

    namespace HockeyStats
    {
        /// <summary>
        /// Parses hockey statistics out of an HTML table and writes them to an XML file,
        /// one Player element per table row.
        /// </summary>
        class StatsParser
        {
            // Child element names for cells 1..N of a row, in column order.
            // Cell 0 is written as the Rank attribute of the Player element instead.
            private static readonly string[] elementNames =
            {
                "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP",
                "SH", "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg"
            };

            private string htmlCode;

            // NOTE(review): ToShortDateString() is culture-dependent; under cultures whose short
            // date contains '/' the file name would include a path separator. Confirm the
            // target culture or switch to a fixed format.
            private static string fileName = "[" + DateTime.Now.ToShortDateString() + " NHL Stats].xml";

            /// <summary>
            /// Stores the HTML and immediately parses it, writing the XML file as a side effect.
            /// </summary>
            /// <param name="htmlCode">Raw HTML containing the statistics table rows.</param>
            public StatsParser(string htmlCode)
            {
                this.htmlCode = htmlCode;

                this.ParseHtml();
            }

            /// <summary>
            /// Loads the stored HTML, selects every nested table row (tr inside tr) and writes a
            /// Stats document with one Player element per row that contains statBox cells.
            /// </summary>
            public void ParseHtml()
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlCode);

                // Create an XmlWriterSettings object with the correct options.
                XmlWriterSettings settings = new XmlWriterSettings();
                settings.Indent = true;
                settings.IndentChars = "  ";
                settings.OmitXmlDeclaration = false;

                // The using block disposes (flushes and closes) the writer even if parsing throws.
                using (XmlWriter writer = XmlWriter.Create(@"..\..\" + fileName, settings))
                {
                    writer.WriteStartElement("Stats");
                    writer.WriteAttributeString("Date", DateTime.Now.ToShortDateString());

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection rows = doc.DocumentNode.SelectNodes(".//tr/tr");
                    if (rows != null)
                    {
                        foreach (HtmlNode row in rows)
                        {
                            HtmlNodeCollection cols = row.SelectNodes(".//td[@class='statBox']");
                            if (cols == null || cols.Count == 0)
                            {
                                // Row without any stat cells: nothing to emit.
                                continue;
                            }

                            WritePlayer(writer, cols);
                        }
                    }

                    writer.WriteEndElement();
                    writer.Flush();
                }
            }

            // Writes one Player element: cell 0 as the Rank attribute, the following cells as
            // child elements named after elementNames.
            private static void WritePlayer(XmlWriter writer, HtmlNodeCollection cols)
            {
                writer.WriteStartElement("Player");
                writer.WriteAttributeString("Rank", cols[0].InnerText.Trim());

                // A row may hold more or fewer cells than there are known names; pair only what exists.
                int count = Math.Min(cols.Count - 1, elementNames.Length);
                for (int k = 0; k < count; ++k)
                {
                    // Trim every cell: the markup pads each value with newlines and indentation.
                    writer.WriteElementString(elementNames[k], cols[k + 1].InnerText.Trim());
                }

                writer.WriteEndElement();
            }
        }
    }

熱門答案

我在評論中的意思是:你在代碼中(用嵌套循環)手動完成的工作,其實用正確的XPath就能替你完成。使用LINQ-to-XML還可以讓輸出部分的編寫更加簡單。不過既然現在我們看到了您希望的XML文件格式,我們就可以給出自己的答案。我會像這樣編寫ParseHtml()方法:

/// <summary>
/// Loads the stored HTML and saves a Stats XML document containing one Player element
/// per nested table row (tr inside tr) that has statBox cells.
/// </summary>
public void ParseHtml()
{
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlCode);

    // Names for the cells that follow the rank cell, in column order.
    var elementNames = new[] { "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP", "SH", "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg", "UnknownField" };

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var rows = htmlDoc.DocumentNode.SelectNodes(@"//tr/tr")
               ?? Enumerable.Empty<HtmlNode>();

    // Build the players row by row so cells of different rows are never mixed together.
    var players = rows
        .Select(row => row.SelectNodes(@"./td[@class='statBox']"))
        .Where(nodes => nodes != null && nodes.Count > 0)
        .Select(nodes => nodes.Select(node => node.InnerText.Trim()).ToList())
        .Select(cells =>
            new XElement("Player", new XAttribute("Rank", cells.First()),
                // generate the elements based on the parsed cells; empty cells are omitted
                cells.Skip(1)
                     .Zip(elementNames, (Value, Name) => new XElement(Name, Value))
                     .Where(element => !String.IsNullOrEmpty(element.Value))
            ));

    var xmlDoc =
        new XElement("Stats", new XAttribute("Date", DateTime.Now.ToShortDateString()),
            players
        );

    // save to your file
    xmlDoc.Save(filepath);
}

產生輸出:

/// <summary>
/// Loads the stored HTML and saves a Stats XML document containing one Player element
/// per nested table row (tr inside tr) that has statBox cells.
/// </summary>
public void ParseHtml()
{
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlCode);

    // Names for the cells that follow the rank cell, in column order.
    var elementNames = new[] { "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP", "SH", "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg", "UnknownField" };

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var rows = htmlDoc.DocumentNode.SelectNodes(@"//tr/tr")
               ?? Enumerable.Empty<HtmlNode>();

    // Build the players row by row so cells of different rows are never mixed together.
    var players = rows
        .Select(row => row.SelectNodes(@"./td[@class='statBox']"))
        .Where(nodes => nodes != null && nodes.Count > 0)
        .Select(nodes => nodes.Select(node => node.InnerText.Trim()).ToList())
        .Select(cells =>
            new XElement("Player", new XAttribute("Rank", cells.First()),
                // generate the elements based on the parsed cells; empty cells are omitted
                cells.Skip(1)
                     .Zip(elementNames, (Value, Name) => new XElement(Name, Value))
                     .Where(element => !String.IsNullOrEmpty(element.Value))
            ));

    var xmlDoc =
        new XElement("Stats", new XAttribute("Date", DateTime.Now.ToShortDateString()),
            players
        );

    // save to your file
    xmlDoc.Save(filepath);
}




許可下: CC-BY-SA
不隸屬於 Stack Overflow
這個KB合法嗎? 是的,了解原因