循環遍歷HTML Agility Pack中的多個HTML表

c# html html-agility-pack xpath

我按照下面的鏈接中的示例,並能夠成功地將HTML表解析為數據表。

http://blog.ditran.net/parsing-html-table-to-c-usable-datalist/

但是當頁面中有多個表時,我無法正確解析。遍歷 TR 時,每個表的第一個 TR 總是列名,其餘的 TR 則是該表的數據。所以我按照這個邏輯把表數據存儲在鍵值對列表中,再傳給我的 ToDataTable 函數。

有人可以告訴我如何遍歷多個表並實現相同的邏輯嗎?非常感謝。

// Fragment of the question's row-parsing loop (the snippet is cut off before the loop ends).
// The XPath "//tr" selects every <tr> in the whole document, so rows from all tables
// are walked as one flat sequence with no table boundary between them.
// NOTE(review): previousRowSpanList, theDict, isWorkingWithRowSpan, tCelCount, cell,
// ColInnerText and IsNullEntireRow are declared outside this snippet — their types are assumed.
var tRowList = doc.DocumentNode.SelectNodes("//tr");
foreach (HtmlNode tRow in tRowList)
                    {
                        // Resume a pending row (presumably one created for a rowspan cell — confirm)
                        // if one is queued; otherwise start a fresh list of key/value pairs.
                        if (previousRowSpanList.Count > 0)
                        {
                            theDict = previousRowSpanList[0];
                            previousRowSpanList.Remove(theDict);        //remove it off the list
                            isWorkingWithRowSpan = true;
                        }
                        else
                        {
                            theDict = new List<KeyValuePair<string, string>>();
                            isWorkingWithRowSpan = false;
                        }
                        // Header (th) and data (td) cells are both collected.
                        // NOTE(review): SelectNodes may return null when nothing matches, which would
                        // make the .Count access below throw — confirm against HtmlAgilityPack docs.
                        var tCellList = tRow.SelectNodes("td|th");
                        tCelCount = tCellList.Count;
                        // Skip rows with no cells, and rows whose single cell is empty after trimming.
                        if (tCelCount > 0 &&
                        !(tCelCount == 1 && string.IsNullOrEmpty(tCellList[0].InnerText.Trim()))
                        )
                        {
                            //colOrder = 1;
                            // Assume the row is blank until a cell with text is found.
                            IsNullEntireRow = true;
                            for (int colIndex = 0; colIndex < tCelCount; colIndex++)
                            {
                                cell = tCellList[colIndex];
                                // Turn the &nbsp; entity into a plain space before trimming.
                                ColInnerText = cell.InnerText.Replace("&nbsp;", " ").Trim();
                                if (!string.IsNullOrEmpty(ColInnerText))
                                    IsNullEntireRow = false;

//

 /// <summary>
 /// Builds a <see cref="DataTable"/> from parsed rows. The first row supplies the
 /// column names; every following row becomes a data row.
 /// </summary>
 /// <param name="list">
 /// Parsed rows, each a list of pairs whose <c>Value</c> holds the cell text.
 /// May be null or empty. The list itself is not modified.
 /// </param>
 /// <returns>The populated table, or an empty table when there are no rows.</returns>
 static DataTable ToDataTable(List<List<KeyValuePair<string, string>>> list)
 {
     DataTable result = new DataTable();
     if (list == null || list.Count == 0)
     {
         return result;
     }

     // The first row is the header: one column per cell value.
     result.Columns.AddRange(
         list[0].Select(r => new DataColumn(r.Value)).ToArray());

     foreach (var row in list.Skip(1))
     {
         object[] values = row.Select(c => (object)c.Value).ToArray();

         // Rows.Add throws when given more values than there are columns, so a row
         // wider than the header gets extra auto-named columns instead of crashing.
         while (result.Columns.Count < values.Length)
         {
             result.Columns.Add();
         }

         result.Rows.Add(values);
     }

     return result;
 }

示例HTML:

<table>
<tbody>
<tr><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Node</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Logtime</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Hardware</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Prcstate A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Prcstate B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Cluster</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">RAID</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">AD replication A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">AD replication B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">File replication A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">File replication B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">hcstart RESULT</td></tr>
<tr><td class="center">DTMSCB1</td><td class="center">2016-08-26 16:40</td><td class="center">APG43L</td><td class="center">active</td><td class="center">passive</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">Not OK</td></tr>
<tr><td class="center">MSC9</td><td class="center">2016-08-26 16:40</td><td class="center">APG40C/4</td><td class="center">passive</td><td class="center">active</td><td class="center">OK</td><td class="center">OK</td><td class="center">OK</td><td class="center">OK</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">Not OK</td><td class="center">OK</td><td class="center">-</td></tr>
</tbody>
</table>


<table>
<tbody>
<tr><td style="background-color:#A9F5A9;" class="center">Node Type</td><td style="background-color:#A9F5A9;" class="center">Node</td><td style="background-color:#A9F5A9;" class="center">Log Time</td><td style="background-color:#A9F5A9;" class="center">New Mon. Alarms</td><td style="background-color:#A9F5A9;" class="center">Mon. Alarms Total</td><td style="background-color:#A9F5A9;" class="center">Other Alarms</td><td style="background-color:#A9F5A9;" class="center">MML</td></tr>
<tr><td class="center">BSC</td><td class="center">BMBSC1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">46</td><td class="center">445</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">BMBSC2C</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">27</td><td class="center">609</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">CYBSC1</td><td class="center">2016-08-26 16:45</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">1</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">45</td><td class="center">665</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">CYBSC2C</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">30</td><td class="center">849</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-BC</td><td class="center">CYMSCB1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">38</td><td class="center">283</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">DTBSC1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">48</td><td class="center">201</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">DTBSC2</td><td class="center">2016-08-26 16:45</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">1</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">31</td><td class="center">310</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-BC</td><td class="center">DTMSCB1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">25</td><td class="center">130</td><td class="center">OK</td></tr>
<tr><td class="center">HLR</td><td class="center">HLR1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">16</td><td class="center">12</td><td class="center">OK</td></tr>
<tr><td class="center">HLR</td><td class="center">HLR2</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">24</td><td class="center">10</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-S</td><td class="center">MSC10</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">48</td><td class="center">79</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-S</td><td class="center">MSC9</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">46</td><td class="center">131</td><td class="center">OK</td></tr>
</tbody>
</table>

已採納的答案

我將保留第一個答案以供參考,但下面是一個將原始html拆分為字符串數組的方法,每個字符串元素包含一個表的HTML:

/// <summary>
/// Splits an HTML string into one HTML fragment per table element.
/// </summary>
/// <param name="htmlString">HTML to scan; may be null, empty or whitespace.</param>
/// <returns>
/// The outer HTML of every node matched by the XPath <c>//table</c>, or an
/// empty array when the input is blank or no table is found.
/// </returns>
public static string[] ParseHtmlSplitTables(string htmlString)
{
    // Blank input: nothing to parse.
    if (String.IsNullOrWhiteSpace(htmlString))
    {
        return new string[] { };
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(htmlString);

    var tables = document.DocumentNode.SelectNodes("//table");
    if (tables == null)
    {
        return new string[] { };
    }

    // Copy each table's markup into its own array slot.
    string[] fragments = new string[tables.Count];
    for (int i = 0; i < fragments.Length; i++)
    {
        fragments[i] = tables[i].OuterHtml;
    }

    return fragments;
}

然後,您可以繼續解析每個表:

// Split the page into one HTML fragment per table.
string[] htmlTables = ParseHtmlSplitTables(htmlString);

// Run the single-table pipeline on each fragment.
foreach (string html in htmlTables)
{
    // NOTE(review): ParseHtmlToDataTable is not shown here — presumably the question's row-parsing routine; confirm.
    List<List<KeyValuePair<string, string>>> parseResult = ParseHtmlToDataTable(html);

    // dataTable is scoped to this iteration; store or use it here if it is needed afterwards.
    DataTable dataTable = ToDataTable(parseResult);
}

熱門答案

由於您要解析多個 HTML 表,應該返回一個 DataSet,其中每個 HTML 表對應一個 DataTable。如果存在表頭(th),下面的代碼會把表頭文字作為對應 DataTable 的列名。HTML 表的 id 會用作 DataTable 的名稱,您可以用該名稱直接從 DataSet 中取得對應的表:

將html表轉換為DataSet

/// <summary>
/// Converts every table in an HTML string into a <see cref="DataTable"/> and
/// returns them together in one <see cref="DataSet"/>.
/// </summary>
/// <param name="html">The HTML markup to parse.</param>
/// <returns>
/// A data set with one table per HTML table, each named after the table's id.
/// The data set is empty when the markup contains no table.
/// </returns>
/// <remarks>
/// <c>th</c> cells become column names and each row's <c>td</c> cells become a
/// data row. Rows are read from directly under the table and from its
/// <c>thead</c>, <c>tbody</c> and <c>tfoot</c> children, so rows of nested
/// tables are not mixed into the outer table.
/// </remarks>
public static DataSet HtmlTablesToDataset(string html)
{
    var resultDataset = new DataSet();

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var tables = doc.DocumentNode.SelectNodes("//table");
    if (tables == null)
    {
        return resultDataset;
    }

    foreach (HtmlNode table in tables)
    {
        var resultTable = new DataTable(table.Id);

        // Rows are often wrapped in thead/tbody/tfoot, so look one level deeper
        // as well as directly under the table.
        var rows = table.SelectNodes("tr|thead/tr|tbody/tr|tfoot/tr");
        if (rows != null)
        {
            foreach (HtmlNode row in rows)
            {
                var headerCells = row.SelectNodes("th");
                if (headerCells != null)
                {
                    foreach (HtmlNode cell in headerCells)
                    {
                        resultTable.Columns.Add(cell.InnerText);
                    }
                }

                var dataCells = row.SelectNodes("td");
                if (dataCells != null)
                {
                    // A row may be wider than the header, or there may be no th
                    // header at all. Add auto-named columns first so the indexer
                    // below cannot go out of range.
                    while (resultTable.Columns.Count < dataCells.Count)
                    {
                        resultTable.Columns.Add();
                    }

                    var dataRow = resultTable.NewRow();
                    for (int i = 0; i < dataCells.Count; i++)
                    {
                        dataRow[i] = dataCells[i].InnerText;
                    }

                    resultTable.Rows.Add(dataRow);
                }
            }
        }

        resultDataset.Tables.Add(resultTable);
    }

    return resultDataset;
}

測試代碼:

// Parse the sample markup and dump every resulting table to the console.
var resultDS = HtmlTablesToDataset(html);

foreach (DataTable dt in resultDS.Tables)
{
    Console.WriteLine("Table: " + dt.TableName);

    // Header line: column names separated by single spaces.
    var headerParts = new string[dt.Columns.Count];
    for (int c = 0; c < headerParts.Length; c++)
    {
        headerParts[c] = dt.Columns[c].ToString();
    }
    Console.WriteLine(string.Join(" ", headerParts).Trim());

    // One line per data row, values in column order.
    foreach (DataRow row in dt.Rows)
    {
        Console.WriteLine(string.Join(" ", row.ItemArray).Trim());
    }
}

示例HTML:

// Sample input for the test code: a page with two tables (ids 't1' and 't2'),
// each with one <th> header row (Col1, Col2) followed by two <td> data rows.
string html =
@"
<html>
    <head>
        <title>Test</title>
    </head>
    <body>
        <table id='t1'>
            <tr>
                <th>Col1</th>
                <th>Col2</th>
            </tr>
            <tr>
                <td>1</td>
                <td>2</td>
            </tr>
            <tr>
                <td>3</td>
                <td>4</td>
            </tr>
        </table>
        <table id='t2'>
            <tr>
                <th>Col1</th>
                <th>Col2</th>
            </tr>
            <tr>
                <td>5</td>
                <td>6</td>
            </tr>
            <tr>
                <td>7</td>
                <td>8</td>
            </tr>        
        </table>
    </body>
</html>                
";


Related

許可下: CC-BY-SA with attribution
不隸屬於 Stack Overflow
這個KB合法嗎? 是的,了解原因
許可下: CC-BY-SA with attribution
不隸屬於 Stack Overflow
這個KB合法嗎? 是的,了解原因