Parcourir plusieurs tables HTML avec HTML Agility Pack

c# html html-agility-pack xpath

Question

J'ai suivi l'exemple du lien ci-dessous et j'ai réussi à analyser un tableau HTML avec succès.

http://blog.ditran.net/parsing-html-table-to-c-usable-datalist/

Mais je n'arrive pas à analyser plusieurs tables. Lorsque je parcours les TR, le premier TR de chaque table contient toujours les noms des colonnes et les TR suivants contiennent les lignes de données. Ci-dessous, la boucle de parcours puis la fonction ToDataTable.

Quelqu'un peut-il m'aider à comprendre comment parcourir plusieurs tables et appliquer la même logique à chacune ? Merci d'avance.

// Walks the table rows of the loaded document and starts collecting the text of each cell.
// "//tr" selects every <tr> in the document, so rows from all tables arrive in one flat list.
// NOTE(review): this snippet is truncated; the rest of the loop body is not shown here.
var tRowList = doc.DocumentNode.SelectNodes("//tr");
foreach (HtmlNode tRow in tRowList)
                    {
                        // If an entry is pending in previousRowSpanList, continue with it instead of
                        // starting a fresh row (presumably queued by an earlier rowspan cell — confirm).
                        if (previousRowSpanList.Count > 0)
                        {
                            theDict = previousRowSpanList[0];
                            previousRowSpanList.Remove(theDict);        //remove it off the list
                            isWorkingWithRowSpan = true;
                        }
                        else
                        {
                            // Nothing pending: start an empty key/value list for this row.
                            theDict = new List<KeyValuePair<string, string>>();
                            isWorkingWithRowSpan = false;
                        }
                        // Header (<th>) and data (<td>) cells are collected together.
                        // NOTE(review): SelectNodes appears to return null when nothing matches (other
                        // code on this page null-checks it), so .Count may throw for a cell-less row — confirm.
                        var tCellList = tRow.SelectNodes("td|th");
                        tCelCount = tCellList.Count;
                        // Skip rows that consist of a single cell whose trimmed text is empty.
                        if (tCelCount > 0 &&
                        !(tCelCount == 1 && string.IsNullOrEmpty(tCellList[0].InnerText.Trim()))
                        )
                        {
                            //colOrder = 1;
                            // Assume the row is empty until a cell with text is found.
                            IsNullEntireRow = true;
                            for (int colIndex = 0; colIndex < tCelCount; colIndex++)
                            {
                                cell = tCellList[colIndex];
                                // Replace the literal "&nbsp;" entity text with a space, then trim.
                                ColInnerText = cell.InnerText.Replace("&nbsp;", " ").Trim();
                                if (!string.IsNullOrEmpty(ColInnerText))
                                    IsNullEntireRow = false;

//

 /// <summary>
 /// Builds a <see cref="DataTable"/> from parsed rows: the values of the first row
 /// become the column names and every following row becomes a data row.
 /// </summary>
 /// <param name="list">Parsed rows; each row is a list of key/value pairs of which only the values are used.</param>
 /// <returns>The populated table, or an empty table when <paramref name="list"/> is null or has no rows.</returns>
 static DataTable ToDataTable(List<List<KeyValuePair<string, string>>> list)
        {
            DataTable result = new DataTable();
            if (list == null || list.Count == 0)
                return result;

            // Header row: one column per value. A repeated header text would make
            // Columns.Add throw DuplicateNameException, so later duplicates get a
            // numeric suffix ("Node", "Node_1", ...). Null/empty names are passed
            // through so the DataTable assigns its own default column name.
            foreach (KeyValuePair<string, string> header in list[0])
            {
                string columnName = header.Value;
                int suffix = 1;
                while (!string.IsNullOrEmpty(columnName) && result.Columns.Contains(columnName))
                {
                    columnName = header.Value + "_" + suffix;
                    suffix++;
                }

                result.Columns.Add(new DataColumn(columnName));
            }

            // Data rows: Rows.Add rejects an array longer than the column count,
            // so surplus cells are dropped; shorter rows leave the remaining columns unset.
            int columnCount = result.Columns.Count;
            foreach (List<KeyValuePair<string, string>> row in list.Skip(1))
            {
                object[] values = row.Take(columnCount).Select(c => (object)c.Value).ToArray();
                result.Rows.Add(values);
            }

            return result;
        }

exemple HTML:

<table>
<tbody>
<tr><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Node</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Logtime</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Hardware</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Prcstate A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Prcstate B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Cluster</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">RAID</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">AD replication A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">AD replication B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">File replication A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">File replication B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">hcstart RESULT</td></tr>
<tr><td class="center">DTMSCB1</td><td class="center">2016-08-26 16:40</td><td class="center">APG43L</td><td class="center">active</td><td class="center">passive</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">Not OK</td></tr>
<tr><td class="center">MSC9</td><td class="center">2016-08-26 16:40</td><td class="center">APG40C/4</td><td class="center">passive</td><td class="center">active</td><td class="center">OK</td><td class="center">OK</td><td class="center">OK</td><td class="center">OK</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">Not OK</td><td class="center">OK</td><td class="center">-</td></tr>
</tbody>
</table>


<table>
<tbody>
<tr><td style="background-color:#A9F5A9;" class="center">Node Type</td><td style="background-color:#A9F5A9;" class="center">Node</td><td style="background-color:#A9F5A9;" class="center">Log Time</td><td style="background-color:#A9F5A9;" class="center">New Mon. Alarms</td><td style="background-color:#A9F5A9;" class="center">Mon. Alarms Total</td><td style="background-color:#A9F5A9;" class="center">Other Alarms</td><td style="background-color:#A9F5A9;" class="center">MML</td></tr>
<tr><td class="center">BSC</td><td class="center">BMBSC1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">46</td><td class="center">445</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">BMBSC2C</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">27</td><td class="center">609</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">CYBSC1</td><td class="center">2016-08-26 16:45</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">1</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">45</td><td class="center">665</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">CYBSC2C</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">30</td><td class="center">849</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-BC</td><td class="center">CYMSCB1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">38</td><td class="center">283</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">DTBSC1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">48</td><td class="center">201</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">DTBSC2</td><td class="center">2016-08-26 16:45</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">1</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">31</td><td class="center">310</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-BC</td><td class="center">DTMSCB1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">25</td><td class="center">130</td><td class="center">OK</td></tr>
<tr><td class="center">HLR</td><td class="center">HLR1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">16</td><td class="center">12</td><td class="center">OK</td></tr>
<tr><td class="center">HLR</td><td class="center">HLR2</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">24</td><td class="center">10</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-S</td><td class="center">MSC10</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">48</td><td class="center">79</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-S</td><td class="center">MSC9</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">46</td><td class="center">131</td><td class="center">OK</td></tr>
</tbody>
</table>

Réponse acceptée

Je conserve la première réponse pour référence, mais voici une méthode qui découpe le code HTML d'origine en un tableau de chaînes (string[]), chaque élément contenant le code HTML d'une seule table :

/// <summary>
/// Splits an HTML string into one string per table element found in it.
/// </summary>
/// <param name="htmlString">The HTML to scan; may be null, empty or whitespace.</param>
/// <returns>
/// The outer HTML of every node matched by "//table", or an empty array when the
/// input is blank or no table is matched.
/// </returns>
public static string[] ParseHtmlSplitTables(string htmlString)
{
    // Blank input: nothing to load.
    if (String.IsNullOrWhiteSpace(htmlString))
    {
        return new string[] { };
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(htmlString);

    var matchedTables = document.DocumentNode.SelectNodes("//table");

    // The node collection is null when the query matched nothing.
    return matchedTables == null
        ? new string[] { }
        : matchedTables.Select(tableNode => tableNode.OuterHtml).ToArray();
}

Avec le résultat, vous pouvez ensuite analyser chaque table:

// One HTML fragment per table, each converted to its own DataTable.
string[] htmlTables = ParseHtmlSplitTables(htmlString);

for (int tableIndex = 0; tableIndex < htmlTables.Length; tableIndex++)
{
    // Parse a single table's HTML into rows of key/value pairs, then build the DataTable.
    var parsedRows = ParseHtmlToDataTable(htmlTables[tableIndex]);

    DataTable dataTable = ToDataTable(parsedRows);
}

Réponse populaire

Étant donné que vous souhaitez analyser plusieurs tables HTML, vous devriez retourner un DataSet contenant un DataTable par table HTML. Si des en-têtes de table sont présents, le code ci-dessous ajoute les noms de colonnes au DataTable correspondant. L'identifiant (id) de la table HTML sert de nom au DataTable, ce qui permet d'y accéder directement par ce nom depuis le DataSet :

Méthode pour convertir les tables html en un DataSet :

/// <summary>
/// Converts every table element in an HTML string into a <see cref="DataTable"/>
/// and returns them together in a <see cref="DataSet"/>.
/// </summary>
/// <param name="html">The HTML markup to parse.</param>
/// <returns>
/// A data set with one table per table element, named after the element's id.
/// The data set is empty when the markup contains no table.
/// </returns>
public static DataSet HtmlTablesToDataset(string html)
{
    var resultDataset = new DataSet();

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var tableNodes = doc.DocumentNode.SelectNodes("//table");
    if (tableNodes == null)
    {
        return resultDataset;
    }

    foreach (HtmlNode table in tableNodes)
    {
        var resultTable = new DataTable(table.Id);

        // A DataSet rejects two tables with the same name; when an id repeats,
        // clear the name so the DataSet assigns a default one on Add.
        if (resultTable.TableName.Length > 0 && resultDataset.Tables.Contains(resultTable.TableName))
        {
            resultTable.TableName = string.Empty;
        }

        // Rows may be direct children of the table or wrapped in thead/tbody/tfoot;
        // only this table's own rows are taken, so nested tables are not mixed in.
        var rows = table.SelectNodes("tr|thead/tr|tbody/tr|tfoot/tr");
        if (rows != null)
        {
            foreach (HtmlNode row in rows)
            {
                var headerCells = row.SelectNodes("th");
                if (headerCells != null)
                {
                    foreach (HtmlNode cell in headerCells)
                    {
                        resultTable.Columns.Add(cell.InnerText);
                    }
                }

                var dataCells = row.SelectNodes("td");
                if (dataCells != null)
                {
                    // Tables without <th> headers (or with wider data rows) would index
                    // past the last column; add default-named columns as needed.
                    while (resultTable.Columns.Count < dataCells.Count)
                    {
                        resultTable.Columns.Add();
                    }

                    var dataRow = resultTable.NewRow();
                    for (int i = 0; i < dataCells.Count; i++)
                    {
                        dataRow[i] = dataCells[i].InnerText;
                    }

                    resultTable.Rows.Add(dataRow);
                }
            }
        }

        resultDataset.Tables.Add(resultTable);
    }

    return resultDataset;
}

Code de test:

var resultDS = HtmlTablesToDataset(html);

// Print each table: its name, one line with the column names, then one line per data row.
foreach (DataTable dt in resultDS.Tables)
{
    Console.WriteLine("Table: " + dt.TableName);

    int columnCount = dt.Columns.Count;

    // Column names separated by single spaces; Trim drops the trailing separator.
    string headerLine = "";
    for (int c = 0; c < columnCount; c++)
    {
        headerLine += dt.Columns[c].ToString() + " ";
    }
    Console.WriteLine(headerLine.Trim());

    foreach (DataRow row in dt.Rows)
    {
        string rowLine = "";
        for (int c = 0; c < columnCount; c++)
        {
            rowLine += row[c].ToString() + " ";
        }

        Console.WriteLine(rowLine.Trim());
    }
}

Exemple HTML:

// Sample input: a page with two tables (ids 't1' and 't2'), each with a <th> header row
// followed by two <td> data rows.
string html =
@"
<html>
    <head>
        <title>Test</title>
    </head>
    <body>
        <table id='t1'>
            <tr>
                <th>Col1</th>
                <th>Col2</th>
            </tr>
            <tr>
                <td>1</td>
                <td>2</td>
            </tr>
            <tr>
                <td>3</td>
                <td>4</td>
            </tr>
        </table>
        <table id='t2'>
            <tr>
                <th>Col1</th>
                <th>Col2</th>
            </tr>
            <tr>
                <td>5</td>
                <td>6</td>
            </tr>
            <tr>
                <td>7</td>
                <td>8</td>
            </tr>        
        </table>
    </body>
</html>                
";


Related

Sous licence: CC-BY-SA with attribution
Non affilié à Stack Overflow
Est-ce KB légal? Oui, apprenez pourquoi
Sous licence: CC-BY-SA with attribution
Non affilié à Stack Overflow
Est-ce KB légal? Oui, apprenez pourquoi