如何通过HtmlAgilityPack获取tr链接和内容?

c# html-agility-pack html-parsing parsing

我使用以下代码获取指定URL的HTML源代码,以便交给HtmlAgilityPack解析:

/// <summary>
/// Requests <paramref name="urlAddress"/> and returns the response body as text.
/// </summary>
/// <param name="urlAddress">URL passed to <c>WebRequest.Create</c>.</param>
/// <returns>
/// The decoded response body when the status code is <c>HttpStatusCode.OK</c>;
/// otherwise an empty string.
/// </returns>
private string GetUrlSource(string urlAddress)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);

    // The using blocks release the response, stream and reader on every path,
    // including a non-OK status or an exception while reading.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return string.Empty;
        }

        // Resolve the advertised charset; leave it null to let StreamReader
        // use its own default when the charset is missing or not recognised.
        Encoding encoding = null;
        if (!string.IsNullOrEmpty(response.CharacterSet))
        {
            try
            {
                encoding = Encoding.GetEncoding(response.CharacterSet);
            }
            catch (ArgumentException)
            {
                // Deliberate fallback: an unknown charset name must not abort the download.
            }
        }

        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader readStream = encoding == null
            ? new StreamReader(receiveStream)
            : new StreamReader(receiveStream, encoding))
        {
            return readStream.ReadToEnd();
        }
    }
}

然后,使用此代码获取数据:

/// <summary>
/// Requests <paramref name="urlAddress"/> and returns the response body as text.
/// </summary>
/// <param name="urlAddress">URL passed to <c>WebRequest.Create</c>.</param>
/// <returns>
/// The decoded response body when the status code is <c>HttpStatusCode.OK</c>;
/// otherwise an empty string.
/// </returns>
private string GetUrlSource(string urlAddress)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);

    // The using blocks release the response, stream and reader on every path,
    // including a non-OK status or an exception while reading.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return string.Empty;
        }

        // Resolve the advertised charset; leave it null to let StreamReader
        // use its own default when the charset is missing or not recognised.
        Encoding encoding = null;
        if (!string.IsNullOrEmpty(response.CharacterSet))
        {
            try
            {
                encoding = Encoding.GetEncoding(response.CharacterSet);
            }
            catch (ArgumentException)
            {
                // Deliberate fallback: an unknown charset name must not abort the download.
            }
        }

        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader readStream = encoding == null
            ? new StreamReader(receiveStream)
            : new StreamReader(receiveStream, encoding))
        {
            return readStream.ReadToEnd();
        }
    }
}

我的节点结果是:

/// <summary>
/// Requests <paramref name="urlAddress"/> and returns the response body as text.
/// </summary>
/// <param name="urlAddress">URL passed to <c>WebRequest.Create</c>.</param>
/// <returns>
/// The decoded response body when the status code is <c>HttpStatusCode.OK</c>;
/// otherwise an empty string.
/// </returns>
private string GetUrlSource(string urlAddress)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);

    // The using blocks release the response, stream and reader on every path,
    // including a non-OK status or an exception while reading.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return string.Empty;
        }

        // Resolve the advertised charset; leave it null to let StreamReader
        // use its own default when the charset is missing or not recognised.
        Encoding encoding = null;
        if (!string.IsNullOrEmpty(response.CharacterSet))
        {
            try
            {
                encoding = Encoding.GetEncoding(response.CharacterSet);
            }
            catch (ArgumentException)
            {
                // Deliberate fallback: an unknown charset name must not abort the download.
            }
        }

        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader readStream = encoding == null
            ? new StreamReader(receiveStream)
            : new StreamReader(receiveStream, encoding))
        {
            return readStream.ReadToEnd();
        }
    }
}

我使用此代码获取数据:

/// <summary>
/// Requests <paramref name="urlAddress"/> and returns the response body as text.
/// </summary>
/// <param name="urlAddress">URL passed to <c>WebRequest.Create</c>.</param>
/// <returns>
/// The decoded response body when the status code is <c>HttpStatusCode.OK</c>;
/// otherwise an empty string.
/// </returns>
private string GetUrlSource(string urlAddress)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);

    // The using blocks release the response, stream and reader on every path,
    // including a non-OK status or an exception while reading.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return string.Empty;
        }

        // Resolve the advertised charset; leave it null to let StreamReader
        // use its own default when the charset is missing or not recognised.
        Encoding encoding = null;
        if (!string.IsNullOrEmpty(response.CharacterSet))
        {
            try
            {
                encoding = Encoding.GetEncoding(response.CharacterSet);
            }
            catch (ArgumentException)
            {
                // Deliberate fallback: an unknown charset name must not abort the download.
            }
        }

        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader readStream = encoding == null
            ? new StreamReader(receiveStream)
            : new StreamReader(receiveStream, encoding))
        {
            return readStream.ReadToEnd();
        }
    }
}

如果不使用 if/else,我该如何获取每个 tr 标签中的链接、标题和发布者?例如:
http://link1.com , Title1 , Publisher1
http://link2.com , Title2 , Publisher2
http://link3.com , Title3 , Publisher3

已采纳的答案

许多可能的方法之一:

// Select every <tr> under the 'linear-view' table that has at least one <td> child
// (rows without <td> are not matched by this XPath).
var tr = doc.DocumentNode.SelectNodes("//div[@class='linear-view']/table/tr[td]");

// SelectNodes returns null rather than an empty collection when nothing matches,
// so guard before iterating.
if (tr != null)
{
    foreach (HtmlNode node in tr)
    {
        // <td> having a child <a> (or by index: ./td[1])
        var td1 = node.SelectSingleNode("./td[a]");
        // <td> not having a child <a> (or by index: ./td[2])
        var td2 = node.SelectSingleNode("./td[not(a)]");
        if (td1 == null || td2 == null)
        {
            // Row does not have the expected link cell + publisher cell shape.
            continue;
        }

        // Select the <a> explicitly: FirstChild may be a whitespace text node
        // when the markup is indented, which has no href attribute.
        var anchor = td1.SelectSingleNode("./a");
        var link = anchor.GetAttributeValue("href", string.Empty);
        var title = td1.InnerText;
        var publisher = td2.InnerText;
    }
}



许可下: CC-BY-SA with attribution
不隶属于 Stack Overflow
这个KB合法吗? 是的,了解原因
许可下: CC-BY-SA with attribution
不隶属于 Stack Overflow
这个KB合法吗? 是的,了解原因