link extraction using HtmlAgilityPack and c#

c# html html-agility-pack search-engine


I want to extract Google search result links.
My code works and does extract links, but they are not the links I expected. The program extracts every link found in an "a href" tag, but not all links on the results page are appropriate: ad links and Google's own links are included as well. What should I do?

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.ServiceModel.Syndication;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Xml;

namespace Search
{
    /// <summary>
    /// Form that runs a Google search for the keywords typed into
    /// <c>txtKeyWords</c> and lists the extracted result links in <c>listBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        // Base address of the search page; the URL-encoded keywords are appended to it.
        // NOTE(review): the original literal was lost from the posted code — confirm the endpoint.
        private const string SearchUrlPrefix = "https://www.google.com/search?q=";

        // Prefix of the redirect hrefs this form extracts targets from.
        // NOTE(review): presumably ad links and Google's own navigation links do not
        // start with this prefix, which is what makes it usable as a filter — verify
        // against the current result markup.
        private const string ResultHrefPrefix = "/url?q=";

        // load snippet
        HtmlAgilityPack.HtmlDocument htmlSnippet = new HtmlAgilityPack.HtmlDocument();

        public Form1()
        {
            // Designer-generated initialization of the controls used below
            // (txtKeyWords, listBox1) — assumed to live in the other partial file.
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the search result page for the entered keywords and adds the
        /// target URL of every result link to <c>listBox1</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void btn1_Click(object sender, EventArgs e)
        {
            // Encode the keywords so spaces, '&', '#', etc. do not break the query string.
            string searchUrl = SearchUrlPrefix + Uri.EscapeDataString(txtKeyWords.Text.Trim());

            string pageHtml;
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(searchUrl);

            // Dispose the response, stream and reader deterministically, and read the
            // whole body through a StreamReader so multi-byte characters that straddle
            // a buffer boundary are decoded correctly.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(resStream, Encoding.UTF8))
            {
                pageHtml = reader.ReadToEnd();
            }

            HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
            html.OptionOutputAsXml = true;
            // The downloaded markup must actually be loaded before it can be queried.
            html.LoadHtml(pageHtml);
            HtmlNode doc = html.DocumentNode;

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection links = doc.SelectNodes("//a[@href]");
            if (links == null)
            {
                return;
            }

            foreach (HtmlNode link in links)
            {
                string hrefValue = link.GetAttributeValue("href", string.Empty);

                string resultUrl;
                if (TryGetResultUrl(hrefValue, out resultUrl))
                {
                    listBox1.Items.Add(resultUrl);
                }
            }
        }

        /// <summary>
        /// Extracts the target URL from a redirect href of the form
        /// <c>/url?q=TARGET&amp;other=params</c>.
        /// </summary>
        /// <param name="hrefValue">Raw value of an anchor's <c>href</c> attribute.</param>
        /// <param name="resultUrl">
        /// The decoded target URL when the method returns <c>true</c>; otherwise <c>null</c>.
        /// </param>
        /// <returns>
        /// <c>true</c> if <paramref name="hrefValue"/> starts with the redirect prefix and
        /// carries a non-empty target; otherwise <c>false</c>.
        /// </returns>
        private static bool TryGetResultUrl(string hrefValue, out string resultUrl)
        {
            resultUrl = null;

            // This is the condition the filter needs: keep only redirect hrefs.
            if (string.IsNullOrEmpty(hrefValue) ||
                !hrefValue.StartsWith(ResultHrefPrefix, StringComparison.Ordinal))
            {
                return false;
            }

            string target = hrefValue.Substring(ResultHrefPrefix.Length);

            // Everything after the first '&' is extra query parameters, not the target.
            int index = target.IndexOf('&');
            if (index >= 0)
            {
                target = target.Substring(0, index);
            }

            if (target.Length == 0)
            {
                return false;
            }

            // The target is percent-encoded inside the query string.
            resultUrl = Uri.UnescapeDataString(target);
            return true;
        }
    }
}


If I want to keep working with the "a href" tags, I have to add some condition to the `if` statement,
but I don't know what condition I should use here:

if ()

I read somewhere about extracting the cite tags instead of the "a href" tags.
Can anybody help?

5/30/2016 8:01:55 PM

Accepted Answer

To get the links that are contained in the cite elements, simply access their inner text, like:

    // Download and parse the search result page.
    // NOTE(review): the original URL was lost from the posted snippet — substitute
    // the real search URL (with URL-encoded keywords) here.
    HtmlWeb w = new HtmlWeb();
    var hd = w.Load("https://www.google.com/search?q=your+keywords");

    // Select every <cite> element in the document.
    var cites = hd.DocumentNode.SelectNodes("//cite");

    // SelectNodes returns null (not an empty collection) when nothing matches.
    if (cites != null)
    {
        foreach (var cite in cites)
        {
            // Read the element's inner text, as described above.
            Console.WriteLine(cite.InnerText);
        }
    }
5/30/2016 10:48:20 AM

Related Questions


Licensed under: CC-BY-SA with attribution
Not affiliated with Stack Overflow
Licensed under: CC-BY-SA with attribution
Not affiliated with Stack Overflow