1

How can I parse Google results as in my example?

<div class="srg">
<li class="g">...</li>
<li class="g">...</li>
<li class="g">...</li>
<li class="g">...</li>
<li class="g">...</li>
<li class="g">...</li>
</div>

This is my code for parsing Google results; selectNodes remains null.

// Downloads a page and enumerates every <li class="g"> element found in it.
HtmlAgilityPack.HtmlDocument doc1 = new HtmlAgilityPack.HtmlDocument();

// NOTE: everything after '#' in a URL is a fragment. Fragments are handled on the client
// and are not transmitted in the HTTP request, so this call fetches the Google home page
// rather than a results page for the query. That is the most likely reason no
// <li class="g"> nodes are found.
// Both the response and the reader hold the network stream, so dispose them deterministically.
using (WebResponse response = WebRequest.Create("http://www.google.com/?gws_rd=ssl#q=(404)8271500").GetResponse())
using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default)) //put your encoding
{
    doc1.Load(reader);
}

// SelectNodes returns null (not an empty collection) when the XPath matches nothing,
// so guard before iterating to avoid a NullReferenceException.
var selectNodes = doc1.DocumentNode.SelectNodes("//li[@class='g']");
if (selectNodes != null)
{
    foreach (var node in selectNodes)
    {
        //node.InnerText will give you the text content of the li tags ...
    }
}
aloisdg
  • 22,270
  • 6
  • 85
  • 105
  • What do you want? The title? The url? The description? – aloisdg Jan 14 '15 at 11:24
  • The title and the description. – Zion Zipris Zafrir Jan 14 '15 at 12:01
  • 1
    Check out http://scraping.compunect.com and go to the "Google Search scraper". There is an open source PHP project which parses Google; it's not C#, but it also uses a DOM class to go through the layout, so you can look at how it is done there. Be prepared to update your code from time to time, as Google layouts do not always stay the same. – John Apr 30 '15 at 18:57

2 Answers

1

Sample code:

        // Sample markup standing in for the downloaded results page.
        string result = @"<div class=""srg"">
                        <li class=""g"">...</li>
                        <li class=""g"">...</li>
                        <li class=""g"">...</li>
                        <li class=""g"">...</li>
                        <li class=""g"">...</li>
                        <li class=""g"">...</li>
                        </div>";

        // Parse the markup from the string instead of from a stream.
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(result);

        // SelectNodes returns null (not an empty collection) when the XPath matches nothing,
        // so guard before iterating to avoid a NullReferenceException.
        var selectNodes = doc.DocumentNode.SelectNodes("//li[@class='g']");
        if (selectNodes != null)
        {
            foreach (var node in selectNodes)
            {
                //node.InnerText will give you the text content of the li tags ...
            }
        }
TH Todorov
  • 1,129
  • 11
  • 26
0

Why not use the API?

string query = "(404)8271500";
string json;

// Get the JSON from the API. Don't forget to mark the enclosing method as async.
// You need HttpClient https://www.nuget.org/packages/Microsoft.Net.Http
// NOTE(review): Google has deprecated this AJAX Search endpoint; confirm it is still
// reachable before relying on it.
using (var client = new HttpClient()) 
{
    // Escape the query so characters such as '(', ')', '&' or spaces cannot corrupt the URL.
    json = await client.GetStringAsync("http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=large&start=0&q=" + System.Uri.EscapeDataString(query));
}

// Parse the JSON string to your object.
// You need Json.NET https://www.nuget.org/packages/Newtonsoft.Json/
GoogleObject googleObject = JsonConvert.DeserializeObject<GoogleObject>(json);

// Any of these can be null when the payload is empty or the request was rejected,
// so check before dereferencing instead of throwing a NullReferenceException.
if (googleObject != null && googleObject.responseData != null && googleObject.responseData.results != null)
{
    foreach (var item in googleObject.responseData.results)
    {
        Console.WriteLine(item.title); // title
        Console.WriteLine(item.content); // description
    }
}

And your GoogleObject:

/// <summary>
/// Root object that the JSON returned by the search request is deserialized into.
/// Property names are presumably chosen to match the JSON keys exactly (verify against
/// an actual response), so they should not be renamed.
/// </summary>
public class GoogleObject
{
    /// <summary>Payload of the response; its results are what the caller iterates over.</summary>
    public Responsedata responseData { get; set; }
    /// <summary>Additional response details; the shape is not known here, hence <c>object</c>.</summary>
    public object responseDetails { get; set; }
    /// <summary>Status value reported in the response (presumably an HTTP-like code; confirm).</summary>
    public int responseStatus { get; set; }
}

/// <summary>
/// Body of the search response: the list of results plus a cursor object.
/// </summary>
public class Responsedata
{
    /// <summary>The individual search results.</summary>
    public Result[] results { get; set; }
    /// <summary>Cursor data accompanying the results (presumably paging information; confirm).</summary>
    public Cursor cursor { get; set; }
}

/// <summary>
/// Cursor data for a result set. The meanings below are inferred from the property
/// names only; verify them against an actual response.
/// </summary>
public class Cursor
{
    /// <summary>Result count, kept as a string.</summary>
    public string resultCount { get; set; }
    /// <summary>Page entries associated with this cursor.</summary>
    public Page[] pages { get; set; }
    /// <summary>Estimated result count, kept as a string.</summary>
    public string estimatedResultCount { get; set; }
    /// <summary>Index of the current page (presumably an index into <see cref="pages"/>; confirm).</summary>
    public int currentPageIndex { get; set; }
    /// <summary>URL for more results.</summary>
    public string moreResultsUrl { get; set; }
    /// <summary>Search time, kept as a string (units not shown here).</summary>
    public string searchResultTime { get; set; }
}

/// <summary>
/// One page entry of a cursor. The meanings are inferred from the property names;
/// verify them against an actual response.
/// </summary>
public class Page
{
    /// <summary>Start value of the page, kept as a string (presumably a result offset; confirm).</summary>
    public string start { get; set; }
    /// <summary>Numeric label of the page.</summary>
    public int label { get; set; }
}

/// <summary>
/// A single search result. The calling code prints <see cref="title"/> as the title and
/// <see cref="content"/> as the description; the other members are documented from their
/// names only and should be verified against an actual response.
/// </summary>
public class Result
{
    /// <summary>Class/type tag of the result.</summary>
    public string GsearchResultClass { get; set; }
    /// <summary>Result URL in unescaped form.</summary>
    public string unescapedUrl { get; set; }
    /// <summary>Result URL.</summary>
    public string url { get; set; }
    /// <summary>URL form intended for display.</summary>
    public string visibleUrl { get; set; }
    /// <summary>URL of a cached copy.</summary>
    public string cacheUrl { get; set; }
    /// <summary>Title of the result (may contain formatting markup; see <see cref="titleNoFormatting"/>).</summary>
    public string title { get; set; }
    /// <summary>Title without formatting.</summary>
    public string titleNoFormatting { get; set; }
    /// <summary>Description text of the result.</summary>
    public string content { get; set; }
}

It will not solve your exact problem, but it may fit your needs.

aloisdg
  • 22,270
  • 6
  • 85
  • 105