
I used the following code:

library(XML)
library(RCurl)
getGoogleURL <- function(search.term, domain = '.co.uk', quotes = TRUE)
{
    search.term <- gsub(' ', '%20', search.term)
    if(quotes) search.term <- paste('%22', search.term, '%22', sep = '')
    getGoogleURL <- paste('http://www.google', domain, '/search?q=',
                          search.term, sep = '')
}

getGoogleLinks <- function(google.url)
{
    doc <- getURL(google.url, httpheader = c("User-Agent" = "R(2.10.0)"))
    html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
    nodes <- getNodeSet(html, "//a[@href][@class='l']")
    return(sapply(nodes, function(x) x <- xmlAttrs(x)[[1]]))
}

search.term <- "cran"
quotes <- "FALSE"
search.url <- getGoogleURL(search.term=search.term, quotes=quotes)

links <- getGoogleLinks(search.url)

I would like to find all the links that resulted from my search, but I get the following result:

> links
list()

How can I get the links? In addition, I would like to get the headlines and summaries of the Google results; how can I get those? And finally, is there a way to get the links that reside in the ChillingEffects.org results?

Bas

4 Answers


If you look at the html variable, you can see that the search result links are all nested in <h3 class="r"> tags.

Try to change your getGoogleLinks function to:

getGoogleLinks <- function(google.url) {
   doc <- getURL(google.url, httpheader = c("User-Agent" = "R(2.10.0)"))
   html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
   nodes <- getNodeSet(html, "//h3[@class='r']//a")
   return(sapply(nodes, function(x) x <- xmlAttrs(x)[["href"]]))
}
user3794498
  • Hi, I do exactly the same thing but my nodes object is NULL. What could be wrong? Thank you! – hoang tran Jun 18 '19 at 16:58
  • Can you please also explain how to choose "//h3[@class='r']//a"? What is it based on? – hoang tran Jun 18 '19 at 18:10
  • Google has changed their website, so the results are no longer nested in h3 tags. When looking for nodes, "//h3[@class='r']//a" means to look for 'a' nodes (i.e. links) nested anywhere inside 'h3' nodes (i.e. level-3 headers) with class 'r', anywhere in the document. – user3794498 Jun 20 '19 at 12:12
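
Since Google's result markup changes over time, one way to settle the "based on what?" question above is to dump the class attributes actually present on the page and pick a selector from that snapshot. Below is a minimal sketch reusing the question's XML/RCurl calls; inspect_google_nodes is a hypothetical helper name, and it assumes Google still returns plain, parseable HTML for this user agent.

library(XML)
library(RCurl)

inspect_google_nodes <- function(google.url) {
  doc  <- getURL(google.url, httpheader = c("User-Agent" = "R(2.10.0)"))
  html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
  # Class attributes currently used on heading and anchor nodes; these values
  # change whenever Google updates its markup, so treat them as a snapshot.
  list(h3_classes = unique(unlist(xpathSApply(html, "//h3", xmlGetAttr, "class"))),
       a_classes  = unique(unlist(xpathSApply(html, "//a",  xmlGetAttr, "class"))))
}

Whichever class wraps the result links in that snapshot is what goes into the XPath passed to getNodeSet().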

I created this function to read in a list of company names and then get the top website result for each. It will get you started; you can then adjust it as needed.

# Libraries. URLencode() comes from the built-in utils package, so only rvest needs loading.
library(rvest)

# Load data.
d <- read.csv("P:\\needWebsites.csv")
c <- as.character(d$Company.Name)

# Function for getting the top website result for a name.
getWebsite <- function(name)
{
    url <- URLencode(paste0("https://www.google.com/search?q=", name))

    page <- read_html(url)

    results <- page %>% 
      html_nodes("cite") %>% # Get all nodes of type cite. You can change this to grab other node types.
      html_text()

    result <- results[1]

    return(as.character(result)) # Return only the first result; drop the [1] above to see them all.
}

# Apply the function to the list of company names.
websites <- data.frame(Website = sapply(c, getWebsite))
  • Hello Bryce. I have written a program taking inspiration from yours, but I'm getting character(0). Please help. – Therii May 31 '19 at 14:31
  • r_h = read_html("https://www.google.com/search?q=google&oq=google&aqs=chrome.0.69i59j0l2j69i60l2j69i65.1101j0j7&sourceid=chrome&ie=UTF-8") ; d = r_h %>% html_nodes(".iUh30") %>% html_text() %>% as.character() – Therii May 31 '19 at 14:31
  • @Therii maybe my answer will help – moodymudskipper Aug 10 '19 at 11:16

The other solutions here don't work for me. Here's my take on @Bryce-Chamberlain's issue, which works for me in August 2019. It also answers another closed question: company name to URL in R.


# install.packages("rvest")

get_first_google_link <- function(name, root = TRUE) {
  url = URLencode(paste0("https://www.google.com/search?q=",name))
  page <- xml2::read_html(url)
  # extract all links
  nodes <- rvest::html_nodes(page, "a")
  links <- rvest::html_attr(nodes,"href")
  # extract first link of the search results
  link <- links[startsWith(links, "/url?q=")][1]
  # clean it
  link <- sub("^/url\\?q\\=(.*?)\\&sa.*$","\\1", link)
  # get root if relevant
  if(root) link <- sub("^(https?://.*?/).*$", "\\1", link)
  link
}

companies <- data.frame(company = c("apple acres llc","abbvie inc","apple inc"))
companies <- transform(companies, url = sapply(company,get_first_google_link))
companies
#>           company                            url
#> 1 apple acres llc https://www.appleacresllc.com/
#> 2      abbvie inc        https://www.abbvie.com/
#> 3       apple inc         https://www.apple.com/

Created on 2019-08-10 by the reprex package (v0.2.1)

moodymudskipper

The free solutions don't work anymore, and they don't let you search for regions outside your own location. Here's a solution using the Google Custom Search API. The API allows 100 free calls per day, and a single call returns at most 10 results, so the function below returns only page 1.

# Note: mutate/rename/select/row_number and the %>% pipe below come from dplyr.
library(dplyr)

Google.Search.API <- function(keyword, google.key, google.cx, country = "us")
{
  # keyword = keywords[10]; country = "us"
  url <- paste0("https://www.googleapis.com/customsearch/v1?"
                , "key=", google.key
                , "&q=", gsub(" ", "+", keyword)
                , "&gl=", country         # Country
                , "&hl=en"                # Language from Browser, english
                , "&cx=", google.cx
                , "&fields=items(link)"
                )

  d2 <- url %>%
        httr::GET(ssl.verifypeer=TRUE) %>%
        httr::content(.) %>% .[["items"]] %>%
        data.table::rbindlist(.) %>%
        mutate(keyword, SERP = row_number(), search.engine = "Google API") %>%
        rename(source = link) %>%
        select(search.engine, keyword, SERP, source)

  pause <- round(runif(1, min = 1.1, max = 5), 1)
  if(nrow(d2) == 0)
  {cat("\nPausing", pause, "seconds. Failed for:", keyword)} else
  {cat("\nPausing", pause, "seconds. Successful for:", keyword)}

  Sys.sleep(pause)
  rm(keyword, country, pause, url, google.key, google.cx)
  return(d2)
}
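
For completeness, a minimal usage sketch: the two credential strings are placeholders for an API key and a Programmable Search Engine (cx) ID you would create yourself, and paging past the first 10 results would need the API's start parameter, which the function above does not expose.

google.key <- "YOUR_API_KEY"   # placeholder, not a real key
google.cx  <- "YOUR_CSE_ID"    # placeholder, not a real engine ID

serp <- Google.Search.API("cran r project", google.key, google.cx, country = "us")
serp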
ishonest
  • Hi - currently looking at this very task for retrieving company URLs using company names; your function works unless there is no link returned by the search, in which case it crashes with the following error output: Error: Can't rename columns that don't exist. x Column `link` doesn't exist. – user1849286 Jul 17 '23 at 07:12
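
The crash described in the comment above happens because httr::content(.)[["items"]] is NULL when the API returns nothing, so rename(source = link) has no link column to work on. Below is a minimal defensive sketch of just the parsing step, assuming the same httr/data.table/dplyr calls as the answer; parse_search_items is a hypothetical helper, not part of the original function.

library(dplyr)

parse_search_items <- function(response, keyword) {
  items <- httr::content(response)[["items"]]
  if (is.null(items) || length(items) == 0) {
    # No results: return an empty frame with the expected columns instead of failing.
    return(data.frame(search.engine = character(0), keyword = character(0),
                      SERP = integer(0), source = character(0)))
  }
  data.table::rbindlist(items, fill = TRUE) %>%
    mutate(keyword = keyword, SERP = row_number(), search.engine = "Google API") %>%
    rename(source = link) %>%
    select(search.engine, keyword, SERP, source)
}

The surrounding pause and logging logic can then branch on nrow() of this return value exactly as the original function does.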