
As an intern on a data science team, I was given the task of finding a way to automatically collect specific data from a real estate ad website, using R.

Thanks to the answer given in this post (Web scraping with R over real estate ads) and some changes to the code, I managed to perform the task I wanted. My problem is that I can't scrape the phone number; I have tried several things, without any success.

I want to do exactly the same as in the previous post but with the phone number as a new variable.

Here is the detail page of an ad: https://www.leboncoin.fr/ventes_immobilieres/1074663461.htm?ca=13_s. My variables are: the price ("Prix"), the city ("Ville"), the surface ("surface"), the "GES", the "Classe énergie", the number of rooms ("Pièces") and the phone number, as well as the number of pictures shown in the ad.

I noticed that the code given in that answer no longer works: at the time the question was asked the website was not secured (http), whereas today its URLs start with 'https'. That is why I made some changes to the code.

I am a beginner with R and any help would be really appreciated (sorry for my bad English).

get_ad_links = function(page){
  require(rvest)
  # construct the URL of the listing page
  url_base = "https://www.leboncoin.fr/ventes_immobilieres/offres/languedoc_roussillon/pyrenees_orientales"
  url      = paste(url_base, "?o=", page, "&ret=1&ret=2&f=p", sep = "")
  page     = read_html(url)

  # extract the links of the 30 ads shown on the page
  a = "//*/section/section/ul/li["
  b = "]/a/@href"
  t = lapply(1:30, function(i) paste(a, i, b, sep = ""))
  ad_links = sapply(1:30, function(i) {
    page %>% html_node(xpath = t[[i]]) %>% html_text()
  })
  return(ad_links)
}

# Function to Get Ad Details by Ad URL
get_ad_details = function(ad_url){
  require(rvest)
  # parse ad url to html tree
  doc = read_html(paste("https:", ad_url, sep = ""))

  # regex pattern used to strip any HTML tags left in the extracted text
  pattern <- "</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>"

  # extract each field with its xpath expression
  prix = doc %>%
    html_node(xpath = "//section/section/section[2]/div[4]/h2/span[2]") %>%
    html_text()
  PRIX = stringr::str_replace_all(prix, pattern, "")
  PRIX = stringr::str_wrap(PRIX)

  ville = doc %>%
    html_node(xpath = "//section/section/section[2]/div[5]/h2/span[2]") %>%
    html_text()
  VILLE = stringr::str_replace_all(ville, pattern, "")
  VILLE = stringr::str_wrap(VILLE)

  surface = doc %>%
    html_node(xpath = "//section/section/section[2]/div[8]/h2/span[2]") %>%
    html_text()
  SURFACE = stringr::str_replace_all(surface, pattern, "")
  SURFACE = stringr::str_wrap(SURFACE)

  pieces = doc %>%
    html_node(xpath = "//section/section/section[2]/div[7]/h2/span[2]") %>%
    html_text()
  PIECES = stringr::str_replace_all(pieces, pattern, "")
  PIECES = stringr::str_wrap(PIECES)

  type = doc %>%
    html_node(xpath = "//section/section/section[2]/div[6]/h2/span[2]") %>%
    html_text()
  TYPE_BIEN = stringr::str_replace_all(type, pattern, "")
  TYPE_BIEN = stringr::str_wrap(TYPE_BIEN)

  ges = doc %>%
    html_node(xpath = "//section/section/section[2]/div[9]/h2/span[2]") %>%
    html_text()
  GES = stringr::str_replace_all(ges, pattern, "")
  GES = stringr::str_wrap(GES)

  values = c(PRIX, VILLE, SURFACE, PIECES, TYPE_BIEN, GES)

  # convert to data frame and add labels
  mydf = as.data.frame(t(values))
  names(mydf) = c("PRIX", "VILLE", "SURFACE", "PIECES", "TYPE_BIEN", "GES")
  return(mydf)
}


ad_links = get_ad_links(page = 1)

# grab ad details for first 30 links from page 1
require(plyr)
ad_details = ldply(ad_links[1:30], get_ad_details, .progress = 'text')
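For context, here is a sketch of how the same two functions could be run over several listing pages at once; the page range 1:3 is arbitrary and only meant as an illustration:

# Sketch: scrape the first few result pages in one go,
# reusing get_ad_links() and get_ad_details() from above
require(plyr)
all_ads = ldply(1:3, function(p) {
  links = get_ad_links(page = p)
  ldply(links[1:30], get_ad_details, .progress = 'text')
})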

3 Answers


The issue here is that the phone number is behind a button that has to be clicked before the number is shown. This is done on purpose, to prevent web-scraping tools from acquiring these phone numbers.

There is no way to click on the website using rvest. You can, however, look into another approach using RSelenium. This method drives a web browser (for example one running in a Docker container) that behaves just like a normal browser but can be controlled by R commands.
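To illustrate the idea, here is a minimal sketch. It assumes a Selenium server is already running in a Docker container on port 4445 (for example started with docker run -d -p 4445:4444 selenium/standalone-firefox), and the two selectors are placeholders only; a complete version for leboncoin is shown in the next answer.

library(RSelenium)

# connect to a Selenium server assumed to be running in Docker on port 4445
remDr <- remoteDriver(remoteServerAddr = "localhost",
                      port = 4445L,
                      browserName = "firefox")
remDr$open()

remDr$navigate("https://www.leboncoin.fr/ventes_immobilieres/1074663461.htm?ca=13_s")

# click the "show phone number" button, then read the revealed text
# (both selectors below are placeholders -- inspect the page for the real ones)
btn <- remDr$findElement(using = "css selector", value = "button.phone-button")
btn$clickElement()
Sys.sleep(2)  # give the page time to reveal the number

phone <- remDr$findElement(using = "css selector", value = "a.phone-link")
phone$getElementText()

remDr$close()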

Wietze314

Finally I managed to find a solution using RSelenium. Here is the function get_ad_phoneNumber, which returns a data frame with the ad link as an ID and the phone number; I then match this with the data frame created before.

But I am facing a new issue: after retrieving 4 or 5 phone numbers, my IP address gets blocked. And when I use a VPN located outside France, the phone number does not appear at all after I click on the button.

So, how can I change my IP address (staying within France) dynamically after each click, or is there any other way around this?

x<- c("RSelenium","rvest","plyr")

lapply(x, require, character.only = TRUE)

wdman::selenium(verbose = FALSE)

remDr <- remoteDriver(port = 4567L, browserName = "phantomjs")
remDr$open() 

# Function to Get the phone number by Ad URL
get_ad_phoneNumber = function(ad_url){

# put the url as ID to match later with the data frame created previously
Id = ad_url

 # go to the url
remDr$navigate(ad_url)
Sys.sleep(5) # wait until the page stop  loading

# find the phone number's button 
webElem <- remDr$findElement(using = 'css selector', value = 'aside > div > 
div.box-grey-light.mbs.align-center > div > button')

Sys.sleep(5)  # wait until the page stop  loading

webElem$clickElement() # click on the the button

Sys.sleep(5)  # wait until the page stop  loading

#find the phone number after the click
webElem <- remDr$findElement(using = 'xpath', value = 
 '//aside/div/div[1]/div/span/a')

 # extract the phone as a string character 
phoneNumber=webElem$getElementText()

values  = c(Id,phoneNumber)

# convert to data frame and add labels
mydf  = as.data.frame(t(values))
names(mydf)= c("ID","PhoneNumber")
return(mydf)
}
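For completeness, here is a sketch of how this function might be applied to the ad_links collected in the question and merged back into ad_details. The "https:" prefix and the random pause between requests are my own assumptions, intended only to reduce (not eliminate) the risk of being blocked:

# Sketch: collect the phone numbers for the scraped ads and join them
# to ad_details by the ad URL
require(plyr)

phone_numbers = ldply(ad_links[1:30], function(u) {
  Sys.sleep(runif(1, 10, 20))   # random pause between ads (assumption)
  get_ad_phoneNumber(paste("https:", u, sep = ""))
}, .progress = 'text')

# add the same ID column to ad_details so the two data frames can be merged
ad_details$ID = paste("https:", ad_links[1:30], sep = "")
ads_with_phone = merge(ad_details, phone_numbers, by = "ID")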

So this is what I came up with in my earlier days of programming.

Looking at it now, I cringe at the flow and the lack of comments, but it worked. Please let me know if it works for you; I'm new to Stack Overflow.

library(rvest)

# scrape the "show number" labels of one result page
Shownumbers <- function(k){
  url <- paste0("https://www.yellowpages.co.za/search?what=travel&pg=", k)
  webpage_new <- read_html(url)
  show_html <- html_nodes(webpage_new, ".idShowNumber")
  show <- html_text(show_html)

  # strip the leading newlines/tabs and any remaining backslashes
  show <- sub("\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t", "", show)
  show <- gsub("\\\\", "", show)

  show_base <- as.data.frame(show)
  return(show_base)
}

Shownumbers(1)

# loop over the 135 result pages and collect the phone numbers
for (d in 1:135) {
  url <- paste0("https://www.yellowpages.co.za/search?what=travel&pg=")
  if (d == 1) {
    Shownumbers(d)
    webpage_new <- read_html(url)
    no_html <- html_nodes(webpage_new, ".yext-phone")
    no <- html_text(no_html)

    no <- sub("\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t", "", no)
    no <- gsub("\\\\", "", no)

    no_base <- as.data.frame(no)
    no_final <- no_base
  } else {
    Shownumbers(d)
    webpage_new <- paste0(url, d)
    no_read <- read_html(webpage_new)
    no_html <- html_nodes(no_read, ".yext-phone")
    no <- html_text(no_html)

    no <- sub("\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t", "", no)
    no <- gsub("\\\\", "", no)

    no_base <- as.data.frame(no)
    no_final <- rbind(no_final, no_base)
  }
}

# drop duplicated numbers
no_final <- unique(no_final)