As an intern in a data science team, I was given the task to find a way to automatically collect specific data on a real estate ad website, using R.
Thanks to the answer given on this post (Web scraping with R over real estate ads) and some changes to the code, I managed to perform the task I wanted. My problem, however, is that I can't scrape the phone number. I have tried several things, but without any success.
I want to do exactly the same as in the previous post but with the phone number as a new variable.
Here is the detail of an ad: https://www.leboncoin.fr/ventes_immobilieres/1074663461.htm?ca=13_s My variables are: the price ("Prix"), the city ("Ville"), the surface ("surface"), the "GES", the "Classe énergie", the number of rooms ("Pièces") and the phone number, as well as the number of pictures shown in the ad.
I noticed that the code given in the answer no longer works: at the time the question was asked, the website was not secured (http), whereas today its URLs begin with 'https'. That is why I made some changes to the code.
I am a beginner with R, and any help would be really appreciated (sorry for my bad English).
# Return the ad URLs listed on one results page of the leboncoin
# "ventes immobilieres" listing for Pyrenees-Orientales.
#
# page: integer index of the results page to scrape (passed as the "?o="
#       query parameter).
# Returns a character vector of up to 30 ad hrefs; entries are NA when a
# list slot has no <a> node. Hrefs appear to be scheme-relative
# ("//www...") since get_ad_details() prepends "https:".
get_ad_links <- function(page) {
  # Hard dependency: fail loudly if rvest is missing (require() would
  # silently return FALSE and crash later with a confusing error).
  library(rvest)
  # Build the listing URL with paste0() so the base stays on one logical
  # string. The original split the string literal across two source lines,
  # which embedded a newline + indentation inside the URL and broke it
  # (the old comment even warned "put the url in 1 line").
  url_base <- paste0(
    "https://www.leboncoin.fr/ventes_immobilieres/offres/",
    "languedoc_roussillon/pyrenees_orientales"
  )
  url <- paste0(url_base, "?o=", page, "&ret=1&ret=2&f=p")
  listing <- read_html(url)
  # One XPath per list item; vapply() guarantees a character vector
  # (sapply() could silently change its return type on odd input).
  xpaths <- vapply(
    seq_len(30),
    function(i) paste0("//*/section/section/ul/li[", i, "]/a/@href"),
    character(1)
  )
  ad_links <- vapply(
    xpaths,
    function(xp) listing %>% html_node(xpath = xp) %>% html_text(),
    character(1),
    USE.NAMES = FALSE
  )
  ad_links
}
# Function to Get Ad Details by Ad URL
# Scrape the details of a single ad given its scheme-relative URL
# (as returned by get_ad_links()).
#
# ad_url: character scalar, e.g. "//www.leboncoin.fr/ventes_immobilieres/...".
# Returns a one-row data.frame with columns
#   PRIX, VILLE, SURFACE, PIECES, TYPE_BIEN, GES (all character).
get_ad_details <- function(ad_url) {
  library(rvest)
  # Links come back without a scheme; prepend "https:" before parsing.
  doc <- read_html(paste0("https:", ad_url))
  # Standard HTML-tag-stripping regex, kept on ONE line. The original
  # split the string literal across two source lines (embedding a newline
  # mid-pattern) and had two copy errors: "m" where "=" belongs and a
  # missing "|" before [^'\">\s]+ in the attribute-value alternation.
  pattern <- "</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>"
  # Helper: extract the value <span> of the i-th attribute row on the ad
  # page, strip any leftover markup, and normalise whitespace. Replaces
  # six copy-pasted stanzas that differed only in the div index.
  extract_field <- function(div_index) {
    raw <- doc %>%
      html_node(xpath = paste0(
        "//section/section/section[2]/div[", div_index, "]/h2/span[2]"
      )) %>%
      html_text()
    stringr::str_wrap(stringr::str_replace_all(raw, pattern, ""))
  }
  # div indices observed on the ad page for each attribute; output column
  # order matches the original function exactly.
  values <- c(
    extract_field(4),  # PRIX
    extract_field(5),  # VILLE
    extract_field(8),  # SURFACE
    extract_field(7),  # PIECES
    extract_field(6),  # TYPE_BIEN
    extract_field(9)   # GES
  )
  # One-row data frame with labelled columns.
  mydf <- as.data.frame(t(values))
  names(mydf) <- c("PRIX", "VILLE", "SURFACE", "PIECES", "TYPE_BIEN", "GES")
  mydf
}
# Driver script: collect the ad links from results page 1, then scrape the
# details of the first 30 ads into one data frame, with a text progress bar.
# library() (not require()) so a missing plyr fails immediately and loudly.
library(plyr)
ad_links <- get_ad_links(page = 1)
# ldply() applies get_ad_details() to each link and row-binds the
# one-row data frames it returns.
ad_details <- ldply(ad_links[1:30], get_ad_details, .progress = "text")