0

I have dataframe with 7 rows and 1 column,which contains links of a website, I'm trying to extract data from those various link and store them in a data frame but not able to append that.Also I'm checking that if for a link if there is no records(this I'm checking through html attribute of that link) skip that link and proceed to next link.I'm also trying to fetch data for multiple pages of a link.

This is reproducible data

text1="http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom="
text3="&proptype="
text4="Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment"
text5="&cityName=Thane&BudgetMin="
text6="&BudgetMax="

bhk=c("1","2","3","4","5",">5")
budg_min=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
budg_max=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
eg <- expand.grid(bhk = bhk, budg_min = budg_min, budg_max = budg_max)
eg <- eg[as.integer(eg$budg_min) <= as.integer(eg$budg_max),]
uuu <- sprintf("%s%s%s%s%s%s%s%s", text1,eg[,1],text3,text4,text5,eg[,2],text6,eg[,3])
uuu_df1=data.frame(x=uuu[1:7,])
dput(uuu_df1)

I have 3 solution for this but none seems to be working fine.

SOlution#1

urlList <- llply(uuu_df1[,1], function(url){     

  this_pg <- read_html(url)

  results_count <- this_pg %>% 
    xml_find_first(".//span[@id='resultCount']") %>% 
    xml_text() %>%
    as.integer()

  if(results_count > 0){

    cards <- this_pg %>% 
      xml_find_all('//div[@class="SRCard"]')

    df <- ldply(cards, .fun=function(x){
      y <- data.frame(wine = x %>% xml_find_first('.//span[@class="agentNameh"]') %>% xml_text(),
                      excerpt = x %>% xml_find_first('.//div[@class="postedOn"]') %>% xml_text(),
                      locality = x %>% xml_find_first('.//span[@class="localityFirst"]') %>% xml_text(),
                      society = x %>% xml_find_first('.//div[@class="labValu"]') %>% xml_text() %>% gsub('\\n', '', .))
      return(y)
    })

  } else {
    df <- NULL
  }

  return(df)   
}, .progress = 'text')
names(urlList) <- uuu_df1[,1]

a=bind_rows(urlList)

Above code gives me error Error in if (results_count > 0) { : missing value where TRUE/FALSE needed

Solution#2

urlList <- lapply(uuu_df1[,1], function(url){     

  UrlPage <- html(as.character(url))
  ImgNode <- UrlPage %>% html_node("div.noResultHead")
  u <- paste("No", word(string = as(ImgNode, "character"), start=4, end=5), sep=" ")

  cat(".")        
  pg <- read_html(url)

  if(u!="No Results Found!") {
    df <- data.frame(wine=html_text(html_nodes(pg, ".agentNameh")),
                     excerpt=html_text(html_nodes(pg, ".postedOn")),
                     locality=html_text(html_nodes(pg,".localityFirst")),
                     society=html_text(html_nodes(pg,'.labValu .stop-propagation:nth-child(1)')),
                     stringsAsFactors=FALSE)
  } else {
    # ASSIGN EMPTY DATAFRAME (FOR CONSISTENT STRUCTURE)
    df <- data.frame(wine=character(), excerpt=character(), locality=character(), society=character())
  }
  # RETURN NAMED LIST
  return(list(UrlPage=UrlPage, ImgNode=ImgNode, u=u, df=df))    
})

# ROW BIND ONLY DATAFRAME ELEMENT FROM LIST
wines <- map_df(urlList, function(u) u$df)

Above code gives empty dataframe

Solution#3

uuu_df1=data.frame(x=uuu_df[1:7,])
wines=data.frame()
url_test=c()
UrlPage_test=c()
u=c()
ImgNode=c()
pg=c()

for(i in 1:dim(uuu_df1)[1]) {

  url_test[i]=as.character(uuu_df1[i,])
  UrlPage_test[i] <- html(url_test[i])
  ImgNode[i] <- UrlPage_test[i] %>% html_node("div.noResultHead")
  u[i]=ImgNode[i]
  u[i]=as(u[i],"character")
  u[i]=paste("No",word(string = u, start = 4, end = 5),sep = " ")

  if(u[i]=="No Results Found!") next
  {
    map_df(1:5, function(i) # here 1:5 is number of webpages of a website 
    {

      # simple but effective progress indicator
      cat(".")

      pg[i] <- read_html(sprintf(url_test[i], i))

      data.frame(wine=html_text(html_nodes(pg[i], ".agentNameh")),
                 excerpt=html_text(html_nodes(pg[i], ".postedOn")),
                 locality=html_text(html_nodes(pg[i],".localityFirst")),
                 society=html_text(html_nodes(pg[i],'.labValu .stop-propagation:nth-child(1)')),
                 stringsAsFactors=FALSE)

    }) -> wines

  }}

Above code also gives an error

Error in UseMethod("xml_find_first") : 
  no applicable method for 'xml_find_first' applied to an object of class "list"
In addition: Warning messages:
1: 'html' is deprecated.
Use 'read_html' instead.
See help("Deprecated") 
2: In UrlPage_test[i] <- html(url_test[i]) :
  number of items to replace is not a multiple of replacement length

Any suggestions on which code can be corrected so that my requirement is met. Thanks in advance

Andre_k
  • 1,680
  • 3
  • 18
  • 41

1 Answers1

1

Solution #1

That missing value where TRUE/FALSE needed is printed when you do something like this:

if (NA > 0) {
    do something
}

So replace your if condition

if(results_count > 0)

with

(!is.na(results_count) & (results_count > 0))
psychOle
  • 1,054
  • 9
  • 19
  • Excellent @herbaman that one line saved my day. it worked well..Thanks a ton ..Thanks for the effort!!! – Andre_k Jun 16 '17 at 10:49
  • if you check "only for the 7th record" that link shows that it has 94 records but if you run that code for just 7th record the dataframe created contains only 30 records and not 94 ..why is it so?? – Andre_k Jun 16 '17 at 11:22