0

I am having a tough time extracting the values because some pages are missing the tag `result-cats`.

I have already visited this question here, however I am still not able to scrape the data.

HTML:

<div class="result ">
    <span class="result-txt">

        <span class="result-name">
            <a href="/some/value/">COMPANY_NAME</a>
            <a class="result-icons" href="/some/value/COMPANY_NAME_/">
                <span title="Info" class="sprite sprite-info">Info</span>
                <span title="Phone" class="sprite sprite-phone">Phone</span>
            </a>
        </span>

        <em>
            <a href="/some/value/">LOCATION</a>
            <span> ADDRESS </span>
        </em>

        <!-- Optional: according to the question, this span is missing on some pages. -->
        <span class="result-cats">
            <a href="/some/value/" title="CAT1">CAT1</a>
            <a href="/some/value/" title="CAT2">CAT2</a>
        </span>

    </span>
</div>

I am trying the following code; however, it gives me an error because some pages do not have the `result-cats` tag. Hence the vectors passed to the data frame have mismatched lengths.

code

library(rvest)
library(XML)
library(stringi)

# Empty accumulator; the rows scraped on each iteration are appended to it.
df <- data.frame(CompanyName = NULL, CompanyLink = NULL, Address = NULL, cats = NULL)

# Scrape 100 pages, one iteration per page.
for(i in 1:100 ){

  # Progress message for the current iteration.
  print(paste("Page: ", i, sep = ""))

  # NOTE(review): `url` is the same literal on every iteration; `i` is only
  # used in the progress message above -- presumably a placeholder for a
  # page-specific URL.
  url <- "url.com"
  page <- read_html(url)

  # First-child <a> inside each .result-name: its text is the company name
  # and its href is the company link.
  CompanyNameNode <- html_nodes(page,'.result-name a:nth-child(1)')
  CompanyName <- html_text(CompanyNameNode)
  CompanyLink <- html_attr(CompanyNameNode, 'href')

  # Text of every <em> under .result-txt, with CR/LF characters removed.
  Address <- html_text(html_nodes(page,'.result-txt em'))
  Address <- gsub("[\r\n]", "", Address)

  # Text of every .result-cats element found anywhere on the page. This is a
  # page-wide query, so only the elements that exist are returned; per the
  # question, a missing .result-cats leaves `cats` with a different length
  # than the other vectors.
  cats <- html_text(html_nodes(page,'.result-cats'))
  cats <- stri_trim(cats)
  # Each CR or LF character is replaced by ", ".
  cats <- gsub("[\r\n]", ", ", cats)

  # Append this page's rows. This is where the reported error surfaces:
  # the column vectors do not all have the same length when `cats` is short.
  df <- rbind(df, data.frame(CompanyName = CompanyName, 
                             CompanyLink = CompanyLink, 
                             Address = Address, 
                             cats = cats))

}

UPDATE: Issue resolved using following code

# One container node per search result; extracting relative to these keeps
# every column aligned at exactly one entry per result.
pg <- html_nodes(page, '.result-txt')

# html_node() (singular) is vectorised over `pg`: it returns one entry per
# result, with a missing node where '.result-cats' is absent, and html_text()
# turns a missing node into NA. This replaces the explicit loop, which
# iterated over c(1, 0) when `pg` was empty, ran the selector twice per
# result, and grew `cats` one element at a time.
cats <- html_text(html_node(pg, '.result-cats'))

# Strip leading/trailing whitespace; NA entries stay NA.
cats <- stri_trim(cats)

# Collapse each line break, together with the indentation around it, into a
# single ", " separator so that categories read "CAT1, CAT2" instead of
# keeping the markup's indentation between them.
cats <- gsub("\\s*[\r\n]+\\s*", ", ", cats)
Hardik Gupta
  • 4,700
  • 9
  • 41
  • 83

1 Answer

1

Solved the issue using the following code:

# One container node per search result; extracting relative to these keeps
# every column aligned at exactly one entry per result.
pg <- html_nodes(page, '.result-txt')

# html_node() (singular) is vectorised over `pg`: it returns one entry per
# result, with a missing node where '.result-cats' is absent, and html_text()
# turns a missing node into NA. This replaces the explicit loop, which
# iterated over c(1, 0) when `pg` was empty, ran the selector twice per
# result, and grew `cats` one element at a time.
cats <- html_text(html_node(pg, '.result-cats'))

# Strip leading/trailing whitespace; NA entries stay NA.
cats <- stri_trim(cats)

# Collapse each line break, together with the indentation around it, into a
# single ", " separator so that categories read "CAT1, CAT2" instead of
# keeping the markup's indentation between them.
cats <- gsub("\\s*[\r\n]+\\s*", ", ", cats)
Hardik Gupta
  • 4,700
  • 9
  • 41
  • 83