I wrote the following code to scrape tendering information from a portal on a daily basis.

packages <- c('rvest', 'stringi', 'tidyverse','lubridate','dplyr')
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE)
start_time <- proc.time()

Main page to scrape, and to get the total number of records.

data <- read_html('https://eprocure.gov.in/mmp/latestactivetenders')
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
All_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
All_tenders <- cbind(All_tenders,links_fair)

Reading the total number of records to fetch

Count_of_Recs_raw <- html_nodes(data, xpath = '//*[(@id = "edit-l-active-teners")]//div')
Count_of_Recs <- as.numeric(gsub("Total Tenders : ","",html_text(Count_of_Recs_raw[1])))

Functions for cleaning and processing data fields such as dates and factors.

process_dates <- function(data){
    cols2date <- c('Bid.Submission.Closing.Date','epublished_date','document_download_start_date','bid_submission_start_date','bid_opening_date','document_download_end_date','bid_submission_end_date')
    date_processed_data <- data
    date_processed_data[cols2date] <- lapply(data[cols2date] , dmy_hm)
    return(date_processed_data)
}

clean_process_data <- function(data){
    cols2factor <- c('State.Name','product_category','pre_qualification','organisation_name','organisation_type','tender_type')
    clean_processed_data <- data
    clean_processed_data[cols2factor] <- lapply(data[cols2factor] , factor)
   #clean_processed_data <- process_dates(clean_processed_data)
    return(clean_processed_data)

}
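These helpers are meant to run on the assembled data frame after scraping; a minimal usage sketch (assuming All_tenders ends up containing the columns named above) is:

# hypothetical usage once all pages have been scraped
All_tenders <- clean_process_data(All_tenders)
# All_tenders <- process_dates(All_tenders)   # once the date columns are present
str(All_tenders)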

The code below is precisely where my question lies...

Table scraping starts here. Page one has already been scraped to get the structure of the data frame.

for (page_no in 2:round(Count_of_Recs/10)){
    closeAllConnections()
    on.exit(closeAllConnections())
    url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
    url <- paste(url_bit1, page_no, sep="")
    cat(page_no,"\t",proc.time() - start_time,"\n")
    data <- read_html(url)
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
    links_fair <- html_attr(links,'href')
    links_fair <- links_fair[grep("tendersfullview",links_fair)]
    Page_tenders <- cbind(Page_tenders,links_fair)
    All_tenders <- rbind(All_tenders,Page_tenders)
 }

This for loop usually ends up taking hours to complete. I am looking to use the apply family to good effect so as to save time. The program is further responsible for fetching and processing all records, and then, for each individual record, scraping an entirely new page every time (code not listed here)...

I have tried the following code, but it doesn't give me what I want:

url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
   closeAllConnections()
   on.exit(closeAllConnections())
   url <- paste(url_bit1, datain$S.No., sep="")
   cat(S.No.,"\t",proc.time() - start_time,"\n")
   data <- read_html(url)
   total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
   Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
   links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
   links_fair <- html_attr(links,'href')
   links_fair <- links_fair[grep("tendersfullview",links_fair)]
   Page_tenders <- cbind(Page_tenders,links_fair)
   All_tenders <- rbind(All_tenders,Page_tenders)
}

All_tenders <- sapply(All_tenders, FUN=read_page(All_tenders$S.No.))

Any advice, guidance, suggestions, inputs or help is welcome. I have been using R for only 3-4 months. I am also aware of Python's strengths in this area over R, but I am inclined towards R for the solution to this problem.

Shikhar Parashar

2 Answers


Your sapply call is incorrect. I made some edits to your code and tested it on a sample size of N = 50. We can use system.time() to find out how much time it takes to finish the task.

The "for" approach:

system.time(
  for (page_no in 1:50){
    closeAllConnections()
    on.exit(closeAllConnections())
    url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
    url <- paste(url_bit1, page_no, sep="")
    cat(page_no,"\t",proc.time() - start_time,"\n")
    data <- read_html(url)
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
    links_fair <- html_attr(links,'href')
    links_fair <- links_fair[grep("tendersfullview",links_fair)]
    Page_tenders <- cbind(Page_tenders,links_fair)
    All_tenders <- rbind(All_tenders,Page_tenders)
  }
)

#user  system elapsed 
# 50.15   81.26  132.73

The "lapply" approach:

All_tenders = NULL
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
  closeAllConnections()
  on.exit(closeAllConnections())
  url <- paste(url_bit1, datain, sep="")
  cat(datain,"\t",proc.time() - start_time,"\n")
  data <- read_html(url)
  total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
  Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
  links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
  links_fair <- html_attr(links,'href')
  links_fair <- links_fair[grep("tendersfullview",links_fair)]
  Page_tenders <- cbind(Page_tenders,links_fair)
  All_tenders <- rbind(All_tenders,Page_tenders)
}

system.time(
  All_tenders <- lapply(1:50, function(x) read_page(x))
)
# user  system elapsed 
# 49.84   78.97  131.16

If we want to put our results in a data frame, we can transform the All_tenders list into a data frame as follows:

All_tenders = do.call(rbind, lapply(All_tenders, data.frame, stringsAsFactors=FALSE))
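Since each element of All_tenders is already a data frame, an equivalent one-liner (assuming dplyr, which the question already loads, or data.table is available) is:

All_tenders <- dplyr::bind_rows(All_tenders)
# or: All_tenders <- data.table::rbindlist(All_tenders)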

It turns out lapply is slightly faster.

Aleksandr
  • I think that changing `All_tenders` in your function makes it really slow... – Emmanuel-Lin Aug 29 '17 at 08:01
  • Well, you have to deal with it. Web scraping is not fast; otherwise you might get banned by the server admins. The alternative (significantly faster) approach would be to use TOR and multiple requests from different IPs via Python, but that is another story. – Aleksandr Aug 29 '17 at 08:04
  • That was silly on my part... as I mentioned, I am only a novice in R. Can you please elaborate on the following part: system.time( All_tenders <- lapply(1:50, function(x) read_page(x)) ) and All_tenders = do.call(rbind, lapply(All_tenders, data.frame, stringsAsFactors=FALSE)) – Shikhar Parashar Aug 29 '17 at 08:06
  • @Emmanuel-Lin: All_tenders is the data frame for which I am looking to process each row... Any suggestions on improving the above performance? – Shikhar Parashar Aug 29 '17 at 08:08
  • @ShikharParashar You can check my answer below; it should work ;) – Emmanuel-Lin Aug 29 '17 at 08:10
  • @AleksandrVoitov: Any place other than CRAN where I can read more about the apply family to get a better understanding of it? – Shikhar Parashar Aug 29 '17 at 08:54
  • You may look here: https://stackoverflow.com/questions/3505701/r-grouping-functions-sapply-vs-lapply-vs-apply-vs-tapply-vs-by-vs-aggrega – Aleksandr Aug 29 '17 at 09:02

for loops and sapply work differently:

- for loops do things iteratively: they do the computation on the first element, then on the second...
- sapply does things on the list of elements independently (and in any order), so the results are constructed independently.

So at the end of your for loop, when you do:

All_tenders <- rbind(All_tenders,Page_tenders)

the All_tenders variable grows iteratively.

In your sapply function, however, this won't work (since it doesn't know the results for the other elements).
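A toy example (with made-up data, unrelated to the scraping itself) shows the difference:

# for loop: the result grows by repeatedly overwriting a shared variable
out_for <- NULL
for (i in 1:3) {
  out_for <- rbind(out_for, data.frame(page = i))
}

# lapply: each call returns its own piece; the pieces are combined afterwards
pieces  <- lapply(1:3, function(i) data.frame(page = i))
out_app <- do.call(rbind, pieces)
# out_for and out_app contain the same three rows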

So you should do something like this:

url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
   closeAllConnections()
   on.exit(closeAllConnections())
   url <- paste(url_bit1,  datain, sep="")
   cat(datain,"\t",proc.time() - start_time,"\n")
   data <- read_html(url)
   total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
   Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
   links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
   links_fair <- html_attr(links,'href')
   links_fair <- links_fair[grep("tendersfullview",links_fair)]
   Page_tenders <- cbind(Page_tenders,links_fair)
   return(Page_tenders)
}

This returns a result for each page; apply it in the following way:

All_tenders_tmp <- sapply(2:round(Count_of_Recs/10), FUN = read_page, simplify = FALSE)

Then your result will be a list of all the per-page results, and you can merge it with data.table::rbindlist, for example.
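A minimal sketch of that last merging step (assuming the data.table package is installed and All_tenders_tmp is the list produced above):

library(data.table)                         # assumes data.table is installed

All_tenders <- rbindlist(All_tenders_tmp)   # bind the per-page data frames into one table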

I hope I was clear.

Emmanuel-Lin