I wrote the following code to scrape tendering information from a portal on a daily basis.

packages <- c('rvest', 'stringi', 'tidyverse','lubridate','dplyr')
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE)
start_time <- proc.time()

Main page to scrape, and to get the total number of records.

data <- read_html('https://eprocure.gov.in/mmp/latestactivetenders')
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
All_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
All_tenders <- cbind(All_tenders,links_fair)

Reading the total number of records to fetch

Count_of_Recs_raw <- html_nodes(data, xpath = '//*[(@id = "edit-l-active-teners")]//div')
Count_of_Recs <- as.numeric(gsub("Total Tenders : ","",html_text(Count_of_Recs_raw[1])))

Functions for cleaning and processing data fields such as dates and factors.

process_dates <- function(data){
    cols2date <- c('Bid.Submission.Closing.Date','epublished_date','document_download_start_date','bid_submission_start_date','bid_opening_date','document_download_end_date','bid_submission_end_date')
    date_processed_data <- data
    date_processed_data[cols2date] <- lapply(data[cols2date] , dmy_hm)
    return(date_processed_data)
}

clean_process_data <- function(data){
    cols2factor <- c('State.Name','product_category','pre_qualification','organisation_name','organisation_type','tender_type')
    clean_processed_data <- data
    clean_processed_data[cols2factor] <- lapply(data[cols2factor] , factor)
   #clean_processed_data <- process_dates(clean_processed_data)
    return(clean_processed_data)

}
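These helpers are meant to run on the assembled data frame after scraping; a minimal usage sketch (assuming All_tenders ends up containing the columns named above) is:

# hypothetical usage once all pages have been scraped
All_tenders <- clean_process_data(All_tenders)
# All_tenders <- process_dates(All_tenders)   # once the date columns are present
str(All_tenders)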

The code below is precisely where my question lies...

Table scraping starts here. Page one has already been scraped to get the structure of the data frame.

for (page_no in 2:round(Count_of_Recs/10)){
    closeAllConnections()
    on.exit(closeAllConnections())
    url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
    url <- paste(url_bit1, page_no, sep="")
    cat(page_no,"\t",proc.time() - start_time,"\n")
    data <- read_html(url)
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
    links_fair <- html_attr(links,'href')
    links_fair <- links_fair[grep("tendersfullview",links_fair)]
    Page_tenders <- cbind(Page_tenders,links_fair)
    All_tenders <- rbind(All_tenders,Page_tenders)
 }

This for loop usually ends up taking hours to complete. I am looking to use the apply family to good effect so as to save time. The program is further responsible for fetching and processing all records, and then, for each individual record, scraping an entirely new page every time (code not listed here)...

I have tried the following code, but it doesn't give me what I want:

url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
   closeAllConnections()
   on.exit(closeAllConnections())
   url <- paste(url_bit1, datain$S.No., sep="")
   cat(S.No.,"\t",proc.time() - start_time,"\n")
   data <- read_html(url)
   total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
   Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
   links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
   links_fair <- html_attr(links,'href')
   links_fair <- links_fair[grep("tendersfullview",links_fair)]
   Page_tenders <- cbind(Page_tenders,links_fair)
   All_tenders <- rbind(All_tenders,Page_tenders)
}

All_tenders <- sapply(All_tenders, FUN=read_page(All_tenders$S.No.))

Any advice, guidance, suggestions, inputs or help is welcome. I have been using R for only 3-4 months. I am also aware of Python's strengths in this area over R, but I am inclined towards R for the solution to this problem.

Shikhar Parashar

2 Answers


Your sapply call is incorrect. I made some edits to your code and tested it on a sample size of N = 50. We can use system.time() to find out how much time it takes to finish the task.

The "for" approach:

system.time(
  for (page_no in 1:50){
    closeAllConnections()
    on.exit(closeAllConnections())
    url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
    url <- paste(url_bit1, page_no, sep="")
    cat(page_no,"\t",proc.time() - start_time,"\n")
    data <- read_html(url)
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
    links_fair <- html_attr(links,'href')
    links_fair <- links_fair[grep("tendersfullview",links_fair)]
    Page_tenders <- cbind(Page_tenders,links_fair)
    All_tenders <- rbind(All_tenders,Page_tenders)
  }
)

#user  system elapsed 
# 50.15   81.26  132.73

The "lapply" approach:

All_tenders = NULL
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
  closeAllConnections()
  on.exit(closeAllConnections())
  url <- paste(url_bit1, datain, sep="")
  cat(datain,"\t",proc.time() - start_time,"\n")
  data <- read_html(url)
  total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
  Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
  links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
  links_fair <- html_attr(links,'href')
  links_fair <- links_fair[grep("tendersfullview",links_fair)]
  Page_tenders <- cbind(Page_tenders,links_fair)
  All_tenders <- rbind(All_tenders,Page_tenders)
}

system.time(
  All_tenders <- lapply(1:50, function(x) read_page(x))
)
# user  system elapsed 
# 49.84   78.97  131.16

If we want to put our results in a data frame, we can transform the All_tenders list into a data frame as follows:

All_tenders = do.call(rbind, lapply(All_tenders, data.frame, stringsAsFactors=FALSE))
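Since each element of All_tenders is already a data frame, an equivalent one-liner (assuming dplyr, which the question already loads, or data.table is available) is:

All_tenders <- dplyr::bind_rows(All_tenders)
# or: All_tenders <- data.table::rbindlist(All_tenders)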

It turns out lapply is slightly faster.

Aleksandr
  • I think that changing `All_tenders` in your function makes it really slow... – Emmanuel-Lin Aug 29 '17 at 08:01
  • Well, you have to deal with it. Web scraping is not fast; otherwise you might get banned by the server admins. The alternative (significantly faster) approach would be to use TOR and multiple requests from different IPs via Python, but that is another story. – Aleksandr Aug 29 '17 at 08:04
  • That was silly on my part... as I mentioned, I am only a novice in R. Can you please elaborate on the following part: system.time( All_tenders <- lapply(1:50, function(x) read_page(x)) ) and All_tenders = do.call(rbind, lapply(All_tenders, data.frame, stringsAsFactors=FALSE)) – Shikhar Parashar Aug 29 '17 at 08:06
  • @Emmanuel-Lin: All_tenders is the data frame for which I am looking to process each row... Any suggestions on improving the above performance? – Shikhar Parashar Aug 29 '17 at 08:08
  • @ShikharParashar You can check my answer below; it should work ;) – Emmanuel-Lin Aug 29 '17 at 08:10
  • @AleksandrVoitov: Any place other than CRAN where I can read more about the apply family to get a better understanding of it? – Shikhar Parashar Aug 29 '17 at 08:54
  • You may look here: https://stackoverflow.com/questions/3505701/r-grouping-functions-sapply-vs-lapply-vs-apply-vs-tapply-vs-by-vs-aggrega – Aleksandr Aug 29 '17 at 09:02

for loops and sapply work differently:

- for loops do things iteratively: they do the computation on the first element, then on the second...
- sapply does things on the list of elements independently (and in any order), so the results are constructed independently.

So at the end of your for loop, when you do:

All_tenders <- rbind(All_tenders,Page_tenders)

the All_tenders variable grows iteratively.

In your sapply function, however, this won't work (since it doesn't know the results for the other elements).
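A toy example (with made-up data, unrelated to the scraping itself) shows the difference:

# for loop: the result grows by repeatedly overwriting a shared variable
out_for <- NULL
for (i in 1:3) {
  out_for <- rbind(out_for, data.frame(page = i))
}

# lapply: each call returns its own piece; the pieces are combined afterwards
pieces  <- lapply(1:3, function(i) data.frame(page = i))
out_app <- do.call(rbind, pieces)
# out_for and out_app contain the same three rows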

So you should do something like this:

url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
   closeAllConnections()
   on.exit(closeAllConnections())
   url <- paste(url_bit1,  datain, sep="")
   cat(datain,"\t",proc.time() - start_time,"\n")
   data <- read_html(url)
   total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
   Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
   links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
   links_fair <- html_attr(links,'href')
   links_fair <- links_fair[grep("tendersfullview",links_fair)]
   Page_tenders <- cbind(Page_tenders,links_fair)
   return(Page_tenders)
}

This returns a result for each page; apply it in the following way:

All_tenders_tmp <- sapply(2:round(Count_of_Recs/10), FUN = read_page, simplify = FALSE)

Then your result will be a list of all the per-page results, and you can merge it with data.table::rbindlist, for example.
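A minimal sketch of that last merging step (assuming the data.table package is installed and All_tenders_tmp is the list produced above):

library(data.table)                         # assumes data.table is installed

All_tenders <- rbindlist(All_tenders_tmp)   # bind the per-page data frames into one table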

I hope I was clear.

Emmanuel-Lin