-1

I am learning web scraping and have been facing one hurdle after another. I want to create a data frame containing the first table on this page for every portfolio manager, for August 2022.

So far, I have found a way to scrape a single table properly (I think! Please let me know if I can improve on this).

I haven't been able to bind all the tables into a data frame properly. I also wanted to find out whether there is a way to transform this form-type data into a proper data frame, with the first column of every table as the variable names and the second column as the row of values (I know I can use the usual data wrangling approach, but I wanted to know whether some function helps transform this form-type data into a data frame).

> library(tidyverse)

> library(rvest)

> library(httr)

> url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"

> pm_id <- read_html(url) %>%
+   html_elements('select[name="pmrId"].f_control option') %>%
+   html_attr("value")

> pm_id <- pm_id[2:416]

> sebi_pm <- function(x) {
+   resp = POST(url, 
+               body = list(
+               pmrId= x,
+               year="2022",
+               m .... [TRUNCATED] 

> #s <- lapply(pm_id[i], sebi_pm)
> #v <- sebi_pm(pm_id[1])
> #v
> #do.call() lapply(pm_id[1:5], sebi_pm)
> ha <- do.call("rbind", lapply(pm_id, sebi_ .... [TRUNCATED] 
#> Error in .[[1]] : subscript out of bounds
mathplyr
  • 3
  • 2
  • 2
    Please take a moment to provide a minimal reproducible example - https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example . Currently you've copied output from your R console with all its `>`s & `+`s and truncated expressions; it's not really clear what your function should return or how to interpret the comments in your code. If it's about combining tables, include some of your tables as `dput()` outputs as part of the reprex and drop the scraping part. The functions you are after are probably `tidyr::pivot_wider()` or just plain transpose (`t()`) to swap rows & cols. – margusl Oct 12 '22 at 19:07

1 Answer

1

Normally I would be a stickler for a reproducible example, but I think I know what you're getting at here... try this...

# DEPENDENCIES -----------------------------------------------------------------
library(rvest)
library(httr)
library(stringr)
library(data.table)

# UTILITY FUNCTIONS ------------------------------------------------------------
get_pm_ids <- function() {
  # Download the SEBI portfolio-manager report page, locate every <option>
  # of the `pmrId` drop-down, and return their `value` attributes as a
  # character vector (in document order).
  landing_page <- read_html(
    "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
  )
  option_nodes <- html_elements(
    landing_page,
    'select[name="pmrId"].f_control option'
  )
  html_attr(option_nodes, 'value')
}

#' Fetch one portfolio manager's monthly report from the SEBI website.
#'
#' Posts the report search form and reshapes the first report table (whose
#' rows are caption/value pairs) into a single-row data frame with one column
#' per caption, plus `id`, `report_year` and `report_month`.
#'
#' @param pmr_id Portfolio manager id, i.e. one `value` of the `pmrId`
#'   drop-down. When the id has at least three '@@'-separated fields, the
#'   third one is used as the label in the progress message; otherwise the
#'   whole id is shown.
#' @param report_year Year sent as the form's `year` field.
#' @param report_month Month sent as the form's `month` field.
#' @return A one-row data frame. When the request fails (a warning is
#'   raised), or when the page contains no report table or no caption/value
#'   rows, only `id`, `report_year` and `report_month` are present.
get_monthly_report <- function(pmr_id, report_year, report_month) {
  # Placeholder row used whenever no report could be extracted, so every
  # requested manager still shows up once the results are row-bound.
  empty_report <- function() {
    data.frame(id = pmr_id,
               report_year = report_year,
               report_month = report_month)
  }

  # Guard the label lookup: indexing column 3 directly fails with
  # "subscript out of bounds" for ids with fewer than three '@@' fields.
  id_parts <- str_split(pmr_id, '@@', simplify = TRUE)
  pm_label <- if (ncol(id_parts) >= 3) id_parts[, 3] else pmr_id
  msg <- sprintf('fetching report for portfolio manager: %s; year = %s; month = %s',
                 str_squish(pm_label),
                 report_year,
                 report_month)
  message(msg)

  url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
  params <- list(
    currdate = '',
    loginflag = 0,
    searchValue = '',
    pmrId = pmr_id,
    year = report_year,
    month = report_month,
    loginEmail = '',
    loginPassword = '',
    cap_login = '',
    moduleNo = -1,
    moduleId = '',
    link = '',
    yourName = '',
    friendName = '',
    friendEmail = '',
    mailmessage = '',
    cap_email = ''
  )
  resp <- POST(url, body = params)

  # Do not let a failed request masquerade silently as "no records found".
  if (httr::http_error(resp)) {
    warning(
      sprintf('request for portfolio manager id "%s" failed with HTTP status %s',
              pmr_id, httr::status_code(resp)),
      call. = FALSE
    )
    return(empty_report())
  }

  pg <- httr::content(resp)
  tbl <- html_nodes(
    pg,
    'div.portlet:nth-child(3) > div:nth-child(1) > table:nth-child(1)'
  )
  if (length(tbl) == 0) {
    # no records found
    return(empty_report())
  }

  # Keep only rows that really are caption/value pairs. Extracting captions
  # and values from the same filtered rows guarantees both vectors have the
  # same length and stay aligned.
  row_cells <- lapply(html_nodes(tbl, 'tr'), html_children)
  row_cells <- Filter(function(cells) length(cells) >= 2, row_cells)
  if (length(row_cells) == 0) {
    return(empty_report())
  }
  cell_captions <- vapply(row_cells,
                          function(cells) html_text(cells[[1]]),
                          character(1))
  cell_contents <- vapply(row_cells,
                          function(cells) html_text(cells[[2]]),
                          character(1))

  # Transpose the values into a single row and label columns by caption.
  result_df <- data.frame(t(cell_contents))
  colnames(result_df) <- cell_captions
  result_df$id <- pmr_id
  result_df$report_year <- report_year
  result_df$report_month <- report_month
  result_df
}

# MAIN -------------------------------------------------------------------------

## 1. fetch list of portfolio manager ids --------------------------------------
pm_ids <- get_pm_ids()

## 2. filter list of portfolio manager ids -------------------------------------
# Drop only the first option instead of slicing a hard-coded 2:416: a fixed
# upper bound yields NA ids when the drop-down has fewer entries and silently
# skips managers when it has more.
# NOTE(review): presumably the first option is the drop-down's placeholder
# entry rather than a real manager -- confirm against the live page.
pm_ids <- pm_ids[-1]

## 3. testing: fetch reports for a sample of managers in January 2022 ----------
set.seed(1234)
# Cap the sample size so a short id list does not make sample() fail.
tmp <- sample(pm_ids, min(5, length(pm_ids)))
reports_list <- lapply(tmp, get_monthly_report, 2022, 1)

## 4. combine the results ------------------------------------------------------
# fill = TRUE pads columns that are missing from some reports (e.g. the
# id-only rows returned when no report table was found) with NA.
reports_df <- rbindlist(reports_list, use.names = TRUE, fill = TRUE) |>
  as.data.frame()

## 5. inspect results ----------------------------------------------------------
# View() opens a data viewer, which is only meaningful in interactive sessions.
if (interactive()) {
  View(reports_df, 'downloaded reports')
}

                                                      

This code could be improved by providing some kind of input validation and more robust error handling. Hope this helps!

br00t
  • 1,440
  • 8
  • 10