
There is an old HTML website with aggregate research data on plant viruses. I am interested in extracting the "Susceptible host species" and "Insusceptible host species" data for individual virus species.

The parent URL containing all individual, paginated URLs for individual species is here.

The URLs are paginated from 001 to 911. An example of one of these URLs, in this case for Abelia latent tymovirus, is found here.

With the kind help of someone on this board, I have managed to write code that extracts just the susceptible/insusceptible data from a single URL:

library(rvest)
library(stringr)

#### Preparing for the loop ####

## URL template
url <- 'http://bio-mirror.im.ac.cn/mirrors/pvo/vide/'

## List of all URLs to visit
list_of_pages <- str_c(url, 'descr', sprintf("%03d", 1:911), ".htm")

#### Functions to be executed for each URL

## Obtain URL data for one species (will need to change for loop)
pvo <- read_html("http://bio-mirror.im.ac.cn/mirrors/pvo/vide/descr001.htm")

## Extract URL-specific virus species identifier and cleanup
virus_species <- pvo %>% html_element("center+ h1") %>% html_text() %>% str_replace_all("[\n]", " ")

## Select only data in node h4, include listitems, and cleanup
all_values <- pvo %>% html_elements("h4, li") %>% html_text() %>% str_replace_all("[\n]" , "")

## Index headings in h4 to isolate susceptibility data
sus_index <- grep('Susceptible host species', all_values, fixed = TRUE)
insus_index <- grep('Insusceptible host species', all_values, fixed = TRUE)
family_sus_index <- grep('Families containing susceptible hosts', all_values, fixed = TRUE)

## Extract susceptible species data
susceptible_species <- all_values[(sus_index+1):(insus_index-1)]

## Extract insusceptible species data
insusceptible_species <- all_values[(insus_index+1):(family_sus_index-1)]
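
To make the index-based slicing above concrete, here is a toy example (the species names below are made up for illustration, not taken from the site):

## Toy vector standing in for all_values
toy_values <- c("Susceptible host species", "Chenopodium quinoa", "Nicotiana tabacum",
                "Insusceptible host species", "Zea mays",
                "Families containing susceptible hosts")

## fixed = TRUE and case sensitivity mean "Susceptible host species"
## does not also match "Insusceptible host species"
toy_sus    <- grep("Susceptible host species", toy_values, fixed = TRUE)              # 1
toy_insus  <- grep("Insusceptible host species", toy_values, fixed = TRUE)            # 4
toy_family <- grep("Families containing susceptible hosts", toy_values, fixed = TRUE) # 6

toy_values[(toy_sus + 1):(toy_insus - 1)]    # "Chenopodium quinoa" "Nicotiana tabacum"
toy_values[(toy_insus + 1):(toy_family - 1)] # "Zea mays"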

Now I would like to apply these functions over my list_of_pages and aggregate the data into one dataframe, ideally looking something like:

[Image: example aggregate data]

Can someone help me accomplish this? I have very little experience with loops and have mostly used dplyr.

EDIT: One very important thing I forgot to add: many of the URLs do not contain susceptibility/insusceptibility data, as the studies simply were not conducted. The loop must be able to continue through these cases and not terminate.

Examples of these URLs:

Contains susceptible data but no insusceptible data: http://bio-mirror.im.ac.cn/mirrors/pvo/vide/descr565.htm

Contains neither susceptible nor insusceptible data: http://bio-mirror.im.ac.cn/mirrors/pvo/vide/descr562.htm
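
For what it's worth, the failure mode on pages like these seems to be that grep() simply finds no match (a quick sketch, assuming the headings themselves are absent rather than just empty):

## On a page with no susceptibility data, the heading lookup comes back empty,
## so the (index + 1):(index - 1) subsetting errors out and stops a plain loop
pvo_empty <- read_html("http://bio-mirror.im.ac.cn/mirrors/pvo/vide/descr562.htm")
empty_values <- pvo_empty %>% html_elements("h4, li") %>% html_text()
grep('Susceptible host species', empty_values, fixed = TRUE)  # integer(0) expected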

1 Answer


Try this:

library(rvest)
library(stringr)

#### Preparing for the loop ####

## URL template
url <- 'http://bio-mirror.im.ac.cn/mirrors/pvo/vide/'

## List of all URLs to visit
list_of_pages <- str_c(url, 'descr', sprintf("%03d", 1:911), ".htm")

#### Functions to be executed for each URL

df <- data.frame()

for (pages in list_of_pages) {
  print(pages)
  ## Obtain URL data for one species (will need to change for loop)
  
  ## First, try to read the page and catch any error
  possible_error <- tryCatch(
    pvo <- read_html(pages),
    error = function(e)
      e
  )

  ## If an error was raised, record its message in the virus_species column
  ## (you will see messages like "Failed to parse text" for pages that could not be read)
  if (inherits(possible_error, 'error')) {
    temp_df <- data.frame(
      pages,
      virus_species = possible_error$message,
      susceptible_species = NA,
      insusceptible_species = NA
    )
    
    df <- rbind(df, temp_df)
    next
  }
  
  ## If no error found reading the html then collect all necessary data
  if (!inherits(possible_error, 'error')) {
    
    ## Extract URL-specific virus species identifier and cleanup
    virus_species <-
      pvo %>% html_element("center+ h1") %>% html_text() %>% str_replace_all("[\n]" , " ")
    
    ## Select only data in node h4, include listitems, and cleanup
    all_values <-
      pvo %>% html_elements("h4, li") %>% html_text() %>% str_replace_all("[\n]" , "")
    
    ## Index headings in h4 to isolate susceptibility data
    sus_index <-
      grep('Susceptible host species', all_values, fixed = TRUE)
    insus_index <-
      grep('Insusceptible host species', all_values, fixed = TRUE)
    family_sus_index <-
      grep('Families containing susceptible hosts', all_values, fixed = TRUE)
    
    ## Extract susceptible species data
    
    if (length(sus_index) > 0L & length(insus_index) > 0L) {
      susceptible_species <-
        all_values[(sus_index + 1):(insus_index - 1)]
    } else{
      susceptible_species <- NA
    }
    
    ## Extract insusceptible species data
    if (length(insus_index) > 0L & length(family_sus_index) > 0L) {
      insusceptible_species <-
        all_values[(insus_index + 1):(family_sus_index - 1)]
    } else{
      insusceptible_species <- NA
    }
    
    ll <-
      sapply(list(
        virus_species,
        susceptible_species,
        insusceptible_species
      ),
      length)
    
    mll <- max(ll)
    
    temp_df <- data.frame(
      pages = rep(pages, mll),
      virus_species = rep(virus_species, mll),
      susceptible_species = c(susceptible_species, rep(NA, mll - ll[2])),
      insusceptible_species = c(insusceptible_species, rep(NA, mll - ll[3]))
    )
    
    df <- rbind(df, temp_df)
    ## Uncomment the line below if you need a delay between requests
    ## Sys.sleep(1)
    
  }
}

df

For better error handling in R, see here.
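
If you are more comfortable with tidyverse-style code than an explicit for loop, the same per-page logic can be wrapped in a function and mapped over the URLs with purrr (a sketch only, using the same selectors and headings as above; purrr is an extra package not used in the question, and map_dfr() needs dplyr installed for the row binding):

library(rvest)
library(stringr)
library(purrr)

## One function that always returns a data.frame, even when the page
## fails to load or is missing one of the sections
scrape_page <- function(page) {
  pvo <- tryCatch(read_html(page), error = function(e) NULL)
  if (is.null(pvo)) {
    return(data.frame(pages = page, virus_species = NA,
                      susceptible_species = NA, insusceptible_species = NA))
  }

  virus_species <- pvo %>% html_element("center+ h1") %>%
    html_text() %>% str_replace_all("[\n]", " ")
  all_values <- pvo %>% html_elements("h4, li") %>%
    html_text() %>% str_replace_all("[\n]", "")

  sus_index        <- grep('Susceptible host species', all_values, fixed = TRUE)
  insus_index      <- grep('Insusceptible host species', all_values, fixed = TRUE)
  family_sus_index <- grep('Families containing susceptible hosts', all_values, fixed = TRUE)

  susceptible <- if (length(sus_index) > 0 && length(insus_index) > 0) {
    all_values[(sus_index + 1):(insus_index - 1)]
  } else {
    NA
  }
  insusceptible <- if (length(insus_index) > 0 && length(family_sus_index) > 0) {
    all_values[(insus_index + 1):(family_sus_index - 1)]
  } else {
    NA
  }

  ## Pad the shorter column with NA so both fit in one data.frame
  n <- max(length(susceptible), length(insusceptible))
  data.frame(
    pages = page,
    virus_species = virus_species,
    susceptible_species = c(susceptible, rep(NA, n - length(susceptible))),
    insusceptible_species = c(insusceptible, rep(NA, n - length(insusceptible)))
  )
}

## map_dfr() row-binds the per-page results; Sys.sleep(1) keeps the
## request rate polite, as discussed in the comments
df <- map_dfr(list_of_pages, function(page) {
  Sys.sleep(1)
  scrape_page(page)
})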

forhad
  • I appreciate this response so much. Before I try this, do you mind adding a 1000 ms delay in the loop so I don't overload the website or get locked out by the server? – Dieter Kahl Aug 06 '21 at 22:08
  • You can add `Sys.sleep(time)` after `df <- rbind(df, temp_df)` in the loop, where time is in seconds; i.e. after processing one URL it will wait the time you specify before loading the next one. – forhad Aug 07 '21 at 05:02
  • @DieterKahl I've updated the answer with better error handling and it worked for all URLs. – forhad Aug 09 '21 at 09:58
  • This works incredibly well. Thank you so much. – Dieter Kahl Aug 09 '21 at 21:49