You could grab the table without isotopes, then mimic the post request the page does if you decide to go with isotopes; then left-join the two on Name
column. You will get more rows back than were in left table (no isotopes) because there are multiple Change values
, but this matches with what you see in the method of viewing isotopes you describe, where there are comma separated lists of values against isotopes, within plots, rather than split out by rows.
I go for a more selective css selector to target the specific table of interest initially, rather than indexing into lists.
I use write_excel_csv
to preserve the character encoding of headers on write out (an idea I got from @stefan).
You can remove columns you don't want in output from joint_table
before writing out (subset/select etc).
r
library(dyplr)
library(httr)
library(rvest)
library(readr)
library(magrittr)
library(stringr)
webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")
no_isotopes <- webpage %>%
html_node("#maintable") %>%
html_table(fill = T)
data <- list(
'sfor' = "names",
'stype' = "contains",
'country' = "All",
'categ' = "Ungrouped achondrites",
'page' = "0",
'map' = "ge",
'srt' = "name",
'lrec' = "200",
'pnt' = "Oxygen isotopes",
'mblist' = "All",
'snew' = "0",
'sea' = "*"
)
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)
isotopes <- content(r, "text") %>%
read_html(encoding = "UTF-8") %>%
html_node("#maintable") %>%
html_table(fill = T)
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)
write_excel_csv(x = joint_table, path = "joint.csv", col_names = T, na = "")
Example output:

Edit:
Adding in the additional information that comes from other urls as per your request in comments. I had to dynamically determine which table number to pick up, as well as handle cases where no table present.
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
library(readr)
library(furrr)
get_table <- function(url) {
page <- read_html(url)
test_list <- page %>%
html_nodes("#maintable tr > .inside:nth-child(odd)") %>%
html_text() # get left hand column %>%
index <- match(TRUE, stringr::str_detect(test_list, "Data from:")) + 1
table <- page %>%
html_node(paste0("#maintable tr:nth-of-type(", index, ") table")) %>%
html_table() %>%
as_tibble()
temp <- set_names(data.frame(t(table[, -1]), row.names = c()), t(table[, 1])) # https://www.nesono.com/node/456 ; https://stackoverflow.com/a/7970267/6241235
return(temp)
}
start_url <- "https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0"
base <- "https://www.lpi.usra.edu"
webpage <- read_html(start_url)
no_isotopes <- webpage %>%
html_node("#maintable") %>%
html_table(fill = T)
data <- list(
"sfor" = "names",
"stype" = "contains",
"country" = "All",
"categ" = "Ungrouped achondrites",
"page" = "0",
"map" = "ge",
"srt" = "name",
"lrec" = "200",
"pnt" = "Oxygen isotopes",
"mblist" = "All",
"snew" = "0",
"sea" = "*"
)
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)
isotopes <- content(r, "text") %>%
read_html(encoding = "UTF-8") %>%
html_node("#maintable") %>%
html_table(fill = T)
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)
lookups <- webpage %>%
html_node("#maintable") %>%
html_nodes("td:nth-of-type(1) a") %>%
map_df(~ c(html_text(.), html_attr(., "href")) %>%
set_names("Name", "Link")) %>%
mutate(Link = paste0(base, gsub("\\s+", "%20", Link)))
error_df <- tibble(
`State/Prov/County:` = NA_character_,
`Origin or pseudonym:` = NA_character_,
`Date:` = NA_character_,
`Latitude:` = NA_character_,
`Longitude:` = NA_character_,
`Mass (g):` = NA_character_,
`Pieces:` = NA_character_,
`Class:` = NA_character_,
`Shock stage:` = NA_character_,
`Fayalite (mol%):` = NA_character_,
`Ferrosilite (mol%):` = NA_character_,
`Wollastonite (mol%):` = NA_character_,
`Magnetic suscept.:` = NA_character_,
`Classifier:` = NA_character_,
`Type spec mass (g):` = NA_character_,
`Type spec location:` = NA_character_,
`Main mass:` = NA_character_,
`Finder:` = NA_character_,
`Comments:` = NA_character_,
)
no_cores <- future::availableCores() - 1
future::plan(future::multisession, workers = no_cores)
df <- furrr::future_map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))
colnames(df) <- sub(":", "", colnames(df))
df2 <- df %>%
mutate(
`Mass (g)` = gsub(",", "", `Mass (g)`),
across(c(`Mass (g)`, `Magnetic suscept.`), as.numeric)
)
if (nrow(df2) == nrow(no_isotopes)) {
additional_info <- cbind(lookups, df2)
joint_table$Name <- gsub(" \\*\\*", "", joint_table$Name)
final_table <- dplyr::left_join(joint_table, additional_info, by = "Name", copy = FALSE)
write_excel_csv(x = final_table, file = "joint.csv", col_names = T, na = "")
}
Created on 2021-02-27 by the reprex package (v0.3.0)
N.B.
OP had problems with lookups variable for some reason so here is an alternative I wrote that worked for them:
lookups <- map_df(
webpage %>% html_node("#maintable") %>% html_nodes("td:nth-of-type(1) a") , ~
data.frame(
Name = .x %>% html_text(),
Link = paste0(base, gsub("\\s+", "%20", .x %>% html_attr("href")))
)
) %>% as_tibble()