Or, possibly something readable:
library(xml2)
library(tidyverse)
This will help make better column names:
# Tidy up the column names of `tbl`: lower-case them, turn every run of
# punctuation/whitespace into a single underscore, trim underscores from both
# ends, and de-duplicate any clashes with a numeric "_<n>" suffix.
# Returns `tbl` with only its column names changed.
mcga <- function(tbl) {
  cleaned <- gsub("[[:punct:][:space:]]+", "_", tolower(colnames(tbl)))
  cleaned <- gsub("_+", "_", cleaned)
  cleaned <- gsub("(^_|_$)", "", cleaned)
  colnames(tbl) <- make.unique(cleaned, sep = "_")
  tbl
}
The column types get figured out automagically, but it's nice to define them explicitly once they are known, since that helps with data consistency:
# Explicit column specification: the columns named here are parsed as
# character/double, and every other column falls back to integer.
xdf_cols <- cols(
  site = col_character(),
  aod_47 = col_double(),
  omi_aot = col_double(),
  omi_no2 = col_double(),
  fit = col_double(),
  lng = col_double(),
  lat = col_double(),
  .default = col_integer()
)
Now the work:
# Download and parse the XML document.
doc <- read_xml("http://www.mahdial-husseini.com/xmlthing.php")

# One row per <PPM1_0> node, built from that node's attributes; then clean the
# column names and convert the columns using the explicit column spec.
xdf <- doc %>%
  xml_find_all(".//PPM1_0") %>%
  map_df(function(node) as.list(xml_attrs(node))) %>%
  mcga() %>%
  type_convert(col_types = xdf_cols)
The `type_convert()` call isn't strictly necessary, but it — together with the column definitions — makes for consistent results.
And, the results:
# Auto-print the resulting tibble (console output shown below).
xdf
## # A tibble: 8 x 21
## sample site month day year hour jd doy pm25_hourly aod_47 omi_aot omi_no2 fit res
## <int> <chr> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0 duluth 0 0 0 0 0 0 0 0.000 0.000 0.000 0.00000 0
## 2 19 <NA> 12 0 2004 5 0 0 30 0.000 0.000 0.000 0.00000 0
## 3 4545 Sarasota 4 0 2017 0 0 0 0 0.000 0.000 0.000 0.00000 0
## 4 11111 Atlanta 10 1 2004 13 2453280 275 23 0.379 0.148 0.274 16.01850 NA
## 5 11112 Birmingham 10 2 2008 14 2453281 276 0 0.000 0.000 0.000 19.19440 0
## 6 11113 Savannah 10 3 2004 13 2453282 277 15 0.181 0.133 0.127 9.00433 NA
## 7 11114 Fort Knox 6 20 2017 21 0 301 18 0.000 0.000 0.000 0.00000 0
## 8 63738 Fort Rucker 1 0 2015 0 0 0 40 0.000 0.000 0.000 0.00000 0
## # ... with 7 more variables: lng <dbl>, lat <dbl>, rel_humid <int>, altitude <int>, pressure <int>,
## # signal_received <int>, temp_c <int>
Full structure:
# List every column of xdf with its type and leading values (output below).
glimpse(xdf)
## Observations: 8
## Variables: 21
## $ sample <int> 0, 19, 4545, 11111, 11112, 11113, 11114, 63738
## $ site <chr> "duluth", NA, "Sarasota", "Atlanta", "Birmingham", "Savan...
## $ month <int> 0, 12, 4, 10, 10, 10, 6, 1
## $ day <int> 0, 0, 0, 1, 2, 3, 20, 0
## $ year <int> 0, 2004, 2017, 2004, 2008, 2004, 2017, 2015
## $ hour <int> 0, 5, 0, 13, 14, 13, 21, 0
## $ jd <int> 0, 0, 0, 2453280, 2453281, 2453282, 0, 0
## $ doy <int> 0, 0, 0, 275, 276, 277, 301, 0
## $ pm25_hourly <int> 0, 30, 0, 23, 0, 15, 18, 40
## $ aod_47 <dbl> 0.000, 0.000, 0.000, 0.379, 0.000, 0.181, 0.000, 0.000
## $ omi_aot <dbl> 0.000, 0.000, 0.000, 0.148, 0.000, 0.133, 0.000, 0.000
## $ omi_no2 <dbl> 0.000, 0.000, 0.000, 0.274, 0.000, 0.127, 0.000, 0.000
## $ fit <dbl> 0.00000, 0.00000, 0.00000, 16.01850, 19.19440, 9.00433, 0...
## $ res <int> 0, 0, 0, NA, 0, NA, 0, 0
## $ lng <dbl> 84.1000, 63.6167, -82.5300, -84.7000, -86.8000, -81.1000,...
## $ lat <dbl> 34.0000, 38.4161, 27.3300, 33.7500, 33.5200, 32.0800, 37....
## $ rel_humid <int> 0, 0, 0, 0, 0, 0, 0, 0
## $ altitude <int> 0, 0, 0, 0, 0, 0, 0, 0
## $ pressure <int> 0, 0, 0, 0, 0, 0, 0, 0
## $ signal_received <int> 0, 0, 0, 0, 0, 0, 0, 0
## $ temp_c <int> 0, 0, 0, 0, 0, 0, 0, 0