Step 1: extract the header info.
hdr <- readr::read_csv("~/StackOverflow/14285118/Book1.csv", n_max = 3, col_names = FALSE)
# Parsed with column specification:
# cols(
# X1 = col_character(),
# X2 = col_character(),
# X3 = col_character(),
# X4 = col_character(),
# X5 = col_character()
# )
colnms <- unlist(hdr[1,], use.names = FALSE)
colnms
# [1] "Sample Date" "Site Code" "Arsenic" "Barium" "Antimony"
library(dplyr)
dat_units <- hdr %>%
select(-(1:2)) %>%
t() %>%
as.data.frame() %>%
setNames(., c("Metal", "Units", "TestCode"))
dat_units
# Metal Units TestCode
# X3 Arsenic ug/L TM66TW
# X4 Barium ug/L TM66TW
# X5 Antimony ug/L TM66FW
(That last part is a little inelegant in the tidyverse ecosystem, but it's fast and does what we need.)
Step 2: read all data, skipping the first three rows.
dat <- readr::read_csv("~/StackOverflow/14285118/Book1.csv", skip = 3, col_names = colnms)
# Parsed with column specification:
# cols(
# `Sample Date` = col_character(),
# `Site Code` = col_character(),
# Arsenic = col_double(),
# Barium = col_double(),
# Antimony = col_character()
# )
dat
# # A tibble: 4 x 5
# `Sample Date` `Site Code` Arsenic Barium Antimony
# <chr> <chr> <dbl> <dbl> <chr>
# 1 21/07/2016 1 0.7 88.6 <0.3
# 2 21/07/2016 A 0.7 110 <0.3
# 3 15/09/2016 1 0.5 67 <0.3
# 4 15/09/2016 A 0.4 96.5 <0.3
(Base R's read.csv
would work too, with read.csv(..., skip=3, header=FALSE)
.)
Step 3: adjust <0.3
to 0.15
I'll write a function named reduce_lessthan
which allows you to determine the scaling:
- if the parameter is already
numeric
, it passes through untouched;
- if a string and it is just a number, then it converts to
numeric
unmodified;
- if a string and it starts with
<
, then it removes the <
, converts to a numeric
, and multiplies by a factor (mult
); and
- otherwise, it will error
reduce_lessthan <- function(x, mult = 1) {
if (is.numeric(x)) return(x)
out <- numeric(length(x))
islessthan <- grepl("^\\s*<", x)
nolessthan <- grepl("^-?[0-9]+(\\.[0-9]*)?", x)
out[islessthan] <- as.numeric(gsub("^\\s*<\\s*", "", x[islessthan])) * mult
out[nolessthan] <- as.numeric(x[nolessthan])
if (!all(islessthan | nolessthan)) {
nm <- deparse(substitute(x))
stop("non-number/lessthan in ", sQuote(nm), ": ",
sQuote(x[ which(!islessthan & !nolessthan)[1] ]), " (may be more data)")
}
out
}
Arsenic <- c("<0.3", "0.3", "-0.3")
reduce_lessthan(Arsenic, mult = 0.5)
# [1] 0.15 0.30 -0.30
Arsenic <- ">0.5"
reduce_lessthan(Arsenic)
# Error in reduce_lessthan(Arsenic) :
# non-number/lessthan in 'Arsenic': '>0.5' (may be more data)
Step 4: use that last function on the data, and rejoin with the original header information.
library(dplyr)
library(tidyr) # pivot_longer
combined <- dat %>%
mutate(`Sample Date` = as.Date(`Sample Date`, "%d/%m/%Y")) %>%
mutate_at(vars(-one_of(colnms[1:2])), ~ reduce_lessthan(., mult = 0.5)) %>%
pivot_longer(-one_of(colnms[1:2]), names_to = "Metal") %>%
left_join(., dat_units, by = "Metal")
combined
# # A tibble: 12 x 6
# `Sample Date` `Site Code` Metal value Units TestCode
# <date> <chr> <chr> <dbl> <chr> <chr>
# 1 2016-07-21 1 Arsenic 0.7 ug/L TM66TW
# 2 2016-07-21 1 Barium 88.6 ug/L TM66TW
# 3 2016-07-21 1 Antimony 0.15 ug/L TM66FW
# 4 2016-07-21 A Arsenic 0.7 ug/L TM66TW
# 5 2016-07-21 A Barium 110 ug/L TM66TW
# 6 2016-07-21 A Antimony 0.15 ug/L TM66FW
# 7 2016-09-15 1 Arsenic 0.5 ug/L TM66TW
# 8 2016-09-15 1 Barium 67 ug/L TM66TW
# 9 2016-09-15 1 Antimony 0.15 ug/L TM66FW
# 10 2016-09-15 A Arsenic 0.4 ug/L TM66TW
# 11 2016-09-15 A Barium 96.5 ug/L TM66TW
# 12 2016-09-15 A Antimony 0.15 ug/L TM66FW
Book1.csv
:
Sample Date,Site Code,Arsenic,Barium,Antimony
,,ug/L,ug/L,ug/L
,,TM66TW,TM66TW,TM66FW
21/07/2016,1,0.7,88.6,<0.3
21/07/2016,A,0.7,110,<0.3
15/09/2016,1,0.5,67,<0.3
15/09/2016,A,0.4,96.5,<0.3