I would like to paste column names with their values. It must be based on some condition (if statement) and it can be based on a single variable or multiple variables.
Below is a small example showing how the data looks like. I would like to speed up this process and get the same results as the fun2, fun3, and fun4.
To make this as simple as possible, there is only one rule to set to missing if columns a, b, c, and d have values bigger than zero. But, I left the name of the rule, because it can be different, like "rule 1" > 0 and "rule 2" if is non-missing.
library("data.table")
library("tidytable")
library("glue")
library("stringi")
library("benchr")
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a = c(0, NA, 0, 1, 0, NA, 1, 1, 0, 1),
b = c(0, NA, NA, 0, 1, 0, 1, NA, 1, 1),
c = c(0, NA, 0, NA, 0, 1, NA, 1, 1, 1),
d = c(0, NA, 1, 1, 0, 1, 0, 1, NA, 1),
re = "")
This it how the data looks like:
id t1 t2 a b c d re
1 0.6883367 -0.3454049 0 0 0 0 ''
2 -1.0653127 -1.3035077 NA NA NA NA ''
3 0.5210550 0.8489376 0 NA 0 1 ''
4 0.3697369 -0.1135827 1 0 NA 1 ''
5 1.3195759 -1.5431305 0 1 0 0 ''
6 -0.2106836 -0.3421900 NA 0 1 1 ''
7 -0.2258871 -2.1644697 1 1 NA 0 ''
8 -0.7132686 1.7673775 1 NA 1 1 ''
9 0.9467068 1.8188665 0 1 1 NA ''
10 -0.3900479 1.7306935 1 1 1 1 ''
Bellow is the desired output. The idea is to keep a column whit a description with the reason some value has been set to missing. In this example, only the first two individuals have records for both t1 and t2. Individuals 1, 2, and 3 have records for t1, while individuals 1, 2, 5, 7, and 9 have records for t2.
id t1 t2 a b c d re
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1);"
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1);"
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1);"
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1);"
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1);"
First attempt (fun1). Not the expected results because it looks for single whitespace inside mutate. All the other functions (fun2, fun3, and fun4) print the right results.
fun1 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 NA 1.43 0 0 0 0 "Rule1:t1( ); "
2 NA 0.733 NA NA NA NA "Rule1:t1( ); "
3 NA NA 0 NA 0 1 "Rule2:t2(d=1); Rule1:t1( ); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
Function 2 (fun2) uses "trimws".
fun2 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := trimws(do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
Function 3 (fun3) uses "gsub" with regular expression.
fun3 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := gsub("\\s+","", do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1b=1c=1); "
Function 4 (fun4) uses stri_detect inside mutate with regular expression.
fun4 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(!stri_detect(aux, regex = "[[:alpha:]]") ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(!stri_detect(aux, regex = '[[:alpha:]]') ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
Benchmark with more data
n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a = sample(c(0, NA, 1), n, replace = TRUE),
b = sample(c(0, NA, 1), n, replace = TRUE),
c = sample(c(0, NA, 1), n, replace = TRUE),
d = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
benchmark(fun1(dat),
fun2(dat),
fun3(dat),
fun4(dat))
Benchmark summary:
Time units : milliseconds
expr n.eval min lw.qu median mean up.qu max total relative
fun1(dat) 100 642 653 660 668 666 774 66800 1.00
fun2(dat) 100 742 756 763 773 768 874 77300 1.16
fun3(dat) 100 765 779 785 794 791 903 79400 1.19
fun4(dat) 100 743 756 763 777 770 1010 77700 1.16
Does anyone have an idea on how to speed up this process?
Thank you.