4

I'm working on a package which uses data.table internally. In this package, I have a function count_by which calculates the number of distinct IDs for a specific variable in a data.table by groups. With some help (R data.table: How to pass "everything possible" to by in a function?) I got this to work as expected:

library(data.table)
#> Warning: package 'data.table' was built under R version 3.6.2

# create example data
# three ids (A, B, C), each observed once per year from 2018 to 2020
sample_dt <- data.table(
    id = rep(LETTERS[1:3], each = 3L),
    year = rep(2018L:2020L, times = 3L),
    x = runif(9)
)
# blank out x for id "B" before 2020, so "B" only has a value in 2020
sample_dt[id == "B" & year < 2020, x := NA_real_]

# define inner function
#' Count distinct IDs among rows with a non-missing value, optionally by group.
#'
#' @param DT A data.table; it is subset with data.table `[i, j, by]` syntax.
#' @param id_var Column whose distinct values are counted, given either as a
#'   bare name or as a string.
#' @param val_var Column that must be non-NA for a row to be counted, given
#'   either as a bare name or as a string.
#' @param by Grouping specification, captured unevaluated and placed into the
#'   `by` argument of the data.table query. Defaults to NULL.
#' @return The result of the data.table query, with the count in a column
#'   named `distinct_ids`.
count_by <- function(DT, id_var, val_var, by = NULL) {
    # as.character(substitute(.)) maps a bare name (id) and a string ("id")
    # to the same character value "id".
    id_var <- as.character(substitute(id_var))
    val_var <- as.character(substitute(val_var))

    # substitute() replaces DT and by with the expressions the caller wrote and
    # id_var/val_var with the strings computed above; eval() then runs the query.
    # NOTE: when the caller is itself a function forwarding its own arguments,
    # the captured expressions are that function's argument names (e.g. `data`,
    # "id_var"), not what the end user typed.
    eval(substitute(
        DT[!is.na(get(val_var)), .(distinct_ids = uniqueN(get(id_var))), by = by]
    ))
}

# test inner function -> works as expected
(reference <- count_by(sample_dt, id_var = id, val_var = x, by = year))
#>    year distinct_ids
#> 1: 2018            2
#> 2: 2019            2
#> 3: 2020            3

identical(count_by(sample_dt, "id", x, year)       , reference)
#> [1] TRUE
identical(count_by(sample_dt, "id", "x", year)     , reference)
#> [1] TRUE
identical(count_by(sample_dt, "id", x, "year")     , reference)
#> [1] TRUE
identical(count_by(sample_dt, "id", x, c("year"))  , reference)
#> [1] TRUE
identical(count_by(sample_dt, "id", "x", "year")   , reference)
#> [1] TRUE
identical(count_by(sample_dt, "id", "x", c("year")), reference)
#> [1] TRUE
identical(count_by(sample_dt, id, "x", year)       , reference)
#> [1] TRUE
identical(count_by(sample_dt, id, "x", "year")     , reference)
#> [1] TRUE
identical(count_by(sample_dt, id, "x", c("year"))  , reference)
#> [1] TRUE
identical(count_by(sample_dt, id, x, "year")       , reference)
#> [1] TRUE
identical(count_by(sample_dt, id, x, c("year"))    , reference)
#> [1] TRUE

Created on 2020-02-20 by the reprex package (v0.3.0)

Now I would like to use the function count_by() within another function (minimal example below):

# define wrapper function
#' Coerce `data` to a data.table and delegate to count_by().
#'
#' @param data Object passed to as.data.table().
#' @param id_var,val_var,by Forwarded positionally to count_by().
wrapper <- function(data, id_var, val_var, by = NULL) {
    data <- as.data.table(data)
    # NOTE: count_by() captures its arguments with substitute(), so it sees the
    # symbols `data`, `id_var`, `val_var` and `by` written on the next line
    # rather than the expressions supplied by wrapper()'s caller. This is the
    # call that produces the error shown in the test below.
    count_by(data, id_var, val_var, by)
}

# test wrapper function
wrapper(sample_dt, id_var = id, val_var = x, by = year)
#> Error in .(distinct_ids = uniqueN(get("id_var"))): could not find function "."

Created on 2020-02-20 by the reprex package (v0.3.0)

Debugging count_by() led to the observation that if count_by() is called from wrapper(), substitute(DT[...]) also substitutes DT with data:

Browse[2]> substitute(
+         DT[!is.na(get(val_var)), .(distinct_ids = uniqueN(get(id_var))), by = by]
+     )
data[!is.na(get("val_var")), .(distinct_ids = uniqueN(get("id_var"))), 
    by = by]

Since data is not available in the function environment of count_by() it gets evaluated to utils::data which then leads to the error. This makes the problem clear, but I cannot think of a solution.

I need to substitute the whole DT[...] expression in order for by to work properly (see R data.table: How to pass "everything possible" to by in a function? or pass variables and names to data.table function). But I can't substitute the whole expression if DT is not to be substituted as well.

What is the solution to this dilemma?

der_grund
  • 1,898
  • 20
  • 36
  • Life would be easier if you passed your variable arguments always as strings, replacing the need for `eval(substitute(...))`. – s_baldur Feb 20 '20 at 08:22
  • I agree, but then it would be impossible to pass expressions like `year > 2019` into `by`, right? – der_grund Feb 20 '20 at 08:44
  • You *can* using `get()` directly on the string: `DT <- data.table(year = 2010:2020);foo <- function(dt, y) dt[, .N, by = get(y) < 2019];foo(DT, "year")` – s_baldur Feb 20 '20 at 08:52
  • I hadn't thought of `get()`ing only part of `by` yet. But then `by` is somewhat fixed, and I couldn't easily pass `year == 2019` without altering the function. And this is necessary: I want to use `by` just as in `data.table` directly. – der_grund Feb 20 '20 at 09:05

2 Answers

1

Getting NSE out of the picture works for this particular example and simplifies things a lot. But then you should pass the arguments as strings:

#' Count distinct IDs among rows with a non-missing value, optionally by group.
#'
#' All column arguments are looked up by name with get(), so they are passed
#' as strings (no non-standard evaluation).
#'
#' @param DT A data.table.
#' @param id_var Name of the column whose distinct values are counted.
#' @param val_var Name of the column that must be non-NA for a row to count.
#' @param by Value handed to the `by` argument of the data.table query, e.g.
#'   a grouping column name such as "year". Defaults to NULL.
#' @return A data.table with the count in a column named `distinct_ids`.
count_by <- function(DT, id_var, val_var, by = NULL) {
    DT[
        !is.na(get(val_var)),
        list(distinct_ids = uniqueN(get(id_var))),
        by = by
    ]
}

#' Thin pass-through to count_by().
#'
#' @param data Passed to count_by() as `DT`.
#' @param id_var,val_var,by Passed to count_by() unchanged.
#' @return Whatever count_by() returns.
wrapper <- function(data, id_var, val_var, by = NULL) {
    count_by(DT = data, id_var = id_var, val_var = val_var, by = by)
}

wrapper(sample_dt, id_var = "id", val_var = "x", by = "year")

#    year distinct_ids
# 1: 2018            2
# 2: 2019            2
# 3: 2020            3
s_baldur
  • 29,441
  • 4
  • 36
  • 69
1

@chinsoon12, thank you very much! Your answer almost did it! I still need to convert id_var and val_var to character and then get() these in the call to data.table - otherwise, passing strings to id_var and val_var does not work.

But evaluating at the higher level is the necessary key idea. Here is the complete answer for future reference:

# define inner function
#' Build (but do not run) the data.table query that counts distinct IDs.
#'
#' @param DT Expression for the data.table; captured unevaluated.
#' @param id_var Column whose distinct values are counted; bare name or string.
#' @param val_var Column that must be non-NA; bare name or string.
#' @param by Grouping expression; captured unevaluated. Defaults to NULL.
#' @return An unevaluated call of the form
#'   `DT[!is.na(get("val")), .(distinct_ids = uniqueN(get("id"))), by = by]`
#'   with the caller's expressions filled in; the caller is responsible for
#'   evaluating it.
count_by <- function(DT, id_var, val_var, by = NULL) {
    # Collect every placeholder of the query template together with the value
    # that replaces it: DT and by keep the caller's expressions, while the two
    # column arguments are reduced to strings (bare name or string alike).
    replacements <- list(
        DT = substitute(DT),
        id_var = as.character(substitute(id_var)),
        val_var = as.character(substitute(val_var)),
        by = substitute(by)
    )

    substitute(
        DT[!is.na(get(val_var)), .(distinct_ids = uniqueN(get(id_var))), by = by],
        replacements
    )
}

# define wrapper function
#' Count distinct IDs by group for any object coercible to a data.table.
#'
#' @param data Object passed to as.data.table().
#' @param id_var,val_var Column given as a bare name or a string; forwarded
#'   unevaluated to count_by().
#' @param by Grouping expression; forwarded unevaluated to count_by().
#'   Defaults to NULL.
#' @return The result of evaluating the expression returned by count_by().
wrapper <- function(data, id_var, val_var, by = NULL) {
    data <- as.data.table(data)
    # substitute() rewrites the count_by() call so that it carries what the
    # caller of wrapper() wrote for id_var, val_var and by, instead of this
    # function's argument names; evaluating that call yields the query
    # expression (count_by() is expected to return it unevaluated).
    # NOTE(review): `data` is an ordinary local after the line above, so
    # substitute() presumably inlines the table itself rather than a symbol -
    # confirm if the size of the generated call matters.
    expr <- eval(substitute(count_by(data, id_var, val_var, by)))
    # Run the generated query in this function's frame.
    eval(expr)
}

# test wrapper function
(reference <- (wrapper(sample_dt, id_var = id, val_var = x, by = year)))
#>    year distinct_ids
#> 1: 2018            2
#> 2: 2019            2
#> 3: 2020            3

identical(wrapper(sample_dt, "id", x, year)       , reference)
#> [1] TRUE
identical(wrapper(sample_dt, "id", "x", year)     , reference)
#> [1] TRUE
identical(wrapper(sample_dt, "id", x, "year")     , reference)
#> [1] TRUE
identical(wrapper(sample_dt, "id", x, c("year"))  , reference)
#> [1] TRUE
identical(wrapper(sample_dt, "id", "x", "year")   , reference)
#> [1] TRUE
identical(wrapper(sample_dt, "id", "x", c("year")), reference)
#> [1] TRUE
identical(wrapper(sample_dt, id, "x", year)       , reference)
#> [1] TRUE
identical(wrapper(sample_dt, id, "x", "year")     , reference)
#> [1] TRUE
identical(wrapper(sample_dt, id, "x", c("year"))  , reference)
#> [1] TRUE
identical(wrapper(sample_dt, id, x, "year")       , reference)
#> [1] TRUE
identical(wrapper(sample_dt, id, x, c("year"))    , reference)
#> [1] TRUE

# test expression in by
wrapper(sample_dt, "id", x, by = .(year_2019 = year > 2019L))
#>    year_2019 distinct_ids
#> 1:     FALSE            2
#> 2:      TRUE            3

Created on 2020-02-20 by the reprex package (v0.3.0)

der_grund
  • 1,898
  • 20
  • 36