3

I am using dplyr and trying to create a function to calculate p-values based on grouping arguments. I would like to have a single argument that takes a list of variables, of any length, to group by. Here is the example dataset:

# Example data (apparently dput() output): a 144-row data.frame with columns
# Experiment, Sample, Genotype, variable, value and Day.
# Genotype is a factor with levels "miR-15/16 FL" and "miR-15/16 cKO";
# Day is a factor with levels "8", "15", "22" and "30+".
# Some Sample and value entries are NA.
dataset <- structure(list(Experiment = c(170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170222, 170222, 170222, 170222, 
170222, 170222, 170222, 170222, 170824, 170824, 170824, 170824, 
170824, 170824, 170824, 170824, 170824, 170824, 170824, 170824, 
170824, 170824, 170824, 170824, 170824, 170824, 170824, 170824, 
170824, 170824, 170824, 170824, 170824, 170824, 170824, 170824, 
170824, 170824, 170824, 170824, 170824, 170824, 170824, 170824, 
170824, 170824, 170824, 170824, 170824, 170824, 170824, 170824, 
170824, 170824, 170824, 170824, 170824, 170824, 170824, 170824, 
170824, 170824, 170824, 170824, 170824, 170824, 170824, 170824, 
170824, 170824, 170824, 170824), Sample = c("1: FL_496", "1: FL_496", 
"1: FL_496", "1: FL_496", "1: FL_496", "1: FL_496", "1: FL_496", 
"1: FL_496", "2: FL_505", "2: FL_505", "2: FL_505", "2: FL_505", 
"2: FL_505", "2: FL_505", "2: FL_505", "2: FL_505", "3: FL_509", 
"3: FL_509", "3: FL_509", "3: FL_509", "3: FL_509", "3: FL_509", 
"3: FL_509", "3: FL_509", "4: FL_514", "4: FL_514", "4: FL_514", 
"4: FL_514", "4: FL_514", "4: FL_514", "4: FL_514", "4: FL_514", 
"5: cKO_497", "5: cKO_497", "5: cKO_497", "5: cKO_497", "5: cKO_497", 
"5: cKO_497", "5: cKO_497", "5: cKO_497", "6: cKO_504", "6: cKO_504", 
"6: cKO_504", "6: cKO_504", "6: cKO_504", "6: cKO_504", "6: cKO_504", 
"6: cKO_504", "7: cKO_510", "7: cKO_510", "7: cKO_510", "7: cKO_510", 
"7: cKO_510", "7: cKO_510", "7: cKO_510", "7: cKO_510", "8: cKO_515", 
"8: cKO_515", "8: cKO_515", "8: cKO_515", "8: cKO_515", "8: cKO_515", 
"8: cKO_515", "8: cKO_515", "9: cKO_517", "9: cKO_517", "9: cKO_517", 
"9: cKO_517", "9: cKO_517", "9: cKO_517", "9: cKO_517", "9: cKO_517", 
NA, NA, NA, NA, NA, NA, NA, NA, "1: FL_627", "1: FL_627", "1: FL_627", 
"1: FL_627", "1: FL_627", "1: FL_627", "2: FL_628", "2: FL_628", 
"2: FL_628", "2: FL_628", "2: FL_628", "2: FL_628", "3: FL_633", 
"3: FL_633", "3: FL_633", "3: FL_633", "3: FL_633", "3: FL_633", 
"4: FL_636", "4: FL_636", "4: FL_636", "4: FL_636", "4: FL_636", 
"4: FL_636", "5: cKO_620", "5: cKO_620", "5: cKO_620", "5: cKO_620", 
"5: cKO_620", "5: cKO_620", "6: cKO_625", "6: cKO_625", "6: cKO_625", 
"6: cKO_625", "6: cKO_625", "6: cKO_625", "7: cKO_626", "7: cKO_626", 
"7: cKO_626", "7: cKO_626", "7: cKO_626", "7: cKO_626", "8: cKO_634", 
"8: cKO_634", "8: cKO_634", "8: cKO_634", "8: cKO_634", "8: cKO_634", 
"cKO_620", "cKO_620", "cKO_625", "cKO_625", "cKO_626", "cKO_626", 
"cKO_634", "cKO_634", "FL_627", "FL_627", "FL_628", "FL_628", 
"FL_633", "FL_633", "FL_636", "FL_636"), Genotype = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("miR-15/16 FL", 
"miR-15/16 cKO"), class = "factor"), variable = c("% CD127+", 
"% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", 
"% KLRG1+", "% KLRG1+", "% KLRG1+", "% CD127+", "% CD127+", "% CD127+", 
"% CD127+", "% KLRG1+", "% KLRG1+", "% KLRG1+", "% KLRG1+", "% CD127+", 
"% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", 
"% KLRG1+", "% KLRG1+", "% KLRG1+", "% CD127+", "% CD127+", "% CD127+", 
"% CD127+", "% KLRG1+", "% KLRG1+", "% KLRG1+", "% KLRG1+", "% CD127+", 
"% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", 
"% KLRG1+", "% KLRG1+", "% KLRG1+", "% CD127+", "% CD127+", "% CD127+", 
"% CD127+", "% KLRG1+", "% KLRG1+", "% KLRG1+", "% KLRG1+", "% CD127+", 
"% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% CD127+", "% CD127+", "% KLRG1+", "% KLRG1+", 
"% KLRG1+", "% CD127+", "% KLRG1+", "% CD127+", "% KLRG1+", "% CD127+", 
"% KLRG1+", "% CD127+", "% KLRG1+", "% CD127+", "% KLRG1+", "% CD127+", 
"% KLRG1+", "% CD127+", "% KLRG1+", "% CD127+", "% KLRG1+"), 
    value = c(1, 28.7, 40.1, 47.4, 64.1, 69.9, 73.1, 79.42, 0.99, 
    21.72, 33, 56.6, 55.5, 82.9, 84.96, 86.7, 3.94, 43.4, 49.5, 
    60.8, 57.1, 69.8, 71.4, 77.72, 1, 20.56, 28.77, 35.1, 71.07, 
    71.2, 78.16, 84.04, 3.77, 56.9, 60.5, 66.5, 43.7, 50.36, 
    50.8, 51.8, 3.24, 58.2, 59.8, 70.8, 47.9, 58.5, 59.5, 61.3, 
    4.21, 62, 65.7, 73.8, 40, 51.5, 53.1, 55.69, 9.48, 41.7, 
    44, 63, 53.7, 57.31, 60.4, 60.8, 3.84, 34.1, 41.1, 53.2, 
    55.07, 55.3, 62.2, 76.6, NA, NA, NA, NA, NA, NA, NA, NA, 
    12.01, 18.5, 20.99, 66.39, 77.2, 85.6, 12.8, 31.3, 35.11, 
    59.8, 85.5, 89.7, 32.1, 33.3, 34.7, 63.2, 71.6, 80.5, 15.3, 
    17.02, 33.5, 65.54, 82.7, 85.8, 41.61, 51.3, 69.3, 39.81, 
    59, 62, 46.6, 52.1, 67.8, 39.5, 58.8, 66, 52.2, 52.9, 68.7, 
    46, 55.9, 61.6, 45.17, 59.9, 74.3, 31.87, 48.4, 51.2, 6.2, 
    56.34, 4.17, 70.85, 3.54, 59.89, 5.61, 49.71, 1.87, 77.09, 
    0.51, 86.05, 1.8, 80.69, 2.15, 79.43), Day = structure(c(1L, 
    2L, 3L, 4L, 4L, 3L, 2L, 1L, 1L, 3L, 4L, 2L, 2L, 4L, 1L, 3L, 
    1L, 3L, 2L, 4L, 4L, 2L, 3L, 1L, 1L, 3L, 4L, 2L, 4L, 2L, 3L, 
    1L, 1L, 3L, 2L, 4L, 4L, 1L, 2L, 3L, 1L, 3L, 2L, 4L, 4L, 2L, 
    3L, 1L, 1L, 3L, 2L, 4L, 4L, 3L, 2L, 1L, 1L, 3L, 4L, 2L, 2L, 
    1L, 4L, 3L, 1L, 2L, 3L, 4L, 1L, 4L, 3L, 2L, 2L, 3L, 4L, 1L, 
    2L, 3L, 4L, 1L, 3L, 2L, 4L, 3L, 2L, 4L, 2L, 3L, 4L, 3L, 2L, 
    4L, 2L, 3L, 4L, 3L, 2L, 4L, 2L, 3L, 4L, 3L, 4L, 2L, 3L, 2L, 
    4L, 3L, 2L, 4L, 3L, 2L, 4L, 3L, 2L, 4L, 3L, 2L, 4L, 3L, 2L, 
    4L, 3L, 2L, 4L, 3L, 2L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("8", "15", "22", 
    "30+"), class = "factor")), class = "data.frame", row.names = c(NA, 
-144L), .Names = c("Experiment", "Sample", "Genotype", "variable", 
"value", "Day"))

and here is the function I have made that works using ...

#' Two-sample t-test of `value` per group, with significance stars
#'
#' Keeps the rows of `dataset` whose `variable` is in `subset.plot`, groups
#' them by the bare column names given in `...`, and within every group runs
#' `t.test()` on `value`, comparing the rows at the first level of the
#' `comparison` column (x) with the rows at its second level (y). Any further
#' levels are ignored.
#'
#' @param dataset A data frame with at least the columns `variable`, `value`,
#'   the column named by `comparison`, and the grouping columns.
#' @param subset.plot Values of `variable` to keep; `NULL` keeps every row.
#' @param comparison Name of the column whose first two `levels()` define the
#'   two samples being compared.
#' @param ... Bare (unquoted) column names to group by; captured with `quos()`.
#' @return The `tidy()`-ed test results for every group, plus a character
#'   column `p.value.format` ("****", "***", "**", "*" or `NA`), arranged by
#'   the grouping columns.
grouped.t.test <- function(dataset, subset.plot, comparison, ...) {
  group.by <- quos(...)
  if (is.null(subset.plot)) {
    subset.plot <- dataset[["variable"]]
  }

  # Levels are a property of the whole column, so look them up once instead
  # of once per sample inside every group.
  compare.levels <- levels(dataset[[comparison]])

  filter(dataset, variable %in% subset.plot) %>%
    group_by(!!!group.by) %>%
    do(tidy(t.test(
      x = .$value[.[[comparison]] == compare.levels[1]],
      y = .$value[.[[comparison]] == compare.levels[2]]
    ))) %>%
    mutate(
      # symnum() returns a classed 'noquote' object (with a legend attribute).
      # Storing that in a column triggers "Vectorizing 'noquote' elements may
      # not preserve their attributes"; keep only the plain character codes.
      p.value.format = as.character(symnum(
        p.value,
        corr = FALSE,
        na = FALSE,
        cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1),
        symbols = c("****", "***", "**", "*", NA)
      ))
    ) %>%
    arrange(!!!group.by)
}
# Example: compare genotypes within every variable/Day combination.
View(
  grouped.t.test(
    dataset = dataset,
    subset.plot = NULL,
    comparison = "Genotype",
    variable,
    Day
  )
)

I would like to be able to replace ... with an argument (e.g., group_vars) and call it like this:

# Desired interface: a single argument that carries all grouping columns.
View(
  grouped.t.test(
    dataset = dataset,
    subset.plot = NULL,
    comparison = "Genotype",
    group_vars = c(variable, Day)
  )
)

This does not seem to work with quos(), but I don't understand why. It would be nice to be able to use multiple list arguments that each get quoted and used independently (e.g., an argument "arrange.by" holding a list of variables to pass to arrange() at the end of the function).

I'd greatly appreciate any help understanding why this doesn't work and what I could do instead!

Tung
  • 26,371
  • 7
  • 91
  • 115
John Gagnon
  • 825
  • 1
  • 8
  • 20
  • I'm a little confused by what you mean. `c(variable, Day)` won't create a vector of symbols, it will try to find actual objects by those names. You might try just typing `c(variable, Day)` at the console. Maybe you meant that you want to pass the argument like `group_vars = c("variable","Day")` i.e. as characters? – joran Apr 14 '18 at 01:09
  • my understanding was that using quosures is supposed to allow you to catch what the user of the function passes and then quote it. Then the !!! is supposed to unquote it within the group_by() function. Is this not how it works? – John Gagnon Apr 14 '18 at 01:22
  • 1
    Yes, but the way that works is by using `...`. You could create a list of symbols using something like `rlang::syms`, but that's more work for your user. Typically you'd either pass them one at a time via `...` or as a character vector and turn them into symbols yourself in the function. – joran Apr 14 '18 at 01:23
  • I see. So if I provide a character vector, and create a list of symbols with `rlang::syms`, what is the proper way to pass that to `group_by()`? Thanks! – John Gagnon Apr 14 '18 at 01:27
  • Looks like it works with the `!!!` unquoting. Is this the correct implementation? I get warning message: `Vectorizing 'noquote' elements may not preserve their attributes` using this. – John Gagnon Apr 14 '18 at 01:32
  • Well, if it works that's something, but honestly I'm not comfortable enough with tidyeval stuff to comment strongly on what the preferred style is in cases like this. Someone else might come along and weigh in, though. – joran Apr 14 '18 at 01:34

1 Answer

3

As mentioned by @lionel, one of the lead developers of dplyr, in this comment:

You want the quoting to be external and explicitly done by the user rather than implicitly by your function. To this end you can ask your users to quote with base::alist(), rlang::exprs(), or dplyr::vars()

You can do something like this for your question

#' Two-sample t-test of `value` per group, grouping columns passed as a list
#'
#' Keeps the rows of `dataset` whose `variable` is in `subset.plot`, groups
#' them by the already-quoted columns in `group_vars`, and within every group
#' runs `t.test()` on `value`, comparing the rows at the first level of the
#' `comparison` column (x) with the rows at its second level (y). Any further
#' levels are ignored.
#'
#' @param dataset A data frame with at least the columns `variable`, `value`,
#'   the column named by `comparison`, and the grouping columns.
#' @param subset.plot Values of `variable` to keep; `NULL` keeps every row.
#' @param comparison Name of the column whose first two `levels()` define the
#'   two samples being compared.
#' @param group_vars A list of quoted columns that the caller builds, e.g.
#'   with `alist()`, `rlang::exprs()` or `dplyr::vars()`; spliced into
#'   `group_by()` and `arrange()` with `!!!`.
#' @return The `tidy()`-ed test results for every group, plus a character
#'   column `p.value.format` ("****", "***", "**", "*" or `NA`), arranged by
#'   the grouping columns.
grouped.t.test2 <- function(dataset, subset.plot, comparison, group_vars) {
  if (is.null(subset.plot)) {
    subset.plot <- dataset[["variable"]]
  }

  # Levels are a property of the whole column, so look them up once instead
  # of once per sample inside every group.
  compare.levels <- levels(dataset[[comparison]])

  filter(dataset, variable %in% subset.plot) %>%
    group_by(!!!group_vars) %>%
    do(tidy(t.test(
      x = .$value[.[[comparison]] == compare.levels[1]],
      y = .$value[.[[comparison]] == compare.levels[2]]
    ))) %>%
    mutate(
      # symnum() returns a classed 'noquote' object (with a legend attribute);
      # keep only the plain character codes so the column is an ordinary
      # character vector and no attribute warning is raised.
      p.value.format = as.character(symnum(
        p.value,
        corr = FALSE,
        na = FALSE,
        cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1),
        symbols = c("****", "***", "**", "*", NA)
      ))
    ) %>%
    arrange(!!!group_vars)
}

# The caller quotes the grouping columns, here with base::alist() ...
grouped.t.test2(
  dataset = dataset,
  subset.plot = NULL,
  comparison = "Genotype",
  group_vars = alist(variable, Day)
)

# ... or, equivalently, with dplyr::vars()

grouped.t.test2(
  dataset = dataset,
  subset.plot = NULL,
  comparison = "Genotype",
  group_vars = dplyr::vars(variable, Day)
)

# A tibble: 8 x 13
# Groups:   variable, Day [8]
  variable Day   estimate estimate1 estimate2 statistic p.value parameter
  <fct>    <fct>    <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>
1 % CD127+ 8        -3.24      1.66      4.90     -4.26 9.93e-4      12.6
2 % CD127+ 15      -24.4      31.1      55.5      -3.80 2.88e-3      11.2
3 % CD127+ 22      -22.1      27.4      49.5      -4.60 5.54e-4      12.5
4 % CD127+ 30+     -28.6      36.8      65.4      -5.23 1.36e-4      13.7
5 % KLRG1+ 8        23.8      81.2      57.4       9.79 3.11e-7      12.5
6 % KLRG1+ 15       16.5      73.7      57.2       3.78 2.08e-3      13.8
7 % KLRG1+ 22       20.9      70.1      49.2       4.44 4.82e-4      14.9
8 % KLRG1+ 30+      22.5      76.7      54.2       4.46 6.01e-4      13.4
# ... with 5 more variables: conf.low <dbl>, conf.high <dbl>,
#   method <fct>, alternative <fct>, p.value.format <chr>              
Tung
  • 26,371
  • 7
  • 91
  • 115