Check dataframe with different functions (dplyr)

Question

I am trying to write some functions that take a dataframe and check whether certain variables fulfill certain criteria. For each check I would like to create a new variable "check_" giving the result of the check. Unfortunately, I still struggle to get it right. Can someone help me?

# Some sample data
dat <- data.frame(
  Q1_1 = c(1, 1, 2, 5, 2, 1),
  Q1_2 = c(1, 2, 3, 5, 1, 3),
  Q1_3 = c(4, 3, 3, 5, 1, 3),
  Q1_4 = c(4, 2, 2, 5, 1, 2),
  Q1_5 = c(2, 2, 1, 5, 5, 4),
  Q2_1 = c(1, 2, 1, 2, 1, 2),
  Q2_2 = c(2, 1, 1, 1, 2, 1),
  Q2_3 = c(1, 1, 1, 2, 2, 1),
  age  = c(22, 36, 20, 27, 13, 9)
)


# Some checker-functions

check_age <- function(.df, agevar = "age"){
  #' Intended behaviour: check whether the age value lies within an accepted
  #' range and add a new variable "check_age" holding the result of the check.
  #'
  #' .df     data frame to check
  #' agevar  name of the age column; the body as written does not use it and
  #'         refers to the column `age` directly

  # NOTE: both expressions assign to `check_age`, so the second one (age < 4)
  # replaces the result of the first one (age > 100).
  .df %>% mutate(check_age = ifelse(age > 100, FALSE, TRUE),
                 check_age = ifelse(age < 4, FALSE, TRUE))
  # `???` is the author's placeholder for the part that is still missing.
  ???
}

check_sameAnswers <- function(.df, varname = "Q1_"){
  #' Intended behaviour: check whether all sub-items of a question (e.g. Q1_1 to
  #' Q1_5) have the same value and add a new variable "check_sameAnswers"
  #' holding the result of the check.
  #' The result should be TRUE if, for example, Q1_1, Q1_2, ... all have the
  #' value 5, and FALSE otherwise.
  
  # `???` is the author's placeholder: the body has not been written yet.
  ???
}


# Apply checker functions to dataframe in "dplyr-style"
dat <- dat %>%
  check_age(agevar = "age") %>%
  check_sameAnswers(varname = "Q1_")

Not related to your code problem, but for these kinds of checks, you should look into the [pointblank](https://rich-iannone.github.io/pointblank/) package. Excellent data validation tools in there. — caldwellst, Feb 16 '22 at 12:36

Claudiu Papasteri · Accepted Answer · 2022-02-17T06:58:55.423

As @Bloxx already mentioned, your ifelse is the first problem. Those conditions are evaluated sequentially (the same would happen if you used dplyr::case_when). It's not a problem with the functions but with the way you declare the conditions: it should be one condition, not two. But you also have a second problem in your first function - you call the age variable directly from the data frame (by data masking), not the column corresponding to the column name stored in the argument agevar. This is why I used deparse(substitute()) to get the column corresponding to the column name in agevar.

I changed the first values in the age column so you can see the results of the function.

In your second function you want to check whether different items have the same values across columns. In the tidyverse you generally use rowwise for this. contains was used to give you more flexibility when specifying the column prefixes that define your groupings.

library(tidyverse)

# Some sample data
dat <- data.frame(
  Q1_1 = c(1, 1, 2, 5, 2, 1),
  Q1_2 = c(1, 2, 3, 5, 1, 3),
  Q1_3 = c(4, 3, 3, 5, 1, 3),
  Q1_4 = c(4, 2, 2, 5, 1, 2),
  Q1_5 = c(2, 2, 1, 5, 5, 4),
  Q2_1 = c(1, 2, 1, 2, 1, 2),
  Q2_2 = c(2, 1, 1, 1, 2, 1),
  Q2_3 = c(1, 1, 1, 2, 2, 1),
  # the first 2 age values were changed so that both range limits are hit
  age  = c(1, 101, 20, 27, 13, 9)
)


# Some checker-functions

check_age <- function(.df, agevar = "age", min_age = 4, max_age = 100){
  #' Check that the age value lies within an accepted range.
  #'
  #' Adds a logical variable "check_age" that is TRUE when the age is between
  #' `min_age` and `max_age` (both inclusive), FALSE when it lies outside that
  #' range, and NA when the age is missing.
  #'
  #' .df      data frame to check
  #' agevar   name of the age column, given as a single string
  #' min_age  lowest accepted age (default 4)
  #' max_age  highest accepted age (default 100)

  stopifnot(is.character(agevar), length(agevar) == 1)

  # `.data[[agevar]]` looks the column up by the name stored in `agevar`.
  # deparse(substitute(agevar)) cannot do this: it only yields the text of the
  # argument, and a bare `age` inside mutate() always refers to the column that
  # is literally called "age", so any other column name would be ignored.
  .df %>%
    mutate(check_age = .data[[agevar]] >= min_age & .data[[agevar]] <= max_age)
}

check_sameAnswers <- function(.df, varname = "Q1_"){
  #' Flag rows in which all sub-items of a question carry the same value.
  #'
  #' Adds a logical variable "check_sameAnswers": TRUE when all columns whose
  #' name contains `varname` (e.g. Q1_1 to Q1_5) hold one single value in that
  #' row (for example all are 5), FALSE otherwise.
  #' The returned data frame is still rowwise.

  by_row <- rowwise(.df)
  mutate(
    by_row,
    check_sameAnswers = length(unique(c_across(contains(varname)))) == 1
  )
}


# Apply checker functions to dataframe in "dplyr-style"
# Run both checks; the Q2_ columns are dropped just for uncluttered printing
dat %>%
  check_age(agevar = "age") %>%
  check_sameAnswers(varname = "Q1_") %>%
  dplyr::select(!contains("Q2_"))
#> # A tibble: 6 x 8
#> # Rowwise: 
#>    Q1_1  Q1_2  Q1_3  Q1_4  Q1_5   age check_age check_sameAnswers
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>     <lgl>            
#> 1     1     1     4     4     2     1 FALSE     FALSE            
#> 2     1     2     3     2     2   101 FALSE     FALSE            
#> 3     2     3     3     2     1    20 TRUE      FALSE            
#> 4     5     5     5     5     5    27 TRUE      TRUE             
#> 5     2     1     1     1     5    13 TRUE      FALSE            
#> 6     1     3     3     2     4     9 TRUE      FALSE

^{Created on 2022-02-16 by the reprex package (v2.0.1)}

UPDATE: Dynamic naming inside mutate is possible by using := to assign to a dynamically named column such as "check_{varname}". In our case you can extend the logic of check_sameAnswers so that each time you use it for a Q1_, Q2_, ... prefix it returns a column containing that name with the check for those particular columns.

check_sameAnsDyn <- function(.df, varname = "Q1_"){
  #' Flag rows in which all sub-items of one question carry the same value and
  #' store the flag in a column named after that question.
  #'
  #' Adds a logical variable "check_sameAnsDyn_<varname>" (for example
  #' "check_sameAnsDyn_Q1_"): TRUE when all columns whose name starts with
  #' `varname` hold one single value in that row, FALSE otherwise.
  #' The returned data frame is still rowwise.
  #'
  #' .df      data frame to check
  #' varname  common prefix of the columns to compare, given as a string

  # starts_with() is used instead of contains() because the name of the new
  # column itself contains `varname`; with contains() a second call for the
  # same prefix would pull the earlier flag column into the comparison.
  .df %>%
    rowwise() %>%
    mutate(
      "check_sameAnsDyn_{varname}" :=
        length(unique(c_across(starts_with(varname)))) == 1
    )
}


# Check both question blocks; `age` is dropped just for uncluttered printing
dat %>%
  check_sameAnsDyn(varname = "Q1_") %>%
  check_sameAnsDyn(varname = "Q2_") %>%
  dplyr::select(!age)

# A tibble: 6 x 10
# Rowwise: 
   Q1_1  Q1_2  Q1_3  Q1_4  Q1_5  Q2_1  Q2_2  Q2_3 check_sameAnsDyn_Q1_ check_sameAnsDyn_Q2_
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>                <lgl>               
1     1     1     4     4     2     1     2     1 FALSE                FALSE               
2     1     2     3     2     2     2     1     1 FALSE                FALSE               
3     2     3     3     2     1     1     1     1 FALSE                TRUE                
4     5     5     5     5     5     2     1     2 TRUE                 FALSE               
5     2     1     1     1     5     1     2     2 FALSE                FALSE               
6     1     3     3     2     4     2     1     1 FALSE                FALSE

Thanks, that's very helpful! Just a short follow-up question: In `check_sameAnswers()` how can I name the newly created variable `check_` (dynamically, using the "varname" argument) instead of `check_sameAnswers`? — D. Studer, Feb 16 '22 at 14:37
You are welcome. You can use dynamic variable naming inside `mutate` using `"check_{varname}"` so that string from `varname` is pasted in the name and use `:=` for dynamic assignment. See more here https://stackoverflow.com/questions/26003574/use-dynamic-variable-names-in-dplyr. This being said you should modify the code I provided if you want to do this because `check_sameAnswers` function is predicated on the assumption that `varname` is a common string pattern that defines a group of columns. — Claudiu Papasteri, Feb 16 '22 at 14:48
If this answer was a satisfactory solution to your problem, please consider accepting it so that others know one was found. — Claudiu Papasteri, Feb 16 '22 at 21:53
`check <- function(.df, varname) { .df %>% mutate(check_{varname} = "xy") }` I'm still having trouble setting a dynamic variable name within a function. — D. Studer, Feb 16 '22 at 22:08

score 1 · Answer 2 · answered Feb 16 '22 at 17:10

You can embrace the argument to use variables (from data masking) in your function

Functions

library(dplyr)

check_age <- function(data, age_var, start = 0, end = 0){
  #' Flag rows whose age lies within a closed interval.
  #'
  #' Adds a logical variable "between" that is TRUE when `age_var` is at least
  #' `start` and at most `end`, FALSE when it lies outside that interval, and
  #' NA when the age is missing.
  #'
  #' data     data frame to check
  #' age_var  age column, passed unquoted (it is embraced with {{ }})
  #' start    lower limit of the interval, inclusive (default 0)
  #' end      upper limit of the interval, inclusive (default 0)

  # The comparison already yields TRUE/FALSE, so no ifelse() is needed; this
  # also avoids the shorthands T and F, which are ordinary variables that can
  # be reassigned.
  data %>%
    mutate(between = {{ age_var }} >= start & {{ age_var }} <= end)
}

check_sameAnswers <- function(data, cols){
  #' Flag rows in which all columns of one question carry the same value.
  #'
  #' Adds a logical variable "same": TRUE when all columns whose name starts
  #' with `cols` hold one single value in that row, FALSE otherwise.
  #' The rowwise grouping is removed again before the result is returned.

  by_row <- rowwise(data)
  flagged <- mutate(
    by_row,
    same = length(unique(c_across(starts_with(cols)))) == 1
  )
  ungroup(flagged)
}

Use

dat %>%
  check_age(age_var = age, start = 30, end = 40) %>%
  check_sameAnswers(cols = "Q1")
# A tibble: 6 × 11
   Q1_1  Q1_2  Q1_3  Q1_4  Q1_5  Q2_1  Q2_2  Q2_3   age between same 
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>   <lgl>
1     1     1     4     4     2     1     2     1    22 FALSE   FALSE
2     1     2     3     2     2     2     1     1    36 TRUE    FALSE
3     2     3     3     2     1     1     1     1    20 FALSE   FALSE
4     5     5     5     5     5     2     1     2    27 FALSE   TRUE 
5     2     1     1     1     5     1     2     2    13 FALSE   FALSE
6     1     3     3     2     4     2     1     1     9 FALSE   FALSE

Bloxx · Answer 3 · 2022-02-16T12:50:33.880

I think the problem is in your ifelse statement. Try this:

check_age <- function(.df, agevar = "age", min_age = 4, max_age = 100){
  #' Check that the age value lies within an accepted range.
  #'
  #' Adds a logical variable "check_age" that is TRUE when the age is between
  #' `min_age` and `max_age` (both inclusive), FALSE when it lies outside that
  #' range, and NA when the age is missing.
  #'
  #' .df      data frame to check
  #' agevar   name of the age column, given as a single string
  #' min_age  lowest accepted age (default 4)
  #' max_age  highest accepted age (default 100)

  stopifnot(is.character(agevar), length(agevar) == 1)

  # `.data[[agevar]]` looks the column up by the name stored in `agevar`;
  # writing a bare `age` here would always use the column literally called
  # "age" and silently ignore the argument.
  .df %>%
    mutate(check_age = .data[[agevar]] >= min_age & .data[[agevar]] <= max_age)
}

check_sameAnswers <- function(.df, varname = "Q1_"){
  #' Flag rows in which all sub-items of a question carry the same value.
  #'
  #' Adds a logical variable "sameAnswers": TRUE when all columns whose name
  #' starts with `varname` (e.g. Q1_1 to Q1_5) hold one single value in that
  #' row (for example all are 5), FALSE otherwise.
  #' The rowwise grouping is removed again, so the result is a plain tibble.
  #'
  #' .df      data frame to check
  #' varname  common prefix of the columns to compare, given as a string

  # The comparison has to run per row over the `varname` columns of `.df`.
  # Testing unique(dat$Q1_2) would instead ask whether one single column of
  # the global `dat` is constant and would give the same answer for every row.
  .df %>%
    rowwise() %>%
    mutate(sameAnswers = length(unique(c_across(starts_with(varname)))) == 1) %>%
    ungroup()
}


# Apply checker functions to dataframe in "dplyr-style"
# Both checks are chained with the pipe and the result overwrites `dat`
dat <- dat %>% 
  check_age(agevar = "age") %>%
  check_sameAnswers(varname = "Q1_")
dat  # evaluate `dat` on its own so that the checked data frame is displayed

What @D. Studer is asking for in the second function is to get TRUE/FALSE when all columns starting with `Q1_` (i.e. Q1_1 ... Q1_5) have the same value. Your solution does not do that, it checks if all values in Q1_2 are the same (i.e. if Q1_2 has zero variance). — Claudiu Papasteri, Feb 16 '22 at 13:09

Check dataframe with different functions (dplyr)

3 Answers

Functions