0

I am trying to check the assumptions of multiple models following a solution I found here. I want to extend that solution to a list of tidymodels, but I can't figure out how to convert the results into a dataframe/tibble. I tried using dplyr::bind_rows but I get the error shown in the code example below.

  library("tidyverse")
  library("tidymodels")
  library("olsrr")
  
  # Fit each model separately, reusing one linear-regression specification
  lm_spec <- parsnip::linear_reg()
  mod1 <- lm_spec %>% fit(mpg ~ cyl, data = mtcars)
  mod2 <- lm_spec %>% fit(mpg ~ hp, data = mtcars)

  # Combine models in a named list; tibble::lst() names each element
  # after the object passed in ("mod1", "mod2")
  mod_list <- tibble::lst(mod1, mod2)
  
  # Residual normality tests for a single fitted model.
  # `x` is a fitted parsnip model; the result is whatever
  # olsrr::ols_test_normality() returns for its underlying lm object.
  get_lm_normality <- function(x) {
    # pull the lm model object out of the parsnip fit
    engine_fit <- extract_fit_engine(x)
    # run olsrr's normality tests on it
    olsrr::ols_test_normality(engine_fit)
  }
  
  # Run the normality tests for every model in the list, then try to stack
  # the per-model results into one data frame keyed by model name
  mod_normality <- bind_rows(
    purrr::map(mod_list, get_lm_normality),
    .id = "name"
  )

# Here I get the error:
# Error: All columns in a tibble must be vectors.
# x Column `mod1` is a `ols_test_normality` object.
# x Column `mod2` is a `ols_test_normality` object.

Any ideas for how to convert the list results to a dataframe?

D Kincaid
  • 167
  • 1
  • 13

2 Answers

0

One option is to bind each model's test results with `data.table::rbindlist()`:

library(purrr)
library(data.table)
library(dplyr)
# For each model, stack its normality-test results with rbindlist();
# fill = TRUE pads fields that only some tests report (e.g. `alternative`)
# with NA. The model names become the `name` column.
# Note: in the output below the Cramer-von Mises test occupies three rows,
# one per line of its `method` text.
map_dfr(
  mod_list,
  function(model) data.table::rbindlist(get_lm_normality(model), fill = TRUE),
  .id = "name"
)

-output

      name statistic      p.value alternative                                   method data.name
    <char>     <num>        <num>      <char>                                   <char>    <char>
 1:   mod1 0.1350410 6.038119e-01   two-sided       One-sample Kolmogorov-Smirnov test         y
 2:   mod1 0.9632327 3.358610e-01        <NA>              Shapiro-Wilk normality test         y
 3:   mod1 1.6778226 4.677620e-05        <NA> Cramer-von Mises test of goodness-of-fit         y
 4:   mod1 1.6778226 4.677620e-05        <NA>    Null hypothesis: uniform distribution         y
 5:   mod1 1.6778226 4.677620e-05        <NA>           Parameters assumed to be fixed         y
 6:   mod1 0.3887402 3.650452e-01        <NA>          Anderson-Darling normality test         y
 7:   mod2 0.1166882 7.328735e-01   two-sided       One-sample Kolmogorov-Smirnov test         y
 8:   mod2 0.9233670 2.568169e-02        <NA>              Shapiro-Wilk normality test         y
 9:   mod2 2.7282258 3.777997e-08        <NA> Cramer-von Mises test of goodness-of-fit         y
10:   mod2 2.7282258 3.777997e-08        <NA>    Null hypothesis: uniform distribution         y
11:   mod2 2.7282258 3.777997e-08        <NA>           Parameters assumed to be fixed         y
12:   mod2 0.7982230 3.446857e-02        <NA>          Anderson-Darling normality test         y

Some attributes on the test results prevent `bind_rows()` from binding them directly, so we may have to remove those attributes first (here by passing each result through `c()`):

# Step 1: run the normality tests for every model (names are kept).
# Step 2: within each model, pass every test result through c() so the
#         attributes that block row-binding are removed, bind the tests
#         into rows, and label each model's rows via the `name` column.
mod_list %>%
  purrr::map(get_lm_normality) %>%
  purrr::map_dfr(function(tests) purrr::map_dfr(tests, c), .id = "name")

-output

# A tibble: 12 × 6
   name  statistic      p.value alternative method                                   data.name
   <chr>     <dbl>        <dbl> <chr>       <chr>                                    <chr>    
 1 mod1      0.135 0.604        two-sided   One-sample Kolmogorov-Smirnov test       y        
 2 mod1      0.963 0.336        NA          Shapiro-Wilk normality test              y        
 3 mod1      1.68  0.0000468    NA          Cramer-von Mises test of goodness-of-fit y        
 4 mod1      1.68  0.0000468    NA          Null hypothesis: uniform distribution    y        
 5 mod1      1.68  0.0000468    NA          Parameters assumed to be fixed           y        
 6 mod1      0.389 0.365        NA          Anderson-Darling normality test          y        
 7 mod2      0.117 0.733        two-sided   One-sample Kolmogorov-Smirnov test       y        
 8 mod2      0.923 0.0257       NA          Shapiro-Wilk normality test              y        
 9 mod2      2.73  0.0000000378 NA          Cramer-von Mises test of goodness-of-fit y        
10 mod2      2.73  0.0000000378 NA          Null hypothesis: uniform distribution    y        
11 mod2      2.73  0.0000000378 NA          Parameters assumed to be fixed           y        
12 mod2      0.798 0.0345       NA          Anderson-Darling normality test          y      
akrun
  • 874,273
  • 37
  • 540
  • 662
0

I think you'll have better luck if you put the models in a dataframe:

library("tidyverse")
library("tidymodels")
#> Registered S3 method overwritten by 'tune':
#>   method                   from   
#>   required_pkgs.model_spec parsnip
library("olsrr")
#> 
#> Attaching package: 'olsrr'
#> The following object is masked from 'package:datasets':
#> 
#>     rivers

# Fit each model separately from one shared specification
lm_spec <- parsnip::linear_reg()
mod1 <- fit(lm_spec, formula = mpg ~ cyl, data = mtcars)
mod2 <- fit(lm_spec, formula = mpg ~ hp, data = mtcars)

# Combine models in a named list (elements are auto-named "mod1", "mod2"),
# then turn the list into a two-column tibble: `name` holds the model
# names and `value` is a list-column holding the fits
mod_list <- tibble::lst(mod1, mod2)
mod_df <- tibble::enframe(mod_list, name = "name", value = "value")

# Residual normality tests for one fitted parsnip model:
# extract the underlying lm object, then hand it to olsrr.
get_lm_normality <- function(x) {
    olsrr::ols_test_normality(
        # the lm model object stored inside the parsnip fit
        extract_fit_engine(x)
    )
}

# Add a list-column with the normality-test results for each model
mutate(mod_df, normality = purrr::map(value, get_lm_normality))
#> Warning in ks.test(y, "pnorm", mean(y), sd(y)): ties should not be present for
#> the Kolmogorov-Smirnov test
#> # A tibble: 2 × 3
#>   name  value    normality 
#>   <chr> <list>   <list>    
#> 1 mod1  <fit[+]> <ols_tst_>
#> 2 mod2  <fit[+]> <ols_tst_>

Created on 2022-03-11 by the reprex package (v2.0.1)

Julia Silge
  • 10,848
  • 2
  • 40
  • 48