1

I have a data set df with 102 variables: 16 int, 80 factors, 8 logi. There are no NA values.

I've used DataExplorer before without issue, but when I ran it on this data set ...

library(DataExplorer)
create_report(df)

... it chugged along fine, outputting its progress ...

# label: correlation_analysis
#   |................................................                 |  74%
#   ordinary text without R code

... until it got to the PCA section when it produced this error:

#  |..................................................               |  76%
# label: principle_component_analysis
# Quitting from lines 208-221 (report.rmd) 
#
# Error in data.table(pc = paste0("PC", seq_along(pca$sdev)), var = var_exp,  : 
#  Item 2 has no length. Provide at least one item (such as NA, NA_integer_ etc) to be repeated to match the 1 row in the longest column. Or, all columns can be 0 length, for insert()ing rows into. 

I searched for this error message but found only pages explaining PCA in general, nothing about this specific error. Any suggestions?

The traceback is

26. stop("Item ", i, " has no length. Provide at least one item (such as NA, NA_integer_ etc) to be repeated to match the ", 
    nr, " row", if (nr > 1L) "s", " in the longest column. Or, all columns can be 0 length, for insert()ing rows into.") 
25. data.table(pc = paste0("PC", seq_along(pca$sdev)), var = var_exp, 
    pct = var_exp/sum(var_exp), cum_pct = cumsum(var_exp)/sum(var_exp)) 
24. plot_prcomp(data = structure(list(EnrollmentID = c(4603L, 8457L, 
3290L, 3323L, 6186L, 6501L, 3084L, 8662L, 7676L, 3229L, 6005L, 
3387L, 8204L, 9018L, 4517L, 3320L, 8840L, 7729L, 8835L, 5148L, 
7560L, 1239L, 5874L, 4963L, 3755L, 3397L, 9877L, 8609L, 6584L,  ... 
23. do.call(fun_name, c(list(data = data), report_config[[fun_name]])) at <text>#9
22. do_call("plot_prcomp", na_omit = TRUE) at <text>#8
21. eval(expr, envir, enclos) 
20. eval(expr, envir, enclos) 
19. withVisible(eval(expr, envir, enclos)) 
18. withCallingHandlers(withVisible(eval(expr, envir, enclos)), warning = wHandler, 
    error = eHandler, message = mHandler) 
17. handle(ev <- withCallingHandlers(withVisible(eval(expr, envir, 
    enclos)), warning = wHandler, error = eHandler, message = mHandler)) 
16. timing_fn(handle(ev <- withCallingHandlers(withVisible(eval(expr, 
    envir, enclos)), warning = wHandler, error = eHandler, message = mHandler))) 
15. valuate_call(expr, parsed$src[[i]], envir = envir, enclos = enclos, 
    debug = debug, last = i == length(out), use_try = stop_on_error != 
        2L, keep_warning = keep_warning, keep_message = keep_message, 
    output_handler = output_handler, include_timing = include_timing) 
14. evaluate::evaluate(...) 
13. evaluate(code, envir = env, new_device = FALSE, keep_warning = !isFALSE(options$warning), 
    keep_message = !isFALSE(options$message), stop_on_error = if (options$error && 
        options$include) 0L else 2L, output_handler = knit_handlers(options$render, 
        options)) 
12. in_dir(input_dir(), evaluate(code, envir = env, new_device = FALSE, 
    keep_warning = !isFALSE(options$warning), keep_message = !isFALSE(options$message), 
    stop_on_error = if (options$error && options$include) 0L else 2L, 
    output_handler = knit_handlers(options$render, options))) 
11. block_exec(params) 
10. call_block(x) 
9. process_group.block(group) 
8. process_group(group) 
7. withCallingHandlers(if (tangle) process_tangle(group) else process_group(group), 
    error = function(e) {
        setwd(wd)
        cat(res, sep = "\n", file = output %n% "") ... 
6. process_file(text, output) 
5. knitr::knit(knit_input, knit_output, envir = envir, quiet = quiet, 
    encoding = encoding) 
4. render(input = report_dir, output_file = output_file, output_dir = output_dir, 
    intermediates_dir = output_dir, params = list(data = data, 
        report_config = config, response = y), ...) 
3. withCallingHandlers(expr, warning = function(w) invokeRestart("muffleWarning")) 
2. suppressWarnings(render(input = report_dir, output_file = output_file, 
    output_dir = output_dir, intermediates_dir = output_dir, 
    params = list(data = data, report_config = config, response = y), 
    ...)) 
1. create_report(df) 

Here's the session info:

sessionInfo()
R version 3.5.1 (2018-07-02)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] car_3.0-2          knitr_1.20         rmarkdown_1.10     data.table_1.11.8 
 [5] DataExplorer_0.7.0 mosaic_1.4.0       Matrix_1.2-14      mosaicData_0.17.0 
 [9] ggformula_0.9.0    ggstance_0.3.1     mdsr_0.1.6         Lahman_6.0-0      
[13] ISLR_1.2           forcats_0.3.0      stringr_1.3.1      dplyr_0.7.8       
[17] purrr_0.2.5        readr_1.1.1        tidyr_0.8.2        tibble_1.4.2      
[21] ggplot2_3.1.0      tidyverse_1.2.1    lattice_0.20-35    carData_3.0-2     

loaded via a namespace (and not attached):
 [1] ggdendro_0.1-20  httr_1.3.1       RMySQL_0.10.15   jsonlite_1.5     splines_3.5.1   
 [6] modelr_0.1.2     assertthat_0.2.0 highr_0.7        cellranger_1.1.0 yaml_2.2.0      
[11] ggrepel_0.8.0    pillar_1.3.0     backports_1.1.2  glue_1.3.0       downloader_0.4  
[16] digest_0.6.18    rvest_0.3.2      colorspace_1.3-2 htmltools_0.3.6  plyr_1.8.4      
[21] pkgconfig_2.0.2  broom_0.5.0      haven_1.1.2      scales_1.0.0     openxlsx_4.1.0  
[26] rio_0.5.10       withr_2.1.2      lazyeval_0.2.1   cli_1.0.1        magrittr_1.5    
[31] crayon_1.3.4     readxl_1.1.0     evaluate_0.12    nlme_3.1-137     MASS_7.3-50     
[36] xml2_1.2.0       foreign_0.8-71   tools_3.5.1      hms_0.4.2        munsell_0.5.0   
[41] babynames_0.3.0  zip_1.0.0        bindrcpp_0.2.2   networkD3_0.4    compiler_3.5.1  
[46] rlang_0.3.0.1    grid_3.5.1       rstudioapi_0.8   htmlwidgets_1.3  igraph_1.2.2    
[51] labeling_0.3     mosaicCore_0.6.0 gtable_0.2.0     abind_1.4-5      DBI_1.0.0       
[56] curl_3.2         reshape2_1.4.3   R6_2.3.0         gridExtra_2.3    lubridate_1.7.4 
[61] rprojroot_1.3-2  bindr_0.1.1      stringi_1.2.4    parallel_3.5.1   Rcpp_1.0.0      
[66] dbplyr_1.2.2     tidyselect_0.2.5

Here's the output of introduce(df_dummified) as requested in comments below:

A tibble: 1 x 9  
 rows columns discrete_columns continuous_columns  
<int>   <int>            <int>              <int>  
 9527     489                2                487  

all_missing_columns total_missing_values  
              <int>                <int>  
                  0                 7826  

complete_rows total_observations memory_usage  
        <int>              <int>        <dbl>  
         6889            4658703     18919440  
Karl Baker
  • 903
  • 12
  • 27

2 Answers

4

You might also consider skipping the PCA part of the report, by removing "plot_prcomp" from the create_report() config.

I had the same issue and this still created the rest of the report for me:

library(DataExplorer)

config <- list(
  "introduce" = list(),
  "plot_str" = list(
    "type" = "diagonal",
    "fontSize" = 35,
    "width" = 1000,
    "margin" = list("left" = 350, "right" = 250)
  ),
  "plot_missing" = list(),
  "plot_histogram" = list(),
  "plot_qq" = list(sampled_rows = 1000L),
  "plot_bar" = list(),
  "plot_correlation" = list("cor_args" = list("use" = "pairwise.complete.obs")),
#  "plot_prcomp" = list(),
  "plot_boxplot" = list(),
  "plot_scatterplot" = list(sampled_rows = 1000L)
)

create_report(df, config = config)
  • Great suggestion, that's exactly what I did so that I could at least see the rest of the DataExplorer output. Then I began combining and converting factors to logical variables. Haven't finished but hopefully this will allow me to run the PCA. – Karl Baker Nov 28 '18 at 23:22
3

PCA can be applied only to numeric data, so keep only the numeric columns and drop everything else.

nums <- unlist(lapply(df, is.numeric))
df_new <- df[, nums]

Then remove all constant columns (those with zero variance), because prcomp cannot rescale a constant column to unit variance.

df_new <- df_new[, apply(df_new, 2, var) != 0]

Reference: How to solve prcomp.default(): cannot rescale a constant/zero column to unit variance

Now, run this. This should create a nice html report for you.

create_report(df_new)
Boxuan
  • 4,937
  • 6
  • 37
  • 73
RAJK
  • 195
  • 2
  • 9
  • Thanks for the explanation and the link. Since my df contains most of the interesting information in the factors, I'll need to somehow convert them to numeric data for PCA. Is there a standard approach to convert categorical columns to numeric? – Karl Baker Nov 26 '18 at 19:22
  • You might also use dummy variables to convert categorical data to numeric. That might be an option. – RAJK Nov 26 '18 at 22:00
  • @KarlBaker Have you tried `plot_prcomp(dummify(df))`? – Boxuan Nov 28 '18 at 20:05
  • @Boxuan, that's new to me, sounds promising, but how do I run it? I tried running it after loading DataExplorer but got a similar error as before: "cannot rescale a constant/zero column to unit variance Error in data.table(pc = paste0("PC", seq_along(pca$sdev)), var = var_exp, : Item 2 has no length. Provide at least one item (such as NA, NA_integer_ etc) to be repeated to match the 1 row in the longest column. Or, all columns can be 0 length, for insert()ing rows into." – Karl Baker Nov 28 '18 at 23:32
  • @Boxuan, I should note I was able to dummify the data (`df_dummified <- dummify(df)`), I just wasn't able to run the PCA (`plot_prcomp(df_dummified)`). – Karl Baker Nov 28 '18 at 23:43
  • @KarlBaker Would you mind sharing the output of `str(df)` and/or `introduce(df)`? – Boxuan Nov 29 '18 at 15:47
  • @Boxuan Sure, I added the output of `introduce(df_dummified)` to the end of my original question. Dummify exploded my vars to 400+ so the output of `str()` is a bit unwieldy. Let me know if it would still be helpful to see that and I can append it as well. – Karl Baker Nov 29 '18 at 16:37
  • @KarlBaker What is the error message for `plot_prcomp(df_dummified)`? Same as OP? – Boxuan Nov 30 '18 at 21:23