How can I find a dataset that has some specific attributes?

Question

The package datasets and various packages come with a fair amount of useful datasets, however there seems to be no easy way to find your perfect dataset when you need it for your package examples, for teaching purposes, or to ask/answer a question here on SO.

Say for instance I want a dataset that is a data.frame, has at least 2 character columns, and is less than 100 rows long.

How can I explore EVERY available dataset and see as much relevant information as possible to make my choice?

My past attempts were messy, slow, and crashed on some packages that have an unusual object structure, such as caret.

moodymudskipper · Accepted Answer · 2020-11-05T15:00:36.140

I've packaged a solution in a one-function GitHub package.

I'm copying the whole code at the bottom but the simplest is :

remotes::install_github("moodymudskipper/datasearch")
library(datasearch)

All data sets from package "dplyr"

# Describe every dataset that ships with dplyr (no filter), then browse the
# resulting data frame in the data viewer.
dplyr_all <-
  datasearch("dplyr")

View(dplyr_all)

dplyr

Datasets from package "datasets" restricted by condition

# `filter` is a one-sided formula in which `.` stands for the loaded dataset;
# only datasets for which it evaluates to TRUE are kept. `&&` short-circuits,
# so ncol() is only consulted for data frames.
datasets_ncol5 <-
  datasearch("datasets", filter =  ~is.data.frame(.) && ncol(.) == 5)

View(datasets_ncol5)

datasets

All datasets from all installed packages, no restriction


# might take more or less time, depends what you have installed
# (with no `pkgs` argument every installed package is scanned)
all_datasets <- datasearch()

View(all_datasets)

# subsetting the output
# The result is an ordinary data frame, so base subset() works on its columns:
# `class1` is the first class of the dataset, `names_collapsed` is its names
# joined with "/", and `nrow` is NA for anything that is not a data frame
# (subset() drops rows whose condition is NA).
my_subset <- subset(
  all_datasets, 
  class1 == "data.frame" &
    grepl("treatment", names_collapsed) &
    nrow < 100
)

View(my_subset)

all

#' Describe the datasets shipped with installed packages
#'
#' Loads every dataset listed by `data(package = pkgs)` and records its class,
#' type, names and size, plus per-class column counts for data frames.
#'
#' @param pkgs Character vector of package names, or `NULL` (the default) to
#'   scan every package returned by `.packages(all.available = TRUE)`.
#' @param filter Optional predicate applied to each loaded dataset: a function
#'   of one argument, or a one-sided formula in which `.` stands for the
#'   dataset (e.g. `~ is.data.frame(.) && ncol(.) == 5`). A dataset is kept
#'   only when the predicate returns a single, non-missing true value.
#' @return A data frame with one row per retained dataset. Besides `package`,
#'   `title`, `data_name` (the data file) and `dataset` (the object), it holds
#'   `class`, `class1`, `type`, `names`, `names_collapsed`, `nrow`, `ncol`,
#'   `length`, `classes`, `types` and one count column per common column class.
#'   The data-frame specific columns (including `nrow`/`ncol`) are `NA` for
#'   objects that are not data frames. When the packages ship no datasets, or
#'   nothing passes `filter`, a zero-row data frame holding only the
#'   description columns is returned.
datasearch <- function(pkgs = NULL, filter = NULL){
  # make function silent; options() hands back the previous values
  old_opts <- options(warn = -1)
  search_ <- search()
  msg_path <- tempfile()
  file_ <- file(msg_path, "w")
  on.exit({
    # an error inside the loop would otherwise leave messages diverted to the
    # temp file (connection 2 is stderr, the default destination)
    if (sink.number(type = "message") != 2L) sink(type = "message")
    close(file_)
    unlink(msg_path)
    options(old_opts)
    to_detach <- setdiff(search(), search_)
    for (pkg in to_detach) detach(pkg, character.only = TRUE)
    # note : we still have loaded namespaces, we could unload those that we
    # didn't have in the beginning but i'm worried about surprising effects,
    # I think the S3 method tables should be cleaned too, and maybe other things

    # note2 : tracing library and require didn't work
  }, add = TRUE)

  # convert formula to function
  if (inherits(filter, "formula")) {
    filter <- as.function(c(alist(.=), filter[[length(filter)]]))
  }
  if (!is.null(filter) && !is.function(filter)) {
    stop("`filter` must be NULL, a function or a one-sided formula",
         call. = FALSE)
  }

  ## by default fetch all available packages in .libPaths()
  if (is.null(pkgs)) pkgs <- .packages(all.available = TRUE)
  ## fetch all data sets description
  df <- as.data.frame(data(package = pkgs, verbose = FALSE)$results)
  names(df) <- tolower(names(df))
  item <- NULL # for cmd check note
  # items look like "object (file)" or just "object"
  df <- transform(
    df,
    data_name = sub('.*\\((.*)\\)', '\\1', item),
    dataset   = sub(' \\(.*', '', item),
    libpath = NULL,
    item = NULL
  )
  # nothing to describe; aggregate() below would fail on zero rows
  if (nrow(df) == 0L) return(df)

  df <- df[order(df$package, df$data_name), ]
  # one row per data file, with the objects it provides as a list column;
  # simplify = FALSE keeps it a list even when every file holds the same
  # number of objects (otherwise it would collapse to a vector or matrix)
  pkg_data_names <- aggregate(dataset ~ package + data_name, df, c,
                              simplify = FALSE)
  pkg_data_names <- pkg_data_names[
    order(pkg_data_names$package, pkg_data_names$data_name), ]

  n <- nrow(pkg_data_names)
  # the progress bar is a nicety: skip it when 'progress' is not installed
  pb <- NULL
  if (requireNamespace("progress", quietly = TRUE)) {
    pb <- progress::progress_bar$new(
      format = "[:bar] :percent :pkg",
      total = n)
  }
  row_dfs <- vector("list", n)
  for (i in seq_len(n)) {
    pkg       <- pkg_data_names$package[i]
    data_name <- pkg_data_names$data_name[i]
    datasets  <- pkg_data_names$dataset[[i]]
    if (!is.null(pb)) pb$tick(tokens = list(pkg = format(pkg, width = 12)))

    # fresh environment per data file: nothing accumulates in memory and a
    # failed load cannot be masked by a same-named object loaded earlier
    env <- new.env()
    sink(file_, type = "message")
    data(list = data_name, package = pkg, envir = env)
    row_dfs_i <- lapply(datasets, function(dataset) {
      # skip objects that data() could not load instead of aborting the scan
      if (!exists(dataset, envir = env, inherits = FALSE)) return(NULL)
      dat <- get(dataset, envir = env, inherits = FALSE)
      if (!is.null(filter)) {
        keep <- filter(dat)
        # a predicate such as `~ nrow(.) < 100` yields logical(0) on vectors
        if (length(keep) != 1L || is.na(keep) || !keep) return(NULL)
      }
      cl <- class(dat)
      nms <- names(dat)
      nc <- ncol(dat)
      if (is.null(nc)) nc <- NA
      nr <- nrow(dat)
      if (is.null(nr)) nr <- NA

      out <- data.frame(
        package = pkg,
        data_name = data_name,
        dataset = dataset,
        class = I(list(cl)),
        class1 = cl[1],
        type = typeof(dat),
        names = I(list(nms)),
        names_collapsed = paste(nms, collapse = "/"),
        nrow       = nr,
        ncol       = nc,
        length     = length(dat))

      if ("data.frame" %in% cl) {
        classes <- lapply(dat, class)
        # a column with several classes (e.g. ordered factor) counts once
        # under each of them
        cl_flat <- unlist(classes)
        out <- transform(
          out,
          classes    = I(list(classes)),
          types      = I(list(vapply(dat, typeof, character(1)))),
          logical    = sum(cl_flat == 'logical'),
          integer    = sum(cl_flat == 'integer'),
          numeric    = sum(cl_flat == 'numeric'),
          complex    = sum(cl_flat == 'complex'),
          character  = sum(cl_flat == 'character'),
          raw        = sum(cl_flat == 'raw'),
          list       = sum(cl_flat == 'list'),
          data.frame = sum(cl_flat == 'data.frame'),
          factor     = sum(cl_flat == 'factor'),
          ordered    = sum(cl_flat == 'ordered'),
          Date       = sum(cl_flat == 'Date'),
          POSIXt     = sum(cl_flat == 'POSIXt'),
          POSIXct    = sum(cl_flat == 'POSIXct'),
          POSIXlt    = sum(cl_flat == 'POSIXlt'))
      } else {
        out <- transform(
          out,
          nrow       = NA,
          ncol       = NA,
          classes    = NA,
          types      = NA,
          logical    = NA,
          integer    = NA,
          numeric    = NA,
          complex    = NA,
          character  = NA,
          raw        = NA,
          list       = NA,
          data.frame = NA,
          factor     = NA,
          ordered    = NA,
          Date       = NA,
          POSIXt     = NA,
          POSIXct    = NA,
          POSIXlt    = NA)
      }
      if (is.matrix(dat)) {
        # matrices have no names(); report their column names instead
        col_nms <- colnames(dat)
        out$names <- list(col_nms)
        # collapse the character vector itself: paste() on the list column
        # would deparse it into 'c("a", "b")'
        out$names_collapsed <- paste(col_nms, collapse = "/")
      }
      out
    })
    row_dfs_i <- do.call(rbind, row_dfs_i)
    if (!is.null(row_dfs_i)) row_dfs[[i]] <- row_dfs_i
    sink(type = "message")
  }
  df2 <- do.call(rbind, row_dfs)
  # every dataset was filtered out (or failed to load)
  if (is.null(df2)) return(df[0L, , drop = FALSE])
  # joins on the shared columns package / data_name / dataset
  df <- merge(df, df2)
  df
}

eddi · Answer 2 · 2017-11-28T20:35:08.750

Extend/modify to your liking.

library(data.table)
dt <- as.data.table(data(package = .packages(all.available = TRUE))$results)
# Items look like "object (file)" or just "object". Both right-hand sides are
# computed from the original Item before either column is assigned:
# Item becomes the object name, Object the data file to load.
dt[, `:=`(Item   = sub(' \\(.*', '', Item),
          Object = sub('.*\\((.*)\\)', '\\1', Item))]

dt[, {
       # load into a scratch environment so the global workspace is not
       # filled with (or overwritten by) every installed dataset
       scratch <- new.env()
       data(list = Object, package = Package, envir = scratch)
       # look the object up by name (works for non-syntactic names too);
       # NULL when it could not be loaded, which then reports class "NULL"
       d <- mget(Item, envir = scratch, ifnotfound = list(NULL))[[1L]]

       # a column with several classes contributes each of them
       classes <- if (inherits(d, "data.frame")) unlist(lapply(d, class))
                  else NA_character_

       .(class    = paste(class(d), collapse = ","),
         nrow     = if (!is.null(nrow(d))) nrow(d) else NA_integer_,
         ncol     = if (!is.null(ncol(d))) ncol(d) else NA_integer_,
         charCols = sum(classes == 'character'),
         facCols  = sum(classes == 'factor'))
     }
   , by = .(Package, Item)]
#      Package          Item                                               class nrow ncol charCols facCols
#  1: datasets AirPassengers                                                  ts   NA   NA       NA      NA
#  2: datasets       BJsales                                                  ts   NA   NA       NA      NA
#  3: datasets  BJsales.lead                                                  ts   NA   NA       NA      NA
#  4: datasets           BOD                                          data.frame    6    2        0       0
#  5: datasets           CO2 nfnGroupedData,nfGroupedData,groupedData,data.frame   84    5        0       3
# ---                                                                                                      
#492: survival    transplant                                          data.frame  815    6        0       3
#493: survival        uspop2                                               array  101    2       NA      NA
#494: survival       veteran                                          data.frame  137    8        0       1
#495:  viridis   viridis.map                                          data.frame 1024    4        1       0
#496:   xtable           tli                                          data.frame  100    5        0       3

FYI I've reworked it into the function I'll use, see my updated answer. — moodymudskipper, Nov 28 '17 at 22:25

score 1 · Answer 3 · answered Sep 14 '17 at 16:39

In package datasets there is no dataset of class data.frame that fulfills your conditions. More exactly, if they are of class data.frame and have at most 100 rows, then none of them has two or more columns of class character. I found that out with a first version of the following code.

library(datasets)
res <- library(help = "datasets")

# The second element of `info` is the package index. The dataset name is the
# text before the first space; continuation lines start with a space and so
# yield "", which is dropped.
dat <- sub(" .*", "", res$info[[2]])
dat <- dat[dat != ""]

# Look names up in the attached package only, so an object of the same name
# in the workspace or another package cannot be inspected by mistake.
datasets_env <- as.environment("package:datasets")

# TRUE for data frames of at most 100 rows that have at least two character
# columns or at least two factor columns.
is_candidate <- vapply(dat, function(nm) {
    d <- tryCatch(get(nm, envir = datasets_env, inherits = FALSE),
                  error = function(e) NULL)
    if (!inherits(d, "data.frame") || nrow(d) > 100) return(FALSE)
    char <- sum(vapply(d, is.character, logical(1)))
    fact <- sum(vapply(d, is.factor, logical(1)))
    char >= 2 || fact >= 2
}, logical(1))

df_names <- dat[is_candidate]
# report each match, one per line, as they appear in the index
for (nm in df_names) print(nm)

df_names
[1] "CO2"        "esoph"      "npk"        "sleep"      "warpbreaks"

So I had to include extra instructions to handle columns of class factor. Before R 4.0.0, data frames were created with stringsAsFactors = TRUE by default, which is why the built-in datasets store text as factors. If you can do with those, there you have it: their names are in the vector df_names. To make one available in the global environment, just get the one you want.

Nice, thank you. I think if there is nothing built in I'll build a general function around it and share it here, something like a data.frame with dataset name, description, class, length, and number of items of each class. There is also a `data` function that returns the datasets and can be restricted to some packages; it could be interesting to use it. But it's surprising to me that every example we see involving a dataset was the result of a person randomly browsing lists of hundreds of datasets, or writing a custom function as you did. — moodymudskipper, Sep 14 '17 at 21:18

Sathish · Answer 4 · 2017-11-24T21:48:52.550

The table returned by myfun() can be filtered with appropriate conditions, and the columns of each dataset can be identified by their classes, given in the classes column.

The problem with the caret package is that it does not have any data frame or matrix objects in it. The datasets may be present in caret inside list objects. I am not sure about it; some list objects in the caret package contain a list of functions.

Also, if interested, you can make myfun() function to be more specific for returning information about data frames or matrix objects only.

#' Summarise every object an attached package puts on the search path
#'
#' @param package Name of a package that is currently attached (the objects
#'   are listed from its `package:<name>` search-path entry).
#' @return A list-matrix with one row per object (row names are the object
#'   names) and the columns `data_class` (all classes, comma separated),
#'   `nrow`, `ncol` and `classes`. The last three are `NA` unless the object
#'   is a data frame or a matrix. For a data frame `classes` lists the classes
#'   of its columns; for a matrix it is the class of the underlying vector.
#'   `NULL` when the package attaches no objects.
myfun <- function( package )
{
  pkg_env <- as.environment( paste0( 'package:', package ) )
  obj_names <- ls( pkg_env )

  describe <- function(x){
    # plain lookup in the attached environment; no code is built from text
    y <- get( x, envir = pkg_env, inherits = FALSE )
    tabular <- inherits( y, c( "data.frame", "matrix" ) )

    classes <- NA
    if ( inherits( y, "data.frame" ) ) {
      classes <- paste0( unlist( lapply( y, class ) ), collapse = "," )
    } else if ( tabular ) {
      # iterating over a matrix would emit one class per cell
      classes <- paste0( class( as.vector( y ) ), collapse = "," )
    }

    list( data_class = paste0( class(y), collapse = "," ),
          nrow       = if ( tabular ) nrow(y) else NA_integer_,
          ncol       = if ( tabular ) ncol(y) else NA_integer_,
          classes    = classes )
  }

  # rbind() on lists builds the list-matrix: one row per object
  do.call( rbind, lapply( stats::setNames( obj_names, obj_names ), describe ) )
}

library( datasets )
meta_data <- myfun( package = "datasets")
head(meta_data)
#               data_class   nrow ncol classes                                                          
# ability.cov   "list"       NA   NA   NA                                                               
# airmiles      "ts"         NA   NA   NA                                                               
# AirPassengers "ts"         NA   NA   NA                                                               
# airquality    "data.frame" 153  6    "integer,integer,numeric,integer,integer,integer"                
# anscombe      "data.frame" 11   8    "numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric"
# attenu        "data.frame" 182  5    "numeric,numeric,factor,numeric,numeric"  

meta_data[ "ChickWeight", ]
# $data_class
# [1] "nfnGroupedData,nfGroupedData,groupedData,data.frame"
# 
# $nrow
# [1] 578
# 
# $ncol
# [1] 4
# 
# $classes
# [1] "numeric,numeric,ordered,factor,factor"

library( 'caret' )
meta_data <- myfun( package = "caret")
#               data_class nrow ncol classes
# anovaScores   "function" NA   NA   NA     
# avNNet        "function" NA   NA   NA     
# bag           "function" NA   NA   NA     
# bagControl    "function" NA   NA   NA     
# bagEarth      "function" NA   NA   NA     
# bagEarthStats "function" NA   NA   NA

If the packages that were loaded need to be unloaded after applying the myfun() function to the package, try this:

# Snapshot the search path, attach caret and summarise it, then detach
# whatever entries the library() call added.
loaded_pkgs <- search()
library( 'caret' )
meta_data <- myfun( package = "caret")
unload_pkgs <- setdiff( search(), loaded_pkgs )
# each entry is a search-path name such as "package:caret"
for( entry in unload_pkgs ) {
  detach( entry, character.only = TRUE )
}

I really like the idea of using `ls('package:...')` as it gives access to additional objects, which could be leveraged to do more cool stuff, like looking up a function by regular expression or, with a bit more work, looking up functions by parameter. But it's problematic that it doesn't "see" some datasets, such as the ones from the `caret` package. — moodymudskipper, Nov 28 '17 at 11:43

How can I find a dataset that has some specific attributes?

4 Answers4