How can I find a dataset that has some specific attributes?

I've packaged a solution in a one function github package.

I'm copying the whole code at the bottom but the simplest is :

remotes::install_github("moodymudskipper/datasearch")
library(datasearch)

All data sets from package "dplyr"

dplyr_all <-
  datasearch("dplyr")

View(dplyr_all)

dplyr

Datasets from package "datasets" restricted by condition

datasets_ncol5 <-
  datasearch("datasets", filter =  ~is.data.frame(.) && ncol(.) == 5)

View(datasets_ncol5)

datasets

All datasets from all installed packages, no restriction


# might take more or less time, depends what you have installed
all_datasets <- datasearch()

View(all_datasets)

# subsetting the output
my_subset <- subset(
  all_datasets, 
  class1 == "data.frame" &
    grepl("treatment", names_collapsed) &
    nrow < 100
)

View(my_subset)

all


datasearch <- function(pkgs = NULL, filter = NULL){
  # make function silent
  w <- options()$warn
  options(warn = -1)
  search_ <- search()
  file_ <- tempfile()
  file_ <- file(file_, "w")
  on.exit({
    options(warn = w)
    to_detach <- setdiff(search(), search_)
    for(pkg in to_detach) eval(bquote(detach(.(pkg))))
    # note : we still have loaded namespaces, we could unload those that we ddn't
    # have in the beginning but i'm worried about surprising effects, I think
    # the S3 method tables should be cleaned too, and maybe other things

    # note2 : tracing library and require didn't work
    })

  # convert formula to function
  if(inherits(filter, "formula")) {
    filter <- as.function(c(alist(.=), filter[[length(filter)]]))
  }

  ## by default fetch all available packages in .libPaths()
  if(is.null(pkgs)) pkgs <- .packages(all.available = TRUE)
  ## fetch all data sets description
  df <- as.data.frame(data(package = pkgs, verbose = FALSE)$results)
  names(df) <- tolower(names(df))
  item <- NULL # for cmd check note
  df <- transform(
    df,
    data_name = sub('.*\\((.*)\\)', '\\1', item),
    dataset   = sub(' \\(.*', '', item),
    libpath = NULL,
    item = NULL
    )
  df <- df[order(df$package, df$data_name),]
  pkg_data_names <- aggregate(dataset ~ package + data_name, df, c)
  pkg_data_names <- pkg_data_names[order(pkg_data_names$package, pkg_data_names$data_name),]

  env <- new.env()
  n <-  nrow(pkg_data_names)
  pb <- progress::progress_bar$new(
    format = "[:bar] :percent :pkg",
    total = n)
  row_dfs <- vector("list", n)
  for(i in seq(nrow(pkg_data_names))) {
    pkg    <- pkg_data_names$package[i]
    data_name <- pkg_data_names$data_name[i]
    datasets  <- pkg_data_names$dataset[[i]]
    pb$tick(tokens = list(pkg = format(pkg, width = 12)))

    sink(file_, type = "message")
    data(list=data_name, package = pkg, envir = env)
    row_dfs_i <- lapply(datasets, function(dataset) {
      dat <- get(dataset, envir = env)
      if(!is.null(filter) && !filter(dat)) return(NULL)
      cl <- class(dat)
      nms <- names(dat)
      nc <- ncol(dat)
      if (is.null(nc)) nc <- NA
      nr <- nrow(dat)
      if (is.null(nr)) nr <- NA

      out <- data.frame(
        package = pkg,
        data_name = data_name,
        dataset = dataset,
        class = I(list(cl)),
        class1 = cl[1],
        type = typeof(dat),
        names = I(list(nms)),
        names_collapsed = paste(nms, collapse = "/"),
        nrow       = nr,
        ncol       = nc,
        length     = length(dat))

      if("data.frame" %in% cl) {
        classes <- lapply(dat, class)
        cl_flat <- unlist(classes)
        out <- transform(
          out,
          classes    = I(list(classes)),
          types      = I(list(vapply(dat, typeof, character(1)))),
          logical    = sum(cl_flat == 'logical'),
          integer    = sum(cl_flat == 'integer'),
          numeric    = sum(cl_flat == 'numeric'),
          complex    = sum(cl_flat == 'complex'),
          character  = sum(cl_flat == 'character'),
          raw        = sum(cl_flat == 'raw'),
          list       = sum(cl_flat == 'list'),
          data.frame = sum(cl_flat == 'data.frame'),
          factor     = sum(cl_flat == 'factor'),
          ordered    = sum(cl_flat == 'ordered'),
          Date       = sum(cl_flat == 'Date'),
          POSIXt     = sum(cl_flat == 'POSIXt'),
          POSIXct    = sum(cl_flat == 'POSIXct'),
          POSIXlt    = sum(cl_flat == 'POSIXlt'))
      } else {
        out <- transform(
          out,
          nrow       = NA,
          ncol       = NA,
          classes    = NA,
          types      = NA,
          logical    = NA,
          integer    = NA,
          numeric    = NA,
          complex    = NA,
          character  = NA,
          raw        = NA,
          list       = NA,
          data.frame = NA,
          factor     = NA,
          ordered    = NA,
          Date       = NA,
          POSIXt     = NA,
          POSIXct    = NA,
          POSIXlt    = NA)
      }
      if(is.matrix(dat)) {
        out$names <- list(colnames(dat))
        out$names_collapsed = paste(out$names, collapse = "/")
      }
      out
    })
    row_dfs_i <- do.call(rbind, row_dfs_i)
    if(!is.null(row_dfs_i)) row_dfs[[i]] <- row_dfs_i
    sink(type = "message")
  }
  df2 <- do.call(rbind, row_dfs)
  df <- merge(df, df2)
  df
}

Extend/modify to your liking.

library(data.table)
dt = as.data.table(data(package = .packages(all.available = TRUE))$results)
dt = dt[, `:=`(Item   = sub(' \\(.*', '', Item),
               Object = sub('.*\\((.*)\\)', '\\1', Item))]

dt[, { 
       data(list = Object, package = Package)
       d = eval(parse(text = Item))

       classes = if (sum(class(d) %in% c('data.frame')) > 0) unlist(lapply(d, class))
                 else NA_integer_

       .(class    = paste(class(d), collapse = ","),
         nrow     = if (!is.null(nrow(d))) nrow(d) else NA_integer_,
         ncol     = if (!is.null(ncol(d))) ncol(d) else NA_integer_,
         charCols = sum(classes == 'character'),
         facCols  = sum(classes == 'factor'))
     }
   , by = .(Package, Item)]
#      Package          Item                                               class nrow ncol charCols facCols
#  1: datasets AirPassengers                                                  ts   NA   NA       NA      NA
#  2: datasets       BJsales                                                  ts   NA   NA       NA      NA
#  3: datasets  BJsales.lead                                                  ts   NA   NA       NA      NA
#  4: datasets           BOD                                          data.frame    6    2        0       0
#  5: datasets           CO2 nfnGroupedData,nfGroupedData,groupedData,data.frame   84    5        0       3
# ---                                                                                                      
#492: survival    transplant                                          data.frame  815    6        0       3
#493: survival        uspop2                                               array  101    2       NA      NA
#494: survival       veteran                                          data.frame  137    8        0       1
#495:  viridis   viridis.map                                          data.frame 1024    4        1       0
#496:   xtable           tli                                          data.frame  100    5        0       3

Tags:

R