diff --git a/NEWS.md b/NEWS.md index ea2a8a8ee..4bed00f77 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,8 @@ 2. `[,showProgress=]` and `options(datatable.showProgress)` now accept an integer to control the progress bar update interval in seconds, allowing finer control over progress reporting frequency; `TRUE` uses the default 3-second interval, [#6514](https://github.com/Rdatatable/data.table/issues/6514). Thanks @ethanbsmith for the report and @ben-schwen for the PR. +3. `tables()` can now optionally report `data.table` objects stored one level deep inside list objects when `shallow_search=TRUE`, [#2606](https://github.com/Rdatatable/data.table/issues/2606). Thanks @MichaelChirico for the report and @manmita for the PR + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. diff --git a/R/tables.R b/R/tables.R index 960c74343..83a355567 100644 --- a/R/tables.R +++ b/R/tables.R @@ -19,7 +19,8 @@ type_size = function(DT) { } tables = function(mb=type_size, order.col="NAME", width=80L, - env=parent.frame(), silent=FALSE, index=FALSE) + env=parent.frame(), silent=FALSE, index=FALSE, + shallow_search=FALSE) { # Prints name, size and colnames of all data.tables in the calling environment by default mb_name = as.character(substitute(mb)) @@ -27,11 +28,76 @@ tables = function(mb=type_size, order.col="NAME", width=80L, names = ls(envir=env, all.names=TRUE) # include "hidden" objects (starting with .) obj = mget(names, envir=env) # doesn't copy; mget is ok with ... unlike get, #5197 w = which(vapply_1b(obj, is.data.table)) - if (!length(w)) { - if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) - return(invisible(data.table(NULL))) + + info = NULL + # we check if shallow_search is requested and add found tables to w + if (shallow_search) { + is_list = vapply_1b(obj, is.list) + is_df = vapply_1b(obj, is.data.frame) + is_dt = vapply_1b(obj, is.data.table) + # list_index is a index of list which is not data.frame or data.table + list_index = which(is_list & !is_dt & !is_df) + # obj_list is a list of lists of data.tables found inside lists + obj_list = vector("list", length(list_index)) + #make a listof size list_index and add wl in it + total_dt = 0L + # filling obj_list and counting total_dt + for (i in seq_along(list_index)) { + L = obj[[list_index[i]]] + wl = which(vapply_1b(L, is.data.table)) + total_dt = total_dt + length(wl) + obj_list[[i]] = L[wl] + } + name_count = length(w) + total_dt + # initialize info data.table with total number of data.tables found + if (name_count == 0L) { + # nocov start. Requires long-running test case + if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) + # nocov end + return(invisible(data.table(NULL))) + } + # create info data.table with total rows equal to number of data.tables found + info = data.table(NAME=character(name_count), NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list()) + # fill in the names of data.tables found in w + for (i in seq_along(w)) { # names of w items + set(info, i, "NAME", names[w[i]]) + } + # now fill in the data.tables found inside lists + cnt = 1L + if (total_dt > 0L) { + for (i in seq_along(list_index)) { + if (length(obj_list[[i]]) == 0L) next + # get the parent list name + parent_name = names[list_index[i]] + for (j in seq_along(obj_list[[i]])) { + elem_names = names(obj[[list_index[i]]]) + if (!is.null(elem_names) && nzchar(elem_names[j])) { + new_name = paste0(parent_name, "$", elem_names[j]) + } else { + new_name = paste0(parent_name, "[[", j, "]]") + } + DT = obj_list[[i]][[j]] + k = cnt + length(w) # row number in info data.table + cnt = cnt + 1L + set(info, k, "NAME", new_name) + set(info, k, "NROW", nrow(DT)) + set(info, k, "NCOL", ncol(DT)) + if (is.function(mb)) set(info, k, "MB", as.integer(mb(DT)/1048576L)) # i.e. 1024**2 + if (!is.null(tt<-names(DT))) set(info, k, "COLS", tt) # TODO: don't need these if()s when #5526 is done + if (!is.null(tt<-key(DT))) set(info, k, "KEY", tt) + if (index && !is.null(tt<-indices(DT))) set(info, k, "INDICES", tt) + } + } + } + } + else { + # the original code path when shallow_search=FALSE + if (!length(w)) { + if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) + return(invisible(data.table(NULL))) + } + info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list()) } - info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list()) for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables DT = obj[[w[i]]] set(info, i, "NROW", nrow(DT)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1e2b985a4..278cb2ffb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -22035,3 +22035,25 @@ if (test_bit64) local({ # 7407 Test for fread() handling \x1A (ASCII SUB) at end of input txt = paste0("foo\n", strrep("a", 4096 * 100), "\x1A") test(2359.1, nchar(fread(txt)$foo), 409600L) + +#2606 tables() shallow_search finds nested data.tables in lists +# creating env so that the names are within it +xenv2 = new.env() +xenv2$DT = data.table(a = 1L) +xenv2$L = list(data.table(a = 1, b = 4:6), data.table(a = 2, b = 7:10)) +xenv2$M = list(b = data.table(a = 1, b = 4:6), a = 1:5) +xenv2$N = list(a = 1:5) +setkey(xenv2$M$b, a) +setindex(xenv2$M$b, b) +test(2360.1, tables(env = xenv2, shallow_search = TRUE)$NAME, c("DT", "L[[1]]", "L[[2]]", "M$b")) +test(2360.2, tables(env = xenv2, shallow_search = TRUE)$NROW, c(1L, 3L, 4L, 3L)) +test(2360.3, tables(env = xenv2, shallow_search = TRUE)$NCOL, c(1L, 2L, 2L, 2L)) +test(2360.4, tables(env = xenv2, shallow_search = TRUE, index = TRUE)$INDICES, list(NULL, NULL, NULL, "b")) +test(2360.5, tables(env = xenv2, shallow_search = TRUE, index = TRUE)$KEY, list(NULL, NULL, NULL, "a")) +rm(xenv2) + +# no data.table test +xenv_empty = new.env() +test(2360.6, tables(env = xenv_empty, shallow_search = TRUE), invisible(data.table(NULL))) +test(2360.7, tables(env = xenv_empty), invisible(data.table(NULL))) +rm(xenv_empty) diff --git a/man/tables.Rd b/man/tables.Rd index 5898cd730..576455d44 100644 --- a/man/tables.Rd +++ b/man/tables.Rd @@ -6,7 +6,7 @@ } \usage{ tables(mb=type_size, order.col="NAME", width=80, - env=parent.frame(), silent=FALSE, index=FALSE) + env=parent.frame(), silent=FALSE, index=FALSE, shallow_search=FALSE) } \arguments{ \item{mb}{ a function which accepts a \code{data.table} and returns its size in bytes. By default, \code{type_size} (same as \code{TRUE}) provides a fast lower bound by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). A column \code{"MB"} is included in the output unless \code{FALSE} or \code{NULL}. } @@ -15,6 +15,7 @@ tables(mb=type_size, order.col="NAME", width=80, \item{env}{ An \code{environment}, typically the \code{.GlobalEnv} by default, see Details. } \item{silent}{ \code{logical}; should the output be printed? } \item{index}{ \code{logical}; if \code{TRUE}, the column \code{INDICES} is added to indicate the indices assorted with each object, see \code{\link{indices}}. } + \item{shallow_search}{\code{logical}; if \code{TRUE}, searches for \code{data.table} objects inside top-level lists} } \details{ Usually \code{tables()} is executed at the prompt, where \code{parent.frame()} returns \code{.GlobalEnv}. \code{tables()} may also be useful inside functions where \code{parent.frame()} is the local scope of the function; in such a scenario, simply set it to \code{.GlobalEnv} to get the same behaviour as at prompt.