From 774791adf8eb9d2b8e838312ad1b48fe75cbfb5c Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 23 Dec 2025 01:09:44 +0300 Subject: [PATCH 1/2] DT[,foo:=bar]: earlier check for selfrefok The check needs to be there, not below, to detect non-selfrefok tables in by-group operations. Use static analysis to detect common forms of column deletion: foo := NULL and .(bar, baz) := .(NULL, NULL). Static analysis is doomed to miss things like frob := if (runif(1) < .5) 42 else NULL, but hopefully it covers the needs of our reverse dependencies. --- R/data.table.R | 84 +++++++++++++++++++++---------------------- inst/tests/tests.Rraw | 5 +++ 2 files changed, 47 insertions(+), 42 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 3893fe2fc..9c602883f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1220,11 +1220,51 @@ replace_dot_alias = function(e) { m[is.na(m)] = ncol(x)+seq_along(newnames) cols = as.integer(m) # don't pass verbose to selfrefok here -- only activated when - # ok=-1 which will trigger setalloccol with verbose after - # the jval = eval(jsub, ...) + # ok=-1 which will trigger setalloccol with verbose in the next branch + # if a change in the number of columns is suspected if (ok==0L) # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R] if (is.data.table(x)) warningf("A shallow copy of this data.table was taken so that := can add or remove %d columns by reference. At an earlier point, this data.table was copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual for data.table-agnostic packages to produce tables affected by this issue. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.", length(newnames)) } + # ok <- selfrefok above called without verbose -- only activated when + # ok=-1 which will trigger setalloccol with verbose in the next + # branch, which again calls _selfrefok and returns the message then + # !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame + if ( + ( + !is.null(newnames) || # adding new columns + is.null(jsub) || (jsub %iscall% "list" && any(vapply_1b(jsub[-1], is.null))) # removing columns + ) && ( + (ok<1L) || # unsafe to resize + (truelength(x) < ncol(x)+length(newnames)) # not enough space for new columns + ) + ) { + DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove + n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval() + # i.e. reallocate at the size as if the new columns were added followed by setalloccol(). + name = substitute(x) + if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk) + catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n) + # #1729 -- copying to the wrong environment here can cause some confusion + if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") + + # Verbosity should not issue warnings, so cat rather than warning. + # TO DO: Add option 'datatable.pedantic' to turn on warnings like this. + + # TO DO ... comments moved up from C ... + # Note that the NAMED(dt)>1 doesn't work because .Call + # always sets to 2 (see R-ints), it seems. Work around + # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too + # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2. + # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we + # don't mind. + } + setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope + if (is.name(name)) { + assign(as.character(name),x,parent.frame(),inherits=TRUE) + } else if (.is_simple_extraction(name)) { + .reassign_extracted_table(name, x) + } # TO DO: else if env$<- or list$<- + } } } @@ -1400,46 +1440,6 @@ replace_dot_alias = function(e) { } else if (is.numeric(lhs)) { lhs = names_x[m] } - # ok <- selfrefok above called without verbose -- only activated when - # ok=-1 which will trigger setalloccol with verbose in the next - # branch, which again calls _selfrefok and returns the message then - # !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame - if ( - ( - !is.null(newnames) || # adding new columns - is.null(jval) || (is.list(jval) && any(vapply_1b(jval, is.null))) # removing columns - ) && ( - (ok<1L) || # unsafe to resize - (truelength(x) < ncol(x)+length(newnames)) # not enough space for new columns - ) - ) { - DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove - n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval() - # i.e. reallocate at the size as if the new columns were added followed by setalloccol(). - name = substitute(x) - if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk) - catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n) - # #1729 -- copying to the wrong environment here can cause some confusion - if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") - - # Verbosity should not issue warnings, so cat rather than warning. - # TO DO: Add option 'datatable.pedantic' to turn on warnings like this. - - # TO DO ... comments moved up from C ... - # Note that the NAMED(dt)>1 doesn't work because .Call - # always sets to 2 (see R-ints), it seems. Work around - # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too - # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2. - # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we - # don't mind. - } - setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope - if (is.name(name)) { - assign(as.character(name),x,parent.frame(),inherits=TRUE) - } else if (.is_simple_extraction(name)) { - .reassign_extracted_table(name, x) - } # TO DO: else if env$<- or list$<- - } # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above. .Call(Cassign,x,irows,cols,newnames,jval) return(suppPrint(x)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 91e6bc580..c4a6199eb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21909,3 +21909,8 @@ DT = suppressMessages(rbindlist(DTl)) # used to crash test(2352.1, dim(DT), c(300002L, 1L)) test(2352.2, DT[[1]], rep(42, nrow(DT))) rm(DTn, DTl, DT) + +# insert columns by group in freshly unserialized data.tables, #7498 +DT = unserialize(serialize(as.data.table(mtcars), NULL)) +test(2353, DT[,foo:=mean(mpg),by=cyl], as.data.table(mtcars)[,foo:=mean(mpg),by=cyl]) +rm(DT) From 5786d1f88bc996a5524d2842c0342df03a8ed9f1 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 23 Dec 2025 14:26:20 +0100 Subject: [PATCH 2/2] add second setalloccol check --- R/data.table.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/data.table.R b/R/data.table.R index a73dcea7d..bdc92e0aa 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1440,6 +1440,14 @@ replace_dot_alias = function(e) { } else if (is.numeric(lhs)) { lhs = names_x[m] } + # cater for deleting columns by assigning NULL + if ((is.null(jval) || (is.list(jval) && any(vapply_1b(jval, is.null)))) && selfrefok(x, verbose=FALSE) < 1L) { + name = substitute(x) + setalloccol(x, verbose=FALSE) + if (is.name(name)) { + assign(as.character(name), x, parent.frame(), inherits=TRUE) + } + } # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above. .Call(Cassign,x,irows,cols,newnames,jval) return(suppPrint(x))