From 0a6e0e7983a7835bed7594c3527f46b53c2c70da Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 10:43:20 +0300 Subject: [PATCH 01/12] frev: drop SET_ATTRIB Instead, backport and use CLEAR_ATTRIB (R >= 4.5). --- src/data.table.h | 1 + src/utils.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/data.table.h b/src/data.table.h index 0bfcb09d8..0fb796eb0 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -15,6 +15,7 @@ #endif #if R_VERSION < R_Version(4, 5, 0) # define isDataFrame(x) isFrame(x) // #6180 +# define CLEAR_ATTRIB(x) SET_ATTRIB(x, R_NilValue) #endif #include #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT diff --git a/src/utils.c b/src/utils.c index fa95c60e7..1608f9020 100644 --- a/src/utils.c +++ b/src/utils.c @@ -625,7 +625,7 @@ SEXP frev(SEXP x, SEXP copyArg) { SEXP levels = PROTECT(getAttrib(x, R_LevelsSymbol)); nprotect += 2; // swipe attributes from x - SET_ATTRIB(x, R_NilValue); + CLEAR_ATTRIB(x); setAttrib(x, R_NamesSymbol, names); setAttrib(x, R_ClassSymbol, klass); setAttrib(x, R_LevelsSymbol, levels); From 2e0c3893a3607ab7813df03a4f330e6eb8300108 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 11:02:40 +0300 Subject: [PATCH 02/12] mergeIndexAttrib: drop SET_ATTRIB Use SHALLOW_DUPLICATE_ATTRIB (R >= 3.3) for the simple case. Also, Backport ANY_ATTRIB (R >= 4.5) instead of testing !isNull(ATTRIB(.)). --- src/data.table.h | 1 + src/mergelist.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 0fb796eb0..111ae3373 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -16,6 +16,7 @@ #if R_VERSION < R_Version(4, 5, 0) # define isDataFrame(x) isFrame(x) // #6180 # define CLEAR_ATTRIB(x) SET_ATTRIB(x, R_NilValue) +# define ANY_ATTRIB(x) (!(isNull(ATTRIB(x)))) #endif #include #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT diff --git a/src/mergelist.c b/src/mergelist.c index 51f28d224..aed75e424 100644 --- a/src/mergelist.c +++ b/src/mergelist.c @@ -22,11 +22,11 @@ void mergeIndexAttrib(SEXP to, SEXP from) { internal_error(__func__, "'to' must be integer() already"); // # nocov if (isNull(from)) return; - SEXP t = ATTRIB(to), f = ATTRIB(from); - if (isNull(t)) // target has no attributes -> overwrite - SET_ATTRIB(to, shallow_duplicate(f)); + if (!ANY_ATTRIB(to)) // target has no attributes -> overwrite + SHALLOW_DUPLICATE_ATTRIB(to, from); else { - for (t = ATTRIB(to); CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to + SEXP t = ATTRIB(to), f = ATTRIB(from); + for (; CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to SETCDR(t, shallow_duplicate(f)); } } From 58d586b0857523200c7e98cdd278398bb7dc0509 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 11:08:56 +0300 Subject: [PATCH 03/12] cbindlist: use ANY_ATTRIB --- src/mergelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mergelist.c b/src/mergelist.c index aed75e424..5ecb45609 100644 --- a/src/mergelist.c +++ b/src/mergelist.c @@ -84,7 +84,7 @@ SEXP cbindlist(SEXP x, SEXP copyArg) { key = getAttrib(thisx, sym_sorted); UNPROTECT(protecti); // thisnames, thisxcol } - if (isNull(ATTRIB(index))) + if (!ANY_ATTRIB(index)) setAttrib(ans, sym_index, R_NilValue); setAttrib(ans, R_NamesSymbol, names); setAttrib(ans, sym_sorted, key); From 402f0d55452d75470f3dfa5c6c9258af70a0a5ed Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 11:13:10 +0300 Subject: [PATCH 04/12] nafillR: use ANY_ATTRIB --- src/nafill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nafill.c b/src/nafill.c index 5c9568efb..ff2a7fc34 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -218,7 +218,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S if (!binplace) { for (R_len_t i=0; i Date: Thu, 18 Dec 2025 20:55:22 +0300 Subject: [PATCH 05/12] Backport R_mapAttrib --- src/data.table.h | 8 ++++++++ src/utils.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/data.table.h b/src/data.table.h index 111ae3373..2f1036848 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -105,6 +105,11 @@ } # define R_resizeVector(x, newlen) R_resizeVector_(x, newlen) #endif +// TODO(R>=4.6.0): remove the SVN revision check +#if R_VERSION < R_Version(4, 6, 0) || R_SVN_REVISION < 89194 +# define BACKPORT_MAP_ATTRIB +# define R_mapAttrib(x, fun, ctx) R_mapAttrib_(x, fun, ctx) +#endif // init.c extern SEXP char_integer64; @@ -345,6 +350,9 @@ SEXP R_allocResizableVector_(SEXPTYPE type, R_xlen_t maxlen); SEXP R_duplicateAsResizable_(SEXP x); void R_resizeVector_(SEXP x, R_xlen_t newlen); #endif +#ifdef BACKPORT_MAP_ATTRIB +SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx); +#endif // types.c char *end(char *start); diff --git a/src/utils.c b/src/utils.c index 1608f9020..5df924e92 100644 --- a/src/utils.c +++ b/src/utils.c @@ -672,3 +672,20 @@ void R_resizeVector_(SEXP x, R_xlen_t newlen) { SETLENGTH(x, newlen); } #endif + +#ifdef BACKPORT_MAP_ATTRIB +SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx) { + PROTECT_INDEX i; + SEXP a = ATTRIB(x); + PROTECT_WITH_INDEX(a, &i); + + SEXP ret = NULL; + for (; !isNull(a); REPROTECT(a = CDR(a), i)) { + ret = fun(TAG(a), CAR(a), ctx); + if (ret) break; + } + + UNPROTECT(1); + return ret; +} +#endif From ff280b5f1f7cf0170b917de32f676600886d3418 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 23:16:24 +0300 Subject: [PATCH 06/12] anySpecialStatic: switch to R_mapAttrib --- src/dogroups.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index 373242516..7423e7c00 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,6 +3,8 @@ #include #include +static SEXP attribWalker(SEXP key, SEXP val, void *ctx); + static bool anySpecialStatic(SEXP x, hashtab * specials) { // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups @@ -54,15 +56,18 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { list_el = VECTOR_ELT(x,i); if (anySpecialStatic(list_el, specials)) return true; - for(attribs = ATTRIB(list_el); attribs != R_NilValue; attribs = CDR(attribs)) { - if (anySpecialStatic(CAR(attribs), specials)) - return true; // #4936 - } + if (R_mapAttrib(list_el, attribWalker, specials)) + return true; // #4936 } } return false; } +static SEXP attribWalker(SEXP key, SEXP val, void *specials) { + (void)key; + return anySpecialStatic(val, specials) ? R_NilValue : NULL; +} + SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; From e4f178819aa9d825694258973fbcf1eb49f8c316 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 23:56:02 +0300 Subject: [PATCH 07/12] dogroups: construct rownames anew Instead of trying to walk ATTRIB in search of the compact 'rownames' attribute to modify, install it anew, take note of the returned reference to the value being installed (a different one!) and modify that. --- src/dogroups.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index 7423e7c00..21c12618a 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -72,7 +72,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; int nprotect=0; - SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source; + SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, s, RHS, target, source; Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; const bool verbose = LOGICAL(verboseArg)[0]==1; double tstart=0, tblock[10]={0}; int nblock[10]={0}; // For verbose printing, tstart is updated each block @@ -135,12 +135,15 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX R_LockBinding(install(".I"), env); SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by" - // fetch rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to + + // override rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to // non data.table aware package that uses rownames - for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s)); // getAttrib0 basically but that's hidden in attrib.c; #loop_counter_not_local_scope_ok - if (s==R_NilValue) error(_("row.names attribute of .SD not found")); - rownames = CAR(s); - if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]); + SEXP rownames; + PROTECT_INDEX rownamesi; + PROTECT_WITH_INDEX(rownames = allocVector(INTSXP, 2), &rownamesi); nprotect++; + INTEGER(rownames)[0] = NA_INTEGER; + INTEGER(rownames)[1] = -maxGrpSize; + REPROTECT(rownames = setAttrib(SD, R_RowNamesSymbol, rownames), rownamesi); // fetch names of .SD and prepare symbols. In case they are copied-on-write by user assigning to those variables // using <- in j (which is valid, useful and tested), they are repointed to the .SD cols for each group. From 3c6675717f3e8c68b0f07784586d81fe998f9719 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Fri, 19 Dec 2025 00:14:12 +0300 Subject: [PATCH 08/12] mergeIndexAttrib: switch to R_mapAttrib --- src/mergelist.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/mergelist.c b/src/mergelist.c index 5ecb45609..90854ae82 100644 --- a/src/mergelist.c +++ b/src/mergelist.c @@ -17,6 +17,12 @@ SEXP copyCols(SEXP x, SEXP cols) { return R_NilValue; } +static SEXP setDuplicateOneAttrib(SEXP key, SEXP val, void *x) { + setAttrib(x, PROTECT(key), PROTECT(shallow_duplicate(val))); + UNPROTECT(2); + return NULL; // continue +} + void mergeIndexAttrib(SEXP to, SEXP from) { if (!isInteger(to) || LENGTH(to)!=0) internal_error(__func__, "'to' must be integer() already"); // # nocov @@ -24,11 +30,8 @@ void mergeIndexAttrib(SEXP to, SEXP from) { return; if (!ANY_ATTRIB(to)) // target has no attributes -> overwrite SHALLOW_DUPLICATE_ATTRIB(to, from); - else { - SEXP t = ATTRIB(to), f = ATTRIB(from); - for (; CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to - SETCDR(t, shallow_duplicate(f)); - } + else + R_mapAttrib(from, setDuplicateOneAttrib, to); } SEXP cbindlist(SEXP x, SEXP copyArg) { From 2c10c371e64ac5362ad49a1f7891450bd8c5651c Mon Sep 17 00:00:00 2001 From: Ivan K Date: Fri, 19 Dec 2025 00:42:22 +0300 Subject: [PATCH 09/12] assign: factor out index fixup Instead of walking the attribute list directly, use R_mapAttrib(). Create a hash table of index names instead of relying on chin() and a temporary string vector. Move all temporary allocations onto the R heap. --- src/assign.c | 195 ++++++++++++++++++++++++++++----------------------- 1 file changed, 106 insertions(+), 89 deletions(-) diff --git a/src/assign.c b/src/assign.c index 849cb08f2..fb79eab32 100644 --- a/src/assign.c +++ b/src/assign.c @@ -256,6 +256,99 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) { return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0])); } +struct attrib_name_ctx { + hashtab *indexNames; + SEXP index, assignedNames; + R_xlen_t len; + bool verbose; +}; +static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) { + (void)val; + struct attrib_name_ctx *ctx = ctx_; + if (ctx->indexNames) + hash_set(ctx->indexNames, PRINTNAME(key), 1); + else + ctx->len++; + return NULL; +} + +static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) { + const struct attrib_name_ctx *ctx = ctx_; + + hashtab *indexNames = ctx->indexNames; + SEXP index = ctx->index, assignedNames = ctx->assignedNames; + R_xlen_t indexLength = xlength(value); + bool verbose = ctx->verbose; + + const char *tc1, *c1; + tc1 = c1 = CHAR(PRINTNAME(tag)); // the index name; e.g. "__col1__col2" + + if (*tc1!='_' || *(tc1+1)!='_') { + // fix for #1396 + if (verbose) { + Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); + } + setAttrib(index, tag, R_NilValue); + return NULL; + } + + tc1 += 2; // tc1 always marks the start of a key column + if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov + + void *vmax = vmaxget(); + // check the position of the first appearance of an assigned column in the index. + // the new index will be truncated to this position. + size_t newKeyLength = strlen(c1); + char *s4 = R_alloc(newKeyLength + 3, 1); + memcpy(s4, c1, newKeyLength); + memcpy(s4 + newKeyLength, "__", 3); + + for(int i = 0; i < xlength(assignedNames); i++){ + const char *tc2 = CHAR(STRING_ELT(assignedNames, i)); + void *vmax2 = vmaxget(); + size_t tc2_len = strlen(tc2); + char *s5 = R_alloc(tc2_len + 5, 1); //4 * '_' + \0 + memcpy(s5, "__", 2); + memcpy(s5 + 2, tc2, tc2_len); + memcpy(s5 + 2 + tc2_len, "__", 3); + tc2 = strstr(s4, s5); + if(tc2 && (tc2 - s4 < newKeyLength)){ // new column is part of key; match is before last match + newKeyLength = tc2 - s4; + } + vmaxset(vmax2); + } + + s4[newKeyLength] = '\0'; // truncate the new key to the new length + if(newKeyLength == 0){ // no valid key column remains. Drop the key + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, PRINTNAME(tag), 0); + if (verbose) { + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + } else if(newKeyLength < strlen(c1)) { + SEXP s4Str = PROTECT(mkChar(s4)); + if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) + !hash_lookup(indexNames, s4Str, 0)) { // index with shortened name not present yet + setAttrib(index, installChar(s4Str), value); + hash_set(indexNames, PRINTNAME(tag), 0); + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, s4Str, 1); + if (verbose) + Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4+2); + } else { // indexLength > 0 || shortened name present already + // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) + // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, tag, 0); + if (verbose) + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + UNPROTECT(1); // s4Str + } //else: index is not affected by assign: nothing to be done + vmaxset(vmax); + return NULL; +} + int *_Last_updated = NULL; SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) @@ -265,11 +358,11 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; - SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames; + SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted const char *c1, *tc1, *tc2; - int *buf, indexNo; + int *buf; if (isNull(dt)) error(_("assign has been passed a NULL dt")); if (TYPEOF(dt) != VECSXP) error(_("dt passed to assign isn't type VECSXP")); if (islocked(dt)) @@ -549,93 +642,17 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } index = getAttrib(dt, install("index")); if (index != R_NilValue) { - s = ATTRIB(index); - indexNo = 0; - // get a vector with all index names - PROTECT(indexNames = allocVector(STRSXP, xlength(s))); protecti++; - while(s != R_NilValue){ - SET_STRING_ELT(indexNames, indexNo, PRINTNAME(TAG(s))); - indexNo++; - s = CDR(s); - } - s = ATTRIB(index); // reset to first element - indexNo = 0; - while(s != R_NilValue) { - a = TAG(s); - indexLength = xlength(CAR(s)); - tc1 = c1 = CHAR(PRINTNAME(a)); // the index name; e.g. "__col1__col2" - if (*tc1!='_' || *(tc1+1)!='_') { - // fix for #1396 - if (verbose) { - Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); - } - setAttrib(index, a, R_NilValue); - indexNo++; - s = CDR(s); - continue; // with next index - } - tc1 += 2; // tc1 always marks the start of a key column - if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov - // check the position of the first appearance of an assigned column in the index. - // the new index will be truncated to this position. - char *s4 = malloc(strlen(c1) + 3); - if (!s4) { - internal_error(__func__, "Couldn't allocate memory for s4"); // # nocov - } - memcpy(s4, c1, strlen(c1)); - memset(s4 + strlen(c1), '\0', 1); - strcat(s4, "__"); // add trailing '__' to newKey so we can search for pattern '__colName__' also at the end of the index. - int newKeyLength = strlen(c1); - for(int i = 0; i < xlength(assignedNames); i++){ - tc2 = CHAR(STRING_ELT(assignedNames, i)); - char *s5 = malloc(strlen(tc2) + 5); //4 * '_' + \0 - if (!s5) { - free(s4); // # nocov - internal_error(__func__, "Couldn't allocate memory for s5"); // # nocov - } - memset(s5, '_', 2); - memset(s5 + 2, '\0', 1); - strcat(s5, tc2); - strcat(s5, "__"); - tc2 = strstr(s4, s5); - if(tc2 == NULL){ // column is not part of key - free(s5); - continue; - } - if(tc2 - s4 < newKeyLength){ // new column match is before last match - newKeyLength = tc2 - s4; - } - free(s5); - } - memset(s4 + newKeyLength, '\0', 1); // truncate the new key to the new length - if(newKeyLength == 0){ // no valid key column remains. Drop the key - setAttrib(index, a, R_NilValue); - SET_STRING_ELT(indexNames, indexNo, NA_STRING); - if (verbose) { - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - } else if(newKeyLength < strlen(c1)) { - SEXP s4Str = PROTECT(mkString(s4)); - if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) - LOGICAL(chin(s4Str, indexNames))[0] == 0) {// index with shortened name not present yet - SET_TAG(s, install(s4)); - SET_STRING_ELT(indexNames, indexNo, mkChar(s4)); - if (verbose) - Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4 + 2); - } else { // indexLength > 0 || shortened name present already - // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) - // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. - setAttrib(index, a, R_NilValue); - SET_STRING_ELT(indexNames, indexNo, NA_STRING); - if (verbose) - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - UNPROTECT(1); // s4Str - } //else: index is not affected by assign: nothing to be done - free(s4); - indexNo ++; - s = CDR(s); - } + struct attrib_name_ctx ctx = { 0, }; + R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes? + hashtab *h = hash_create(ctx.len); + PROTECT(h->prot); + ctx.indexNames = h; + R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names + ctx.index = index; + ctx.assignedNames = assignedNames; + ctx.verbose = verbose; + R_mapAttrib(index, fixIndexAttrib, &ctx); // adjust indices as needed + UNPROTECT(1); // h } if (ndelete) { // delete any columns assigned NULL (there was a 'continue' earlier in loop above) From e1fa0a63b32fe968cd4cfbbae9364a7055d1b136 Mon Sep 17 00:00:00 2001 From: HughParsonage Date: Sun, 21 Dec 2025 22:13:00 +0300 Subject: [PATCH 10/12] assign: drop indexLength --- src/assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/assign.c b/src/assign.c index fb79eab32..94c27d598 100644 --- a/src/assign.c +++ b/src/assign.c @@ -357,7 +357,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // newcolnames : add these columns (if any) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign - R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; + R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum; SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted From 09b1f7eee884e4925244fd7f2379be63d6b0e929 Mon Sep 17 00:00:00 2001 From: HughParsonage Date: Sun, 21 Dec 2025 22:22:03 +0300 Subject: [PATCH 11/12] assign: fix index unmarking --- src/assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/assign.c b/src/assign.c index 94c27d598..235875ac9 100644 --- a/src/assign.c +++ b/src/assign.c @@ -339,7 +339,7 @@ static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) { // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. setAttrib(index, tag, R_NilValue); - hash_set(indexNames, tag, 0); + hash_set(indexNames, PRINTNAME(tag), 0); if (verbose) Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); } From 1a408bf6087e49e7aee6fe86630d80e5c4dbcd68 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 21 Dec 2025 22:46:43 +0300 Subject: [PATCH 12/12] Comments, better field names --- src/assign.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/assign.c b/src/assign.c index 235875ac9..bc07cf2c6 100644 --- a/src/assign.c +++ b/src/assign.c @@ -257,21 +257,25 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) { } struct attrib_name_ctx { - hashtab *indexNames; - SEXP index, assignedNames; - R_xlen_t len; + hashtab *indexNames; // stores a 1/0 mark for every CHARSXP index name + R_xlen_t indexNamesLen; // how much memory to allocate for the hash? + SEXP index; // attr(DT, "index") + SEXP assignedNames; // STRSXP vector of variable names just assigned bool verbose; }; + +// Mark each CHARSXP attribute name with a 1 inside the hash, or count them to find out the allocation size. static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) { (void)val; struct attrib_name_ctx *ctx = ctx_; if (ctx->indexNames) hash_set(ctx->indexNames, PRINTNAME(key), 1); else - ctx->len++; + ctx->indexNamesLen++; return NULL; } +// For a given index, find out if it sorts a column that has just been assigned. If so, shorten the index (if an equivalent one doesn't already exist) or remove it altogether. static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) { const struct attrib_name_ctx *ctx = ctx_; @@ -644,7 +648,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) if (index != R_NilValue) { struct attrib_name_ctx ctx = { 0, }; R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes? - hashtab *h = hash_create(ctx.len); + hashtab *h = hash_create(ctx.indexNamesLen); PROTECT(h->prot); ctx.indexNames = h; R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names