diff --git a/src/assign.c b/src/assign.c index 849cb08f2..bc07cf2c6 100644 --- a/src/assign.c +++ b/src/assign.c @@ -256,6 +256,103 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) { return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0])); } +struct attrib_name_ctx { + hashtab *indexNames; // stores a 1/0 mark for every CHARSXP index name + R_xlen_t indexNamesLen; // how much memory to allocate for the hash? + SEXP index; // attr(DT, "index") + SEXP assignedNames; // STRSXP vector of variable names just assigned + bool verbose; +}; + +// Mark each CHARSXP attribute name with a 1 inside the hash, or count them to find out the allocation size. +static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) { + (void)val; + struct attrib_name_ctx *ctx = ctx_; + if (ctx->indexNames) + hash_set(ctx->indexNames, PRINTNAME(key), 1); + else + ctx->indexNamesLen++; + return NULL; +} + +// For a given index, find out if it sorts a column that has just been assigned. If so, shorten the index (if an equivalent one doesn't already exist) or remove it altogether. +static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) { + const struct attrib_name_ctx *ctx = ctx_; + + hashtab *indexNames = ctx->indexNames; + SEXP index = ctx->index, assignedNames = ctx->assignedNames; + R_xlen_t indexLength = xlength(value); + bool verbose = ctx->verbose; + + const char *tc1, *c1; + tc1 = c1 = CHAR(PRINTNAME(tag)); // the index name; e.g. "__col1__col2" + + if (*tc1!='_' || *(tc1+1)!='_') { + // fix for #1396 + if (verbose) { + Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); + } + setAttrib(index, tag, R_NilValue); + return NULL; + } + + tc1 += 2; // tc1 always marks the start of a key column + if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov + + void *vmax = vmaxget(); + // check the position of the first appearance of an assigned column in the index. + // the new index will be truncated to this position. + size_t newKeyLength = strlen(c1); + char *s4 = R_alloc(newKeyLength + 3, 1); + memcpy(s4, c1, newKeyLength); + memcpy(s4 + newKeyLength, "__", 3); + + for(int i = 0; i < xlength(assignedNames); i++){ + const char *tc2 = CHAR(STRING_ELT(assignedNames, i)); + void *vmax2 = vmaxget(); + size_t tc2_len = strlen(tc2); + char *s5 = R_alloc(tc2_len + 5, 1); //4 * '_' + \0 + memcpy(s5, "__", 2); + memcpy(s5 + 2, tc2, tc2_len); + memcpy(s5 + 2 + tc2_len, "__", 3); + tc2 = strstr(s4, s5); + if(tc2 && (tc2 - s4 < newKeyLength)){ // new column is part of key; match is before last match + newKeyLength = tc2 - s4; + } + vmaxset(vmax2); + } + + s4[newKeyLength] = '\0'; // truncate the new key to the new length + if(newKeyLength == 0){ // no valid key column remains. Drop the key + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, PRINTNAME(tag), 0); + if (verbose) { + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + } else if(newKeyLength < strlen(c1)) { + SEXP s4Str = PROTECT(mkChar(s4)); + if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) + !hash_lookup(indexNames, s4Str, 0)) { // index with shortened name not present yet + setAttrib(index, installChar(s4Str), value); + hash_set(indexNames, PRINTNAME(tag), 0); + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, s4Str, 1); + if (verbose) + Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4+2); + } else { // indexLength > 0 || shortened name present already + // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) + // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, PRINTNAME(tag), 0); + if (verbose) + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + UNPROTECT(1); // s4Str + } //else: index is not affected by assign: nothing to be done + vmaxset(vmax); + return NULL; +} + int *_Last_updated = NULL; SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) @@ -264,12 +361,12 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // newcolnames : add these columns (if any) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign - R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; - SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames; + R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum; + SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted const char *c1, *tc1, *tc2; - int *buf, indexNo; + int *buf; if (isNull(dt)) error(_("assign has been passed a NULL dt")); if (TYPEOF(dt) != VECSXP) error(_("dt passed to assign isn't type VECSXP")); if (islocked(dt)) @@ -549,93 +646,17 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } index = getAttrib(dt, install("index")); if (index != R_NilValue) { - s = ATTRIB(index); - indexNo = 0; - // get a vector with all index names - PROTECT(indexNames = allocVector(STRSXP, xlength(s))); protecti++; - while(s != R_NilValue){ - SET_STRING_ELT(indexNames, indexNo, PRINTNAME(TAG(s))); - indexNo++; - s = CDR(s); - } - s = ATTRIB(index); // reset to first element - indexNo = 0; - while(s != R_NilValue) { - a = TAG(s); - indexLength = xlength(CAR(s)); - tc1 = c1 = CHAR(PRINTNAME(a)); // the index name; e.g. "__col1__col2" - if (*tc1!='_' || *(tc1+1)!='_') { - // fix for #1396 - if (verbose) { - Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); - } - setAttrib(index, a, R_NilValue); - indexNo++; - s = CDR(s); - continue; // with next index - } - tc1 += 2; // tc1 always marks the start of a key column - if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov - // check the position of the first appearance of an assigned column in the index. - // the new index will be truncated to this position. - char *s4 = malloc(strlen(c1) + 3); - if (!s4) { - internal_error(__func__, "Couldn't allocate memory for s4"); // # nocov - } - memcpy(s4, c1, strlen(c1)); - memset(s4 + strlen(c1), '\0', 1); - strcat(s4, "__"); // add trailing '__' to newKey so we can search for pattern '__colName__' also at the end of the index. - int newKeyLength = strlen(c1); - for(int i = 0; i < xlength(assignedNames); i++){ - tc2 = CHAR(STRING_ELT(assignedNames, i)); - char *s5 = malloc(strlen(tc2) + 5); //4 * '_' + \0 - if (!s5) { - free(s4); // # nocov - internal_error(__func__, "Couldn't allocate memory for s5"); // # nocov - } - memset(s5, '_', 2); - memset(s5 + 2, '\0', 1); - strcat(s5, tc2); - strcat(s5, "__"); - tc2 = strstr(s4, s5); - if(tc2 == NULL){ // column is not part of key - free(s5); - continue; - } - if(tc2 - s4 < newKeyLength){ // new column match is before last match - newKeyLength = tc2 - s4; - } - free(s5); - } - memset(s4 + newKeyLength, '\0', 1); // truncate the new key to the new length - if(newKeyLength == 0){ // no valid key column remains. Drop the key - setAttrib(index, a, R_NilValue); - SET_STRING_ELT(indexNames, indexNo, NA_STRING); - if (verbose) { - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - } else if(newKeyLength < strlen(c1)) { - SEXP s4Str = PROTECT(mkString(s4)); - if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) - LOGICAL(chin(s4Str, indexNames))[0] == 0) {// index with shortened name not present yet - SET_TAG(s, install(s4)); - SET_STRING_ELT(indexNames, indexNo, mkChar(s4)); - if (verbose) - Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4 + 2); - } else { // indexLength > 0 || shortened name present already - // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) - // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. - setAttrib(index, a, R_NilValue); - SET_STRING_ELT(indexNames, indexNo, NA_STRING); - if (verbose) - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - UNPROTECT(1); // s4Str - } //else: index is not affected by assign: nothing to be done - free(s4); - indexNo ++; - s = CDR(s); - } + struct attrib_name_ctx ctx = { 0, }; + R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes? + hashtab *h = hash_create(ctx.indexNamesLen); + PROTECT(h->prot); + ctx.indexNames = h; + R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names + ctx.index = index; + ctx.assignedNames = assignedNames; + ctx.verbose = verbose; + R_mapAttrib(index, fixIndexAttrib, &ctx); // adjust indices as needed + UNPROTECT(1); // h } if (ndelete) { // delete any columns assigned NULL (there was a 'continue' earlier in loop above) diff --git a/src/data.table.h b/src/data.table.h index 0bfcb09d8..2f1036848 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -15,6 +15,8 @@ #endif #if R_VERSION < R_Version(4, 5, 0) # define isDataFrame(x) isFrame(x) // #6180 +# define CLEAR_ATTRIB(x) SET_ATTRIB(x, R_NilValue) +# define ANY_ATTRIB(x) (!(isNull(ATTRIB(x)))) #endif #include #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT @@ -103,6 +105,11 @@ } # define R_resizeVector(x, newlen) R_resizeVector_(x, newlen) #endif +// TODO(R>=4.6.0): remove the SVN revision check +#if R_VERSION < R_Version(4, 6, 0) || R_SVN_REVISION < 89194 +# define BACKPORT_MAP_ATTRIB +# define R_mapAttrib(x, fun, ctx) R_mapAttrib_(x, fun, ctx) +#endif // init.c extern SEXP char_integer64; @@ -343,6 +350,9 @@ SEXP R_allocResizableVector_(SEXPTYPE type, R_xlen_t maxlen); SEXP R_duplicateAsResizable_(SEXP x); void R_resizeVector_(SEXP x, R_xlen_t newlen); #endif +#ifdef BACKPORT_MAP_ATTRIB +SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx); +#endif // types.c char *end(char *start); diff --git a/src/dogroups.c b/src/dogroups.c index 373242516..21c12618a 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,6 +3,8 @@ #include #include +static SEXP attribWalker(SEXP key, SEXP val, void *ctx); + static bool anySpecialStatic(SEXP x, hashtab * specials) { // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups @@ -54,20 +56,23 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { list_el = VECTOR_ELT(x,i); if (anySpecialStatic(list_el, specials)) return true; - for(attribs = ATTRIB(list_el); attribs != R_NilValue; attribs = CDR(attribs)) { - if (anySpecialStatic(CAR(attribs), specials)) - return true; // #4936 - } + if (R_mapAttrib(list_el, attribWalker, specials)) + return true; // #4936 } } return false; } +static SEXP attribWalker(SEXP key, SEXP val, void *specials) { + (void)key; + return anySpecialStatic(val, specials) ? R_NilValue : NULL; +} + SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; int nprotect=0; - SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source; + SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, s, RHS, target, source; Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; const bool verbose = LOGICAL(verboseArg)[0]==1; double tstart=0, tblock[10]={0}; int nblock[10]={0}; // For verbose printing, tstart is updated each block @@ -130,12 +135,15 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX R_LockBinding(install(".I"), env); SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by" - // fetch rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to + + // override rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to // non data.table aware package that uses rownames - for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s)); // getAttrib0 basically but that's hidden in attrib.c; #loop_counter_not_local_scope_ok - if (s==R_NilValue) error(_("row.names attribute of .SD not found")); - rownames = CAR(s); - if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]); + SEXP rownames; + PROTECT_INDEX rownamesi; + PROTECT_WITH_INDEX(rownames = allocVector(INTSXP, 2), &rownamesi); nprotect++; + INTEGER(rownames)[0] = NA_INTEGER; + INTEGER(rownames)[1] = -maxGrpSize; + REPROTECT(rownames = setAttrib(SD, R_RowNamesSymbol, rownames), rownamesi); // fetch names of .SD and prepare symbols. In case they are copied-on-write by user assigning to those variables // using <- in j (which is valid, useful and tested), they are repointed to the .SD cols for each group. diff --git a/src/mergelist.c b/src/mergelist.c index 51f28d224..90854ae82 100644 --- a/src/mergelist.c +++ b/src/mergelist.c @@ -17,18 +17,21 @@ SEXP copyCols(SEXP x, SEXP cols) { return R_NilValue; } +static SEXP setDuplicateOneAttrib(SEXP key, SEXP val, void *x) { + setAttrib(x, PROTECT(key), PROTECT(shallow_duplicate(val))); + UNPROTECT(2); + return NULL; // continue +} + void mergeIndexAttrib(SEXP to, SEXP from) { if (!isInteger(to) || LENGTH(to)!=0) internal_error(__func__, "'to' must be integer() already"); // # nocov if (isNull(from)) return; - SEXP t = ATTRIB(to), f = ATTRIB(from); - if (isNull(t)) // target has no attributes -> overwrite - SET_ATTRIB(to, shallow_duplicate(f)); - else { - for (t = ATTRIB(to); CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to - SETCDR(t, shallow_duplicate(f)); - } + if (!ANY_ATTRIB(to)) // target has no attributes -> overwrite + SHALLOW_DUPLICATE_ATTRIB(to, from); + else + R_mapAttrib(from, setDuplicateOneAttrib, to); } SEXP cbindlist(SEXP x, SEXP copyArg) { @@ -84,7 +87,7 @@ SEXP cbindlist(SEXP x, SEXP copyArg) { key = getAttrib(thisx, sym_sorted); UNPROTECT(protecti); // thisnames, thisxcol } - if (isNull(ATTRIB(index))) + if (!ANY_ATTRIB(index)) setAttrib(ans, sym_index, R_NilValue); setAttrib(ans, R_NamesSymbol, names); setAttrib(ans, sym_sorted, key); diff --git a/src/nafill.c b/src/nafill.c index 5c9568efb..ff2a7fc34 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -218,7 +218,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S if (!binplace) { for (R_len_t i=0; i