Skip to content
201 changes: 111 additions & 90 deletions src/assign.c
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,103 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0]));
}

struct attrib_name_ctx {
hashtab *indexNames; // stores a 1/0 mark for every CHARSXP index name
R_xlen_t indexNamesLen; // how much memory to allocate for the hash?
SEXP index; // attr(DT, "index")
SEXP assignedNames; // STRSXP vector of variable names just assigned
bool verbose;
};

// Mark each CHARSXP attribute name with a 1 inside the hash, or count them to find out the allocation size.
static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) {
(void)val;
struct attrib_name_ctx *ctx = ctx_;
if (ctx->indexNames)
hash_set(ctx->indexNames, PRINTNAME(key), 1);
else
ctx->indexNamesLen++;
return NULL;
}

// For a given index, find out if it sorts a column that has just been assigned. If so, shorten the index (if an equivalent one doesn't already exist) or remove it altogether.
static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is SEXP the intended return type?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's following the R_mapAttrib interface. I have to return (SEXP)NULL to continue iteration.

const struct attrib_name_ctx *ctx = ctx_;

hashtab *indexNames = ctx->indexNames;
SEXP index = ctx->index, assignedNames = ctx->assignedNames;
R_xlen_t indexLength = xlength(value);
bool verbose = ctx->verbose;

const char *tc1, *c1;
tc1 = c1 = CHAR(PRINTNAME(tag)); // the index name; e.g. "__col1__col2"

if (*tc1!='_' || *(tc1+1)!='_') {
// fix for #1396
if (verbose) {
Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1);
}
setAttrib(index, tag, R_NilValue);
return NULL;
}

tc1 += 2; // tc1 always marks the start of a key column
if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov

void *vmax = vmaxget();
// check the position of the first appearance of an assigned column in the index.
// the new index will be truncated to this position.
size_t newKeyLength = strlen(c1);
char *s4 = R_alloc(newKeyLength + 3, 1);
memcpy(s4, c1, newKeyLength);
memcpy(s4 + newKeyLength, "__", 3);

for(int i = 0; i < xlength(assignedNames); i++){
const char *tc2 = CHAR(STRING_ELT(assignedNames, i));
void *vmax2 = vmaxget();
size_t tc2_len = strlen(tc2);
char *s5 = R_alloc(tc2_len + 5, 1); //4 * '_' + \0
memcpy(s5, "__", 2);
memcpy(s5 + 2, tc2, tc2_len);
memcpy(s5 + 2 + tc2_len, "__", 3);
tc2 = strstr(s4, s5);
if(tc2 && (tc2 - s4 < newKeyLength)){ // new column is part of key; match is before last match
newKeyLength = tc2 - s4;
}
vmaxset(vmax2);
}

s4[newKeyLength] = '\0'; // truncate the new key to the new length
if(newKeyLength == 0){ // no valid key column remains. Drop the key
setAttrib(index, tag, R_NilValue);
hash_set(indexNames, PRINTNAME(tag), 0);
if (verbose) {
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
}
} else if(newKeyLength < strlen(c1)) {
SEXP s4Str = PROTECT(mkChar(s4));
if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372)
!hash_lookup(indexNames, s4Str, 0)) { // index with shortened name not present yet
setAttrib(index, installChar(s4Str), value);
hash_set(indexNames, PRINTNAME(tag), 0);
setAttrib(index, tag, R_NilValue);
hash_set(indexNames, s4Str, 1);
if (verbose)
Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4+2);
} else { // indexLength > 0 || shortened name present already
// indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372)
// shortened name already present indicates that index needs to be dropped to avoid duplicate indices.
setAttrib(index, tag, R_NilValue);
hash_set(indexNames, PRINTNAME(tag), 0);
if (verbose)
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
}
UNPROTECT(1); // s4Str
} //else: index is not affected by assign: nothing to be done
vmaxset(vmax);
return NULL;
}

int *_Last_updated = NULL;

SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
Expand All @@ -264,12 +361,12 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
// newcolnames : add these columns (if any)
// cols : column names or numbers corresponding to the values to set
// rows : row numbers to assign
R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength;
SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames;
R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum;
SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames;
bool verbose=GetVerbose();
int ndelete=0; // how many columns are being deleted
const char *c1, *tc1, *tc2;
int *buf, indexNo;
int *buf;
if (isNull(dt)) error(_("assign has been passed a NULL dt"));
if (TYPEOF(dt) != VECSXP) error(_("dt passed to assign isn't type VECSXP"));
if (islocked(dt))
Expand Down Expand Up @@ -549,93 +646,17 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
}
index = getAttrib(dt, install("index"));
if (index != R_NilValue) {
s = ATTRIB(index);
indexNo = 0;
// get a vector with all index names
PROTECT(indexNames = allocVector(STRSXP, xlength(s))); protecti++;
while(s != R_NilValue){
SET_STRING_ELT(indexNames, indexNo, PRINTNAME(TAG(s)));
indexNo++;
s = CDR(s);
}
s = ATTRIB(index); // reset to first element
indexNo = 0;
while(s != R_NilValue) {
a = TAG(s);
indexLength = xlength(CAR(s));
tc1 = c1 = CHAR(PRINTNAME(a)); // the index name; e.g. "__col1__col2"
if (*tc1!='_' || *(tc1+1)!='_') {
// fix for #1396
if (verbose) {
Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1);
}
setAttrib(index, a, R_NilValue);
indexNo++;
s = CDR(s);
continue; // with next index
}
tc1 += 2; // tc1 always marks the start of a key column
if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov
// check the position of the first appearance of an assigned column in the index.
// the new index will be truncated to this position.
char *s4 = malloc(strlen(c1) + 3);
if (!s4) {
internal_error(__func__, "Couldn't allocate memory for s4"); // # nocov
}
memcpy(s4, c1, strlen(c1));
memset(s4 + strlen(c1), '\0', 1);
strcat(s4, "__"); // add trailing '__' to newKey so we can search for pattern '__colName__' also at the end of the index.
int newKeyLength = strlen(c1);
for(int i = 0; i < xlength(assignedNames); i++){
tc2 = CHAR(STRING_ELT(assignedNames, i));
char *s5 = malloc(strlen(tc2) + 5); //4 * '_' + \0
if (!s5) {
free(s4); // # nocov
internal_error(__func__, "Couldn't allocate memory for s5"); // # nocov
}
memset(s5, '_', 2);
memset(s5 + 2, '\0', 1);
strcat(s5, tc2);
strcat(s5, "__");
tc2 = strstr(s4, s5);
if(tc2 == NULL){ // column is not part of key
free(s5);
continue;
}
if(tc2 - s4 < newKeyLength){ // new column match is before last match
newKeyLength = tc2 - s4;
}
free(s5);
}
memset(s4 + newKeyLength, '\0', 1); // truncate the new key to the new length
if(newKeyLength == 0){ // no valid key column remains. Drop the key
setAttrib(index, a, R_NilValue);
SET_STRING_ELT(indexNames, indexNo, NA_STRING);
if (verbose) {
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
}
} else if(newKeyLength < strlen(c1)) {
SEXP s4Str = PROTECT(mkString(s4));
if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372)
LOGICAL(chin(s4Str, indexNames))[0] == 0) {// index with shortened name not present yet
SET_TAG(s, install(s4));
SET_STRING_ELT(indexNames, indexNo, mkChar(s4));
if (verbose)
Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4 + 2);
} else { // indexLength > 0 || shortened name present already
// indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372)
// shortened name already present indicates that index needs to be dropped to avoid duplicate indices.
setAttrib(index, a, R_NilValue);
SET_STRING_ELT(indexNames, indexNo, NA_STRING);
if (verbose)
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
}
UNPROTECT(1); // s4Str
} //else: index is not affected by assign: nothing to be done
free(s4);
indexNo ++;
s = CDR(s);
}
struct attrib_name_ctx ctx = { 0, };
R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes?
hashtab *h = hash_create(ctx.indexNamesLen);
PROTECT(h->prot);
ctx.indexNames = h;
R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names
ctx.index = index;
ctx.assignedNames = assignedNames;
ctx.verbose = verbose;
R_mapAttrib(index, fixIndexAttrib, &ctx); // adjust indices as needed
UNPROTECT(1); // h
}
if (ndelete) {
// delete any columns assigned NULL (there was a 'continue' earlier in loop above)
Expand Down
10 changes: 10 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#endif
#if R_VERSION < R_Version(4, 5, 0)
# define isDataFrame(x) isFrame(x) // #6180
# define CLEAR_ATTRIB(x) SET_ATTRIB(x, R_NilValue)
# define ANY_ATTRIB(x) (!(isNull(ATTRIB(x))))
#endif
#include <Rinternals.h>
#define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT
Expand Down Expand Up @@ -103,6 +105,11 @@
}
# define R_resizeVector(x, newlen) R_resizeVector_(x, newlen)
#endif
// TODO(R>=4.6.0): remove the SVN revision check
#if R_VERSION < R_Version(4, 6, 0) || R_SVN_REVISION < 89194
# define BACKPORT_MAP_ATTRIB
# define R_mapAttrib(x, fun, ctx) R_mapAttrib_(x, fun, ctx)
#endif

// init.c
extern SEXP char_integer64;
Expand Down Expand Up @@ -343,6 +350,9 @@ SEXP R_allocResizableVector_(SEXPTYPE type, R_xlen_t maxlen);
SEXP R_duplicateAsResizable_(SEXP x);
void R_resizeVector_(SEXP x, R_xlen_t newlen);
#endif
#ifdef BACKPORT_MAP_ATTRIB
SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx);
#endif

// types.c
char *end(char *start);
Expand Down
28 changes: 18 additions & 10 deletions src/dogroups.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <fcntl.h>
#include <time.h>

static SEXP attribWalker(SEXP key, SEXP val, void *ctx);

static bool anySpecialStatic(SEXP x, hashtab * specials) {
// Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd
// Static because these are like C static arrays which are the same memory for each group; e.g., dogroups
Expand Down Expand Up @@ -54,20 +56,23 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) {
list_el = VECTOR_ELT(x,i);
if (anySpecialStatic(list_el, specials))
return true;
for(attribs = ATTRIB(list_el); attribs != R_NilValue; attribs = CDR(attribs)) {
if (anySpecialStatic(CAR(attribs), specials))
return true; // #4936
}
if (R_mapAttrib(list_el, attribWalker, specials))
return true; // #4936
}
}
return false;
}

static SEXP attribWalker(SEXP key, SEXP val, void *specials) {
(void)key;
return anySpecialStatic(val, specials) ? R_NilValue : NULL;
}

SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg)
{
R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp;
int nprotect=0;
SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source;
SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, s, RHS, target, source;
Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE;
const bool verbose = LOGICAL(verboseArg)[0]==1;
double tstart=0, tblock[10]={0}; int nblock[10]={0}; // For verbose printing, tstart is updated each block
Expand Down Expand Up @@ -130,12 +135,15 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
R_LockBinding(install(".I"), env);

SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by"
// fetch rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to

// override rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to
// non data.table aware package that uses rownames
for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s)); // getAttrib0 basically but that's hidden in attrib.c; #loop_counter_not_local_scope_ok
if (s==R_NilValue) error(_("row.names attribute of .SD not found"));
rownames = CAR(s);
if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]);
SEXP rownames;
PROTECT_INDEX rownamesi;
PROTECT_WITH_INDEX(rownames = allocVector(INTSXP, 2), &rownamesi); nprotect++;
INTEGER(rownames)[0] = NA_INTEGER;
INTEGER(rownames)[1] = -maxGrpSize;
REPROTECT(rownames = setAttrib(SD, R_RowNamesSymbol, rownames), rownamesi);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is wrong; check the return value of setAttrib

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider R_mapAttrib for this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

R is somewhat inconsistent about this. setAttrib(foo, R_CommentSymbol, bar) and setAttrib(foo, R_ClassSymbol, bar) return R_NilValue, but generic installAttrib and our specific case row_names_gets do return the freshly installed value. Since it's not documented, R_mapAttrib might be a safer approach.


// fetch names of .SD and prepare symbols. In case they are copied-on-write by user assigning to those variables
// using <- in j (which is valid, useful and tested), they are repointed to the .SD cols for each group.
Expand Down
19 changes: 11 additions & 8 deletions src/mergelist.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,21 @@ SEXP copyCols(SEXP x, SEXP cols) {
return R_NilValue;
}

static SEXP setDuplicateOneAttrib(SEXP key, SEXP val, void *x) {
setAttrib(x, PROTECT(key), PROTECT(shallow_duplicate(val)));
UNPROTECT(2);
return NULL; // continue
}

void mergeIndexAttrib(SEXP to, SEXP from) {
if (!isInteger(to) || LENGTH(to)!=0)
internal_error(__func__, "'to' must be integer() already"); // # nocov
if (isNull(from))
return;
SEXP t = ATTRIB(to), f = ATTRIB(from);
if (isNull(t)) // target has no attributes -> overwrite
SET_ATTRIB(to, shallow_duplicate(f));
else {
for (t = ATTRIB(to); CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to
SETCDR(t, shallow_duplicate(f));
}
if (!ANY_ATTRIB(to)) // target has no attributes -> overwrite
SHALLOW_DUPLICATE_ATTRIB(to, from);
else
R_mapAttrib(from, setDuplicateOneAttrib, to);
}

SEXP cbindlist(SEXP x, SEXP copyArg) {
Expand Down Expand Up @@ -84,7 +87,7 @@ SEXP cbindlist(SEXP x, SEXP copyArg) {
key = getAttrib(thisx, sym_sorted);
UNPROTECT(protecti); // thisnames, thisxcol
}
if (isNull(ATTRIB(index)))
if (!ANY_ATTRIB(index))
setAttrib(ans, sym_index, R_NilValue);
setAttrib(ans, R_NamesSymbol, names);
setAttrib(ans, sym_sorted, key);
Expand Down
2 changes: 1 addition & 1 deletion src/nafill.c
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S

if (!binplace) {
for (R_len_t i=0; i<nx; i++) {
if (!isNull(ATTRIB(VECTOR_ELT(x, i))))
if (ANY_ATTRIB(VECTOR_ELT(x, i)))
copyMostAttrib(VECTOR_ELT(x, i), VECTOR_ELT(ans, i));
}
SEXP obj_names = getAttrib(obj, R_NamesSymbol); // copy names
Expand Down
19 changes: 18 additions & 1 deletion src/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ SEXP frev(SEXP x, SEXP copyArg) {
SEXP levels = PROTECT(getAttrib(x, R_LevelsSymbol));
nprotect += 2;
// swipe attributes from x
SET_ATTRIB(x, R_NilValue);
CLEAR_ATTRIB(x);
setAttrib(x, R_NamesSymbol, names);
setAttrib(x, R_ClassSymbol, klass);
setAttrib(x, R_LevelsSymbol, levels);
Expand Down Expand Up @@ -672,3 +672,20 @@ void R_resizeVector_(SEXP x, R_xlen_t newlen) {
SETLENGTH(x, newlen);
}
#endif

#ifdef BACKPORT_MAP_ATTRIB
SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx) {
PROTECT_INDEX i;
SEXP a = ATTRIB(x);
PROTECT_WITH_INDEX(a, &i);

SEXP ret = NULL;
for (; !isNull(a); REPROTECT(a = CDR(a), i)) {
ret = fun(TAG(a), CAR(a), ctx);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not blocking, but if time permits consider R's structure more closely with protection of the TAG(a) and CAR(a) in teh callback.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK while a is protected, TAG(a), CAR(a), and CDR(a) should also stay protected. What do you recommend to do instead?

if (ret) break;
}

UNPROTECT(1);
return ret;
}
#endif