Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T

27. `dogroups()` no longer reads beyond the resized end of over-allocated data.table list columns, [#7486](https://github.com/Rdatatable/data.table/issues/7486). While this didn't crash in practice, it is now explicitly checked for in recent R versions (r89198+). Thanks @TimTaylor and @aitap for the report and @aitap for the fix.

28. `rbindlist()` now avoids the crash when working with many non-UTF-8 column names, [#7452](https://github.com/Rdatatable/data.table/issues/7452). Thanks @aitap for the report and the fix.

### NOTES

1. The following in-progress deprecations have proceeded:
Expand Down
9 changes: 9 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21900,3 +21900,12 @@ rm(DT, strings)
DT = unserialize(serialize(as.data.table(mtcars), NULL))
test(2351, DT[,carb:=NULL], as.data.table(mtcars)[,carb:=NULL])
rm(DT)

# rbindlist did not protect the temporary UTF-8 strings, #7452
DTn = apply(matrix(as.raw(rep(0xa1:0xff, length.out = 100)), 10), 2, rawToChar)
Encoding(DTn) = 'latin1' # will need conversion to UTF-8
DTl = lapply(DTn, function(n) setNames(list(42), n))[c(1, rep(2:length(DTn), length.out = 3e5), 1)]
DT = suppressMessages(rbindlist(DTl)) # used to crash
test(2352.1, dim(DT), c(300002L, 1L))
test(2352.2, DT[[1]], rep(42, nrow(DT)))
rm(DTn, DTl, DT)
21 changes: 14 additions & 7 deletions src/rbindlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor

int *colMap=NULL; // maps each column in final result to the column of each list item
if (usenames==TRUE || usenames==NA_LOGICAL) {
// zeroth pass - convert all names to UTF-8
SEXP cnl = PROTECT(allocVector(VECSXP, XLENGTH(l)));
for (R_xlen_t i = 0; i < XLENGTH(l); ++i) {
const SEXP cn = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
if (xlength(cn)) SET_VECTOR_ELT(cnl, i, coerceUtf8IfNeeded(cn));
}
const SEXP *cnlp = SEXPPTR_RO(cnl);
// here we proceed as if fill=true for brevity (accounting for dups is tricky) and then catch any missings after this branch
// when use.names==NA we also proceed here as if use.names was TRUE to save new code and then check afterwards the map is 1:ncol for every item
// first find number of unique column names present; i.e. length(unique(unlist(lapply(l,names))))
Expand All @@ -79,11 +86,11 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
SEXP li = VECTOR_ELT(l, i);
int thisncol=LENGTH(li);
if (isNull(li) || !LENGTH(li)) continue;
const SEXP cn = getAttrib(li, R_NamesSymbol);
const SEXP cn = cnlp[i];
if (!length(cn)) continue;
const SEXP *cnp = STRING_PTR_RO(cn);
for (int j=0; j<thisncol; j++) {
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
SEXP s = cnp[j]; // convert different encodings for use.names #5452
if (hash_lookup(marks, s, 0)<0) continue; // seen this name before
nuniq++;
hash_set(marks, s,-nuniq);
Expand All @@ -104,12 +111,12 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
SEXP li = VECTOR_ELT(l, i);
int thisncol=length(li);
if (thisncol==0) continue;
const SEXP cn = getAttrib(li, R_NamesSymbol);
const SEXP cn = cnlp[i];
if (!length(cn)) continue;
const SEXP *cnp = STRING_PTR_RO(cn);
memset(counts, 0, nuniq*sizeof(*counts));
for (int j=0; j<thisncol; j++) {
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
SEXP s = cnp[j]; // convert different encodings for use.names #5452
counts[ -hash_lookup(marks, s, 0)-1 ]++;
}
for (int u=0; u<nuniq; u++) {
Expand Down Expand Up @@ -141,14 +148,14 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
SEXP li = VECTOR_ELT(l, i);
int thisncol=length(li);
if (thisncol==0) continue;
const SEXP cn = getAttrib(li, R_NamesSymbol);
const SEXP cn = cnlp[i];
if (!length(cn)) {
for (int j=0; j<thisncol; j++) colMapRaw[i*ncol + j] = j;
} else {
const SEXP *cnp = STRING_PTR_RO(cn);
memset(counts, 0, nuniq*sizeof(*counts));
for (int j=0; j<thisncol; j++) {
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
SEXP s = cnp[j]; // convert different encodings for use.names #5452
int w = -hash_lookup(marks, s, 0)-1;
int wi = counts[w]++; // how many dups have we seen before of this name within this item
if (uniqMap[w]==-1) {
Expand All @@ -167,8 +174,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
}
}
free(counts); free(uniqMap); free(dupLink); // all local scope so no need to set to NULL
UNPROTECT(2); // cnl, marks

UNPROTECT(1); // marks
// colMapRaw is still allocated. It was allocated with malloc because we needed to catch if the alloc failed.
// move it to R's heap so it gets automatically free'd on exit, and on any error between now and the end of rbindlist.
colMap = (int *)R_alloc(LENGTH(l)*ncol, sizeof(*colMap));
Expand Down
Loading