diff --git a/NEWS.md b/NEWS.md index bec4ada86..1c8540a9c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -354,6 +354,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T 27. `dogroups()` no longer reads beyond the resized end of over-allocated data.table list columns, [#7486](https://github.com/Rdatatable/data.table/issues/7486). While this didn't crash in practice, it is now explicitly checked for in recent R versions (r89198+). Thanks @TimTaylor and @aitap for the report and @aitap for the fix. +28. `rbindlist()` now avoids the crash when working with many non-UTF-8 column names, [#7452](https://github.com/Rdatatable/data.table/issues/7452). Thanks @aitap for the report and the fix. + ### NOTES 1. The following in-progress deprecations have proceeded: diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6087ae0e6..91e6bc580 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21900,3 +21900,12 @@ rm(DT, strings) DT = unserialize(serialize(as.data.table(mtcars), NULL)) test(2351, DT[,carb:=NULL], as.data.table(mtcars)[,carb:=NULL]) rm(DT) + +# rbindlist did not protect the temporary UTF-8 strings, #7452 +DTn = apply(matrix(as.raw(rep(0xa1:0xff, length.out = 100)), 10), 2, rawToChar) +Encoding(DTn) = 'latin1' # will need conversion to UTF-8 +DTl = lapply(DTn, function(n) setNames(list(42), n))[c(1, rep(2:length(DTn), length.out = 3e5), 1)] +DT = suppressMessages(rbindlist(DTl)) # used to crash +test(2352.1, dim(DT), c(300002L, 1L)) +test(2352.2, DT[[1]], rep(42, nrow(DT))) +rm(DTn, DTl, DT) diff --git a/src/rbindlist.c b/src/rbindlist.c index 672ffd892..764558c18 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -68,6 +68,13 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor int *colMap=NULL; // maps each column in final result to the column of each list item if (usenames==TRUE || usenames==NA_LOGICAL) { + // zeroth pass - convert all names to UTF-8 + SEXP cnl = PROTECT(allocVector(VECSXP, XLENGTH(l))); + for (R_xlen_t i = 0; i < XLENGTH(l); ++i) { + const SEXP cn = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol); + if (xlength(cn)) SET_VECTOR_ELT(cnl, i, coerceUtf8IfNeeded(cn)); + } + const SEXP *cnlp = SEXPPTR_RO(cnl); // here we proceed as if fill=true for brevity (accounting for dups is tricky) and then catch any missings after this branch // when use.names==NA we also proceed here as if use.names was TRUE to save new code and then check afterwards the map is 1:ncol for every item // first find number of unique column names present; i.e. length(unique(unlist(lapply(l,names)))) @@ -79,11 +86,11 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor SEXP li = VECTOR_ELT(l, i); int thisncol=LENGTH(li); if (isNull(li) || !LENGTH(li)) continue; - const SEXP cn = getAttrib(li, R_NamesSymbol); + const SEXP cn = cnlp[i]; if (!length(cn)) continue; const SEXP *cnp = STRING_PTR_RO(cn); for (int j=0; j