From 57021768ab7e8cd48cdc9bc7c92c99b6d0a2c4cd Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Dec 2025 00:54:58 +0100 Subject: [PATCH 1/2] add fix --- R/fread.R | 6 ++---- inst/tests/tests.Rraw | 12 +++++++++++- src/data.table.h | 2 +- src/freadR.c | 25 +++++++++++++++++++------ 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/R/fread.R b/R/fread.R index 16a72ed24..122357476 100644 --- a/R/fread.R +++ b/R/fread.R @@ -2,7 +2,7 @@ fread = function( input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), -col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL, +col.names=NULL, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), logicalYN=getOption("datatable.logicalYN", FALSE), @@ -293,7 +293,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC") tz="UTC" } ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,comment.char, - fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") + fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC",col.names) if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns nr = length(ans[[1L]]) require_bit64_if_needed(ans) @@ -356,8 +356,6 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC") for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j))) } - if (!missing(col.names)) # FR #768 - setnames(ans, col.names) # setnames checks and errors automatically if (!is.null(key) && data.table) { if (!is.character(key)) stopf("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8b151609b..70983b8c6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8072,7 +8072,7 @@ test(1557.1, names(fread(str)), c("V1", "V2")) # autonamed test(1557.2, names(fread(str, col.names=letters[1:2])), letters[1:2]) test(1557.3, names(fread(str, col.names=letters[1])), error="Can't assign 1 names to") test(1557.4, names(fread(str, col.names=letters[1:3])), error="Can't assign 3 names to") -test(1557.5, names(fread(str, col.names=1:2)), error="Passed a vector of type") +test(1557.5, names(fread(str, col.names=1:2)), error="'col.names' must be") # Fix for #773 f = testDir("issue_773_fread.txt") @@ -21901,3 +21901,13 @@ rm(DT, strings) DT = unserialize(serialize(as.data.table(mtcars), NULL)) test(2351, DT[,carb:=NULL], as.data.table(mtcars)[,carb:=NULL]) rm(DT) + +# drop works with user override colnames #3459 +DT = data.table(a=c(1L, 4L), c=c(3L, 6L)) +test(2352.1, fread("a,b,c\n1,2,3\n4,5,6", drop='b'), DT) +test(2352.2, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", drop='b'), DT) +test(2352.3, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", drop=2), DT) +test(2352.4, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", colClasses=c(b="NULL")), DT) +test(2352.5, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", select=c('a','c')), DT) +test(2352.6, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", select=c(1,3)), DT) +rm(DT) diff --git a/src/data.table.h b/src/data.table.h index 0bfcb09d8..86dbaa6e3 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -399,7 +399,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP); SEXP chmatch_R(SEXP, SEXP, SEXP); SEXP chmatchdup_R(SEXP, SEXP, SEXP); SEXP chin_R(SEXP, SEXP); -SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP setlistelt(SEXP, SEXP, SEXP); diff --git a/src/freadR.c b/src/freadR.c index b3b3d6450..b33a5a2b7 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -32,6 +32,7 @@ static colType readInt64As = CT_INT64; static SEXP selectSxp; static SEXP dropSxp; static SEXP colClassesSxp; +static SEXP cNamesSxp; static bool selectColClasses = false; cetype_t ienc = CE_NATIVE; static SEXP RCHK; @@ -77,7 +78,8 @@ SEXP freadR( SEXP integer64Arg, SEXP encodingArg, SEXP keepLeadingZerosArgs, - SEXP noTZasUTC + SEXP noTZasUTC, + SEXP colNamesArg ) { verbose = LOGICAL(verboseArg)[0]; @@ -185,6 +187,8 @@ SEXP freadR( args.readInt64As = readInt64As; colClassesSxp = colClassesArg; + if (!isNull(colNamesArg) && !isString(colNamesArg)) error(_("'col.names' must be NULL or a character vector")); + cNamesSxp = colNamesArg; selectSxp = selectArg; dropSxp = dropArg; @@ -231,10 +235,10 @@ SEXP freadR( return DT; } -static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) +static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource, SEXP matchNames) { if (!length(items)) return; - const SEXP itemsInt = PROTECT(isString(items) ? chmatch(items, colNamesSxp, NA_INTEGER) : coerceVector(items, INTSXP)); + const SEXP itemsInt = PROTECT(isString(items) ? chmatch(items, matchNames, NA_INTEGER) : coerceVector(items, INTSXP)); const int* const itemsD = INTEGER(itemsInt); const int n = LENGTH(itemsInt); for (int j = 0; j < n; j++) { @@ -277,13 +281,14 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int } } // "use either select= or drop= but not both" was checked earlier in freadR - applyDrop(dropSxp, type, ncol, /*dropSource=*/-1); + SEXP matchNames = isNull(cNamesSxp) ? colNamesSxp : cNamesSxp; + applyDrop(dropSxp, type, ncol, /*dropSource=*/-1, matchNames); if (TYPEOF(colClassesSxp) == VECSXP) { // not isNewList() because that returns true for NULL SEXP listNames = PROTECT(getAttrib(colClassesSxp, R_NamesSymbol)); // rchk wanted this protected for (int i = 0; i < LENGTH(colClassesSxp); i++) { if (STRING_ELT(listNames, i) == char_NULL) { SEXP items = VECTOR_ELT(colClassesSxp, i); - applyDrop(items, type, ncol, /*dropSource=*/i); + applyDrop(items, type, ncol, /*dropSource=*/i, matchNames); } } UNPROTECT(1); // listNames @@ -294,7 +299,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int if (length(selectSxp)) { const int n = length(selectSxp); if (isString(selectSxp)) { - selectInts = INTEGER(PROTECT(chmatch(selectSxp, colNamesSxp, NA_INTEGER))); nprotect++; + selectInts = INTEGER(PROTECT(chmatch(selectSxp, matchNames, NA_INTEGER))); nprotect++; for (int i = 0; i < n; i++) if (selectInts[i] == NA_INTEGER) DTWARN(_("Column name '%s' not found in column name header (case sensitive), skipping."), CHAR(STRING_ELT(selectSxp, i))); } else { @@ -318,6 +323,14 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int else type[i] = CT_DROP; } } + // override col names if user provided them + if (!isNull(cNamesSxp)) { + int ncnames = LENGTH(cNamesSxp); + if (ncnames != ncol) STOP(_("Can't assign %d names to a %d-column data.table"), ncnames, ncol); + for (int i = 0; i < ncol; i++) { + SET_STRING_ELT(colNamesSxp, i, STRING_ELT(cNamesSxp, i)); + } + } colClassesAs = NULL; // any coercions we can't handle here in C are deferred to R (to handle with methods::as) via this attribute if (length(colClassesSxp)) { SEXP typeRName_sxp = PROTECT(allocVector(STRSXP, NUT)); From 6719e1163a092f89e1cb8ba25b42fe7b03f65aad Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Dec 2025 01:42:49 +0100 Subject: [PATCH 2/2] add catering for header --- NEWS.md | 2 ++ src/freadR.c | 23 +++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index bec4ada86..a8d8bd88e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -354,6 +354,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T 27. `dogroups()` no longer reads beyond the resized end of over-allocated data.table list columns, [#7486](https://github.com/Rdatatable/data.table/issues/7486). While this didn't crash in practice, it is now explicitly checked for in recent R versions (r89198+). Thanks @TimTaylor and @aitap for the report and @aitap for the fix. +28. `fread()` with `col.names=` now correctly applies `drop=` and `select=` using user-provided column names instead of auto-generated names, [#3459](https://github.com/Rdatatable/data.table/issues/3459). Thanks to @malcook for the report and @ben-schwen for the fix. + ### NOTES 1. The following in-progress deprecations have proceeded: diff --git a/src/freadR.c b/src/freadR.c index b33a5a2b7..599f066ec 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -281,7 +281,8 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int } } // "use either select= or drop= but not both" was checked earlier in freadR - SEXP matchNames = isNull(cNamesSxp) ? colNamesSxp : cNamesSxp; + bool hasHeader = ncol > 0 && strcmp(CHAR(STRING_ELT(colNamesSxp, 0)), "V1") != 0; + SEXP matchNames = (isNull(cNamesSxp) || hasHeader) ? colNamesSxp : cNamesSxp; applyDrop(dropSxp, type, ncol, /*dropSource=*/-1, matchNames); if (TYPEOF(colClassesSxp) == VECSXP) { // not isNewList() because that returns true for NULL SEXP listNames = PROTECT(getAttrib(colClassesSxp, R_NamesSymbol)); // rchk wanted this protected @@ -326,9 +327,23 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int // override col names if user provided them if (!isNull(cNamesSxp)) { int ncnames = LENGTH(cNamesSxp); - if (ncnames != ncol) STOP(_("Can't assign %d names to a %d-column data.table"), ncnames, ncol); - for (int i = 0; i < ncol; i++) { - SET_STRING_ELT(colNamesSxp, i, STRING_ELT(cNamesSxp, i)); + if (hasHeader) { + int ndrop = 0; + for (int i = 0; i < ncol; i++) if (type[i] == CT_DROP) ndrop++; + if (ncnames != (ncol - ndrop)) STOP(_("Can't assign %d names to a %d-column data.table"), ncnames, ncol - ndrop); + int *selectRankD = selectRank ? INTEGER(selectRank) : NULL; + int cname_idx = 0; + for (int i = 0; i < ncol; i++) { + if (type[i] != CT_DROP) { + const int idx = selectRankD ? (selectRankD[i] - 1) : cname_idx++; + SET_STRING_ELT(colNamesSxp, i, STRING_ELT(cNamesSxp, idx)); + } + } + } else { + if (ncnames != ncol) STOP(_("Can't assign %d names to a %d-column data.table"), ncnames, ncol); + for (int i = 0; i < ncol; i++) { + SET_STRING_ELT(colNamesSxp, i, STRING_ELT(cNamesSxp, i)); + } } } colClassesAs = NULL; // any coercions we can't handle here in C are deferred to R (to handle with methods::as) via this attribute