From 21027553661ea72d3a17f26c43e92a33d6211034 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 17 Oct 2025 23:33:42 +0200 Subject: [PATCH 1/3] fread can now select= and drop= with duplicated column names --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 4 +++ src/freadR.c | 60 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1ef9396f7..baea2d8b5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -332,6 +332,8 @@ 19. Ellipsis elements like `..1` are correctly excluded when searching for variables in "up-a-level" syntax inside `[`, [#5460](https://github.com/Rdatatable/data.table/issues/5460). Thanks @ggrothendieck for the report and @MichaelChirico for the fix. +20. `fread()` honors now duplicate column names when using `select=` or `drop=`, so every matching column is kept or removed consistently, [#1899](https://github.com/Rdatatable/data.table/issues/1899). Thanks @MichaelChirico for the report and @ben-schwen for the fix. + ### NOTES 1. The following in-progress deprecations have proceeded: diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b2142520d..e407c9568 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21685,3 +21685,7 @@ d3 = unserialize(serialize(d2, NULL)) test(2340.05, .selfref.ok(d3), FALSE) setDT(d3) test(2340.06, .selfref.ok(d3), TRUE) + +# fread select= and drop= with duplicated column names #6729 +test(2341.1, fread("x1,x1,y\n2,3,4", select="x1"), data.table(x1=2L, x1=3L)) +test(2342.2, fread("x1,x1,y\n2,3,4", drop="x1"), data.table(y=4L)) diff --git a/src/freadR.c b/src/freadR.c index 11bf02759..ba2cb2ff6 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -231,8 +231,35 @@ SEXP freadR( static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) { if (!length(items)) return; - const SEXP itemsInt = PROTECT(isString(items) ? chmatch(items, colNamesSxp, NA_INTEGER) : coerceVector(items, INTSXP)); - const int* const itemsD = INTEGER(itemsInt); + if (isString(items)) { + const int nItems = LENGTH(items); + SEXP matches = PROTECT(chmatch(colNamesSxp, items, NA_INTEGER)); + const int *matchD = INTEGER(matches); + bool *found = (bool *)R_alloc((size_t)nItems, sizeof(*found)); // remembers which drop entries matched at least once + memset(found, 0, (size_t)nItems * sizeof(*found)); + for (int col = 0; col < ncol; col++) { + const int idx = matchD[col]; + if (idx == NA_INTEGER) continue; + type[col] = CT_DROP; + found[idx - 1] = true; + } + for (int j = 0; j < nItems; j++) { + char buff[50]; + if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate + else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate + SEXP thisItem = STRING_ELT(items, j); + if (thisItem == NA_STRING) { + DTWARN(_("%s is NA"), buff); + continue; + } + if (!found[j]) + DTWARN(_("Column name '%s' (%s) not found"), CHAR(thisItem), buff); + } + UNPROTECT(1); + return; + } + SEXP itemsInt = PROTECT(coerceVector(items, INTSXP)); + const int *itemsD = INTEGER(itemsInt); const int n = LENGTH(itemsInt); for (int j = 0; j < n; j++) { const int k = itemsD[j]; @@ -240,14 +267,8 @@ static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) char buff[50]; if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate - if (k == NA_INTEGER) { - if (isString(items)) - DTWARN(_("Column name '%s' (%s) not found"), CHAR(STRING_ELT(items, j)), buff); - else - DTWARN(_("%s is NA"), buff); - } else { - DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol); - } + if (k == NA_INTEGER) DTWARN(_("%s is NA"), buff); + else DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol); } else { type[k - 1] = CT_DROP; // aside: dropping the same column several times is acceptable with no warning. This could arise via duplicates in the drop= vector, @@ -300,6 +321,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int } SET_VECTOR_ELT(RCHK, 3, selectRank = allocVector(INTSXP, ncol)); int *selectRankD = INTEGER(selectRank), rank = 1; + for (int i = 0; i < ncol; i++) selectRankD[i] = 0; for (int i = 0; i < n; i++) { const int k = selectInts[i]; if (k == NA_INTEGER) continue; // missing column name warned above and skipped @@ -310,6 +332,24 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int type[k - 1] *= -1; // detect and error on duplicates on all types without calling duplicated() at all selectRankD[k - 1] = rank++; // rank not i to skip missing column names } + if (isString(selectSxp)) { + for (int i = 0; i < n; i++) { + const int k = selectInts[i]; + if (k == NA_INTEGER) continue; // skip NA matches + const SEXP targetS = STRING_ELT(selectSxp, i); + if (targetS == NA_STRING) continue; // skip empty names + const char *target = CHAR(targetS); + for (int col = 0; col < ncol; col++) { + if (selectRankD[col]) continue; // already marked by first match (including duplicates handled earlier) + SEXP nameS = STRING_ELT(colNamesSxp, col); + if (nameS == NA_STRING) continue; + if (strcmp(CHAR(nameS), target) == 0) { + type[col] *= -1; // flip type to flag + selectRankD[col] = rank++; + } + } + } + } for (int i = 0; i < ncol; i++) { if (type[i] < 0) type[i] *= -1; else type[i] = CT_DROP; From fc8b409fb29b942e52c84397f3bfeaccb1a756b6 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 01:48:56 +0200 Subject: [PATCH 2/3] add coverage --- inst/tests/tests.Rraw | 1 + src/freadR.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e407c9568..697409a9c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21689,3 +21689,4 @@ test(2340.06, .selfref.ok(d3), TRUE) # fread select= and drop= with duplicated column names #6729 test(2341.1, fread("x1,x1,y\n2,3,4", select="x1"), data.table(x1=2L, x1=3L)) test(2342.2, fread("x1,x1,y\n2,3,4", drop="x1"), data.table(y=4L)) +test(2342.3, fread("x1,x1,y\n2,3,4", drop=NA_character_), data.table(x1=2L, x1=3L, y=4L), warning="drop.*=NA") diff --git a/src/freadR.c b/src/freadR.c index ba2cb2ff6..e0ce42210 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -249,7 +249,7 @@ static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate SEXP thisItem = STRING_ELT(items, j); if (thisItem == NA_STRING) { - DTWARN(_("%s is NA"), buff); + DTWARN(_("%s=NA"), buff); continue; } if (!found[j]) @@ -267,7 +267,7 @@ static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) char buff[50]; if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate - if (k == NA_INTEGER) DTWARN(_("%s is NA"), buff); + if (k == NA_INTEGER) DTWARN(_("%s=NA"), buff); else DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol); } else { type[k - 1] = CT_DROP; From 80805b4faf1fab699acf2a8789511501e196b9c7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 02:29:38 +0200 Subject: [PATCH 3/3] fix tests --- inst/tests/tests.Rraw | 2 +- src/freadR.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 697409a9c..94a0a67ea 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21689,4 +21689,4 @@ test(2340.06, .selfref.ok(d3), TRUE) # fread select= and drop= with duplicated column names #6729 test(2341.1, fread("x1,x1,y\n2,3,4", select="x1"), data.table(x1=2L, x1=3L)) test(2342.2, fread("x1,x1,y\n2,3,4", drop="x1"), data.table(y=4L)) -test(2342.3, fread("x1,x1,y\n2,3,4", drop=NA_character_), data.table(x1=2L, x1=3L, y=4L), warning="drop.*=NA") +test(2342.3, fread("x1,x1,y\n2,3,4", drop=NA_character_), data.table(x1=2L, x1=3L, y=4L), warning="drop.*is NA") diff --git a/src/freadR.c b/src/freadR.c index e0ce42210..ba2cb2ff6 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -249,7 +249,7 @@ static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate SEXP thisItem = STRING_ELT(items, j); if (thisItem == NA_STRING) { - DTWARN(_("%s=NA"), buff); + DTWARN(_("%s is NA"), buff); continue; } if (!found[j]) @@ -267,7 +267,7 @@ static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) char buff[50]; if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate - if (k == NA_INTEGER) DTWARN(_("%s=NA"), buff); + if (k == NA_INTEGER) DTWARN(_("%s is NA"), buff); else DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol); } else { type[k - 1] = CT_DROP;