From a43f30098148104bb08ad5b6046df2666f266ea1 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 19 Dec 2025 11:55:16 +0100 Subject: [PATCH] fread no longer skips on skip=0 --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 7 +++++++ src/fread.c | 6 +++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 107795c91..02570972f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -352,6 +352,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T 26. Grouping by a factor with many groups is now fast again, fixing a timing regression introduced in [#6890](https://github.com/Rdatatable/data.table/pull/6890) where UTF-8 coercion and level remapping were performed unnecessarily, [#7404](https://github.com/Rdatatable/data.table/issues/7404). Thanks @ben-schwen for the report and fix. +27. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. + ### NOTES 1. 
The following in-progress deprecations have proceeded: diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d7fe0f7f9..098e5d3e6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21896,3 +21896,10 @@ DT = data.table(x = strings) setorder(DT, x) test(2350, DT[["x"]], sort.int(strings, method='radix')) rm(DT, strings) + +# fread doesn't skip on skip=0, #7463 +txt = 'a1;a2\nb1;b2;b3\nc1;c2;c3' +test(2351.1, fread(txt, skip=0), data.table(V1 = c("b1", "c1"), a1 = c("b2", "c2"), a2 = c("b3", "c3")), warning="Added an extra default column name") +test(2351.2, fread(txt, skip=0, header=TRUE), data.table(V1 = c("b1", "c1"), a1 = c("b2", "c2"), a2 = c("b3", "c3")), warning="Added an extra default column name") +test(2351.3, fread(txt, skip=0, header=FALSE), data.table(V1=character(), V2=character(), V3=character()), warning="Consider fill=TRUE") +test(2351.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"), V2 = c("a2", "b2", "c2"), V3 = c("", "b3", "c3"))) diff --git a/src/fread.c b/src/fread.c index beca9a2f4..9d16397f7 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2190,7 +2190,7 @@ int freadMain(freadMainArgs _args) } } - if (args.header == NA_BOOL8 && prevStart != NULL) { + if (prevStart != NULL && (args.header == NA_BOOL8 || args.skipNrow >= 0)) { // The first data row matches types in the row after that, and user didn't override default auto detection. // Maybe previous line (if there is one, prevStart!=NULL) contains column names but there are too few (which is why it didn't become the first data row). ch = prevStart; @@ -2198,7 +2198,7 @@ int freadMain(freadMainArgs _args) if (tt == ncol) INTERNAL_STOP("row before first data row has the same number of fields but we're not using it"); // # nocov if (ch != pos) INTERNAL_STOP("ch!=pos after counting fields in the line before the first data row"); // # nocov if (verbose) DTPRINT(_("Types in 1st data row match types in 2nd data row but previous row has %d fields. 
Taking previous row as column names."), tt); - if (tt < ncol) { + if (tt < ncol && args.header != false) { autoFirstColName = (ncol - tt == 1); if (autoFirstColName) { DTWARN(_("Detected %d column names but the data has %d columns (i.e. invalid file). Added an extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.\n"), @@ -2216,7 +2216,7 @@ int freadMain(freadMainArgs _args) for (int j = ncol; j < tt; j++) { tmpType[j] = type[j] = type0; } ncol = tt; } - args.header = true; + if (args.header == NA_BOOL8) args.header = true; pos = prevStart; row1line--; }