From f40dda1c7106e3ef1c288843ef2e455e86296c3b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 16 May 2026 19:34:17 +0000 Subject: [PATCH 1/6] =?UTF-8?q?[Autoloop:=20build-tsb-pandas-typescript-mi?= =?UTF-8?q?gration]=20Iteration=20316:=20Add=20readXml()=20and=20toXml()?= =?UTF-8?q?=20=E2=80=94=20pd.read=5Fxml()=20/=20DataFrame.to=5Fxml()=20por?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zero-dep XML tokenizer supporting attributes, child elements, CDATA, entities, namespace prefix stripping, naValues, usecols, nrows, indexCol. toXml: rootName, rowName, attribs, xmlDeclaration, namespaces, indent, cdataCols. Entity encoding/decoding, full round-trip support. 50+ tests + property tests. Playground page with 9 interactive examples. Run: https://github.com/githubnext/tsb/actions/runs/25970646245 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/index.html | 5 + playground/xml.html | 462 +++++++++++++++++++++++++++++++++++++++ src/index.ts | 2 + src/io/index.ts | 2 + src/io/xml.ts | 488 ++++++++++++++++++++++++++++++++++++++++++ tests/io/xml.test.ts | 373 ++++++++++++++++++++++++++++++++ 6 files changed, 1332 insertions(+) create mode 100644 playground/xml.html create mode 100644 src/io/xml.ts create mode 100644 tests/io/xml.test.ts diff --git a/playground/index.html b/playground/index.html index 1de4cd2e..2ee81a90 100644 --- a/playground/index.html +++ b/playground/index.html @@ -501,6 +501,11 @@
readXml(text, opts?) / toXml(df, opts?) — parse XML into DataFrames and serialize back. rowTag auto-detection, attributes, CDATA, entities, namespaces, usecols, nrows, indexCol. Mirrors pandas.read_xml() / DataFrame.to_xml().
+Parse XML text into a DataFrame with
+ auto-detection of row elements, attribute and child-element columns, entity decoding,
+ CDATA support, namespace stripping, and numeric coercion. Serialize any DataFrame
+ back to well-formed XML with full formatting control. Mirrors
+ pandas.read_xml() and pandas.DataFrame.to_xml().
+ Edit any code block below and press ▶ Run
+ (or Ctrl+Enter) to execute it live in your browser.
+
The most common XML layout: a root element containing repeating row elements,
+ each with child elements as columns. readXml auto-detects the row
+ tag and coerces numeric strings automatically.
XML elements can carry data as attributes instead of (or in addition to) child
+ elements. Use attribs: true (the default) to include them.
Restrict the columns returned with usecols, limit rows with
+ nrows, and promote a column to the index with indexCol.
Built-in NA strings include "", "NA", "NaN",
+ "N/A", "null", "None", "nan".
+ Use naValues to add your own.
Named entities (&amp;, &lt;, …), decimal/hex
+ character references (&#65;, &#x41;), and
+ CDATA sections (<![CDATA[…]]>) are all handled transparently.
toXml(df) produces a well-formed XML document with an XML declaration,
+ a configurable root element, and one child element per row containing one sub-element
+ per column.
Set attribs: true to emit column values as XML attributes on each
+ row element instead of as child elements — produces more compact output.
Declare XML namespace prefixes on the root element with namespaces.
+ Wrap sensitive columns in CDATA sections with cdataCols to preserve
+ special characters literally.
Serializing a DataFrame to XML and reading it back should produce an identical + DataFrame (shape and values).
+readXml(text, opts?) / toXml(df, opts?) — parse XML into DataFrames and serialize back. rowTag auto-detection, attributes, CDATA, entities, namespaces, usecols, nrows, indexCol. Mirrors pandas.read_xml() / DataFrame.to_xml().
readTable(text, opts?) — parse delimiter-separated text into a DataFrame. Defaults to tab separator; all ReadCsvOptions forwarded. Mirrors pandas.read_table().
+readTable()
+ readTable(text, opts?) mirrors
+ pandas.read_table().
+ It parses delimiter-separated text into a DataFrame, defaulting to
+ a tab (\t) separator — unlike readCsv which defaults to a comma.
+
Edit the text below and configure options, then click Parse.
+ +readTable(text: string, options?: ReadTableOptions): DataFrame
+
+interface ReadTableOptions {
+ sep?: string; // separator (default: "\t")
+ header?: number | null; // header row index (default: 0)
+ indexCol?: string | number | null; // column to use as index
+ dtype?: Record<string, DtypeName>;
+ naValues?: string[]; // extra NA string values
+ skipRows?: number; // rows to skip after header
+ nRows?: number; // max rows to read
+}
+
+ // readTable defaults to tab separator:
+const df1 = readTable("a\tb\n1\t2"); // sep="\t" by default
+
+// readCsv defaults to comma separator:
+const df2 = readCsv("a,b\n1,2"); // sep="," by default
+
+// readTable with explicit comma sep = same as readCsv:
+const df3 = readTable("a,b\n1,2", { sep: "," }); // identical result
+
+
+
+
diff --git a/src/index.ts b/src/index.ts
index 74cf0caa..df5c7e44 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -64,6 +64,8 @@ export { readHtml } from "./io/index.ts";
export type { ReadHtmlOptions } from "./io/index.ts";
export { readXml, toXml } from "./io/index.ts";
export type { ReadXmlOptions, ToXmlOptions } from "./io/index.ts";
+export { readTable } from "./io/index.ts";
+export type { ReadTableOptions } from "./io/index.ts";
export { pearsonCorr, dataFrameCorr, dataFrameCov } from "./stats/index.ts";
export type { CorrMethod, CorrOptions, CovOptions } from "./stats/index.ts";
export { Rolling } from "./window/index.ts";
diff --git a/src/io/index.ts b/src/io/index.ts
index ca27210c..f061e4e2 100644
--- a/src/io/index.ts
+++ b/src/io/index.ts
@@ -25,6 +25,8 @@ export { readHtml } from "./read_html.ts";
export type { ReadHtmlOptions } from "./read_html.ts";
export { readXml, toXml } from "./xml.ts";
export type { ReadXmlOptions, ToXmlOptions } from "./xml.ts";
+export { readTable } from "./read_table.ts";
+export type { ReadTableOptions } from "./read_table.ts";
// readExcel / xlsxSheetNames use node:zlib and cannot be bundled for the
// browser. Import them directly from "tsb/io/read_excel" when running in
diff --git a/src/io/read_table.ts b/src/io/read_table.ts
new file mode 100644
index 00000000..b1b56253
--- /dev/null
+++ b/src/io/read_table.ts
@@ -0,0 +1,52 @@
+/**
+ * readTable — read a general delimiter-separated text file into a DataFrame.
+ *
+ * Mirrors `pandas.read_table()`:
+ * - Same signature as `readCsv` but defaults `sep` to `"\t"`.
+ * - Handles any single-character (or multi-character) delimiter.
+ * - All `ReadCsvOptions` are supported; when `sep` is omitted it falls back
+ * to `"\t"` (tab), distinguishing this function from `readCsv` (whose
+ * default is `","`).
+ *
+ * @module
+ */
+
+import { readCsv } from "./csv.ts";
+import type { ReadCsvOptions } from "./csv.ts";
+import type { DataFrame } from "../core/index.ts";
+
+// ─── public types ────────────────────────────────────────────────────────────
+
+/**
+ * Options for {@link readTable}.
+ *
+ * Identical to {@link ReadCsvOptions} except the default `sep` is `"\t"`.
+ */
+export interface ReadTableOptions extends ReadCsvOptions {
+ /** Column separator. When omitted, {@link readTable} falls back to `"\t"` (tab). */
+ readonly sep?: string;
+}
+
+// ─── implementation ──────────────────────────────────────────────────────────
+
+/**
+ * Read delimiter-separated text into a {@link DataFrame}, tab-delimited by default.
+ *
+ * Port of `pandas.read_table()`: every option is forwarded to {@link readCsv}
+ * unchanged, and `"\t"` is substituted only when `options.sep` is null or undefined.
+ *
+ * ```ts
+ * import { readTable } from "tsb";
+ *
+ * const tsv = "name\tage\tcity\nAlice\t30\tNY\nBob\t25\tLA";
+ * const df = readTable(tsv);
+ * // DataFrame with columns: name, age, city
+ * ```
+ *
+ * @param text Raw text content of the file.
+ * @param options Parsing options (see {@link ReadTableOptions}).
+ * @returns Whatever {@link readCsv} produces for `text` with the resolved separator.
+ */
+export function readTable(text: string, options: ReadTableOptions = {}): DataFrame {
+ return readCsv(text, { ...options, sep: options.sep ?? "\t" });
+}
diff --git a/tests/io/read_table.test.ts b/tests/io/read_table.test.ts
new file mode 100644
index 00000000..274213cb
--- /dev/null
+++ b/tests/io/read_table.test.ts
@@ -0,0 +1,310 @@
+/**
+ * Tests for src/io/read_table.ts — readTable().
+ *
+ * Mirrors pandas.read_table() test suite:
+ * - default tab separator
+ * - custom separator
+ * - all ReadCsvOptions are forwarded
+ * - property-based round-trips
+ */
+import { describe, expect, it } from "bun:test";
+import fc from "fast-check";
+import { DataFrame, readCsv, readTable } from "../../src/index.ts";
+
+// ─── basic parsing ───────────────────────────────────────────────────────────
+
+describe("readTable β basic TSV parsing", () => { // no options passed: exercises the tab default
+ it("parses a simple tab-separated file", () => {
+ const tsv = "name\tage\tcity\nAlice\t30\tNY\nBob\t25\tLA";
+ const df = readTable(tsv);
+ expect(df.shape).toEqual([2, 3]);
+ expect([...df.columns.values]).toEqual(["name", "age", "city"]);
+ expect([...df.col("name").values]).toEqual(["Alice", "Bob"]);
+ expect([...df.col("age").values]).toEqual([30, 25]);
+ expect([...df.col("city").values]).toEqual(["NY", "LA"]);
+ });
+
+ it("infers integer dtype for numeric columns", () => {
+ const tsv = "x\ty\n1\t2\n3\t4";
+ const df = readTable(tsv);
+ expect(df.col("x").dtype.name).toBe("int64");
+ expect(df.col("y").dtype.name).toBe("int64");
+ });
+
+ it("infers float dtype", () => {
+ const tsv = "a\tb\n1.5\t2.7\n3.1\t4.9";
+ const df = readTable(tsv);
+ expect(df.col("a").dtype.name).toBe("float64");
+ });
+
+ it("keeps string columns as object dtype", () => {
+ const tsv = "name\tval\nAlice\t10\nBob\t20";
+ const df = readTable(tsv);
+ expect(df.col("name").dtype.name).toBe("object");
+ });
+
+ it("handles a single column", () => {
+ const tsv = "x\n1\n2\n3"; // no tab anywhere: each line is a single field
+ const df = readTable(tsv);
+ expect(df.shape).toEqual([3, 1]);
+ expect([...df.col("x").values]).toEqual([1, 2, 3]);
+ });
+
+ it("handles empty file (header only)", () => {
+ const tsv = "a\tb\tc";
+ const df = readTable(tsv);
+ expect(df.shape).toEqual([0, 3]); // zero data rows, header columns kept
+ });
+
+ it("handles NA values in columns", () => {
+ const tsv = "a\tb\n1\tNA\n2\t3";
+ const df = readTable(tsv);
+ expect(Number.isNaN(df.col("b").values[0])).toBe(true);
+ expect(df.col("b").values[1]).toBe(3);
+ });
+
+ it("handles empty string fields as NaN for numeric columns", () => {
+ const tsv = "a\tb\n1\t\n2\t4"; // first data row has an empty `b` field
+ const df = readTable(tsv);
+ expect(Number.isNaN(df.col("b").values[0])).toBe(true);
+ });
+});
+
+// ─── custom separator ────────────────────────────────────────────────────────
+
+describe("readTable β custom separator", () => { // an explicit `sep` replaces the tab default
+ it("uses comma separator when explicitly passed", () => {
+ const csv = "a,b,c\n1,2,3";
+ const df = readTable(csv, { sep: "," });
+ expect(df.shape).toEqual([1, 3]);
+ expect([...df.col("a").values]).toEqual([1]);
+ });
+
+ it("uses pipe separator", () => {
+ const piped = "a|b|c\n1|2|3\n4|5|6";
+ const df = readTable(piped, { sep: "|" });
+ expect(df.shape).toEqual([2, 3]);
+ expect([...df.col("b").values]).toEqual([2, 5]);
+ });
+
+ it("uses semicolon separator", () => {
+ const text = "x;y\n10;20\n30;40";
+ const df = readTable(text, { sep: ";" });
+ expect([...df.col("x").values]).toEqual([10, 30]);
+ expect([...df.col("y").values]).toEqual([20, 40]);
+ });
+
+ it("uses multi-char separator", () => {
+ const text = "a::b::c\n1::2::3"; // two-character delimiter
+ const df = readTable(text, { sep: "::" });
+ expect([...df.col("a").values]).toEqual([1]);
+ expect([...df.col("c").values]).toEqual([3]);
+ });
+});
+
+// ─── ReadCsvOptions forwarding ───────────────────────────────────────────────
+
+describe("readTable β ReadCsvOptions forwarding", () => { // one test per forwarded option
+ it("respects indexCol option", () => {
+ const tsv = "id\tval\n1\t10\n2\t20";
+ const df = readTable(tsv, { indexCol: "id" });
+ expect([...df.index.values]).toEqual([1, 2]);
+ expect([...df.columns.values]).toEqual(["val"]); // `id` moved out of the columns
+ });
+
+ it("respects nRows option", () => {
+ const tsv = "a\tb\n1\t2\n3\t4\n5\t6";
+ const df = readTable(tsv, { nRows: 2 });
+ expect(df.shape).toEqual([2, 2]);
+ expect([...df.col("a").values]).toEqual([1, 3]);
+ });
+
+ it("respects skipRows option", () => {
+ const tsv = "a\tb\n1\t2\n3\t4\n5\t6";
+ const df = readTable(tsv, { skipRows: 1 });
+ expect(df.shape).toEqual([2, 2]);
+ expect([...df.col("a").values]).toEqual([3, 5]); // first data row dropped, header kept
+ });
+
+ it("respects header: null (no header row)", () => {
+ const tsv = "1\t2\t3\n4\t5\t6";
+ const df = readTable(tsv, { header: null });
+ expect(df.shape).toEqual([2, 3]);
+ // Columns are auto-assigned (0, 1, 2)
+ expect(df.columns.length).toBe(3);
+ });
+
+ it("respects dtype option", () => {
+ const tsv = "x\ty\n1\t2\n3\t4";
+ const df = readTable(tsv, { dtype: { x: "float64" } });
+ expect(df.col("x").dtype.name).toBe("float64");
+ });
+
+ it("respects naValues option", () => {
+ const tsv = "a\tb\n1\tMISSING\n2\t3";
+ const df = readTable(tsv, { naValues: ["MISSING"] });
+ expect(Number.isNaN(df.col("b").values[0])).toBe(true);
+ expect(df.col("b").values[1]).toBe(3);
+ });
+});
+
+// ─── default vs explicit separator ───────────────────────────────────────────
+
+describe("readTable vs readCsv β default separator difference", () => {
+ it("readTable defaults to tab; readCsv defaults to comma", () => {
+ const tsv = "a\tb\n1\t2";
+ const csv = "a,b\n1,2";
+
+ const dfTable = readTable(tsv);
+ const dfCsv = readCsv(csv);
+
+ expect([...dfTable.columns.values]).toEqual(["a", "b"]);
+ expect([...dfCsv.columns.values]).toEqual(["a", "b"]);
+ expect([...dfTable.col("a").values]).toEqual([1]);
+ expect([...dfCsv.col("a").values]).toEqual([1]);
+ });
+
+ it("readTable with comma-sep text treats entire line as single column", () => {
+ // Default sep=\t, so commas are NOT separators
+ const csv = "a,b\n1,2\n3,4";
+ const df = readTable(csv);
+ // The whole "a,b" is one column name
+ expect(df.columns.length).toBe(1);
+ });
+});
+
+// ─── whitespace and edge cases ───────────────────────────────────────────────
+
+describe("readTable β edge cases", () => {
+ it("handles trailing newline", () => {
+ const tsv = "a\tb\n1\t2\n";
+ const df = readTable(tsv);
+ expect(df.shape).toEqual([1, 2]); // the trailing newline must not add an empty row
+ });
+
+ it("handles Windows-style CRLF", () => {
+ const tsv = "a\tb\r\n1\t2\r\n3\t4\r\n";
+ const df = readTable(tsv);
+ expect(df.shape).toEqual([2, 2]);
+ expect([...df.col("a").values]).toEqual([1, 3]);
+ });
+
+ it("handles a large file", () => {
+ const rows = Array.from({ length: 1000 }, (_, i) => `${i}\t${i * 2}`); // row i is "i<TAB>2i"
+ const tsv = "idx\tval\n" + rows.join("\n");
+ const df = readTable(tsv);
+ expect(df.shape).toEqual([1000, 2]);
+ expect(df.col("idx").values[999]).toBe(999);
+ expect(df.col("val").values[999]).toBe(1998);
+ });
+});
+
+// ─── property-based tests ────────────────────────────────────────────────────
+
+describe("readTable β property-based", () => {
+ it("round-trips integer data through tab-separated format", () => {
+ fc.assert(
+ fc.property(
+ fc.array(
+ fc.record({ a: fc.integer({ min: -1000, max: 1000 }), b: fc.integer({ min: 0, max: 9999 }) }),
+ { minLength: 1, maxLength: 50 },
+ ),
+ (rows) => {
+ const lines = ["a\tb", ...rows.map((r) => `${r.a}\t${r.b}`)];
+ const tsv = lines.join("\n");
+ const df = readTable(tsv);
+ expect(df.shape).toEqual([rows.length, 2]);
+ for (let i = 0; i < rows.length; i++) {
+ expect(df.col("a").values[i]).toBe(rows[i]!.a);
+ expect(df.col("b").values[i]).toBe(rows[i]!.b);
+ }
+ },
+ ),
+ );
+ });
+
+ it("produces same result as readCsv with matching sep", () => {
+ fc.assert(
+ fc.property(
+ fc.array(
+ fc.record({ x: fc.float({ min: -100, max: 100, noNaN: true }) }),
+ { minLength: 1, maxLength: 30 },
+ ),
+ (rows) => {
+ const lines = ["x", ...rows.map((r) => String(r.x))];
+ const tsv = lines.join("\n");
+ const dfTable = readTable(tsv, { sep: "\t" });
+ const dfCsv = readCsv(tsv, { sep: "\t" });
+ expect(dfTable.shape).toEqual(dfCsv.shape);
+ // Same text and same separator, so the parsed values must agree element-wise.
+ expect([...dfTable.col("x").values]).toEqual([...dfCsv.col("x").values]);
+ },
+ ),
+ );
+ });
+
+ it("defaults to tab sep when sep is omitted (tab-free text is one column)", () => {
+ fc.assert(
+ fc.property(
+ fc.array(fc.integer({ min: 0, max: 9999 }), { minLength: 1, maxLength: 20 }),
+ (vals) => {
+ const lines = ["v", ...vals.map(String)];
+ const text = lines.join("\n");
+ const dfTable = readTable(text);
+ // Default sep=\t, and our data has no tabs, so every line is one field:
+ // expect one row per generated value and exactly one column.
+ expect(dfTable.shape).toEqual([vals.length, 1]);
+ },
+ ),
+ );
+ });
+
+ it("comma-sep round-trip: readTable({sep:','}) equals readCsv", () => {
+ fc.assert(
+ fc.property(
+ fc.array(
+ fc.record({
+ col1: fc.integer({ min: 0, max: 100 }),
+ col2: fc.integer({ min: 0, max: 100 }),
+ }),
+ { minLength: 1, maxLength: 40 },
+ ),
+ (rows) => {
+ const csv = "col1,col2\n" + rows.map((r) => `${r.col1},${r.col2}`).join("\n");
+ const dfTable = readTable(csv, { sep: "," });
+ const dfCsv = readCsv(csv); // readCsv's own default separator is the comma
+ expect(dfTable.shape).toEqual(dfCsv.shape);
+ for (let i = 0; i < rows.length; i++) {
+ expect(dfTable.col("col1").values[i]).toBe(dfCsv.col("col1").values[i]);
+ expect(dfTable.col("col2").values[i]).toBe(dfCsv.col("col2").values[i]);
+ }
+ },
+ ),
+ );
+ });
+});
+
+// ─── DataFrame integration ───────────────────────────────────────────────────
+
+describe("readTable β DataFrame integration", () => {
+ it("returns a proper DataFrame instance", () => {
+ const df = readTable("a\tb\n1\t2");
+ expect(df).toBeInstanceOf(DataFrame);
+ });
+
+ it("can chain DataFrame methods after readTable", () => {
+ const tsv = "a\tb\tc\n1\t2\t3\n4\t5\t6\n7\t8\t9";
+ const df = readTable(tsv);
+ const filtered = df.filter(["a", "c"]); // keep columns a and c; all three rows remain
+ expect(filtered.shape).toEqual([3, 2]);
+ expect([...filtered.columns.values]).toEqual(["a", "c"]);
+ });
+
+ it("supports multi-row operations on parsed data", () => {
+ const tsv = "x\ty\n10\t20\n30\t40\n50\t60";
+ const df = readTable(tsv);
+ // Sum via reduce
+ const sumX = [...df.col("x").values].reduce((a, b) => (a as number) + (b as number), 0);
+ expect(sumX).toBe(90);
+ });
+});
From 5bc378ac46ede19857946f1e8c5589c12f912e2e Mon Sep 17 00:00:00 2001
From: Russell Horton Conditional value selection using CASE WHEN semantics — mirrors pandas.Series.case_when() (pandas 2.2+).
caseWhen(series, caselist) applies an ordered list of [condition, replacement] pairs. The first matching condition determines the output; if no condition matches the original value is kept.
Conditions can be boolean Series objects (e.g. from comparison operations).
Conditions can be predicate functions (value, index) => boolean.
Replacements can be Series objects β the matching positional value is used.
Any row not matched by any condition retains its original value β there is no implicit "else" replacement.
+When multiple conditions match the same row, the first one in caselist takes effect — just like CASE WHEN … THEN … WHEN … THEN … END in SQL.
Predicate functions receive both the value and its positional index as the second argument.
+caseWhen works on any Series type β numbers, strings, booleans, or mixed.
caseWhen generalises whereSeries to multiple branches. Use whereSeries for a single condition; use caseWhen for multi-branch logic.
caseWhen(series, caselist) — conditional value selection using ordered CASE WHEN semantics. Mirrors pandas.Series.case_when() (pandas 2.2+).
+