diff --git a/benchmarks/pandas/bench_read_html.py b/benchmarks/pandas/bench_read_html.py
new file mode 100644
index 00000000..03dd0199
--- /dev/null
+++ b/benchmarks/pandas/bench_read_html.py
@@ -0,0 +1,68 @@
+"""
+Benchmark: pd.read_html — parse HTML tables into DataFrames.
+Outputs JSON: {"function": "read_html", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import io
+import json
+import math
+import subprocess
+import sys
+import time
+
+
+def _pip_install(package: str) -> None:
+    """Install *package* with pip into the interpreter running this script."""
+    subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])
+
+
+try:
+    import pandas as pd
+except ImportError:
+    _pip_install("pandas")
+    import pandas as pd
+
+try:
+    import lxml  # noqa: F401
+except ImportError:
+    # lxml is never referenced directly here; it only has to be installed.
+    _pip_install("lxml")
+
+ROWS = 1_000
+WARMUP = 3
+ITERATIONS = 20
+
+
+def build_html(rows: int) -> str:
+    """Return the markup of one HTML table with a header row and *rows* data rows.
+
+    The columns are id, name, value and score.
+    """
+    header = "<tr><th>id</th><th>name</th><th>value</th><th>score</th></tr>"
+    body_rows = [
+        f"<tr><td>{i}</td><td>item_{i % 100}</td>"
+        f"<td>{i * 1.5:.2f}</td><td>{math.sin(i * 0.01):.6f}</td></tr>"
+        for i in range(rows)
+    ]
+    return f"<table><thead>{header}</thead><tbody>{''.join(body_rows)}</tbody></table>"
+
+
+html = build_html(ROWS)
+
+# pandas 2.1 deprecated passing literal HTML to read_html, so the markup is
+# wrapped in StringIO; a fresh buffer is created per call because reading
+# consumes it.
+# Warm-up
+for _ in range(WARMUP):
+    pd.read_html(io.StringIO(html))
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    pd.read_html(io.StringIO(html))
+total_ms = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "read_html",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/tsb/bench_read_html.ts b/benchmarks/tsb/bench_read_html.ts
new file mode 100644
index 00000000..3cbc7149
--- /dev/null
+++ b/benchmarks/tsb/bench_read_html.ts
@@ -0,0 +1,43 @@
+/**
+ * Benchmark: readHtml — parse HTML tables into DataFrames.
+ * Outputs JSON: {"function": "read_html", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { readHtml } from "../../src/index.js";
+
+const ROWS = 1_000;
+const WARMUP = 3;
+const ITERATIONS = 20;
+
+// Build an HTML string holding one table: a header row plus `rows` data rows.
+function buildHtml(rows: number): string {
+  const header = "<tr><th>id</th><th>name</th><th>value</th><th>score</th></tr>";
+  const bodyRows: string[] = [];
+  for (let i = 0; i < rows; i++) {
+    bodyRows.push(
+      `<tr><td>${i}</td><td>item_${i % 100}</td><td>${(i * 1.5).toFixed(2)}</td><td>${Math.sin(i * 0.01).toFixed(6)}</td></tr>`,
+    );
+  }
+  return `<table><thead>${header}</thead><tbody>${bodyRows.join("")}</tbody></table>`;
+}
+
+const html = buildHtml(ROWS);
+
+// Warm-up
+for (let i = 0; i < WARMUP; i++) {
+  readHtml(html);
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  readHtml(html);
+}
+const total = performance.now() - start;
+
+console.log(
+  JSON.stringify({
+    function: "read_html",
+    mean_ms: total / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: total,
+  }),
+);