Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
6de7c30
feat: add CTable, a columnar in-memory table built on top of blosc2
Jacc4224 Mar 26, 2026
01e47f4
Merge pull request #604 from Jacc4224/ctable-new
FrancescAlted Mar 26, 2026
c05c2ec
Add a plan for declaring a simple schema for CTable objects
FrancescAlted Mar 26, 2026
725c28b
Add pydantic as a new dependency
FrancescAlted Mar 26, 2026
0efd450
Fix small formatting issues
FrancescAlted Mar 26, 2026
f504ad0
Simplify the plan for ctable schema
FrancescAlted Mar 26, 2026
46bf2e3
Disable wheel generation for each commit in this branch
FrancescAlted Mar 26, 2026
43bf562
Add a new plan on CTable persistence
FrancescAlted Mar 26, 2026
e84f7ac
_
Jacc4224 Mar 26, 2026
8de1870
_
Jacc4224 Mar 26, 2026
a8db18d
Testing
FrancescAlted Mar 26, 2026
dd154b1
Merge branch 'ctable3' of github.com:Blosc/python-blosc2 into my_ctable3
Jacc4224 Mar 26, 2026
ce65607
written test
Jacc4224 Mar 26, 2026
b623f0e
Remove testing file
FrancescAlted Mar 26, 2026
b9e8c35
Merge branch 'ctable3' of github.com:Blosc/python-blosc2 into my_ctable3
Jacc4224 Mar 26, 2026
4ce8296
Schema layer:
Jacc4224 Apr 4, 2026
ee1d0c4
persistence halfway done
Jacc4224 Apr 4, 2026
a422d72
CTable: full feature build-out (persistency, aggregates, mutations, …
Jacc4224 Apr 6, 2026
0472b3f
CTable: full feature build-out (persistency, aggregates, mutations, …
Jacc4224 Apr 6, 2026
34f8219
CSV compatibility implementation
Jacc4224 Apr 7, 2026
6bf1ec8
Persistent ctables.
Jacc4224 Apr 7, 2026
34c2eee
Collision bug fixed 1
Jacc4224 Apr 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/cibuildwheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ env:
jobs:

build_wheels:
if: ${{ github.ref_name != 'ctable3' && github.head_ref != 'ctable3' }}
name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }}
runs-on: ${{ matrix.runs-on || matrix.os }}
permissions:
Expand Down Expand Up @@ -128,10 +129,9 @@ jobs:


upload_pypi:
if: ${{ (github.ref_name != 'ctable3' && github.head_ref != 'ctable3') && startsWith(github.event.ref, 'refs/tags') }}
needs: [ build_wheels]
runs-on: ubuntu-latest
# Only upload wheels when tagging (typically a release)
if: startsWith(github.event.ref, 'refs/tags')
steps:
- uses: actions/download-artifact@v8
with:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/wasm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ env:

jobs:
build_wheels_wasm:
if: ${{ github.ref_name != 'ctable3' && github.head_ref != 'ctable3' }}
name: Build and test wheels for WASM on ${{ matrix.os }} for ${{ matrix.p_ver }}
runs-on: ubuntu-latest
permissions:
Expand Down
97 changes: 97 additions & 0 deletions bench/ctable/Prueba_iter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

from dataclasses import dataclass
from time import time

import blosc2
from blosc2 import CTable


@dataclass
class Row:
    """Schema for the iteration benchmark table: an int id, a float score and a bool flag.

    Field types come from blosc2's schema constructors (presumably pydantic-backed
    constrained types — confirm against blosc2.schema docs).
    """

    # Row identifier, constrained to be >= 0.
    id: int = blosc2.field(blosc2.int64(ge=0))
    # Percent-like score, constrained to the [0, 100] range.
    score: float = blosc2.field(blosc2.float64(ge=0, le=100))
    # Boolean flag; defaults to True when a row omits it.
    # NOTE: blosc2.bool is the project's schema constructor, not the builtin.
    active: bool = blosc2.field(blosc2.bool(), default=True)


# Row count for the benchmark: start small, increase when confident.
N = 1_000

# Build (id, score, active) tuples and load them into a columnar table.
data = [(i, float(i % 100), i % 2 == 0) for i in range(N)]
tabla = CTable(Row, new_data=data)

print(f"Table created with {len(tabla)} rows\n")

# -------------------------------------------------------------------
# Test 1: iterate without accessing any column (minimum cost)
# -------------------------------------------------------------------
start = time()
for _ in tabla:
    pass
elapsed = time() - start
print(f"[Test 1] Iter without accessing columns: {elapsed*1000:.3f} ms")

# -------------------------------------------------------------------
# Test 2: iterate accessing a single column (real_pos cached once)
# -------------------------------------------------------------------
start = time()
for current in tabla:
    _ = current["id"]
elapsed = time() - start
print(f"[Test 2] Iter accessing 'id': {elapsed*1000:.3f} ms")

# -------------------------------------------------------------------
# Test 3: iterate accessing all columns (real_pos cached once per row)
# -------------------------------------------------------------------
start = time()
for current in tabla:
    _ = current["id"]
    _ = current["score"]
    _ = current["active"]
elapsed = time() - start
print(f"[Test 3] Iter accessing 3 columns: {elapsed*1000:.3f} ms")

# -------------------------------------------------------------------
# Test 4: correctness — values match expected
# -------------------------------------------------------------------
# Each mismatch adds 1 (bool arithmetic) to the error counter.
errors = 0
for current in tabla:
    pos = current._nrow
    errors += current["id"] != pos
    errors += current["score"] != float(pos % 100)
    errors += current["active"] != (pos % 2 == 0)

print(f"\n[Test 4] Correctness errors: {errors} (expected: 0)")

# -------------------------------------------------------------------
# Test 5: with holes (deleted rows)
# -------------------------------------------------------------------
tabla2 = CTable(Row, new_data=data)
# Drop every even row so iteration must skip over the holes.
tabla2.delete(list(range(0, N, 2)))

print(f"\nTable with holes: {len(tabla2)} rows (expected: {N // 2})")

start = time()
ids = [entry["id"] for entry in tabla2]
elapsed = time() - start

# Only the odd ids should survive the deletion above.
expected_ids = list(range(1, N, 2))
ok = ids == expected_ids
print(f"[Test 5] Iter with holes ({N//2} rows): {elapsed*1000:.3f} ms | correctness: {ok}")

# -------------------------------------------------------------------
# Test 6: real_pos is cached correctly (not recomputed)
# -------------------------------------------------------------------
first = next(iter(tabla))
assert first._real_pos is None, "real_pos should be None before first access"
_ = first["id"]
assert first._real_pos is not None, "real_pos should be cached after first access"
print(f"\n[Test 6] real_pos caching: OK (real_pos={first._real_pos})")
117 changes: 117 additions & 0 deletions bench/ctable/bench_append_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: append() overhead introduced by the new schema pipeline
#
# The new append() path routes every row through:
# _normalize_row_input → validate_row (Pydantic) → _coerce_row_to_storage
#
# This benchmark isolates how much each step costs, and shows the
# total overhead vs the raw NDArray write speed.

from dataclasses import dataclass
from time import perf_counter

import numpy as np

import blosc2
from blosc2.schema_compiler import compile_schema
from blosc2.schema_validation import build_validator_model, validate_row


@dataclass
class Row:
    """Schema used to benchmark the append() pipeline: int id, float score, bool flag.

    Field types come from blosc2's schema constructors (presumably pydantic-backed
    constrained types — confirm against blosc2.schema docs).
    """

    # Row identifier, constrained to be >= 0.
    id: int = blosc2.field(blosc2.int64(ge=0))
    # Score column, constrained to [0, 100]; defaults to 0.0 when a row omits it.
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    # Flag column; defaults to True when a row omits it.
    # NOTE: blosc2.bool is the project's schema constructor, not the builtin.
    active: bool = blosc2.field(blosc2.bool(), default=True)


# Number of rows pushed through every pipeline stage below.
N = 5_000

# Deterministic sample rows: (id, score, active) tuples from a fixed seed,
# so repeated benchmark runs exercise identical data.
rng = np.random.default_rng(42)
data = [(int(i), float(rng.uniform(0, 100)), bool(i % 2)) for i in range(N)]

schema = compile_schema(Row)
# Warm up the Pydantic model cache so the first timed stage is not penalized
# by one-off model construction.
build_validator_model(schema)

print(f"append() pipeline cost breakdown | N = {N:,} rows")
print("=" * 60)

# ── 1. Raw NDArray writes (no CTable overhead at all) ────────────────────────
# Pre-allocate one NumPy column per field plus the validity mask, then time
# plain element-wise stores. This is the floor any append() path can reach.
ids = np.zeros(N, dtype=np.int64)
scores = np.zeros(N, dtype=np.float64)
flags = np.zeros(N, dtype=np.bool_)
mask = np.zeros(N, dtype=np.bool_)

start = perf_counter()
for pos, (row_id, row_score, row_active) in enumerate(data):
    ids[pos] = row_id
    scores[pos] = row_score
    flags[pos] = row_active
    mask[pos] = True
t_raw = perf_counter() - start
print(f"{'Raw NumPy writes (baseline)':<40} {t_raw:.4f} s")

# ── 2. _normalize_row_input only ─────────────────────────────────────────────
# Isolate the cost of turning a raw tuple into the canonical row mapping.
t_obj = blosc2.CTable(Row, expected_size=N, validate=False)
start = perf_counter()
for sample in data:
    _ = t_obj._normalize_row_input(sample)
t_normalize = perf_counter() - start
print(f"{'_normalize_row_input only':<40} {t_normalize:.4f} s ({t_normalize/t_raw:.1f}x baseline)")

# ── 3. Pydantic validate_row only ────────────────────────────────────────────
# Normalize all rows up front so the timed loop measures validation alone.
row_dicts = [t_obj._normalize_row_input(sample) for sample in data]
start = perf_counter()
for normalized in row_dicts:
    _ = validate_row(schema, normalized)
t_validate = perf_counter() - start
print(f"{'validate_row (Pydantic) only':<40} {t_validate:.4f} s ({t_validate/t_raw:.1f}x baseline)")

# ── 4. _coerce_row_to_storage only ───────────────────────────────────────────
# Reuses the pre-normalized dicts, so only the storage coercion is timed.
start = perf_counter()
for normalized in row_dicts:
    _ = t_obj._coerce_row_to_storage(normalized)
t_coerce = perf_counter() - start
print(f"{'_coerce_row_to_storage only':<40} {t_coerce:.4f} s ({t_coerce/t_raw:.1f}x baseline)")

# ── 5/6. Full append() end-to-end, with and without validation ──────────────
RUNS = 3  # repeat each measurement and keep the fastest run


def _time_append_pass(use_validation):
    """One timed pass: build a fresh table, append every row, return seconds."""
    table = blosc2.CTable(Row, expected_size=N, validate=use_validation)
    begin = perf_counter()
    for sample in data:
        table.append(sample)
    return perf_counter() - begin


t_append_off = min(_time_append_pass(False) for _ in range(RUNS))
print(f"{'Full append(), validate=False':<40} {t_append_off:.4f} s ({t_append_off/t_raw:.1f}x baseline)")

t_append_on = min(_time_append_pass(True) for _ in range(RUNS))
print(f"{'Full append(), validate=True':<40} {t_append_on:.4f} s ({t_append_on/t_raw:.1f}x baseline)")

print()
print("=" * 60)
# Validation overhead is the gap between the validated and unvalidated
# append runs; clamp at zero in case timing noise makes it negative.
pydantic_cost = t_append_on - t_append_off
if pydantic_cost < 0.0:
    pydantic_cost = 0.0
print(f"{'Pydantic overhead in append()':<40} {pydantic_cost:.4f} s")
if t_append_on > 0:
    print(f"{'Validation fraction of total':<40} {pydantic_cost/t_append_on*100:.1f}%")
print(f"{'Per-row Pydantic cost (isolated)':<40} {(t_validate/N)*1e6:.2f} µs/row")
print()
print(f"Note: append() is dominated by blosc2 I/O ({t_append_off/t_raw:.0f}x raw numpy),")
print("      not by the validation pipeline.")
print("      The main bottleneck is the last_true_pos backward scan per row.")
Loading
Loading