From d2106d5e7a23e43f121b20481c3e8f0bf2a723bc Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 4 Feb 2026 16:32:30 +0000 Subject: [PATCH 1/2] chore[btrblocks]: more precise `FoR` `expected_compression_ratio` Signed-off-by: Joe Isaacs --- .../src/compressor/integer/mod.rs | 23 +++++++++++++------ .../src/compressor/integer/stats.rs | 22 ++++++++++++++++++ vortex-python/src/io.rs | 2 +- vortex-test/e2e/src/lib.rs | 2 +- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index f40af8e253e..d9c1dd96c78 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -140,12 +140,12 @@ pub enum IntCode { Uncompressed, /// Constant encoding for arrays with a single distinct value. Constant, + /// BitPacking encoding - compresses non-negative integers by reducing bit width. + BitPacking, /// Frame of Reference encoding - subtracts minimum value then bitpacks. For, /// ZigZag encoding - transforms negative integers to positive for better bitpacking. ZigZag, - /// BitPacking encoding - compresses non-negative integers by reducing bit width. - BitPacking, /// Sparse encoding - optimizes null-dominated or single-value-dominated arrays. Sparse, /// Dictionary encoding - creates a dictionary of unique values. @@ -342,19 +342,28 @@ impl Scheme for FORScheme { .bit_width() .try_into() .vortex_expect("bit width must fit in u32"); - let bw = match stats.typed.max_minus_min().checked_ilog2() { + let for_bw = match stats.typed.max_minus_min().checked_ilog2() { Some(l) => l + 1, // If max-min == 0, it we should use a different compression scheme // as we don't want to bitpack down to 0 bits. 
None => return Ok(0.0), }; - // If we're not saving at least 1 byte, don't bother with FOR - if full_width - bw < 8 { - return Ok(0.0); + // If BitPacking could apply (non-negative values) and FOR doesn't reduce bit width + // compared to BitPacking, don't use FOR since it has overhead (storing reference). + // Only skip FOR when min >= 0, otherwise BitPacking can't apply directly. + if let Some(max_log) = stats + .typed + .max_ilog2() + .filter(|_| !stats.typed.min_is_negative()) + { + let bitpack_bw = max_log + 1; + if for_bw >= bitpack_bw { + return Ok(0.0); + } } - Ok(full_width as f64 / bw as f64) + Ok(full_width as f64 / for_bw as f64) } fn compress( diff --git a/vortex-btrblocks/src/compressor/integer/stats.rs b/vortex-btrblocks/src/compressor/integer/stats.rs index cff8fcf8901..99a38455bf9 100644 --- a/vortex-btrblocks/src/compressor/integer/stats.rs +++ b/vortex-btrblocks/src/compressor/integer/stats.rs @@ -93,6 +93,28 @@ impl ErasedStats { } } + /// Returns the ilog2 of the max value when transmuted to unsigned, or None if zero. + /// + /// This matches how BitPacking computes bit width: it reinterprets signed values as + /// unsigned (preserving bit pattern) and uses leading_zeros. For non-negative signed + /// values, the transmuted value equals the original value. + /// + /// This is used to determine if FOR encoding would reduce bit width compared to + /// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help. 
+ pub fn max_ilog2(&self) -> Option<u32> { + match &self { + ErasedStats::U8(x) => x.max.checked_ilog2(), + ErasedStats::U16(x) => x.max.checked_ilog2(), + ErasedStats::U32(x) => x.max.checked_ilog2(), + ErasedStats::U64(x) => x.max.checked_ilog2(), + // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior + ErasedStats::I8(x) => (x.max as u8).checked_ilog2(), + ErasedStats::I16(x) => (x.max as u16).checked_ilog2(), + ErasedStats::I32(x) => (x.max as u32).checked_ilog2(), + ErasedStats::I64(x) => (x.max as u64).checked_ilog2(), + } + } + /// Get the most commonly occurring value and its count pub fn top_value_and_count(&self) -> (PValue, u32) { match &self { diff --git a/vortex-python/src/io.rs b/vortex-python/src/io.rs index 22743e4828d..d4b50e71753 100644 --- a/vortex-python/src/io.rs +++ b/vortex-python/src/io.rs @@ -278,7 +278,7 @@ impl PyVortexWriteOptions { /// >>> vx.io.VortexWriteOptions.default().write(sprl, "chonky.vortex") /// >>> import os /// >>> os.path.getsize('chonky.vortex') - /// 216156 + /// 216020 /// ``` /// /// Wow, Vortex manages to use about two bytes per integer! So advanced. So tiny.
diff --git a/vortex-test/e2e/src/lib.rs b/vortex-test/e2e/src/lib.rs index f97eb4e398d..7d56c2f9e76 100644 --- a/vortex-test/e2e/src/lib.rs +++ b/vortex-test/e2e/src/lib.rs @@ -25,7 +25,7 @@ mod tests { PrimitiveArray::new(Buffer::from_iter(values), Validity::NonNullable).into_array(); // Write in parallel and verify all sizes match expected - const EXPECTED_SIZE: usize = 216156; + const EXPECTED_SIZE: usize = 216020; let futures: Vec<_> = (0..5) .map(|_| { let array = array.clone(); From 653cf666309cf0dad645caaf41ab406b53951c8c Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 5 Feb 2026 10:53:50 +0000 Subject: [PATCH 2/2] fix Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/compressor/integer/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index d9c1dd96c78..e53884b95fa 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -134,16 +134,18 @@ impl Hash for dyn IntegerScheme { } /// Unique identifier for integer compression schemes. +/// +/// NOTE: Variant order matters for tie-breaking; `For` must precede `BitPacking` to avoid unnecessary patches. #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence, Ord, PartialOrd)] pub enum IntCode { /// No compression applied. Uncompressed, /// Constant encoding for arrays with a single distinct value. Constant, - /// BitPacking encoding - compresses non-negative integers by reducing bit width. - BitPacking, /// Frame of Reference encoding - subtracts minimum value then bitpacks. For, + /// BitPacking encoding - compresses non-negative integers by reducing bit width. + BitPacking, /// ZigZag encoding - transforms negative integers to positive for better bitpacking. ZigZag, /// Sparse encoding - optimizes null-dominated or single-value-dominated arrays.