From de68d4a5a31133aa80db7b10d6904de147774c1c Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 8 Feb 2026 08:45:57 -0500 Subject: [PATCH 1/2] upgrade pco to 1.0.1 Signed-off-by: mwlon --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- encodings/pco/src/array.rs | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6bf31b33898..f9734a244ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3853,9 +3853,9 @@ checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" [[package]] name = "dtype_dispatch" -version = "0.1.1" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3a5ccdfd6c5e7e2fea9c5cf256f2a08216047fab19c621c3da64e9ae4a1462d" +checksum = "ab23e69df104e2fd85ee63a533a22d2132ef5975dc6b36f9f3e5a7305e4a8ed7" [[package]] name = "duckdb-bench" @@ -7028,9 +7028,9 @@ dependencies = [ [[package]] name = "pco" -version = "0.4.9" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42382de9fb564e2d10cb4d5ca97cc06d928f0f9667bbef456b57e60827b6548b" +checksum = "e89d71ab3c07ed898defa4915bdc2a963131d811a1eab0eeacfac65c94cdeae8" dependencies = [ "better_io", "dtype_dispatch", diff --git a/Cargo.toml b/Cargo.toml index ba9a0268b87..732c61aea52 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -177,7 +177,7 @@ opentelemetry_sdk = "0.31.0" parking_lot = { version = "0.12.3", features = ["nightly"] } parquet = "57.1" paste = "1.0.15" -pco = "0.4.4" +pco = "1.0.1" pin-project-lite = "0.2.15" primitive-types = { version = "0.14.0" } proc-macro2 = "1.0.95" diff --git a/encodings/pco/src/array.rs b/encodings/pco/src/array.rs index 1cda0031520..7a48f991b52 100644 --- a/encodings/pco/src/array.rs +++ b/encodings/pco/src/array.rs @@ -297,7 +297,7 @@ impl PcoArray { let mut chunk_infos = vec![]; // the Vortex metadata let mut page_buffers = vec![]; for chunk_start in (0..n_values).step_by(values_per_chunk) { - let cc = match_number_enum!( + let 
mut cc = match_number_enum!( number_type, NumberType => { let chunk_end = cmp::min(n_values, chunk_start + values_per_chunk); @@ -309,8 +309,8 @@ impl PcoArray { } ); - let mut chunk_meta_buffer = ByteBufferMut::with_capacity(cc.chunk_meta_size_hint()); - cc.write_chunk_meta(&mut chunk_meta_buffer) + let mut chunk_meta_buffer = ByteBufferMut::with_capacity(cc.meta_size_hint()); + cc.write_meta(&mut chunk_meta_buffer) .map_err(vortex_err_from_pco)?; chunk_meta_buffers.push(chunk_meta_buffer.freeze()); @@ -424,7 +424,7 @@ impl PcoArray { .page_decompressor(page, page_n_values) .map_err(vortex_err_from_pco) .vortex_expect("page_decompressor should succeed with valid page data"); - pd.decompress(&mut decompressed_values[old_len..new_len]) + pd.read(&mut decompressed_values[old_len..new_len]) .map_err(vortex_err_from_pco) .vortex_expect("decompress should succeed with valid compressed data"); } else { From 25002b579b4a2a122aa7d9a936b39789d391b793 Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 8 Feb 2026 08:56:41 -0500 Subject: [PATCH 2/2] reduce pco default values per page to 2048 Signed-off-by: mwlon --- encodings/pco/src/array.rs | 2 +- vortex-layout/src/layouts/compact.rs | 46 ++++++++++++++++++---------- vortex/src/lib.rs | 2 +- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/encodings/pco/src/array.rs b/encodings/pco/src/array.rs index 7a48f991b52..961326a3d55 100644 --- a/encodings/pco/src/array.rs +++ b/encodings/pco/src/array.rs @@ -297,10 +297,10 @@ impl PcoArray { let mut chunk_infos = vec![]; // the Vortex metadata let mut page_buffers = vec![]; for chunk_start in (0..n_values).step_by(values_per_chunk) { + let chunk_end = cmp::min(n_values, chunk_start + values_per_chunk); let mut cc = match_number_enum!( number_type, NumberType => { - let chunk_end = cmp::min(n_values, chunk_start + values_per_chunk); let values = values.to_buffer::(); let chunk = &values.as_slice()[chunk_start..chunk_end]; fc diff --git 
a/vortex-layout/src/layouts/compact.rs b/vortex-layout/src/layouts/compact.rs index 8163a32e4ae..127aa00d187 100644 --- a/vortex-layout/src/layouts/compact.rs +++ b/vortex-layout/src/layouts/compact.rs @@ -46,7 +46,8 @@ pub struct CompactCompressor { /// nvCOMP though doesn't support ZSTD dictionaries. Therefore, we need the option to /// disable them for compatibility. zstd_use_dicts: bool, - zstd_values_per_page: usize, + pco_values_per_page: usize, + zstd_values_per_frame: usize, } impl CompactCompressor { @@ -66,13 +67,22 @@ impl CompactCompressor { } /// Sets the number of non-null primitive values to store per - /// separately-decompressible page/frame. + /// separately-decompressible Pco page. /// - /// Fewer values per page can reduce the time to query a small slice of rows, but too - /// few can increase compressed size and (de)compression time. The default is 0, which - /// is used for maximally-large pages. - pub fn with_zstd_values_per_page(mut self, values_per_page: usize) -> Self { - self.zstd_values_per_page = values_per_page; + /// Fewer values per page can reduce the time to query a small slice of rows, but too + /// few can increase compressed size and (de)compression time. + pub fn with_pco_values_per_page(mut self, values_per_page: usize) -> Self { + self.pco_values_per_page = values_per_page; + self + } + + /// Sets the number of non-null primitive values to store per + /// separately-decompressible Zstd frame. + /// + /// Fewer values per frame can reduce the time to query a small slice of rows, but too + /// few can increase compressed size and (de)compression time. 
+ pub fn with_zstd_values_per_frame(mut self, values_per_page: usize) -> Self { + self.zstd_values_per_frame = values_per_page; self } @@ -93,7 +103,7 @@ impl CompactCompressor { let pco_array = PcoArray::from_primitive( primitive, self.pco_level, - self.zstd_values_per_page, + self.pco_values_per_page, )?; pco_array.into_array() } else { @@ -101,13 +111,13 @@ impl CompactCompressor { ZstdArray::from_primitive( primitive, self.zstd_level, - self.zstd_values_per_page, + self.zstd_values_per_frame, )? } else { ZstdArray::from_primitive_without_dict( primitive, self.zstd_level, - self.zstd_values_per_page, + self.zstd_values_per_frame, )? }; zstd_array.into_array() @@ -140,13 +150,13 @@ impl CompactCompressor { Canonical::VarBinView(vbv) => { // always zstd if self.zstd_use_dicts { - ZstdArray::from_var_bin_view(vbv, self.zstd_level, self.zstd_values_per_page)? + ZstdArray::from_var_bin_view(vbv, self.zstd_level, self.zstd_values_per_frame)? .into_array() } else { ZstdArray::from_var_bin_view_without_dict( vbv, self.zstd_level, - self.zstd_values_per_page, + self.zstd_values_per_frame, )? .into_array() } @@ -224,11 +234,13 @@ impl Default for CompactCompressor { pco_level: pco::DEFAULT_COMPRESSION_LEVEL, zstd_level: 3, zstd_use_dicts: true, - // This is probably high enough to not hurt performance or - // compression. It also currently aligns with the default strategy's - // number of rows per statistic, which allows efficient pushdown - // (but nothing enforces this). - zstd_values_per_page: 8192, + // These values per page/frame amounts are probably high enough to + // not hurt performance or compression. They also currently divide + // the default strategy's number of rows per statistic, which allows + // efficient pushdown in the case of scalar, non-nullable data (but + // nothing enforces this). 
+ pco_values_per_page: 2048, + zstd_values_per_frame: 8192, } } } diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs index c7d304ed971..85e1f9032df 100644 --- a/vortex/src/lib.rs +++ b/vortex/src/lib.rs @@ -258,7 +258,7 @@ mod test { // Or apply generally stronger compression with the compact compressor let compressed = CompactCompressor::default() - .with_zstd_values_per_page(8192) + .with_zstd_values_per_frame(8192) .compress(array.as_ref())?; println!("Compact size: {} / {}", compressed.nbytes(), array.nbytes()); // [compress]