From 8f75831d5b8581c73a8e24efec1c7dac4be0ede2 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Mon, 17 Nov 2025 15:01:33 +0100 Subject: [PATCH] Add RunEndEncoding --- src/ArrowTypes/src/ArrowTypes.jl | 4 + src/arraytypes/arraytypes.jl | 1 + src/arraytypes/runendencoded.jl | 245 +++++++++++++++++++++++++++++++ src/eltypes.jl | 24 +++ src/table.jl | 46 ++++++ test/generate_ree_test_data.py | 117 +++++++++++++++ test/runtests.jl | 1 + test/test_ree_data.arrow | Bin 0 -> 3114 bytes test/test_ree_simple.arrow | Bin 0 -> 802 bytes test/test_runendencoded.jl | 191 ++++++++++++++++++++++++ test/testtables.jl | 29 ++++ 11 files changed, 658 insertions(+) create mode 100644 src/arraytypes/runendencoded.jl create mode 100644 test/generate_ree_test_data.py create mode 100644 test/test_ree_data.arrow create mode 100644 test/test_ree_simple.arrow create mode 100644 test/test_runendencoded.jl diff --git a/src/ArrowTypes/src/ArrowTypes.jl b/src/ArrowTypes/src/ArrowTypes.jl index 86183b54..dfd35c11 100644 --- a/src/ArrowTypes/src/ArrowTypes.jl +++ b/src/ArrowTypes/src/ArrowTypes.jl @@ -33,6 +33,7 @@ export ArrowKind, StructKind, UnionKind, DictEncodedKind, + RunEndEncodedKind, toarrow, arrowname, fromarrow, @@ -348,6 +349,9 @@ ArrowKind(::Union) = UnionKind() "DictEncodedKind store a small pool of unique values in one buffer, with a full-length buffer of integer offsets into the small value pool" struct DictEncodedKind <: ArrowKind end +"RunEndEncodedKind efficiently stores arrays with repeated values using run-end encoding, with two child arrays: run_ends (indices where runs end) and values (the actual values)" +struct RunEndEncodedKind <: ArrowKind end + """ There are a couple places when writing arrow buffers where we need to write a "dummy" value; it doesn't really matter diff --git a/src/arraytypes/arraytypes.jl b/src/arraytypes/arraytypes.jl index 58bab082..e6d6fe94 100644 --- a/src/arraytypes/arraytypes.jl +++ b/src/arraytypes/arraytypes.jl @@ -271,4 +271,5 @@ include("map.jl") include("struct.jl") include("unions.jl") include("dictencoding.jl") +include("runendencoded.jl") include("views.jl") diff --git a/src/arraytypes/runendencoded.jl b/src/arraytypes/runendencoded.jl new file mode 100644 index 00000000..dbd882e8 --- /dev/null +++ b/src/arraytypes/runendencoded.jl @@ -0,0 +1,245 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ..ArrowTypes: RunEndEncodedKind + +""" + Arrow.RunEndEncoded + +An `ArrowVector` that uses run-end encoding (REE) to efficiently represent +arrays with sequences of repeated values. This is a variation of run-length +encoding where each run is represented by a value and an integer giving the +logical index where the run ends. + +The array contains two child arrays: +- `run_ends`: A vector of Int16, Int32, or Int64 values representing the + accumulated length where each run ends (strictly ascending, 1-indexed) +- `values`: The actual values for each run + +For example, the array `[1, 1, 1, 2, 2]` would be encoded as: +- `run_ends = [3, 5]` +- `values = [1, 2]` + +Note: The parent array has no validity bitmap (null_count = 0). Nulls are +represented as null values in the `values` child array. +""" +struct RunEndEncoded{T,R<:Union{Int16,Int32,Int64},A} <: ArrowVector{T} + arrow::Vector{UInt8} # reference to arrow memory blob + validity::ValidityBitmap # always empty for REE (null_count = 0) + run_ends::Vector{R} # strictly ascending indices where runs end + values::A # child array with actual values + ℓ::Int64 # logical length of the decoded array + metadata::Union{Nothing,Base.ImmutableDict{String,String}} +end + +RunEndEncoded( + ::Type{T}, + b::Vector{UInt8}, + v::ValidityBitmap, + run_ends::Vector{R}, + values::A, + len, + meta, +) where {T,R,A} = RunEndEncoded{T,R,A}(b, v, run_ends, values, len, meta) + +Base.size(r::RunEndEncoded) = (r.ℓ,) + +""" + _find_physical_index(run_ends, logical_index) + +Find the physical index (into the values array) for a given logical index. +Uses binary search to achieve O(log n) lookup time. +""" +@inline function _find_physical_index(run_ends::Vector{R}, i::Integer) where {R} + # Binary search to find which run contains index i + # run_ends[j-1] < i <= run_ends[j] + lo = 1 + hi = length(run_ends) + + @inbounds while lo < hi + mid = (lo + hi) >>> 1 # unsigned right shift for safe midpoint + if run_ends[mid] < i + lo = mid + 1 + else + hi = mid + end + end + + return lo +end + +@propagate_inbounds function Base.getindex(r::RunEndEncoded{T}, i::Integer) where {T} + @boundscheck checkbounds(r, i) + # Find which run contains this index + @inbounds physical_idx = _find_physical_index(r.run_ends, i) + # Return the value for that run + return @inbounds ArrowTypes.fromarrow(T, r.values[physical_idx]) +end + +# Iteration - implement efficiently by iterating over runs +function Base.iterate(r::RunEndEncoded{T}) where {T} + isempty(r) && return nothing + # State: (current_physical_index, current_logical_index, run_end) + run_idx = 1 + @inbounds run_end = r.run_ends[1] + @inbounds val = ArrowTypes.fromarrow(T, r.values[1]) + return (val, (1, 1, run_end, val)) +end + +function Base.iterate(r::RunEndEncoded{T}, state) where {T} + run_idx, logical_idx, run_end, val = state + logical_idx += 1 + logical_idx > r.ℓ && return nothing + + if logical_idx > run_end + # Move to next run + run_idx += 1 + @inbounds run_end = r.run_ends[run_idx] + @inbounds val = ArrowTypes.fromarrow(T, r.values[run_idx]) + end + + return (val, (run_idx, logical_idx, run_end, val)) +end + +# Don't pass through REE in arrowvector, keep it as-is +arrowvector(::RunEndEncodedKind, x::RunEndEncoded, i, nl, fi, de, ded, meta; kw...) = x + +# Convert a regular Julia array to RunEndEncoded format +function arrowvector(::RunEndEncodedKind, x, i, nl, fi, de, ded, meta; run_ends_type::Type{R}=Int32) where {R<:Union{Int16,Int32,Int64}} + len = length(x) + len == 0 && error("Cannot create RunEndEncoded array with length 0") + + # Compute runs + run_ends_vec = R[] + values_vec = [] + + prev_val = @inbounds x[1] + run_end = 1 + + for i in 2:len + @inbounds curr_val = x[i] + if !isequal(curr_val, prev_val) + # End of current run + push!(run_ends_vec, R(run_end)) + push!(values_vec, prev_val) + prev_val = curr_val + end + run_end = i + end + + # Don't forget the final run + push!(run_ends_vec, R(run_end)) + push!(values_vec, prev_val) + + # Create the values child array + T = eltype(x) + values_arrow = arrowvector(values_vec, i, nl, fi, de, ded, meta; kw...) + + # Validity bitmap is always empty for REE parent + validity = ValidityBitmap(UInt8[], len, 0) + + return RunEndEncoded(T, UInt8[], validity, run_ends_vec, values_arrow, len, meta) +end + +function compress(Z::Meta.CompressionType.T, comp, r::R) where {R<:RunEndEncoded} + len = length(r) + nc = 0 # REE always has null_count = 0 on parent + # Note: validity bitmap is always empty, so we only compress the child arrays + # For simplicity, we'll compress the run_ends and delegate values compression + run_ends_compressed = compress(Z, comp, r.run_ends) + values_compressed = compress(Z, comp, r.values) + return Compressed{Z,R}(r, [run_ends_compressed, values_compressed], len, nc, Compressed[]) +end + +function makenodesbuffers!( + col::RunEndEncoded{T}, + fieldnodes, + fieldbuffers, + bufferoffset, + alignment, +) where {T} + len = length(col) + nc = 0 # REE parent always has null_count = 0 + push!(fieldnodes, FieldNode(len, nc)) + @debug "made field node: nodeidx = $(length(fieldnodes)), col = $(typeof(col)), len = $(fieldnodes[end].length), nc = $(fieldnodes[end].null_count)" + + # REE has no buffers on the parent level - it uses child arrays instead + # The validity bitmap is always empty (0 bytes) + push!(fieldbuffers, Buffer(bufferoffset, 0)) + @debug "made field buffer (validity): bufferidx = $(length(fieldbuffers)), offset = $(fieldbuffers[end].offset), len = $(fieldbuffers[end].length)" + + # Now add the child arrays (run_ends and values) + # Note: The run_ends array is a primitive int array with no nulls + bufferoffset = makenodesbuffers!(col.run_ends, fieldnodes, fieldbuffers, bufferoffset, alignment) + bufferoffset = makenodesbuffers!(col.values, fieldnodes, fieldbuffers, bufferoffset, alignment) + + return bufferoffset +end + +# Special handling for run_ends which is a plain Vector +function makenodesbuffers!( + col::Vector{R}, + fieldnodes, + fieldbuffers, + bufferoffset, + alignment, +) where {R<:Union{Int16,Int32,Int64}} + len = length(col) + nc = 0 # run_ends never has nulls + push!(fieldnodes, FieldNode(len, nc)) + @debug "made field node (run_ends): nodeidx = $(length(fieldnodes)), len = $len, nc = 0" + + # validity bitmap (empty - 0 bytes) + push!(fieldbuffers, Buffer(bufferoffset, 0)) + @debug "made field buffer (run_ends validity): bufferidx = $(length(fieldbuffers)), offset = $bufferoffset, len = 0" + + # data buffer + blen = len * sizeof(R) + push!(fieldbuffers, Buffer(bufferoffset, blen)) + @debug "made field buffer (run_ends data): bufferidx = $(length(fieldbuffers)), offset = $bufferoffset, len = $blen" + + return bufferoffset + padding(blen, alignment) +end + +function writebuffer(io, col::RunEndEncoded, alignment) + @debug "writebuffer: col = $(typeof(col))" + @debug col + + # Write empty validity bitmap (0 bytes for parent REE array) + # No need to write anything or pad since length is 0 + + # Write run_ends child array + writebuffer(io, col.run_ends, alignment) + + # Write values child array + writebuffer(io, col.values, alignment) + + return +end + +# Write buffer for plain Vector{R} (run_ends) +function writebuffer(io, col::Vector{R}, alignment) where {R<:Union{Int16,Int32,Int64}} + @debug "writebuffer (run_ends): col = $(typeof(col)), length = $(length(col))" + + # No validity bitmap to write (0 bytes) + + # Write the data + n = writearray(io, R, col) + @debug "writing run_ends array: n = $n, padded = $(padding(n, alignment))" + writezeros(io, paddinglength(n, alignment)) + + return +end diff --git a/src/eltypes.jl b/src/eltypes.jl index 52dbb809..8cbb1eb9 100644 --- a/src/eltypes.jl +++ b/src/eltypes.jl @@ -432,6 +432,30 @@ ArrowTypes.arrowname(::Type{P}) where {P<:Dates.Period} = PERIOD_SYMBOL ArrowTypes.JuliaType(::Val{PERIOD_SYMBOL}, ::Type{Duration{U}}) where {U} = periodtype(U) ArrowTypes.fromarrow(::Type{P}, x::Duration{U}) where {P<:Dates.Period,U} = convert(P, x) +# RunEndEncoded type +function juliaeltype(f::Meta.Field, ree::Meta.RunEndEncoded, convert) + # RunEndEncoded has two child arrays: run_ends and values + # The element type is determined by the values child array + @assert length(f.children) == 2 "RunEndEncoded must have exactly 2 children (run_ends and values)" + run_ends_field = f.children[1] + values_field = f.children[2] + + # Get the element type from the values child + values_type = juliaeltype(values_field, buildmetadata(values_field), convert) + + return values_type +end + +function arrowtype(b, x::RunEndEncoded{T,R,A}) where {T,R,A} + # Create field offsets for the two child arrays + children = [ + fieldoffset(b, "run_ends", x.run_ends), + fieldoffset(b, "values", x.values) + ] + Meta.runEndEncodedStart(b) + return Meta.RunEndEncoded, Meta.runEndEncodedEnd(b), children +end + # nested types; call juliaeltype recursively on nested children function juliaeltype( f::Meta.Field, diff --git a/src/table.jl b/src/table.jl index de8bfc37..95fb98e2 100644 --- a/src/table.jl +++ b/src/table.jl @@ -1063,6 +1063,52 @@ function build( return B, nodeidx, bufferidx, varbufferidx end +function build( + f::Meta.Field, + ree::Meta.RunEndEncoded, + batch, + rb, + de, + nodeidx, + bufferidx, + varbufferidx, + convert, +) + @debug "building array: RunEndEncoded" + # REE parent has empty validity bitmap + validity = buildbitmap(batch, rb, nodeidx, bufferidx) + bufferidx += 1 + len = rb.nodes[nodeidx].length + nodeidx += 1 + + meta = buildmetadata(f.custom_metadata) + T = juliaeltype(f, meta, convert) + + # Build the two child arrays: run_ends and values + @assert length(f.children) == 2 "RunEndEncoded must have exactly 2 children" + + # First child: run_ends (Int16, Int32, or Int64) + run_ends_child = f.children[1] + run_ends_array, nodeidx, bufferidx, varbufferidx = + build(run_ends_child, batch, rb, de, nodeidx, bufferidx, varbufferidx, convert) + + # Extract the actual run_ends vector + # run_ends_array should be a Primitive{R, Vector{R}} where R is Int16/Int32/Int64 + run_ends = run_ends_array.data + R = eltype(run_ends) + + # Second child: values (any Arrow type) + values_child = f.children[2] + values_array, nodeidx, bufferidx, varbufferidx = + build(values_child, batch, rb, de, nodeidx, bufferidx, varbufferidx, convert) + + bytes = UInt8[] # Reference to arrow memory (from children) + return RunEndEncoded{T,R,typeof(values_array)}(bytes, validity, run_ends, values_array, len, meta), + nodeidx, + bufferidx, + varbufferidx +end + function build( f::Meta.Field, L::Meta.Null, diff --git a/test/generate_ree_test_data.py b/test/generate_ree_test_data.py new file mode 100644 index 00000000..489c19a8 --- /dev/null +++ b/test/generate_ree_test_data.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +""" +Generate Arrow IPC files with RunEndEncoded (REE) arrays for testing Arrow.jl +""" + +import pyarrow as pa +import pyarrow.ipc as ipc + +def create_ree_test_file(): + """Create an Arrow IPC file with various RunEndEncoded array examples""" + + print(f"PyArrow version: {pa.__version__}") + + # All arrays must have same logical length (10 elements each) + + # Example 1: Simple repeated integers - [1,1,1,1,2,2,3,3,3,3] + values1 = pa.array([1, 2, 3], type=pa.int64()) + run_ends1 = pa.array([4, 6, 10], type=pa.int32()) + ree1 = pa.RunEndEncodedArray.from_arrays(run_ends1, values1) + + # Example 2: Float values with nulls - [1.0,1.0,1.0,1.0,null,null,2.0,2.0,2.0,2.0] + values2 = pa.array([1.0, None, 2.0], type=pa.float64()) + run_ends2 = pa.array([4, 6, 10], type=pa.int32()) + ree2 = pa.RunEndEncodedArray.from_arrays(run_ends2, values2) + + # Example 3: String values - ["hello","hello","hello","world","world","foo","foo","foo","foo","foo"] + values3 = pa.array(['hello', 'world', 'foo'], type=pa.string()) + run_ends3 = pa.array([3, 5, 10], type=pa.int32()) + ree3 = pa.RunEndEncodedArray.from_arrays(run_ends3, values3) + + # Example 4: Using Int16 run ends - [100,100,100,100,100,200,200,200,200,200] + values4 = pa.array([100, 200], type=pa.int32()) + run_ends4 = pa.array([5, 10], type=pa.int16()) + ree4 = pa.RunEndEncodedArray.from_arrays(run_ends4, values4) + + # Example 5: Single run - [42,42,42,42,42,42,42,42,42,42] + values5 = pa.array([42], type=pa.int64()) + run_ends5 = pa.array([10], type=pa.int32()) + ree5 = pa.RunEndEncodedArray.from_arrays(run_ends5, values5) + + # Example 6: Boolean values - [True,True,True,False,False,True,True,True,True,True] + values6 = pa.array([True, False, True], type=pa.bool_()) + run_ends6 = pa.array([3, 5, 10], type=pa.int32()) + ree6 = pa.RunEndEncodedArray.from_arrays(run_ends6, values6) + + # Create a table with all examples + table = pa.table({ + 'ree_int': ree1, + 'ree_float_with_nulls': ree2, + 'ree_string': ree3, + 'ree_int16_ends': ree4, + 'ree_single_run': ree5, + 'ree_bool': ree6, + }) + + print("\nTable schema:") + print(table.schema) + print(f"\nNumber of rows: {len(table)}") + + # Print decoded values for verification + print("\nDecoded values:") + for col_name in table.column_names: + col = table[col_name] + decoded = col.to_pylist() + print(f"{col_name}: {decoded}") + + # Write to Arrow IPC file + output_path = 'test_ree_data.arrow' + with ipc.RecordBatchFileWriter(output_path, table.schema) as writer: + writer.write_table(table) + + print(f"\n✓ Successfully wrote REE test data to: {output_path}") + + # Verify we can read it back + with ipc.open_file(output_path) as reader: + table_read = reader.read_all() + print(f"✓ Verified: read back {len(table_read)} rows") + + return output_path + +def create_simple_ree_file(): + """Create a minimal REE file for initial testing""" + + # Very simple case: [1, 1, 1, 2, 2] + values = pa.array([1, 2], type=pa.int64()) + run_ends = pa.array([3, 5], type=pa.int32()) + ree = pa.RunEndEncodedArray.from_arrays(run_ends, values) + + table = pa.table({'simple_ree': ree}) + + output_path = 'test_ree_simple.arrow' + with ipc.RecordBatchFileWriter(output_path, table.schema) as writer: + writer.write_table(table) + + print(f"✓ Created simple REE file: {output_path}") + print(f" Decoded values: {table['simple_ree'].to_pylist()}") + + return output_path + +if __name__ == '__main__': + print("Generating RunEndEncoded Arrow test files...\n") + + try: + # Create both comprehensive and simple test files + create_simple_ree_file() + print() + create_ree_test_file() + + print("\n" + "="*60) + print("Test data generation complete!") + print("="*60) + + except Exception as e: + print(f"\n✗ Error: {e}") + print("\nNote: RunEndEncoded support was added in PyArrow 13.0.0") + print("Please upgrade: pip install pyarrow>=13.0.0") + raise diff --git a/test/runtests.jl b/test/runtests.jl index 9ca171f1..c36f8921 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -34,6 +34,7 @@ include(joinpath(dirname(pathof(Arrow)), "../test/testtables.jl")) include(joinpath(dirname(pathof(Arrow)), "../test/testappend.jl")) include(joinpath(dirname(pathof(Arrow)), "../test/integrationtest.jl")) include(joinpath(dirname(pathof(Arrow)), "../test/dates.jl")) +include(joinpath(dirname(pathof(Arrow)), "../test/test_runendencoded.jl")) struct CustomStruct x::Int diff --git a/test/test_ree_data.arrow b/test/test_ree_data.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e8745bd337e73b3cce73f3b15fd77c4c1a4642e0 GIT binary patch literal 3114 zcmeHKv2GJV5S@$9$tgOJg`&tv6j!)Fh{BN~lnzItpaZ2LMIkbNo7&$l3)aiV#oq1K^yy4fDmccMDhiZkCDg*_(%~VXWyq&7_K%UxBMJn@&L>^ zt~BcRyHCq9$57%Qz>_6fWDLY093}dBzMD__`yzLN`K)Wp0&%y%hSN1CMYmt{fJ96V z7iAdlxV1H&+mLC)cpihf#&s~`*&i0q@_u(TDXl_UdoUaiy2T{VwH^kr z?K%4zW6Qu+JJlLvWhY?Hu}ZepW_Fwp9WKgXb>{s1l{XeU>u(9nIB}+kfq%SvE9!Si zZ(GCP#mmq4gZJ6_ZO^rNTg~3Z&D_{t!+VkdpTLUqpeRFwWv2=5HuFhJWfl40i;(K- zQj7JDo~V}1GUOCK+?(+hZvk_x?}x|S^!>j&Mogkjl}zT{BQbYXGTH1myb|t}I=(mi zZ7*AAve^$z;d_%Uju(FZn(to)c#|#HuLHfwiJw0T^d_7AaiBNZ?B4`>lWjlm1HH+X zAF6MFtLMZy9Q`>8<*VY&&sFi}G{p0*u2Lw6c=KyEp&a7F{e?2mgYSXa*?Ja-;X_>A z0?_`;{d1y7hjzr+y40J@8XNTUygVpJd2g>QRp*dBj8DO4cL!rqxSsjveWspuoZ?pV zg-ap1I*DZ+^DQvr=iZtefEs>dCn~vS{vGh=Ev^1;%)bNh{WSj$kpJmBppAE6Q~yW! E1vPt|B>(^b literal 0 HcmV?d00001 diff --git a/test/test_ree_simple.arrow b/test/test_ree_simple.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b8881fff28b021ba916c761b5ec9532602ca10a8 GIT binary patch literal 802 zcmds$O-=$q5QX2sFic1PHd;KF5 zj^LYA*SzYfs#o2xT&*6T?xY{^Eoq8L69qLIspSn+HC(WkwazQy$fOgwu=y4ICdUT& z9d?Vk#^LRKx9QgVu5$%H?lq%Uqj9W0&AqFEHMo|)F&gaSWpnHf(r4_Y>Va6w*G0Z! z{x$n0->Ub=?Yi5(Qo(ZsY#ir+Rtpx-f5Oi&BWRvaAVhT<)=`snB~0Lf9_fsp30n6k z@7k6Az0lvQH5tVxcIN3GQgn#x*LBqA+s7p@Ild#K?0Zgg*AGnFpdi3}zgB$DTF%b7 YZ7%TTZE&97TCM(n{vH<0y^Q~jU)Bvi1ONa4 literal 0 HcmV?d00001 diff --git a/test/test_runendencoded.jl b/test/test_runendencoded.jl new file mode 100644 index 00000000..d34ced5f --- /dev/null +++ b/test/test_runendencoded.jl @@ -0,0 +1,191 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +@testset "RunEndEncoded" begin + @testset "PyArrow generated simple file" begin + # Test reading the simple REE file generated by PyArrow + # File contains: [1, 1, 1, 2, 2] + file_path = joinpath(@__DIR__, "test_ree_simple.arrow") + tbl = Arrow.Table(file_path) + + @test length(tbl) == 5 + @test propertynames(tbl) == (:simple_ree,) + + col = tbl.simple_ree + @test col isa Arrow.RunEndEncoded + @test length(col) == 5 + + # Test indexing + @test col[1] == 1 + @test col[2] == 1 + @test col[3] == 1 + @test col[4] == 2 + @test col[5] == 2 + + # Test collecting + @test collect(col) == [1, 1, 1, 2, 2] + + # Test iteration + result = Int64[] + for x in col + push!(result, x) + end + @test result == [1, 1, 1, 2, 2] + end + + @testset "PyArrow generated comprehensive file" begin + # Test reading the comprehensive REE file with multiple column types + # All columns have 10 elements + file_path = joinpath(@__DIR__, "test_ree_data.arrow") + tbl = Arrow.Table(file_path) + + @test length(tbl) == 10 + @test :ree_int in propertynames(tbl) + @test :ree_float_with_nulls in propertynames(tbl) + @test :ree_string in propertynames(tbl) + @test :ree_int16_ends in propertynames(tbl) + @test :ree_single_run in propertynames(tbl) + @test :ree_bool in propertynames(tbl) + + # Test integer column: [1,1,1,1,2,2,3,3,3,3] + col_int = tbl.ree_int + @test col_int isa Arrow.RunEndEncoded + @test length(col_int) == 10 + @test collect(col_int) == [1, 1, 1, 1, 2, 2, 3, 3, 3, 3] + @test col_int[1] == 1 + @test col_int[4] == 1 + @test col_int[5] == 2 + @test col_int[6] == 2 + @test col_int[7] == 3 + @test col_int[10] == 3 + + # Test float column with nulls: [1.0,1.0,1.0,1.0,null,null,2.0,2.0,2.0,2.0] + col_float = tbl.ree_float_with_nulls + @test col_float isa Arrow.RunEndEncoded + @test length(col_float) == 10 + @test col_float[1] == 1.0 + @test col_float[4] == 1.0 + @test ismissing(col_float[5]) + @test ismissing(col_float[6]) + @test col_float[7] == 2.0 + @test col_float[10] == 2.0 + + # Test string column: ["hello","hello","hello","world","world","foo","foo","foo","foo","foo"] + col_string = tbl.ree_string + @test col_string isa Arrow.RunEndEncoded + @test length(col_string) == 10 + @test col_string[1] == "hello" + @test col_string[3] == "hello" + @test col_string[4] == "world" + @test col_string[5] == "world" + @test col_string[6] == "foo" + @test col_string[10] == "foo" + + # Test Int16 run_ends: [100,100,100,100,100,200,200,200,200,200] + col_int16 = tbl.ree_int16_ends + @test col_int16 isa Arrow.RunEndEncoded + @test length(col_int16) == 10 + @test collect(col_int16) == [100, 100, 100, 100, 100, 200, 200, 200, 200, 200] + @test col_int16[1] == 100 + @test col_int16[5] == 100 + @test col_int16[6] == 200 + @test col_int16[10] == 200 + + # Test single run: [42,42,42,42,42,42,42,42,42,42] + col_single = tbl.ree_single_run + @test col_single isa Arrow.RunEndEncoded + @test length(col_single) == 10 + @test all(x -> x == 42, col_single) + @test collect(col_single) == fill(42, 10) + + # Test boolean: [True,True,True,False,False,True,True,True,True,True] + col_bool = tbl.ree_bool + @test col_bool isa Arrow.RunEndEncoded + @test length(col_bool) == 10 + @test col_bool[1] == true + @test col_bool[3] == true + @test col_bool[4] == false + @test col_bool[5] == false + @test col_bool[6] == true + @test col_bool[10] == true + end + + @testset "Julia round-trip tests" begin + # Test writing and reading back REE arrays + + @testset "Simple repeated values" begin + data = (col1=[1, 1, 1, 2, 2],) + io = Arrow.tobuffer(data) + tbl = Arrow.Table(io) + @test collect(tbl.col1) == [1, 1, 1, 2, 2] + end + + @testset "With missing values" begin + data = (col1=[1.0, 1.0, missing, missing, 2.0],) + io = Arrow.tobuffer(data) + tbl = Arrow.Table(io) + result = collect(tbl.col1) + @test result[1] == 1.0 + @test result[2] == 1.0 + @test ismissing(result[3]) + @test ismissing(result[4]) + @test result[5] == 2.0 + end + + @testset "String values" begin + data = (col1=["hello", "hello", "world", "world", "world"],) + io = Arrow.tobuffer(data) + tbl = Arrow.Table(io) + @test collect(tbl.col1) == ["hello", "hello", "world", "world", "world"] + end + + @testset "All same value (single run)" begin + data = (col1=fill(42, 100),) + io = Arrow.tobuffer(data) + tbl = Arrow.Table(io) + @test all(x -> x == 42, tbl.col1) + @test length(tbl.col1) == 100 + end + + @testset "Boolean values" begin + data = (col1=[true, true, false, false, true],) + io = Arrow.tobuffer(data) + tbl = Arrow.Table(io) + @test collect(tbl.col1) == [true, true, false, false, true] + end + end + + @testset "Edge cases" begin + @testset "Alternating values (no runs)" begin + # This is a worst case for REE - every value is different + data = (col1=[1, 2, 1, 2, 1, 2],) + io = Arrow.tobuffer(data) + tbl = Arrow.Table(io) + @test collect(tbl.col1) == [1, 2, 1, 2, 1, 2] + end + + @testset "Long runs" begin + data = (col1=vcat(fill(1, 1000), fill(2, 1000), fill(3, 1000)),) + io = Arrow.tobuffer(data) + tbl = Arrow.Table(io) + col = tbl.col1 + @test length(col) == 3000 + @test all(x -> x == 1, col[1:1000]) + @test all(x -> x == 2, col[1001:2000]) + @test all(x -> x == 3, col[2001:3000]) + end + end +end diff --git a/test/testtables.jl b/test/testtables.jl index 1ee54045..a71ded28 100644 --- a/test/testtables.jl +++ b/test/testtables.jl @@ -300,6 +300,35 @@ testtables = [ (convert=false,), nothing, ), + ( + "RunEndEncoded simple", + (col1=[1, 1, 1, 2, 2],), + NamedTuple(), + NamedTuple(), + function (tt) + @test tt.col1 isa Arrow.RunEndEncoded + @test collect(tt.col1) == [1, 1, 1, 2, 2] + @test length(tt.col1) == 5 + @test tt.col1[1] == 1 + @test tt.col1[3] == 1 + @test tt.col1[4] == 2 + @test tt.col1[5] == 2 + end, + ), + ( + "RunEndEncoded with nulls", + (col1=[1.0, 1.0, 1.0, 1.0, missing, missing, 2.0, 2.0, 2.0, 2.0],), + NamedTuple(), + NamedTuple(), + function (tt) + @test tt.col1 isa Arrow.RunEndEncoded + @test collect(tt.col1) == [1.0, 1.0, 1.0, 1.0, missing, missing, 2.0, 2.0, 2.0, 2.0] + @test length(tt.col1) == 10 + @test tt.col1[1] == 1.0 + @test ismissing(tt.col1[5]) + @test tt.col1[7] == 2.0 + end, + ), ]; function testtable(nm, t, writekw, readkw, extratests)