From d92e1144f116410bab19a6e01e7d1e1b039dfd0c Mon Sep 17 00:00:00 2001 From: Jorge Candeias Date: Fri, 13 Feb 2026 00:29:18 +0000 Subject: [PATCH 1/2] Add support for Run-End Encoded arrays (REE) in Arrow .NET Introduced RunEndEncodedType and RunEndEncodedArray classes to represent run-end encoded arrays, including validation and logical length calculation. Integrated REE support into ArrowArrayFactory and IPC serialization/deserialization (ArrowStreamWriter, ArrowReaderImplementation, ArrowTypeFlatbufferBuilder, MessageSerializer). Added unit tests for REE array creation, validation, serialization, and indexing. This enables efficient handling of consecutive runs of the same value in Arrow .NET. --- src/Apache.Arrow/Arrays/ArrowArrayFactory.cs | 2 + src/Apache.Arrow/Arrays/RunEndEncodedArray.cs | 223 ++++++++++++ .../Ipc/ArrowReaderImplementation.cs | 3 + src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 12 +- .../Ipc/ArrowTypeFlatbufferBuilder.cs | 11 +- src/Apache.Arrow/Ipc/MessageSerializer.cs | 6 + src/Apache.Arrow/Types/IArrowType.cs | 1 + src/Apache.Arrow/Types/RunEndEncodedType.cs | 89 +++++ .../RunEndEncodedArrayTests.cs | 322 ++++++++++++++++++ 9 files changed, 667 insertions(+), 2 deletions(-) create mode 100644 src/Apache.Arrow/Arrays/RunEndEncodedArray.cs create mode 100644 src/Apache.Arrow/Types/RunEndEncodedType.cs create mode 100644 test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs diff --git a/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs b/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs index 8aaba2d9..5552d59f 100644 --- a/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs +++ b/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs @@ -107,6 +107,8 @@ public static IArrowArray BuildArray(ArrayData data) return new FixedSizeListArray(data); case ArrowTypeId.Interval: return IntervalArray.Create(data); + case ArrowTypeId.RunEndEncoded: + return new RunEndEncodedArray(data); default: throw new NotSupportedException($"An ArrowArray cannot be built for type {data.DataType.TypeId}."); } diff --git a/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs b/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs new file mode 100644 index 00000000..1427dab8 --- /dev/null +++ b/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + /// + /// Represents a run-end encoded array. + /// A run-end encoded array stores consecutive runs of the same value more efficiently. + /// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any type). + /// The run_ends array stores the cumulative end positions of each run. + /// + public class RunEndEncodedArray : Array + { + /// + /// Gets the run ends array (Int16Array, Int32Array, or Int64Array). + /// This array contains the cumulative end indices for each run. + /// + public IArrowArray RunEnds { get; } + + /// + /// Gets the values array. + /// This array contains the actual values that are run-length encoded. + /// + public IArrowArray Values { get; } + + /// + /// Creates a new RunEndEncodedArray from ArrayData. + /// + /// The array data containing run ends and values as children. + public RunEndEncodedArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0]), ArrowArrayFactory.BuildArray(data.Children[1])) + { + } + + /// + /// Creates a new RunEndEncodedArray with specified run ends and values arrays. + /// + /// The run ends array (must be Int16Array, Int32Array, or Int64Array). + /// The values array (can be any type). + public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values) + : this(CreateArrayData(runEnds, values), runEnds, values) + { + } + + private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, IArrowArray values) + : base(data) + { + data.EnsureBufferCount(0); // REE arrays have no buffers, only children + data.EnsureDataType(ArrowTypeId.RunEndEncoded); + + ValidateRunEndsType(runEnds); + RunEnds = runEnds; + Values = values; + } + + private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray values) + { + ValidateRunEndsType(runEnds); + + // The logical length of a REE array is determined by the last value in run_ends + int logicalLength = GetLogicalLength(runEnds); + + var dataType = new RunEndEncodedType(runEnds.Data.DataType, values.Data.DataType); + + return new ArrayData( + dataType, + logicalLength, + nullCount: 0, // REE arrays don't have a validity bitmap + offset: 0, + buffers: [], + children: [runEnds.Data, values.Data]); + } + + private static void ValidateRunEndsType(IArrowArray runEnds) + { + ArrowTypeId typeId = runEnds.Data.DataType.TypeId; + if (typeId != ArrowTypeId.Int16 && + typeId != ArrowTypeId.Int32 && + typeId != ArrowTypeId.Int64) + { + throw new ArgumentException( + $"Run ends array must be Int16, Int32, or Int64, but got {typeId}", + nameof(runEnds)); + } + } + + private static int GetLogicalLength(IArrowArray runEnds) + { + if (runEnds.Length == 0) + { + return 0; + } + + // Get the last run end value which represents the logical length + switch (runEnds) + { + case Int16Array int16Array: + return int16Array.GetValue(int16Array.Length - 1) ?? 0; + case Int32Array int32Array: + return int32Array.GetValue(int32Array.Length - 1) ?? 0; + case Int64Array int64Array: + { + long? lastValue = int64Array.GetValue(int64Array.Length - 1); + if (lastValue.HasValue && lastValue.Value > int.MaxValue) + { + throw new ArgumentException("Run ends value exceeds maximum supported length."); + } + return (int)(lastValue ?? 0); + } + default: + throw new InvalidOperationException($"Unexpected run ends array type: {runEnds.GetType()}"); + } + } + + /// + /// Finds the physical index in the run_ends array that contains the specified logical index. + /// + /// The logical index in the decoded array. + /// The physical index in the run_ends/values arrays. + public int FindPhysicalIndex(int logicalIndex) + { + if (logicalIndex < 0 || logicalIndex >= Length) + { + throw new ArgumentOutOfRangeException(nameof(logicalIndex)); + } + + // Binary search to find the run that contains this logical index + return RunEnds switch + { + Int16Array int16Array => BinarySearchRunEnds(int16Array, logicalIndex), + Int32Array int32Array => BinarySearchRunEnds(int32Array, logicalIndex), + Int64Array int64Array => BinarySearchRunEnds(int64Array, logicalIndex), + _ => throw new InvalidOperationException($"Unexpected run ends array type: {RunEnds.GetType()}"), + }; + } + + private static int BinarySearchRunEnds(Int16Array runEnds, int logicalIndex) + { + int left = 0; + int right = runEnds.Length - 1; + + while (left < right) + { + int mid = left + (right - left) / 2; + int runEnd = runEnds.GetValue(mid) ?? 0; + + if (logicalIndex < runEnd) + { + right = mid; + } + else + { + left = mid + 1; + } + } + + return left; + } + + private static int BinarySearchRunEnds(Int32Array runEnds, int logicalIndex) + { + int left = 0; + int right = runEnds.Length - 1; + + while (left < right) + { + int mid = left + (right - left) / 2; + int runEnd = runEnds.GetValue(mid) ?? 0; + + if (logicalIndex < runEnd) + { + right = mid; + } + else + { + left = mid + 1; + } + } + + return left; + } + + private static int BinarySearchRunEnds(Int64Array runEnds, int logicalIndex) + { + int left = 0; + int right = runEnds.Length - 1; + + while (left < right) + { + int mid = left + (right - left) / 2; + long runEnd = runEnds.GetValue(mid) ?? 0; + + if (logicalIndex < runEnd) + { + right = mid; + } + else + { + left = mid + 1; + } + } + + return left; + } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + } +} diff --git a/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs b/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs index d5d3758d..3bb24abc 100644 --- a/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs +++ b/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs @@ -269,6 +269,9 @@ private ArrayData LoadField( { case ArrowTypeId.Null: return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, System.Array.Empty()); + case ArrowTypeId.RunEndEncoded: + buffers = 0; + break; case ArrowTypeId.Union: if (version < MetadataVersion.V5) { diff --git a/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index 6c58c154..479b8841 100644 --- a/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -73,7 +73,8 @@ private class ArrowRecordBatchFlatBufferBuilder : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, - IArrowArrayVisitor + IArrowArrayVisitor, + IArrowArrayVisitor { public readonly struct FieldNode { @@ -345,6 +346,15 @@ public void Visit(NullArray array) // There are no buffers for a NullArray } + public void Visit(RunEndEncodedArray array) + { + // REE arrays have no buffers at the top level, only child arrays + // Visit the run_ends array + VisitArray(array.RunEnds); + // Visit the values array + VisitArray(array.Values); + } + private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer valueOffsetsBuffer, int arrayOffset, int arrayLength) { var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * (arrayLength + 1)); diff --git a/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs b/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs index 503680a2..050c563e 100644 --- a/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs +++ b/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs @@ -81,7 +81,8 @@ class TypeVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, - IArrowTypeVisitor + IArrowTypeVisitor, + IArrowTypeVisitor { private FlatBufferBuilder Builder { get; } @@ -343,6 +344,14 @@ public void Visit(NullType type) Flatbuf.Null.EndNull(Builder)); } + public void Visit(RunEndEncodedType type) + { + Flatbuf.RunEndEncoded.StartRunEndEncoded(Builder); + Result = FieldType.Build( + Flatbuf.Type.RunEndEncoded, + Flatbuf.RunEndEncoded.EndRunEndEncoded(Builder)); + } + public void Visit(IArrowType type) { throw new NotImplementedException($"Cannot visit type {type}"); diff --git a/src/Apache.Arrow/Ipc/MessageSerializer.cs b/src/Apache.Arrow/Ipc/MessageSerializer.cs index 7c7f7a38..ab7f3a75 100644 --- a/src/Apache.Arrow/Ipc/MessageSerializer.cs +++ b/src/Apache.Arrow/Ipc/MessageSerializer.cs @@ -240,6 +240,12 @@ private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field, Field[] c } Flatbuf.Map meta = field.Type().Value; return new Types.MapType(childFields[0], meta.KeysSorted); + case Flatbuf.Type.RunEndEncoded: + if (childFields == null || childFields.Length != 2) + { + throw new InvalidDataException($"Run-end encoded type must have exactly two children (run_ends and values)."); + } + return new Types.RunEndEncodedType(childFields[0], childFields[1]); default: throw new InvalidDataException($"Arrow primitive '{field.TypeType}' is unsupported."); } diff --git a/src/Apache.Arrow/Types/IArrowType.cs b/src/Apache.Arrow/Types/IArrowType.cs index 657b234b..39e98a49 100644 --- a/src/Apache.Arrow/Types/IArrowType.cs +++ b/src/Apache.Arrow/Types/IArrowType.cs @@ -58,6 +58,7 @@ public enum ArrowTypeId LargeString, Decimal32, Decimal64, + RunEndEncoded, } public interface IArrowType diff --git a/src/Apache.Arrow/Types/RunEndEncodedType.cs b/src/Apache.Arrow/Types/RunEndEncodedType.cs new file mode 100644 index 00000000..e9840d60 --- /dev/null +++ b/src/Apache.Arrow/Types/RunEndEncodedType.cs @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; + +namespace Apache.Arrow.Types +{ + /// + /// Represents a run-end encoded array type. + /// Contains two child arrays: run_ends and values. + /// The run_ends child array must be a 16/32/64-bit signed integer array + /// which encodes the indices at which the run with the value in + /// each corresponding index in the values child array ends. + /// + public sealed class RunEndEncodedType : NestedType + { + public override ArrowTypeId TypeId => ArrowTypeId.RunEndEncoded; + public override string Name => "run_end_encoded"; + + /// + /// Gets the run ends field (must be Int16, Int32, or Int64). + /// + public Field RunEndsField => Fields[0]; + + /// + /// Gets the values field (can be any type). + /// + public Field ValuesField => Fields[1]; + + /// + /// Gets the data type of the run ends array. + /// + public IArrowType RunEndsDataType => RunEndsField.DataType; + + /// + /// Gets the data type of the values array. + /// + public IArrowType ValuesDataType => ValuesField.DataType; + + /// + /// Creates a new RunEndEncodedType with the specified run ends and values fields. + /// + /// The run ends field (must be Int16, Int32, or Int64). + /// The values field (can be any type). + public RunEndEncodedType(Field runEndsField, Field valuesField) + : base([runEndsField, valuesField]) + { + ValidateRunEndsType(runEndsField.DataType); + } + + /// + /// Creates a new RunEndEncodedType with the specified run ends and values data types. + /// Uses default field names "run_ends" and "values". + /// + /// The run ends data type (must be Int16, Int32, or Int64). + /// The values data type (can be any type). + public RunEndEncodedType(IArrowType runEndsDataType, IArrowType valuesDataType) + : this(new Field("run_ends", runEndsDataType, nullable: false), + new Field("values", valuesDataType, nullable: true)) + { + } + + private static void ValidateRunEndsType(IArrowType runEndsDataType) + { + if (runEndsDataType.TypeId != ArrowTypeId.Int16 && + runEndsDataType.TypeId != ArrowTypeId.Int32 && + runEndsDataType.TypeId != ArrowTypeId.Int64) + { + throw new ArgumentException( + $"Run ends type must be Int16, Int32, or Int64, but got {runEndsDataType.TypeId}", + nameof(runEndsDataType)); + } + } + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs b/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs new file mode 100644 index 00000000..d993ab39 --- /dev/null +++ b/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs @@ -0,0 +1,322 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.IO; +using Apache.Arrow.Ipc; +using Apache.Arrow.Types; +using Xunit; + +namespace Apache.Arrow.Tests +{ + public class RunEndEncodedArrayTests + { + [Fact] + public void TestRunEndEncodedTypeCreation() + { + // Test with explicit fields + var runEndsField = new Field("run_ends", Int32Type.Default, nullable: false); + var valuesField = new Field("values", StringType.Default, nullable: true); + var reeType = new RunEndEncodedType(runEndsField, valuesField); + + Assert.Equal(ArrowTypeId.RunEndEncoded, reeType.TypeId); + Assert.Equal("run_end_encoded", reeType.Name); + Assert.Equal(runEndsField, reeType.RunEndsField); + Assert.Equal(valuesField, reeType.ValuesField); + Assert.Equal(Int32Type.Default.TypeId, reeType.RunEndsDataType.TypeId); + Assert.Equal(StringType.Default.TypeId, reeType.ValuesDataType.TypeId); + } + + [Fact] + public void TestRunEndEncodedTypeCreationWithDataTypes() + { + // Test with data types (uses default field names) + var reeType = new RunEndEncodedType(Int32Type.Default, StringType.Default); + + Assert.Equal(ArrowTypeId.RunEndEncoded, reeType.TypeId); + Assert.Equal("run_ends", reeType.RunEndsField.Name); + Assert.Equal("values", reeType.ValuesField.Name); + } + + [Fact] + public void TestRunEndEncodedTypeValidation() + { + // Invalid run ends type (must be Int16, Int32, or Int64) + Assert.Throws(() => new RunEndEncodedType(Int8Type.Default, StringType.Default)); + Assert.Throws(() => new RunEndEncodedType(FloatType.Default, StringType.Default)); + Assert.Throws(() => new RunEndEncodedType(StringType.Default, StringType.Default)); + + // Valid run ends types + Assert.NotNull(new RunEndEncodedType(Int16Type.Default, StringType.Default)); // Should not throw + Assert.NotNull(new RunEndEncodedType(Int32Type.Default, StringType.Default)); // Should not throw + Assert.NotNull(new RunEndEncodedType(Int64Type.Default, StringType.Default)); // Should not throw + } + + [Fact] + public void TestRunEndEncodedArrayWithInt32RunEnds() + { + // Create run ends: [3, 7, 10, 15] + // This represents: 3 'A's, 4 'B's, 3 'C's, 5 'D's + var runEndsBuilder = new Int32Array.Builder(); + runEndsBuilder.AppendRange([3, 7, 10, 15]); + Int32Array runEnds = runEndsBuilder.Build(); + + // Create values: ['A', 'B', 'C', 'D'] + var valuesBuilder = new StringArray.Builder(); + valuesBuilder.AppendRange(["A", "B", "C", "D"]); + StringArray values = valuesBuilder.Build(); + + // Create REE array + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(15, reeArray.Length); // Logical length is the last run end value + Assert.Equal(0, reeArray.NullCount); // REE arrays don't have nulls at the top level + Assert.Equal(runEnds, reeArray.RunEnds); + Assert.Equal(values, reeArray.Values); + } + + [Fact] + public void TestRunEndEncodedArrayWithInt16RunEnds() + { + var runEndsBuilder = new Int16Array.Builder(); + runEndsBuilder.AppendRange([2, 5, 8]); + Int16Array runEnds = runEndsBuilder.Build(); + + var valuesBuilder = new Int32Array.Builder(); + valuesBuilder.AppendRange([100, 200, 300]); + Int32Array values = valuesBuilder.Build(); + + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(8, reeArray.Length); + Assert.Equal(runEnds, reeArray.RunEnds); + Assert.Equal(values, reeArray.Values); + } + + [Fact] + public void TestRunEndEncodedArrayWithInt64RunEnds() + { + var runEndsBuilder = new Int64Array.Builder(); + runEndsBuilder.AppendRange([1000, 2000, 3000]); + Int64Array runEnds = runEndsBuilder.Build(); + + var valuesBuilder = new DoubleArray.Builder(); + valuesBuilder.AppendRange([1.5, 2.5, 3.5]); + DoubleArray values = valuesBuilder.Build(); + + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(3000, reeArray.Length); + Assert.Equal(runEnds, reeArray.RunEnds); + Assert.Equal(values, reeArray.Values); + } + + [Fact] + public void TestRunEndEncodedArrayInvalidRunEndsType() + { + Int8Array invalidRunEnds = new Int8Array.Builder().AppendRange([1, 2, 3]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["A", "B", "C"]).Build(); + + Assert.Throws(() => new RunEndEncodedArray(invalidRunEnds, values)); + } + + [Fact] + public void TestRunEndEncodedArrayEmpty() + { + Int32Array runEnds = new Int32Array.Builder().Build(); + StringArray values = new StringArray.Builder().Build(); + + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(0, reeArray.Length); + } + + [Fact] + public void TestFindPhysicalIndexInt32() + { + // Run ends: [3, 7, 10, 15] means: + // Logical indices 0-2 map to physical index 0 (value 'A') + // Logical indices 3-6 map to physical index 1 (value 'B') + // Logical indices 7-9 map to physical index 2 (value 'C') + // Logical indices 10-14 map to physical index 3 (value 'D') + Int32Array runEnds = new Int32Array.Builder() + .AppendRange([3, 7, 10, 15]) + .Build(); + StringArray values = new StringArray.Builder() + .AppendRange(["A", "B", "C", "D"]) + .Build(); + + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(0, reeArray.FindPhysicalIndex(0)); + Assert.Equal(0, reeArray.FindPhysicalIndex(1)); + Assert.Equal(0, reeArray.FindPhysicalIndex(2)); + Assert.Equal(1, reeArray.FindPhysicalIndex(3)); + Assert.Equal(1, reeArray.FindPhysicalIndex(4)); + Assert.Equal(1, reeArray.FindPhysicalIndex(5)); + Assert.Equal(1, reeArray.FindPhysicalIndex(6)); + Assert.Equal(2, reeArray.FindPhysicalIndex(7)); + Assert.Equal(2, reeArray.FindPhysicalIndex(8)); + Assert.Equal(2, reeArray.FindPhysicalIndex(9)); + Assert.Equal(3, reeArray.FindPhysicalIndex(10)); + Assert.Equal(3, reeArray.FindPhysicalIndex(11)); + Assert.Equal(3, reeArray.FindPhysicalIndex(14)); + } + + [Fact] + public void TestFindPhysicalIndexOutOfRange() + { + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["A", "B"]).Build(); + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Throws(() => reeArray.FindPhysicalIndex(-1)); + Assert.Throws(() => reeArray.FindPhysicalIndex(7)); + Assert.Throws(() => reeArray.FindPhysicalIndex(100)); + } + + [Fact] + public void TestRunEndEncodedArraySerialization() + { + // Create a REE array + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7, 10]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["foo", "bar", "baz"]).Build(); + var reeArray = new RunEndEncodedArray(runEnds, values); + + // Create a record batch with the REE array + var reeField = new Field("ree_column", reeArray.Data.DataType, nullable: false); + var schema = new Schema([reeField], null); + var recordBatch = new RecordBatch(schema, [reeArray], reeArray.Length); + + // Serialize and deserialize + using var stream = new MemoryStream(); + using (var writer = new ArrowStreamWriter(stream, schema, leaveOpen: true)) + { + writer.WriteRecordBatch(recordBatch); + writer.WriteEnd(); + } + + stream.Position = 0; + + using var reader = new ArrowStreamReader(stream); + RecordBatch readBatch = reader.ReadNextRecordBatch(); + + Assert.NotNull(readBatch); + Assert.Equal(1, readBatch.ColumnCount); + Assert.Equal(10, readBatch.Length); + + var readArray = readBatch.Column(0) as RunEndEncodedArray; + Assert.NotNull(readArray); + Assert.Equal(10, readArray.Length); + Assert.Equal(ArrowTypeId.RunEndEncoded, readArray.Data.DataType.TypeId); + + // Verify run ends + var readRunEnds = readArray.RunEnds as Int32Array; + Assert.NotNull(readRunEnds); + Assert.Equal(3, readRunEnds.Length); + Assert.Equal(3, readRunEnds.GetValue(0)); + Assert.Equal(7, readRunEnds.GetValue(1)); + Assert.Equal(10, readRunEnds.GetValue(2)); + + // Verify values + var readValues = readArray.Values as StringArray; + Assert.NotNull(readValues); + Assert.Equal(3, readValues.Length); + Assert.Equal("foo", readValues.GetString(0)); + Assert.Equal("bar", readValues.GetString(1)); + Assert.Equal("baz", readValues.GetString(2)); + } + + [Fact] + public void TestRunEndEncodedArrayWithDifferentValueTypes() + { + // Test with boolean values + Int32Array runEnds1 = new Int32Array.Builder().AppendRange([5, 10]).Build(); + BooleanArray values1 = new BooleanArray.Builder().AppendRange([true, false]).Build(); + var reeArray1 = new RunEndEncodedArray(runEnds1, values1); + Assert.Equal(10, reeArray1.Length); + + // Test with double values + Int32Array runEnds2 = new Int32Array.Builder().AppendRange([3, 8]).Build(); + DoubleArray values2 = new DoubleArray.Builder().AppendRange([1.5, 2.5]).Build(); + var reeArray2 = new RunEndEncodedArray(runEnds2, values2); + Assert.Equal(8, reeArray2.Length); + + // Test with list values + var listBuilder = new ListArray.Builder(Int32Type.Default); + var int32Builder = (Int32Array.Builder)listBuilder.ValueBuilder; + listBuilder.Append(); + int32Builder.Append(1); + int32Builder.Append(2); + listBuilder.Append(); + int32Builder.Append(3); + int32Builder.Append(4); + ListArray listValues = listBuilder.Build(); + + Int32Array runEnds3 = new Int32Array.Builder().AppendRange([2, 5]).Build(); + var reeArray3 = new RunEndEncodedArray(runEnds3, listValues); + Assert.Equal(5, reeArray3.Length); + } + + [Fact] + public void TestRunEndEncodedArrayFromArrayData() + { + // Create arrays + Int32Array runEnds = new Int32Array.Builder().AppendRange([2, 5]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["X", "Y"]).Build(); + + // Create ArrayData manually + var reeType = new RunEndEncodedType(Int32Type.Default, StringType.Default); + var arrayData = new ArrayData( + reeType, + length: 5, + nullCount: 0, + offset: 0, + buffers: [], + children: [runEnds.Data, values.Data]); + + // Create REE array from ArrayData + var reeArray = new RunEndEncodedArray(arrayData); + + Assert.Equal(5, reeArray.Length); + Assert.Equal(0, reeArray.NullCount); + Assert.IsType(reeArray.RunEnds); + Assert.IsType(reeArray.Values); + } + + [Fact] + public void TestRunEndEncodedArrayFactoryBuild() + { + // Test that ArrowArrayFactory can build REE arrays + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 6]).Build(); + Int64Array values = new Int64Array.Builder().AppendRange([100, 200]).Build(); + + var reeType = new RunEndEncodedType(Int32Type.Default, Int64Type.Default); + var arrayData = new ArrayData( + reeType, + length: 6, + nullCount: 0, + offset: 0, + buffers: [], + children: [runEnds.Data, values.Data]); + + IArrowArray array = ArrowArrayFactory.BuildArray(arrayData); + + Assert.IsType(array); + var reeArray = (RunEndEncodedArray)array; + Assert.Equal(6, reeArray.Length); + } + } +} From df1f57605e31d8347a1159f8ea1ccf4eadd095a1 Mon Sep 17 00:00:00 2001 From: Jorge Candeias Date: Fri, 13 Feb 2026 18:40:56 +0000 Subject: [PATCH 2/2] Cleanup --- src/Apache.Arrow/Arrays/RunEndEncodedArray.cs | 323 ++++++----- src/Apache.Arrow/Types/RunEndEncodedType.cs | 117 ++-- .../RunEndEncodedArrayTests.cs | 541 +++++++++--------- 3 files changed, 489 insertions(+), 492 deletions(-) diff --git a/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs b/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs index 1427dab8..08e2683d 100644 --- a/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs +++ b/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs @@ -16,208 +16,207 @@ using System; using Apache.Arrow.Types; -namespace Apache.Arrow +namespace Apache.Arrow; + +/// +/// Represents a run-end encoded array. +/// A run-end encoded array stores consecutive runs of the same value more efficiently. +/// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any type). +/// The run_ends array stores the cumulative end positions of each run. +/// +public class RunEndEncodedArray : Array { /// - /// Represents a run-end encoded array. - /// A run-end encoded array stores consecutive runs of the same value more efficiently. - /// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any type). - /// The run_ends array stores the cumulative end positions of each run. + /// Gets the run ends array (Int16Array, Int32Array, or Int64Array). + /// This array contains the cumulative end indices for each run. /// - public class RunEndEncodedArray : Array + public IArrowArray RunEnds { get; } + + /// + /// Gets the values array. + /// This array contains the actual values that are run-length encoded. + /// + public IArrowArray Values { get; } + + /// + /// Creates a new RunEndEncodedArray from ArrayData. + /// + /// The array data containing run ends and values as children. + public RunEndEncodedArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0]), ArrowArrayFactory.BuildArray(data.Children[1])) + { + } + + /// + /// Creates a new RunEndEncodedArray with specified run ends and values arrays. + /// + /// The run ends array (must be Int16Array, Int32Array, or Int64Array). + /// The values array (can be any type). + public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values) + : this(CreateArrayData(runEnds, values), runEnds, values) + { + } + + private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, IArrowArray values) + : base(data) + { + data.EnsureBufferCount(0); // REE arrays have no buffers, only children + data.EnsureDataType(ArrowTypeId.RunEndEncoded); + + ValidateRunEndsType(runEnds); + RunEnds = runEnds; + Values = values; + } + + private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray values) + { + ValidateRunEndsType(runEnds); + + // The logical length of a REE array is determined by the last value in run_ends + int logicalLength = GetLogicalLength(runEnds); + + var dataType = new RunEndEncodedType(runEnds.Data.DataType, values.Data.DataType); + + return new ArrayData( + dataType, + logicalLength, + nullCount: 0, // REE arrays don't have a validity bitmap + offset: 0, + buffers: [], + children: [runEnds.Data, values.Data]); + } + + private static void ValidateRunEndsType(IArrowArray runEnds) { - /// - /// Gets the run ends array (Int16Array, Int32Array, or Int64Array). - /// This array contains the cumulative end indices for each run. - /// - public IArrowArray RunEnds { get; } - - /// - /// Gets the values array. - /// This array contains the actual values that are run-length encoded. - /// - public IArrowArray Values { get; } - - /// - /// Creates a new RunEndEncodedArray from ArrayData. - /// - /// The array data containing run ends and values as children. - public RunEndEncodedArray(ArrayData data) - : this(data, ArrowArrayFactory.BuildArray(data.Children[0]), ArrowArrayFactory.BuildArray(data.Children[1])) + ArrowTypeId typeId = runEnds.Data.DataType.TypeId; + if (typeId != ArrowTypeId.Int16 && + typeId != ArrowTypeId.Int32 && + typeId != ArrowTypeId.Int64) { + throw new ArgumentException( + $"Run ends array must be Int16, Int32, or Int64, but got {typeId}", + nameof(runEnds)); } + } - /// - /// Creates a new RunEndEncodedArray with specified run ends and values arrays. - /// - /// The run ends array (must be Int16Array, Int32Array, or Int64Array). - /// The values array (can be any type). - public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values) - : this(CreateArrayData(runEnds, values), runEnds, values) + private static int GetLogicalLength(IArrowArray runEnds) + { + if (runEnds.Length == 0) { + return 0; } - private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, IArrowArray values) - : base(data) + // Get the last run end value which represents the logical length + switch (runEnds) { - data.EnsureBufferCount(0); // REE arrays have no buffers, only children - data.EnsureDataType(ArrowTypeId.RunEndEncoded); - - ValidateRunEndsType(runEnds); - RunEnds = runEnds; - Values = values; + case Int16Array int16Array: + return int16Array.GetValue(int16Array.Length - 1) ?? 0; + case Int32Array int32Array: + return int32Array.GetValue(int32Array.Length - 1) ?? 0; + case Int64Array int64Array: + { + long? lastValue = int64Array.GetValue(int64Array.Length - 1); + if (lastValue.HasValue && lastValue.Value > int.MaxValue) + { + throw new ArgumentException("Run ends value exceeds maximum supported length."); + } + return (int)(lastValue ?? 0); + } + default: + throw new InvalidOperationException($"Unexpected run ends array type: {runEnds.GetType()}"); } + } - private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray values) + /// + /// Finds the physical index in the run_ends array that contains the specified logical index. + /// + /// The logical index in the decoded array. + /// The physical index in the run_ends/values arrays. + public int FindPhysicalIndex(int logicalIndex) + { + if (logicalIndex < 0 || logicalIndex >= Length) { - ValidateRunEndsType(runEnds); - - // The logical length of a REE array is determined by the last value in run_ends - int logicalLength = GetLogicalLength(runEnds); - - var dataType = new RunEndEncodedType(runEnds.Data.DataType, values.Data.DataType); - - return new ArrayData( - dataType, - logicalLength, - nullCount: 0, // REE arrays don't have a validity bitmap - offset: 0, - buffers: [], - children: [runEnds.Data, values.Data]); + throw new ArgumentOutOfRangeException(nameof(logicalIndex)); } - private static void ValidateRunEndsType(IArrowArray runEnds) + // Binary search to find the run that contains this logical index + return RunEnds switch { - ArrowTypeId typeId = runEnds.Data.DataType.TypeId; - if (typeId != ArrowTypeId.Int16 && - typeId != ArrowTypeId.Int32 && - typeId != ArrowTypeId.Int64) - { - throw new ArgumentException( - $"Run ends array must be Int16, Int32, or Int64, but got {typeId}", - nameof(runEnds)); - } - } + Int16Array int16Array => BinarySearchRunEnds(int16Array, logicalIndex), + Int32Array int32Array => BinarySearchRunEnds(int32Array, logicalIndex), + Int64Array int64Array => BinarySearchRunEnds(int64Array, logicalIndex), + _ => throw new InvalidOperationException($"Unexpected run ends array type: {RunEnds.GetType()}"), + }; + } + + private static int BinarySearchRunEnds(Int16Array runEnds, int logicalIndex) + { + int left = 0; + int right = runEnds.Length - 1; - private static int GetLogicalLength(IArrowArray runEnds) + while (left < right) { - if (runEnds.Length == 0) + int mid = left + (right - left) / 2; + int runEnd = runEnds.GetValue(mid) ?? 0; + + if (logicalIndex < runEnd) { - return 0; + right = mid; } - - // Get the last run end value which represents the logical length - switch (runEnds) + else { - case Int16Array int16Array: - return int16Array.GetValue(int16Array.Length - 1) ?? 0; - case Int32Array int32Array: - return int32Array.GetValue(int32Array.Length - 1) ?? 0; - case Int64Array int64Array: - { - long? lastValue = int64Array.GetValue(int64Array.Length - 1); - if (lastValue.HasValue && lastValue.Value > int.MaxValue) - { - throw new ArgumentException("Run ends value exceeds maximum supported length."); - } - return (int)(lastValue ?? 0); - } - default: - throw new InvalidOperationException($"Unexpected run ends array type: {runEnds.GetType()}"); + left = mid + 1; } } - /// - /// Finds the physical index in the run_ends array that contains the specified logical index. - /// - /// The logical index in the decoded array. - /// The physical index in the run_ends/values arrays. - public int FindPhysicalIndex(int logicalIndex) - { - if (logicalIndex < 0 || logicalIndex >= Length) - { - throw new ArgumentOutOfRangeException(nameof(logicalIndex)); - } + return left; + } - // Binary search to find the run that contains this logical index - return RunEnds switch - { - Int16Array int16Array => BinarySearchRunEnds(int16Array, logicalIndex), - Int32Array int32Array => BinarySearchRunEnds(int32Array, logicalIndex), - Int64Array int64Array => BinarySearchRunEnds(int64Array, logicalIndex), - _ => throw new InvalidOperationException($"Unexpected run ends array type: {RunEnds.GetType()}"), - }; - } + private static int BinarySearchRunEnds(Int32Array runEnds, int logicalIndex) + { + int left = 0; + int right = runEnds.Length - 1; - private static int BinarySearchRunEnds(Int16Array runEnds, int logicalIndex) + while (left < right) { - int left = 0; - int right = runEnds.Length - 1; + int mid = left + (right - left) / 2; + int runEnd = runEnds.GetValue(mid) ?? 0; - while (left < right) + if (logicalIndex < runEnd) { - int mid = left + (right - left) / 2; - int runEnd = runEnds.GetValue(mid) ?? 0; - - if (logicalIndex < runEnd) - { - right = mid; - } - else - { - left = mid + 1; - } + right = mid; } - - return left; - } - - private static int BinarySearchRunEnds(Int32Array runEnds, int logicalIndex) - { - int left = 0; - int right = runEnds.Length - 1; - - while (left < right) + else { - int mid = left + (right - left) / 2; - int runEnd = runEnds.GetValue(mid) ?? 0; - - if (logicalIndex < runEnd) - { - right = mid; - } - else - { - left = mid + 1; - } + left = mid + 1; } - - return left; } - private static int BinarySearchRunEnds(Int64Array runEnds, int logicalIndex) + return left; + } + + private static int BinarySearchRunEnds(Int64Array runEnds, int logicalIndex) + { + int left = 0; + int right = runEnds.Length - 1; + + while (left < right) { - int left = 0; - int right = runEnds.Length - 1; + int mid = left + (right - left) / 2; + long runEnd = runEnds.GetValue(mid) ?? 0; - while (left < right) + if (logicalIndex < runEnd) { - int mid = left + (right - left) / 2; - long runEnd = runEnds.GetValue(mid) ?? 0; - - if (logicalIndex < runEnd) - { - right = mid; - } - else - { - left = mid + 1; - } + right = mid; + } + else + { + left = mid + 1; } - - return left; } - public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + return left; } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); } diff --git a/src/Apache.Arrow/Types/RunEndEncodedType.cs b/src/Apache.Arrow/Types/RunEndEncodedType.cs index e9840d60..f8ed4c40 100644 --- a/src/Apache.Arrow/Types/RunEndEncodedType.cs +++ b/src/Apache.Arrow/Types/RunEndEncodedType.cs @@ -15,75 +15,74 @@ using System; -namespace Apache.Arrow.Types +namespace Apache.Arrow.Types; + +/// +/// Represents a run-end encoded array type. +/// Contains two child arrays: run_ends and values. +/// The run_ends child array must be a 16/32/64-bit signed integer array +/// which encodes the indices at which the run with the value in +/// each corresponding index in the values child array ends. +/// +public sealed class RunEndEncodedType : NestedType { + public override ArrowTypeId TypeId => ArrowTypeId.RunEndEncoded; + public override string Name => "run_end_encoded"; + /// - /// Represents a run-end encoded array type. - /// Contains two child arrays: run_ends and values. - /// The run_ends child array must be a 16/32/64-bit signed integer array - /// which encodes the indices at which the run with the value in - /// each corresponding index in the values child array ends. + /// Gets the run ends field (must be Int16, Int32, or Int64). /// - public sealed class RunEndEncodedType : NestedType - { - public override ArrowTypeId TypeId => ArrowTypeId.RunEndEncoded; - public override string Name => "run_end_encoded"; - - /// - /// Gets the run ends field (must be Int16, Int32, or Int64). - /// - public Field RunEndsField => Fields[0]; + public Field RunEndsField => Fields[0]; - /// - /// Gets the values field (can be any type). - /// - public Field ValuesField => Fields[1]; + /// + /// Gets the values field (can be any type). + /// + public Field ValuesField => Fields[1]; - /// - /// Gets the data type of the run ends array. - /// - public IArrowType RunEndsDataType => RunEndsField.DataType; + /// + /// Gets the data type of the run ends array. + /// + public IArrowType RunEndsDataType => RunEndsField.DataType; - /// - /// Gets the data type of the values array. - /// - public IArrowType ValuesDataType => ValuesField.DataType; + /// + /// Gets the data type of the values array. + /// + public IArrowType ValuesDataType => ValuesField.DataType; - /// - /// Creates a new RunEndEncodedType with the specified run ends and values fields. - /// - /// The run ends field (must be Int16, Int32, or Int64). - /// The values field (can be any type). - public RunEndEncodedType(Field runEndsField, Field valuesField) - : base([runEndsField, valuesField]) - { - ValidateRunEndsType(runEndsField.DataType); - } + /// + /// Creates a new RunEndEncodedType with the specified run ends and values fields. + /// + /// The run ends field (must be Int16, Int32, or Int64). + /// The values field (can be any type). + public RunEndEncodedType(Field runEndsField, Field valuesField) + : base([runEndsField, valuesField]) + { + ValidateRunEndsType(runEndsField.DataType); + } - /// - /// Creates a new RunEndEncodedType with the specified run ends and values data types. - /// Uses default field names "run_ends" and "values". - /// - /// The run ends data type (must be Int16, Int32, or Int64). - /// The values data type (can be any type). - public RunEndEncodedType(IArrowType runEndsDataType, IArrowType valuesDataType) - : this(new Field("run_ends", runEndsDataType, nullable: false), - new Field("values", valuesDataType, nullable: true)) - { - } + /// + /// Creates a new RunEndEncodedType with the specified run ends and values data types. + /// Uses default field names "run_ends" and "values". + /// + /// The run ends data type (must be Int16, Int32, or Int64). + /// The values data type (can be any type). + public RunEndEncodedType(IArrowType runEndsDataType, IArrowType valuesDataType) + : this(new Field("run_ends", runEndsDataType, nullable: false), + new Field("values", valuesDataType, nullable: true)) + { + } - private static void ValidateRunEndsType(IArrowType runEndsDataType) + private static void ValidateRunEndsType(IArrowType runEndsDataType) + { + if (runEndsDataType.TypeId != ArrowTypeId.Int16 && + runEndsDataType.TypeId != ArrowTypeId.Int32 && + runEndsDataType.TypeId != ArrowTypeId.Int64) { - if (runEndsDataType.TypeId != ArrowTypeId.Int16 && - runEndsDataType.TypeId != ArrowTypeId.Int32 && - runEndsDataType.TypeId != ArrowTypeId.Int64) - { - throw new ArgumentException( - $"Run ends type must be Int16, Int32, or Int64, but got {runEndsDataType.TypeId}", - nameof(runEndsDataType)); - } + throw new ArgumentException( + $"Run ends type must be Int16, Int32, or Int64, but got {runEndsDataType.TypeId}", + nameof(runEndsDataType)); } - - public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } diff --git a/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs b/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs index d993ab39..669aaac5 100644 --- a/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs +++ b/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs @@ -19,304 +19,303 @@ using Apache.Arrow.Types; using Xunit; -namespace Apache.Arrow.Tests +namespace Apache.Arrow.Tests; + +public class RunEndEncodedArrayTests { - public class RunEndEncodedArrayTests + [Fact] + public void TestRunEndEncodedTypeCreation() { - [Fact] - public void TestRunEndEncodedTypeCreation() - { - // Test with explicit fields - var runEndsField = new Field("run_ends", Int32Type.Default, nullable: false); - var valuesField = new Field("values", StringType.Default, nullable: true); - var reeType = new RunEndEncodedType(runEndsField, valuesField); - - Assert.Equal(ArrowTypeId.RunEndEncoded, reeType.TypeId); - Assert.Equal("run_end_encoded", reeType.Name); - Assert.Equal(runEndsField, reeType.RunEndsField); - Assert.Equal(valuesField, reeType.ValuesField); - Assert.Equal(Int32Type.Default.TypeId, reeType.RunEndsDataType.TypeId); - Assert.Equal(StringType.Default.TypeId, reeType.ValuesDataType.TypeId); - } + // Test with explicit fields + var runEndsField = new Field("run_ends", Int32Type.Default, nullable: false); + var valuesField = new Field("values", StringType.Default, nullable: true); + var reeType = new RunEndEncodedType(runEndsField, valuesField); + + Assert.Equal(ArrowTypeId.RunEndEncoded, reeType.TypeId); + Assert.Equal("run_end_encoded", reeType.Name); + Assert.Equal(runEndsField, reeType.RunEndsField); + Assert.Equal(valuesField, reeType.ValuesField); + Assert.Equal(Int32Type.Default.TypeId, reeType.RunEndsDataType.TypeId); + Assert.Equal(StringType.Default.TypeId, reeType.ValuesDataType.TypeId); + } - [Fact] - public void TestRunEndEncodedTypeCreationWithDataTypes() - { - // Test with data types (uses default field names) - var reeType = new RunEndEncodedType(Int32Type.Default, StringType.Default); + [Fact] + public void TestRunEndEncodedTypeCreationWithDataTypes() + { + // Test with data types (uses default field names) + var reeType = new RunEndEncodedType(Int32Type.Default, StringType.Default); - Assert.Equal(ArrowTypeId.RunEndEncoded, reeType.TypeId); - Assert.Equal("run_ends", reeType.RunEndsField.Name); - Assert.Equal("values", reeType.ValuesField.Name); - } + Assert.Equal(ArrowTypeId.RunEndEncoded, reeType.TypeId); + Assert.Equal("run_ends", reeType.RunEndsField.Name); + Assert.Equal("values", reeType.ValuesField.Name); + } - [Fact] - public void TestRunEndEncodedTypeValidation() - { - // Invalid run ends type (must be Int16, Int32, or Int64) - Assert.Throws(() => new RunEndEncodedType(Int8Type.Default, StringType.Default)); - Assert.Throws(() => new RunEndEncodedType(FloatType.Default, StringType.Default)); - Assert.Throws(() => new RunEndEncodedType(StringType.Default, StringType.Default)); - - // Valid run ends types - Assert.NotNull(new RunEndEncodedType(Int16Type.Default, StringType.Default)); // Should not throw - Assert.NotNull(new RunEndEncodedType(Int32Type.Default, StringType.Default)); // Should not throw - Assert.NotNull(new RunEndEncodedType(Int64Type.Default, StringType.Default)); // Should not throw - } + [Fact] + public void TestRunEndEncodedTypeValidation() + { + // Invalid run ends type (must be Int16, Int32, or Int64) + Assert.Throws(() => new RunEndEncodedType(Int8Type.Default, StringType.Default)); + Assert.Throws(() => new RunEndEncodedType(FloatType.Default, StringType.Default)); + Assert.Throws(() => new RunEndEncodedType(StringType.Default, StringType.Default)); + + // Valid run ends types + Assert.NotNull(new RunEndEncodedType(Int16Type.Default, StringType.Default)); // Should not throw + Assert.NotNull(new RunEndEncodedType(Int32Type.Default, StringType.Default)); // Should not throw + Assert.NotNull(new RunEndEncodedType(Int64Type.Default, StringType.Default)); // Should not throw + } - [Fact] - public void TestRunEndEncodedArrayWithInt32RunEnds() - { - // Create run ends: [3, 7, 10, 15] - // This represents: 3 'A's, 4 'B's, 3 'C's, 5 'D's - var runEndsBuilder = new Int32Array.Builder(); - runEndsBuilder.AppendRange([3, 7, 10, 15]); - Int32Array runEnds = runEndsBuilder.Build(); - - // Create values: ['A', 'B', 'C', 'D'] - var valuesBuilder = new StringArray.Builder(); - valuesBuilder.AppendRange(["A", "B", "C", "D"]); - StringArray values = valuesBuilder.Build(); - - // Create REE array - var reeArray = new RunEndEncodedArray(runEnds, values); - - Assert.Equal(15, reeArray.Length); // Logical length is the last run end value - Assert.Equal(0, reeArray.NullCount); // REE arrays don't have nulls at the top level - Assert.Equal(runEnds, reeArray.RunEnds); - Assert.Equal(values, reeArray.Values); - } + [Fact] + public void TestRunEndEncodedArrayWithInt32RunEnds() + { + // Create run ends: [3, 7, 10, 15] + // This represents: 3 'A's, 4 'B's, 3 'C's, 5 'D's + var runEndsBuilder = new Int32Array.Builder(); + runEndsBuilder.AppendRange([3, 7, 10, 15]); + Int32Array runEnds = runEndsBuilder.Build(); + + // Create values: ['A', 'B', 'C', 'D'] + var valuesBuilder = new StringArray.Builder(); + valuesBuilder.AppendRange(["A", "B", "C", "D"]); + StringArray values = valuesBuilder.Build(); + + // Create REE array + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(15, reeArray.Length); // Logical length is the last run end value + Assert.Equal(0, reeArray.NullCount); // REE arrays don't have nulls at the top level + Assert.Equal(runEnds, reeArray.RunEnds); + Assert.Equal(values, reeArray.Values); + } - [Fact] - public void TestRunEndEncodedArrayWithInt16RunEnds() - { - var runEndsBuilder = new Int16Array.Builder(); - runEndsBuilder.AppendRange([2, 5, 8]); - Int16Array runEnds = runEndsBuilder.Build(); + [Fact] + public void TestRunEndEncodedArrayWithInt16RunEnds() + { + var runEndsBuilder = new Int16Array.Builder(); + runEndsBuilder.AppendRange([2, 5, 8]); + Int16Array runEnds = runEndsBuilder.Build(); - var valuesBuilder = new Int32Array.Builder(); - valuesBuilder.AppendRange([100, 200, 300]); - Int32Array values = valuesBuilder.Build(); + var valuesBuilder = new Int32Array.Builder(); + valuesBuilder.AppendRange([100, 200, 300]); + Int32Array values = valuesBuilder.Build(); - var reeArray = new RunEndEncodedArray(runEnds, values); + var reeArray = new RunEndEncodedArray(runEnds, values); - Assert.Equal(8, reeArray.Length); - Assert.Equal(runEnds, reeArray.RunEnds); - Assert.Equal(values, reeArray.Values); - } + Assert.Equal(8, reeArray.Length); + Assert.Equal(runEnds, reeArray.RunEnds); + Assert.Equal(values, reeArray.Values); + } - [Fact] - public void TestRunEndEncodedArrayWithInt64RunEnds() - { - var runEndsBuilder = new Int64Array.Builder(); - runEndsBuilder.AppendRange([1000, 2000, 3000]); - Int64Array runEnds = runEndsBuilder.Build(); + [Fact] + public void TestRunEndEncodedArrayWithInt64RunEnds() + { + var runEndsBuilder = new Int64Array.Builder(); + runEndsBuilder.AppendRange([1000, 2000, 3000]); + Int64Array runEnds = runEndsBuilder.Build(); - var valuesBuilder = new DoubleArray.Builder(); - valuesBuilder.AppendRange([1.5, 2.5, 3.5]); - DoubleArray values = valuesBuilder.Build(); + var valuesBuilder = new DoubleArray.Builder(); + valuesBuilder.AppendRange([1.5, 2.5, 3.5]); + DoubleArray values = valuesBuilder.Build(); - var reeArray = new RunEndEncodedArray(runEnds, values); + var reeArray = new RunEndEncodedArray(runEnds, values); - Assert.Equal(3000, reeArray.Length); - Assert.Equal(runEnds, reeArray.RunEnds); - Assert.Equal(values, reeArray.Values); - } + Assert.Equal(3000, reeArray.Length); + Assert.Equal(runEnds, reeArray.RunEnds); + Assert.Equal(values, reeArray.Values); + } - [Fact] - public void TestRunEndEncodedArrayInvalidRunEndsType() - { - Int8Array invalidRunEnds = new Int8Array.Builder().AppendRange([1, 2, 3]).Build(); - StringArray values = new StringArray.Builder().AppendRange(["A", "B", "C"]).Build(); + [Fact] + public void TestRunEndEncodedArrayInvalidRunEndsType() + { + Int8Array invalidRunEnds = new Int8Array.Builder().AppendRange([1, 2, 3]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["A", "B", "C"]).Build(); - Assert.Throws(() => new RunEndEncodedArray(invalidRunEnds, values)); - } + Assert.Throws(() => new RunEndEncodedArray(invalidRunEnds, values)); + } - [Fact] - public void TestRunEndEncodedArrayEmpty() - { - Int32Array runEnds = new Int32Array.Builder().Build(); - StringArray values = new StringArray.Builder().Build(); + [Fact] + public void TestRunEndEncodedArrayEmpty() + { + Int32Array runEnds = new Int32Array.Builder().Build(); + StringArray values = new StringArray.Builder().Build(); - var reeArray = new RunEndEncodedArray(runEnds, values); + var reeArray = new RunEndEncodedArray(runEnds, values); - Assert.Equal(0, reeArray.Length); - } + Assert.Equal(0, reeArray.Length); + } - [Fact] - public void TestFindPhysicalIndexInt32() - { - // Run ends: [3, 7, 10, 15] means: - // Logical indices 0-2 map to physical index 0 (value 'A') - // Logical indices 3-6 map to physical index 1 (value 'B') - // Logical indices 7-9 map to physical index 2 (value 'C') - // Logical indices 10-14 map to physical index 3 (value 'D') - Int32Array runEnds = new Int32Array.Builder() - .AppendRange([3, 7, 10, 15]) - .Build(); - StringArray values = new StringArray.Builder() - .AppendRange(["A", "B", "C", "D"]) - .Build(); - - var reeArray = new RunEndEncodedArray(runEnds, values); - - Assert.Equal(0, reeArray.FindPhysicalIndex(0)); - Assert.Equal(0, reeArray.FindPhysicalIndex(1)); - Assert.Equal(0, reeArray.FindPhysicalIndex(2)); - Assert.Equal(1, reeArray.FindPhysicalIndex(3)); - Assert.Equal(1, reeArray.FindPhysicalIndex(4)); - Assert.Equal(1, reeArray.FindPhysicalIndex(5)); - Assert.Equal(1, reeArray.FindPhysicalIndex(6)); - Assert.Equal(2, reeArray.FindPhysicalIndex(7)); - Assert.Equal(2, reeArray.FindPhysicalIndex(8)); - Assert.Equal(2, reeArray.FindPhysicalIndex(9)); - Assert.Equal(3, reeArray.FindPhysicalIndex(10)); - Assert.Equal(3, reeArray.FindPhysicalIndex(11)); - Assert.Equal(3, reeArray.FindPhysicalIndex(14)); - } + [Fact] + public void TestFindPhysicalIndexInt32() + { + // Run ends: [3, 7, 10, 15] means: + // Logical indices 0-2 map to physical index 0 (value 'A') + // Logical indices 3-6 map to physical index 1 (value 'B') + // Logical indices 7-9 map to physical index 2 (value 'C') + // Logical indices 10-14 map to physical index 3 (value 'D') + Int32Array runEnds = new Int32Array.Builder() + .AppendRange([3, 7, 10, 15]) + .Build(); + StringArray values = new StringArray.Builder() + .AppendRange(["A", "B", "C", "D"]) + .Build(); + + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(0, reeArray.FindPhysicalIndex(0)); + Assert.Equal(0, reeArray.FindPhysicalIndex(1)); + Assert.Equal(0, reeArray.FindPhysicalIndex(2)); + Assert.Equal(1, reeArray.FindPhysicalIndex(3)); + Assert.Equal(1, reeArray.FindPhysicalIndex(4)); + Assert.Equal(1, reeArray.FindPhysicalIndex(5)); + Assert.Equal(1, reeArray.FindPhysicalIndex(6)); + Assert.Equal(2, reeArray.FindPhysicalIndex(7)); + Assert.Equal(2, reeArray.FindPhysicalIndex(8)); + Assert.Equal(2, reeArray.FindPhysicalIndex(9)); + Assert.Equal(3, reeArray.FindPhysicalIndex(10)); + Assert.Equal(3, reeArray.FindPhysicalIndex(11)); + Assert.Equal(3, reeArray.FindPhysicalIndex(14)); + } - [Fact] - public void TestFindPhysicalIndexOutOfRange() - { - Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7]).Build(); - StringArray values = new StringArray.Builder().AppendRange(["A", "B"]).Build(); - var reeArray = new RunEndEncodedArray(runEnds, values); + [Fact] + public void TestFindPhysicalIndexOutOfRange() + { + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["A", "B"]).Build(); + var reeArray = new RunEndEncodedArray(runEnds, values); - Assert.Throws(() => reeArray.FindPhysicalIndex(-1)); - Assert.Throws(() => reeArray.FindPhysicalIndex(7)); - Assert.Throws(() => reeArray.FindPhysicalIndex(100)); - } + Assert.Throws(() => reeArray.FindPhysicalIndex(-1)); + Assert.Throws(() => reeArray.FindPhysicalIndex(7)); + Assert.Throws(() => reeArray.FindPhysicalIndex(100)); + } - [Fact] - public void TestRunEndEncodedArraySerialization() + [Fact] + public void TestRunEndEncodedArraySerialization() + { + // Create a REE array + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7, 10]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["foo", "bar", "baz"]).Build(); + var reeArray = new RunEndEncodedArray(runEnds, values); + + // Create a record batch with the REE array + var reeField = new Field("ree_column", reeArray.Data.DataType, nullable: false); + var schema = new Schema([reeField], null); + var recordBatch = new RecordBatch(schema, [reeArray], reeArray.Length); + + // Serialize and deserialize + using var stream = new MemoryStream(); + using (var writer = new ArrowStreamWriter(stream, schema, leaveOpen: true)) { - // Create a REE array - Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7, 10]).Build(); - StringArray values = new StringArray.Builder().AppendRange(["foo", "bar", "baz"]).Build(); - var reeArray = new RunEndEncodedArray(runEnds, values); - - // Create a record batch with the REE array - var reeField = new Field("ree_column", reeArray.Data.DataType, nullable: false); - var schema = new Schema([reeField], null); - var recordBatch = new RecordBatch(schema, [reeArray], reeArray.Length); - - // Serialize and deserialize - using var stream = new MemoryStream(); - using (var writer = new ArrowStreamWriter(stream, schema, leaveOpen: true)) - { - writer.WriteRecordBatch(recordBatch); - writer.WriteEnd(); - } - - stream.Position = 0; - - using var reader = new ArrowStreamReader(stream); - RecordBatch readBatch = reader.ReadNextRecordBatch(); - - Assert.NotNull(readBatch); - Assert.Equal(1, readBatch.ColumnCount); - Assert.Equal(10, readBatch.Length); - - var readArray = readBatch.Column(0) as RunEndEncodedArray; - Assert.NotNull(readArray); - Assert.Equal(10, readArray.Length); - Assert.Equal(ArrowTypeId.RunEndEncoded, readArray.Data.DataType.TypeId); - - // Verify run ends - var readRunEnds = readArray.RunEnds as Int32Array; - Assert.NotNull(readRunEnds); - Assert.Equal(3, readRunEnds.Length); - Assert.Equal(3, readRunEnds.GetValue(0)); - Assert.Equal(7, readRunEnds.GetValue(1)); - Assert.Equal(10, readRunEnds.GetValue(2)); - - // Verify values - var readValues = readArray.Values as StringArray; - Assert.NotNull(readValues); - Assert.Equal(3, readValues.Length); - Assert.Equal("foo", readValues.GetString(0)); - Assert.Equal("bar", readValues.GetString(1)); - Assert.Equal("baz", readValues.GetString(2)); + writer.WriteRecordBatch(recordBatch); + writer.WriteEnd(); } - [Fact] - public void TestRunEndEncodedArrayWithDifferentValueTypes() - { - // Test with boolean values - Int32Array runEnds1 = new Int32Array.Builder().AppendRange([5, 10]).Build(); - BooleanArray values1 = new BooleanArray.Builder().AppendRange([true, false]).Build(); - var reeArray1 = new RunEndEncodedArray(runEnds1, values1); - Assert.Equal(10, reeArray1.Length); - - // Test with double values - Int32Array runEnds2 = new Int32Array.Builder().AppendRange([3, 8]).Build(); - DoubleArray values2 = new DoubleArray.Builder().AppendRange([1.5, 2.5]).Build(); - var reeArray2 = new RunEndEncodedArray(runEnds2, values2); - Assert.Equal(8, reeArray2.Length); - - // Test with list values - var listBuilder = new ListArray.Builder(Int32Type.Default); - var int32Builder = (Int32Array.Builder)listBuilder.ValueBuilder; - listBuilder.Append(); - int32Builder.Append(1); - int32Builder.Append(2); - listBuilder.Append(); - int32Builder.Append(3); - int32Builder.Append(4); - ListArray listValues = listBuilder.Build(); - - Int32Array runEnds3 = new Int32Array.Builder().AppendRange([2, 5]).Build(); - var reeArray3 = new RunEndEncodedArray(runEnds3, listValues); - Assert.Equal(5, reeArray3.Length); - } + stream.Position = 0; + + using var reader = new ArrowStreamReader(stream); + RecordBatch readBatch = reader.ReadNextRecordBatch(); + + Assert.NotNull(readBatch); + Assert.Equal(1, readBatch.ColumnCount); + Assert.Equal(10, readBatch.Length); + + var readArray = readBatch.Column(0) as RunEndEncodedArray; + Assert.NotNull(readArray); + Assert.Equal(10, readArray.Length); + Assert.Equal(ArrowTypeId.RunEndEncoded, readArray.Data.DataType.TypeId); + + // Verify run ends + var readRunEnds = readArray.RunEnds as Int32Array; + Assert.NotNull(readRunEnds); + Assert.Equal(3, readRunEnds.Length); + Assert.Equal(3, readRunEnds.GetValue(0)); + Assert.Equal(7, readRunEnds.GetValue(1)); + Assert.Equal(10, readRunEnds.GetValue(2)); + + // Verify values + var readValues = readArray.Values as StringArray; + Assert.NotNull(readValues); + Assert.Equal(3, readValues.Length); + Assert.Equal("foo", readValues.GetString(0)); + Assert.Equal("bar", readValues.GetString(1)); + Assert.Equal("baz", readValues.GetString(2)); + } - [Fact] - public void TestRunEndEncodedArrayFromArrayData() - { - // Create arrays - Int32Array runEnds = new Int32Array.Builder().AppendRange([2, 5]).Build(); - StringArray values = new StringArray.Builder().AppendRange(["X", "Y"]).Build(); - - // Create ArrayData manually - var reeType = new RunEndEncodedType(Int32Type.Default, StringType.Default); - var arrayData = new ArrayData( - reeType, - length: 5, - nullCount: 0, - offset: 0, - buffers: [], - children: [runEnds.Data, values.Data]); - - // Create REE array from ArrayData - var reeArray = new RunEndEncodedArray(arrayData); - - Assert.Equal(5, reeArray.Length); - Assert.Equal(0, reeArray.NullCount); - Assert.IsType(reeArray.RunEnds); - Assert.IsType(reeArray.Values); - } + [Fact] + public void TestRunEndEncodedArrayWithDifferentValueTypes() + { + // Test with boolean values + Int32Array runEnds1 = new Int32Array.Builder().AppendRange([5, 10]).Build(); + BooleanArray values1 = new BooleanArray.Builder().AppendRange([true, false]).Build(); + var reeArray1 = new RunEndEncodedArray(runEnds1, values1); + Assert.Equal(10, reeArray1.Length); + + // Test with double values + Int32Array runEnds2 = new Int32Array.Builder().AppendRange([3, 8]).Build(); + DoubleArray values2 = new DoubleArray.Builder().AppendRange([1.5, 2.5]).Build(); + var reeArray2 = new RunEndEncodedArray(runEnds2, values2); + Assert.Equal(8, reeArray2.Length); + + // Test with list values + var listBuilder = new ListArray.Builder(Int32Type.Default); + var int32Builder = (Int32Array.Builder)listBuilder.ValueBuilder; + listBuilder.Append(); + int32Builder.Append(1); + int32Builder.Append(2); + listBuilder.Append(); + int32Builder.Append(3); + int32Builder.Append(4); + ListArray listValues = listBuilder.Build(); + + Int32Array runEnds3 = new Int32Array.Builder().AppendRange([2, 5]).Build(); + var reeArray3 = new RunEndEncodedArray(runEnds3, listValues); + Assert.Equal(5, reeArray3.Length); + } - [Fact] - public void TestRunEndEncodedArrayFactoryBuild() - { - // Test that ArrowArrayFactory can build REE arrays - Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 6]).Build(); - Int64Array values = new Int64Array.Builder().AppendRange([100, 200]).Build(); - - var reeType = new RunEndEncodedType(Int32Type.Default, Int64Type.Default); - var arrayData = new ArrayData( - reeType, - length: 6, - nullCount: 0, - offset: 0, - buffers: [], - children: [runEnds.Data, values.Data]); - - IArrowArray array = ArrowArrayFactory.BuildArray(arrayData); - - Assert.IsType(array); - var reeArray = (RunEndEncodedArray)array; - Assert.Equal(6, reeArray.Length); - } + [Fact] + public void TestRunEndEncodedArrayFromArrayData() + { + // Create arrays + Int32Array runEnds = new Int32Array.Builder().AppendRange([2, 5]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["X", "Y"]).Build(); + + // Create ArrayData manually + var reeType = new RunEndEncodedType(Int32Type.Default, StringType.Default); + var arrayData = new ArrayData( + reeType, + length: 5, + nullCount: 0, + offset: 0, + buffers: [], + children: [runEnds.Data, values.Data]); + + // Create REE array from ArrayData + var reeArray = new RunEndEncodedArray(arrayData); + + Assert.Equal(5, reeArray.Length); + Assert.Equal(0, reeArray.NullCount); + Assert.IsType(reeArray.RunEnds); + Assert.IsType(reeArray.Values); + } + + [Fact] + public void TestRunEndEncodedArrayFactoryBuild() + { + // Test that ArrowArrayFactory can build REE arrays + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 6]).Build(); + Int64Array values = new Int64Array.Builder().AppendRange([100, 200]).Build(); + + var reeType = new RunEndEncodedType(Int32Type.Default, Int64Type.Default); + var arrayData = new ArrayData( + reeType, + length: 6, + nullCount: 0, + offset: 0, + buffers: [], + children: [runEnds.Data, values.Data]); + + IArrowArray array = ArrowArrayFactory.BuildArray(arrayData); + + Assert.IsType(array); + var reeArray = (RunEndEncodedArray)array; + Assert.Equal(6, reeArray.Length); } }