Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/Apache.Arrow/Arrays/ArrowArrayFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ public static IArrowArray BuildArray(ArrayData data)
return new FixedSizeListArray(data);
case ArrowTypeId.Interval:
return IntervalArray.Create(data);
case ArrowTypeId.RunEndEncoded:
return new RunEndEncodedArray(data);
default:
throw new NotSupportedException($"An ArrowArray cannot be built for type {data.DataType.TypeId}.");
}
Expand Down
222 changes: 222 additions & 0 deletions src/Apache.Arrow/Arrays/RunEndEncodedArray.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

using System;
using Apache.Arrow.Types;

namespace Apache.Arrow;

/// <summary>
/// Represents a run-end encoded array.
/// A run-end encoded array stores consecutive runs of the same value more efficiently.
/// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any type).
/// The run_ends array stores the cumulative end positions of each run.
/// </summary>
public class RunEndEncodedArray : Array
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice if this had a helper method to make it easy to iterate over the expanded values. I guess this would only be possible for Values arrays that are a PrimitveArray. Eg. maybe something like a public IEnumerator<TValue> GetEnumerator<TValue>() method?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking something similar for dictionary-encoded values but hadn't yet gotten to the point of playing with implementation ideas. Ideally, a consumer ought to be able to say e.g "give me an IReadOnlyList<DateTimeOffset> for this column" and be able to get one whether it's straight values or encoded as a dictionary or as REE.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that would be nice. Maybe this is something that should be done later in a consistent way for dictionary and REE arrays rather than as part of this PR then.

{
/// <summary>
/// Gets the run ends array (Int16Array, Int32Array, or Int64Array).
/// This array contains the cumulative end indices for each run.
/// </summary>
public IArrowArray RunEnds { get; }

/// <summary>
/// Gets the values array.
/// This array contains the actual values that are run-length encoded.
/// </summary>
public IArrowArray Values { get; }

/// <summary>
/// Creates a new RunEndEncodedArray from ArrayData.
/// </summary>
/// <param name="data">The array data containing run ends and values as children.</param>
public RunEndEncodedArray(ArrayData data)
: this(data, ArrowArrayFactory.BuildArray(data.Children[0]), ArrowArrayFactory.BuildArray(data.Children[1]))
{
}

/// <summary>
/// Creates a new RunEndEncodedArray with specified run ends and values arrays.
/// </summary>
/// <param name="runEnds">The run ends array (must be Int16Array, Int32Array, or Int64Array).</param>
/// <param name="values">The values array (can be any type).</param>
public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values)
: this(CreateArrayData(runEnds, values), runEnds, values)
{
}

private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, IArrowArray values)
: base(data)
{
data.EnsureBufferCount(0); // REE arrays have no buffers, only children
data.EnsureDataType(ArrowTypeId.RunEndEncoded);

ValidateRunEndsType(runEnds);
RunEnds = runEnds;
Values = values;
}

private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray values)
{
ValidateRunEndsType(runEnds);

// The logical length of a REE array is determined by the last value in run_ends
int logicalLength = GetLogicalLength(runEnds);

var dataType = new RunEndEncodedType(runEnds.Data.DataType, values.Data.DataType);

return new ArrayData(
dataType,
logicalLength,
nullCount: 0, // REE arrays don't have a validity bitmap
offset: 0,
buffers: [],
children: [runEnds.Data, values.Data]);
}

private static void ValidateRunEndsType(IArrowArray runEnds)
{
ArrowTypeId typeId = runEnds.Data.DataType.TypeId;
if (typeId != ArrowTypeId.Int16 &&
typeId != ArrowTypeId.Int32 &&
typeId != ArrowTypeId.Int64)
{
throw new ArgumentException(
$"Run ends array must be Int16, Int32, or Int64, but got {typeId}",
nameof(runEnds));
}
}

private static int GetLogicalLength(IArrowArray runEnds)
{
if (runEnds.Length == 0)
{
return 0;
}

// Get the last run end value which represents the logical length
switch (runEnds)
{
case Int16Array int16Array:
return int16Array.GetValue(int16Array.Length - 1) ?? 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a null run-end value here should throw an exception rather than return 0.

case Int32Array int32Array:
return int32Array.GetValue(int32Array.Length - 1) ?? 0;
case Int64Array int64Array:
{
long? lastValue = int64Array.GetValue(int64Array.Length - 1);
if (lastValue.HasValue && lastValue.Value > int.MaxValue)
{
throw new ArgumentException("Run ends value exceeds maximum supported length.");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any blocker to making this work with int64 indices instead of int, or was this just chosen to align with default .NET array indexing? It seems like we could choose to use int64 indices here and allow larger lengths?

}
return (int)(lastValue ?? 0);
}
default:
throw new InvalidOperationException($"Unexpected run ends array type: {runEnds.GetType()}");
}
}

/// <summary>
/// Finds the physical index in the run_ends array that contains the specified logical index.
/// </summary>
/// <param name="logicalIndex">The logical index in the decoded array.</param>
/// <returns>The physical index in the run_ends/values arrays.</returns>
public int FindPhysicalIndex(int logicalIndex)
{
if (logicalIndex < 0 || logicalIndex >= Length)
{
throw new ArgumentOutOfRangeException(nameof(logicalIndex));
}

// Binary search to find the run that contains this logical index
return RunEnds switch
{
Int16Array int16Array => BinarySearchRunEnds(int16Array, logicalIndex),
Int32Array int32Array => BinarySearchRunEnds(int32Array, logicalIndex),
Int64Array int64Array => BinarySearchRunEnds(int64Array, logicalIndex),
_ => throw new InvalidOperationException($"Unexpected run ends array type: {RunEnds.GetType()}"),
};
}

private static int BinarySearchRunEnds(Int16Array runEnds, int logicalIndex)
{
int left = 0;
int right = runEnds.Length - 1;

while (left < right)
{
int mid = left + (right - left) / 2;
int runEnd = runEnds.GetValue(mid) ?? 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should also throw here if we encounter a null run-end.


if (logicalIndex < runEnd)
{
right = mid;
}
else
{
left = mid + 1;
}
}

return left;
}

private static int BinarySearchRunEnds(Int32Array runEnds, int logicalIndex)
{
int left = 0;
int right = runEnds.Length - 1;

while (left < right)
{
int mid = left + (right - left) / 2;
int runEnd = runEnds.GetValue(mid) ?? 0;

if (logicalIndex < runEnd)
{
right = mid;
}
else
{
left = mid + 1;
}
}

return left;
}

private static int BinarySearchRunEnds(Int64Array runEnds, int logicalIndex)
{
int left = 0;
int right = runEnds.Length - 1;

while (left < right)
{
int mid = left + (right - left) / 2;
long runEnd = runEnds.GetValue(mid) ?? 0;

if (logicalIndex < runEnd)
{
right = mid;
}
else
{
left = mid + 1;
}
}

return left;
}

public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor);
}
3 changes: 3 additions & 0 deletions src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,9 @@ private ArrayData LoadField(
{
case ArrowTypeId.Null:
return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, System.Array.Empty<ArrowBuffer>());
case ArrowTypeId.RunEndEncoded:
buffers = 0;
break;
case ArrowTypeId.Union:
if (version < MetadataVersion.V5)
{
Expand Down
12 changes: 11 additions & 1 deletion src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ private class ArrowRecordBatchFlatBufferBuilder :
IArrowArrayVisitor<Decimal128Array>,
IArrowArrayVisitor<Decimal256Array>,
IArrowArrayVisitor<DictionaryArray>,
IArrowArrayVisitor<NullArray>
IArrowArrayVisitor<NullArray>,
IArrowArrayVisitor<RunEndEncodedArray>
{
public readonly struct FieldNode
{
Expand Down Expand Up @@ -345,6 +346,15 @@ public void Visit(NullArray array)
// There are no buffers for a NullArray
}

public void Visit(RunEndEncodedArray array)
{
// REE arrays have no buffers at the top level, only child arrays
// Visit the run_ends array
VisitArray(array.RunEnds);
// Visit the values array
VisitArray(array.Values);
}

private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer valueOffsetsBuffer, int arrayOffset, int arrayLength)
{
var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * (arrayLength + 1));
Expand Down
11 changes: 10 additions & 1 deletion src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ class TypeVisitor :
IArrowTypeVisitor<DictionaryType>,
IArrowTypeVisitor<FixedSizeBinaryType>,
IArrowTypeVisitor<MapType>,
IArrowTypeVisitor<NullType>
IArrowTypeVisitor<NullType>,
IArrowTypeVisitor<RunEndEncodedType>
{
private FlatBufferBuilder Builder { get; }

Expand Down Expand Up @@ -343,6 +344,14 @@ public void Visit(NullType type)
Flatbuf.Null.EndNull(Builder));
}

public void Visit(RunEndEncodedType type)
{
Flatbuf.RunEndEncoded.StartRunEndEncoded(Builder);
Result = FieldType.Build(
Flatbuf.Type.RunEndEncoded,
Flatbuf.RunEndEncoded.EndRunEndEncoded(Builder));
}

public void Visit(IArrowType type)
{
throw new NotImplementedException($"Cannot visit type {type}");
Expand Down
6 changes: 6 additions & 0 deletions src/Apache.Arrow/Ipc/MessageSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,12 @@ private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field, Field[] c
}
Flatbuf.Map meta = field.Type<Flatbuf.Map>().Value;
return new Types.MapType(childFields[0], meta.KeysSorted);
case Flatbuf.Type.RunEndEncoded:
if (childFields == null || childFields.Length != 2)
{
throw new InvalidDataException($"Run-end encoded type must have exactly two children (run_ends and values).");
}
return new Types.RunEndEncodedType(childFields[0], childFields[1]);
default:
throw new InvalidDataException($"Arrow primitive '{field.TypeType}' is unsupported.");
}
Expand Down
1 change: 1 addition & 0 deletions src/Apache.Arrow/Types/IArrowType.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ public enum ArrowTypeId
LargeString,
Decimal32,
Decimal64,
RunEndEncoded,
}

public interface IArrowType
Expand Down
Loading