From e889d0efc18e4b1463fd14372fef4c201494756b Mon Sep 17 00:00:00 2001 From: bodduv Date: Wed, 6 May 2026 13:00:38 +0200 Subject: [PATCH] Handle empty union list readers after IPC deser --- .../complex/impl/UnionLargeListReader.java | 19 ++++- .../vector/complex/impl/UnionListReader.java | 24 ++++-- .../impl/UnionListReaderBoundsChecker.java | 47 +++++++++++ .../arrow/vector/ipc/TestRoundTrip.java | 79 +++++++++++++++++++ 4 files changed, 160 insertions(+), 9 deletions(-) create mode 100644 vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReaderBoundsChecker.java diff --git a/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionLargeListReader.java b/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionLargeListReader.java index be236c3166..ade455e73e 100644 --- a/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionLargeListReader.java +++ b/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionLargeListReader.java @@ -53,9 +53,24 @@ public boolean isSet() { @Override public void setPosition(int index) { + int valueCount = vector.getValueCount(); + if (UnionListReaderBoundsChecker.isEmptyVectorPosition(index, valueCount)) { + setEmptyPosition(index); + return; + } + + UnionListReaderBoundsChecker.checkIndex(index, valueCount); + UnionListReaderBoundsChecker.checkOffsetBuffer(vector.getOffsetBuffer(), index, OFFSET_WIDTH); + + super.setPosition(index); + currentOffset = vector.getElementStartIndex(index) - 1; + maxOffset = vector.getElementEndIndex(index); + } + + private void setEmptyPosition(int index) { super.setPosition(index); - currentOffset = vector.getOffsetBuffer().getLong((long) index * OFFSET_WIDTH) - 1; - maxOffset = vector.getOffsetBuffer().getLong(((long) index + 1L) * OFFSET_WIDTH); + currentOffset = 0; + maxOffset = 0; } @Override diff --git a/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java b/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java index 014608afee..7036bba77d 100644 --- a/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java +++ b/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java @@ -52,14 +52,24 @@ public boolean isSet() { @Override public void setPosition(int index) { - super.setPosition(index); - if (vector.getOffsetBuffer().capacity() == 0) { - currentOffset = 0; - maxOffset = 0; - } else { - currentOffset = vector.getOffsetBuffer().getInt(index * (long) OFFSET_WIDTH) - 1; - maxOffset = vector.getOffsetBuffer().getInt((index + 1) * (long) OFFSET_WIDTH); + int valueCount = vector.getValueCount(); + if (UnionListReaderBoundsChecker.isEmptyVectorPosition(index, valueCount)) { + setEmptyPosition(index); + return; } + + UnionListReaderBoundsChecker.checkIndex(index, valueCount); + UnionListReaderBoundsChecker.checkOffsetBuffer(vector.getOffsetBuffer(), index, OFFSET_WIDTH); + + super.setPosition(index); + currentOffset = vector.getElementStartIndex(index) - 1; + maxOffset = vector.getElementEndIndex(index); + } + + private void setEmptyPosition(int index) { + super.setPosition(index); + currentOffset = 0; + maxOffset = 0; } @Override diff --git a/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReaderBoundsChecker.java b/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReaderBoundsChecker.java new file mode 100644 index 0000000000..41f9f3b9c6 --- /dev/null +++ b/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReaderBoundsChecker.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.memory.ArrowBuf; + +/** Shared position validation for union list readers backed by offset buffers. */ +final class UnionListReaderBoundsChecker { + + private UnionListReaderBoundsChecker() {} + + static boolean isEmptyVectorPosition(int index, int valueCount) { + return valueCount == 0 && index == 0; + } + + static void checkIndex(int index, int valueCount) { + if (index < 0 || index >= valueCount) { + throw new IndexOutOfBoundsException( + String.format("index: %s, expected range (0, %s)", index, valueCount)); + } + } + + static void checkOffsetBuffer(ArrowBuf offsetBuffer, int index, long offsetWidth) { + long requiredBytes = ((long) index + 2L) * offsetWidth; + long capacity = offsetBuffer.capacity(); + if (capacity < requiredBytes) { + throw new IndexOutOfBoundsException( + String.format( + "Offset buffer has capacity %s but reading index %s requires %s bytes", + capacity, index, requiredBytes)); + } + } +} diff --git a/vector/src/test/java/org/apache/arrow/vector/ipc/TestRoundTrip.java b/vector/src/test/java/org/apache/arrow/vector/ipc/TestRoundTrip.java index 65a3791dd4..3074885acb 100644 --- a/vector/src/test/java/org/apache/arrow/vector/ipc/TestRoundTrip.java +++ b/vector/src/test/java/org/apache/arrow/vector/ipc/TestRoundTrip.java @@ -50,8 +50,11 @@ import org.apache.arrow.vector.TinyIntVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.ipc.message.ArrowBlock; import org.apache.arrow.vector.ipc.message.ArrowBuffer; @@ -326,6 +329,82 @@ public void testMetadata(String name, IpcOption writeOption) throws Exception { } } + @ParameterizedTest(name = "options = {0}") + @MethodSource("getWriteOption") + public void testEmptyUnionListReadersAfterIpc(String name, IpcOption writeOption) + throws Exception { + Field structField = + new Field( + "struct", + FieldType.nullable(ArrowType.Struct.INSTANCE), + Collections2.asImmutableList( + listField("list", ArrowType.List.INSTANCE), + listField("largeList", ArrowType.LargeList.INSTANCE), + mapField("map"))); + Schema schema = new Schema(Collections2.asImmutableList(structField)); + + try (final BufferAllocator originalVectorAllocator = + allocator.newChildAllocator("original vectors", 0, allocator.getLimit()); + final StructVector vector = (StructVector) structField.createVector(originalVectorAllocator)) { + vector.allocateNewSafe(); + vector.setValueCount(0); + + List vectors = Collections2.asImmutableList(vector); + VectorSchemaRoot root = new VectorSchemaRoot(schema, vectors, 0); + roundTrip( + name, + writeOption, + root, + /* dictionaryProvider */ null, + TestRoundTrip::writeSingleBatch, + validateFileBatches(new int[] {0}, this::validateEmptyUnionListReaders), + validateStreamBatches(new int[] {0}, this::validateEmptyUnionListReaders)); + } + } + + private Field listField(String name, ArrowType type) { + return new Field( + name, + FieldType.nullable(type), + Collections2.asImmutableList( + new Field( + BaseRepeatedValueVector.DATA_VECTOR_NAME, + FieldType.nullable(new ArrowType.Int(32, true)), + null))); + } + + private Field mapField(String name) { + Field keyField = + new Field(MapVector.KEY_NAME, FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field valueField = + new Field(MapVector.VALUE_NAME, FieldType.nullable(new ArrowType.Int(32, true)), null); + Field entriesField = + new Field( + MapVector.DATA_VECTOR_NAME, + FieldType.notNullable(ArrowType.Struct.INSTANCE), + Collections2.asImmutableList(keyField, valueField)); + + return new Field( + name, + FieldType.nullable(new ArrowType.Map(false)), + Collections2.asImmutableList(entriesField)); + } + + private void validateEmptyUnionListReaders(int expectedCount, VectorSchemaRoot root) { + assertEquals(0, expectedCount); + + FieldReader structReader = root.getVector("struct").getReader(); + assertEmptyUnionListReader(structReader.reader("list")); + assertEmptyUnionListReader(structReader.reader("largeList")); + assertEmptyUnionListReader(structReader.reader("map")); + } + + private void assertEmptyUnionListReader(FieldReader reader) { + assertEquals(0, reader.size()); + assertFalse(reader.next()); + assertThrows(IndexOutOfBoundsException.class, () -> reader.setPosition(1)); + } + private Map metadata(int i) { Map map = new HashMap<>(); map.put("k_" + i, "v_" + i);