Skip to content

Commit 77df3ec

Browse files
Yicong-HuangYicong Huang
andauthored
GH-343: Fix BaseVariableWidthVector and BaseLargeVariableWidthVector offset buffer serialization (#989)
## What's Changed Fix `BaseVariableWidthVector`/`BaseLargeVariableWidthVector` IPC serialization when `valueCount` is 0. ### Problem When `valueCount == 0`, `setReaderAndWriterIndex()` was setting `offsetBuffer.writerIndex(0)`, which means `readableBytes() == 0`. IPC serializer uses `readableBytes()` to determine buffer size, so 0 bytes were written to the IPC stream. This crashes IPC readers in other libraries because Arrow spec requires offset buffer to have at least one entry `[0]`. This is a follow-up to #967 which fixed the same issue in `ListVector`/`LargeListVector`. ### Fix Modify `setReaderAndWriterIndex()` to always use `(valueCount + 1) * OFFSET_WIDTH` for the offset buffer's `writerIndex`, moved outside the if/else branch. When the offset buffer capacity is insufficient (e.g., empty buffer from constructor or loaded via `loadFieldBuffers()`), it reallocates a properly sized buffer on demand. ### Testing Added tests for empty `VarCharVector` and `LargeVarCharVector` verifying offset buffer has correct `readableBytes()` after `setValueCount(0)`. Closes #343 --------- Co-authored-by: Yicong Huang <yicong.huang+data@databricks.com>
1 parent 4a7fb4e commit 77df3ec

File tree

4 files changed

+79
-13
lines changed

4 files changed

+79
-13
lines changed

adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,15 +43,19 @@ public void testZeroRowResultSet() throws Exception {
4343
.setReuseVectorSchemaRoot(reuseVectorSchemaRoot)
4444
.build();
4545

46-
ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config);
47-
assertTrue(iter.hasNext(), "Iterator on zero row ResultSet should haveNext() before use");
48-
VectorSchemaRoot root = iter.next();
49-
assertNotNull(root, "VectorSchemaRoot from first next() result should never be null");
50-
assertEquals(
51-
0, root.getRowCount(), "VectorSchemaRoot from empty ResultSet should have zero rows");
52-
assertFalse(
53-
iter.hasNext(),
54-
"hasNext() should return false on empty ResultSets after initial next() call");
46+
try (ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config)) {
47+
assertTrue(iter.hasNext(), "Iterator on zero row ResultSet should haveNext() before use");
48+
VectorSchemaRoot root = iter.next();
49+
assertNotNull(root, "VectorSchemaRoot from first next() result should never be null");
50+
assertEquals(
51+
0, root.getRowCount(), "VectorSchemaRoot from empty ResultSet should have zero rows");
52+
assertFalse(
53+
iter.hasNext(),
54+
"hasNext() should return false on empty ResultSets after initial next() call");
55+
if (!reuseVectorSchemaRoot) {
56+
root.close();
57+
}
58+
}
5559
}
5660
}
5761
}

vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,14 +373,26 @@ private void setReaderAndWriterIndex() {
373373
valueBuffer.readerIndex(0);
374374
if (valueCount == 0) {
375375
validityBuffer.writerIndex(0);
376-
offsetBuffer.writerIndex(0);
377376
valueBuffer.writerIndex(0);
378377
} else {
379378
final long lastDataOffset = getStartOffset(valueCount);
380379
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
381-
offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
382380
valueBuffer.writerIndex(lastDataOffset);
383381
}
382+
// IPC serializer will determine readable bytes based on `readerIndex` and `writerIndex`.
383+
// Both are set to 0 means 0 bytes are written to the IPC stream which will crash IPC readers
384+
// in other libraries. According to Arrow spec, we should still output the offset buffer which
385+
// is [0].
386+
final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH;
387+
if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
388+
ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize);
389+
if (offsetBuffer.capacity() > 0) {
390+
newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
391+
}
392+
offsetBuffer.getReferenceManager().release();
393+
offsetBuffer = newOffsetBuffer;
394+
}
395+
offsetBuffer.writerIndex(requiredOffsetBufferSize);
384396
}
385397

386398
/** Same as {@link #allocateNewSafe()}. */

vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -389,14 +389,26 @@ private void setReaderAndWriterIndex() {
389389
valueBuffer.readerIndex(0);
390390
if (valueCount == 0) {
391391
validityBuffer.writerIndex(0);
392-
offsetBuffer.writerIndex(0);
393392
valueBuffer.writerIndex(0);
394393
} else {
395394
final int lastDataOffset = getStartOffset(valueCount);
396395
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
397-
offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
398396
valueBuffer.writerIndex(lastDataOffset);
399397
}
398+
// IPC serializer will determine readable bytes based on `readerIndex` and `writerIndex`.
399+
// Both are set to 0 means 0 bytes are written to the IPC stream which will crash IPC readers
400+
// in other libraries. According to Arrow spec, we should still output the offset buffer which
401+
// is [0].
402+
final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH;
403+
if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
404+
ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize);
405+
if (offsetBuffer.capacity() > 0) {
406+
newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
407+
}
408+
offsetBuffer.getReferenceManager().release();
409+
offsetBuffer = newOffsetBuffer;
410+
}
411+
offsetBuffer.writerIndex(requiredOffsetBufferSize);
400412
}
401413

402414
/** Same as {@link #allocateNewSafe()}. */

vector/src/test/java/org/apache/arrow/vector/TestValueVector.java

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3940,4 +3940,42 @@ public void testVectorLoadUnloadOnNonVariadicVectors() {
39403940
}
39413941
}
39423942
}
3943+
3944+
@Test
3945+
public void testEmptyVarCharOffsetBuffer() {
3946+
// Validates that offset buffer has at least OFFSET_WIDTH bytes (for offset[0]=0)
3947+
// even when valueCount is 0, per Arrow specification.
3948+
try (VarCharVector vector = newVarCharVector("varchar", allocator)) {
3949+
vector.allocateNew();
3950+
vector.setValueCount(0);
3951+
3952+
List<ArrowBuf> buffers = vector.getFieldBuffers();
3953+
// buffers: [validity, offset, data]
3954+
assertTrue(
3955+
buffers.get(1).readableBytes() >= BaseVariableWidthVector.OFFSET_WIDTH,
3956+
"Offset buffer should have at least "
3957+
+ BaseVariableWidthVector.OFFSET_WIDTH
3958+
+ " bytes for offset[0]");
3959+
assertEquals(0, vector.getOffsetBuffer().getInt(0));
3960+
}
3961+
}
3962+
3963+
@Test
3964+
public void testEmptyLargeVarCharOffsetBuffer() {
3965+
// Validates that offset buffer has at least OFFSET_WIDTH bytes (for offset[0]=0)
3966+
// even when valueCount is 0, per Arrow specification.
3967+
try (LargeVarCharVector vector = new LargeVarCharVector("largevarchar", allocator)) {
3968+
vector.allocateNew();
3969+
vector.setValueCount(0);
3970+
3971+
List<ArrowBuf> buffers = vector.getFieldBuffers();
3972+
// buffers: [validity, offset, data]
3973+
assertTrue(
3974+
buffers.get(1).readableBytes() >= BaseLargeVariableWidthVector.OFFSET_WIDTH,
3975+
"Offset buffer should have at least "
3976+
+ BaseLargeVariableWidthVector.OFFSET_WIDTH
3977+
+ " bytes for offset[0]");
3978+
assertEquals(0, vector.getOffsetBuffer().getLong(0));
3979+
}
3980+
}
39433981
}

0 commit comments

Comments
 (0)