Skip to content

Commit 4f818f9

Browse files
committed
Allow reading dictionary encoded boolean
I've observed some Parquet files in the wild that contain dictionary-encoded boolean values, which is also wild. I don't think we want to allow producing this, but I think it would be good to allow reading it. We don't judge.
1 parent 43c5976 commit 4f818f9

File tree

3 files changed

+115
-1
lines changed

3 files changed

+115
-1
lines changed

parquet-column/src/main/java/org/apache/parquet/column/Encoding.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesReader;
4040
import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader;
4141
import org.apache.parquet.column.values.dictionary.DictionaryValuesReader;
42+
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary;
4243
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBinaryDictionary;
4344
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainDoubleDictionary;
4445
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainFloatDictionary;
@@ -102,6 +103,8 @@ public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dic
102103
return new PlainIntegerDictionary(dictionaryPage);
103104
case FLOAT:
104105
return new PlainFloatDictionary(dictionaryPage);
106+
case BOOLEAN:
107+
return new PlainBooleanDictionary(dictionaryPage);
105108
default:
106109
throw new ParquetDecodingException(
107110
"Dictionary encoding not supported for type: " + descriptor.getType());

parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.apache.parquet.bytes.ByteBufferInputStream;
2929
import org.apache.parquet.column.Dictionary;
3030
import org.apache.parquet.column.page.DictionaryPage;
31+
import org.apache.parquet.column.values.plain.BooleanPlainValuesReader;
3132
import org.apache.parquet.column.values.plain.PlainValuesReader.DoublePlainValuesReader;
3233
import org.apache.parquet.column.values.plain.PlainValuesReader.FloatPlainValuesReader;
3334
import org.apache.parquet.column.values.plain.PlainValuesReader.IntegerPlainValuesReader;
@@ -300,4 +301,47 @@ public int getMaxId() {
300301
return floatDictionaryContent.length - 1;
301302
}
302303
}
304+
305+
306+
/**
307+
* a simple implementation of dictionary for plain encoded boolean values
308+
*/
309+
public static class PlainBooleanDictionary extends PlainValuesDictionary {
310+
311+
private final boolean[] boolDictionaryContent;
312+
313+
/**
314+
* @param dictionaryPage a dictionary page of encoded boolean values
315+
* @throws IOException if there is an exception while decoding the dictionary page
316+
*/
317+
public PlainBooleanDictionary(DictionaryPage dictionaryPage) throws IOException {
318+
super(dictionaryPage);
319+
ByteBufferInputStream in = dictionaryPage.getBytes().toInputStream();
320+
boolDictionaryContent = new boolean[dictionaryPage.getDictionarySize()];
321+
BooleanPlainValuesReader boolReader = new BooleanPlainValuesReader();
322+
boolReader.initFromPage(dictionaryPage.getDictionarySize(), in);
323+
for (int i = 0; i < boolDictionaryContent.length; i++) {
324+
boolDictionaryContent[i] = boolReader.readBoolean();
325+
}
326+
}
327+
328+
@Override
329+
public boolean decodeToBoolean(int id) {
330+
return boolDictionaryContent[id];
331+
}
332+
333+
@Override
334+
public String toString() {
335+
StringBuilder sb = new StringBuilder("PlainIntegerDictionary {\n");
336+
for (int i = 0; i < boolDictionaryContent.length; i++) {
337+
sb.append(i).append(" => ").append(boolDictionaryContent[i]).append("\n");
338+
}
339+
return sb.append("}").toString();
340+
}
341+
342+
@Override
343+
public int getMaxId() {
344+
return boolDictionaryContent.length - 1;
345+
}
346+
}
303347
}

parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
2525
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
2626
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
27-
import static org.junit.Assert.assertEquals;
27+
import static org.junit.Assert.*;
2828

2929
import java.io.IOException;
3030
import java.nio.ByteBuffer;
@@ -44,6 +44,7 @@
4444
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter;
4545
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter;
4646
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter;
47+
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary;
4748
import org.apache.parquet.column.values.fallback.FallbackValuesWriter;
4849
import org.apache.parquet.column.values.plain.BinaryPlainValuesReader;
4950
import org.apache.parquet.column.values.plain.PlainValuesReader;
@@ -678,6 +679,72 @@ public void testZeroValues() throws IOException {
678679
}
679680
}
680681

682+
@Test
683+
public void testBooleanDictionary() throws IOException {
684+
// Create a dictionary page with boolean values (false, true)
685+
// Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 0x02
686+
BytesInput bytes = BytesInput.from(new byte[] {0x02});
687+
DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN);
688+
689+
PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage);
690+
691+
// Verify dictionary decoding
692+
assertFalse(dictionary.decodeToBoolean(0));
693+
assertTrue(dictionary.decodeToBoolean(1));
694+
assertEquals(1, dictionary.getMaxId());
695+
}
696+
697+
@Test
698+
public void testBooleanDictionarySingleValue() throws IOException {
699+
// Test dictionary with only true value
700+
// Bit-packed: bit 0 = true (1) => byte = 0b00000001 = 0x01
701+
BytesInput bytesTrue = BytesInput.from(new byte[] {0x01});
702+
DictionaryPage dictionaryPageTrue = new DictionaryPage(bytesTrue, 1, PLAIN);
703+
704+
PlainBooleanDictionary dictionaryTrue = new PlainBooleanDictionary(dictionaryPageTrue);
705+
706+
assertTrue(dictionaryTrue.decodeToBoolean(0));
707+
assertEquals(0, dictionaryTrue.getMaxId());
708+
709+
// Test dictionary with only false value
710+
// Bit-packed: bit 0 = false (0) => byte = 0b00000000 = 0x00
711+
BytesInput bytesFalse = BytesInput.from(new byte[] {0x00});
712+
DictionaryPage dictionaryPageFalse = new DictionaryPage(bytesFalse, 1, PLAIN);
713+
714+
PlainBooleanDictionary dictionaryFalse = new PlainBooleanDictionary(dictionaryPageFalse);
715+
716+
assertFalse(dictionaryFalse.decodeToBoolean(0));
717+
assertEquals(0, dictionaryFalse.getMaxId());
718+
}
719+
720+
@Test
721+
public void testBooleanDictionaryToString() throws IOException {
722+
// Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 0x02
723+
BytesInput bytes = BytesInput.from(new byte[] {0x02});
724+
DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN);
725+
726+
PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage);
727+
728+
String str = dictionary.toString();
729+
Assert.assertTrue(str.contains("PlainIntegerDictionary"));
730+
Assert.assertTrue(str.contains("0 => false"));
731+
Assert.assertTrue(str.contains("1 => true"));
732+
}
733+
734+
@Test
735+
public void testBooleanDictionaryWithDictionaryEncoding() throws IOException {
736+
// Test with PLAIN_DICTIONARY encoding (both PLAIN and PLAIN_DICTIONARY should work)
737+
// Bit-packed: bit 0 = true (1), bit 1 = false (0) => byte = 0b00000001 = 0x01
738+
BytesInput bytes = BytesInput.from(new byte[] {0x01});
739+
DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN_DICTIONARY);
740+
741+
PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage);
742+
743+
assertEquals(true, dictionary.decodeToBoolean(0));
744+
assertEquals(false, dictionary.decodeToBoolean(1));
745+
assertEquals(1, dictionary.getMaxId());
746+
}
747+
681748
private DictionaryValuesReader initDicReader(ValuesWriter cw, PrimitiveTypeName type) throws IOException {
682749
final DictionaryPage dictionaryPage = cw.toDictPageAndClose().copy();
683750
final ColumnDescriptor descriptor = new ColumnDescriptor(new String[] {"foo"}, type, 0, 0);

0 commit comments

Comments
 (0)