Skip to content

Commit eefd756

Browse files
committed
update
1 parent f50dd6c commit eefd756

File tree

3 files changed

+100
-0
lines changed

3 files changed

+100
-0
lines changed

parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV1.java

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,72 @@ public DataPageV1(
8888
this.indexRowCount = rowCount;
8989
}
9090

91+
/**
 * Creates a page whose compressed size is supplied explicitly by the caller.
 * Intended for callers that already know both the compressed and the uncompressed size.
 * This overload takes no row information; {@code indexRowCount} is set to {@code -1}.
 *
 * @param bytes the bytes for this page (compressed or decompressed)
 * @param valueCount count of values in this page
 * @param compressedSize the actual compressed size of the page
 * @param uncompressedSize the uncompressed size of the page
 * @param statistics of the page's values (max, min, num_null)
 * @param rlEncoding the repetition level encoding for this page
 * @param dlEncoding the definition level encoding for this page
 * @param valuesEncoding the values encoding for this page
 */
public DataPageV1(
    BytesInput bytes,
    int valueCount,
    int compressedSize,
    int uncompressedSize,
    Statistics<?> statistics,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) {
  // Both sizes are forwarded as given; nothing is derived from bytes here.
  super(compressedSize, uncompressedSize, valueCount);

  // No row count is supplied to this overload.
  this.indexRowCount = -1;

  this.bytes = bytes;
  this.statistics = statistics;

  this.valuesEncoding = valuesEncoding;
  this.dlEncoding = dlEncoding;
  this.rlEncoding = rlEncoding;
}
121+
122+
/**
 * Creates a page with a caller-supplied compressed size together with row information.
 * Intended for callers that already know the compressed size, the uncompressed size,
 * the index of the page's first row and the number of rows in the page.
 *
 * @param bytes the bytes for this page
 * @param valueCount count of values in this page
 * @param compressedSize the compressed size of the page, forwarded to the superclass as given
 * @param uncompressedSize the uncompressed size of the page
 * @param firstRowIndex the index of the first row in this page, forwarded to the superclass
 * @param rowCount the row count for this page, stored as {@code indexRowCount}
 * @param statistics of the page's values
 * @param rlEncoding the repetition level encoding for this page
 * @param dlEncoding the definition level encoding for this page
 * @param valuesEncoding the values encoding for this page
 */
public DataPageV1(
    BytesInput bytes,
    int valueCount,
    int compressedSize,
    int uncompressedSize,
    long firstRowIndex,
    int rowCount,
    Statistics<?> statistics,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) {
  // Sizes and first-row index are forwarded as given; nothing is derived from bytes here.
  super(compressedSize, uncompressedSize, valueCount, firstRowIndex);

  this.indexRowCount = rowCount;

  this.bytes = bytes;
  this.statistics = statistics;

  this.valuesEncoding = valuesEncoding;
  this.dlEncoding = dlEncoding;
  this.rlEncoding = rlEncoding;
}
156+
91157
/**
92158
* @return the bytes for the page
93159
*/

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageReadStore.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ public DataPage visit(DataPageV1 dataPageV1) {
184184
decompressedPage = new DataPageV1(
185185
decompressed,
186186
dataPageV1.getValueCount(),
187+
dataPageV1.getCompressedSize(),
187188
dataPageV1.getUncompressedSize(),
188189
dataPageV1.getStatistics(),
189190
dataPageV1.getRlEncoding(),
@@ -194,6 +195,7 @@ public DataPage visit(DataPageV1 dataPageV1) {
194195
decompressedPage = new DataPageV1(
195196
decompressed,
196197
dataPageV1.getValueCount(),
198+
dataPageV1.getCompressedSize(),
197199
dataPageV1.getUncompressedSize(),
198200
firstRowIndex,
199201
Math.toIntExact(offsetIndex.getLastRowIndex(currentPageIndex, rowCount)

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDataPageChecksums.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import org.apache.parquet.column.page.DictionaryPage;
5151
import org.apache.parquet.column.page.Page;
5252
import org.apache.parquet.column.page.PageReadStore;
53+
import org.apache.parquet.column.page.PageReader;
5354
import org.apache.parquet.column.page.PageWriter;
5455
import org.apache.parquet.column.statistics.Statistics;
5556
import org.apache.parquet.compression.CompressionCodecFactory.BytesInputCompressor;
@@ -771,4 +772,35 @@ private void assertVerificationFailed(ParquetFileReader reader) {
771772
e.getMessage().contains("CRC checksum verification failed"));
772773
}
773774
}
775+
776+
@Test
777+
public void testCompressedSizePreservedAfterDecompression() throws IOException {
778+
Configuration conf = new Configuration();
779+
780+
CompressionCodecName compression = CompressionCodecName.GZIP;
781+
ParquetProperties.WriterVersion version = ParquetProperties.WriterVersion.PARQUET_1_0;
782+
783+
Path testFile = writeSimpleParquetFile(conf, compression, version);
784+
785+
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(testFile, conf))) {
786+
PageReadStore pageStore = reader.readNextRowGroup();
787+
788+
ColumnDescriptor colDesc =
789+
reader.getFileMetaData().getSchema().getColumns().get(0);
790+
PageReader pageReader = pageStore.getPageReader(colDesc);
791+
792+
DataPage page;
793+
while ((page = pageReader.readPage()) != null) {
794+
int compressedSize = page.getCompressedSize();
795+
int uncompressedSize = page.getUncompressedSize();
796+
797+
assertTrue(
798+
"Compressed size (" + compressedSize + ") should be less than uncompressed size ("
799+
+ uncompressedSize + ") for compressed pages",
800+
compressedSize < uncompressedSize);
801+
802+
assertTrue("Compressed size should be positive", compressedSize > 0);
803+
}
804+
}
805+
}
774806
}

0 commit comments

Comments
 (0)