From 923d193fa5ed220b13db441af55bc2a3c917a4f3 Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Mon, 8 Dec 2025 14:10:27 +0100 Subject: [PATCH 1/2] Add TestEncryptFile and TestDecryptFile tests --- parquet/file/file_reader_test.go | 122 ++++++++++++++++++++++++ tools/arrowgo_encrypted_uniform.parquet | Bin 0 -> 740 bytes tools/pyarrow_encrypted_uniform.parquet | Bin 0 -> 1409 bytes tools/read_encrypted_parquet.py | 43 +++++++++ tools/write_encrypted_parquet.py | 38 ++++++++ 5 files changed, 203 insertions(+) create mode 100644 tools/arrowgo_encrypted_uniform.parquet create mode 100644 tools/pyarrow_encrypted_uniform.parquet create mode 100644 tools/read_encrypted_parquet.py create mode 100755 tools/write_encrypted_parquet.py diff --git a/parquet/file/file_reader_test.go b/parquet/file/file_reader_test.go index 1927ca87..f82faab6 100644 --- a/parquet/file/file_reader_test.go +++ b/parquet/file/file_reader_test.go @@ -26,6 +26,7 @@ import ( "io" "os" "path" + "path/filepath" "testing" "github.com/apache/arrow-go/v18/arrow" @@ -36,6 +37,7 @@ import ( "github.com/apache/arrow-go/v18/parquet/compress" "github.com/apache/arrow-go/v18/parquet/file" "github.com/apache/arrow-go/v18/parquet/internal/encoding" + "github.com/apache/arrow-go/v18/parquet/internal/encryption" format "github.com/apache/arrow-go/v18/parquet/internal/gen-go/parquet" "github.com/apache/arrow-go/v18/parquet/internal/thrift" "github.com/apache/arrow-go/v18/parquet/metadata" @@ -940,3 +942,123 @@ func TestListColumns(t *testing.T) { } } } + +func TestEncryptFile(t *testing.T) { + dir := "../../tools" + require.DirExists(t, dir) + + footerKeyID := "footer_key" + footerKey := "0123456789012345" + + enc := parquet.NewFileEncryptionProperties(footerKey, parquet.WithFooterKeyID(footerKeyID)) + props := parquet.NewWriterProperties( + parquet.WithCompression(compress.Codecs.Uncompressed), + parquet.WithEncryptionProperties(enc), + ) + filename := filepath.Join(dir, "arrowgo_encrypted_uniform.parquet") + outFile, err := os.Create(filename) + require.NoError(t, err) + require.NotNil(t, outFile) + defer outFile.Close() + + fields := make(schema.FieldList, 0) + // {"id": [1, 2, 3], "name": ["alice", "bob", "charlie"]} + fields = append(fields, + schema.NewInt32Node("id", parquet.Repetitions.Required, -1), + schema.NewByteArrayNode("name", parquet.Repetitions.Required, -1), + ) + schema, err := schema.NewGroupNode("schema", parquet.Repetitions.Required, fields, -1) + require.NoError(t, err) + + writer := file.NewParquetWriter(outFile, schema, file.WithWriterProps(props)) + defer writer.Close() + + rgr := writer.AppendBufferedRowGroup() + defer rgr.Close() + + ccwID, err := rgr.Column(0) + require.NoError(t, err) + ccwName, err := rgr.Column(1) + require.NoError(t, err) + + ids := []int32{1, 2, 3} + names := []parquet.ByteArray{ + []byte("alice"), + []byte("bob"), + []byte("charlie"), + } + + _, err = ccwID.(*file.Int32ColumnChunkWriter).WriteBatch(ids, nil, nil) + require.NoError(t, err) + _, err = ccwName.(*file.ByteArrayColumnChunkWriter).WriteBatch(names, nil, nil) + require.NoError(t, err) +} + +func TestDecryptFile(t *testing.T) { + dir := "../../tools" + require.DirExists(t, dir) + + footerKeyID := "footer_key" + footerKey := "0123456789012345" + + stringKr := make(encryption.StringKeyIDRetriever) + stringKr.PutKey(footerKeyID, footerKey) + decryptProps := parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr)) + // decryptProps := parquet.NewFileDecryptionProperties(parquet.WithFooterKey(footerKey)) + + props := parquet.NewReaderProperties(memory.DefaultAllocator) + props.FileDecryptProps = decryptProps.Clone("") + + fileReader, err := file.OpenParquetFile(path.Join(dir, "pyarrow_encrypted_uniform.parquet"), false, file.WithReadProps(props)) + // fileReader, err := file.OpenParquetFile(path.Join(dir, "arrowgo_encrypted_uniform.parquet"), false, file.WithReadProps(props)) + require.NoError(t, err) + defer fileReader.Close() + + // get metadata + fileMetadata := fileReader.MetaData() + // get number of rowgroups + numRowGroups := len(fileMetadata.RowGroups) + // number of columns + numColumns := fileMetadata.Schema.NumColumns() + assert.Equal(t, 2, numColumns) + + for r := range numRowGroups { + rowGroupReader := fileReader.RowGroup(r) + numRows := rowGroupReader.NumRows() + colReader, err := rowGroupReader.Column(0) + require.NoError(t, err) + int32ColReader, ok := colReader.(*file.Int32ColumnChunkReader) + require.True(t, ok) + + values := make([]int32, numRows) + defLvls := make([]int16, numRows) + repLvls := make([]int16, numRows) + + total, read, err := int32ColReader.ReadBatch(numRows, values, defLvls, repLvls) + require.NoError(t, err) + require.Equal(t, numRows, int64(total)) + require.Equal(t, numRows, int64(read)) + expected := []int32{1, 2, 3} + require.Equal(t, expected, values) + + colReader, err = rowGroupReader.Column(1) + require.NoError(t, err) + byteArrayColReader, ok := colReader.(*file.ByteArrayColumnChunkReader) + require.True(t, ok) + + byteArrayValues := make([]parquet.ByteArray, numRows) + defLvls = make([]int16, numRows) + repLvls = make([]int16, numRows) + + total, read, err = byteArrayColReader.ReadBatch(numRows, byteArrayValues, defLvls, repLvls) + require.NoError(t, err) + require.Equal(t, numRows, int64(total)) + require.Equal(t, numRows, int64(read)) + expectedByteArray := []parquet.ByteArray{ + []byte("alice"), + []byte("bob"), + []byte("charlie"), + } + require.Equal(t, expectedByteArray, byteArrayValues) + } +} diff --git a/tools/arrowgo_encrypted_uniform.parquet b/tools/arrowgo_encrypted_uniform.parquet new file mode 100644 index 0000000000000000000000000000000000000000..30d178dc37e421c23a1e41ddc34f70999052f151 GIT binary patch literal 740 zcmVH;LZ%Ib;`y;ZSKgKuMo0>g;Yu~g-e(vLl!|8zoP7#i=>&&kV^mn z0A&OD6qr^FQ^9qWepUW*FEQt(c^xMt`(hm*V7lv!3Wj@rWHnk;&%+HPvSuCWM7@^X zPw?pnyTsLj^>N?OiIUMJ)YO?mh53esAOHXW4S0%ag@~uj*DD^yq-{eHDksgMnz$xf z;4BYm3!XzmDgXcgu!NnS?bPth{Kltb>WfC90)Hd$k^GDl^uwK z`%E_g000TG5fEzptj0}@_p=?nGwe`O?61CoF6{{Y_P{Lj@7Ir1(oL;Y;S7!qEeXpAOHXWUKF+Ct=(`tDKfk%yEjw@pm!?+ zWhsTmh=m$taeoUM92_VJ|B#(Y+=^h{5&!@g3TAI_bY*g1Yh`%=7XbhOazQ#xm(xB! zrDG(Ccv(<(wO-U;mY?h<)v01K^4m-`q`5Z33-%_7KR)(fq1fd(ijN~ITCakV7FmVP z3#xSi#QmTH(nDB@y{e+>-9@6ZV&p$w zCRSSq1c!T~H$YOOfwcq!Ss)?0AiG>bi?vP#IX06&;Srq~&z&7mIgn)Ndi?8x8j0b* zvcnRkc-znj>GjB`Ho-FUwx1kj6B@@=vx`2^0x)H39Wg~yhrb2Ze~4qbW#I(7$TCub zNSZPiNLQ0RR9{K~hDF?@$Z? literal 0 HcmV?d00001 diff --git a/tools/pyarrow_encrypted_uniform.parquet b/tools/pyarrow_encrypted_uniform.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ca2f4c1b6bb8547c7a34aa5071e48ec58515f6a3 GIT binary patch literal 1409 zcmY+EdpOez9L9g-ljm&pH3T@Av&Y@B7b}MxcZG006Wpw7nhuMaNKG zcbh3oTtJxBdEA%hQq5v>UpwK4IykX6b1VwY#7T?g7Vxz%d9IkV&tm3o?o0tjenA=V zcc7*SLx>h>QSw@uex;DW+8-<@F85YD60~h!GWRoaR6jVUthMQWv7JZNAHt1dS>rC{ z@`ee`Y6FzP7DibG_&DK4mE4ij`F$!NP<5o=mG#to9Aks3olL;023X8KQBd2Q6y5#m zY-ygt(wBnC(Y}m{_J>6ME|VhsSlAsLj32?a+Sf=ZR~G%Zzo{xr={%U*q@SYbLmvRZ zF0^2|R7)2mEn(N@&bOn_I>#3IG+xIyYqX#CK9PHD(W+kIgMZ9tW#J<2@|89}B+|Mo zJ`-sTU(2F59hCX$%z(?n{I-=L>JLSGhsYDRW$jAWZmH^NAHvmmeXH4hm3WM-7oC++ z5A})e#tFte9pGq<;nW>FX}OJV7TU8=y`eLYf%>c{8LbaJ@S&1Z3EEk|QPF5@E`IlE zzIH@K7^00}=*Dm8M4pcySn0dpc(i4Hr~lfr5ByL0`i5Dqi@`50eBTR4vi3Cqq z2F&=6<4Je@pB2H3{EM<430`Lgz-{2vAt6&BfY^?p3!D%+(?CmFIMos#<^-BX@(=J$ zm_iEUKos5q&xa02MVN~{cvLnR1hPpyOHu?ONNgh%34(q3hiqXMn;*prhrHH^p*+sN zjUg$>3ld5NqEJ2$Ub5c7S`gtJOtcsC!q{eBtY9-!yq6or-FA)gcO#eeMy8Orxtkfu z*_pv5@S;fGEEdU~PJr)X03real2vdyll+=P9&quZt`B$XJT&5pCzB=W&T-9K&gKYS6H7|rV zY*j*zK@TJ}okJ@EssY*JI;dvk*_%Q8Gu04-PoaJ{*ebsbr|jp+x0;b_RspqJJ@FU% z%9{dRn3<9^lhmQvSfu>2zWdHhPvZsb98>9qk%ocBu9?yZ0@l~!LEhl0cU(O?rKjpn z$!!wXG(0bQcsd4)Fw8hVjWq6YQ@umQyVdPcyjF5@X5UYg$&D?^mw;D2>T0UlK_Asf z=DLck30K?wm9smyj_v4A=fu&L9vbz%3=H&1w+(33^EzLa_2Z-Q`DBkP=pCgvBy-Mt zgOplvylT1(Vy7o}SjSu+$8}8wG z#^&iV@~YY#%Q*Vzo^K1qvU|itb;S3wM+vggz!}`|i`bl&9-59^ekm<=Nc*6dmLz5B zQrLEbvPaYJjmEf2()R{&B`+{bb9WWB?8<+;Si5@0NEq^|`i9ztmQlf1gM#>c6(`>g zNk+qycZzmUp+T91^xiTda@90M`uc<+gCj^8(ah^TJp5etau~a?v9qMVMyp;k_JL|Z zrdbh9O-#!>iE`t33Z^vqsfTgT qC6X4^F59=a)_gr_mqeS&Kp2m0p1OxlO}ltwAu+8)9sp?Y^ZgUDYHlY0 literal 0 HcmV?d00001 diff --git a/tools/read_encrypted_parquet.py b/tools/read_encrypted_parquet.py new file mode 100644 index 00000000..c103cf51 --- /dev/null +++ b/tools/read_encrypted_parquet.py @@ -0,0 +1,43 @@ +import base64 +import pyarrow as pa +import pyarrow.parquet as pq +import pyarrow.parquet.encryption as pe + +KEY_ID = "footer_key" + +class MockKmsClient(pe.KmsClient): + def __init__(self, kms_connection_configuration): + super().__init__() + + def wrap_key(self, key_bytes, master_key_identifier): + return base64.b64encode(key_bytes) + + def unwrap_key(self, wrapped_key, master_key_identifier): + return base64.b64decode(wrapped_key) + +crypto_factory = pe.CryptoFactory(lambda config: MockKmsClient(config)) +kms_connection_config = pe.KmsConnectionConfig() +decryption_config = pe.DecryptionConfiguration() +decryption_properties = crypto_factory.file_decryption_properties(kms_connection_config, decryption_config) + +# Read back stats +# input_file = "pyarrow_encrypted_uniform.parquet" +input_file = "arrowgo_encrypted_uniform.parquet" +print(f"\nReading {input_file}") +with pq.ParquetFile( + input_file, + decryption_properties=decryption_properties) as f: + # meta data + rg = f.metadata.row_group(0) + # per-column stats + for col_idx in range(f.metadata.num_columns): + col = rg.column(col_idx) + statistics = "None" if col.statistics is None else str(col.statistics) + print(f"Column '{col.path_in_schema}' statistics:\n {statistics}") + + # read the table + table = f.read() + print("\nDecrypted table:") + print(table) + +print(f"\nSuccessfully read and decrypted {input_file}") \ No newline at end of file diff --git a/tools/write_encrypted_parquet.py b/tools/write_encrypted_parquet.py new file mode 100755 index 00000000..e3be95e3 --- /dev/null +++ b/tools/write_encrypted_parquet.py @@ -0,0 +1,38 @@ +import base64 +import pyarrow as pa +import pyarrow.parquet as pq +import pyarrow.parquet.encryption as pe + +KEY_ID = "footer_key" + +class MockKmsClient(pe.KmsClient): + def __init__(self, kms_connection_configuration): + super().__init__() + + def wrap_key(self, key_bytes, master_key_identifier): + return base64.b64encode(key_bytes) + + def unwrap_key(self, wrapped_key, master_key_identifier): + return base64.b64decode(wrapped_key) + +# ── Write the file ─────────────────────────────────────────────────────── +table = pa.table({"id": [1, 2, 3], "name": ["alice", "bob", "charlie"]}) + +crypto_factory = pe.CryptoFactory(lambda config: MockKmsClient(config)) +kms_connection_config = pe.KmsConnectionConfig() +encryption_config = pe.EncryptionConfiguration( + KEY_ID, + uniform_encryption=True, + plaintext_footer=False) + +enc_props = crypto_factory.file_encryption_properties( + kms_connection_config, + encryption_config +) + +output_file = "pyarrow_encrypted_uniform.parquet" +pq.write_table(table, output_file, compression="none", encryption_properties=enc_props) + +print(f"Successfully written {output_file}") +print(f" Key ID: {KEY_ID}") +print(" → This file MUST be readable by any correct arrow-go implementation") \ No newline at end of file From 6774d13f299e3a7ba338e1a25243a3ae87b6b982 Mon Sep 17 00:00:00 2001 From: Daniel Adam Date: Mon, 8 Dec 2025 15:21:45 +0100 Subject: [PATCH 2/2] fixup! Add TestEncryptFile and TestDecryptFile tests --- parquet/file/file_reader_test.go | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/parquet/file/file_reader_test.go b/parquet/file/file_reader_test.go index f82faab6..e5c96bb7 100644 --- a/parquet/file/file_reader_test.go +++ b/parquet/file/file_reader_test.go @@ -22,6 +22,7 @@ import ( "crypto/rand" "encoding/binary" "encoding/csv" + "encoding/json" "fmt" "io" "os" @@ -37,7 +38,6 @@ import ( "github.com/apache/arrow-go/v18/parquet/compress" "github.com/apache/arrow-go/v18/parquet/file" "github.com/apache/arrow-go/v18/parquet/internal/encoding" - "github.com/apache/arrow-go/v18/parquet/internal/encryption" format "github.com/apache/arrow-go/v18/parquet/internal/gen-go/parquet" "github.com/apache/arrow-go/v18/parquet/internal/thrift" "github.com/apache/arrow-go/v18/parquet/metadata" @@ -994,6 +994,32 @@ func TestEncryptFile(t *testing.T) { require.NoError(t, err) } +type keyIDRetriever map[string]string + +// PutKey adds a key with the given string ID that can be retrieved +func (s keyIDRetriever) PutKey(keyID, key string) { + s[keyID] = key +} + +// GetKey expects the keymetadata to match one of the keys that were added +// with PutKey and panics if the key cannot be found. +func (s keyIDRetriever) GetKey(keyMetadata []byte) string { + var mdMap map[string]any + err := json.Unmarshal(keyMetadata, &mdMap) + if err != nil { + panic(fmt.Errorf("parquet: invalid key metadata: %w", err)) + } + keyMetadataStr, ok := mdMap["masterKeyID"].(string) + if !ok { + panic(fmt.Errorf("parquet: masterKeyID missing from key metadata")) + } + k, ok := s[keyMetadataStr] + if !ok { + panic(fmt.Errorf("parquet: key missing for id %s", keyMetadata)) + } + return k +} + func TestDecryptFile(t *testing.T) { dir := "../../tools" require.DirExists(t, dir) @@ -1001,7 +1027,7 @@ func TestDecryptFile(t *testing.T) { footerKeyID := "footer_key" footerKey := "0123456789012345" - stringKr := make(encryption.StringKeyIDRetriever) + stringKr := make(keyIDRetriever) stringKr.PutKey(footerKeyID, footerKey) decryptProps := parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr)) // decryptProps := parquet.NewFileDecryptionProperties(parquet.WithFooterKey(footerKey))