@@ -27,8 +27,8 @@
 import pytest
 from datafusion import (
     DataFrame,
-    ParquetWriterOptions,
     ParquetColumnOptions,
+    ParquetWriterOptions,
     SessionContext,
     WindowFrame,
     column,
@@ -1668,7 +1668,9 @@ def test_write_parquet_with_options_compression(df, tmp_path, compression):
     import re

     path = tmp_path
-    df.write_parquet_with_options(str(path), ParquetWriterOptions(compression=compression))
+    df.write_parquet_with_options(
+        str(path), ParquetWriterOptions(compression=compression)
+    )

     # test that the actual compression scheme is the one written
     for _root, _dirs, files in os.walk(path):
@@ -1695,28 +1697,36 @@ def test_write_parquet_with_options_wrong_compression_level(df, tmp_path, compre
     path = tmp_path

     with pytest.raises(Exception, match=r"valid compression range .*? exceeded."):
-        df.write_parquet_with_options(str(path), ParquetWriterOptions(compression=compression))
+        df.write_parquet_with_options(
+            str(path), ParquetWriterOptions(compression=compression)
+        )


 @pytest.mark.parametrize("compression", ["wrong", "wrong(12)"])
 def test_write_parquet_with_options_invalid_compression(df, tmp_path, compression):
     path = tmp_path

     with pytest.raises(Exception, match="Unknown or unsupported parquet compression"):
-        df.write_parquet_with_options(str(path), ParquetWriterOptions(compression=compression))
+        df.write_parquet_with_options(
+            str(path), ParquetWriterOptions(compression=compression)
+        )

 @pytest.mark.parametrize(
     ("writer_version", "format_version"),
     [("1.0", "1.0"), ("2.0", "2.6"), (None, "1.0")],
 )
-def test_write_parquet_with_options_writer_version(df, tmp_path, writer_version, format_version):
+def test_write_parquet_with_options_writer_version(
+    df, tmp_path, writer_version, format_version
+):
     """Test the Parquet writer version. Note that writer_version=2.0 results in
     format_version=2.6"""
     if writer_version is None:
         df.write_parquet_with_options(tmp_path, ParquetWriterOptions())
     else:
-        df.write_parquet_with_options(tmp_path, ParquetWriterOptions(writer_version=writer_version))
+        df.write_parquet_with_options(
+            tmp_path, ParquetWriterOptions(writer_version=writer_version)
+        )

     for file in tmp_path.rglob("*.parquet"):
         parquet = pq.ParquetFile(file)
@@ -1730,13 +1740,19 @@ def test_write_parquet_with_options_wrong_writer_version(df, tmp_path, writer_ve
     with pytest.raises(
         Exception, match="Unknown or unsupported parquet writer version"
     ):
-        df.write_parquet_with_options(tmp_path, ParquetWriterOptions(writer_version=writer_version))
+        df.write_parquet_with_options(
+            tmp_path, ParquetWriterOptions(writer_version=writer_version)
+        )


 @pytest.mark.parametrize("dictionary_enabled", [True, False, None])
-def test_write_parquet_with_options_dictionary_enabled(df, tmp_path, dictionary_enabled):
+def test_write_parquet_with_options_dictionary_enabled(
+    df, tmp_path, dictionary_enabled
+):
     """Test enabling/disabling the dictionaries in Parquet."""
-    df.write_parquet_with_options(tmp_path, ParquetWriterOptions(dictionary_enabled=dictionary_enabled))
+    df.write_parquet_with_options(
+        tmp_path, ParquetWriterOptions(dictionary_enabled=dictionary_enabled)
+    )
     # by default, the dictionary is enabled, so None results in True
     result = dictionary_enabled if dictionary_enabled is not None else True

@@ -1758,7 +1774,9 @@ def test_write_parquet_with_options_statistics_enabled(
 ):
     """Test configuring the statistics in Parquet. In pyarrow we can only check for
     column-level statistics, so "page" and "chunk" are tested in the same way."""
-    df.write_parquet_with_options(tmp_path, ParquetWriterOptions(statistics_enabled=statistics_enabled))
+    df.write_parquet_with_options(
+        tmp_path, ParquetWriterOptions(statistics_enabled=statistics_enabled)
+    )

     for file in tmp_path.rglob("*.parquet"):
         parquet = pq.ParquetFile(file)
@@ -1773,11 +1791,15 @@ def test_write_parquet_with_options_statistics_enabled(


 @pytest.mark.parametrize("max_row_group_size", [1000, 5000, 10000, 100000])
-def test_write_parquet_with_options_max_row_group_size(large_df, tmp_path, max_row_group_size):
+def test_write_parquet_with_options_max_row_group_size(
+    large_df, tmp_path, max_row_group_size
+):
     """Test configuring the max number of rows per group in Parquet. These test cases
     guarantee that the number of rows for each row group is max_row_group_size, given
     the total number of rows is a multiple of max_row_group_size."""
-    large_df.write_parquet_with_options(tmp_path, ParquetWriterOptions(max_row_group_size=max_row_group_size))
+    large_df.write_parquet_with_options(
+        tmp_path, ParquetWriterOptions(max_row_group_size=max_row_group_size)
+    )

     for file in tmp_path.rglob("*.parquet"):
         parquet = pq.ParquetFile(file)
@@ -1812,7 +1834,10 @@ def test_write_parquet_with_options_statistics_truncate_length(
         "b": ["a_smaller", "m_smaller", "z_smaller"],
     }
     df = ctx.from_arrow(pa.record_batch(data))
-    df.write_parquet_with_options(tmp_path, ParquetWriterOptions(statistics_truncate_length=statistics_truncate_length))
+    df.write_parquet_with_options(
+        tmp_path,
+        ParquetWriterOptions(statistics_truncate_length=statistics_truncate_length),
+    )

     for file in tmp_path.rglob("*.parquet"):
         parquet = pq.ParquetFile(file)
@@ -1870,11 +1895,13 @@ def test_write_parquet_with_options_encoding(tmp_path, encoding, data_types, res
         data["float"] = [1.01, 2.02, 3.03]
     elif data_type == "str":
         data["str"] = ["a", "b", "c"]
-    elif data_type == "bool":
+    elif data_type == "bool":
         data["bool"] = [True, False, True]

     df = ctx.from_arrow(pa.record_batch(data))
-    df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding, dictionary_enabled=False))
+    df.write_parquet_with_options(
+        tmp_path, ParquetWriterOptions(encoding=encoding, dictionary_enabled=False)
+    )

     for file in tmp_path.rglob("*.parquet"):
         parquet = pq.ParquetFile(file)
@@ -1901,7 +1928,9 @@ def test_write_parquet_with_options_invalid_encoding(df, tmp_path, encoding):


 @pytest.mark.parametrize("encoding", ["plain_dictionary", "rle_dictionary"])
-def test_write_parquet_with_options_dictionary_encoding_fallback(df, tmp_path, encoding):
+def test_write_parquet_with_options_dictionary_encoding_fallback(
+    df, tmp_path, encoding
+):
     """Test that the dictionary encoding cannot be used as fallback in Parquet."""
     # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519
     with pytest.raises(
@@ -1918,7 +1947,9 @@ def test_write_parquet_with_options_bloom_filter(df, tmp_path):
     path_bloom_filter = tmp_path / "2"

     df.write_parquet_with_options(path_no_bloom_filter, ParquetWriterOptions())
-    df.write_parquet_with_options(path_bloom_filter, ParquetWriterOptions(bloom_filter_on_write=True))
+    df.write_parquet_with_options(
+        path_bloom_filter, ParquetWriterOptions(bloom_filter_on_write=True)
+    )

     size_no_bloom_filter = 0
     for file in path_no_bloom_filter.rglob("*.parquet"):
@@ -1989,8 +2020,9 @@ def test_write_parquet_with_options_column_options(df, tmp_path):
     df = ctx.from_arrow(pa.record_batch(data))
     df.write_parquet_with_options(
         tmp_path,
-        ParquetWriterOptions(compression="brotli(8)",
-                             column_specific_options=column_specific_options),
+        ParquetWriterOptions(
+            compression="brotli(8)", column_specific_options=column_specific_options
+        ),
     )

     for file in tmp_path.rglob("*.parquet"):
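
For reference, a minimal standalone sketch of the writer API these tests exercise, assuming the datafusion and pyarrow packages are installed. The sample data, option values, and output directory are illustrative, not taken from this PR:

from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
from datafusion import ParquetWriterOptions, SessionContext

ctx = SessionContext()
df = ctx.from_arrow(pa.record_batch({"a": [1, 2, 3], "b": ["x", "y", "z"]}))

# Compression levels use the "codec(level)" string form seen in the tests above.
df.write_parquet_with_options(
    "out",  # hypothetical output directory
    ParquetWriterOptions(compression="brotli(8)", max_row_group_size=1000),
)

# Read the files back with pyarrow to inspect what was actually written.
for file in Path("out").rglob("*.parquet"):
    print(pq.ParquetFile(file).metadata)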