From 43fde4500a2e9fb7ccace21b36e388398a854629 Mon Sep 17 00:00:00 2001 From: ChiLin Chiu Date: Sat, 21 Feb 2026 18:34:31 +0800 Subject: [PATCH 1/4] doc: add pyarrow.parquet.filters_to_expression example Signed-off-by: ChiLin Chiu --- docs/source/python/dataset.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index 4e18ea0a51cd..fca34ea852d8 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -569,6 +569,28 @@ calculate the average of a column without loading the entire column into memory: ... count += batch.num_rows >>> mean_a = col2_sum/count +The ``filter`` argument of :meth:`Dataset.to_batches` (and :meth:`~Dataset.to_table`) +expects a boolean :class:`~pyarrow.dataset.Expression`, which can be constructed using +:func:`pyarrow.dataset.field` and its operator overloads. However, if you already have +filters in the DNF (Disjunctive Normal Form) list-of-tuples format accepted by +:class:`pyarrow.parquet.ParquetDataset`, you can convert them to an ``Expression`` +using :func:`pyarrow.parquet.filters_to_expression`: + +.. code-block:: python + + >>> import pyarrow.parquet as pq + >>> import pyarrow.compute as pc + >>> filters = [("a", ">=", 5), ("c", "==", 2)] + >>> filter_expr = pq.filters_to_expression(filters) + >>> filter_expr + <pyarrow.compute.Expression ((a >= 5) and (c == 2))> + >>> a_sum = 0 + >>> for batch in dataset.to_batches(columns=["a"], filter=filter_expr): + ... if batch.num_rows: + ...
a_sum += pc.sum(batch.column("a")).as_py() + >>> a_sum + 21 + Customizing the batch size ~~~~~~~~~~~~~~~~~~~~~~~~~~ From 6e04566c4b421aee4c6c15130be9923b9fe50622 Mon Sep 17 00:00:00 2001 From: ChiLin Chiu Date: Sun, 22 Feb 2026 13:05:43 +0800 Subject: [PATCH 2/4] doc: fix doctest error Signed-off-by: ChiLin Chiu --- docs/source/python/dataset.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index fca34ea852d8..65dfda7d4a7b 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -580,6 +580,7 @@ using :func:`pyarrow.parquet.filters_to_expression`: >>> import pyarrow.parquet as pq >>> import pyarrow.compute as pc + >>> dataset = ds.dataset(base / "parquet_dataset", format="parquet") >>> filters = [("a", ">=", 5), ("c", "==", 2)] >>> filter_expr = pq.filters_to_expression(filters) >>> filter_expr From 951e45bfac8524bf652387c6ff18b09f518dc14e Mon Sep 17 00:00:00 2001 From: ChiLin Chiu Date: Fri, 6 Mar 2026 23:37:16 +0800 Subject: [PATCH 3/4] Update docs/source/python/dataset.rst Co-authored-by: Alenka Frim --- docs/source/python/dataset.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index 65dfda7d4a7b..b90df30ac5c9 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -578,8 +578,6 @@ using :func:`pyarrow.parquet.filters_to_expression`: .. 
code-block:: python - >>> import pyarrow.parquet as pq - >>> import pyarrow.compute as pc >>> dataset = ds.dataset(base / "parquet_dataset", format="parquet") >>> filters = [("a", ">=", 5), ("c", "==", 2)] >>> filter_expr = pq.filters_to_expression(filters) From aa99f53860d1154f4e8a4602cce151cf6f9200c6 Mon Sep 17 00:00:00 2001 From: ChiLin Chiu Date: Fri, 6 Mar 2026 23:41:14 +0800 Subject: [PATCH 4/4] Apply suggestion from @AlenkaF Co-authored-by: Alenka Frim --- docs/source/python/dataset.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index b90df30ac5c9..4736fe570b9d 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -578,7 +578,8 @@ using :func:`pyarrow.parquet.filters_to_expression`: .. code-block:: python - >>> dataset = ds.dataset(base / "parquet_dataset", format="parquet") + >>> dataset.to_table(columns=['a']) + >>> filters = [("a", ">=", 5), ("c", "==", 2)] >>> filter_expr = pq.filters_to_expression(filters) >>> filter_expr