From 6e8b983540a18b995ce184aa9e2cbf9846c2764f Mon Sep 17 00:00:00 2001
From: Rok Mihevc
Date: Sat, 20 Sep 2025 20:20:21 +0200
Subject: [PATCH] squash commits for easier rebase

rebase and some minor work
dan's homework
minor post rebase change
lint
annotation fix
fix some annotations
fix path on macos
add type checking guidelines for developers
package stubs into wheels and test for presence
Add typechecking for Windows
Add typechecking for macos
Moving typechecks under 'Execute Docker Build' step
test for pyarrow
Review feedback
some fixes
remove some newlines
fixes
more fix
fix
cleanup
test
minor
fix
fix CI
fix mypy
minor fixes
fix ty checks
WIP
WIP
pyright for test_{pandas,scalars,schema,substrait}.py
pyright for test_{sparse_tensor,substrait,tensor,types,udf,without_numpy}.py and util.py
pyright for test_compute.py
pyright for test_dataset.py
pyright test_types.py
pyright work
yet further pyright work
further pyright work
Make pyright stricter
workaround for shadowed types module
bump python in pyright
Update python/pyproject.toml
Co-authored-by: Dan Redding <125183946+dangotbanned@users.noreply.github.com>
misc
Update python/pyarrow-stubs/pyarrow/compute.pyi
Co-authored-by: Dan Redding <125183946+dangotbanned@users.noreply.github.com>
ty
reduce mypy errors
experiment
fix pyright config
fixing missing-imports
Some changes
Update python/pyarrow-stubs/_compute.pyi
Co-authored-by: Dan Redding <125183946+dangotbanned@users.noreply.github.com>
Apply suggestions from code review
Co-authored-by: Dan Redding <125183946+dangotbanned@users.noreply.github.com>
minor
adding some ignores to pass more checks
Add CI check
Add utility for adding docstrings into annotations
Minor changes to pyarrow so some typechecks pass
Add pyarrow-stubs minus their docstrings
---
 .github/workflows/python.yml | 9 +
 ci/scripts/python_test_type_annotations.bat | 38 +
 ci/scripts/python_wheel_windows_build.bat | 5 +
 compose.yaml | 3 +-
 dev/update_stub_docstrings.py | 214 ++
 docs/source/developers/python/development.rst | 2 +-
 python/pyarrow-stubs/pyarrow/__init__.pyi | 685 +++++-
 python/pyarrow-stubs/pyarrow/_acero.pyi | 163 ++
 python/pyarrow-stubs/pyarrow/_azurefs.pyi | 36 +
 python/pyarrow-stubs/pyarrow/_compute.pyi | 671 ++++++
 .../pyarrow/_compute_docstring.pyi | 18 +
 python/pyarrow-stubs/pyarrow/_csv.pyi | 132 ++
 python/pyarrow-stubs/pyarrow/_cuda.pyi | 158 ++
 python/pyarrow-stubs/pyarrow/_dataset.pyi | 682 ++++++
 python/pyarrow-stubs/pyarrow/_dataset_orc.pyi | 24 +
 .../pyarrow/_dataset_parquet.pyi | 200 ++
 .../pyarrow/_dataset_parquet_encryption.pyi | 58 +
 python/pyarrow-stubs/pyarrow/_feather.pyi | 51 +
 python/pyarrow-stubs/pyarrow/_flight.pyi | 660 ++++++
 python/pyarrow-stubs/pyarrow/_fs.pyi | 234 +++
 python/pyarrow-stubs/pyarrow/_gcsfs.pyi | 43 +
 python/pyarrow-stubs/pyarrow/_hdfs.pyi | 37 +
 python/pyarrow-stubs/pyarrow/_ipc.pyi | 317 +++
 python/pyarrow-stubs/pyarrow/_json.pyi | 66 +
 python/pyarrow-stubs/pyarrow/_orc.pyi | 77 +
 python/pyarrow-stubs/pyarrow/_parquet.pyi | 524 +++++
 .../pyarrow/_parquet_encryption.pyi | 141 ++
 python/pyarrow-stubs/pyarrow/_s3fs.pyi | 106 +
 .../pyarrow-stubs/pyarrow/_stubs_typing.pyi | 133 ++
 python/pyarrow-stubs/pyarrow/_substrait.pyi | 64 +
 python/pyarrow-stubs/pyarrow/_types.pyi | 966 +++++++++
 python/pyarrow-stubs/pyarrow/array.pyi | 894 ++++++++
 python/pyarrow-stubs/pyarrow/builder.pyi | 51 +
 python/pyarrow-stubs/pyarrow/cffi.pyi | 21 +
 python/pyarrow-stubs/pyarrow/compat.pyi | 23 +
 python/pyarrow-stubs/pyarrow/compute.pyi | 1834 +++++++++++++++++
python/pyarrow-stubs/pyarrow/config.pyi | 72 + python/pyarrow-stubs/pyarrow/csv.pyi | 44 + python/pyarrow-stubs/pyarrow/cuda.pyi | 42 + python/pyarrow-stubs/pyarrow/dataset.pyi | 199 ++ python/pyarrow-stubs/pyarrow/device.pyi | 66 + python/pyarrow-stubs/pyarrow/error.pyi | 104 + python/pyarrow-stubs/pyarrow/feather.pyi | 81 + python/pyarrow-stubs/pyarrow/flight.pyi | 112 + python/pyarrow-stubs/pyarrow/fs.pyi | 112 + python/pyarrow-stubs/pyarrow/gandiva.pyi | 110 + .../pyarrow/interchange/__init__.pyi | 20 + .../pyarrow/interchange/buffer.pyi | 41 + .../pyarrow/interchange/column.pyi | 93 + .../pyarrow/interchange/dataframe.pyi | 52 + .../pyarrow/interchange/from_dataframe.pyi | 92 + python/pyarrow-stubs/pyarrow/io.pyi | 430 ++++ python/pyarrow-stubs/pyarrow/ipc.pyi | 162 ++ python/pyarrow-stubs/pyarrow/json.pyi | 20 + python/pyarrow-stubs/pyarrow/lib.pyi | 133 ++ python/pyarrow-stubs/pyarrow/memory.pyi | 94 + python/pyarrow-stubs/pyarrow/orc.pyi | 146 ++ .../pyarrow-stubs/pyarrow/pandas_compat.pyi | 92 + python/pyarrow-stubs/pyarrow/pandas_shim.pyi | 73 + .../pyarrow/parquet/__init__.pyi | 18 + python/pyarrow-stubs/pyarrow/parquet/core.pyi | 372 ++++ .../pyarrow/parquet/encryption.pyi | 34 + python/pyarrow-stubs/pyarrow/scalar.pyi | 466 +++++ python/pyarrow-stubs/pyarrow/substrait.pyi | 38 + python/pyarrow-stubs/pyarrow/table.pyi | 686 ++++++ python/pyarrow-stubs/pyarrow/tensor.pyi | 268 +++ python/pyarrow-stubs/pyarrow/tests/util.pyi | 93 + python/pyarrow-stubs/pyarrow/types.pyi | 227 ++ python/pyarrow-stubs/pyarrow/util.pyi | 49 + python/pyarrow/__init__.py | 19 +- python/pyarrow/acero.py | 13 +- python/pyarrow/array.pxi | 2 +- python/pyarrow/benchmark.py | 2 +- python/pyarrow/cffi.py | 9 +- python/pyarrow/compute.py | 18 +- python/pyarrow/conftest.py | 6 +- python/pyarrow/cuda.py | 2 +- python/pyarrow/dataset.py | 18 +- python/pyarrow/feather.py | 3 +- python/pyarrow/flight.py | 2 +- python/pyarrow/fs.py | 11 +- python/pyarrow/orc.py | 8 +- python/pyarrow/pandas_compat.py | 57 +- python/pyarrow/parquet/core.py | 26 +- python/pyarrow/parquet/encryption.py | 3 +- python/pyarrow/scalar.pxi | 2 +- python/pyarrow/tests/conftest.py | 7 +- .../tests/interchange/test_conversion.py | 40 +- .../interchange/test_interchange_spec.py | 2 +- python/pyarrow/tests/parquet/common.py | 17 +- python/pyarrow/tests/parquet/encryption.py | 4 +- python/pyarrow/tests/parquet/test_basic.py | 18 +- .../parquet/test_compliant_nested_type.py | 5 +- .../pyarrow/tests/parquet/test_data_types.py | 14 +- python/pyarrow/tests/parquet/test_dataset.py | 74 +- python/pyarrow/tests/parquet/test_datetime.py | 13 +- .../pyarrow/tests/parquet/test_encryption.py | 15 +- python/pyarrow/tests/parquet/test_metadata.py | 118 +- python/pyarrow/tests/parquet/test_pandas.py | 65 +- .../tests/parquet/test_parquet_file.py | 24 +- .../tests/parquet/test_parquet_writer.py | 11 +- python/pyarrow/tests/strategies.py | 55 +- python/pyarrow/tests/test_acero.py | 39 +- .../pyarrow/tests/test_adhoc_memory_leak.py | 2 +- python/pyarrow/tests/test_array.py | 71 +- python/pyarrow/tests/test_cffi.py | 25 +- python/pyarrow/tests/test_compute.py | 198 +- python/pyarrow/tests/test_convert_builtin.py | 42 +- python/pyarrow/tests/test_cpp_internals.py | 3 +- python/pyarrow/tests/test_csv.py | 19 +- python/pyarrow/tests/test_cuda.py | 53 +- .../pyarrow/tests/test_cuda_numba_interop.py | 25 +- python/pyarrow/tests/test_cython.py | 2 +- python/pyarrow/tests/test_dataset.py | 181 +- .../pyarrow/tests/test_dataset_encryption.py | 35 +- 
python/pyarrow/tests/test_device.py | 12 +- python/pyarrow/tests/test_extension_type.py | 106 +- python/pyarrow/tests/test_feather.py | 34 +- python/pyarrow/tests/test_flight.py | 94 +- python/pyarrow/tests/test_flight_async.py | 4 +- python/pyarrow/tests/test_fs.py | 93 +- python/pyarrow/tests/test_gandiva.py | 9 +- python/pyarrow/tests/test_gdb.py | 7 + python/pyarrow/tests/test_io.py | 67 +- python/pyarrow/tests/test_ipc.py | 36 +- python/pyarrow/tests/test_json.py | 16 +- python/pyarrow/tests/test_jvm.py | 10 +- python/pyarrow/tests/test_orc.py | 20 +- python/pyarrow/tests/test_pandas.py | 95 +- python/pyarrow/tests/test_scalars.py | 38 +- python/pyarrow/tests/test_schema.py | 11 +- python/pyarrow/tests/test_sparse_tensor.py | 20 +- python/pyarrow/tests/test_strategies.py | 4 +- python/pyarrow/tests/test_substrait.py | 95 +- python/pyarrow/tests/test_table.py | 120 +- python/pyarrow/tests/test_tensor.py | 2 +- python/pyarrow/tests/test_types.py | 55 +- python/pyarrow/tests/test_udf.py | 14 +- python/pyarrow/tests/test_without_numpy.py | 1 + python/pyarrow/tests/util.py | 6 +- python/pyarrow/vendored/docscrape.py | 9 +- python/pyproject.toml | 38 +- python/scripts/run_emscripten_tests.py | 2 +- 143 files changed, 15380 insertions(+), 997 deletions(-) create mode 100644 ci/scripts/python_test_type_annotations.bat create mode 100644 dev/update_stub_docstrings.py create mode 100644 python/pyarrow-stubs/pyarrow/_acero.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_azurefs.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_compute.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_compute_docstring.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_csv.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_cuda.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_dataset.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_dataset_orc.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_feather.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_flight.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_fs.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_gcsfs.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_hdfs.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_ipc.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_json.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_orc.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_parquet.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_s3fs.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_stubs_typing.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_substrait.pyi create mode 100644 python/pyarrow-stubs/pyarrow/_types.pyi create mode 100644 python/pyarrow-stubs/pyarrow/array.pyi create mode 100644 python/pyarrow-stubs/pyarrow/builder.pyi create mode 100644 python/pyarrow-stubs/pyarrow/cffi.pyi create mode 100644 python/pyarrow-stubs/pyarrow/compat.pyi create mode 100644 python/pyarrow-stubs/pyarrow/compute.pyi create mode 100644 python/pyarrow-stubs/pyarrow/config.pyi create mode 100644 python/pyarrow-stubs/pyarrow/csv.pyi create mode 100644 python/pyarrow-stubs/pyarrow/cuda.pyi create mode 100644 python/pyarrow-stubs/pyarrow/dataset.pyi create mode 100644 python/pyarrow-stubs/pyarrow/device.pyi create mode 100644 python/pyarrow-stubs/pyarrow/error.pyi create mode 100644 
python/pyarrow-stubs/pyarrow/feather.pyi create mode 100644 python/pyarrow-stubs/pyarrow/flight.pyi create mode 100644 python/pyarrow-stubs/pyarrow/fs.pyi create mode 100644 python/pyarrow-stubs/pyarrow/gandiva.pyi create mode 100644 python/pyarrow-stubs/pyarrow/interchange/__init__.pyi create mode 100644 python/pyarrow-stubs/pyarrow/interchange/buffer.pyi create mode 100644 python/pyarrow-stubs/pyarrow/interchange/column.pyi create mode 100644 python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi create mode 100644 python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi create mode 100644 python/pyarrow-stubs/pyarrow/io.pyi create mode 100644 python/pyarrow-stubs/pyarrow/ipc.pyi create mode 100644 python/pyarrow-stubs/pyarrow/json.pyi create mode 100644 python/pyarrow-stubs/pyarrow/lib.pyi create mode 100644 python/pyarrow-stubs/pyarrow/memory.pyi create mode 100644 python/pyarrow-stubs/pyarrow/orc.pyi create mode 100644 python/pyarrow-stubs/pyarrow/pandas_compat.pyi create mode 100644 python/pyarrow-stubs/pyarrow/pandas_shim.pyi create mode 100644 python/pyarrow-stubs/pyarrow/parquet/__init__.pyi create mode 100644 python/pyarrow-stubs/pyarrow/parquet/core.pyi create mode 100644 python/pyarrow-stubs/pyarrow/parquet/encryption.pyi create mode 100644 python/pyarrow-stubs/pyarrow/scalar.pyi create mode 100644 python/pyarrow-stubs/pyarrow/substrait.pyi create mode 100644 python/pyarrow-stubs/pyarrow/table.pyi create mode 100644 python/pyarrow-stubs/pyarrow/tensor.pyi create mode 100644 python/pyarrow-stubs/pyarrow/tests/util.pyi create mode 100644 python/pyarrow-stubs/pyarrow/types.pyi create mode 100644 python/pyarrow-stubs/pyarrow/util.pyi diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index bc7fe3cd6830..b0d526b1ee04 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -239,6 +239,11 @@ jobs: - name: Test shell: bash run: ci/scripts/python_test.sh $(pwd) $(pwd)/build + - name: Test annotations + shell: bash + env: + PYARROW_TEST_ANNOTATIONS: "ON" + run: ci/scripts/python_test_type_annotations.sh $(pwd)/python windows: name: AMD64 Windows 2022 Python 3.13 @@ -296,3 +301,7 @@ jobs: shell: cmd run: | call "ci\scripts\python_test.bat" %cd% + - name: Test annotations + shell: cmd + run: | + call "ci\scripts\python_test_type_annotations.bat" %cd%\python diff --git a/ci/scripts/python_test_type_annotations.bat b/ci/scripts/python_test_type_annotations.bat new file mode 100644 index 000000000000..3446e329a899 --- /dev/null +++ b/ci/scripts/python_test_type_annotations.bat @@ -0,0 +1,38 @@ +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. See the License for the +@rem specific language governing permissions and limitations +@rem under the License. + +@echo on + +set PYARROW_DIR=%1 + +echo Annotation testing on Windows ... 
+ +@REM Install library stubs +%PYTHON_CMD% -m pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 + +@REM Install other dependencies for type checking +%PYTHON_CMD% -m pip install fsspec || exit /B 1 + +@REM Install type checkers +%PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1 + +@REM Run type checkers +pushd %PYARROW_DIR% + +mypy +pyright +ty check diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index fc256d72785c..2021e2d41d38 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -135,6 +135,11 @@ pushd C:\arrow\python @REM Build wheel %PYTHON_CMD% -m build --sdist --wheel . --no-isolation || exit /B 1 +@REM We first populate stub docstrings and then build the wheel +%PYTHON_CMD% setup.py build_ext --inplace +%PYTHON_CMD% -m pip install griffe libcst +%PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs + @REM Repair the wheel with delvewheel @REM @REM Since we bundled the Arrow C++ libraries ourselves, we only need to diff --git a/compose.yaml b/compose.yaml index c799059fe254..87b79300011a 100644 --- a/compose.yaml +++ b/compose.yaml @@ -1539,8 +1539,7 @@ services: /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && - /arrow/ci/scripts/python_test.sh /arrow && - /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] + /arrow/ci/scripts/python_test.sh /arrow"] conda-python-dask: # Possible $DASK parameters: diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py new file mode 100644 index 000000000000..eaeb2a510eb5 --- /dev/null +++ b/dev/update_stub_docstrings.py @@ -0,0 +1,214 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Utility to extract docstrings from pyarrow and update +# docstrings in stubfiles. +# +# Usage +# ===== +# +# python ../dev/update_stub_docstrings.py pyarrow-stubs + + +from pathlib import Path +from textwrap import indent + +import click +# TODO: perhaps replace griffe with importlib +import griffe +from griffe import AliasResolutionError +import libcst +from libcst import matchers as m + + +def _get_docstring(name, package, indentation): + # print("extract_docstrings", name) + try: + obj = package.get_member(name) + except (KeyError, ValueError, AliasResolutionError): + # Some cython __init__ symbols can't be found + # e.g. 
pyarrow.lib.OSFile.__init__ + stack = name.split(".") + parent_name = ".".join(stack[:-1]) + + try: + obj = package.get_member(parent_name).all_members[stack[-1]] + except (KeyError, ValueError, AliasResolutionError): + print(f"{name} not found in {package.name}, it's probably ok.") + return None + + if obj.has_docstring: + docstring = obj.docstring.value + # Remove signature if present in docstring + if docstring.startswith(obj.name) or ( + (hasattr(obj.parent, "name") and + docstring.startswith(f"{obj.parent.name}.{obj.name}"))): + docstring = "\n".join(docstring.splitlines()[2:]) + # Skip empty docstrings + if docstring.strip() == "": + return None + # Indent docstring + indentation_prefix = indentation * " " + docstring = indent(docstring + '\n"""', indentation_prefix) + docstring = '"""\n' + docstring + return docstring + return None + + +class ReplaceEllipsis(libcst.CSTTransformer): + def __init__(self, package, namespace): + self.package = package + self.base_namespace = namespace + self.stack = [] + self.indentation = 0 + + # Insert module level docstring if _clone_signature is used + def leave_Module(self, original_node, updated_node): + new_body = [] + clone_matcher = m.SimpleStatementLine( + body=[m.Assign( + value=m.Call(func=m.Name(value="_clone_signature")) + ), m.ZeroOrMore()] + ) + for statement in updated_node.body: + new_body.append(statement) + if m.matches(statement, clone_matcher): + name = statement.body[0].targets[0].target.value + if self.base_namespace: + name = f"{self.base_namespace}.{name}" + docstring = _get_docstring(name, self.package, 0) + if docstring is not None: + new_expr = libcst.Expr(value=libcst.SimpleString(docstring)) + new_line = libcst.SimpleStatementLine(body=[new_expr]) + new_body.append(new_line) + + return updated_node.with_changes(body=new_body) + + def visit_ClassDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_ClassDef(self, original_node, updated_node): + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." + name + + class_matcher_1 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.SimpleStatementLine( + body=[m.Expr(m.Ellipsis()), m.ZeroOrMore()] + ), m.ZeroOrMore()] + ) + ) + class_matcher_2 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.FunctionDef(), m.ZeroOrMore()] + ) + ) + + if m.matches(updated_node, class_matcher_1): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_node = libcst.SimpleString(value=docstring) + updated_node = updated_node.deep_replace( + updated_node.body.body[0].body[0].value, new_node) + + if m.matches(updated_node, class_matcher_2): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_docstring = libcst.SimpleString(value=docstring) + new_body = [ + libcst.SimpleWhitespace(self.indentation * " "), + libcst.Expr(value=new_docstring), + libcst.Newline() + ] + list(updated_node.body.body) + new_body = libcst.IndentedBlock(body=new_body) + updated_node = updated_node.with_changes(body=new_body) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + def visit_FunctionDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_FunctionDef(self, original_node, updated_node): + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." 
+ name + + function_matcher = m.FunctionDef( + name=m.Name(), + body=m.SimpleStatementSuite( + body=[m.Expr( + m.Ellipsis() + )])) + if m.matches(original_node, function_matcher): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_docstring = libcst.SimpleString(value=docstring) + new_body = [ + libcst.SimpleWhitespace(self.indentation * " "), + libcst.Expr(value=new_docstring), + libcst.Newline() + ] + new_body = libcst.IndentedBlock(body=new_body) + updated_node = updated_node.with_changes(body=new_body) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + +@click.command() +@click.argument('pyarrow_folder', type=click.Path(resolve_path=True)) +def add_docs_to_stub_files(pyarrow_folder): + print("Updating docstrings of stub files in:", pyarrow_folder) + package = griffe.load("pyarrow", try_relative_path=True, + force_inspection=True, resolve_aliases=True) + lib_modules = ["array", "builder", "compat", "config", "device", "error", "io", + "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", + "_types"] + + for stub_file in Path(pyarrow_folder).rglob('*.pyi'): + if stub_file.name == "_stubs_typing.pyi": + continue + module = stub_file.with_suffix('').name + print(f"[{stub_file} {module}]") + + with open(stub_file, 'r') as f: + tree = libcst.parse_module(f.read()) + + if module in lib_modules: + module = "lib" + elif stub_file.parent.name in ["parquet", "interchange"]: + module = f"{stub_file.parent.name}.{module}" + elif module == "__init__": + module = "" + + modified_tree = tree.visit(ReplaceEllipsis(package, module)) + with open(stub_file, "w") as f: + f.write(modified_tree.code) + print("\n") + + +if __name__ == "__main__": + docstrings_map = {} + add_docs_to_stub_files(obj={}) diff --git a/docs/source/developers/python/development.rst b/docs/source/developers/python/development.rst index 5529ad25a294..2e2413522439 100644 --- a/docs/source/developers/python/development.rst +++ b/docs/source/developers/python/development.rst @@ -42,7 +42,7 @@ Unit Testing ============ We are using `pytest `_ to develop our unit -test suite. After `building the project `_ you can run its unit tests +test suite. After `building the project `_ you can run its unit tests like so: .. code-block:: diff --git a/python/pyarrow-stubs/pyarrow/__init__.pyi b/python/pyarrow-stubs/pyarrow/__init__.pyi index ccec8d5abc07..a38ddaa6fe3e 100644 --- a/python/pyarrow-stubs/pyarrow/__init__.pyi +++ b/python/pyarrow-stubs/pyarrow/__init__.pyi @@ -15,15 +15,682 @@ # specific language governing permissions and limitations # under the License. -"""Type stubs for PyArrow. +from typing import Any +import pyarrow.lib as _lib -This is a placeholder stub file. -Complete type annotations will be added in subsequent PRs. 
-""" +from pyarrow.lib import ( + BuildInfo, + CppBuildInfo, + RuntimeInfo, + set_timezone_db_path, + MonthDayNano, + VersionInfo, + build_info, + cpp_build_info, + cpp_version, + cpp_version_info, + runtime_info, + cpu_count, + set_cpu_count, + enable_signal_handlers, + io_thread_count, + set_io_thread_count, +) + +from pyarrow.lib import ( + null, + bool_, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + time32, + time64, + timestamp, + date32, + date64, + duration, + month_day_nano_interval, + float16, + float32, + float64, + binary, + string, + utf8, + binary_view, + string_view, + large_binary, + large_string, + large_utf8, + decimal32, + decimal64, + decimal128, + decimal256, + list_, + large_list, + list_view, + large_list_view, + map_, + struct, + union, + sparse_union, + dense_union, + dictionary, + run_end_encoded, + json_, + uuid, + fixed_shape_tensor, + bool8, + opaque, + field, + type_for_alias, + DataType, + DictionaryType, + StructType, + ListType, + LargeListType, + FixedSizeListType, + ListViewType, + LargeListViewType, + MapType, + UnionType, + SparseUnionType, + DenseUnionType, + TimestampType, + Time32Type, + Time64Type, + DurationType, + FixedSizeBinaryType, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + BaseExtensionType, + ExtensionType, + RunEndEncodedType, + FixedShapeTensorType, + Bool8Type, + UuidType, + JsonType, + OpaqueType, + UnknownExtensionType, + register_extension_type, + unregister_extension_type, + DictionaryMemo, + KeyValueMetadata, + Field, + Schema, + schema, + unify_schemas, + Array, + Tensor, + array, + arange, + chunked_array, + record_batch, + nulls, + repeat, + SparseCOOTensor, + SparseCSRMatrix, + SparseCSCMatrix, + SparseCSFTensor, + infer_type, + from_numpy_dtype, + NullArray, + NumericArray, + IntegerArray, + FloatingPointArray, + BooleanArray, + Int8Array, + UInt8Array, + Int16Array, + UInt16Array, + Int32Array, + UInt32Array, + Int64Array, + UInt64Array, + HalfFloatArray, + FloatArray, + DoubleArray, + ListArray, + LargeListArray, + FixedSizeListArray, + ListViewArray, + LargeListViewArray, + MapArray, + UnionArray, + BinaryArray, + StringArray, + LargeBinaryArray, + LargeStringArray, + BinaryViewArray, + StringViewArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, + Date64Array, + TimestampArray, + Time32Array, + Time64Array, + DurationArray, + MonthDayNanoIntervalArray, + Decimal32Array, + Decimal64Array, + Decimal128Array, + Decimal256Array, + StructArray, + ExtensionArray, + RunEndEncodedArray, + FixedShapeTensorArray, + Bool8Array, + UuidArray, + JsonArray, + OpaqueArray, + scalar, + NA, + _NULL as NULL, + Scalar, + NullScalar, + BooleanScalar, + Int8Scalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + UInt8Scalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + HalfFloatScalar, + FloatScalar, + DoubleScalar, + Decimal32Scalar, + Decimal64Scalar, + Decimal128Scalar, + Decimal256Scalar, + ListScalar, + LargeListScalar, + FixedSizeListScalar, + ListViewScalar, + LargeListViewScalar, + Date32Scalar, + Date64Scalar, + Time32Scalar, + Time64Scalar, + TimestampScalar, + DurationScalar, + MonthDayNanoIntervalScalar, + BinaryScalar, + LargeBinaryScalar, + BinaryViewScalar, + StringScalar, + LargeStringScalar, + StringViewScalar, + FixedSizeBinaryScalar, + DictionaryScalar, + MapScalar, + StructScalar, + UnionScalar, + RunEndEncodedScalar, + ExtensionScalar, + Bool8Scalar, + UuidScalar, + JsonScalar, + OpaqueScalar, +) + + +# Buffers, allocation +from pyarrow.lib import ( + 
DeviceAllocationType, + Device, + MemoryManager, + default_cpu_memory_manager +) + +from pyarrow.lib import ( + Buffer, + ResizableBuffer, + foreign_buffer, + py_buffer, + Codec, + compress, + decompress, + allocate_buffer, +) + +from pyarrow.lib import ( + MemoryPool, + LoggingMemoryPool, + ProxyMemoryPool, + total_allocated_bytes, + set_memory_pool, + default_memory_pool, + system_memory_pool, + jemalloc_memory_pool, + mimalloc_memory_pool, + logging_memory_pool, + proxy_memory_pool, + log_memory_allocations, + jemalloc_set_decay_ms, + supported_memory_backends, +) + +# I/O +from pyarrow.lib import ( + NativeFile, + PythonFile, + BufferedInputStream, + BufferedOutputStream, + CacheOptions, + CompressedInputStream, + CompressedOutputStream, + TransformInputStream, + transcoding_input_stream, + FixedSizeBufferWriter, + BufferReader, + BufferOutputStream, + OSFile, + MemoryMappedFile, + memory_map, + create_memory_map, + MockOutputStream, + input_stream, + output_stream, + have_libhdfs, +) + +from pyarrow.lib import ( + ChunkedArray, + RecordBatch, + Table, + table, + concat_arrays, + concat_batches, + concat_tables, + TableGroupBy, + RecordBatchReader, +) + +# Exceptions +from pyarrow.lib import ( + ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError, +) + +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc +import pyarrow.lib as lib +import pyarrow.types as types +import pyarrow.feather as feather +import pyarrow.compute as compute +import pyarrow.csv as csv +import pyarrow.json as json +import pyarrow.dataset as dataset + +# ---------------------------------------------------------------------- +# Deprecations + +from pyarrow.util import _deprecate_api, _deprecate_class + +from pyarrow.ipc import ( + Message, + MessageReader, + MetadataVersion, + RecordBatchFileReader, + RecordBatchFileWriter, + RecordBatchStreamReader, + RecordBatchStreamWriter, +) + + +__version__: str +_gc_enabled: bool + + +def show_versions() -> None: ... +def show_info() -> None: ... +def _module_is_available(module: str) -> bool: ... +def _filesystem_is_available(fs: str) -> bool: ... + + +def get_include() -> str: ... +def _get_pkg_config_executable() -> str: ... +def _has_pkg_config(pkgname: str) -> bool: ... +def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... +def get_libraries() -> list[str]: ... +def create_library_symlinks() -> None: ... +def get_library_dirs() -> list[str]: ... -from typing import Any -# TODO(GH-48970): remove __getattr__ before release as this -# will annotate non-existing attributes as Any. -# https://github.com/apache/arrow/issues/48970 -def __getattr__(name: str) -> Any: ... 
+__all__ = [ + "__version__", + "_lib", + "_gc_enabled", + "BuildInfo", + "CppBuildInfo", + "RuntimeInfo", + "set_timezone_db_path", + "MonthDayNano", + "VersionInfo", + "build_info", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "cpu_count", + "set_cpu_count", + "enable_signal_handlers", + "io_thread_count", + "set_io_thread_count", + "show_versions", + "show_info", + "_module_is_available", + "_filesystem_is_available", + "null", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "duration", + "month_day_nano_interval", + "float16", + "float32", + "float64", + "binary", + "string", + "utf8", + "binary_view", + "string_view", + "large_binary", + "large_string", + "large_utf8", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "struct", + "union", + "sparse_union", + "dense_union", + "dictionary", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "field", + "type_for_alias", + "DataType", + "DictionaryType", + "StructType", + "ListType", + "LargeListType", + "FixedSizeListType", + "ListViewType", + "LargeListViewType", + "MapType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "BaseExtensionType", + "ExtensionType", + "RunEndEncodedType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "DictionaryMemo", + "KeyValueMetadata", + "Field", + "Schema", + "schema", + "unify_schemas", + "Array", + "Tensor", + "array", + "arange", + "chunked_array", + "record_batch", + "nulls", + "repeat", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", + "infer_type", + "from_numpy_dtype", + "NullArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "BooleanArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "ListArray", + "LargeListArray", + "FixedSizeListArray", + "ListViewArray", + "LargeListViewArray", + "MapArray", + "UnionArray", + "BinaryArray", + "StringArray", + "LargeBinaryArray", + "LargeStringArray", + "BinaryViewArray", + "StringViewArray", + "FixedSizeBinaryArray", + "DictionaryArray", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "StructArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "RunEndEncodedArray", + "FixedShapeTensorArray", + "scalar", + "NA", + "NULL", + "Scalar", + "NullScalar", + "BooleanScalar", + "Int8Scalar", + "Int16Scalar", + "Int32Scalar", + "Int64Scalar", + "UInt8Scalar", + "UInt16Scalar", + "UInt32Scalar", + "UInt64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "ListScalar", + "LargeListScalar", + "FixedSizeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "Date32Scalar", + 
"Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "BinaryViewScalar", + "StringScalar", + "LargeStringScalar", + "StringViewScalar", + "FixedSizeBinaryScalar", + "DictionaryScalar", + "MapScalar", + "StructScalar", + "UnionScalar", + "RunEndEncodedScalar", + "ExtensionScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "DeviceAllocationType", + "Device", + "MemoryManager", + "default_cpu_memory_manager", + "Buffer", + "ResizableBuffer", + "foreign_buffer", + "py_buffer", + "Codec", + "compress", + "decompress", + "allocate_buffer", + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "total_allocated_bytes", + "set_memory_pool", + "default_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "logging_memory_pool", + "proxy_memory_pool", + "log_memory_allocations", + "jemalloc_set_decay_ms", + "supported_memory_backends", + "NativeFile", + "PythonFile", + "BufferedInputStream", + "BufferedOutputStream", + "CacheOptions", + "CompressedInputStream", + "CompressedOutputStream", + "TransformInputStream", + "transcoding_input_stream", + "FixedSizeBufferWriter", + "BufferReader", + "BufferOutputStream", + "OSFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "MockOutputStream", + "input_stream", + "output_stream", + "have_libhdfs", + "ChunkedArray", + "RecordBatch", + "Table", + "table", + "concat_arrays", + "concat_batches", + "concat_tables", + "TableGroupBy", + "RecordBatchReader", + "ArrowCancelled", + "ArrowCapacityError", + "ArrowException", + "ArrowKeyError", + "ArrowIndexError", + "ArrowInvalid", + "ArrowIOError", + "ArrowMemoryError", + "ArrowNotImplementedError", + "ArrowTypeError", + "ArrowSerializationError", + "serialize_pandas", + "deserialize_pandas", + "lib", + "ipc", + "types", + "_deprecate_api", + "_deprecate_class", + "Message", + "MessageReader", + "MetadataVersion", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "get_include", + "_get_pkg_config_executable", + "compute", + "feather", + "csv", + "json", + "_has_pkg_config", + "_read_pkg_config_variable", + "get_libraries", + "create_library_symlinks", + "dataset", + "get_library_dirs", +] diff --git a/python/pyarrow-stubs/pyarrow/_acero.pyi b/python/pyarrow-stubs/pyarrow/_acero.pyi new file mode 100644 index 000000000000..85ed9683e7ed --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_acero.pyi @@ -0,0 +1,163 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys +from collections.abc import Iterable, Collection, Sequence + +if sys.version_info >= (3, 11): + from typing import Self, LiteralString +else: + from typing_extensions import Self, LiteralString +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Literal + +from . import lib +from .compute import Expression +from .dataset import InMemoryDataset, Dataset +from .table import Aggregation, AggregateOptions + +_StrOrExpr: TypeAlias = str | Expression + +IntoField: TypeAlias = str | int | Expression +Target: TypeAlias = ( + IntoField + | tuple[IntoField, ...] + | list[str] + | list[int] + | list[Expression] + | list[IntoField] +) + +UserDefinedAggregation: TypeAlias = LiteralString +OutputName: TypeAlias = str +AggregationSpec: TypeAlias = tuple[ + Target, Aggregation | UserDefinedAggregation, AggregateOptions | None, OutputName +] + + +class Declaration(lib._Weakrefable): + def __init__( + self, + factory_name: str, + options: ExecNodeOptions, + inputs: list[Declaration] | None = None, + ) -> None: ... + @classmethod + def from_sequence(cls, decls: Iterable[Declaration]) -> Self: ... + def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... + def to_table(self, use_threads: bool = True) -> lib.Table: ... + + +class ExecNodeOptions(lib._Weakrefable): + ... + + +class TableSourceNodeOptions(ExecNodeOptions): + def __init__(self, table: lib.Table | lib.RecordBatch | None) -> None: ... + + +class FilterNodeOptions(ExecNodeOptions): + def __init__(self, filter_expression: Expression | None) -> None: ... + + +class ProjectNodeOptions(ExecNodeOptions): + def __init__(self, expressions: Collection[Expression], + names: Collection[str] | None = None) -> None: ... + + +class AggregateNodeOptions(ExecNodeOptions): + def __init__( + self, + aggregates: Iterable[ + tuple[ + Target, + Aggregation | UserDefinedAggregation, + AggregateOptions | None, + OutputName, + ] + ], + keys: Iterable[str | Expression] | None = None, + ) -> None: ... + + +class OrderByNodeOptions(ExecNodeOptions): + def __init__( + self, + sort_keys: + Iterable[tuple[str | Expression | int, Literal["ascending", "descending"]]] + = (), + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> None: ... + + +class HashJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + join_type: Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", + ], + left_keys: _StrOrExpr | Sequence[_StrOrExpr], + right_keys: _StrOrExpr | Sequence[_StrOrExpr], + left_output: Sequence[_StrOrExpr] | None = None, + right_output: Sequence[_StrOrExpr] | None = None, + output_suffix_for_left: str = "", + output_suffix_for_right: str = "", + filter_expression: + lib.BooleanScalar | lib.BooleanArray | Expression | None = None, + ) -> None: ... + + +class AsofJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + left_on: _StrOrExpr, + left_by: _StrOrExpr | Sequence[_StrOrExpr], + right_on: _StrOrExpr, + right_by: _StrOrExpr | Sequence[_StrOrExpr], + tolerance: int, + ) -> None: ... 
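As a usage reference for the Declaration and ExecNodeOptions signatures stubbed above, here is a minimal sketch (the in-memory table and column names are made up for illustration) of a two-node plan built through the public pyarrow.acero wrappers that re-export these classes:

    import pyarrow as pa
    import pyarrow.compute as pc
    from pyarrow.acero import Declaration, TableSourceNodeOptions, FilterNodeOptions

    # Toy input table (illustrative data only).
    table = pa.table({"a": [1, 2, 3, 4], "b": ["x", "y", "x", "y"]})

    # Each Declaration pairs an exec-node factory name with its options class;
    # from_sequence() chains them into a linear source -> filter plan.
    plan = Declaration.from_sequence([
        Declaration("table_source", TableSourceNodeOptions(table)),
        Declaration("filter", FilterNodeOptions(pc.field("a") > 2)),
    ])

    result = plan.to_table()  # or plan.to_reader() to stream record batches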
+ + +def _perform_join( + join_type: str, + left_operand: lib.Table | Dataset, + left_keys: str | list[str], + right_operand: lib.Table | Dataset, + right_keys: str | list[str], + left_suffix: str, + right_suffix: str, + use_threads: bool, + coalesce_keys: bool, + output_type: type[lib.Table | InMemoryDataset] = lib.Table, + filter_expression: Expression | None = None, +) -> lib.Table | InMemoryDataset: ... + + +def _filter_table( + table: lib.Table | lib.RecordBatch, filter_expression: Expression, + use_threads: bool = True) -> lib.Table | lib.RecordBatch: ... diff --git a/python/pyarrow-stubs/pyarrow/_azurefs.pyi b/python/pyarrow-stubs/pyarrow/_azurefs.pyi new file mode 100644 index 000000000000..5872de03825b --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_azurefs.pyi @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Literal + +from ._fs import FileSystem + + +class AzureFileSystem(FileSystem): + def __init__( + self, + account_name: str | None = None, + account_key: str | None = None, + blob_storage_authority: str | None = None, + dfs_storage_authority: str | None = None, + blob_storage_scheme: Literal["http", "https"] = "https", + dfs_storage_scheme: Literal["http", "https"] = "https", + sas_token: str | None = None, + tenant_id: str | None = None, + client_id: str | None = None, + client_secret: str | None = None, + ) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_compute.pyi b/python/pyarrow-stubs/pyarrow/_compute.pyi new file mode 100644 index 000000000000..dfe46908c080 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_compute.pyi @@ -0,0 +1,671 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import types as stdlib_types +from collections.abc import ( + Callable, + Iterable, + Mapping, + Sequence, +) + +from typing import ( + Any, + Literal, + TypeAlias, + TypedDict, +) + +from . import lib + +_Order: TypeAlias = Literal["ascending", "descending"] +_Placement: TypeAlias = Literal["at_start", "at_end"] + + +class Kernel(lib._Weakrefable): + ... 
+ + +class Function(lib._Weakrefable): + @property + def arity(self) -> int | stdlib_types.EllipsisType: ... + + @property + def kind( + self, + ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: ... + @property + def name(self) -> str: ... + @property + def num_kernels(self) -> int: ... + + @property + def kernels( + self, + ) -> list[ + ScalarKernel | VectorKernel | ScalarAggregateKernel | HashAggregateKernel + ]: ... + + def call( + self, + args: Iterable, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, + ) -> Any: ... + + +class FunctionOptions(lib._Weakrefable): + def serialize(self) -> lib.Buffer: ... + @classmethod + def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... + + +class FunctionRegistry(lib._Weakrefable): + def get_function(self, name: str) -> Function: ... + def list_functions(self) -> list[str]: ... + + +class HashAggregateFunction(Function): + ... + + +class HashAggregateKernel(Kernel): + ... + + +class ScalarAggregateFunction(Function): + ... + + +class ScalarAggregateKernel(Kernel): + ... + + +class ScalarFunction(Function): + ... + + +class ScalarKernel(Kernel): + ... + + +class VectorFunction(Function): + ... + + +class VectorKernel(Kernel): + ... + +# ==================== _compute.pyx Option classes ==================== + + +class ArraySortOptions(FunctionOptions): + def __init__( + self, + order: _Order = "ascending", + null_placement: _Placement = "at_end", + ) -> None: ... + + +class AssumeTimezoneOptions(FunctionOptions): + def __init__( + self, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + ) -> None: ... + + +class CastOptions(FunctionOptions): + allow_int_overflow: bool + allow_time_truncate: bool + allow_time_overflow: bool + allow_decimal_truncate: bool + allow_float_truncate: bool + allow_invalid_utf8: bool + + def __init__( + self, + target_type: lib.DataType | None = None, + *, + allow_int_overflow: bool | None = None, + allow_time_truncate: bool | None = None, + allow_time_overflow: bool | None = None, + allow_decimal_truncate: bool | None = None, + allow_float_truncate: bool | None = None, + allow_invalid_utf8: bool | None = None, + ) -> None: ... + @staticmethod + def safe(target_type: lib.DataType | None = None) -> CastOptions: ... + @staticmethod + def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... + def is_safe(self) -> bool: ... + + +class CountOptions(FunctionOptions): + def __init__(self, mode: Literal["only_valid", + "only_null", "all"] = "only_valid") -> None: ... + + +class CumulativeOptions(FunctionOptions): + def __init__(self, start: lib.Scalar | None = None, + *, skip_nulls: bool = False) -> None: ... + + +class CumulativeSumOptions(FunctionOptions): + def __init__(self, start: lib.Scalar | None = None, + *, skip_nulls: bool = False) -> None: ... + + +class DayOfWeekOptions(FunctionOptions): + def __init__(self, *, count_from_zero: bool = True, + week_start: int = 1) -> None: ... + + +class DictionaryEncodeOptions(FunctionOptions): + def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... + + +class RunEndEncodeOptions(FunctionOptions): + # TODO: default is DataType(int32) + def __init__(self, run_end_type: lib.DataType | str = ...) -> None: ... + + +class ElementWiseAggregateOptions(FunctionOptions): + def __init__(self, *, skip_nulls: bool = True) -> None: ... 
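To show how the FunctionOptions subclasses declared in this module surface at runtime, a short sketch (toy array; "count" is chosen only as an example kernel) of the keyword and explicit-options forms of the same call, plus the serialize()/deserialize() round-trip stubbed on FunctionOptions above:

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array([1, None, 2, 2, None])

    # Keyword form and explicit CountOptions form of the same kernel call.
    n_null = pc.count(arr, mode="only_null")
    n_null_explicit = pc.call_function(
        "count", [arr], pc.CountOptions(mode="only_null"))

    # Options objects round-trip through serialize()/deserialize().
    buf = pc.CountOptions(mode="only_null").serialize()
    restored = pc.FunctionOptions.deserialize(buf)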
+ + +class ExtractRegexOptions(FunctionOptions): + def __init__(self, pattern: str) -> None: ... + + +class ExtractRegexSpanOptions(FunctionOptions): + def __init__(self, pattern: str) -> None: ... + + +class FilterOptions(FunctionOptions): + def __init__(self, + null_selection_behavior: Literal["drop", + "emit_null"] = "drop") -> None: ... + + +class IndexOptions(FunctionOptions): + def __init__(self, value: lib.Scalar) -> None: ... + + +class JoinOptions(FunctionOptions): + def __init__( + self, + null_handling: + Literal["emit_null", "skip", "replace"] + = "emit_null", *, null_replacement: str = "") -> None: ... + + +class ListSliceOptions(FunctionOptions): + def __init__( + self, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + ) -> None: ... + + +class ListFlattenOptions(FunctionOptions): + def __init__(self, recursive: bool = False) -> None: ... + + +class MakeStructOptions(FunctionOptions): + def __init__( + self, + field_names: Sequence[str] = (), + *, + field_nullability: Sequence[bool] | None = None, + field_metadata: Sequence[lib.KeyValueMetadata] | None = None, + ) -> None: ... + + +class MapLookupOptions(FunctionOptions): + # TODO: query_key: Scalar or Object can be converted to Scalar + def __init__( + self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] + ) -> None: ... + + +class MatchSubstringOptions(FunctionOptions): + def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... + + +class ModeOptions(FunctionOptions): + def __init__(self, n: int = 1, *, skip_nulls: bool = True, + min_count: int = 0) -> None: ... + + +class NullOptions(FunctionOptions): + def __init__(self, *, nan_is_null: bool = False) -> None: ... + + +class PadOptions(FunctionOptions): + def __init__( + self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True + ) -> None: ... + + +class PairwiseOptions(FunctionOptions): + def __init__(self, period: int = 1) -> None: ... + + +class PartitionNthOptions(FunctionOptions): + def __init__(self, pivot: int, *, + null_placement: _Placement = "at_end") -> None: ... + + +class WinsorizeOptions(FunctionOptions): + def __init__(self, lower_limit: float, upper_limit: float) -> None: ... + + +class QuantileOptions(FunctionOptions): + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + interpolation: Literal["linear", "lower", + "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + + +class RandomOptions(FunctionOptions): + def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... + + +class RankOptions(FunctionOptions): + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + ) -> None: ... + + +class RankQuantileOptions(FunctionOptions): + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + ) -> None: ... + + +class PivotWiderOptions(FunctionOptions): + def __init__( + self, + key_names: Sequence[str], + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + ) -> None: ... + + +class ReplaceSliceOptions(FunctionOptions): + def __init__(self, start: int, stop: int, replacement: str) -> None: ... 
+ + +class ReplaceSubstringOptions(FunctionOptions): + def __init__( + self, pattern: str, replacement: str, *, max_replacements: int | None = None + ) -> None: ... + + +_RoundMode: TypeAlias = Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", +] + + +class RoundBinaryOptions(FunctionOptions): + def __init__( + self, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + + +class RoundOptions(FunctionOptions): + def __init__( + self, + ndigits: int = 0, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + + +_DateTimeUint: TypeAlias = Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + + +class RoundTemporalOptions(FunctionOptions): + def __init__( + self, + multiple: int = 1, + unit: _DateTimeUint = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + ) -> None: ... + + +class RoundToMultipleOptions(FunctionOptions): + def __init__(self, multiple: int | float | lib.Scalar = 1.0, + round_mode: _RoundMode = "half_to_even") -> None: ... + + +class ScalarAggregateOptions(FunctionOptions): + def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... + + +class SelectKOptions(FunctionOptions): + def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... + + +class SetLookupOptions(FunctionOptions): + def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... + + +class SliceOptions(FunctionOptions): + def __init__( + self, start: int, stop: int | None = None, step: int = 1) -> None: ... + + +class SortOptions(FunctionOptions): + def __init__( + self, + sort_keys: Sequence[tuple[str, _Order]], + *, + null_placement: _Placement = "at_end" + ) -> None: ... + + +class SplitOptions(FunctionOptions): + def __init__(self, *, max_splits: int | None = None, + reverse: bool = False) -> None: ... + + +class SplitPatternOptions(FunctionOptions): + def __init__( + self, pattern: str, *, max_splits: int | None = None, reverse: bool = False + ) -> None: ... + + +class StrftimeOptions(FunctionOptions): + def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C") -> None: ... + + +class StrptimeOptions(FunctionOptions): + def __init__(self, + format: str, + unit: Literal["s", + "ms", + "us", + "ns"], + error_is_null: bool = False) -> None: ... + + +class StructFieldOptions(FunctionOptions): + def __init__(self, indices: list[str] | list[bytes] | + list[int] | Expression | bytes | str | int) -> None: ... + + +class TakeOptions(FunctionOptions): + def __init__(self, boundscheck: bool = True) -> None: ... + + +class TDigestOptions(FunctionOptions): + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + + +class TrimOptions(FunctionOptions): + def __init__(self, characters: str) -> None: ... + + +class Utf8NormalizeOptions(FunctionOptions): + def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... + + +class VarianceOptions(FunctionOptions): + def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, + min_count: int = 0) -> None: ... 
+ + +class SkewOptions(FunctionOptions): + def __init__( + self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 + ) -> None: ... + + +class WeekOptions(FunctionOptions): + def __init__( + self, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + ) -> None: ... + + +class ZeroFillOptions(FunctionOptions): + def __init__(self, width: int, padding: str = "0") -> None: ... + +# ==================== _compute.pyx Functions ==================== + + +def call_function( + name: str, + args: list, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, +) -> Any: ... +def function_registry() -> FunctionRegistry: ... +def get_function(name: str) -> Function: ... +def list_functions() -> list[str]: ... + +# ==================== _compute.pyx Udf ==================== + + +def call_tabular_function( + function_name: str, + args: Iterable | None = None, + func_registry: FunctionRegistry | None = None) -> lib.RecordBatchReader: ... + + +class _FunctionDoc(TypedDict): + summary: str + description: str + + +def register_scalar_function( + func: Callable | None, + function_name: str | None, + function_doc: _FunctionDoc | dict[str, str], + in_types: Mapping[str, lib.DataType] | None, + out_type: lib.DataType | None, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +def register_tabular_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc | dict[str, str], + in_types: Mapping[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +def register_aggregate_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc | dict[str, str], + in_types: Mapping[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +def register_vector_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc | dict[str, str], + in_types: Mapping[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +class UdfContext: + @property + def batch_length(self) -> int: ... + @property + def memory_pool(self) -> lib.MemoryPool: ... + + +def _get_udf_context(memory_pool: lib.MemoryPool, batch_length: int) -> UdfContext: ... + +# ==================== _compute.pyx Expression ==================== + + +class Expression(lib._Weakrefable): + @staticmethod + def from_substrait(buffer: bytes | lib.Buffer) -> Expression: ... + + def to_substrait(self, schema: lib.Schema, + allow_arrow_extensions: bool = False) -> lib.Buffer: ... + + @staticmethod + def _call( + func_name: str, args: list, options: FunctionOptions | None = None + ) -> Expression: ... + + @staticmethod + def _field(name_or_index: str | int) -> Expression: ... + + @staticmethod + def _nested_field(name: str) -> Expression: ... + + @staticmethod + def _scalar(value: Any) -> Expression: ... + + def __invert__(self) -> Expression: ... + + def __and__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __rand__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __or__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __ror__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... 
+ + def __add__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __radd__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __mul__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __rmul__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __sub__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __rsub__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __eq__(self, value: object) -> Expression: ... # type: ignore[override] + def __ne__(self, value: object) -> Expression: ... # type: ignore[override] + def __gt__(self, value: object) -> Expression: ... + def __lt__(self, value: object) -> Expression: ... + def __ge__(self, value: object) -> Expression: ... + def __le__(self, value: object) -> Expression: ... + + def __truediv__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __rtruediv__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def is_valid(self) -> Expression: ... + def is_null(self, nan_is_null: bool = False) -> Expression: ... + def is_nan(self) -> Expression: ... + + def cast( + self, + type: lib.DataType | str, safe: bool = True, options: CastOptions | None = None + ) -> Expression: ... + + def isin(self, values: lib.Array | Iterable | Any) -> Expression: ... + def equals(self, other: object) -> bool: ... + + # Attributes and methods for materialized expressions (used in tests) + @property + def type(self) -> lib.DataType: ... + def to_pylist(self) -> list: ... + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> Any: ... + def to_pandas(self, **kwargs) -> Any: ... + def as_py(self) -> Any: ... + def tolist(self) -> list: ... + def slice(self, offset: int = 0, length: int | None = None) -> Expression: ... + +# ==================== _compute.py ==================== diff --git a/python/pyarrow-stubs/pyarrow/_compute_docstring.pyi b/python/pyarrow-stubs/pyarrow/_compute_docstring.pyi new file mode 100644 index 000000000000..514a4e4269c6 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_compute_docstring.pyi @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +function_doc_additions: dict[str, str] diff --git a/python/pyarrow-stubs/pyarrow/_csv.pyi b/python/pyarrow-stubs/pyarrow/_csv.pyi new file mode 100644 index 000000000000..6c911a8b0c1d --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_csv.pyi @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable, Sequence +from dataclasses import dataclass, field +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from . import lib + + +@dataclass(kw_only=True) +class ReadOptions(lib._Weakrefable): + use_threads: bool = field(default=True, kw_only=False) # noqa: Y015 + block_size: int | float | None = None + skip_rows: int = 0 + skip_rows_after_names: int = 0 + column_names: Sequence[str] | None = None + autogenerate_column_names: bool = False + encoding: str = "utf8" + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class ParseOptions(lib._Weakrefable): + delimiter: str = field(default=",", kw_only=False) # noqa: Y015 + quote_char: str | Literal[False] = '"' + double_quote: bool = True + escape_char: str | Literal[False] = False + newlines_in_values: bool = False + ignore_empty_lines: bool = True + invalid_row_handler: Callable[[InvalidRow], str] | None = None + + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class ConvertOptions(lib._Weakrefable): + check_utf8: bool = field(default=True, kw_only=False) # noqa: Y015 + column_types: lib.Schema | dict | Sequence[tuple[str, lib.DataType]] | None = None + null_values: list[str] | None = None + true_values: list[str] | None = None + false_values: list[str] | None = None + decimal_point: str = "." + strings_can_be_null: bool = False + quoted_strings_can_be_null: bool = True + include_columns: list[str] | None = None + include_missing_columns: bool = False + auto_dict_encode: bool = False + auto_dict_max_cardinality: int | None = None + timestamp_parsers: Sequence[str | lib._Weakrefable] | None = None + + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class WriteOptions(lib._Weakrefable): + include_header: bool = field(default=True, kw_only=False) # noqa: Y015 + batch_size: int = 1024 + delimiter: str = "," + quoting_style: Literal["needed", "all_valid", "none"] = "needed" + quoting_header: Literal["needed", "all_valid", "none"] = "needed" + + def validate(self) -> None: ... + + +@dataclass +class InvalidRow(lib._Weakrefable): + expected_columns: int + actual_columns: int + number: int | None + text: str + + +class CSVWriter(lib._CRecordBatchWriter): + def __init__( + self, + # TODO: OutputStream + sink: StrPath | IO[Any], + schema: lib.Schema, + write_options: WriteOptions | None = None, + *, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class CSVStreamingReader(lib.RecordBatchReader): + ... + + +ISO8601: lib._Weakrefable + + +def open_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> CSVStreamingReader: ... 
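+# Illustrative usage (comment only): open_csv above yields a CSVStreamingReader
+# for incremental reads, whereas read_csv below materializes a full Table, e.g.
+#   table = read_csv("data.csv", read_options=ReadOptions(block_size=1 << 20))
+# "data.csv" is a placeholder path.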
+ + +def read_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Table: ... + + +def write_csv( + data: lib.RecordBatch | lib.Table, + output_file: StrPath | lib.NativeFile | IO[Any], + write_options: WriteOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_cuda.pyi b/python/pyarrow-stubs/pyarrow/_cuda.pyi new file mode 100644 index 000000000000..d484fc5cf5f3 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_cuda.pyi @@ -0,0 +1,158 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any + +import cuda # type: ignore[import-not-found] + +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-untyped, import-not-found] # noqa: E501 + +from . import lib +from ._stubs_typing import ArrayLike + + +class Context(lib._Weakrefable): + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: ... + + @staticmethod + def from_numba(context: _numba_driver.Context | None = None) -> Context: ... + + def to_numba(self) -> _numba_driver.Context: ... + + @staticmethod + def get_num_devices() -> int: ... + + @property + def device_number(self) -> int: ... + + @property + def handle(self) -> int: ... + + def synchronize(self) -> None: ... + + @property + def bytes_allocated(self) -> int: ... + + def get_device_address(self, address: int) -> int: ... + + def new_buffer(self, nbytes: int) -> CudaBuffer: ... + + @property + def memory_manager(self) -> lib.MemoryManager: ... + + @property + def device(self) -> lib.Device: ... + + def foreign_buffer(self, address: int, size: int, base: Any | + None = None) -> CudaBuffer: ... + + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: ... + + def buffer_from_data( + self, + data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, + offset: int = 0, + size: int = -1, + ) -> CudaBuffer: ... + + def buffer_from_object(self, obj: Any) -> CudaBuffer: ... + + +class IpcMemHandle(lib._Weakrefable): + @staticmethod + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: ... + + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: ... + + +class CudaBuffer(lib.Buffer): + @staticmethod + def from_buffer(buf: lib.Buffer) -> CudaBuffer: ... + + @staticmethod + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: ... + + def to_numba(self) -> _numba_driver.MemoryPointer: ... + + def copy_to_host( + self, + position: int = 0, + nbytes: int = -1, + buf: lib.Buffer | None = None, + memory_pool: lib.MemoryPool | None = None, + resizable: bool = False, + ) -> lib.Buffer: ... 
+ + def copy_from_host( + self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + ) -> int: ... + + def copy_from_device(self, buf: CudaBuffer, position: int = 0, + nbytes: int = -1) -> int: ... + + def export_for_ipc(self) -> IpcMemHandle: ... + + @property + def context(self) -> Context: ... + + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: ... + + def to_pybytes(self) -> bytes: ... + + +class HostBuffer(lib.Buffer): + @property + def size(self) -> int: ... + + +class BufferReader(lib.NativeFile): + def __init__(self, obj: CudaBuffer) -> None: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: ... + + +class BufferWriter(lib.NativeFile): + def __init__(self, obj: CudaBuffer) -> None: ... + def writeat(self, position: int, data: ArrayLike) -> None: ... + + @property + def buffer_size(self) -> int: ... + + @buffer_size.setter + def buffer_size(self, buffer_size: int): ... + + @property + def num_bytes_buffered(self) -> int: ... + + +def new_host_buffer(size: int, device: int = 0) -> HostBuffer: ... + + +def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: ... + + +def read_message( + source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None +) -> lib.Message: ... + + +def read_record_batch( + buffer: lib.Buffer, + object: lib.Schema, + *, + dictionary_memo: lib.DictionaryMemo | None = None, + pool: lib.MemoryPool | None = None, +) -> lib.RecordBatch: ... diff --git a/python/pyarrow-stubs/pyarrow/_dataset.pyi b/python/pyarrow-stubs/pyarrow/_dataset.pyi new file mode 100644 index 000000000000..c8cd3d970890 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_dataset.pyi @@ -0,0 +1,682 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Collection, Callable, Iterator, Iterable +from typing import ( + IO, + Any, + Generic, + Literal, + NamedTuple, + TypeVar, +) + +from _typeshed import StrPath + +from . import csv, _json, _parquet, lib +from ._fs import FileSelector, FileSystem, SupportedFileSystem +from ._stubs_typing import Indices, JoinType, Order +from .acero import ExecNodeOptions +from .compute import Expression +from .ipc import IpcWriteOptions, RecordBatchReader + + +class Dataset(lib._Weakrefable): + @property + def partition_expression(self) -> Expression: ... + + def replace_schema(self, schema: lib.Schema) -> Self: ... + + def get_fragments(self, filter: Expression | None = None): ... 
+ + def scanner( + self, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: ... + + def to_table( + self, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def count_rows( + self, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: ... + + @property + def schema(self) -> lib.Schema: ... + + def filter(self, expression: Expression | None) -> Self: ... + + def sort_by(self, sorting: str | + list[tuple[str, Order]], **kwargs) -> InMemoryDataset: ... + + def join( + self, + right_dataset: Dataset, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> InMemoryDataset: ... + + def join_asof( + self, + right_dataset: Dataset, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> InMemoryDataset: ... + + @property + def format(self) -> FileFormat: ... + + +class InMemoryDataset(Dataset): + def __init__( + self, + source: lib.Table + | lib.RecordBatch + | lib.RecordBatchReader + | Iterable[lib.RecordBatch] + | list[Any], + schema: lib.Schema | None = None, + ) -> None: ... + + +class UnionDataset(Dataset): + def __init__( + self, + schema: lib.Schema | None = None, + children: list[Dataset] | None = None, + ) -> None: ... + + @property + def children(self) -> list[Dataset]: ... 
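+# Note (comment only): user code normally obtains a FileSystemDataset from the
+# pyarrow.dataset.dataset(...) factory function rather than by calling the
+# constructor below directly.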
+ + +class FileSystemDataset(Dataset): + def __init__( + self, + fragments: list[Fragment], + schema: lib.Schema, + format: FileFormat, + filesystem: SupportedFileSystem | None = None, + root_partition: Expression | None = None, + ) -> None: ... + + @classmethod + def from_paths( + cls, + paths: list[str], + schema: lib.Schema | None = None, + format: FileFormat | None = None, + filesystem: SupportedFileSystem | None = None, + partitions: list[Expression] | None = None, + root_partition: Expression | None = None, + ) -> FileSystemDataset: ... + + @property + def filesystem(self) -> FileSystem: ... + @property + def partitioning(self) -> Partitioning | None: ... + + @property + def files(self) -> list[str]: ... + + +class FileWriteOptions(lib._Weakrefable): + @property + def format(self) -> FileFormat: ... + + +class FileFormat(lib._Weakrefable): + def inspect( + self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None + ) -> lib.Schema: ... + + def make_fragment( + self, + file: StrPath | IO | lib.Buffer | lib.BufferReader, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + *, + file_size: int | None = None, + ) -> Fragment: ... + + def make_write_options(self) -> FileWriteOptions: ... + @property + def default_extname(self) -> str: ... + @property + def default_fragment_scan_options(self) -> FragmentScanOptions: ... + @default_fragment_scan_options.setter + def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... + + +class Fragment(lib._Weakrefable): + def open(self) -> lib.NativeFile | lib.BufferReader: ... + @property + def path(self) -> str: ... + @property + def row_groups(self) -> list[int]: ... + + @property + def filesystem(self) -> SupportedFileSystem: ... + + @property + def physical_schema(self) -> lib.Schema: ... + + @property + def partition_expression(self) -> Expression: ... + + def scanner( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + def to_batches( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: ... + + def to_table( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... 
+ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: ... + + +class FileFragment(Fragment): + def open(self) -> lib.NativeFile: ... + + @property + def path(self) -> str: ... + + @property + def filesystem(self) -> FileSystem: ... + + @property + def buffer(self) -> lib.Buffer: ... + + @property + def format(self) -> FileFormat: ... + + +class FragmentScanOptions(lib._Weakrefable): + @property + def type_name(self) -> str: ... + + +class IpcFileWriteOptions(FileWriteOptions): + @property + def write_options(self) -> IpcWriteOptions: ... + @write_options.setter + def write_options(self, write_options: IpcWriteOptions) -> None: ... + + +class IpcFileFormat(FileFormat): + def equals(self, other: IpcFileFormat) -> bool: ... + def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... + @property + def default_extname(self) -> str: ... + + +class FeatherFileFormat(IpcFileFormat): + ... + + +class CsvFileFormat(FileFormat): + def __init__( + self, + parse_options: csv.ParseOptions | None = None, + default_fragment_scan_options: CsvFragmentScanOptions | None = None, + convert_options: csv.ConvertOptions | None = None, + read_options: csv.ReadOptions | None = None, + ) -> None: ... + def make_write_options( + self, **kwargs) -> CsvFileWriteOptions: ... # type: ignore[override] + + @property + def parse_options(self) -> csv.ParseOptions: ... + @parse_options.setter + def parse_options(self, parse_options: csv.ParseOptions) -> None: ... + def equals(self, other: CsvFileFormat) -> bool: ... + + +class CsvFragmentScanOptions(FragmentScanOptions): + convert_options: csv.ConvertOptions + read_options: csv.ReadOptions + + def __init__( + self, + convert_options: csv.ConvertOptions | None = None, + read_options: csv.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: CsvFragmentScanOptions) -> bool: ... + + +class CsvFileWriteOptions(FileWriteOptions): + write_options: csv.WriteOptions + + +class JsonFileFormat(FileFormat): + def __init__( + self, + default_fragment_scan_options: JsonFragmentScanOptions | None = None, + parse_options: _json.ParseOptions | None = None, + read_options: _json.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: JsonFileFormat) -> bool: ... + + +class JsonFragmentScanOptions(FragmentScanOptions): + parse_options: _json.ParseOptions + read_options: _json.ReadOptions + + def __init__( + self, + parse_options: _json.ParseOptions | None = None, + read_options: _json.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: JsonFragmentScanOptions) -> bool: ... + + +class Partitioning(lib._Weakrefable): + def parse(self, path: str) -> Expression: ... + + def format(self, expr: Expression) -> tuple[str, str]: ... + + @property + def schema(self) -> lib.Schema: ... + + @property + def dictionaries(self) -> list[Any]: ... 
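+# Note (comment only): PartitioningFactory instances are produced by the
+# discover() staticmethods of the concrete partitionings below (e.g.
+# HivePartitioning.discover()) and are consumed by dataset factories.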
+ + +class PartitioningFactory(lib._Weakrefable): + @property + def type_name(self) -> str: ... + + +class KeyValuePartitioning(Partitioning): + @property + def dictionaries(self) -> list[Any]: ... + + +class DirectoryPartitioning(KeyValuePartitioning): + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + + +class HivePartitioning(KeyValuePartitioning): + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + null_fallback: str = "__HIVE_DEFAULT_PARTITION__", + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + + @staticmethod + def discover( + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + null_fallback="__HIVE_DEFAULT_PARTITION__", + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + + +class FilenamePartitioning(KeyValuePartitioning): + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + + +class DatasetFactory(lib._Weakrefable): + root_partition: Expression + def finish(self, schema: lib.Schema | None = None) -> Dataset: ... + + def inspect( + self, + *, + promote_options: str = "default", + fragments: list[Fragment] | int | str | None = None, + ) -> lib.Schema: ... + + def inspect_schemas(self) -> list[lib.Schema]: ... + + +class FileSystemFactoryOptions(lib._Weakrefable): + partitioning: Partitioning + partitioning_factory: PartitioningFactory + partition_base_dir: str + exclude_invalid_files: bool + selector_ignore_prefixes: list[str] + + def __init__( + self, + partition_base_dir: str | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + exclude_invalid_files: bool | None = True, + selector_ignore_prefixes: list[str] | None = None, + ) -> None: ... + + +class FileSystemDatasetFactory(DatasetFactory): + def __init__( + self, + filesystem: SupportedFileSystem, + paths_or_selector: Collection[str] | FileSelector, + format: FileFormat, + options: FileSystemFactoryOptions | None = None, + ) -> None: ... + + +class UnionDatasetFactory(DatasetFactory): + def __init__(self, factories: list[DatasetFactory]) -> None: ... + + +_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) + + +class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): + def __iter__(self) -> Self: ... + def __next__(self) -> _RecordBatchT: ... + + +class TaggedRecordBatch(NamedTuple): + record_batch: lib.RecordBatch + fragment: Fragment + + +class TaggedRecordBatchIterator(lib._Weakrefable): + def __iter__(self) -> Self: ... + def __next__(self) -> TaggedRecordBatch: ... 
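+# Illustrative usage (comment only): Scanner objects are built with the static
+# factories below rather than by calling Scanner() directly, e.g.
+#   scanner = Scanner.from_dataset(dataset, columns=["a", "b"])
+#   table = scanner.to_table()
+# where "dataset" stands in for any pyarrow.dataset.Dataset instance.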
+ + +class Scanner(lib._Weakrefable): + @staticmethod + def from_dataset( + dataset: Dataset, + *, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + @staticmethod + def from_fragment( + fragment: Fragment, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + @staticmethod + def from_batches( + source: Iterator[lib.RecordBatch] | RecordBatchReader | Any, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + @property + def dataset_schema(self) -> lib.Schema: ... + + @property + def projected_schema(self) -> lib.Schema: ... + + def to_batches(self) -> Iterator[lib.RecordBatch]: ... + + def scan_batches(self) -> TaggedRecordBatchIterator: ... + + def to_table(self) -> lib.Table: ... + + def take(self, indices: Indices) -> lib.Table: ... + + def head(self, num_rows: int) -> lib.Table: ... + + def count_rows(self) -> int: ... + + def to_reader(self) -> RecordBatchReader: ... + + +def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ... + + +class WrittenFile(lib._Weakrefable): + def __init__(self, path: str, metadata: _parquet.FileMetaData | + None, size: int) -> None: ... + + +def _filesystemdataset_write( + data: Scanner, + base_dir: StrPath, + basename_template: str, + filesystem: SupportedFileSystem, + partitioning: Partitioning, + preserve_order: bool, + file_options: FileWriteOptions, + max_partitions: int, + file_visitor: Callable[[str], None] | None, + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"], + max_open_files: int, + max_rows_per_file: int, + min_rows_per_group: int, + max_rows_per_group: int, + create_dir: bool, +): ... + + +class _ScanNodeOptions(ExecNodeOptions): + def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... + + +class ScanNodeOptions(_ScanNodeOptions): + def __init__( + self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs + ) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_dataset_orc.pyi b/python/pyarrow-stubs/pyarrow/_dataset_orc.pyi new file mode 100644 index 000000000000..62f49bf5d301 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_dataset_orc.pyi @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from ._dataset import FileFormat + + +class OrcFileFormat(FileFormat): + def equals(self, other: OrcFileFormat) -> bool: ... + @property + def default_extname(self): ... diff --git a/python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi b/python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi new file mode 100644 index 000000000000..6c27e3c8a93e --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable +from dataclasses import dataclass +from typing import IO, Any, TypedDict + +from _typeshed import StrPath + +from ._compute import Expression +from ._dataset import ( + DatasetFactory, + FileFormat, + FileFragment, + FileWriteOptions, + Fragment, + FragmentScanOptions, + Partitioning, + PartitioningFactory, +) +from ._dataset_parquet_encryption import ParquetDecryptionConfig +from ._fs import SupportedFileSystem +from ._parquet import FileDecryptionProperties, FileMetaData +from ._types import DataType, LargeListType, ListType +from .lib import CacheOptions, Schema, _Weakrefable, NativeFile, Buffer, BufferReader + +parquet_encryption_enabled: bool + + +class ParquetFileFormat(FileFormat): + def __init__( + self, + read_options: ParquetReadOptions | None = None, + default_fragment_scan_options: ParquetFragmentScanOptions | None = None, + *, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + arrow_extensions_enabled: bool = True, + binary_type: DataType | None = None, + list_type: type[ListType | LargeListType] | None = None, + use_buffered_stream: bool = False, + buffer_size: int = 8192, + dictionary_columns: list[str] | set[str] | None = None, + decryption_properties: FileDecryptionProperties | None = None, + ) -> None: ... + @property + def read_options(self) -> ParquetReadOptions: ... + def make_write_options( + self, **kwargs) -> ParquetFileWriteOptions: ... # type: ignore[override] + + def equals(self, other: ParquetFileFormat) -> bool: ... + @property + def default_extname(self) -> str: ... 
+ + def make_fragment( + self, + file: StrPath | IO | Buffer | BufferReader, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + row_groups: Iterable[int] | None = None, + *, + file_size: int | None = None, + ) -> Fragment: ... + + +class _NameStats(TypedDict): + min: Any + max: Any + + +class RowGroupInfo: + id: int + metadata: FileMetaData + schema: Schema + + def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def statistics(self) -> dict[str, _NameStats]: ... + + +class ParquetFileFragment(FileFragment): + def ensure_complete_metadata(self) -> None: ... + @property + def path(self) -> str: ... + @property + def filesystem(self) -> SupportedFileSystem: ... + def open(self) -> NativeFile: ... + + @property + def row_groups(self) -> list[int]: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def num_row_groups(self) -> int: ... + + def split_by_row_group( + self, filter: Expression | None = None, schema: Schema | None = None + ) -> list[Fragment]: ... + + def subset( + self, + filter: Expression | None = None, + schema: Schema | None = None, + row_group_ids: list[int] | None = None, + ) -> ParquetFileFormat: ... + + +class ParquetReadOptions(_Weakrefable): + def __init__( + self, + dictionary_columns: list[str] | set[str] | None = None, + coerce_int96_timestamp_unit: str | None = None, + binary_type: DataType | None = None, + list_type: type[ListType | LargeListType] | None = None, + ) -> None: ... + + @property + def dictionary_columns(self) -> set[str]: ... + @dictionary_columns.setter + def dictionary_columns(self, columns: list[str] | set[str]) -> None: ... + + @property + def coerce_int96_timestamp_unit(self) -> str: ... + @coerce_int96_timestamp_unit.setter + def coerce_int96_timestamp_unit(self, unit: str) -> None: ... + + @property + def binary_type(self) -> DataType: ... + @binary_type.setter + def binary_type(self, type: DataType | None) -> None: ... + + @property + def list_type(self) -> type[ListType | LargeListType]: ... + @list_type.setter + def list_type(self, type: type[ListType | LargeListType] | None) -> None: ... + + def equals(self, other: ParquetReadOptions) -> bool: ... + + +class ParquetFileWriteOptions(FileWriteOptions): + def update(self, **kwargs) -> None: ... + def _set_properties(self) -> None: ... + def _set_arrow_properties(self) -> None: ... + def _set_encryption_config(self) -> None: ... + # accept passthrough options used in tests + def __init__(self, **kwargs) -> None: ... + + +@dataclass(kw_only=True) +class ParquetFragmentScanOptions(FragmentScanOptions): + use_buffered_stream: bool = False + buffer_size: int = 8192 + pre_buffer: bool = True + cache_options: CacheOptions | None = None + thrift_string_size_limit: int | None = None + thrift_container_size_limit: int | None = None + decryption_config: ParquetDecryptionConfig | None = None + decryption_properties: FileDecryptionProperties | None = None + page_checksum_verification: bool = False + + def equals(self, other: ParquetFragmentScanOptions) -> bool: ... 
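+# Note (comment only): ParquetFactoryOptions and ParquetDatasetFactory below
+# back pyarrow.dataset.parquet_dataset(), which builds a dataset from a Parquet
+# "_metadata" summary file.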
+ + +@dataclass +class ParquetFactoryOptions(_Weakrefable): + + partition_base_dir: str | None = None + partitioning: Partitioning | PartitioningFactory | None = None + validate_column_chunk_paths: bool = False + + +class ParquetDatasetFactory(DatasetFactory): + def __init__( + self, + metadata_path: str, + filesystem: SupportedFileSystem, + format: FileFormat, + options: ParquetFactoryOptions | None = None, + ) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi b/python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi new file mode 100644 index 000000000000..b36f18522e5e --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions +from ._parquet import FileDecryptionProperties +from ._parquet_encryption import (CryptoFactory, EncryptionConfiguration, + DecryptionConfiguration, KmsConnectionConfig) +from .lib import _Weakrefable + + +class ParquetEncryptionConfig(_Weakrefable): + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + + +class ParquetDecryptionConfig(_Weakrefable): + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration, + ) -> None: ... + + +def set_encryption_config( + opts: ParquetFileWriteOptions, + config: ParquetEncryptionConfig, +) -> None: ... + + +def set_decryption_properties( + opts: ParquetFragmentScanOptions, + config: FileDecryptionProperties, +): ... + + +def set_decryption_config( + opts: ParquetFragmentScanOptions, + config: ParquetDecryptionConfig, +): ... diff --git a/python/pyarrow-stubs/pyarrow/_feather.pyi b/python/pyarrow-stubs/pyarrow/_feather.pyi new file mode 100644 index 000000000000..2f4757cd5f1a --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_feather.pyi @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Literal +from collections.abc import Sequence + +from _typeshed import StrPath + +from .lib import Buffer, NativeFile, Table, _Weakrefable + + +class FeatherError(Exception): + ... + + +def write_feather( + table: Table, + dest: StrPath | IO | NativeFile, + compression: str | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +): ... + + +class FeatherReader(_Weakrefable): + def __init__( + self, + source: StrPath | IO | NativeFile | Buffer, + use_memory_map: bool, + use_threads: bool, + ) -> None: ... + @property + def version(self) -> str: ... + def read(self) -> Table: ... + def read_indices(self, indices: Sequence[int]) -> Table: ... + def read_names(self, names: Sequence[str]) -> Table: ... diff --git a/python/pyarrow-stubs/pyarrow/_flight.pyi b/python/pyarrow-stubs/pyarrow/_flight.pyi new file mode 100644 index 000000000000..03d6c6580ab0 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_flight.pyi @@ -0,0 +1,660 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import asyncio +import enum +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Generator, Iterable, Iterator, Sequence +from typing import Any, Generic, NamedTuple, TypeVar +from datetime import datetime +from typing_extensions import deprecated + +from .ipc import _ReadPandasMixin, ReadStats +from .lib import ( + ArrowCancelled, + ArrowException, + ArrowInvalid, + Buffer, + IpcReadOptions, + IpcWriteOptions, + RecordBatch, + RecordBatchReader, + Scalar, + Schema, + Table, + _CRecordBatchWriter, + _Weakrefable, +) + +_T = TypeVar("_T") + + +class FlightCallOptions(_Weakrefable): + def __init__( + self, + timeout: float | None = None, + write_options: IpcWriteOptions | None = None, + headers: list[tuple[str | bytes, str | bytes]] | None = None, + read_options: IpcReadOptions | None = None, + ) -> None: ... + + +class CertKeyPair(NamedTuple): + cert: str | bytes | None + key: str | bytes | None + + +class FlightError(Exception): + extra_info: bytes + + +class FlightInternalError(FlightError, ArrowException): + ... + + +class FlightTimedOutError(FlightError, ArrowException): + ... + + +class FlightCancelledError(FlightError, ArrowCancelled): + def __init__(self, message: str, *, extra_info: bytes | None = None) -> None: ... + + +class FlightServerError(FlightError, ArrowException): + ... + + +class FlightUnauthenticatedError(FlightError, ArrowException): + ... + + +class FlightUnauthorizedError(FlightError, ArrowException): + ... + + +class FlightUnavailableError(FlightError, ArrowException): + ... 
+ + +class FlightWriteSizeExceededError(ArrowInvalid): + limit: int + actual: int + + +class Action(_Weakrefable): + def __init__( + self, action_type: bytes | str, buf: Buffer | bytes | None) -> None: ... + + @property + def type(self) -> str: ... + + @property + def body(self) -> Buffer: ... + + def serialize(self) -> bytes: ... + + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class ActionType(NamedTuple): + type: str + description: str + + def make_action(self, buf: Buffer | bytes) -> Action: ... + + +class Result(_Weakrefable): + def __init__(self, buf: Buffer | bytes) -> None: ... + + @property + def body(self) -> Buffer: ... + + def serialize(self) -> bytes: ... + + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class BasicAuth(_Weakrefable): + def __init__( + self, username: str | bytes | None = None, password: str | bytes | None = None + ) -> None: ... + + @property + def username(self) -> bytes: ... + @property + def password(self) -> bytes: ... + def serialize(self) -> str: ... + @staticmethod + def deserialize(serialized: str | bytes) -> BasicAuth: ... + + +class DescriptorType(enum.Enum): + UNKNOWN = 0 + PATH = 1 + CMD = 2 + + +class FlightMethod(enum.Enum): + INVALID = 0 + HANDSHAKE = 1 + LIST_FLIGHTS = 2 + GET_FLIGHT_INFO = 3 + GET_SCHEMA = 4 + DO_GET = 5 + DO_PUT = 6 + DO_ACTION = 7 + LIST_ACTIONS = 8 + DO_EXCHANGE = 9 + + +class FlightDescriptor(_Weakrefable): + @staticmethod + def for_path(*path: str | bytes) -> FlightDescriptor: ... + + @staticmethod + def for_command(command: str | bytes) -> FlightDescriptor: ... + + @property + def descriptor_type(self) -> DescriptorType: ... + + @property + def path(self) -> list[bytes] | None: ... + + @property + def command(self) -> bytes | None: ... + + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class Ticket(_Weakrefable): + def __init__(self, ticket: str | bytes) -> None: ... + @property + def ticket(self) -> bytes: ... + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class Location(_Weakrefable): + def __init__(self, uri: str | bytes) -> None: ... + @property + def uri(self) -> bytes: ... + def equals(self, other: Location) -> bool: ... + @staticmethod + def for_grpc_tcp(host: str | bytes, port: int) -> Location: ... + + @staticmethod + def for_grpc_tls(host: str | bytes, port: int) -> Location: ... + + @staticmethod + def for_grpc_unix(path: str | bytes) -> Location: ... + + +class FlightEndpoint(_Weakrefable): + def __init__( + self, + ticket: Ticket | str | bytes | object, + locations: list[str | bytes | Location | object], + expiration_time: Scalar[Any] | str | datetime | None = ..., + app_metadata: bytes | str | object = ..., + ): ... + + @property + def ticket(self) -> Ticket: ... + + @property + def locations(self) -> list[Location]: ... + + def serialize(self) -> bytes: ... + @property + def expiration_time(self) -> Scalar[Any] | None: ... + + @property + def app_metadata(self) -> bytes | str: ... + + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class SchemaResult(_Weakrefable): + def __init__(self, schema: Schema) -> None: ... + + @property + def schema(self) -> Schema: ... + + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... 
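+# Note (comment only): a FlightInfo pairs a FlightDescriptor with one or more
+# FlightEndpoints; each endpoint's Ticket is redeemed via FlightClient.do_get()
+# to obtain the actual record batch stream.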
+ + +class FlightInfo(_Weakrefable): + def __init__( + self, + schema: Schema | None, + descriptor: FlightDescriptor, + endpoints: list[FlightEndpoint], + total_records: int | None = ..., + total_bytes: int | None = ..., + ordered: bool = ..., + app_metadata: bytes | str = ..., + ) -> None: ... + + @property + def schema(self) -> Schema | None: ... + + @property + def descriptor(self) -> FlightDescriptor: ... + + @property + def endpoints(self) -> list[FlightEndpoint]: ... + + @property + def total_records(self) -> int: ... + + @property + def total_bytes(self) -> int: ... + + @property + def ordered(self) -> bool: ... + + @property + def app_metadata(self) -> bytes | str: ... + + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class FlightStreamChunk(_Weakrefable): + @property + def data(self) -> RecordBatch | None: ... + @property + def app_metadata(self) -> Buffer | None: ... + def __iter__(self): ... + + +class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + # Needs to be separate class so the "real" class can subclass the + # pure-Python mixin class + + def __iter__(self) -> Self: ... + def __next__(self) -> FlightStreamChunk: ... + @property + def schema(self) -> Schema: ... + + def read_all(self) -> Table: ... + + def read_chunk(self) -> FlightStreamChunk: ... + + def to_reader(self) -> RecordBatchReader: ... + + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): + @property + def stats(self) -> ReadStats: ... + + +class FlightStreamReader(MetadataRecordBatchReader): + @property + def stats(self) -> ReadStats: ... + + def cancel(self) -> None: ... + + def read_all(self) -> Table: ... + + def read(self) -> RecordBatch | None: ... + + +class MetadataRecordBatchWriter(_CRecordBatchWriter): + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: ... + + def write_metadata(self, buf: Buffer | bytes) -> None: ... + + def write_batch(self, batch: RecordBatch) -> None: ... # type: ignore[override] + + def write_table(self, table: Table, max_chunksize: int | + None = None, **kwargs) -> None: ... + + def close(self) -> None: ... + + def write_with_metadata(self, batch: RecordBatch, buf: Buffer | bytes) -> None: ... + + +class FlightStreamWriter(MetadataRecordBatchWriter): + def done_writing(self) -> None: ... + + +class FlightMetadataReader(_Weakrefable): + def read(self) -> Buffer | None: ... + + +class FlightMetadataWriter(_Weakrefable): + def write(self, message: Buffer) -> None: ... + + +class AsyncioCall(Generic[_T]): + _future: asyncio.Future[_T] + + def as_awaitable(self) -> asyncio.Future[_T]: ... + def wakeup(self, result_or_exception: BaseException | _T) -> None: ... + + +class AsyncioFlightClient: + def __init__(self, client: FlightClient) -> None: ... + + async def get_flight_info( + self, + descriptor: FlightDescriptor, + *, + options: FlightCallOptions | None = None, + ): ... + + +class FlightClient(_Weakrefable): + def __init__( + self, + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, + ): ... + + @property + def supports_async(self) -> bool: ... + def as_async(self) -> AsyncioFlightClient: ... 
+ def wait_for_available(self, timeout: int = 5) -> None: ... + + @classmethod + @deprecated( + "Use the ``FlightClient`` constructor or " + "``pyarrow.flight.connect`` function instead." + ) + def connect( + cls, + location: str | tuple[str, int] | Location, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + disable_server_verification: bool = False, + ) -> FlightClient: ... + + def authenticate( + self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None + ) -> None: ... + + def authenticate_basic_token( + self, username: str | bytes, password: str | bytes, + options: FlightCallOptions | None = None + ) -> tuple[str, str]: ... + + def list_actions(self, options: FlightCallOptions | + None = None) -> list[Action]: ... + + def do_action( + self, action: Action | tuple[bytes | str, bytes | str] | str, + options: FlightCallOptions | None = None + ) -> Iterator[Result]: ... + + def list_flights( + self, criteria: str | bytes | None = None, + options: FlightCallOptions | None = None + ) -> Generator[FlightInfo, None, None]: ... + + def get_flight_info( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> FlightInfo: ... + + def get_schema( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> SchemaResult: ... + + def do_get( + self, ticket: Ticket, options: FlightCallOptions | None = None + ) -> FlightStreamReader: ... + + def do_put( + self, + descriptor: FlightDescriptor, + schema: Schema | None, + options: FlightCallOptions | None = None, + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + + def do_exchange( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + + def close(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback) -> None: ... + + +class FlightDataStream(_Weakrefable): + ... + + +class RecordBatchStream(FlightDataStream): + def __init__(self, data_source: RecordBatchReader | Table | None = None, + options: IpcWriteOptions | None = None) -> None: ... + + +class GeneratorStream(FlightDataStream): + def __init__( + self, + schema: Schema, + generator: Iterable[ + FlightDataStream + | Table + | RecordBatch + | RecordBatchReader + | tuple[RecordBatch, bytes] + ], + options: IpcWriteOptions | None = None, + ) -> None: ... + + +class ServerCallContext(_Weakrefable): + def peer_identity(self) -> bytes: ... + + def peer(self) -> str: ... + + # Set safe=True as gRPC on Windows sometimes gives garbage bytes + def is_cancelled(self) -> bool: ... + + def add_header(self, key: str, value: str) -> None: ... + + def add_trailer(self, key: str, value: str) -> None: ... + + def get_middleware(self, key: str) -> ServerMiddleware | None: ... + + +class ServerAuthReader(_Weakrefable): + def read(self) -> str: ... + + +class ServerAuthSender(_Weakrefable): + def write(self, message: str) -> None: ... + + +class ClientAuthReader(_Weakrefable): + def read(self) -> str: ... + + +class ClientAuthSender(_Weakrefable): + def write(self, message: str) -> None: ... + + +class ServerAuthHandler(_Weakrefable): + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): ... + + def is_valid(self, token: str) -> bool: ... + + +class ClientAuthHandler(_Weakrefable): + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): ... 
+ + def get_token(self) -> str: ... + + +class CallInfo(NamedTuple): + method: FlightMethod + + +class ClientMiddlewareFactory(_Weakrefable): + def start_call(self, info: CallInfo) -> ClientMiddleware | None: ... + + +class ClientMiddleware(_Weakrefable): + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... + + def received_headers(self, headers: dict[str, list[str] | list[bytes]]): ... + + def call_completed(self, exception: ArrowException): ... + + +class ServerMiddlewareFactory(_Weakrefable): + def start_call( + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> ServerMiddleware | None: ... + + +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): + ... + + +class ServerMiddleware(_Weakrefable): + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... + + def call_completed(self, exception: ArrowException): ... + + @property + def trace_context(self) -> dict: ... + + +class TracingServerMiddleware(ServerMiddleware): + trace_context: dict + def __init__(self, trace_context: dict) -> None: ... + + +class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): + def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... + + def start_call( # type: ignore[override] + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> _ServerMiddlewareFactoryWrapper | None: ... + + +class _ServerMiddlewareWrapper(ServerMiddleware): + def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ... + def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ... + def call_completed(self, exception: ArrowException) -> None: ... + + +class _FlightServerFinalizer(_Weakrefable): + + def finalize(self) -> None: ... + + +class FlightServerBase(_Weakrefable): + def __init__( + self, + location: str | tuple[str, int] | Location | None = None, + auth_handler: ServerAuthHandler | None = None, + tls_certificates: list[tuple[str, str]] | None = None, + verify_client: bool = False, + root_certificates: str | None = None, + middleware: dict[str, ServerMiddlewareFactory] | None = None, + ): ... + + @property + def port(self) -> int: ... + + def list_flights(self, context: ServerCallContext, + criteria: str) -> Iterator[FlightInfo]: ... + + def get_flight_info( + self, context: ServerCallContext, descriptor: FlightDescriptor + ) -> FlightInfo: ... + + def get_schema(self, context: ServerCallContext, + descriptor: FlightDescriptor) -> Schema: ... + + def do_put( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: FlightMetadataWriter, + ) -> None: ... + + def do_get(self, context: ServerCallContext, + ticket: Ticket) -> FlightDataStream: ... + + def do_exchange( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: MetadataRecordBatchWriter, + ) -> None: ... + + def list_actions(self, context: ServerCallContext) -> Iterable[Action]: ... + + def do_action(self, context: ServerCallContext, + action: Action) -> Iterable[bytes]: ... + + def serve(self) -> None: ... + + def run(self) -> None: ... + + def shutdown(self) -> None: ... + + def wait(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__( + self, exc_type: object, exc_value: object, traceback: object) -> None: ... 
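+# Illustrative usage (comment only): connect() below appears to mirror the
+# FlightClient constructor's keyword arguments and returns a FlightClient, e.g.
+#   client = connect("grpc://localhost:8815")
+#   info = client.get_flight_info(FlightDescriptor.for_path("example"))
+# the URI and path are placeholders.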
+ + +def connect( + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: Sequence[tuple[str, int | str]] | None = None, +) -> FlightClient: ... diff --git a/python/pyarrow-stubs/pyarrow/_fs.pyi b/python/pyarrow-stubs/pyarrow/_fs.pyi new file mode 100644 index 000000000000..caf23a75d99b --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_fs.pyi @@ -0,0 +1,234 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt +import enum +import sys + +from abc import ABC, abstractmethod +from _typeshed import StrPath + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from fsspec import AbstractFileSystem # type: ignore + +from .lib import NativeFile, _Weakrefable + + +class FileType(enum.IntFlag): + NotFound = enum.auto() + Unknown = enum.auto() + File = enum.auto() + Directory = enum.auto() + + +class FileInfo(_Weakrefable): + def __init__( + self, + path: str, + type: FileType = FileType.Unknown, + *, + mtime: dt.datetime | float | None = None, + mtime_ns: int | None = None, + size: int | None = None, + ): ... + + def __getitem__(self, int) -> FileInfo: ... + + @property + def type(self) -> FileType: ... + + @property + def is_file(self) -> bool: ... + @property + def path(self) -> str: ... + + @property + def base_name(self) -> str: ... + + @property + def size(self) -> int: ... + + @property + def extension(self) -> str: ... + + @property + def mtime(self) -> dt.datetime | None: ... + + @property + def mtime_ns(self) -> int | None: ... + + +class FileSelector(_Weakrefable): + base_dir: str + allow_not_found: bool + recursive: bool + def __init__(self, base_dir: str, allow_not_found: bool = False, + recursive: bool = False): ... + + +class FileSystem(_Weakrefable): + @classmethod + def from_uri(cls, uri: str | StrPath) -> tuple[Self, str]: ... + + def equals(self, other: FileSystem | object) -> bool: ... + + @property + def type_name(self) -> str: ... + + def get_file_info( + self, paths_or_selector: str | list[str] | FileSelector + ) -> list[FileInfo] | FileInfo: ... + + def create_dir(self, path: str, *, recursive: bool = True) -> None: ... + + def delete_dir(self, path: str) -> None: ... + + def delete_dir_contents( + self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False + ) -> None: ... + + def move(self, src: str, dest: str) -> None: ... 
+ + def copy_file(self, src: str, dest: str) -> None: ... + + def delete_file(self, path: str) -> None: ... + + def open_input_file(self, path: str) -> NativeFile: ... + + def open_input_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None) -> NativeFile: ... + + def open_output_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ) -> NativeFile: ... + + def open_append_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ): ... + + def normalize_path(self, path: str) -> str: ... + + +class LocalFileSystem(FileSystem): + def __init__(self, *, use_mmap: bool = False) -> None: ... + + +class SubTreeFileSystem(FileSystem): + def __init__(self, base_path: str, base_fs: FileSystem): ... + @property + def base_path(self) -> str: ... + @property + def base_fs(self) -> FileSystem: ... + + +class _MockFileSystem(FileSystem): + def __init__(self, current_time: dt.datetime | None = None) -> None: ... + + +class PyFileSystem(FileSystem): + def __init__(self, handler: FileSystemHandler | None) -> None: ... + @property + def handler(self) -> FileSystemHandler: ... + + +class FileSystemHandler(ABC): + @abstractmethod + def get_type_name(self) -> str: ... + + @abstractmethod + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: ... + + @abstractmethod + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... + + @abstractmethod + def create_dir(self, path: str, recursive: bool) -> None: ... + + @abstractmethod + def delete_dir(self, path: str) -> None: ... + + @abstractmethod + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ... + + @abstractmethod + def delete_root_dir_contents(self) -> None: ... + + @abstractmethod + def delete_file(self, path: str) -> None: ... + + @abstractmethod + def move(self, src: str, dest: str) -> None: ... + + @abstractmethod + def copy_file(self, src: str, dest: str) -> None: ... + + @abstractmethod + def open_input_stream(self, path: str) -> NativeFile: ... + + @abstractmethod + def open_input_file(self, path: str) -> NativeFile: ... + + @abstractmethod + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... + + @abstractmethod + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... + + @abstractmethod + def normalize_path(self, path: str) -> str: ... + + +SupportedFileSystem: TypeAlias = AbstractFileSystem | FileSystem + + +def _copy_files( + source_fs: FileSystem, + source_path: str, + destination_fs: SupportedFileSystem | None, + destination_path: str, + chunk_size: int = 1048576, + use_threads: bool = True, +) -> None: ... + + +def _copy_files_selector( + source_fs: FileSystem, + source_sel: FileSelector, + destination_fs: SupportedFileSystem | None, + destination_base_dir: str, + chunk_size: int = 1048576, + use_threads: bool = True, +) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_gcsfs.pyi b/python/pyarrow-stubs/pyarrow/_gcsfs.pyi new file mode 100644 index 000000000000..a0af3fa38710 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_gcsfs.pyi @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt + +from ._fs import FileSystem +from .lib import KeyValueMetadata + + +class GcsFileSystem(FileSystem): + def __init__( + self, + *, + anonymous: bool = False, + access_token: str | None = None, + target_service_account: str | None = None, + credential_token_expiration: dt.datetime | None = None, + default_bucket_location: str = "US", + scheme: str = "https", + endpoint_override: str | None = None, + default_metadata: dict | KeyValueMetadata | None = None, + retry_time_limit: dt.timedelta | None = None, + project_id: str | None = None, + ): ... + @property + def default_bucket_location(self) -> str: ... + + @property + def project_id(self) -> str: ... diff --git a/python/pyarrow-stubs/pyarrow/_hdfs.pyi b/python/pyarrow-stubs/pyarrow/_hdfs.pyi new file mode 100644 index 000000000000..370eaf709274 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_hdfs.pyi @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from _typeshed import StrPath + +from ._fs import FileSystem + + +class HadoopFileSystem(FileSystem): + def __init__( + self, + host: str | None = None, + port: int = 8020, + *, + user: str | None = None, + replication: int = 3, + buffer_size: int = 0, + default_block_size: int | None = None, + kerb_ticket: StrPath | None = None, + extra_conf: dict | None = None, + ): ... + @staticmethod + def from_uri(uri: str | int) -> HadoopFileSystem: ... # type: ignore[override] diff --git a/python/pyarrow-stubs/pyarrow/_ipc.pyi b/python/pyarrow-stubs/pyarrow/_ipc.pyi new file mode 100644 index 000000000000..5a87f2439046 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_ipc.pyi @@ -0,0 +1,317 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum +import sys + +from io import IOBase + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from collections.abc import Iterable, Iterator, Mapping +from typing import Any, Literal, NamedTuple + +import pandas as pd + +from pyarrow._stubs_typing import SupportPyBuffer +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable + +from .io import Buffer, Codec, NativeFile, BufferReader +from ._types import DictionaryMemo, KeyValueMetadata + + +class MetadataVersion(enum.IntEnum): + V1 = enum.auto() + V2 = enum.auto() + V3 = enum.auto() + V4 = enum.auto() + V5 = enum.auto() + + +class Alignment(enum.IntEnum): + Any = enum.auto() + At64Byte = enum.auto() + DataTypeSpecific = enum.auto() + + +class WriteStats(NamedTuple): + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + + +class ReadStats(NamedTuple): + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + + +class IpcReadOptions(_Weakrefable): + ensure_native_endian: bool + use_threads: bool + ensure_alignment: Alignment + included_fields: list[int] | None + + def __init__( + self, + *, + ensure_native_endian: bool = True, + use_threads: bool = True, + ensure_alignment: Alignment = ..., + included_fields: list[int] | None = None, + ) -> None: ... + + +class IpcWriteOptions(_Weakrefable): + metadata_version: Any + allow_64bit: bool + use_legacy_format: bool + compression: Any + use_threads: bool + emit_dictionary_deltas: bool + unify_dictionaries: bool + + def __init__( + self, + *, + metadata_version: MetadataVersion = MetadataVersion.V5, + allow_64bit: bool = False, + use_legacy_format: bool = False, + compression: Codec | Literal["lz4", "zstd"] | None = None, + use_threads: bool = True, + emit_dictionary_deltas: bool = False, + unify_dictionaries: bool = False, + ) -> None: ... + + +class Message(_Weakrefable): + @property + def type(self) -> str: ... + @property + def metadata(self) -> Buffer: ... + @property + def metadata_version(self) -> MetadataVersion: ... + @property + def body(self) -> Buffer | None: ... + def equals(self, other: Message) -> bool: ... + + def serialize_to(self, sink: NativeFile, alignment: int = 8, + memory_pool: MemoryPool | None = None): ... + + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | + None = None) -> Buffer: ... + + +class MessageReader(_Weakrefable): + @classmethod + def open_stream(cls, source: bytes | NativeFile | + IOBase | SupportPyBuffer) -> Self: ... + + def __iter__(self) -> Self: ... + def read_next_message(self) -> Message: ... + + __next__ = read_next_message + +# ---------------------------------------------------------------------- +# File and stream readers and writers + + +class _CRecordBatchWriter(_Weakrefable): + def write(self, table_or_batch: Table | RecordBatch): ... 
+ + def write_batch( + self, + batch: RecordBatch, + custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, + ): ... + + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: ... + + def close(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def stats(self) -> WriteStats: ... + + +class _RecordBatchStreamWriter(_CRecordBatchWriter): + @property + def _use_legacy_format(self) -> bool: ... + @property + def _metadata_version(self) -> MetadataVersion: ... + + def _open( + self, + sink, + schema: Schema, + options: IpcWriteOptions = IpcWriteOptions(), # noqa: Y011 + metadata: dict[bytes, bytes] | None = None, + ): ... + + +class _ReadPandasMixin: + def read_pandas(self, **options) -> pd.DataFrame: ... + + +class RecordBatchReader(_ReadPandasMixin, _Weakrefable): + def __iter__(self) -> Self: ... + def read_next_batch(self) -> RecordBatch: ... + + __next__ = read_next_batch + @property + def schema(self) -> Schema: ... + + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: ... + + def iter_batches_with_custom_metadata( + self, + ) -> Iterator[RecordBatchWithMetadata]: ... + + def read_all(self) -> Table: ... + + def close(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + def cast(self, target_schema: Schema) -> Self: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_stream__(self, requested_schema=None): ... + + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: ... + + @classmethod + def from_stream(cls, data: Any, + schema: Any = None) -> Self: ... + + @classmethod + def from_batches(cls, schema: Any, batches: Iterable[RecordBatch]) -> Self: ... + + +class _RecordBatchStreamReader(RecordBatchReader): + @property + def stats(self) -> ReadStats: ... + + def _open( + self, + source, + options: IpcReadOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + + +class _RecordBatchFileWriter(_RecordBatchStreamWriter): + ... + + +class RecordBatchWithMetadata(NamedTuple): + batch: RecordBatch + custom_metadata: KeyValueMetadata + + +class _RecordBatchFileReader(_ReadPandasMixin, _Weakrefable): + @property + def num_record_batches(self) -> int: ... + + def get_batch(self, i: int) -> RecordBatch: ... + + get_record_batch = get_batch + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: ... + + def read_all(self) -> Table: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def schema(self) -> Schema: ... + @property + def stats(self) -> ReadStats: ... + @property + def metadata(self) -> KeyValueMetadata | None: ... + + def _open( + self, + source, + footer_offset: int | None = None, + options: IpcReadOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + + +def get_tensor_size(tensor: Tensor) -> int: ... + + +def get_record_batch_size(batch: RecordBatch) -> int: ... + + +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: ... + + +def read_tensor(source: NativeFile) -> Tensor: ... + + +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: ... + + +def read_schema(obj: Buffer | Message | BufferReader, dictionary_memo: DictionaryMemo | + None = None) -> Schema: ... 
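These reader declarations back the public pyarrow and pyarrow.ipc re-exports; a short sketch of the typed RecordBatchReader surface, with illustrative data:

    import pyarrow as pa


    def total_rows() -> int:
        batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})
        # from_batches() and iteration are declared above, so every `b`
        # is a RecordBatch and num_rows is an int for the type checker.
        reader = pa.RecordBatchReader.from_batches(batch.schema, [batch, batch])
        return sum(b.num_rows for b in reader)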
+ + +def read_record_batch( + obj: Message | SupportPyBuffer, + schema: Schema, + dictionary_memo: DictionaryMemo | None = None) -> RecordBatch: ... + + +__all__ = [ + "MetadataVersion", + "Alignment", + "WriteStats", + "ReadStats", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "_CRecordBatchWriter", + "_RecordBatchStreamWriter", + "_ReadPandasMixin", + "RecordBatchReader", + "_RecordBatchStreamReader", + "_RecordBatchFileWriter", + "RecordBatchWithMetadata", + "_RecordBatchFileReader", + "get_tensor_size", + "get_record_batch_size", + "write_tensor", + "read_tensor", + "read_message", + "read_schema", + "read_record_batch", +] diff --git a/python/pyarrow-stubs/pyarrow/_json.pyi b/python/pyarrow-stubs/pyarrow/_json.pyi new file mode 100644 index 000000000000..bae2ff404f09 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_json.pyi @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable + + +class ReadOptions(_Weakrefable): + use_threads: bool + block_size: int + + def __init__(self, use_threads: bool | None = None, + block_size: int | None = None): ... + + def equals(self, other: ReadOptions) -> bool: ... + + +class ParseOptions(_Weakrefable): + explicit_schema: Schema + newlines_in_values: bool + unexpected_field_behavior: Literal["ignore", "error", "infer"] + + def __init__( + self, + explicit_schema: Schema | None = None, + newlines_in_values: bool | None = None, + unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", + ): ... + def equals(self, other: ParseOptions) -> bool: ... + + +class JSONStreamingReader(RecordBatchReader): + ... + + +def read_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> Table: ... + + +def open_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> JSONStreamingReader: ... diff --git a/python/pyarrow-stubs/pyarrow/_orc.pyi b/python/pyarrow-stubs/pyarrow/_orc.pyi new file mode 100644 index 000000000000..faa0f57c1fdc --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_orc.pyi @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Literal, Any + +from .lib import ( + Buffer, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + + +class ORCReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... + def metadata(self) -> KeyValueMetadata: ... + def schema(self) -> Schema: ... + def nrows(self) -> int: ... + def nstripes(self) -> int: ... + def file_version(self) -> str: ... + def software_version(self) -> str: ... + def compression(self) -> Literal["UNCOMPRESSED", + "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + + def compression_size(self) -> int: ... + def row_index_stride(self) -> int: ... + def writer(self) -> str: ... + def writer_version(self) -> str: ... + def nstripe_statistics(self) -> int: ... + def content_length(self) -> int: ... + def stripe_statistics_length(self) -> int: ... + def file_footer_length(self) -> int: ... + def file_postscript_length(self) -> int: ... + def file_length(self) -> int: ... + def serialized_file_tail(self) -> int: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... + + +class ORCWriter(_Weakrefable): + def open( + self, + where: str | NativeFile | IO, + *, + file_version: str | None = None, + batch_size: int | None = None, + stripe_size: int | None = None, + compression: Any = 'UNCOMPRESSED', + compression_block_size: int | None = None, + compression_strategy: Any = 'SPEED', + row_index_stride: int | None = None, + padding_tolerance: float | None = None, + dictionary_key_size_threshold: float | None = None, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float | None = None, + ) -> None: ... + def write(self, table: Table) -> None: ... + def close(self) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_parquet.pyi b/python/pyarrow-stubs/pyarrow/_parquet.pyi new file mode 100644 index 000000000000..2521936ad5c5 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_parquet.pyi @@ -0,0 +1,524 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from collections.abc import Iterable, Iterator, Sequence +from typing import IO, Any, Literal, TypeAlias, TypedDict + +from _typeshed import StrPath + +from ._stubs_typing import Order +from .lib import ( + Buffer, + ChunkedArray, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, + DataType, + ListType, + LargeListType +) + +_PhysicalType: TypeAlias = Literal[ + "BOOLEAN", + "INT32", + "INT64", + "INT96", + "FLOAT", + "DOUBLE", + "BYTE_ARRAY", + "FIXED_LEN_BYTE_ARRAY", + "UNKNOWN", +] +_LogicTypeName: TypeAlias = Literal[ + "UNDEFINED", + "STRING", + "MAP", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME", + "TIMESTAMP", + "INT", + "FLOAT16", + "JSON", + "BSON", + "UUID", + "NONE", + "UNKNOWN", +] +_ConvertedType: TypeAlias = Literal[ + "NONE", + "UTF8", + "MAP", + "MAP_KEY_VALUE", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME_MILLIS", + "TIME_MICROS", + "TIMESTAMP_MILLIS", + "TIMESTAMP_MICROS", + "UINT_8", + "UINT_16", + "UINT_32", + "UINT_64", + "INT_8", + "INT_16", + "INT_32", + "INT_64", + "JSON", + "BSON", + "INTERVAL", + "UNKNOWN", +] +_Encoding: TypeAlias = Literal[ + "PLAIN", + "PLAIN_DICTIONARY", + "RLE", + "BIT_PACKED", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "RLE_DICTIONARY", + "BYTE_STREAM_SPLIT", + "UNKNOWN", +] +_Compression: TypeAlias = Literal[ + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "LZO", + "BROTLI", + "LZ4", + "ZSTD", + "UNKNOWN", +] + + +class _Statistics(TypedDict): + has_min_max: bool + min: Any | None + max: Any | None + null_count: int | None + distinct_count: int | None + num_values: int + physical_type: _PhysicalType + + +class Statistics(_Weakrefable): + def to_dict(self) -> _Statistics: ... + def equals(self, other: Statistics) -> bool: ... + @property + def has_min_max(self) -> bool: ... + @property + def has_null_count(self) -> bool: ... + @property + def has_distinct_count(self) -> bool: ... + @property + def min_raw(self) -> Any | None: ... + @property + def max_raw(self) -> Any | None: ... + @property + def min(self) -> Any | None: ... + @property + def max(self) -> Any | None: ... + @property + def null_count(self) -> int | None: ... + @property + def distinct_count(self) -> int | None: ... + @property + def num_values(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + @property + def is_min_exact(self) -> bool: ... + @property + def is_max_exact(self) -> bool: ... + + +class ParquetLogicalType(_Weakrefable): + def to_json(self) -> str: ... + @property + def type(self) -> _LogicTypeName: ... + + +class _ColumnChunkMetaData(TypedDict): + file_offset: int + file_path: str | None + physical_type: _PhysicalType + num_values: int + path_in_schema: str + is_stats_set: bool + statistics: Statistics | None + compression: _Compression + encodings: tuple[_Encoding, ...] + has_dictionary_page: bool + dictionary_page_offset: int | None + data_page_offset: int + total_compressed_size: int + total_uncompressed_size: int + + +class ColumnChunkMetaData(_Weakrefable): + def to_dict(self) -> _ColumnChunkMetaData: ... + def equals(self, other: ColumnChunkMetaData) -> bool: ... + @property + def file_offset(self) -> int: ... + @property + def file_path(self) -> str | None: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def num_values(self) -> int: ... 
+ @property + def path_in_schema(self) -> str: ... + @property + def is_stats_set(self) -> bool: ... + @property + def statistics(self) -> Statistics | None: ... + @property + def compression(self) -> _Compression: ... + @property + def encodings(self) -> tuple[_Encoding, ...]: ... + @property + def has_dictionary_page(self) -> bool: ... + @property + def dictionary_page_offset(self) -> int | None: ... + @property + def data_page_offset(self) -> int: ... + @property + def has_index_page(self) -> bool: ... + @property + def index_page_offset(self) -> int: ... + @property + def total_compressed_size(self) -> int: ... + @property + def total_uncompressed_size(self) -> int: ... + @property + def has_offset_index(self) -> bool: ... + @property + def has_column_index(self) -> bool: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + @property + def name(self) -> str: ... + @property + def max_definition_level(self) -> int: ... + @property + def max_repetition_level(self) -> int: ... + @property + def converted_type(self) -> _ConvertedType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + + +class _SortingColumn(TypedDict): + column_index: int + descending: bool + nulls_first: bool + + +class SortingColumn: + def __init__( + self, column_index: int, descending: bool = False, nulls_first: bool = False + ) -> None: ... + + @classmethod + def from_ordering( + cls, + schema: Schema, + sort_keys: Sequence[str] + | Sequence[tuple[str, Order]] + | Sequence[str | tuple[str, Order]], + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> tuple[SortingColumn, ...]: ... + + @staticmethod + def to_ordering( + schema: Schema, sorting_columns: tuple[SortingColumn, ...] | list[SortingColumn] + ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... + def __hash__(self) -> int: ... + @property + def column_index(self) -> int: ... + @property + def descending(self) -> bool: ... + @property + def nulls_first(self) -> bool: ... + def to_dict(self) -> _SortingColumn: ... + + +class _RowGroupMetaData(TypedDict): + num_columns: int + num_rows: int + total_byte_size: int + columns: list[ColumnChunkMetaData] + sorting_columns: list[SortingColumn] + + +class RowGroupMetaData(_Weakrefable): + def __init__(self, parent: FileMetaData, index: int) -> None: ... + def equals(self, other: RowGroupMetaData) -> bool: ... + def column(self, i: int) -> ColumnChunkMetaData: ... + def to_dict(self) -> _RowGroupMetaData: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def sorting_columns(self) -> list[SortingColumn]: ... + + +class _FileMetaData(TypedDict): + created_by: str + num_columns: int + num_rows: int + num_row_groups: int + format_version: str + serialized_size: int + row_groups: list[Any] # List of row group metadata dictionaries + + +class FileMetaData(_Weakrefable): + def __hash__(self) -> int: ... + def to_dict(self) -> _FileMetaData: ... + def equals(self, other: FileMetaData) -> bool: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def serialized_size(self) -> int: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def num_row_groups(self) -> int: ... + @property + def format_version(self) -> str: ... + @property + def created_by(self) -> str: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... 
+ def row_group(self, i: int) -> RowGroupMetaData: ... + def set_file_path(self, path: str) -> None: ... + def append_row_groups(self, other: FileMetaData) -> None: ... + def write_metadata_file(self, where: StrPath | Buffer | + NativeFile | IO) -> None: ... + + +class ParquetSchema(_Weakrefable): + def __init__(self, container: FileMetaData) -> None: ... + def __getitem__(self, i: int) -> ColumnSchema: ... + def __hash__(self) -> int: ... + def __len__(self) -> int: ... + @property + def names(self) -> list[str]: ... + def to_arrow_schema(self) -> Schema: ... + def equals(self, other: ParquetSchema) -> bool: ... + def column(self, i: int) -> ColumnSchema: ... + + +class ColumnSchema(_Weakrefable): + def __init__(self, schema: ParquetSchema, index: int) -> None: ... + def equals(self, other: ColumnSchema) -> bool: ... + @property + def name(self) -> str: ... + @property + def path(self) -> str: ... + @property + def max_definition_level(self) -> int: ... + @property + def max_repetition_level(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + @property + def length(self) -> int | None: ... + @property + def precision(self) -> int | None: ... + @property + def scale(self) -> int | None: ... + + +class ParquetReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + + def open( + self, + source: StrPath | Buffer | NativeFile | IO, + *, + use_memory_map: bool = False, + read_dictionary: Iterable[int] | Iterable[str] | None = None, + metadata: FileMetaData | None = None, + binary_type: DataType | None = None, + list_type: ListType | LargeListType | None = None, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + arrow_extensions_enabled: bool | None = None, + ) -> None: ... + + @property + def column_paths(self) -> list[str]: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def schema_arrow(self) -> Schema: ... + @property + def num_row_groups(self) -> int: ... + def set_use_threads(self, use_threads: bool) -> None: ... + def set_batch_size(self, batch_size: int) -> None: ... + + def iter_batches( + self, + batch_size: int = 65536, + row_groups: list[int] | range | None = None, + column_indices: list[str] | list[int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: ... + + def read_row_group( + self, i: int, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + + def read_row_groups( + self, + row_groups: Sequence[int] | range, + column_indices: list[str] | list[int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def read_all( + self, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + + def scan_contents( + self, columns: Sequence[str] | Sequence[int] | None = None, + batch_size: int = 65536 + ) -> int: ... + + def column_name_idx(self, column_name: str) -> int: ... + def read_column(self, column_index: int) -> ChunkedArray: ... + def close(self) -> None: ... + @property + def closed(self) -> bool: ... 
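pyarrow.parquet.ParquetFile is a thin wrapper over this ParquetReader, so the FileMetaData and RowGroupMetaData declarations above are what a checker sees through the public API as well. A brief sketch, with an illustrative path:

    import pyarrow.parquet as pq


    def smallest_row_group(path: str) -> int:
        meta = pq.ParquetFile(path).metadata            # FileMetaData
        return min(meta.row_group(i).total_byte_size    # RowGroupMetaData
                   for i in range(meta.num_row_groups))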
+ + +class ParquetWriter(_Weakrefable): + def __init__( + self, + where: StrPath | NativeFile | IO, + schema: Schema, + use_dictionary: bool | list[str] | None = None, + compression: _Compression | dict[str, _Compression] | str | None = None, + version: str | None = None, + write_statistics: bool | list[str] | None = None, + memory_pool: MemoryPool | None = None, + use_deprecated_int96_timestamps: bool = False, + coerce_timestamps: Literal["ms", "us"] | None = None, + data_page_size: int | None = None, + allow_truncated_timestamps: bool = False, + compression_level: int | dict[str, int] | None = None, + use_byte_stream_split: bool | list[str] = False, + column_encoding: _Encoding | dict[str, _Encoding] | None = None, + writer_engine_version: str | None = None, + data_page_version: str | None = None, + use_compliant_nested_type: bool = True, + encryption_properties: FileDecryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: tuple[SortingColumn, ...] | None = None, + store_decimal_as_integer: bool = False, + write_time_adjusted_to_utc: bool = False, + max_rows_per_page: int | None = None, + ): ... + def close(self) -> None: ... + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... + def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def use_dictionary(self) -> bool | list[str] | None: ... + @property + def use_deprecated_int96_timestamps(self) -> bool: ... + @property + def use_byte_stream_split(self) -> bool | list[str]: ... + @property + def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ... + @property + def coerce_timestamps(self) -> Literal["ms", "us"] | None: ... + @property + def allow_truncated_timestamps(self) -> bool: ... + @property + def compression(self) -> _Compression | dict[str, _Compression] | None: ... + @property + def compression_level(self) -> int | dict[str, int] | None: ... + @property + def data_page_version(self) -> str | None: ... + @property + def use_compliant_nested_type(self) -> bool: ... + @property + def version(self) -> str | None: ... + @property + def write_statistics(self) -> bool | list[str] | None: ... + @property + def writer_engine_version(self) -> str: ... + @property + def row_group_size(self) -> int: ... + @property + def data_page_size(self) -> int: ... + @property + def encryption_properties(self) -> FileDecryptionProperties: ... + @property + def write_batch_size(self) -> int: ... + @property + def dictionary_pagesize_limit(self) -> int: ... + @property + def store_schema(self) -> bool: ... + @property + def store_decimal_as_integer(self) -> bool: ... + + +class FileEncryptionProperties: + ... + + +class FileDecryptionProperties: + ... diff --git a/python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi b/python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi new file mode 100644 index 000000000000..74b50ce665d1 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi @@ -0,0 +1,141 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt +import pathlib + +from collections.abc import Callable + +from pyarrow._fs import FileSystem +from ._parquet import FileDecryptionProperties, FileEncryptionProperties +from .lib import _Weakrefable + + +class EncryptionConfiguration(_Weakrefable): + footer_key: str + column_keys: dict[str, list[str]] + encryption_algorithm: str + plaintext_footer: bool + double_wrapping: bool + cache_lifetime: dt.timedelta + internal_key_material: bool + data_key_length_bits: int + uniform_encryption: bool + + def __init__( + self, + footer_key: str, + *, + column_keys: dict[str, str | list[str]] | None = None, + encryption_algorithm: str | None = None, + plaintext_footer: bool | None = None, + double_wrapping: bool | None = None, + cache_lifetime: dt.timedelta | None = None, + internal_key_material: bool | None = None, + data_key_length_bits: int | None = None, + uniform_encryption: bool | None = None, + ) -> None: ... + + +class DecryptionConfiguration(_Weakrefable): + cache_lifetime: dt.timedelta + def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... + + +class KmsConnectionConfig(_Weakrefable): + kms_instance_id: str + kms_instance_url: str + key_access_token: str + custom_kms_conf: dict[str, str] + + def __init__( + self, + *, + kms_instance_id: str | None = None, + kms_instance_url: str | None = None, + key_access_token: str | None = None, + custom_kms_conf: dict[str, str] | None = None, + ) -> None: ... + def refresh_key_access_token(self, value: str) -> None: ... + + +class KmsClient(_Weakrefable): + def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... + def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> bytes: ... + + +class CryptoFactory(_Weakrefable): + def __init__(self, kms_client_factory: Callable[[ + KmsConnectionConfig], KmsClient]): ... + + def file_encryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> FileEncryptionProperties: ... + + def file_decryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration | None = None, + ) -> FileDecryptionProperties: ... + def remove_cache_entries_for_token(self, access_token: str) -> None: ... + def remove_cache_entries_for_all_tokens(self) -> None: ... + def rotate_master_keys( + self, + kms_connection_config: KmsConnectionConfig, + parquet_file_path: str | pathlib.Path, + filesystem: FileSystem | None = None, + double_wrapping: bool = True, + cache_lifetime_seconds: int | float = 600, + ) -> None: ... + + +class KeyMaterial(_Weakrefable): + @property + def is_footer_key(self) -> bool: ... + @property + def is_double_wrapped(self) -> bool: ... + @property + def master_key_id(self) -> str: ... + @property + def wrapped_dek(self) -> str: ... + @property + def kek_id(self) -> str: ... + @property + def wrapped_kek(self) -> str: ... 
+ @property + def kms_instance_id(self) -> str: ... + @property + def kms_instance_url(self) -> str: ... + @staticmethod + def wrap(key_material: KeyMaterial) -> KeyMaterial: ... + @staticmethod + def parse(key_material_string: str) -> KeyMaterial: ... + + + +class FileSystemKeyMaterialStore(_Weakrefable): + def get_key_material(self, key_id: str) -> KeyMaterial: ... + def get_key_id_set(self) -> list[str]: ... + @classmethod + def for_file( + cls, + parquet_file_path: str | pathlib.Path, /, + filesystem: FileSystem | None = None + ) -> FileSystemKeyMaterialStore: + ... diff --git a/python/pyarrow-stubs/pyarrow/_s3fs.pyi b/python/pyarrow-stubs/pyarrow/_s3fs.pyi new file mode 100644 index 000000000000..f82f34d2cae9 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_s3fs.pyi @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from typing import Literal, TypedDict +from typing_extensions import Required, NotRequired + +from ._fs import FileSystem +from .lib import KeyValueMetadata + + +class _ProxyOptions(TypedDict): + scheme: Required[Literal["http", "https"]] + host: Required[str] + port: Required[int] + username: NotRequired[str] + password: NotRequired[str] + + +class S3LogLevel(enum.IntEnum): + Off = enum.auto() + Fatal = enum.auto() + Error = enum.auto() + Warn = enum.auto() + Info = enum.auto() + Debug = enum.auto() + Trace = enum.auto() + + +Off = S3LogLevel.Off +Fatal = S3LogLevel.Fatal +Error = S3LogLevel.Error +Warn = S3LogLevel.Warn +Info = S3LogLevel.Info +Debug = S3LogLevel.Debug +Trace = S3LogLevel.Trace + + +def initialize_s3( + log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 +) -> None: ... +def ensure_s3_initialized() -> None: ... +def finalize_s3() -> None: ... +def ensure_s3_finalized() -> None: ... +def resolve_s3_region(bucket: str) -> str: ... + + +class S3RetryStrategy: + max_attempts: int + def __init__(self, max_attempts=3) -> None: ... + + +class AwsStandardS3RetryStrategy(S3RetryStrategy): + ... + + +class AwsDefaultS3RetryStrategy(S3RetryStrategy): + ... 
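A small sketch of the S3 helpers declared here through their pyarrow.fs re-exports (S3FileSystem itself is declared just below); the bucket name and retry limit are illustrative:

    from pyarrow.fs import (AwsStandardS3RetryStrategy, S3FileSystem,
                            resolve_s3_region)


    def open_bucket_fs(bucket: str) -> S3FileSystem:
        # resolve_s3_region() is typed above to return str, so `region`
        # needs no cast before being passed along.
        region = resolve_s3_region(bucket)
        return S3FileSystem(region=region,
                            retry_strategy=AwsStandardS3RetryStrategy(max_attempts=5))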
+ + +class S3FileSystem(FileSystem): + def __init__( + self, + *, + access_key: str | None = None, + secret_key: str | None = None, + session_token: str | None = None, + anonymous: bool = False, + region: str | None = None, + request_timeout: float | None = None, + connect_timeout: float | None = None, + scheme: Literal["http", "https"] = "https", + endpoint_override: str | None = None, + background_writes: bool = True, + default_metadata: dict | list | KeyValueMetadata | None = None, + role_arn: str | None = None, + session_name: str | None = None, + external_id: str | None = None, + load_frequency: int = 900, + proxy_options: _ProxyOptions | dict | tuple | str | None = None, + allow_bucket_creation: bool = False, + allow_bucket_deletion: bool = False, + allow_delayed_open: bool = False, + check_directory_existence_before_creation: bool = False, + tls_ca_file_path: str | None = None, + retry_strategy: S3RetryStrategy = + AwsStandardS3RetryStrategy(max_attempts=3), # noqa: Y011 + force_virtual_addressing: bool = False, + ): ... + @property + def region(self) -> str: ... diff --git a/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi b/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi new file mode 100644 index 000000000000..0715012fddc3 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi @@ -0,0 +1,133 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt + +from collections.abc import Collection, Iterator, Sequence +from decimal import Decimal +from typing import Any, Literal, Protocol, TypeAlias, TypeVar + +import numpy as np + +from numpy.typing import NDArray + +from pyarrow.lib import BooleanArray, IntegerArray, ChunkedArray + +ArrayLike: TypeAlias = Any +ScalarLike: TypeAlias = Any +Order: TypeAlias = Literal["ascending", "descending"] +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] +Compression: TypeAlias = Literal[ + "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy" +] +NullEncoding: TypeAlias = Literal["mask", "encode"] +NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] +TimeUnit: TypeAlias = Literal["s", "ms", "us", "ns"] +Mask: TypeAlias = ( + Sequence[bool | None] + | NDArray[np.bool_] + | BooleanArray + | ChunkedArray[Any] +) +Indices: TypeAlias = ( + Sequence[int | None] + | NDArray[np.integer[Any]] + | IntegerArray + | ChunkedArray[Any] +) + +PyScalar: TypeAlias = (bool | int | float | Decimal | str | bytes | + dt.date | dt.datetime | dt.time | dt.timedelta) + +_T = TypeVar("_T") +_V = TypeVar("_V", covariant=True) + +SingleOrList: TypeAlias = list[_T] | _T + + +class SupportEq(Protocol): + def __eq__(self, other) -> bool: ... 
+ + +class SupportLt(Protocol): + def __lt__(self, other) -> bool: ... + + +class SupportGt(Protocol): + def __gt__(self, other) -> bool: ... + + +class SupportLe(Protocol): + def __le__(self, other) -> bool: ... + + +class SupportGe(Protocol): + def __ge__(self, other) -> bool: ... + + +FilterTuple: TypeAlias = ( + tuple[str, Literal["=", "==", "!="], SupportEq] + | tuple[str, Literal["<"], SupportLt] + | tuple[str, Literal[">"], SupportGt] + | tuple[str, Literal["<="], SupportLe] + | tuple[str, Literal[">="], SupportGe] + | tuple[str, Literal["in", "not in"], Collection] + | tuple[str, str, Any] # Allow general str for operator to avoid type errors +) + + +class Buffer(Protocol): + ... + + +class SupportPyBuffer(Protocol): + ... + + +class SupportArrowStream(Protocol): + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + + +class SupportPyArrowArray(Protocol): + def __arrow_array__(self, type=None) -> Any: ... + + +class SupportArrowArray(Protocol): + def __arrow_c_array__(self, requested_schema=None) -> Any: ... + + +class SupportArrowDeviceArray(Protocol): + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + + +class SupportArrowSchema(Protocol): + def __arrow_c_schema__(self) -> Any: ... + + +class NullableCollection(Protocol[_V]): # type: ignore[reportInvalidTypeVarUse] + def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ... + def __len__(self) -> int: ... + def __contains__(self, item: Any, /) -> bool: ... diff --git a/python/pyarrow-stubs/pyarrow/_substrait.pyi b/python/pyarrow-stubs/pyarrow/_substrait.pyi new file mode 100644 index 000000000000..6818d9822ab0 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_substrait.pyi @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable +from typing import Any + +from ._compute import Expression +from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable + + +def run_query( + plan: Buffer | int, + *, + table_provider: Callable[[list[str], Schema], Table] | None = None, + use_threads: bool = True, +) -> RecordBatchReader: ... +def _parse_json_plan(plan: bytes) -> Buffer: ... + + +class SubstraitSchema: + schema: bytes + expression: bytes + def __init__(self, schema: bytes, expression: bytes) -> None: ... + def to_pysubstrait(self) -> Any: ... + + +def serialize_schema(schema: Schema) -> SubstraitSchema: ... +def deserialize_schema(buf: Buffer | bytes | SubstraitSchema) -> Schema: ... + + +def serialize_expressions( + exprs: list[Expression], + names: list[str], + schema: Schema, + *, + allow_arrow_extensions: bool = False, +) -> Buffer: ... + + +class BoundExpressions(_Weakrefable): + @property + def schema(self) -> Schema: ... + @property + def expressions(self) -> dict[str, Expression]: ... 
+ @classmethod + def from_substrait(cls, message: Buffer | bytes | Any) -> BoundExpressions: ... + + +def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... +def get_supported_functions() -> list[str]: ... diff --git a/python/pyarrow-stubs/pyarrow/_types.pyi b/python/pyarrow-stubs/pyarrow/_types.pyi new file mode 100644 index 000000000000..6b7a58ccfe60 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_types.pyi @@ -0,0 +1,966 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt # noqa: F401 +import sys + +from collections.abc import Mapping, Sequence, Iterable, Iterator +from decimal import Decimal # noqa: F401 + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from typing import Any, Generic, Literal + +import numpy as np +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowSchema +from pyarrow.lib import ( # noqa: F401 + Array, + ChunkedArray, + ExtensionArray, + MemoryPool, + MonthDayNano, + Table, +) +from typing_extensions import TypeVar, deprecated + +from .io import Buffer +from .scalar import ExtensionScalar +from ._stubs_typing import TimeUnit + +class _Weakrefable: + ... + + +class _Metadata(_Weakrefable): + ... + + +class DataType(_Weakrefable): + def field(self, i: int) -> Field: ... + + @property + def id(self) -> int: ... + @property + def bit_width(self) -> int: ... + + @property + def byte_width(self) -> int: ... + + @property + def num_fields(self) -> int: ... + + @property + def num_buffers(self) -> int: ... + + @property + def has_variadic_buffers(self) -> bool: ... + + # Properties that exist on specific subtypes but accessed generically + @property + def list_size(self) -> int: ... + + def __hash__(self) -> int: ... + + def equals(self, other: DataType | str, *, + check_metadata: bool = False) -> bool: ... + + def to_pandas_dtype(self) -> np.generic: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_schema__(self) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: ... + + +_AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + + +class _BasicDataType(DataType, Generic[_AsPyType]): + ... + + +class NullType(_BasicDataType[None]): + ... + + +class BoolType(_BasicDataType[bool]): + ... + + +class UInt8Type(_BasicDataType[int]): + ... + + +class Int8Type(_BasicDataType[int]): + ... + + +class UInt16Type(_BasicDataType[int]): + ... + + +class Int16Type(_BasicDataType[int]): + ... + + +class UInt32Type(_BasicDataType[int]): + ... + + +class Int32Type(_BasicDataType[int]): + ... + + +class UInt64Type(_BasicDataType[int]): + ... + + +class Int64Type(_BasicDataType[int]): + ... 
+ + +class Float16Type(_BasicDataType[float]): + ... + + +class Float32Type(_BasicDataType[float]): + ... + + +class Float64Type(_BasicDataType[float]): + ... + + +class Date32Type(_BasicDataType[dt.date]): + ... + + +class Date64Type(_BasicDataType[dt.date]): + ... + + +class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): + ... + + +class StringType(_BasicDataType[str]): + ... + + +class LargeStringType(_BasicDataType[str]): + ... + + +class StringViewType(_BasicDataType[str]): + ... + + +class BinaryType(_BasicDataType[bytes]): + ... + + +class LargeBinaryType(_BasicDataType[bytes]): + ... + + +class BinaryViewType(_BasicDataType[bytes]): + ... + + +_Unit = TypeVar("_Unit", bound=TimeUnit, default=Literal["us"]) +_Tz = TypeVar("_Tz", str, None, default=None) + + +class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): + + @property + def unit(self) -> _Unit: ... + + @property + def tz(self) -> _Tz: ... + + +_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) + + +class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): + @property + def unit(self) -> _Time32Unit: ... + + +_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) + + +class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): + @property + def unit(self) -> _Time64Unit: ... + + +class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): + @property + def unit(self) -> _Unit: ... + + +class FixedSizeBinaryType(_BasicDataType[Decimal]): + ... + + +_Precision = TypeVar("_Precision", default=Any) +_Scale = TypeVar("_Scale", default=Any) + + +class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class ListType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + +class LargeListType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: ... + + +class ListViewType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + +class LargeListViewType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + +class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + @property + def list_size(self) -> int: ... + + +class DictionaryMemo(_Weakrefable): + ... 
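A short sketch of how these generic parameters are meant to flow; it assumes the factory functions declared elsewhere in the stubs (pa.timestamp, pa.list_, ...) return the parametrized classes above:

    import pyarrow as pa

    # pa.list_ / pa.timestamp are assumed to return the parametrized
    # ListType / TimestampType declared above.
    lst = pa.list_(pa.timestamp("ms", tz="UTC"))
    inner = lst.value_type          # a TimestampType, not a bare DataType
    print(inner.unit, inner.tz)     # "ms" "UTC" at runtime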
+ + +_IndexT = TypeVar( + "_IndexT", + UInt8Type, + Int8Type, + UInt16Type, + Int16Type, + UInt32Type, + Int32Type, + UInt64Type, + Int64Type, +) +_BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) +_ValueT = TypeVar("_ValueT", bound=DataType) +_Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) + + +class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): + @property + def ordered(self) -> _Ordered: ... + + @property + def index_type(self) -> _IndexT: ... + + @property + def value_type(self) -> _BasicValueT: ... + + +_K = TypeVar("_K", bound=DataType) + + +class MapType(DataType, Generic[_K, _ValueT, _Ordered]): + @property + def key_field(self) -> Field[_K]: ... + + @property + def key_type(self) -> _K: ... + + @property + def item_field(self) -> Field[_ValueT]: ... + + @property + def item_type(self) -> _ValueT: ... + + @property + def keys_sorted(self) -> _Ordered: ... + + +_Size = TypeVar("_Size", default=int) + + +class StructType(DataType): + def get_field_index(self, name: str) -> int: ... + + def field(self, i: int | str) -> Field: ... + + def get_all_field_indices(self, name: str) -> list[int]: ... + + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[Field]: ... + + __getitem__ = field + @property + def names(self) -> list[str]: ... + + @property + def fields(self) -> list[Field]: ... + + +class UnionType(DataType): + @property + def mode(self) -> Literal["sparse", "dense"]: ... + + @property + def type_codes(self) -> list[int]: ... + + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[Field]: ... + + def field(self, i: int) -> Field: ... + + __getitem__ = field + + +class SparseUnionType(UnionType): + @property + def mode(self) -> Literal["sparse"]: ... + + +class DenseUnionType(UnionType): + @property + def mode(self) -> Literal["dense"]: ... + + +_RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) + + +class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): + @property + def run_end_type(self) -> _RunEndType: ... + @property + def value_type(self) -> _BasicValueT: ... + + +_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) + + +class BaseExtensionType(DataType): + def __arrow_ext_class__(self) -> type[ExtensionArray]: ... + + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ... + + @property + def extension_name(self) -> str: ... + + @property + def storage_type(self) -> DataType: ... + + def wrap_array(self, storage: _StorageT) -> _StorageT: ... + + +class ExtensionType(BaseExtensionType): + def __init__(self, storage_type: DataType, extension_name: str) -> None: ... + + def __arrow_ext_serialize__(self) -> bytes: ... + + @classmethod + def __arrow_ext_deserialize__( + cls, storage_type: DataType, serialized: bytes) -> Self: ... + + +class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): + @property + def value_type(self) -> _ValueT: ... + + @property + def shape(self) -> list[int]: ... + + @property + def dim_names(self) -> list[str] | None: ... + + @property + def permutation(self) -> list[int] | None: ... + + +class Bool8Type(BaseExtensionType): + ... + + +class UuidType(BaseExtensionType): + ... + + +class JsonType(BaseExtensionType): + ... + + +class OpaqueType(BaseExtensionType): + @property + def type_name(self) -> str: ... + + @property + def vendor_name(self) -> str: ... + + +class UnknownExtensionType(ExtensionType): + def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... 
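
A minimal ExtensionType subclass exercising the hooks typed above (illustrative sketch; the UuidLikeType name and the "example.uuid_like" identifier are made up):

    import pyarrow as pa

    class UuidLikeType(pa.ExtensionType):
        def __init__(self) -> None:
            # 16-byte fixed-size binary storage with a custom extension name
            super().__init__(pa.binary(16), "example.uuid_like")

        def __arrow_ext_serialize__(self) -> bytes:
            return b""  # no parameters to serialize

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            return cls()

    pa.register_extension_type(UuidLikeType())
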
+ + +def register_extension_type(ext_type: ExtensionType) -> None: ... + + +def unregister_extension_type(type_name: str) -> None: ... + + +class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): + def __init__( + self, __arg0__: Mapping[str | bytes, str | bytes] + | Iterable[tuple[str, str]] + | KeyValueMetadata + | None = None, **kwargs: str + ) -> None: ... + + def equals(self, other: KeyValueMetadata) -> bool: ... + + def __len__(self) -> int: ... + + def __contains__(self, /, __key: object) -> bool: ... # type: ignore[override] + + def __getitem__(self, /, __key: Any) -> Any: ... # type: ignore[override] + + def __iter__(self) -> Iterator[bytes]: ... + + def get_all(self, key: str) -> list[bytes]: ... + + def to_dict(self) -> dict[bytes, bytes]: ... + + +class Field(_Weakrefable, Generic[_DataTypeT]): + def equals(self, other: Field, check_metadata: bool = False) -> bool: ... + + def __hash__(self) -> int: ... + + @property + def nullable(self) -> bool: ... + + @property + def name(self) -> str: ... + + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + + @property + def type(self) -> _DataTypeT: ... + def with_metadata(self, metadata: dict[bytes | str, bytes | str] | + Mapping[bytes | str, bytes | str] | Any) -> Self: ... + + def remove_metadata(self) -> Self: ... + + def with_type(self, new_type: DataType) -> Field: ... + + def with_name(self, name: str) -> Self: ... + + def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: ... + + def flatten(self) -> list[Field]: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_schema__(self) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: ... + + +class Schema(_Weakrefable): + def __len__(self) -> int: ... + + def __getitem__(self, key: str | int) -> Field: ... + + _field = __getitem__ + def __iter__(self) -> Iterator[Field]: ... + + def __hash__(self) -> int: ... + + def __sizeof__(self) -> int: ... + @property + def pandas_metadata(self) -> dict: ... + + @property + def names(self) -> list[str]: ... + + @property + def types(self) -> list[DataType]: ... + + @property + def metadata(self) -> dict[bytes, bytes]: ... + + def empty_table(self) -> Table: ... + + def equals(self, other: Schema, check_metadata: bool = False) -> bool: ... + + @classmethod + def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | + None = None) -> Schema: ... + + def field(self, i: int | str | bytes) -> Field: ... + + @deprecated("Use 'field' instead") + def field_by_name(self, name: str) -> Field: ... + + def get_field_index(self, name: str) -> int: ... + + def get_all_field_indices(self, name: str) -> list[int]: ... + + def append(self, field: Field) -> Schema: ... + + def insert(self, i: int, field: Field) -> Schema: ... + + def remove(self, i: int) -> Schema: ... + + def set(self, i: int, field: Field) -> Schema: ... + + @deprecated("Use 'with_metadata' instead") + def add_metadata(self, metadata: dict) -> Schema: ... + + def with_metadata(self, metadata: dict) -> Schema: ... + + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... + + def remove_metadata(self) -> Schema: ... + + def to_string( + self, + truncate_metadata: bool = True, + show_field_metadata: bool = True, + show_schema_metadata: bool = True, + element_size_limit: int | None = None, + ) -> str: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Schema: ... 
+ + def __arrow_c_schema__(self) -> Any: ... + + @staticmethod + def _import_from_c_capsule(schema: Any) -> Schema: ... + + +def unify_schemas( + schemas: Sequence[Schema], + *, + promote_options: Literal["default", "permissive"] = "default" +) -> Schema: ... + + +def field( + name: SupportArrowSchema | str | Any, type: _DataTypeT | str | None = None, + nullable: bool = ..., + metadata: dict[Any, Any] | None = None +) -> Field[_DataTypeT] | Field[Any]: ... + + +def null() -> NullType: ... + + +def bool_() -> BoolType: ... + + +def uint8() -> UInt8Type: ... + + +def int8() -> Int8Type: ... + + +def uint16() -> UInt16Type: ... + + +def int16() -> Int16Type: ... + + +def uint32() -> UInt32Type: ... + + +def int32() -> Int32Type: ... + + +def int64() -> Int64Type: ... + + +def uint64() -> UInt64Type: ... + + +def timestamp( + unit: _Unit | str, tz: _Tz | None = None) -> TimestampType[_Unit, _Tz]: ... + + +def time32(unit: _Time32Unit | str) -> Time32Type[_Time32Unit]: ... + + +def time64(unit: _Time64Unit | str) -> Time64Type[_Time64Unit]: ... + + +def duration(unit: _Unit | str) -> DurationType[_Unit]: ... + + +def month_day_nano_interval() -> MonthDayNanoIntervalType: ... + + +def date32() -> Date32Type: ... + + +def date64() -> Date64Type: ... + + +def float16() -> Float16Type: ... + + +def float32() -> Float32Type: ... + + +def float64() -> Float64Type: ... + + +def decimal32(precision: _Precision, scale: _Scale | + None = None) -> Decimal32Type[_Precision, _Scale | Literal[0]]: ... + + +def decimal64(precision: _Precision, scale: _Scale | + None = None) -> Decimal64Type[_Precision, _Scale | Literal[0]]: ... + + +def decimal128(precision: _Precision, scale: _Scale | + None = None) -> Decimal128Type[_Precision, _Scale | Literal[0]]: ... + + +def decimal256(precision: _Precision, scale: _Scale | + None = None) -> Decimal256Type[_Precision, _Scale | Literal[0]]: ... + + +def string() -> StringType: ... + + +utf8 = string + + +def binary(length: Literal[-1] | int = ...) -> BinaryType | FixedSizeBinaryType: ... + + +def large_binary() -> LargeBinaryType: ... + + +def large_string() -> LargeStringType: ... + + +large_utf8 = large_string + + +def binary_view() -> BinaryViewType: ... + + +def string_view() -> StringViewType: ... + + +def list_( + value_type: _DataTypeT | Field[_DataTypeT] | None = None, + list_size: Literal[-1] | _Size | None = None +) -> ListType[_DataTypeT] | FixedSizeListType[_DataTypeT, _Size]: ... + + +def large_list(value_type: _DataTypeT | + Field[_DataTypeT] | None = None) -> LargeListType[_DataTypeT]: ... + + +def list_view(value_type: _DataTypeT | + Field[_DataTypeT] | None = None) -> ListViewType[_DataTypeT]: ... + + +def large_list_view( + value_type: _DataTypeT | Field[_DataTypeT] | None = None +) -> LargeListViewType[_DataTypeT]: ... + + +def map_( + key_type: _K | Field | str | None = None, + item_type: _ValueT | Field | str | None = None, + keys_sorted: bool | None = None +) -> MapType[_K, _ValueT, Literal[False]]: ... + + +def dictionary( + index_type: _IndexT | str, + value_type: _BasicValueT | str, + ordered: _Ordered | None = None +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... + + +def struct( + fields: Iterable[ + Field[Any] + | tuple[str, Field[Any] | None] + | tuple[str, DataType | None] + ] | Mapping[str, Field[Any] | DataType | None], +) -> StructType: ... + + +def sparse_union( + child_fields: list[Field[Any]], type_codes: list[int] | None = None +) -> SparseUnionType: ... 
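
An illustrative sketch of the typed factory functions above (assumes pyarrow is imported as pa):

    import pyarrow as pa

    f = pa.field("id", pa.int64(), nullable=False)         # Field[Int64Type]
    lst = pa.list_(pa.float64())                            # ListType[Float64Type]
    dct = pa.dictionary(pa.int32(), pa.utf8())              # DictionaryType[Int32Type, StringType, ...]
    st = pa.struct([("id", pa.int64()), ("name", pa.utf8())])
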
+
+
+def dense_union(
+    child_fields: list[Field[Any]], type_codes: list[int] | None = None
+) -> DenseUnionType: ...
+
+
+def union(
+    child_fields: list[Field[Any]], mode: Literal["sparse", "dense"] | int | str,
+    type_codes: list[int] | None = None) -> SparseUnionType | DenseUnionType: ...
+
+
+def run_end_encoded(
+    run_end_type: _RunEndType | str | None, value_type: _BasicValueT | str | None
+) -> RunEndEncodedType[_RunEndType, _BasicValueT]: ...
+
+
+def json_(storage_type: DataType = ...) -> JsonType: ...
+
+
+def uuid() -> UuidType: ...
+
+
+def fixed_shape_tensor(
+    value_type: _ValueT,
+    shape: Sequence[int],
+    dim_names: Sequence[str] | None = None,
+    permutation: Sequence[int] | None = None,
+) -> FixedShapeTensorType[_ValueT]: ...
+
+
+def bool8() -> Bool8Type: ...
+
+
+def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: ...
+
+
+def type_for_alias(name: Any) -> DataType: ...
+
+
+def schema(
+    fields: (
+        Iterable[Field[Any]]
+        | Iterable[tuple[str, DataType | str | None]]
+        | Mapping[Any, DataType | str | None]
+    ),
+    metadata: Mapping[bytes, bytes]
+    | Mapping[str, str]
+    | Mapping[bytes, str]
+    | Mapping[str, bytes] | None = None,
+) -> Schema: ...
+
+
+def from_numpy_dtype(dtype: np.dtype[Any] | type | str) -> DataType: ...
+
+
+__all__ = [
+    "_Weakrefable",
+    "_Metadata",
+    "DataType",
+    "_BasicDataType",
+    "NullType",
+    "BoolType",
+    "UInt8Type",
+    "Int8Type",
+    "UInt16Type",
+    "Int16Type",
+    "UInt32Type",
+    "Int32Type",
+    "UInt64Type",
+    "Int64Type",
+    "Float16Type",
+    "Float32Type",
+    "Float64Type",
+    "Date32Type",
+    "Date64Type",
+    "MonthDayNanoIntervalType",
+    "StringType",
+    "LargeStringType",
+    "StringViewType",
+    "BinaryType",
+    "LargeBinaryType",
+    "BinaryViewType",
+    "TimestampType",
+    "Time32Type",
+    "Time64Type",
+    "DurationType",
+    "FixedSizeBinaryType",
+    "Decimal32Type",
+    "Decimal64Type",
+    "Decimal128Type",
+    "Decimal256Type",
+    "ListType",
+    "LargeListType",
+    "ListViewType",
+    "LargeListViewType",
+    "FixedSizeListType",
+    "DictionaryMemo",
+    "DictionaryType",
+    "MapType",
+    "StructType",
+    "UnionType",
+    "SparseUnionType",
+    "DenseUnionType",
+    "RunEndEncodedType",
+    "BaseExtensionType",
+    "ExtensionType",
+    "FixedShapeTensorType",
+    "Bool8Type",
+    "UuidType",
+    "JsonType",
+    "OpaqueType",
+    "UnknownExtensionType",
+    "register_extension_type",
+    "unregister_extension_type",
+    "KeyValueMetadata",
+    "Field",
+    "Schema",
+    "unify_schemas",
+    "field",
+    "null",
+    "bool_",
+    "uint8",
+    "int8",
+    "uint16",
+    "int16",
+    "uint32",
+    "int32",
+    "int64",
+    "uint64",
+    "timestamp",
+    "time32",
+    "time64",
+    "duration",
+    "month_day_nano_interval",
+    "date32",
+    "date64",
+    "float16",
+    "float32",
+    "float64",
+    "decimal32",
+    "decimal64",
+    "decimal128",
+    "decimal256",
+    "string",
+    "utf8",
+    "binary",
+    "large_binary",
+    "large_string",
+    "large_utf8",
+    "binary_view",
+    "string_view",
+    "list_",
+    "large_list",
+    "list_view",
+    "large_list_view",
+    "map_",
+    "dictionary",
+    "struct",
+    "sparse_union",
+    "dense_union",
+    "union",
+    "run_end_encoded",
+    "json_",
+    "uuid",
+    "fixed_shape_tensor",
+    "bool8",
+    "opaque",
+    "type_for_alias",
+    "schema",
+    "from_numpy_dtype",
+    "_Unit",
+    "_Tz",
+    "_Time32Unit",
+    "_Time64Unit",
+    "_DataTypeT",
+]
diff --git a/python/pyarrow-stubs/pyarrow/array.pyi b/python/pyarrow-stubs/pyarrow/array.pyi
new file mode 100644
index 000000000000..547e9c949d52
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/array.pyi
@@ -0,0 +1,894 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or
more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +from collections.abc import Iterable, Iterator, Sequence + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from typing import ( + Any, + Generic, + Literal, + TypeVar, +) + +import numpy as np +import pandas as pd + +from pyarrow._compute import CastOptions +from pyarrow._stubs_typing import ( + ArrayLike, + Indices, + Mask, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportPyArrowArray, +) +from pyarrow.lib import ( + Buffer, + Device, + MemoryManager, + MemoryPool, + Tensor, + _Weakrefable, +) +from typing_extensions import deprecated +import builtins + +from .scalar import ( # noqa: F401 + BinaryScalar, + BinaryViewScalar, + BooleanScalar, + Date32Scalar, + Date64Scalar, + DictionaryScalar, + DoubleScalar, + DurationScalar, + ExtensionScalar, + FixedSizeBinaryScalar, + FixedSizeListScalar, + FloatScalar, + HalfFloatScalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + Int8Scalar, + LargeBinaryScalar, + LargeListScalar, + LargeStringScalar, + ListScalar, + ListViewScalar, + MapScalar, + MonthDayNanoIntervalScalar, + NullScalar, + RunEndEncodedScalar, + Scalar, + StringScalar, + StringViewScalar, + StructScalar, + Time32Scalar, + Time64Scalar, + TimestampScalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + UInt8Scalar, + UnionScalar, +) +from .device import DeviceAllocationType +from ._types import ( # noqa: F401 + BaseExtensionType, + BinaryType, + DataType, + Field, + Float64Type, + Int64Type, + MapType, + StringType, + StructType, + _AsPyType, + _BasicDataType, + _BasicValueT, + _DataTypeT, + _IndexT, + _RunEndType, + _Size, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, +) +from ._stubs_typing import NullableCollection + + +def array( + values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray + | SupportArrowDeviceArray | SupportPyArrowArray, + type: Any | None = None, + mask: Mask | pd.Series[bool] | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> ArrayLike: ... + + +def asarray( + values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray + | SupportArrowDeviceArray, + type: _DataTypeT | Any | None = None, +) -> Array[Scalar[_DataTypeT]] | ArrayLike: ... + + +def nulls( + size: int, + type: Any | None = None, + memory_pool: MemoryPool | None = None, +) -> ArrayLike: ... + + +def repeat( + value: Any, + size: int, + memory_pool: MemoryPool | None = None, +) -> ArrayLike: ... + + +def infer_type(values: Iterable[Any], mask: Mask | None = None, + from_pandas: bool = False) -> DataType: ... + + +class ArrayStatistics(_Weakrefable): + @property + def null_count(self) -> int | None: ... + + @property + def distinct_count(self) -> int | None: ... 
+ + @property + def is_null_count_exact(self) -> bool | None: ... + + @property + def is_distinct_count_exact(self) -> bool | None: ... + + @property + def min(self) -> Any | None: ... + + @property + def is_min_exact(self) -> bool | None: ... + + @property + def max(self) -> Any | None: ... + + @property + def is_max_exact(self) -> bool | None: ... + + +_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) + + +class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): + def to_pandas( + self, + memory_pool: MemoryPool | None = None, + categories: list | tuple | None = None, + strings_to_categorical: bool = False, + zero_copy_only: bool = False, + integer_object_nulls: bool = False, + date_as_object: bool = True, + timestamp_as_object: bool = False, + use_threads: bool = True, + deduplicate_objects: bool = True, + ignore_metadata: bool = False, + safe: bool = True, + split_blocks: bool = False, + self_destruct: bool = False, + maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, + types_mapper: Any = None, # Callable[[DataType], ExtensionDtype | None] | None + coerce_temporal_nanoseconds: bool = False, + ) -> _ConvertAs: ... + + +_CastAs = TypeVar("_CastAs", bound=DataType) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=Scalar) + + +class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + def as_py(self) -> list[Any]: ... + + def diff(self, other: Self) -> str: ... + + # Private attribute used internally (e.g., for column names in batches) + _name: str | None + + def cast( + self, + target_type: _CastAs | str, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_CastAs]]: ... + + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... + + def sum(self, **kwargs) -> _Scalar_co: ... + + @property + def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... + def unique(self) -> Self: ... + + def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... + + def value_counts(self) -> StructArray: ... + + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + type: _DataTypeT | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_DataTypeT]] | Array[Scalar]: ... + + @staticmethod + def from_buffers( + type: _DataTypeT, + length: int, + buffers: Sequence[Buffer | None], + null_count: int = -1, + offset=0, + children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, + ) -> Array[Scalar[_DataTypeT]]: ... + + @property + def null_count(self) -> int: ... + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... + + def to_string( + self, + *, + indent: int = 2, + top_level_indent: int = 0, + window: int = 10, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: ... + + format = to_string + def equals(self, other: Array | Any) -> bool: ... + + def __len__(self) -> int: ... + + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: ... + + def is_nan(self) -> BooleanArray: ... + + def is_valid(self) -> BooleanArray: ... + + def fill_null( + self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType + ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ... + + def __getitem__(self, key: int | builtins.slice) -> _Scalar_co | Self: ... 
+ + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + + def drop_null(self) -> Self: ... + + def filter( + self, + mask: Mask, + *, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + ) -> Self: ... + + def index( + self: Array[_ScalarT] | Array[Scalar[_BasicDataType[_AsPyType]]], + value: _ScalarT | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def to_numpy(self, zero_copy_only: bool = True, + writable: bool = False) -> np.ndarray: ... + + def to_pylist( + self, + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[Any]: ... + + tolist = to_pylist + def validate(self, *, full: bool = False) -> None: ... + + @property + def offset(self) -> int: ... + + def buffers(self) -> list[Buffer | None]: ... + + def copy_to(self, destination: MemoryManager | Device) -> Self: ... + + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: ... + + def __arrow_c_array__(self, requested_schema=None) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: ... + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def __dlpack__(self, stream: int | None = None) -> Any: ... + + def __dlpack_device__(self) -> tuple[int, int]: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + @property + def is_cpu(self) -> bool: ... + + @property + def statistics(self) -> ArrayStatistics | None: ... + + +class NullArray(Array[NullScalar]): + ... + + +class BooleanArray(Array[BooleanScalar]): + @property + def false_count(self) -> int: ... + @property + def true_count(self) -> int: ... + + +class NumericArray(Array[_ScalarT]): + ... + + +class IntegerArray(NumericArray[_ScalarT]): + ... + + +class FloatingPointArray(NumericArray[_ScalarT]): + ... + + +class Int8Array(IntegerArray[Int8Scalar]): + ... + + +class UInt8Array(IntegerArray[UInt8Scalar]): + ... + + +class Int16Array(IntegerArray[Int16Scalar]): + ... + + +class UInt16Array(IntegerArray[UInt16Scalar]): + ... + + +class Int32Array(IntegerArray[Int32Scalar]): + ... + + +class UInt32Array(IntegerArray[UInt32Scalar]): + ... + + +class Int64Array(IntegerArray[Int64Scalar]): + ... + + +class UInt64Array(IntegerArray[UInt64Scalar]): + ... + + +class Date32Array(NumericArray[Date32Scalar]): + ... + + +class Date64Array(NumericArray[Date64Scalar]): + ... + + +class TimestampArray(NumericArray[TimestampScalar[_Unit, _Tz]]): + ... + + +class Time32Array(NumericArray[Time32Scalar[_Time32Unit]]): + ... + + +class Time64Array(NumericArray[Time64Scalar[_Time64Unit]]): + ... + + +class DurationArray(NumericArray[DurationScalar[_Unit]]): + ... + + +class MonthDayNanoIntervalArray(Array[MonthDayNanoIntervalScalar]): + ... + + +class HalfFloatArray(FloatingPointArray[HalfFloatScalar]): + ... 
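
A short usage sketch of the Array generics above (illustrative only; assumes pyarrow is imported as pa):

    import pyarrow as pa

    arr = pa.array([1, 2, None, 4])                          # Int64Array at runtime
    as_float = arr.cast(pa.float64())                        # float64 array under these stubs
    print(as_float.to_pylist())                              # [1.0, 2.0, None, 4.0]
    kept = arr.filter(pa.array([True, False, True, True]))   # [1, None, 4]
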
+ + +class FloatArray(FloatingPointArray[FloatScalar]): + ... + + +class DoubleArray(FloatingPointArray[DoubleScalar]): + ... + + +class FixedSizeBinaryArray(Array[FixedSizeBinaryScalar]): + ... + + +class Decimal32Array(FixedSizeBinaryArray): + ... + + +class Decimal64Array(FixedSizeBinaryArray): + ... + + +class Decimal128Array(FixedSizeBinaryArray): + ... + + +class Decimal256Array(FixedSizeBinaryArray): + ... + + +class BaseListArray(Array[_ScalarT]): + def flatten(self, recursive: bool = False) -> Array: ... + + def value_parent_indices(self) -> Int64Array: ... + + def value_lengths(self) -> Int32Array: ... + + +class ListArray(BaseListArray[_ScalarT]): + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int] | list[int | None], + values: Array[Scalar[_DataTypeT]] | list[int] | list[float] | list[str] + | list[bytes] | list, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> (ListArray[ListScalar[ + _DataTypeT | Int64Type | Float64Type | StringType | BinaryType + ]] | ListArray): ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int32Array: ... + + +class LargeListArray(BaseListArray[LargeListScalar[_DataTypeT]]): + @classmethod + def from_arrays( + cls, + offsets: Int64Array | list[int] | list[int | None], + values: Array[Scalar[_DataTypeT]] | Array, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int64Array: ... + + +class ListViewArray(BaseListArray[ListViewScalar[_DataTypeT]]): + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array[Scalar[_DataTypeT]] | Array, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int32Array: ... + + @property + def sizes(self) -> Int32Array: ... + + +class LargeListViewArray(BaseListArray[LargeListScalar[_DataTypeT]]): + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]] | Array, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int64Array: ... + + @property + def sizes(self) -> Int64Array: ... + + +class FixedSizeListArray(BaseListArray[FixedSizeListScalar[_DataTypeT, _Size]]): + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + list_size: _Size | None = None, + *, + type: DataType | None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, _Size | None]: ... + + @property + def values(self) -> BaseListArray[ListScalar[_DataTypeT]]: ... 
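
An illustrative example of ListArray.from_arrays as typed above (assumes pyarrow is imported as pa):

    import pyarrow as pa

    values = pa.array([1, 2, 3, 4, 5])
    offsets = pa.array([0, 2, 5], type=pa.int32())
    lst = pa.ListArray.from_arrays(offsets, values)
    print(lst.to_pylist())    # [[1, 2], [3, 4, 5]]
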
+ + +_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) +_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) + + +class MapArray(BaseListArray[MapScalar[_MapKeyT, _MapItemT]]): + @classmethod + def from_arrays( + cls, + offsets: Int64Array | list[int] | None, + keys: Array[Scalar[_MapKeyT]] | np.ndarray | list | None = None, + items: Array[Scalar[_MapItemT]] | np.ndarray | list | None = None, + values: Array | DataType | None = None, + *, + type: DataType | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + + @property + def keys(self) -> Array: ... + + @property + def items(self) -> Array: ... + + +class UnionArray(Array[UnionScalar]): + @deprecated("Use fields() instead") + def child(self, pos: int) -> Field: ... + + def field(self, pos: int) -> Array: ... + + @property + def type_codes(self) -> Int8Array: ... + + @property + def offsets(self) -> Int32Array: ... + + @staticmethod + def from_dense( + types: Int8Array, + value_offsets: Int32Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | list[int] | None = None, + ) -> UnionArray: ... + + @staticmethod + def from_sparse( + types: Int8Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | list[int] | None = None, + ) -> UnionArray: ... + + +class StringArray(Array[StringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: ... + + +class LargeStringArray(Array[LargeStringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: ... + + +class StringViewArray(Array[StringViewScalar]): + ... + + +class BinaryArray(Array[BinaryScalar]): + @property + def total_values_length(self) -> int: ... + + +class LargeBinaryArray(Array[LargeBinaryScalar]): + @property + def total_values_length(self) -> int: ... + + +class BinaryViewArray(Array[BinaryViewScalar]): + ... + + +class DictionaryArray(Array[DictionaryScalar[_IndexT, _BasicValueT]]): + def dictionary_encode(self) -> Self: ... # type: ignore[override] + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: ... + + @property + def indices(self) -> Array[Scalar[_IndexT]]: ... + @property + def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... + + @staticmethod + def from_buffers( # type: ignore[override] + type: _BasicValueT, + length: int, + buffers: list[Buffer], + dictionary: Array | np.ndarray | pd.Series, + null_count: int = -1, + offset: int = 0, + ) -> DictionaryArray[Any, _BasicValueT]: ... + + @staticmethod + def from_arrays( + indices: Indices | Sequence[int | None], + dictionary: Array | np.ndarray | pd.Series | list[Any], + mask: np.ndarray | pd.Series | BooleanArray | None = None, + ordered: bool = False, + from_pandas: bool = False, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> DictionaryArray: ... + + +class StructArray(Array[StructScalar]): + def field(self, index: int | str) -> Array: ... + + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ... 
+ + @staticmethod + def from_arrays( + arrays: Iterable[Array | np.ndarray | list], + names: Sequence[str] | list[Field] | None = None, + fields: list[Field] | None = None, + mask=None, + memory_pool: MemoryPool | None = None, + type: StructType | None = None, + ) -> StructArray: ... + + def sort(self, order: Order = "ascending", by: str | + None = None, **kwargs) -> StructArray: ... + + +class RunEndEncodedArray(Array[RunEndEncodedScalar[_RunEndType, _BasicValueT]]): + @staticmethod + def from_arrays( + run_ends: Int16Array | Int32Array | Int64Array | list[int], + values: Array | list[Any], type: DataType | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: ... + + @staticmethod + def from_buffers( # type: ignore[override] + type: DataType, + length: int, + buffers: list[Buffer] | list[None], + null_count: int = -1, + offset=0, + children: tuple[Array, Array] | list[list[int]] | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: ... + + @property + def run_ends(self) -> Array[Scalar[_RunEndType]]: ... + + @property + def values(self) -> Array[Scalar[_BasicValueT]]: ... + + def find_physical_offset(self) -> int: ... + + def find_physical_length(self) -> int: ... + + +_ArrayT = TypeVar("_ArrayT", bound=Array) + + +class ExtensionArray(Array[ExtensionScalar], Generic[_ArrayT]): + @property + def storage(self) -> Any: ... + + @staticmethod + def from_storage(typ: BaseExtensionType, + storage: _ArrayT) -> ExtensionArray[_ArrayT]: ... + + +class JsonArray(ExtensionArray[_ArrayT]): + ... + + +class UuidArray(ExtensionArray[_ArrayT]): + ... + + +class FixedShapeTensorArray(ExtensionArray[_ArrayT]): + def to_numpy_ndarray(self) -> np.ndarray: ... + + def to_tensor(self) -> Tensor: ... + + @classmethod + def from_numpy_ndarray( + cls, obj: np.ndarray, + dim_names: list[str] | tuple[str, ...] | None = None + ) -> Self: ... + + +class OpaqueArray(ExtensionArray[_ArrayT]): + ... + + +class Bool8Array(ExtensionArray): + def to_numpy(self, zero_copy_only: bool = ..., + writable: bool = ...) -> np.ndarray: ... + + @classmethod + def from_storage(cls, storage: Int8Array) -> Self: ... # type: ignore[override] + + @classmethod + def from_numpy(cls, obj: np.ndarray) -> Self: ... + + +def concat_arrays(arrays: Iterable[_ArrayT], + memory_pool: MemoryPool | None = None) -> _ArrayT: ... + + +def _empty_array(type: _DataTypeT) -> Array[Scalar[_DataTypeT]]: ... 
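
A brief sketch of DictionaryArray.from_arrays and concat_arrays as typed above (illustrative; assumes pyarrow is imported as pa):

    import pyarrow as pa

    indices = pa.array([0, 1, 0, None], type=pa.int32())
    dictionary = pa.array(["a", "b"])
    darr = pa.DictionaryArray.from_arrays(indices, dictionary)      # ["a", "b", "a", None]
    combined = pa.concat_arrays([pa.array([1, 2]), pa.array([3])])  # [1, 2, 3]
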
+ + +__all__ = [ + "array", + "asarray", + "nulls", + "repeat", + "infer_type", + "_PandasConvertible", + "Array", + "NullArray", + "BooleanArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "FixedSizeBinaryArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "BaseListArray", + "ListArray", + "LargeListArray", + "ListViewArray", + "LargeListViewArray", + "FixedSizeListArray", + "MapArray", + "UnionArray", + "StringArray", + "LargeStringArray", + "StringViewArray", + "BinaryArray", + "LargeBinaryArray", + "BinaryViewArray", + "DictionaryArray", + "StructArray", + "RunEndEncodedArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "FixedShapeTensorArray", + "concat_arrays", + "_empty_array", + "_CastAs", +] diff --git a/python/pyarrow-stubs/pyarrow/builder.pyi b/python/pyarrow-stubs/pyarrow/builder.pyi new file mode 100644 index 000000000000..9001d9835b6c --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/builder.pyi @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable + +from pyarrow.lib import MemoryPool, _Weakrefable + +from .array import StringArray, StringViewArray + + +class StringBuilder(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | float | None): ... + + def append_values(self, values: Iterable[str | bytes | float | None]): ... + + def finish(self) -> StringArray: ... + + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + + +class StringViewBuilder(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | float | None): ... + + def append_values(self, values: Iterable[str | bytes | float | None]): ... + + def finish(self) -> StringViewArray: ... + + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + + +__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/pyarrow-stubs/pyarrow/cffi.pyi b/python/pyarrow-stubs/pyarrow/cffi.pyi new file mode 100644 index 000000000000..e4f077d7155b --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/cffi.pyi @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import cffi + +c_source: str +ffi: cffi.FFI diff --git a/python/pyarrow-stubs/pyarrow/compat.pyi b/python/pyarrow-stubs/pyarrow/compat.pyi new file mode 100644 index 000000000000..30e3ec13e0dd --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/compat.pyi @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +def encode_file_path(path: str | bytes) -> bytes: ... +def tobytes(o: str | bytes) -> bytes: ... +def frombytes(o: bytes, *, safe: bool = False): ... + + +__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/python/pyarrow-stubs/pyarrow/compute.pyi b/python/pyarrow-stubs/pyarrow/compute.pyi new file mode 100644 index 000000000000..809bccd1b92f --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/compute.pyi @@ -0,0 +1,1834 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from collections.abc import Callable, Hashable, Iterable, Sequence, Mapping +from typing import Literal, TypeAlias, TypeVar, Any, ParamSpec + +import numpy as np + +# Option classes +from pyarrow._compute import ArraySortOptions as ArraySortOptions +from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions +from pyarrow._compute import CastOptions as CastOptions +from pyarrow._compute import CountOptions as CountOptions +from pyarrow._compute import CumulativeOptions as CumulativeOptions # noqa: F401 +from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions +from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions +from pyarrow._compute import ( # noqa: F401 + DictionaryEncodeOptions as DictionaryEncodeOptions) +from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions + +# Expressions +from pyarrow._compute import Expression as Expression +from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions +from pyarrow._compute import ( # noqa: F401 + ExtractRegexSpanOptions as ExtractRegexSpanOptions) +from pyarrow._compute import FilterOptions as FilterOptions +from pyarrow._compute import FunctionOptions as FunctionOptions # noqa: F401 +from pyarrow._compute import IndexOptions as IndexOptions # noqa: F401 +from pyarrow._compute import JoinOptions as JoinOptions # noqa: F401 +from pyarrow._compute import ListFlattenOptions as ListFlattenOptions +from pyarrow._compute import ListSliceOptions as ListSliceOptions +from pyarrow._compute import MakeStructOptions as MakeStructOptions +from pyarrow._compute import MapLookupOptions as MapLookupOptions +from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions +from pyarrow._compute import ModeOptions as ModeOptions +from pyarrow._compute import NullOptions as NullOptions +from pyarrow._compute import PadOptions as PadOptions +from pyarrow._compute import PairwiseOptions as PairwiseOptions +from pyarrow._compute import PartitionNthOptions as PartitionNthOptions +from pyarrow._compute import PivotWiderOptions as PivotWiderOptions +from pyarrow._compute import QuantileOptions as QuantileOptions +from pyarrow._compute import RandomOptions as RandomOptions +from pyarrow._compute import RankOptions as RankOptions +from pyarrow._compute import RankQuantileOptions as RankQuantileOptions +from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions +from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions +from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions +from pyarrow._compute import RoundOptions as RoundOptions +from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions +from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions +from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions +from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions +from pyarrow._compute import SelectKOptions as SelectKOptions +from pyarrow._compute import SetLookupOptions as SetLookupOptions +from pyarrow._compute import SkewOptions as SkewOptions +from pyarrow._compute import SliceOptions as SliceOptions +from pyarrow._compute import SortOptions as SortOptions +from pyarrow._compute import SplitOptions as SplitOptions +from pyarrow._compute import SplitPatternOptions as SplitPatternOptions # noqa: F401 +from pyarrow._compute import StrftimeOptions as StrftimeOptions +from pyarrow._compute import StrptimeOptions as StrptimeOptions +from 
pyarrow._compute import StructFieldOptions as StructFieldOptions
+from pyarrow._compute import TakeOptions as TakeOptions
+from pyarrow._compute import TDigestOptions as TDigestOptions
+from pyarrow._compute import TrimOptions as TrimOptions
+from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions
+from pyarrow._compute import VarianceOptions as VarianceOptions
+from pyarrow._compute import WeekOptions as WeekOptions
+from pyarrow._compute import WinsorizeOptions as WinsorizeOptions
+from pyarrow._compute import ZeroFillOptions as ZeroFillOptions
+
+# Functions
+from pyarrow._compute import call_function as call_function  # noqa: F401
+from pyarrow._compute import (  # noqa: F401
+    call_tabular_function as call_tabular_function)
+from pyarrow._compute import get_function as get_function  # noqa: F401
+from pyarrow._compute import list_functions as list_functions  # noqa: F401
+from pyarrow._compute import (  # noqa: F401
+    register_scalar_function as register_scalar_function)
+from pyarrow._compute import (  # noqa: F401
+    register_aggregate_function as register_aggregate_function)
+from pyarrow._compute import (  # noqa: F401
+    register_vector_function as register_vector_function)
+from pyarrow._compute import (  # noqa: F401
+    register_tabular_function as register_tabular_function)
+
+# Function and Kernel classes
+from pyarrow._compute import Function as Function  # noqa: F401
+from pyarrow._compute import Kernel as Kernel  # noqa: F401
+from pyarrow._compute import ScalarFunction as ScalarFunction  # noqa: F401
+from pyarrow._compute import ScalarKernel as ScalarKernel  # noqa: F401
+from pyarrow._compute import VectorFunction as VectorFunction  # noqa: F401
+from pyarrow._compute import VectorKernel as VectorKernel  # noqa: F401
+from pyarrow._compute import (  # noqa: F401
+    ScalarAggregateFunction as ScalarAggregateFunction)
+from pyarrow._compute import (  # noqa: F401
+    ScalarAggregateKernel as ScalarAggregateKernel)
+from pyarrow._compute import (  # noqa: F401
+    HashAggregateFunction as HashAggregateFunction)
+from pyarrow._compute import HashAggregateKernel as HashAggregateKernel  # noqa: F401
+
+# Udf
+
+from pyarrow._compute import _Order, _Placement
+from pyarrow._stubs_typing import ArrayLike, ScalarLike, PyScalar, TimeUnit
+from pyarrow._types import _RunEndType
+from . import lib
+
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
+class _ExprComparable(Expression):
+    def __ge__(self, other: Any) -> Expression: ...
+    def __le__(self, other: Any) -> Expression: ...
+    def __gt__(self, other: Any) -> Expression: ...
+    def __lt__(self, other: Any) -> Expression: ...
+
+
+def field(*name_or_index: str | bytes | tuple[str | int, ...] | int) -> Expression: ...
+
+
+def scalar(value: PyScalar | lib.Scalar[Any] | Mapping) -> Expression: ...
+
+
+def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ...
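
An illustrative sketch of how field() and scalar() combine into an Expression (assumes pyarrow.compute is imported as pc; the column names are made up):

    import pyarrow.compute as pc

    # Comparison and logical operators on Expression build a filter predicate,
    # typically handed to pyarrow.dataset scanners.
    expr = (pc.field("amount") > pc.scalar(100)) & pc.field("is_active")
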
+ + +# ============= compute functions ============= +_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType) +_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) +_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray) +_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | + lib.Scalar | lib.ChunkedArray) +ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT] +ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT + +SignedIntegerScalar: TypeAlias = ( + lib.Scalar[lib.Int8Type] + | lib.Scalar[lib.Int16Type] + | lib.Scalar[lib.Int32Type] + | lib.Scalar[lib.Int64Type] +) +UnsignedIntegerScalar: TypeAlias = ( + lib.Scalar[lib.UInt8Type] + | lib.Scalar[lib.UInt16Type] + | lib.Scalar[lib.UInt32Type] + | lib.Scalar[lib.UInt64Type] +) +IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar +FloatScalar: TypeAlias = (lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] + | lib.Scalar[lib.Float64Type]) +DecimalScalar: TypeAlias = ( + lib.Scalar[lib.Decimal32Type] + | lib.Scalar[lib.Decimal64Type] + | lib.Scalar[lib.Decimal128Type] + | lib.Scalar[lib.Decimal256Type] +) +NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar +NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar +BinaryScalar: TypeAlias = ( + lib.Scalar[lib.BinaryType] + | lib.Scalar[lib.LargeBinaryType] + | lib.Scalar[lib.FixedSizeBinaryType] +) +StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType] +StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar +_ListScalar: TypeAlias = ( + lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any] +) +_LargeListScalar: TypeAlias = ( + lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT] +) +ListScalar: TypeAlias = ( + lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] +) +TemporalScalar: TypeAlias = ( + lib.Date32Scalar + | lib.Date64Scalar + | lib.Time32Scalar[Any] + | lib.Time64Scalar[Any] + | lib.TimestampScalar[Any] + | lib.DurationScalar[Any] + | lib.MonthDayNanoIntervalScalar +) +NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar +NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar + +_NumericOrTemporalScalarT = TypeVar( + "_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) +NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] +_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) +_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) +NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] +_NumericOrDurationArrayT = TypeVar( + "_NumericOrDurationArrayT", bound=NumericOrDurationArray) +NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] +_NumericOrTemporalArrayT = TypeVar( + "_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) +BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] +_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray) +IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] +_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) +FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] +_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) +_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) +StringArray: TypeAlias = 
ArrayOrChunkedArray[StringScalar] +_StringArrayT = TypeVar("_StringArrayT", bound=StringArray) +_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) +BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar] +_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) +_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) +StringOrBinaryArray: TypeAlias = StringArray | BinaryArray +_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) +_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) +TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] +_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) +_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] +_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] +ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] + +# =============================== 1. Aggregation =============================== + + +def array_take( + array: _ArrayT | lib.Scalar | lib.Table | Expression, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.UInt64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar] + | np.ndarray + | Expression, + /, + *, + boundscheck: bool | None = None, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT | Expression: ... + + +# ========================= 1.1 functions ========================= + + +def all( + array: lib.BooleanScalar | BooleanArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... + + +any = _clone_signature(all) + + +def approximate_median( + array: NumericScalar | NumericArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def count( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... + + +def count_distinct( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... + + +def first( + array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: ... + +last = _clone_signature(first) + +def first_last( + array: lib.Array[Any] | lib.ChunkedArray[Any] | list[Any], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | Mapping[Any, Any] | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... + + +def index( + data: lib.Array[Any] | lib.ChunkedArray[Any], + value: ScalarLike, + start: int | None = None, + end: int | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... 
+ + +last = _clone_signature(first) +max = _clone_signature(first) +min = _clone_signature(first) +min_max = _clone_signature(first_last) + + +def mean( + array: FloatScalar | FloatArray + | lib.NumericArray[lib.Scalar[Any]] + | lib.ChunkedArray[lib.Scalar[Any]] + | lib.Scalar[Any], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Scalar[Any]: ... + + +def mode( + array: NumericScalar | NumericArray, + /, + n: int = 1, + *, + skip_nulls: bool = True, + min_count: int = 0, + options: ModeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... + + +def product( + array: _ScalarT | lib.NumericArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: ... + + +def quantile( + array: NumericScalar | NumericArray, + /, + q: float | Sequence[float] = 0.5, + *, + interpolation: Literal["linear", "lower", + "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + options: QuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def stddev( + array: NumericScalar | NumericArray, + /, + *, + ddof: float = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def sum( + array: _NumericScalarT | NumericArray[_NumericScalarT] | lib.Expression, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | lib.Expression: ... + + +def tdigest( + array: NumericScalar | NumericArray, + /, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + options: TDigestOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def variance( + array: NumericScalar | NumericArray | ArrayLike, + /, + *, + ddof: int = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def winsorize( + array: _NumericArrayT, + /, + lower_limit: float = 0.0, + upper_limit: float = 1.0, + *, + options: WinsorizeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... + + +def skew( + array: NumericScalar | NumericArray | ArrayLike, + /, + *, + skip_nulls: bool = True, + biased: bool = True, + min_count: int = 0, + options: SkewOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def kurtosis( + array: NumericScalar | NumericArray | ArrayLike, + /, + *, + skip_nulls: bool = True, + biased: bool = True, + min_count: int = 0, + options: SkewOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def top_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... 
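
A small sketch of the aggregation signatures above (illustrative; assumes pyarrow as pa and pyarrow.compute as pc):

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array([1, 2, 2, None, 5])
    total = pc.sum(arr)        # Int64Scalar under these stubs
    bounds = pc.min_max(arr)   # StructScalar with "min" and "max" fields
    print(total.as_py(), bounds["min"].as_py(), bounds["max"].as_py())   # 10 1 5
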
+ + +def bottom_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... + + +# ========================= 2. Element-wise (“scalar”) functions ========= + +# ========================= 2.1 Arithmetic ========================= +def abs(x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None) -> ( + _NumericOrDurationT | _NumericOrDurationArrayT | Expression): ... + + +abs_checked = _clone_signature(abs) + + +def add( + x: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT + | ArrayLike | int | Expression), + y: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT + | ArrayLike | int | Expression), + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ... + + +add_checked = _clone_signature(add) + + +def divide( + x: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT + | Expression), + y: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT + | Expression), + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ... + + +divide_checked = _clone_signature(divide) + + +def exp( + exponent: _FloatArrayT | ArrayOrChunkedArray[NonFloatNumericScalar] | _FloatScalarT + | NonFloatNumericScalar | lib.DoubleScalar | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + _FloatArrayT | lib.DoubleArray | _FloatScalarT | lib.DoubleScalar | Expression): ... + + +expm1 = _clone_signature(exp) +multiply = _clone_signature(add) +multiply_checked = _clone_signature(add) + + +def negate( + x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None) -> ( + _NumericOrDurationT | _NumericOrDurationArrayT | Expression): ... + + +negate_checked = _clone_signature(negate) + + +def power( + base: _NumericScalarT | Expression | _NumericArrayT | NumericScalar, + exponent: _NumericScalarT | Expression | _NumericArrayT | NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | _NumericArrayT | Expression: ... + + +power_checked = _clone_signature(power) + + +def sign( + x: NumericOrDurationArray | NumericOrDurationScalar | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> ( + lib.NumericArray[lib.Int8Scalar] + | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] + | lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar | Expression +): ... + + +def sqrt( + x: NumericArray | NumericScalar | Expression, /, *, + memory_pool: lib.MemoryPool | None = None) -> ( + FloatArray | FloatScalar | Expression): ... + + +sqrt_checked = _clone_signature(sqrt) + +subtract = _clone_signature(add) +subtract_checked = _clone_signature(add) + +# ========================= 2.1 Bit-wise functions ========================= + + +def bit_wise_and( + x: _NumericScalarT | _NumericArrayT | NumericScalar | Expression + | ArrayOrChunkedArray[NumericScalar], + y: _NumericScalarT | _NumericArrayT | NumericScalar | Expression + | ArrayOrChunkedArray[NumericScalar], + /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT | _NumericArrayT | Expression: ... 
+ + +def bit_wise_not( + x: _NumericScalarT | _NumericArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT | _NumericArrayT | Expression: ... + + +bit_wise_or = _clone_signature(bit_wise_and) +bit_wise_xor = _clone_signature(bit_wise_and) +shift_left = _clone_signature(bit_wise_and) +shift_left_checked = _clone_signature(bit_wise_and) +shift_right = _clone_signature(bit_wise_and) +shift_right_checked = _clone_signature(bit_wise_and) + +# ========================= 2.2 Rounding functions ========================= + + +def ceil( + x: _FloatScalarT | _FloatArrayT | Expression, /, *, memory_pool: lib.MemoryPool | + None = None) -> _FloatScalarT | _FloatArrayT | Expression: ... + + +floor = _clone_signature(ceil) + + +def round( + x: _NumericScalarT | _NumericArrayT | Expression | list, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | _NumericArrayT | Expression: ... + + +def round_to_multiple( + x: _NumericScalarT | _NumericArrayT | list | Expression, + /, + multiple: int | float | NumericScalar = 1.0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | _NumericArrayT | Expression: ... + + +def round_binary( + x: _NumericScalarT | _NumericArrayT | float | list | Expression, + s: lib.Int8Scalar + | lib.Int16Scalar + | lib.Int32Scalar + | lib.Int64Scalar + | lib.Scalar + | Iterable + | float + | Expression, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + _NumericScalarT | lib.NumericArray[_NumericScalarT] | _NumericArrayT + | Expression): ... + + +trunc = _clone_signature(ceil) + +# ========================= 2.3 Logarithmic functions ========================= + + +def ln( + x: FloatScalar | FloatArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> ( + lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] | Expression): ... + + +ln_checked = _clone_signature(ln) +log10 = _clone_signature(ln) +log10_checked = _clone_signature(ln) +log1p = _clone_signature(ln) +log1p_checked = _clone_signature(ln) +log2 = _clone_signature(ln) +log2_checked = _clone_signature(ln) + + +def logb( + x: FloatScalar | FloatArray | Expression | Any, + b: FloatScalar | FloatArray | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] | Expression | Any): ... 
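+
+
+# Usage sketch for the rounding and logarithmic stubs (same `pa`/`pc` assumption):
+#
+#   pc.round(pa.array([2.5, 3.5]), ndigits=0, round_mode="half_to_even")
+#   # -> DoubleArray [2.0, 4.0]
+#   pc.round_to_multiple(pa.array([123, 456]), multiple=100)   # [100, 500]
+#   pc.log10(pa.array([1.0, 100.0]))                           # [0.0, 2.0]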
+ + +logb_checked = _clone_signature(logb) + +# ========================= 2.4 Trigonometric functions ========================= +acos = _clone_signature(ln) +acos_checked = _clone_signature(ln) +acosh = _clone_signature(ln) +acosh_checked = _clone_signature(ln) +asin = _clone_signature(ln) +asin_checked = _clone_signature(ln) +asinh = _clone_signature(ln) +atan = _clone_signature(ln) +atanh_checked = _clone_signature(ln) +atanh = _clone_signature(ln) +cos = _clone_signature(ln) +cos_checked = _clone_signature(ln) +cosh = _clone_signature(ln) +sin = _clone_signature(ln) +sin_checked = _clone_signature(ln) +sinh = _clone_signature(ln) +tan = _clone_signature(ln) +tan_checked = _clone_signature(ln) +tanh = _clone_signature(ln) + + +def atan2( + y: FloatScalar | FloatArray | Expression | Any, + x: FloatScalar | FloatArray | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] | Expression): ... + + +# ========================= 2.5 Comparisons functions ========================= +def equal( + x: lib.Scalar | lib.Array | lib.ChunkedArray | list | Expression | Any, + y: lib.Scalar | lib.Array | lib.ChunkedArray | list | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +greater = _clone_signature(equal) +greater_equal = _clone_signature(equal) +less = _clone_signature(equal) +less_equal = _clone_signature(equal) +not_equal = _clone_signature(equal) + + +def max_element_wise( + *args: ScalarOrArray[_Scalar_CoT] | Expression | ScalarLike | ArrayLike, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _Scalar_CoT | Expression | lib.Scalar | lib.Array: ... + + +min_element_wise = _clone_signature(max_element_wise) + +# ========================= 2.6 Logical functions ========================= + + +def and_( + x: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], + y: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.BooleanScalar | lib.BooleanArray | Expression + | ScalarOrArray[lib.BooleanScalar]): ... + + +and_kleene = _clone_signature(and_) +and_not = _clone_signature(and_) +and_not_kleene = _clone_signature(and_) +or_ = _clone_signature(and_) +or_kleene = _clone_signature(and_) +xor = _clone_signature(and_) + + +def invert( + x: lib.BooleanScalar | _BooleanArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | _BooleanArrayT | Expression: ... + + +# ========================= 2.10 String predicates ========================= +def ascii_is_alnum( + strings: StringScalar | StringArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... 
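+
+
+# Usage sketch for the comparison, logical and string-predicate stubs (assumes
+# `pa`/`pc` are `pyarrow`/`pyarrow.compute`):
+#
+#   arr = pa.array([1, 5, 3])
+#   pc.and_(pc.greater(arr, 1), pc.less(arr, 5))   # BooleanArray [False, False, True]
+#   pc.ascii_is_alnum(pa.array(["abc", "a-b"]))    # BooleanArray [True, False]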
+ + +ascii_is_alpha = _clone_signature(ascii_is_alnum) +ascii_is_decimal = _clone_signature(ascii_is_alnum) +ascii_is_lower = _clone_signature(ascii_is_alnum) +ascii_is_printable = _clone_signature(ascii_is_alnum) +ascii_is_space = _clone_signature(ascii_is_alnum) +ascii_is_upper = _clone_signature(ascii_is_alnum) +utf8_is_alnum = _clone_signature(ascii_is_alnum) +utf8_is_alpha = _clone_signature(ascii_is_alnum) +utf8_is_decimal = _clone_signature(ascii_is_alnum) +utf8_is_digit = _clone_signature(ascii_is_alnum) +utf8_is_lower = _clone_signature(ascii_is_alnum) +utf8_is_numeric = _clone_signature(ascii_is_alnum) +utf8_is_printable = _clone_signature(ascii_is_alnum) +utf8_is_space = _clone_signature(ascii_is_alnum) +utf8_is_upper = _clone_signature(ascii_is_alnum) +ascii_is_title = _clone_signature(ascii_is_alnum) +utf8_is_title = _clone_signature(ascii_is_alnum) +string_is_ascii = _clone_signature(ascii_is_alnum) + +# ========================= 2.11 String transforms ========================= + + +def ascii_capitalize( + strings: _StringScalarT | _StringArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_lower = _clone_signature(ascii_capitalize) +ascii_reverse = _clone_signature(ascii_capitalize) +ascii_swapcase = _clone_signature(ascii_capitalize) +ascii_title = _clone_signature(ascii_capitalize) +ascii_upper = _clone_signature(ascii_capitalize) + + +def binary_length( + strings: ScalarOrArray[StringOrBinaryScalar] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array + | Expression +): ... + + +def binary_repeat( + strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, + num_repeats: int | list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ( + _StringOrBinaryScalarT | lib.Array[_StringOrBinaryScalarT] | _StringOrBinaryArrayT + | Expression): ... + + +def binary_replace_slice( + strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ... + + +def binary_reverse( + strings: _BinaryScalarT | _BinaryArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> _BinaryScalarT | _BinaryArrayT | Expression: ... + + +def replace_substring( + strings: _StringScalarT | _StringArrayT | Expression, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +replace_substring_regex = _clone_signature(replace_substring) + + +def utf8_capitalize( + strings: _StringScalarT | _StringArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT | _StringArrayT | Expression: ... + + +def utf8_length( + strings: lib.StringScalar | lib.LargeStringScalar | lib.StringArray + | lib.ChunkedArray[lib.StringScalar] | lib.LargeStringArray + | lib.ChunkedArray[lib.LargeStringScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array + | Expression): ... 
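+
+
+# Usage sketch for the string transform stubs (same `pa`/`pc` assumption):
+#
+#   pc.utf8_upper(pa.array(["hello"]))       # ["HELLO"]
+#   pc.replace_substring(pa.array(["a-b-c"]), pattern="-", replacement="_")
+#   # -> ["a_b_c"]
+#   pc.utf8_length(pa.array(["héllo"]))      # Int32Array [5] (code points)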
+ + +utf8_lower = _clone_signature(utf8_capitalize) + + +def utf8_replace_slice( + strings: _StringScalarT | _StringArrayT | Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +utf8_reverse = _clone_signature(utf8_capitalize) +utf8_swapcase = _clone_signature(utf8_capitalize) +utf8_title = _clone_signature(utf8_capitalize) +utf8_upper = _clone_signature(utf8_capitalize) + +# ========================= 2.12 String padding ========================= + + +def ascii_center( + strings: _StringScalarT | _StringArrayT | Expression, + /, + width: int | None = None, + padding: str = " ", + lean_left_on_odd_padding: bool = True, + *, + options: PadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_lpad = _clone_signature(ascii_center) +ascii_rpad = _clone_signature(ascii_center) +utf8_center = _clone_signature(ascii_center) +utf8_lpad = _clone_signature(ascii_center) +utf8_rpad = _clone_signature(ascii_center) + + +def utf8_zero_fill( + strings: _StringScalarT | _StringArrayT | Expression, + /, + width: int | None = None, + padding: str = "0", + *, + options: ZeroFillOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +utf8_zfill = utf8_zero_fill + +# ========================= 2.13 String trimming ========================= + + +def ascii_ltrim( + strings: _StringScalarT | _StringArrayT | Expression, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_rtrim = _clone_signature(ascii_ltrim) +ascii_trim = _clone_signature(ascii_ltrim) +utf8_ltrim = _clone_signature(ascii_ltrim) +utf8_rtrim = _clone_signature(ascii_ltrim) +utf8_trim = _clone_signature(ascii_ltrim) + + +def ascii_ltrim_whitespace( + strings: _StringScalarT | _StringArrayT | Expression, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) + +# ========================= 2.14 String splitting ========================= + + +def ascii_split_whitespace( + strings: _StringScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.ListArray[_StringScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] + | Expression): ... + + +def split_pattern( + strings: _StringOrBinaryScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.ListArray[_StringOrBinaryScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] + | Expression): ... 
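+
+
+# Usage sketch for the padding, trimming and splitting stubs (same assumptions):
+#
+#   pc.utf8_lpad(pa.array(["7"]), width=3, padding="0")    # ["007"]
+#   pc.utf8_trim_whitespace(pa.array(["  hi  "]))          # ["hi"]
+#   pc.split_pattern(pa.array(["a,b,c"]), pattern=",")     # ListArray [["a", "b", "c"]]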
+ + +split_pattern_regex = _clone_signature(split_pattern) +utf8_split_whitespace = _clone_signature(ascii_split_whitespace) + +# ========================= 2.15 String component extraction ========================= + + +def extract_regex( + strings: StringOrBinaryScalar | StringOrBinaryArray | Expression, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar | lib.StructArray | Expression: ... + + +extract_regex_span = _clone_signature(extract_regex) + + +# ========================= 2.16 String join ========================= +def binary_join( + strings, separator, /, *, memory_pool: lib.MemoryPool | None = None +) -> StringScalar | StringArray: ... + + +def binary_join_element_wise( + *strings: str + | bytes + | _StringOrBinaryScalarT + | _StringOrBinaryArrayT + | Expression + | list, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ... + + +# ========================= 2.17 String Slicing ========================= +def binary_slice( + strings: _BinaryScalarT | _BinaryArrayT | Expression | lib.Scalar, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryScalarT | _BinaryArrayT | Expression: ... + + +def utf8_slice_codeunits( + strings: _StringScalarT | _StringArrayT | Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +def utf8_normalize( + strings: _StringScalarT | _StringArrayT | Expression, + /, + form: Literal["NFC", "NFKC", "NFD", "NFKD"] = "NFC", + *, + options: Utf8NormalizeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +# ========================= 2.18 Containment tests ========================= +def count_substring( + strings: lib.StringScalar | lib.BinaryScalar | lib.LargeStringScalar + | lib.LargeBinaryScalar | lib.StringArray | lib.BinaryArray + | lib.ChunkedArray[lib.StringScalar] | lib.ChunkedArray[lib.BinaryScalar] + | lib.LargeStringArray | lib.LargeBinaryArray + | lib.ChunkedArray[lib.LargeStringScalar] | lib.ChunkedArray[lib.LargeBinaryScalar] + | Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array + | Expression): ... + + +count_substring_regex = _clone_signature(count_substring) + + +def ends_with( + strings: StringScalar | BinaryScalar | StringArray | BinaryArray | Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... 
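+
+
+# Usage sketch for the join, slicing and containment stubs (same assumptions):
+#
+#   pc.utf8_slice_codeunits(pa.array(["hello"]), start=1, stop=3)   # ["el"]
+#   pc.binary_join_element_wise(pa.array(["a", "b"]), pa.array(["1", "2"]), "-")
+#   # -> ["a-1", "b-2"]  (the last argument acts as the separator)
+#   pc.ends_with(pa.array(["data.csv", "img.png"]), pattern=".csv")  # [True, False]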
+ + +find_substring = _clone_signature(count_substring) +find_substring_regex = _clone_signature(count_substring) + + +def index_in( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + value_set: lib.Array | lib.ChunkedArray | Expression, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar | lib.Int32Array | Expression: ... + +def index_in_meta_binary( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + value_set: lib.Array | lib.ChunkedArray | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar | lib.Int32Array | Expression: ... + +def is_in( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + value_set: lib.Array | lib.ChunkedArray | Expression, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +is_in_meta_binary = _clone_signature(index_in_meta_binary) +match_like = _clone_signature(ends_with) +match_substring = _clone_signature(ends_with) +match_substring_regex = _clone_signature(ends_with) +starts_with = _clone_signature(ends_with) + +# ========================= 2.19 Categorizations ========================= + + +def is_finite( + values: NumericScalar | lib.NullScalar | NumericArray | lib.NullArray | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +is_inf = _clone_signature(is_finite) +is_nan = _clone_signature(is_finite) + + +def is_null( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +def is_valid( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression | ArrayLike, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +true_unless_null = _clone_signature(is_valid) + +# ========================= 2.20 Selecting / multiplexing ========================= + + +def case_when( + cond: lib.StructScalar + | lib.StructArray + | lib.ChunkedArray[lib.StructScalar] + | Expression, + /, + *cases: _ScalarOrArrayT | ArrayLike, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT | lib.Array | Expression: ... + + +def choose( + indices: ArrayLike | ScalarLike, + /, + *values: ArrayLike | ScalarLike, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: ... + + +def coalesce( + *values: _ScalarOrArrayT | Expression, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT | Expression: ... + + +def fill_null( + values: _ScalarOrArrayT | ScalarLike, fill_value: ArrayLike | ScalarLike +) -> _ScalarOrArrayT | ScalarLike: ... + + +def if_else( + cond: ArrayLike | ScalarLike, + left: ArrayLike | ScalarLike, + right: ArrayLike | ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: ... + + +# ========================= 2.21 Structural transforms ========================= + +def list_value_length( + lists: _ListArray[Any] | _LargeListArray[Any] | ListArray[Any] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array | lib.Int64Array | Expression: ... 
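+
+
+# Usage sketch for the set-lookup and multiplexing stubs (same assumptions):
+#
+#   pc.is_in(pa.array([1, 2, 3]), value_set=pa.array([2, 3]))   # [False, True, True]
+#   pc.fill_null(pa.array([1, None, 3]), 0)                     # [1, 0, 3]
+#   pc.if_else(pa.array([True, False]), 1, 10)                  # [1, 10]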
+ + +def make_struct( + *args: lib.Scalar | lib.Array | lib.ChunkedArray | Expression | ArrayLike, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar | lib.StructArray | Expression: ... + + +# ========================= 2.22 Conversions ========================= +def ceil_temporal( + timestamps: _TemporalScalarT | _TemporalArrayT | Expression, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalScalarT | _TemporalArrayT | Expression: ... + + +floor_temporal = _clone_signature(ceil_temporal) +round_temporal = _clone_signature(ceil_temporal) + + +def cast( + arr: lib.Scalar | lib.Array | lib.ChunkedArray | lib.Table, + target_type: _DataTypeT | str | None = None, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.Scalar[_DataTypeT] | lib.Scalar[Any] | lib.Array[lib.Scalar[_DataTypeT]] + | lib.Array[lib.Scalar[Any]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]] + | lib.ChunkedArray[lib.Scalar[Any]] | lib.Table +): ... + + +def strftime( + timestamps: TemporalScalar | TemporalArray | Expression, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringScalar | lib.StringArray | Expression: ... + + +def strptime( + strings: StringScalar | StringArray | Expression, + /, + format: str, + unit: TimeUnit, + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar | lib.TimestampArray | Expression: ... + + +# ========================= 2.23 Temporal component extraction ========================= +def day( + values: TemporalScalar | TemporalArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None) -> ( + lib.Int64Scalar | lib.Int64Array | Expression +): ... + + +def day_of_week( + values: TemporalScalar | TemporalArray | Expression, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +day_of_year = _clone_signature(day) + + +def hour( + values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any] + | lib.TimestampArray[Any] | lib.Time32Array[Any] | lib.Time64Array[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] + | lib.ChunkedArray[lib.Time32Scalar[Any]] + | lib.ChunkedArray[lib.Time64Scalar[Any]] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +def is_dst( + values: lib.TimestampScalar | lib.TimestampArray[Any] + | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... 
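+
+
+# Usage sketch for the cast/strftime/strptime and temporal extraction stubs
+# (same `pa`/`pc` assumption):
+#
+#   pc.cast(pa.array([1, 2]), pa.float64())   # DoubleArray [1.0, 2.0]
+#   ts = pc.strptime(pa.array(["2024-03-01"]), format="%Y-%m-%d", unit="s")
+#   pc.strftime(ts, format="%Y")               # ["2024"]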
+ + +def iso_week( + values: lib.TimestampScalar | lib.TimestampArray[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +iso_year = _clone_signature(iso_week) + + +def is_leap_year( + values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar + | lib.TimestampArray + | lib.Date32Array + | lib.Date64Array + | lib.ChunkedArray[lib.TimestampScalar] + | lib.ChunkedArray[lib.Date32Scalar] + | lib.ChunkedArray[lib.Date64Scalar] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +microsecond = _clone_signature(iso_week) +millisecond = _clone_signature(iso_week) +minute = _clone_signature(iso_week) +month = _clone_signature(day_of_week) +nanosecond = _clone_signature(hour) +quarter = _clone_signature(day_of_week) +second = _clone_signature(hour) +subsecond = _clone_signature(hour) +us_week = _clone_signature(iso_week) +us_year = _clone_signature(iso_week) +year = _clone_signature(iso_week) + + +def week( + values: lib.TimestampScalar | lib.TimestampArray + | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +def year_month_day( + values: TemporalScalar | TemporalArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.StructScalar | lib.StructArray | Expression: ... + + +iso_calendar = _clone_signature(year_month_day) + + +# ========================= 2.24 Temporal difference ========================= +def day_time_interval_between(start, end, /, *, + memory_pool: lib.MemoryPool | None = None): ... + + +def days_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array: ... + + +hours_between = _clone_signature(days_between) +microseconds_between = _clone_signature(days_between) +milliseconds_between = _clone_signature(days_between) +minutes_between = _clone_signature(days_between) + + +def month_day_nano_interval_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: ... + + +def month_interval_between(start, end, /, *, + memory_pool: lib.MemoryPool | None = None): ... + + +nanoseconds_between = _clone_signature(days_between) +quarters_between = _clone_signature(days_between) +seconds_between = _clone_signature(days_between) + + +def weeks_between( + start, + end, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array: ... 
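+
+
+# Usage sketch for the temporal component and difference stubs (assumes `pa`/`pc`
+# as above plus `import datetime`):
+#
+#   d1 = pa.scalar(datetime.date(2024, 1, 1))
+#   d2 = pa.scalar(datetime.date(2024, 1, 10))
+#   pc.days_between(d1, d2)   # Int64Scalar(9)
+#   pc.year(d2)               # Int64Scalar(2024)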
+ + +years_between = _clone_signature(days_between) + +# ========================= 2.25 Timezone handling ========================= + + +def assume_timezone( + timestamps: lib.TimestampScalar | lib.Scalar[lib.TimestampType] | lib.TimestampArray + | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, + timezone: str | None = None, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] + | Expression +): ... + + +def local_timestamp( + timestamps: lib.TimestampScalar | lib.TimestampArray + | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.TimestampScalar | lib.TimestampArray | Expression: ... + + +# ========================= 2.26 Random number generation ========================= +def random( + n: int, + *, + initializer: Hashable = "system", + options: RandomOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +# ========================= 3. Array-wise (“vector”) functions ========================= + +# ========================= 3.1 Cumulative Functions ========================= +def cumulative_sum( + values: _NumericArrayT | ArrayLike | Expression, + /, + start: int | float | lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT | Expression | lib.Array: ... + + +cumulative_sum_checked = _clone_signature(cumulative_sum) +cumulative_prod = _clone_signature(cumulative_sum) +cumulative_prod_checked = _clone_signature(cumulative_sum) +cumulative_max = _clone_signature(cumulative_sum) +cumulative_min = _clone_signature(cumulative_sum) +cumulative_mean = _clone_signature(cumulative_sum) +# ========================= 3.2 Associative transforms ========================= + + +def dictionary_encode( + array: _ScalarOrArrayT | Expression, + /, + null_encoding: Literal["mask", "encode"] = "mask", + *, + options=None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | Expression: ... + + +def dictionary_decode( + array: _ScalarOrArrayT | Expression, + /, + *, + options=None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | Expression: ... + + +def unique(array: _ArrayT | Expression, /, *, memory_pool: lib.MemoryPool | + None = None) -> _ArrayT | Expression: ... + + +def value_counts( + array: lib.Array | lib.ChunkedArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray | Expression: ... + +# ========================= 3.3 Selections ========================= + + +def array_filter( + array: _ArrayT | Expression, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT | Expression: ... + + +def drop_null(input: _ArrayT | Expression, /, *, memory_pool: lib.MemoryPool | + None = None) -> _ArrayT | Expression: ... 
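+
+
+# Usage sketch for the cumulative and associative-transform stubs (as above):
+#
+#   pc.cumulative_sum(pa.array([1, 2, 3]))       # [1, 3, 6]
+#   pc.unique(pa.array(["a", "b", "a"]))         # ["a", "b"]
+#   pc.value_counts(pa.array(["a", "b", "a"]))   # StructArray of {values, counts}
+#   pc.drop_null(pa.array([1, None, 2]))         # [1, 2]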
+ + +filter = array_filter +take = array_take + +# ========================= 3.4 Containment tests ========================= + + +def indices_nonzero( + values: lib.BooleanArray + | lib.NullArray + | NumericArray + | lib.Decimal128Array + | lib.Decimal256Array | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +# ========================= 3.5 Sorts and partitions ========================= +def array_sort_indices( + array: lib.Array | lib.ChunkedArray | Expression, + /, + order: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: ArraySortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +def partition_nth_indices( + array: lib.Array | lib.ChunkedArray | Expression | Iterable, + /, + pivot: int, + *, + null_placement: _Placement = "at_end", + options: PartitionNthOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +def pivot_wider( + keys: lib.Array | lib.ChunkedArray | Sequence[str], + values: lib.Array | lib.ChunkedArray | Sequence[Any], + /, + key_names: Sequence[str] | None = None, + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + options: PivotWiderOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... + + +def rank( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + options: RankOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... + + +def rank_quantile( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: RankQuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def rank_normal( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: RankQuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def select_k_unstable( + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table | Expression, + /, + k: int | None = None, + sort_keys: Sequence[tuple[str | Expression, str]] | None = None, + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +def sort_indices( + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table | Expression, + /, + sort_keys: Sequence[tuple[str | Expression, _Order]] | None = None, + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +# ========================= 3.6 Structural transforms ========================= +def list_element( + lists: lib.Array[ListScalar[_DataTypeT]] | lib.ChunkedArray[ListScalar[_DataTypeT]] + | ListScalar[_DataTypeT] | Expression, + index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None +) -> (lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]] + | _DataTypeT | Expression): ... 
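+
+
+# Usage sketch for the sorting/selection and list stubs (same `pa`/`pc` assumption):
+#
+#   tbl = pa.table({"a": [3, 1, 2]})
+#   pc.sort_indices(tbl, sort_keys=[("a", "ascending")])   # UInt64Array [1, 2, 0]
+#   pc.list_element(pa.array([[1, 2], [3, 4]]), 1)          # Int64Array [2, 4]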
+ + +def list_flatten( + lists: ArrayOrChunkedArray[ListScalar[Any]] | Expression, + /, + recursive: bool = False, + *, + options: ListFlattenOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[Any] | Expression: ... + + +def list_parent_indices( + lists: ArrayOrChunkedArray[Any] | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array | Expression: ... + + +def list_slice( + lists: ArrayOrChunkedArray[Any] | Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + *, + options: ListSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[Any] | Expression: ... + + +def map_lookup( + container, + /, + query_key, + occurrence: str, + *, + options: MapLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): ... + + +def struct_field( + values, + /, + indices, + *, + options: StructFieldOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): ... + + +def fill_null_backward( + values: _ScalarOrArrayT | ScalarLike | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | ScalarLike | Expression: ... + + +def fill_null_forward( + values: _ScalarOrArrayT | ScalarLike | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | ScalarLike | Expression: ... + + +def replace_with_mask( + values: _ScalarOrArrayT | Expression, + mask: list[bool] | list[bool | None] | BooleanArray, + replacements, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | Expression: ... + + +# ========================= 3.7 Pairwise functions ========================= +def pairwise_diff( + input: _NumericOrTemporalArrayT | Expression, + /, + period: int = 1, + *, + options: PairwiseOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT | Expression: ... + + +def run_end_encode( + input: _NumericOrTemporalArrayT | Expression, + /, + *, + run_end_type: _RunEndType | None = None, + options: RunEndEncodeOptions | None = None, + memory_pool: lib.MemoryPool | None = None +) -> _NumericOrTemporalArrayT | Expression: ... + + +def run_end_decode( + input: _NumericOrTemporalArrayT | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None +) -> _NumericOrTemporalArrayT | Expression: ... + + +pairwise_diff_checked = _clone_signature(pairwise_diff) diff --git a/python/pyarrow-stubs/pyarrow/config.pyi b/python/pyarrow-stubs/pyarrow/config.pyi new file mode 100644 index 000000000000..069b70e553ac --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/config.pyi @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from typing import NamedTuple + + +class VersionInfo(NamedTuple): + major: int + minor: int + patch: int + + +class CppBuildInfo(NamedTuple): + version: str + version_info: VersionInfo + so_version: str + full_so_version: str + compiler_id: str + compiler_version: str + compiler_flags: str + git_id: str + git_description: str + package_kind: str + build_type: str + + +class BuildInfo(NamedTuple): + build_type: str + cpp_build_info: CppBuildInfo + + +class RuntimeInfo(NamedTuple): + simd_level: str + detected_simd_level: str + + +build_info: BuildInfo +cpp_build_info: CppBuildInfo +cpp_version: str +cpp_version_info: VersionInfo + + +def runtime_info() -> RuntimeInfo: ... +def set_timezone_db_path(path: str) -> None: ... + + +__all__ = [ + "VersionInfo", + "BuildInfo", + "CppBuildInfo", + "RuntimeInfo", + "build_info", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "set_timezone_db_path", +] diff --git a/python/pyarrow-stubs/pyarrow/csv.pyi b/python/pyarrow-stubs/pyarrow/csv.pyi new file mode 100644 index 000000000000..a7abd413aab4 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/csv.pyi @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._csv import ( + ISO8601, + ConvertOptions, + CSVStreamingReader, + CSVWriter, + InvalidRow, + ParseOptions, + ReadOptions, + WriteOptions, + open_csv, + read_csv, + write_csv, +) + +__all__ = [ + "ISO8601", + "ConvertOptions", + "CSVStreamingReader", + "CSVWriter", + "InvalidRow", + "ParseOptions", + "ReadOptions", + "WriteOptions", + "open_csv", + "read_csv", + "write_csv", +] diff --git a/python/pyarrow-stubs/pyarrow/cuda.pyi b/python/pyarrow-stubs/pyarrow/cuda.pyi new file mode 100644 index 000000000000..0394965bb738 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/cuda.pyi @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from pyarrow._cuda import ( + BufferReader, + BufferWriter, + Context, + CudaBuffer, + HostBuffer, + IpcMemHandle, + new_host_buffer, + read_message, + read_record_batch, + serialize_record_batch, +) + +__all__ = [ + "BufferReader", + "BufferWriter", + "Context", + "CudaBuffer", + "HostBuffer", + "IpcMemHandle", + "new_host_buffer", + "read_message", + "read_record_batch", + "serialize_record_batch", +] diff --git a/python/pyarrow-stubs/pyarrow/dataset.pyi b/python/pyarrow-stubs/pyarrow/dataset.pyi new file mode 100644 index 000000000000..66d86b14a259 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/dataset.pyi @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable, Iterable, Sequence +from typing import Literal, TypeAlias, Any + +from _typeshed import StrPath +from pyarrow._dataset import ( + CsvFileFormat, + CsvFragmentScanOptions, + Dataset, + DatasetFactory, + DirectoryPartitioning, + FeatherFileFormat, + FileFormat, + FileFragment, + FilenamePartitioning, + FileSystemDataset, + FileSystemDatasetFactory, + FileSystemFactoryOptions, + FileWriteOptions, + Fragment, + FragmentScanOptions, + HivePartitioning, + InMemoryDataset, + IpcFileFormat, + IpcFileWriteOptions, + JsonFileFormat, + JsonFragmentScanOptions, + Partitioning, + PartitioningFactory, + Scanner, + TaggedRecordBatch, + UnionDataset, + UnionDatasetFactory, + WrittenFile, + get_partition_keys, +) +from pyarrow._dataset_orc import OrcFileFormat +from pyarrow._dataset_parquet import ( + ParquetDatasetFactory, + ParquetFactoryOptions, + ParquetFileFormat, + ParquetFileFragment, + ParquetFileWriteOptions, + ParquetFragmentScanOptions, + ParquetReadOptions, + RowGroupInfo, +) +from pyarrow._dataset_parquet_encryption import ( + ParquetDecryptionConfig, + ParquetEncryptionConfig, +) +from pyarrow.compute import Expression, field, scalar +from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table + +from ._fs import SupportedFileSystem + +_orc_available: bool +_parquet_available: bool + +__all__ = [ + "CsvFileFormat", + "CsvFragmentScanOptions", + "Dataset", + "DatasetFactory", + "DirectoryPartitioning", + "FeatherFileFormat", + "FileFormat", + "FileFragment", + "FilenamePartitioning", + "FileSystemDataset", + "FileSystemDatasetFactory", + "FileSystemFactoryOptions", + "FileWriteOptions", + "Fragment", + "FragmentScanOptions", + "HivePartitioning", + "InMemoryDataset", + "IpcFileFormat", + "IpcFileWriteOptions", + "JsonFileFormat", + "JsonFragmentScanOptions", + "Partitioning", + "PartitioningFactory", + "Scanner", + "TaggedRecordBatch", + "UnionDataset", + "UnionDatasetFactory", + "WrittenFile", + "get_partition_keys", + # Orc + "OrcFileFormat", + # Parquet + "ParquetDatasetFactory", + "ParquetFactoryOptions", + 
"ParquetFileFormat", + "ParquetFileFragment", + "ParquetFileWriteOptions", + "ParquetFragmentScanOptions", + "ParquetReadOptions", + "RowGroupInfo", + # Parquet Encryption + "ParquetDecryptionConfig", + "ParquetEncryptionConfig", + # Compute + "Expression", + "field", + "scalar", + # Dataset + "partitioning", + "parquet_dataset", + "write_dataset", +] + +_DatasetFormat: TypeAlias = ( + Literal["parquet", "ipc", "arrow", "feather", "csv", "json", "orc", str] +) + + +def partitioning( + schema: Schema = None, + *, + field_names: list[str] = None, + flavor: Literal["hive"] = None, + dictionaries: dict[str, Array] | Literal["infer"] | None = None, +) -> Partitioning | PartitioningFactory: ... + + +def parquet_dataset( + metadata_path: StrPath, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + format: ParquetFileFormat | None = None, + partitioning: Partitioning | PartitioningFactory | str | None = None, + partition_base_dir: str | None = None, +) -> FileSystemDataset: ... + + +def dataset( + source: StrPath + | Sequence[Dataset] + | Sequence[StrPath] + | Iterable[RecordBatch] + | Iterable[Table] + | RecordBatchReader + | RecordBatch + | Table, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> FileSystemDataset | UnionDataset | InMemoryDataset | Dataset: ... + + +def write_dataset( + data: Any | Dataset | Table | RecordBatch | RecordBatchReader | list[Table] + | Iterable[RecordBatch] | Scanner, + base_dir: StrPath, + *, + basename_template: str | None = None, + format: FileFormat | _DatasetFormat | None = None, + partitioning: Partitioning | PartitioningFactory | list[str] | None = None, + partitioning_flavor: str | None = None, + schema: Schema | None = None, + filesystem: SupportedFileSystem | str | None = None, + file_options: FileWriteOptions | None = None, + use_threads: bool | None = True, + max_partitions: int = 1024, + max_open_files: int = 1024, + max_rows_per_file: int = 0, + min_rows_per_group: int = 0, + max_rows_per_group: int = 1024 * 1024, # noqa: Y011 + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: + Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", + create_dir: bool = True, + preserve_order: bool | None = None, +): ... + + +def _get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ... diff --git a/python/pyarrow-stubs/pyarrow/device.pyi b/python/pyarrow-stubs/pyarrow/device.pyi new file mode 100644 index 000000000000..7787ac44deb8 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/device.pyi @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from pyarrow.lib import _Weakrefable + + +class DeviceAllocationType(enum.Enum): + CPU = enum.auto() + CUDA = enum.auto() + CUDA_HOST = enum.auto() + OPENCL = enum.auto() + VULKAN = enum.auto() + METAL = enum.auto() + VPI = enum.auto() + ROCM = enum.auto() + ROCM_HOST = enum.auto() + EXT_DEV = enum.auto() + CUDA_MANAGED = enum.auto() + ONEAPI = enum.auto() + WEBGPU = enum.auto() + HEXAGON = enum.auto() + + +class Device(_Weakrefable): + @property + def type_name(self) -> str: ... + + @property + def device_id(self) -> int: ... + + @property + def is_cpu(self) -> bool: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + +class MemoryManager(_Weakrefable): + @property + def device(self) -> Device: ... + + @property + def is_cpu(self) -> bool: ... + + +def default_cpu_memory_manager() -> MemoryManager: ... + + +__all__ = ["DeviceAllocationType", "Device", + "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/pyarrow-stubs/pyarrow/error.pyi b/python/pyarrow-stubs/pyarrow/error.pyi new file mode 100644 index 000000000000..eac936afcb53 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/error.pyi @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + + +class ArrowException(Exception): + ... + + +class ArrowInvalid(ValueError, ArrowException): + ... + + +class ArrowMemoryError(MemoryError, ArrowException): + ... + + +class ArrowKeyError(KeyError, ArrowException): + ... + + +class ArrowTypeError(TypeError, ArrowException): + ... + + +class ArrowNotImplementedError(NotImplementedError, ArrowException): + ... + + +class ArrowCapacityError(ArrowException): + ... + + +class ArrowIndexError(IndexError, ArrowException): + ... + + +class ArrowSerializationError(ArrowException): + ... + + +class ArrowCancelled(ArrowException): + signum: int | None + def __init__(self, message: str, signum: int | None = None) -> None: ... + + +ArrowIOError = IOError + + +class StopToken: + ... + + +def enable_signal_handlers(enable: bool) -> None: ... + + +have_signal_refcycle: bool + + +class SignalStopHandler: + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... + def __dealloc__(self) -> None: ... + @property + def stop_token(self) -> StopToken: ... 
+ + +__all__ = [ + "ArrowException", + "ArrowInvalid", + "ArrowMemoryError", + "ArrowKeyError", + "ArrowTypeError", + "ArrowNotImplementedError", + "ArrowCapacityError", + "ArrowIndexError", + "ArrowSerializationError", + "ArrowCancelled", + "ArrowIOError", + "StopToken", + "enable_signal_handlers", + "have_signal_refcycle", + "SignalStopHandler", +] diff --git a/python/pyarrow-stubs/pyarrow/feather.pyi b/python/pyarrow-stubs/pyarrow/feather.pyi new file mode 100644 index 000000000000..cf9d34020913 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/feather.pyi @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable +from typing import IO, Literal + +import pandas as pd + +from pyarrow import lib +from pyarrow.lib import Table +from pyarrow._typing import StrPath +from ._feather import FeatherError + + +class FeatherDataset: + path_or_paths: str | list[str] + validate_schema: bool + + def __init__(self, path_or_paths: str | + list[str], validate_schema: bool = True) -> None: ... + + def read_table(self, columns: list[str] | None = None) -> Table: ... + def validate_schemas(self, piece, table: Table) -> None: ... + + def read_pandas( + self, columns: list[str] | None = None, use_threads: bool = True + ) -> pd.DataFrame: ... + + +def check_chunked_overflow(name: str, col) -> None: ... + + +def write_feather( + df: pd.DataFrame | Table | lib.ChunkedArray, + dest: StrPath | IO, + compression: Literal["zstd", "lz4", "uncompressed", "snappy"] | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +) -> None: ... + + +def read_feather( + source: StrPath | IO | lib.NativeFile, + columns: list[str] | None = None, + use_threads: bool = True, + memory_map: bool = False, + **kwargs, +) -> pd.DataFrame: ... + + +def read_table( + source: StrPath | IO | lib.NativeFile, + columns: list[str | int] | Iterable[str | int] | None = None, + memory_map: bool = False, + use_threads: bool = True, +) -> Table: ... + + +__all__ = [ + "FeatherError", + "FeatherDataset", + "check_chunked_overflow", + "write_feather", + "read_feather", + "read_table", +] diff --git a/python/pyarrow-stubs/pyarrow/flight.pyi b/python/pyarrow-stubs/pyarrow/flight.pyi new file mode 100644 index 000000000000..dcc6ee2244b3 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/flight.pyi @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._flight import ( + Action, + ActionType, + BasicAuth, + CallInfo, + CertKeyPair, + ClientAuthHandler, + ClientMiddleware, + ClientMiddlewareFactory, + DescriptorType, + FlightCallOptions, + FlightCancelledError, + FlightClient, + FlightDataStream, + FlightDescriptor, + FlightEndpoint, + FlightError, + FlightInfo, + FlightInternalError, + FlightMetadataReader, + FlightMetadataWriter, + FlightMethod, + FlightServerBase, + FlightServerError, + FlightStreamChunk, + FlightStreamReader, + FlightStreamWriter, + FlightTimedOutError, + FlightUnauthenticatedError, + FlightUnauthorizedError, + FlightUnavailableError, + FlightWriteSizeExceededError, + GeneratorStream, + Location, + MetadataRecordBatchReader, + MetadataRecordBatchWriter, + RecordBatchStream, + Result, + SchemaResult, + ServerAuthHandler, + ServerCallContext, + ServerMiddleware, + ServerMiddlewareFactory, + Ticket, + TracingServerMiddlewareFactory, + connect, +) + +__all__ = [ + "Action", + "ActionType", + "BasicAuth", + "CallInfo", + "CertKeyPair", + "ClientAuthHandler", + "ClientMiddleware", + "ClientMiddlewareFactory", + "DescriptorType", + "FlightCallOptions", + "FlightCancelledError", + "FlightClient", + "FlightDataStream", + "FlightDescriptor", + "FlightEndpoint", + "FlightError", + "FlightInfo", + "FlightInternalError", + "FlightMetadataReader", + "FlightMetadataWriter", + "FlightMethod", + "FlightServerBase", + "FlightServerError", + "FlightStreamChunk", + "FlightStreamReader", + "FlightStreamWriter", + "FlightTimedOutError", + "FlightUnauthenticatedError", + "FlightUnauthorizedError", + "FlightUnavailableError", + "FlightWriteSizeExceededError", + "GeneratorStream", + "Location", + "MetadataRecordBatchReader", + "MetadataRecordBatchWriter", + "RecordBatchStream", + "Result", + "SchemaResult", + "ServerAuthHandler", + "ServerCallContext", + "ServerMiddleware", + "ServerMiddlewareFactory", + "Ticket", + "TracingServerMiddlewareFactory", + "connect", +] diff --git a/python/pyarrow-stubs/pyarrow/fs.pyi b/python/pyarrow-stubs/pyarrow/fs.pyi new file mode 100644 index 000000000000..77bf91939004 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/fs.pyi @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from pyarrow._fs import ( + FileSelector, + FileType, + FileInfo, + FileSystem, + LocalFileSystem, + SubTreeFileSystem, + _MockFileSystem, + FileSystemHandler, + PyFileSystem, + SupportedFileSystem, +) +from pyarrow._azurefs import AzureFileSystem +from pyarrow._hdfs import HadoopFileSystem +from pyarrow._gcsfs import GcsFileSystem +from pyarrow._s3fs import ( + AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy, + S3FileSystem, + S3LogLevel, + S3RetryStrategy, + ensure_s3_initialized, + finalize_s3, + ensure_s3_finalized, + initialize_s3, + resolve_s3_region, +) + +FileStats = FileInfo + + +def copy_files( + source: str, + destination: str, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, + *, + chunk_size: int = 1024 * 1024, # noqa: Y011 + use_threads: bool = True, +) -> None: ... + + +def _ensure_filesystem( + filesystem: FileSystem | str | object, + *, + use_mmap: bool = False +) -> FileSystem: ... + + +def _resolve_filesystem_and_path( + path: str | object, + filesystem: FileSystem | str | object | None = None, + *, + memory_map: bool = False +) -> tuple[FileSystem, str]: ... + + +class FSSpecHandler(FileSystemHandler): # type: ignore[misc] # All abstract methods implemented via fsspec delegation # noqa: E501 + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... + + +__all__ = [ + # _fs + "FileSelector", + "FileType", + "FileInfo", + "FileSystem", + "LocalFileSystem", + "SubTreeFileSystem", + "_MockFileSystem", + "FileSystemHandler", + "PyFileSystem", + # _azurefs + "AzureFileSystem", + # _hdfs + "HadoopFileSystem", + # _gcsfs + "GcsFileSystem", + # _s3fs + "AwsDefaultS3RetryStrategy", + "AwsStandardS3RetryStrategy", + "S3FileSystem", + "S3LogLevel", + "S3RetryStrategy", + "ensure_s3_initialized", + "finalize_s3", + "ensure_s3_finalized", + "initialize_s3", + "resolve_s3_region", + # fs + "FileStats", + "copy_files", + "FSSpecHandler", +] diff --git a/python/pyarrow-stubs/pyarrow/gandiva.pyi b/python/pyarrow-stubs/pyarrow/gandiva.pyi new file mode 100644 index 000000000000..7e129d3ed1de --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/gandiva.pyi @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable +from typing import Literal + +from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable + + +class Node(_Weakrefable): + def return_type(self) -> DataType: ... + + +class Expression(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + + +class Condition(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + + +class SelectionVector(_Weakrefable): + def to_array(self) -> Array: ... 
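+
+# Illustrative only (not part of the stubs): a rough sketch of how the Gandiva
+# builder/projector classes in this module fit together. The "add" function
+# name and the int64 types are assumptions about the data being projected.
+#
+#     import pyarrow as pa
+#     import pyarrow.gandiva as gandiva
+#
+#     table = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
+#     builder = gandiva.TreeExprBuilder()
+#     node_a = builder.make_field(table.schema.field("a"))
+#     node_b = builder.make_field(table.schema.field("b"))
+#     add = builder.make_function("add", [node_a, node_b], pa.int64())
+#     expr = builder.make_expression(add, pa.field("a_plus_b", pa.int64()))
+#     projector = gandiva.make_projector(
+#         table.schema, [expr], pa.default_memory_pool())
+#     (result,) = projector.evaluate(table.to_batches()[0])
+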
+ + +class Projector(_Weakrefable): + @property + def llvm_ir(self): ... + + def evaluate( + self, batch: RecordBatch, selection: SelectionVector | None = None + ) -> list[Array]: ... + + +class Filter(_Weakrefable): + @property + def llvm_ir(self): ... + + def evaluate( + self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" + ) -> SelectionVector: ... + + +class TreeExprBuilder(_Weakrefable): + def make_literal(self, value: float | str | bytes | + bool, dtype: DataType | str | None) -> Node: ... + + def make_expression( + self, root_node: Node | None, return_field: Field) -> Expression: ... + + def make_function( + self, name: str, children: list[Node | None], + return_type: DataType) -> Node: ... + + def make_field(self, field: Field | None) -> Node: ... + + def make_if( + self, condition: Node, this_node: Node | None, + else_node: Node | None, return_type: DataType | None + ) -> Node: ... + def make_and(self, children: list[Node | None]) -> Node: ... + def make_or(self, children: list[Node | None]) -> Node: ... + def make_in_expression(self, node: Node | None, values: Iterable, + dtype: DataType) -> Node: ... + + def make_condition(self, condition: Node | None) -> Condition: ... + + +class Configuration(_Weakrefable): + def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... + + +def make_projector( + schema: Schema, + children: list[Expression | None], + pool: MemoryPool | None = None, + selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", + configuration: Configuration | None = None, +) -> Projector: ... + + +def make_filter( + schema: Schema, condition: Condition | None, + configuration: Configuration | None = None +) -> Filter: ... + + +class FunctionSignature(_Weakrefable): + def return_type(self) -> DataType: ... + def param_types(self) -> list[DataType]: ... + def name(self) -> str: ... + + +def get_registered_function_signatures() -> list[FunctionSignature]: ... diff --git a/python/pyarrow-stubs/pyarrow/interchange/__init__.pyi b/python/pyarrow-stubs/pyarrow/interchange/__init__.pyi new file mode 100644 index 000000000000..fd5ae83c5692 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/__init__.pyi @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from .from_dataframe import from_dataframe as from_dataframe + +__all__ = ["from_dataframe"] diff --git a/python/pyarrow-stubs/pyarrow/interchange/buffer.pyi b/python/pyarrow-stubs/pyarrow/interchange/buffer.pyi new file mode 100644 index 000000000000..e1d8ae949c90 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/buffer.pyi @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from pyarrow.lib import Buffer + + +class DlpackDeviceType(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class _PyArrowBuffer: + def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... + @property + def bufsize(self) -> int: ... + @property + def ptr(self) -> int: ... + def __dlpack__(self): ... + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: ... diff --git a/python/pyarrow-stubs/pyarrow/interchange/column.pyi b/python/pyarrow-stubs/pyarrow/interchange/column.pyi new file mode 100644 index 000000000000..67508ac0689c --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/column.pyi @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from collections.abc import Iterable +from typing import Any, TypeAlias, TypedDict + +from pyarrow.lib import Array, ChunkedArray + +from .buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype: TypeAlias = tuple[DtypeKind, int, str, str] + + +class ColumnNullType(enum.IntEnum): + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + data: tuple[_PyArrowBuffer, Dtype] + validity: tuple[_PyArrowBuffer, Dtype] | None + offsets: tuple[_PyArrowBuffer, Dtype] | None + + +class CategoricalDescription(TypedDict): + is_ordered: bool + is_dictionary: bool + categories: _PyArrowColumn | None + + +class Endianness(enum.Enum): + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + ... + + +class _PyArrowColumn: + _col: Array | ChunkedArray + + def __init__(self, column: Array | ChunkedArray, + allow_copy: bool = True) -> None: ... + + def size(self) -> int: ... + @property + def offset(self) -> int: ... + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: ... + @property + def describe_categorical(self) -> CategoricalDescription: ... 
+ @property + def describe_null(self) -> tuple[ColumnNullType, Any]: ... + @property + def null_count(self) -> int: ... + @property + def metadata(self) -> dict[str, Any]: ... + def num_chunks(self) -> int: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: ... + def get_buffers(self) -> ColumnBuffers: ... diff --git a/python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi b/python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi new file mode 100644 index 000000000000..419b3e2cdb33 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Iterable, Sequence +from typing import Any + +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.lib import RecordBatch, Table + + +class _PyArrowDataFrame: + def __init__( + self, + df: Table | RecordBatch, + nan_as_null: bool = False, + allow_copy: bool = True) -> None: ... + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: ... + @property + def metadata(self) -> dict[str, Any]: ... + def num_columns(self) -> int: ... + def num_rows(self) -> int: ... + def num_chunks(self) -> int: ... + def column_names(self) -> Iterable[str]: ... + def get_column(self, i: int) -> _PyArrowColumn: ... + def get_column_by_name(self, name: str) -> _PyArrowColumn: ... + def get_columns(self) -> Iterable[_PyArrowColumn]: ... + def select_columns(self, indices: Sequence[int]) -> Self: ... + def select_columns_by_name(self, names: Sequence[str]) -> Self: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: ... diff --git a/python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi b/python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi new file mode 100644 index 000000000000..d6ad272dfc69 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, Protocol, TypeAlias + +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table + +from .column import ( + ColumnBuffers, + ColumnNullType, + Dtype, + DtypeKind, +) + + +class DataFrameObject(Protocol): + def __dataframe__(self, nan_as_null: bool = False, + allow_copy: bool = True) -> Any: ... + + +ColumnObject: TypeAlias = Any + + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: ... + + +def _from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: ... + + +def protocol_df_chunk_to_pyarrow( + df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: ... + + +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... + + +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... + + +def categorical_column_to_dictionary( + col: ColumnObject, allow_copy: bool = True +) -> DictionaryArray: ... + + +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: ... + + +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: ... + + +def buffers_to_array( + buffers: ColumnBuffers, + data_type: tuple[DtypeKind, int, str, str], + length: int, + describe_null: ColumnNullType, + offset: int = 0, + allow_copy: bool = True, +) -> Array: ... + + +def validity_buffer_from_mask( + validity_buff: Buffer, + validity_dtype: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: ... + + +def validity_buffer_nan_sentinel( + data_pa_buffer: Buffer, + data_type: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: ... diff --git a/python/pyarrow-stubs/pyarrow/io.pyi b/python/pyarrow-stubs/pyarrow/io.pyi new file mode 100644 index 000000000000..be6a07d54186 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/io.pyi @@ -0,0 +1,430 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +from collections.abc import Callable +from io import IOBase + +from _typeshed import StrPath + +import numpy as np + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Any, Literal, SupportsIndex +import builtins + +from pyarrow._stubs_typing import Compression, SupportPyBuffer +from pyarrow.lib import MemoryPool, _Weakrefable + +from .device import Device, DeviceAllocationType, MemoryManager +from ._types import KeyValueMetadata + + +def have_libhdfs() -> bool: ... + + +def io_thread_count() -> int: ... 
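+
+# Illustrative only (not part of the stubs): the IO thread-pool helpers
+# annotated here are typically used as
+#
+#     import pyarrow as pa
+#
+#     pa.set_io_thread_count(8)
+#     assert pa.io_thread_count() == 8
+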
+ + +def set_io_thread_count(count: int) -> None: ... + + +Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] + + +class NativeFile(_Weakrefable): + _default_chunk_size: int + + def __enter__(self) -> Self: ... + def __exit__(self, *args) -> None: ... + @property + def mode(self) -> Mode: ... + + def readable(self) -> bool: ... + def seekable(self) -> bool: ... + def isatty(self) -> bool: ... + def fileno(self) -> int: ... + + @property + def closed(self) -> bool: ... + def close(self) -> None: ... + def size(self) -> int: ... + + def metadata(self) -> KeyValueMetadata: ... + + def tell(self) -> int: ... + + def seek(self, position: int, whence: int = 0) -> int: ... + + def flush(self) -> None: ... + + def write(self, data: bytes | SupportPyBuffer) -> int: ... + + def read(self, nbytes: int | None = None) -> bytes: ... + + def get_stream(self, file_offset: int, nbytes: int) -> Self: ... + + def read_at(self, nbytes: int, offset: int) -> bytes: ... + + def read1(self, nbytes: int | None = None) -> bytes: ... + + def readall(self) -> bytes: ... + def readinto(self, b: SupportPyBuffer) -> int: ... + + def readline(self, size: int | None = None) -> bytes: ... + + def readlines(self, hint: int | None = None) -> list[bytes]: ... + + def __iter__(self) -> Self: ... + + def __next__(self) -> bytes: ... + def read_buffer(self, nbytes: int | None = None) -> Buffer: ... + + def truncate(self, pos: int | None = None) -> int: ... + + def writelines(self, lines: list[bytes]): ... + + def download(self, stream_or_path: StrPath | IOBase, + buffer_size: int | None = None) -> None: ... + + def upload(self, stream: IOBase, buffer_size: int | None) -> None: ... + + def writable(self): ... + +# ---------------------------------------------------------------------- +# Python file-like objects + + +class PythonFile(NativeFile): + def __init__(self, handle: IOBase, + mode: Literal["r", "w"] | None = None) -> None: ... + + +class MemoryMappedFile(NativeFile): + @classmethod + def create(cls, path: str, size: float) -> Self: ... + + def _open(self, path: str, + mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ... + + def resize(self, new_size: int) -> None: ... + + +def memory_map( + path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r" +) -> MemoryMappedFile: ... + + +create_memory_map = MemoryMappedFile.create + + +class OSFile(NativeFile): + name: str + + def __init__( + self, + path: str, + mode: Literal["r", "rb", "w", "wb", "a", "ab"] = "r", + memory_pool: MemoryPool | None = None, + ) -> None: ... + + +class FixedSizeBufferWriter(NativeFile): + def __init__(self, buffer: Buffer) -> None: ... + def set_memcopy_threads(self, num_threads: int) -> None: ... + + def set_memcopy_blocksize(self, blocksize: int) -> None: ... + + def set_memcopy_threshold(self, threshold: int) -> None: ... + + +# ---------------------------------------------------------------------- +# Arrow buffers + +class Buffer(_Weakrefable): + def __len__(self) -> int: ... + + def _assert_cpu(self) -> None: ... + @property + def size(self) -> int: ... + + @property + def address(self) -> int: ... + + def hex(self) -> bytes: ... + + @property + def is_mutable(self) -> bool: ... + + @property + def is_cpu(self) -> bool: ... + + @property + def device(self) -> Device: ... + + @property + def memory_manager(self) -> MemoryManager: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + @property + def parent(self) -> Buffer | None: ... 
+ + def __getitem__(self, key: int | builtins.slice) -> int | Self: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def equals(self, other: Self) -> bool: ... + + def __buffer__(self, flags: int) -> memoryview: ... + + def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... + def to_pybytes(self) -> bytes: ... + + +class ResizableBuffer(Buffer): + def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: ... + + +def allocate_buffer( + size: int, + memory_pool: MemoryPool | None = None, + resizable: Literal[False] | Literal[True] | None = None # noqa: Y030 +) -> Buffer | ResizableBuffer: ... + + +# ---------------------------------------------------------------------- +# Arrow Stream +class BufferOutputStream(NativeFile): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def getvalue(self) -> Buffer: ... + + +class MockOutputStream(NativeFile): + ... + + +class BufferReader(NativeFile): + def __init__(self, obj) -> None: ... + + +class CompressedInputStream(NativeFile): + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: str | None, + ) -> None: ... + + +class CompressedOutputStream(NativeFile): + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: str, + ) -> None: ... + + +class BufferedInputStream(NativeFile): + def __init__(self, stream: NativeFile, buffer_size: int, + memory_pool: MemoryPool | None = None) -> None: ... + + def detach(self) -> NativeFile: ... + + +class BufferedOutputStream(NativeFile): + def __init__(self, stream: NativeFile, buffer_size: int, + memory_pool: MemoryPool | None = None) -> None: ... + + def detach(self) -> NativeFile: ... + + +class TransformInputStream(NativeFile): + def __init__(self, stream: NativeFile, + transform_func: Callable[[Buffer], Any]) -> None: ... + + +class Transcoder: + def __init__(self, decoder, encoder) -> None: ... + def __call__(self, buf: Buffer): ... + + +def transcoding_input_stream( + stream: NativeFile, src_encoding: str, dest_encoding: str +) -> TransformInputStream: ... + + +def py_buffer(obj: SupportPyBuffer | np.ndarray) -> Buffer: ... + + +def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: ... + + +def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... + +# --------------------------------------------------------------------- + + +class CacheOptions(_Weakrefable): + hole_size_limit: int + range_size_limit: int + lazy: bool + prefetch_limit: int + + def __init__( + self, + *, + hole_size_limit: int | None = None, + range_size_limit: int | None = None, + lazy: bool = True, + prefetch_limit: int = 0, + ) -> None: ... + + @classmethod + def from_network_metrics( + cls, + time_to_first_byte_millis: int, + transfer_bandwidth_mib_per_sec: int, + ideal_bandwidth_utilization_frac: float = 0.9, + max_ideal_request_size_mib: int = 64, + ) -> Self: ... + + +class Codec(_Weakrefable): + def __init__(self, compression: Compression | str | None, + compression_level: int | None = None) -> None: ... + + @classmethod + def detect(cls, path: StrPath) -> Self: ... + + @staticmethod + def is_available(compression: Compression | str) -> bool: ... + + @staticmethod + def supports_compression_level(compression: Compression) -> int: ... + + @staticmethod + def default_compression_level(compression: Compression) -> int: ... + + @staticmethod + def minimum_compression_level(compression: Compression) -> int: ... 
+ + @staticmethod + def maximum_compression_level(compression: Compression) -> int: ... + + @property + def name(self) -> Compression: ... + + @property + def compression_level(self) -> int: ... + + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030 + memory_pool: MemoryPool | None = None, + ) -> Buffer | bytes: ... + + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030 + memory_pool: MemoryPool | None = None, + ) -> Buffer | bytes: ... + + +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030 + memory_pool: MemoryPool | None = None, +) -> Buffer | bytes: ... + + +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030 + memory_pool: MemoryPool | None = None, +) -> Buffer | bytes: ... + + +def input_stream( + source: StrPath | Buffer | NativeFile | IOBase | SupportPyBuffer, + compression: + Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] | None = "detect", + buffer_size: int | str | None = None, +) -> BufferReader: ... + + +def output_stream( + source: StrPath | Buffer | NativeFile | IOBase | SupportPyBuffer, + compression: + Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] | None = "detect", + buffer_size: int | None = None, +) -> NativeFile: ... + + +__all__ = [ + "have_libhdfs", + "io_thread_count", + "set_io_thread_count", + "NativeFile", + "PythonFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "OSFile", + "FixedSizeBufferWriter", + "Buffer", + "ResizableBuffer", + "allocate_buffer", + "BufferOutputStream", + "MockOutputStream", + "BufferReader", + "CompressedInputStream", + "CompressedOutputStream", + "BufferedInputStream", + "BufferedOutputStream", + "TransformInputStream", + "Transcoder", + "transcoding_input_stream", + "py_buffer", + "foreign_buffer", + "as_buffer", + "CacheOptions", + "Codec", + "compress", + "decompress", + "input_stream", + "output_stream", +] diff --git a/python/pyarrow-stubs/pyarrow/ipc.pyi b/python/pyarrow-stubs/pyarrow/ipc.pyi new file mode 100644 index 000000000000..d153ab0f46aa --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/ipc.pyi @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from io import IOBase +from typing import Any + +from _typeshed import StrPath +import pandas as pd +import pyarrow.lib as lib + +from pyarrow.lib import ( + Alignment, + IpcReadOptions, + IpcWriteOptions, + Message, + MessageReader, + MetadataVersion, + ReadStats, + RecordBatchReader, + WriteStats, + _ReadPandasMixin, + get_record_batch_size, + get_tensor_size, + read_message, + read_record_batch, + read_schema, + read_tensor, + write_tensor, +) + + +class RecordBatchStreamReader(lib._RecordBatchStreamReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + + +class RecordBatchFileReader(lib._RecordBatchFileReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class RecordBatchFileWriter(lib._RecordBatchFileWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + + +def new_stream( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchStreamWriter: ... + + +def open_stream( + source: bytes | int | lib.Buffer | lib.NativeFile | IOBase, + *, + options: Any = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchStreamReader: ... + + +def new_file( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + metadata: lib.KeyValueMetadata | dict[bytes, bytes] | None = None, +) -> RecordBatchFileWriter: ... + + +def open_file( + source: StrPath | bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: Any = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchFileReader: ... + + +def serialize_pandas( + df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None +) -> lib.Buffer: ... + + +def deserialize_pandas( + buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... + + +__all__ = [ + "Alignment", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "MetadataVersion", + "ReadStats", + "RecordBatchReader", + "WriteStats", + "_ReadPandasMixin", + "get_record_batch_size", + "get_tensor_size", + "read_message", + "read_record_batch", + "read_schema", + "read_tensor", + "write_tensor", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "new_stream", + "open_stream", + "new_file", + "open_file", + "serialize_pandas", + "deserialize_pandas", +] diff --git a/python/pyarrow-stubs/pyarrow/json.pyi b/python/pyarrow-stubs/pyarrow/json.pyi new file mode 100644 index 000000000000..67768db42e43 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/json.pyi @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json + +__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/pyarrow-stubs/pyarrow/lib.pyi b/python/pyarrow-stubs/pyarrow/lib.pyi new file mode 100644 index 000000000000..6bd9b7857bf2 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/lib.pyi @@ -0,0 +1,133 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Iterator, NamedTuple + +from .array import * # noqa: F401, F403 +from .builder import * # noqa: F401, F403 +from .compat import * # noqa: F401, F403 +from .config import * # noqa: F401, F403 +from .device import * # noqa: F401, F403 +from .error import * # noqa: F401, F403 +from .io import * # noqa: F401, F403 +from ._ipc import * # noqa: F401, F403 +from .memory import * # noqa: F401, F403 +from .pandas_shim import * # noqa: F401, F403 +from .scalar import * # noqa: F401, F403 +from .table import * # noqa: F401, F403 +from .tensor import * # noqa: F401, F403 +from ._types import * # noqa: F401, F403 +from .memory import MemoryPool +from .array import Array +from ._types import DataType + + +class MonthDayNano(tuple): + months: int + days: int + nanoseconds: int + + def __new__( + cls, + sequence: tuple[int, int, int] | list[int] = ..., + ) -> MonthDayNano: ... + + +def cpu_count() -> int: ... + + +def set_cpu_count(count: int) -> None: ... + + +def is_threading_enabled() -> bool: ... + + +def arange( + start: int, stop: int, step: int = 1, *, memory_pool: MemoryPool | None = None +) -> Array: ... + + +def is_boolean_value(obj: object) -> bool: ... + + +def is_integer_value(obj: object) -> bool: ... + + +def is_float_value(obj: object) -> bool: ... + + +def tzinfo_to_string(tz: object) -> str: ... + + +def string_to_tzinfo(tz: str) -> object: ... + + +def _ndarray_to_arrow_type(values: object, type_: object) -> object: ... + + +def _is_primitive(type_id: int) -> bool: ... + + +def ensure_type(ty: object) -> DataType: ... 
+ + +Type_NA: int +Type_BOOL: int +Type_UINT8: int +Type_INT8: int +Type_UINT16: int +Type_INT16: int +Type_UINT32: int +Type_INT32: int +Type_UINT64: int +Type_INT64: int +Type_HALF_FLOAT: int +Type_FLOAT: int +Type_DOUBLE: int +Type_DECIMAL32: int +Type_DECIMAL64: int +Type_DECIMAL128: int +Type_DECIMAL256: int +Type_DATE32: int +Type_DATE64: int +Type_TIMESTAMP: int +Type_TIME32: int +Type_TIME64: int +Type_DURATION: int +Type_INTERVAL_MONTHS: int +Type_INTERVAL_DAY_TIME: int +Type_INTERVAL_MONTH_DAY_NANO: int +Type_BINARY: int +Type_STRING: int +Type_LARGE_BINARY: int +Type_LARGE_STRING: int +Type_FIXED_SIZE_BINARY: int +Type_BINARY_VIEW: int +Type_STRING_VIEW: int +Type_LIST: int +Type_LARGE_LIST: int +Type_LIST_VIEW: int +Type_LARGE_LIST_VIEW: int +Type_MAP: int +Type_FIXED_SIZE_LIST: int +Type_STRUCT: int +Type_SPARSE_UNION: int +Type_DENSE_UNION: int +Type_DICTIONARY: int +Type_RUN_END_ENCODED: int +UnionMode_SPARSE: int +UnionMode_DENSE: int diff --git a/python/pyarrow-stubs/pyarrow/memory.pyi b/python/pyarrow-stubs/pyarrow/memory.pyi new file mode 100644 index 000000000000..f80e01ab21c0 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/memory.pyi @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow.lib import _Weakrefable + + +class MemoryPool(_Weakrefable): + def release_unused(self) -> None: ... + + def bytes_allocated(self) -> int: ... + + def total_bytes_allocated(self) -> int: ... + + def max_memory(self) -> int | None: ... + + def num_allocations(self) -> int: ... + + def print_stats(self) -> None: ... + + @property + def backend_name(self) -> str: ... + + +class LoggingMemoryPool(MemoryPool): + ... + + +class ProxyMemoryPool(MemoryPool): + ... + + +def default_memory_pool() -> MemoryPool: ... + + +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: ... + + +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: ... + + +def system_memory_pool() -> MemoryPool: ... + + +def jemalloc_memory_pool() -> MemoryPool: ... + + +def mimalloc_memory_pool() -> MemoryPool: ... + + +def set_memory_pool(pool: MemoryPool) -> None: ... + + +def log_memory_allocations(enable: bool = True) -> None: ... + + +def total_allocated_bytes() -> int: ... + + +def jemalloc_set_decay_ms(decay_ms: int) -> None: ... + + +def supported_memory_backends() -> list[str]: ... 
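+
+# Illustrative only (not part of the stubs): typical use of the memory-pool
+# API annotated above.
+#
+#     import pyarrow as pa
+#
+#     pool = pa.default_memory_pool()
+#     print(pool.backend_name, pool.bytes_allocated())
+#     pa.set_memory_pool(pa.system_memory_pool())
+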
+ + +__all__ = [ + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "default_memory_pool", + "proxy_memory_pool", + "logging_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "set_memory_pool", + "log_memory_allocations", + "total_allocated_bytes", + "jemalloc_set_decay_ms", + "supported_memory_backends", +] diff --git a/python/pyarrow-stubs/pyarrow/orc.pyi b/python/pyarrow-stubs/pyarrow/orc.pyi new file mode 100644 index 000000000000..f16350d0ffc9 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/orc.pyi @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from . import _orc +from ._fs import SupportedFileSystem +from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table + + +class ORCFile: + reader: _orc.ORCReader + def __init__(self, source: StrPath | NativeFile | IO) -> None: ... + @property + def metadata(self) -> KeyValueMetadata: ... + + @property + def schema(self) -> Schema: ... + + @property + def nrows(self) -> int: ... + + @property + def nstripes(self) -> int: ... + + @property + def file_version(self) -> str: ... + + @property + def software_version(self) -> str: ... + + @property + def compression(self) -> Literal["UNCOMPRESSED", + "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + + @property + def compression_size(self) -> int: ... + + @property + def writer(self) -> str: ... + + @property + def writer_version(self) -> str: ... + + @property + def row_index_stride(self) -> int: ... + + @property + def nstripe_statistics(self) -> int: ... + + @property + def content_length(self) -> int: ... + + @property + def stripe_statistics_length(self) -> int: ... + + @property + def file_footer_length(self) -> int: ... + + @property + def file_postscript_length(self) -> int: ... + + @property + def file_length(self) -> int: ... + + def read_stripe( + self, n: int, columns: list[str | int] | None = None + ) -> RecordBatch: ... + + def read(self, columns: list[str | int] | None = None) -> Table: ... + + +class ORCWriter: + writer: _orc.ORCWriter + is_open: bool + + def __init__( + self, + where: StrPath | NativeFile | IO, + *, + file_version: Any = "0.12", + batch_size: Any = 1024, + stripe_size: Any = 64 * 1024 * 1024, # noqa: Y011 + compression: Any = "UNCOMPRESSED", + compression_block_size: Any = 65536, + compression_strategy: Any = "SPEED", + row_index_stride: Any = 10000, + padding_tolerance: Any = 0.0, + dictionary_key_size_threshold: Any = 0.0, + bloom_filter_columns: Any = None, + bloom_filter_fpp: Any = 0.05, + ): ... + def __enter__(self) -> Self: ... 
+ def __exit__(self, *args, **kwargs) -> None: ... + def __getattr__(self, name: str) -> Any: ... + def write(self, table: Table) -> None: ... + + def close(self) -> None: ... + + +def read_table( + source: StrPath | NativeFile | IO, + columns: list[str | int] | None = None, + filesystem: SupportedFileSystem | str | None = None, +) -> Table: ... + + +# TODO: should not use Any here? +def write_table( + table: Table, + where: StrPath | NativeFile | IO, + *, + file_version: Any = "0.12", + batch_size: Any = 1024, + stripe_size: Any = 64 * 1024 * 1024, # noqa: Y011 + compression: Any = 'UNCOMPRESSED', + compression_block_size: Any = 65536, + compression_strategy: Any = 'SPEED', + row_index_stride: Any = 10000, + padding_tolerance: Any = 0.0, + dictionary_key_size_threshold: Any = 0.0, + bloom_filter_columns: Any = None, + bloom_filter_fpp: Any = 0.05, +) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/pandas_compat.pyi b/python/pyarrow-stubs/pyarrow/pandas_compat.pyi new file mode 100644 index 000000000000..4e614c58a3fd --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/pandas_compat.pyi @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, TypedDict, TypeVar + +import numpy as np +import pandas as pd + +from pandas import DatetimeTZDtype + +from .lib import Array, DataType, Schema, Table, _pandas_api + +_T = TypeVar("_T") + + +def get_logical_type_map() -> dict[int, str]: ... +def get_logical_type(arrow_type: DataType) -> str: ... +def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... +def get_logical_type_from_numpy(pandas_collection) -> str: ... +def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... + + +class _ColumnMetadata(TypedDict): + name: str + field_name: str + pandas_type: int + numpy_type: str + metadata: dict | None + + +def get_column_metadata( + column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str +) -> _ColumnMetadata: ... + + +def construct_metadata( + columns_to_convert: list[pd.Series], + df: pd.DataFrame, + column_names: list[str], + index_levels: list[pd.Index], + index_descriptors: list[dict], + preserve_index: bool, + types: list[DataType], + column_field_names: list[str] = ..., +) -> dict[bytes, bytes]: ... + + +def dataframe_to_types( + df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None +) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... + + +def dataframe_to_arrays( + df: pd.DataFrame, + schema: Schema, + preserve_index: bool | None, + nthreads: int = 1, + columns: list[str] | None = None, + safe: bool = True, +) -> tuple[Array, Schema, int]: ... +def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... +def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... 
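+
+# Illustrative only (not part of the stubs): these helpers back the public
+# pandas round-trip API, e.g.
+#
+#     import pandas as pd
+#     import pyarrow as pa
+#
+#     df = pd.DataFrame({"a": [1, 2, 3]})
+#     table = pa.Table.from_pandas(df, preserve_index=False)
+#     df_roundtrip = table.to_pandas()
+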
+ + +def table_to_dataframe( + options, + table: Table, + categories=None, + ignore_metadata: bool = False, + types_mapper=None) -> pd.DataFrame: ... + + +def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... + + +__all__ = [ + "_pandas_api", +] diff --git a/python/pyarrow-stubs/pyarrow/pandas_shim.pyi b/python/pyarrow-stubs/pyarrow/pandas_shim.pyi new file mode 100644 index 000000000000..181d78e7a0c9 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/pandas_shim.pyi @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import types as stdlib_types +from collections.abc import Iterable +from typing import Any, TypeGuard + +from pandas import Categorical, DatetimeTZDtype, Index, Series, DataFrame + +from numpy import dtype +from pandas.core.dtypes.base import ExtensionDtype + + +class _PandasAPIShim: + has_sparse: bool + + def series(self, *args, **kwargs) -> Series: ... + def data_frame(self, *args, **kwargs) -> DataFrame: ... + @property + def have_pandas(self) -> bool: ... + @property + def compat(self) -> stdlib_types.ModuleType: ... + @property + def pd(self) -> stdlib_types.ModuleType: ... + def infer_dtype(self, obj: Iterable) -> str: ... + def pandas_dtype(self, dtype: str) -> dtype: ... + @property + def loose_version(self) -> Any: ... + @property + def version(self) -> str: ... + def is_v1(self) -> bool: ... + def is_ge_v21(self) -> bool: ... + def is_ge_v23(self) -> bool: ... + def is_ge_v3(self) -> bool: ... + def uses_string_dtype(self) -> bool: ... + @property + def categorical_type(self) -> type[Categorical]: ... + @property + def datetimetz_type(self) -> type[DatetimeTZDtype]: ... + @property + def extension_dtype(self) -> type[ExtensionDtype]: ... + + def is_array_like( + self, obj: Any + ) -> TypeGuard[Series | Index | Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[DatetimeTZDtype]: ... + def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... + def is_sparse(self, obj: Any) -> bool: ... + def is_data_frame(self, obj: Any) -> TypeGuard[DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[Series]: ... + def is_index(self, obj: Any) -> TypeGuard[Index]: ... + def get_values(self, obj: Any) -> bool: ... + def get_rangeindex_attribute(self, level, name): ... 
+ + +_pandas_api: _PandasAPIShim + +__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/python/pyarrow-stubs/pyarrow/parquet/__init__.pyi b/python/pyarrow-stubs/pyarrow/parquet/__init__.pyi new file mode 100644 index 000000000000..5329bd6c66af --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/parquet/__init__.pyi @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from .core import * # noqa: F401, F403 diff --git a/python/pyarrow-stubs/pyarrow/parquet/core.pyi b/python/pyarrow-stubs/pyarrow/parquet/core.pyi new file mode 100644 index 000000000000..83326c717aeb --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/parquet/core.pyi @@ -0,0 +1,372 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys + +from pathlib import Path + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Callable, Iterator, Iterable, Sequence +from typing import IO, Literal + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow import _parquet +from pyarrow._compute import Expression +from pyarrow._fs import FileSystem, SupportedFileSystem +from pyarrow._parquet import ( + ColumnChunkMetaData, + ColumnSchema, + FileDecryptionProperties, + FileEncryptionProperties, + FileMetaData, + ParquetLogicalType, + ParquetReader, + ParquetSchema, + RowGroupMetaData, + SortingColumn, + Statistics, +) +from pyarrow._stubs_typing import FilterTuple, SingleOrList +from pyarrow.dataset import ParquetFileFragment, Partitioning, PartitioningFactory +from pyarrow.lib import Buffer, NativeFile, RecordBatch, Schema, Table, ChunkedArray +from typing_extensions import deprecated + +__all__ = ( + "ColumnChunkMetaData", + "ColumnSchema", + "FileDecryptionProperties", + "FileEncryptionProperties", + "FileMetaData", + "ParquetDataset", + "ParquetFile", + "ParquetLogicalType", + "ParquetReader", + "ParquetSchema", + "ParquetWriter", + "RowGroupMetaData", + "SortingColumn", + "Statistics", + "read_metadata", + "read_pandas", + "read_schema", + "read_table", + "write_metadata", + "write_table", + "write_to_dataset", + "_filters_to_expression", + "filters_to_expression", +) + + +def filters_to_expression( + filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + + +@deprecated("use filters_to_expression") +def _filters_to_expression( + filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + + +_Compression: TypeAlias = Literal["gzip", "bz2", + "brotli", "lz4", "zstd", "snappy", "none"] + + +class ParquetFile: + reader: ParquetReader + common_metadata: FileMetaData + + def __init__( + self, + source: str | Path | Buffer | NativeFile | IO, + *, + metadata: FileMetaData | None = None, + common_metadata: FileMetaData | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + filesystem: SupportedFileSystem | None = None, + page_checksum_verification: bool = False, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def schema_arrow(self) -> Schema: ... + @property + def num_row_groups(self) -> int: ... + def close(self, force: bool = False) -> None: ... + @property + def closed(self) -> bool: ... + + def read_row_group( + self, + i: int, + columns: Sequence[str | int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def read_row_groups( + self, + row_groups: Sequence[int], + columns: Iterable[str | int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def iter_batches( + self, + batch_size: int = 65536, + row_groups: Sequence[int] | None = None, + columns: Iterable[str | int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: ... 
+ + def read( + self, + columns: Sequence[str | int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def scan_contents( + self, columns: Iterable[str | int] | None = None, batch_size: int = 65536 + ) -> int: ... + + +class ParquetWriter: + flavor: str + schema_changed: bool + schema: ParquetSchema + where: str | Path | IO + file_handler: NativeFile | None + writer: _parquet.ParquetWriter + is_open: bool + + def __init__( + self, + where: str | Path | IO | NativeFile, + schema: Schema, + filesystem: SupportedFileSystem | None = None, + flavor: str | None = None, + version: Literal["1.0", "2.4", "2.6"] = ..., + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool | list = False, + column_encoding: str | dict | None = None, + writer_engine_version=None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + max_rows_per_page: int | None = None, + **options, + ) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> Literal[False]: ... + + def write( + self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None + ) -> None: ... + def write_batch(self, batch: RecordBatch, + row_group_size: int | None = None) -> None: ... + + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... + def close(self) -> None: ... + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: ... + + +class ParquetDataset: + def __init__( + self, + path_or_paths: SingleOrList[str] + | SingleOrList[Path] + | SingleOrList[NativeFile] + | SingleOrList[IO], + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + *, + filters: Expression + | FilterTuple + | list[FilterTuple] + | list[list[FilterTuple]] + | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str + | list[str] + | Partitioning + | PartitioningFactory + | None = "hive", + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + def equals(self, other: ParquetDataset) -> bool: ... + @property + def schema(self) -> Schema: ... + + def read( + self, + columns: list[str] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + def read_pandas(self, **kwargs) -> Table: ... + @property + def fragments(self) -> list[ParquetFileFragment]: ... + @property + def files(self) -> list[str]: ... + @property + def filesystem(self) -> FileSystem: ... + @property + def partitioning(self) -> Partitioning: ... 
+ + +def read_table( + source: SingleOrList[str] + | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO] | Buffer, + *, + columns: list | None = None, + use_threads: bool = True, + schema: Schema | None = None, + use_pandas_metadata: bool = False, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | PartitioningFactory | None = "hive", + filesystem: SupportedFileSystem | str | None = None, + filters: Expression + | FilterTuple + | list[FilterTuple] + | Sequence[Sequence[tuple]] + | None = None, + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, +) -> Table: ... + + +def read_pandas( + source: str | Path | NativeFile | IO | Buffer, columns: list | None = None, **kwargs +) -> Table: ... + + +def write_table( + table: Table, + where: str | Path | NativeFile | IO, + row_group_size: int | None = None, + version: Literal["1.0", "2.4", "2.6"] = "2.6", + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + coerce_timestamps: str | None = None, + allow_truncated_timestamps: bool = False, + data_page_size: int | None = None, + flavor: str | None = None, + filesystem: SupportedFileSystem | str | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool = False, + column_encoding: str | dict | None = None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **kwargs, +) -> None: ... + + +def write_to_dataset( + table: Table | ChunkedArray, + root_path: str | Path, + partition_cols: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + partitioning: Partitioning | list[str] | None = None, + basename_template: str | None = None, + use_threads: bool | None = None, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] + | None = None, + **kwargs, +) -> None: ... + + +def write_metadata( + schema: Schema, + where: str | NativeFile, + metadata_collector: list[FileMetaData] | None = None, + filesystem: SupportedFileSystem | None = None, + **kwargs, +) -> None: ... + + +def read_metadata( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | str | None = None, +) -> FileMetaData: ... + + +def read_schema( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | str | None = None, +) -> Schema: ... 
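A companion sketch for the module-level helpers declared above (write_table, filters_to_expression, read_table, read_metadata); "example.parquet" is a placeholder path, not part of this patch.

import pyarrow as pa
import pyarrow.parquet as pq

t = pa.table({"year": [2023, 2024], "value": [1.0, 2.0]})
pq.write_table(t, "example.parquet", compression="zstd")

# DNF-style filter tuples can be passed directly or converted to an Expression.
expr = pq.filters_to_expression([("year", ">=", 2024)])
result = pq.read_table("example.parquet", filters=expr)   # Table
meta = pq.read_metadata("example.parquet")                # FileMetaData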
diff --git a/python/pyarrow-stubs/pyarrow/parquet/encryption.pyi b/python/pyarrow-stubs/pyarrow/parquet/encryption.pyi new file mode 100644 index 000000000000..7add1c6fa535 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/parquet/encryption.pyi @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._parquet_encryption import ( + CryptoFactory, + DecryptionConfiguration, + EncryptionConfiguration, + FileSystemKeyMaterialStore, + KmsClient, + KmsConnectionConfig, +) + +__all__ = [ + "CryptoFactory", + "DecryptionConfiguration", + "EncryptionConfiguration", + "FileSystemKeyMaterialStore", + "KmsClient", + "KmsConnectionConfig", +] diff --git a/python/pyarrow-stubs/pyarrow/scalar.pyi b/python/pyarrow-stubs/pyarrow/scalar.pyi new file mode 100644 index 000000000000..70b2ea2b3479 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/scalar.pyi @@ -0,0 +1,466 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import collections.abc +import datetime as dt +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Iterator +from typing import Any, Generic, Literal + +import numpy as np + +from pyarrow._compute import CastOptions +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from pyarrow.table import ArrayOrChunkedArray +from typing_extensions import TypeVar + +from ._types import ( # noqa: F401 + DataType, + Decimal128Type, + Date32Type, + Date64Type, + Time32Type, + Time64Type, + TimestampType, + Decimal256Type, + NullType, + BoolType, + UInt8Type, + Int8Type, + DurationType, MonthDayNanoIntervalType, BinaryType, LargeBinaryType, + FixedSizeBinaryType, StringType, LargeStringType, BinaryViewType, StringViewType, + FixedSizeListType, + Float16Type, Float32Type, Float64Type, Decimal32Type, Decimal64Type, + LargeListType, + LargeListViewType, + ListType, + ListViewType, + OpaqueType, DictionaryType, MapType, _BasicDataType, + StructType, RunEndEncodedType, + UInt16Type, Int16Type, UInt32Type, Int32Type, UInt64Type, Int64Type, + UnionType, ExtensionType, BaseExtensionType, Bool8Type, UuidType, JsonType, + _BasicValueT, + _DataTypeT, + _IndexT, + _K, + _Precision, + _RunEndType, + _Scale, + _Size, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, + _ValueT, +) + +_AsPyTypeK = TypeVar("_AsPyTypeK") +_AsPyTypeV = TypeVar("_AsPyTypeV") +_DataType_co = TypeVar("_DataType_co", bound=DataType, covariant=True) + + +class Scalar(_Weakrefable, Generic[_DataType_co]): + @property + def type(self) -> _DataType_co: ... + + @property + def is_valid(self) -> bool: ... + + def cast( + self, + target_type: None | _DataTypeT | str, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self | Scalar[_DataTypeT] | Scalar[Any]: ... + + def validate(self, *, full: bool = False) -> None: ... + + def equals(self, other: Scalar | ArrayOrChunkedArray) -> bool: ... + + def __hash__(self) -> int: ... + + def as_py(self: Scalar[Any], *, maps_as_pydicts: Literal["lossy", + "strict"] | None = None) -> Any: ... + + def as_buffer(self) -> Buffer | None: ... + + # Buffer protocol support + def __buffer__(self, flags: int) -> memoryview: ... + + # Methods for structured types (StructScalar, MapScalar, ListScalar, etc.) + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[Any]: ... + + def __getitem__(self, key: int | str) -> Any: ... + + def __contains__(self, key: object) -> bool: ... + + def keys(self) -> Iterator[str]: ... + + def items(self) -> Iterator[tuple[str, Any]]: ... + + @property + def values(self) -> Any: ... + + # Methods for compatibility with array-like interface + def to_pylist(self) -> list: ... + def tolist(self) -> list: ... + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> Any: ... + + +_NULL: NullScalar +NA: NullScalar + + +class NullScalar(Scalar[NullType]): + ... + + +class BooleanScalar(Scalar[BoolType]): + ... + + +class UInt8Scalar(Scalar[UInt8Type]): + ... + + +class Int8Scalar(Scalar[Int8Type]): + ... + + +class UInt16Scalar(Scalar[UInt16Type]): + ... + + +class Int16Scalar(Scalar[Int16Type]): + ... + + +class UInt32Scalar(Scalar[UInt32Type]): + ... + + +class Int32Scalar(Scalar[Int32Type]): + ... + + +class UInt64Scalar(Scalar[UInt64Type]): + ... + + +class Int64Scalar(Scalar[Int64Type]): + ... + + +class HalfFloatScalar(Scalar[Float16Type]): + ... 
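To make the intent of the generic Scalar base class concrete, here is a small runtime sketch; the subclasses named in the comments are what pyarrow returns at runtime, while the signatures above describe what a type checker sees.

import pyarrow as pa

s = pa.scalar(42)           # Int64Scalar at runtime
assert s.is_valid
print(s.type)               # int64
print(s.as_py())            # 42 (typed as Any by Scalar.as_py above)

d = s.cast(pa.float64())    # DoubleScalar at runtime
print(d.as_py())            # 42.0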
+ + +class FloatScalar(Scalar[Float32Type]): + ... + + +class DoubleScalar(Scalar[Float64Type]): + ... + + +class Decimal32Scalar(Scalar[Decimal32Type[_Precision, _Scale]]): + ... + + +class Decimal64Scalar(Scalar[Decimal64Type[_Precision, _Scale]]): + ... + + +class Decimal128Scalar(Scalar[Decimal128Type[_Precision, _Scale]]): + ... + + +class Decimal256Scalar(Scalar[Decimal256Type[_Precision, _Scale]]): + ... + + +class Date32Scalar(Scalar[Date32Type]): + ... + + +class Date64Scalar(Scalar[Date64Type]): + @property + def value(self) -> dt.date | None: ... + + +class Time32Scalar(Scalar[Time32Type[_Time32Unit]]): + @property + def value(self) -> dt.time | None: ... + + +class Time64Scalar(Scalar[Time64Type[_Time64Unit]]): + @property + def value(self) -> dt.time | None: ... + + +class TimestampScalar(Scalar[TimestampType[_Unit, _Tz]]): + @property + def value(self) -> int | None: ... + + +class DurationScalar(Scalar[DurationType[_Unit]]): + @property + def value(self) -> dt.timedelta | None: ... + + +class MonthDayNanoIntervalScalar(Scalar[MonthDayNanoIntervalType]): + @property + def value(self) -> MonthDayNano | None: ... + + +class BinaryScalar(Scalar[BinaryType]): + def as_buffer(self) -> Buffer: ... + + +class LargeBinaryScalar(Scalar[LargeBinaryType]): + def as_buffer(self) -> Buffer: ... + + +class FixedSizeBinaryScalar(Scalar[FixedSizeBinaryType]): + def as_buffer(self) -> Buffer: ... + + +class StringScalar(Scalar[StringType]): + def as_buffer(self) -> Buffer: ... + + +class LargeStringScalar(Scalar[LargeStringType]): + def as_buffer(self) -> Buffer: ... + + +class BinaryViewScalar(Scalar[BinaryViewType]): + def as_buffer(self) -> Buffer: ... + + +class StringViewScalar(Scalar[StringViewType]): + def as_buffer(self) -> Buffer: ... + + +class ListScalar(Scalar[ListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class FixedSizeListScalar(Scalar[FixedSizeListType[_DataTypeT, _Size]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class LargeListScalar(Scalar[LargeListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class ListViewScalar(Scalar[ListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class LargeListViewScalar(Scalar[LargeListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class StructScalar(Scalar[StructType], collections.abc.Mapping[str, Scalar]): + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[str]: ... + + def __getitem__(self, key: int | str) -> Scalar[Any]: ... + + def keys(self) -> collections.abc.KeysView[str]: # type: ignore[override] + ... + + def items(self) -> collections.abc.ItemsView[str, Scalar[Any]]: # type: ignore[override] # noqa: E501 + ... 
+ + def _as_py_tuple(self) -> list[tuple[str, Any]]: ... + + +class MapScalar(Scalar[MapType[_K, _ValueT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> ( + tuple[Scalar[_K], _ValueT, Any] | Scalar[Any]): ... + + def __iter__(self: Scalar[ + MapType[_BasicDataType[_AsPyTypeK], _BasicDataType[_AsPyTypeV]]] + | Scalar[MapType[Any, _BasicDataType[_AsPyTypeV]]] + | Scalar[MapType[_BasicDataType[_AsPyTypeK], Any]]) -> ( + Iterator[tuple[_AsPyTypeK, _AsPyTypeV]] + | Iterator[tuple[Any, _AsPyTypeV]] + | Iterator[tuple[_AsPyTypeK, Any]] + ): ... + + +class DictionaryScalar(Scalar[DictionaryType[_IndexT, _BasicValueT]]): + @property + def index(self) -> Scalar[_IndexT]: ... + + @property + def value(self) -> Scalar[_BasicValueT]: ... + + @property + def dictionary(self) -> Array: ... + + +class RunEndEncodedScalar(Scalar[RunEndEncodedType[_RunEndType, _BasicValueT]]): + @property + def value(self) -> tuple[int, _BasicValueT] | None: ... + + +class UnionScalar(Scalar[UnionType]): + @property + def value(self) -> Any | None: ... + + @property + def type_code(self) -> str: ... + + +class ExtensionScalar(Scalar[ExtensionType]): + @property + def value(self) -> Any | None: ... + + @staticmethod + def from_storage(typ: BaseExtensionType, value) -> ExtensionScalar: ... + + +class Bool8Scalar(Scalar[Bool8Type]): + ... + + +class UuidScalar(Scalar[UuidType]): + ... + + +class JsonScalar(Scalar[JsonType]): + ... + + +class OpaqueScalar(Scalar[OpaqueType]): + ... + + +class FixedShapeTensorScalar(ExtensionScalar): + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> ( + np.ndarray): ... # type: ignore[override] + + def to_tensor(self) -> Tensor: ... + + +def scalar( + value: Any, + type: _DataTypeT | str | None = None, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Scalar[_DataTypeT] | Scalar[Any]: ... + + +__all__ = [ + "Scalar", + "_NULL", + "NA", + "NullScalar", + "BooleanScalar", + "UInt8Scalar", + "Int8Scalar", + "UInt16Scalar", + "Int16Scalar", + "UInt32Scalar", + "Int32Scalar", + "UInt64Scalar", + "Int64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "FixedSizeBinaryScalar", + "StringScalar", + "LargeStringScalar", + "BinaryViewScalar", + "StringViewScalar", + "ListScalar", + "FixedSizeListScalar", + "LargeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "StructScalar", + "MapScalar", + "DictionaryScalar", + "RunEndEncodedScalar", + "UnionScalar", + "ExtensionScalar", + "FixedShapeTensorScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "scalar", +] diff --git a/python/pyarrow-stubs/pyarrow/substrait.pyi b/python/pyarrow-stubs/pyarrow/substrait.pyi new file mode 100644 index 000000000000..b78bbd8aebd7 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/substrait.pyi @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._substrait import ( + BoundExpressions, + SubstraitSchema, + deserialize_expressions, + deserialize_schema, + get_supported_functions, + run_query, + serialize_expressions, + serialize_schema, +) + +__all__ = [ + "BoundExpressions", + "get_supported_functions", + "run_query", + "deserialize_expressions", + "serialize_expressions", + "deserialize_schema", + "serialize_schema", + "SubstraitSchema", +] diff --git a/python/pyarrow-stubs/pyarrow/table.pyi b/python/pyarrow-stubs/pyarrow/table.pyi new file mode 100644 index 000000000000..6dd61674d40c --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/table.pyi @@ -0,0 +1,686 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
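The substrait stub above only re-exports the Cython module's symbols; a minimal sketch of exercising one of them, assuming a pyarrow build with Substrait support:

import pyarrow.substrait as substrait

# Identifiers of Substrait functions the engine can execute.
supported = substrait.get_supported_functions()   # list[str]
print(len(supported), supported[:3])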
+ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from collections.abc import ( + Collection, Generator, Iterable, Iterator, Sequence, Mapping) +from typing import Any, Generic, Literal, TypeVar +import builtins + +import numpy as np +import pandas as pd + +from numpy.typing import NDArray +from pyarrow._compute import ( + CastOptions, + CountOptions, + FunctionOptions, + ScalarAggregateOptions, + TDigestOptions, + VarianceOptions, +) +from pyarrow._stubs_typing import ( + Indices, + Mask, + NullEncoding, + NullSelectionBehavior, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportArrowStream, +) +from pyarrow.compute import Expression +from pyarrow.interchange.dataframe import _PyArrowDataFrame +from pyarrow.lib import Device, MemoryManager, MemoryPool, Schema +from pyarrow.lib import Field as _Field + +from .array import Array, StructArray, _CastAs, _PandasConvertible +from .device import DeviceAllocationType +from .io import Buffer +from ._ipc import RecordBatchReader +from .scalar import BooleanScalar, Int64Scalar, Scalar, StructScalar +from .tensor import Tensor +from ._stubs_typing import NullableCollection +from ._types import DataType, _AsPyType, _BasicDataType, _DataTypeT + +Field: TypeAlias = _Field[DataType] +_ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) +ArrayOrChunkedArray: TypeAlias = Array[_Scalar_co] | ChunkedArray[_Scalar_co] + +_Aggregation: TypeAlias = Literal[ + "all", + "any", + "approximate_median", + "count", + "count_all", + "count_distinct", + "distinct", + "first", + "first_last", + "last", + "list", + "max", + "mean", + "min", + "min_max", + "one", + "product", + "stddev", + "sum", + "tdigest", + "variance", +] +_AggregationPrefixed: TypeAlias = Literal[ + "hash_all", + "hash_any", + "hash_approximate_median", + "hash_count", + "hash_count_all", + "hash_count_distinct", + "hash_distinct", + "hash_first", + "hash_first_last", + "hash_last", + "hash_list", + "hash_max", + "hash_mean", + "hash_min", + "hash_min_max", + "hash_one", + "hash_product", + "hash_stddev", + "hash_sum", + "hash_tdigest", + "hash_variance", +] +Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed | str +AggregateOptions: TypeAlias = (ScalarAggregateOptions | CountOptions + | TDigestOptions | VarianceOptions | FunctionOptions) + +UnarySelector: TypeAlias = str +NullarySelector: TypeAlias = tuple[()] +NarySelector: TypeAlias = list[str] | tuple[str, ...] +ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector + + +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + + def as_py(self) -> list[Any]: ... + + @property + def data(self) -> Self: ... + @property + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: ... + + # Private attribute used internally for column names + _name: str | None + + def length(self) -> int: ... + + __len__ = length + + def to_string( + self, + *, + indent: int = 0, + window: int = 5, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: ... + + format = to_string + def validate(self, *, full: bool = False) -> None: ... + + @property + def null_count(self) -> int: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... 
+ + def __getitem__( + self, key: int | np.integer | builtins.slice) -> _Scalar_co | Self: ... + + def getitem(self, i: int) -> Scalar: ... + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[BooleanScalar]: ... + + def is_nan(self) -> ChunkedArray[BooleanScalar]: ... + + def is_valid(self) -> ChunkedArray[BooleanScalar]: ... + + def cast( + self, target_type: _CastAs | str | None, safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None + ) -> Self | ChunkedArray[Scalar[_CastAs]]: ... + + def fill_null(self, fill_value: Scalar[_DataTypeT] | Any) -> Self: ... + + def equals(self, other: Self | Any) -> bool: ... + + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: ... + + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ... + + def flatten(self, memory_pool: MemoryPool | + None = None) -> list[ChunkedArray[Any]]: ... + + def combine_chunks(self, memory_pool: MemoryPool | + None = None) -> Array[_Scalar_co]: ... + + def unique(self) -> ChunkedArray[_Scalar_co]: ... + + def value_counts(self) -> StructArray: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def filter(self, mask: Mask, + null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ... + + def index( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + value: Scalar[_DataTypeT] | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + + def take(self, indices: Indices) -> Self: ... + + def drop_null(self) -> Self: ... + + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + + @property + def num_chunks(self) -> int: ... + + def chunk(self, i: int) -> Array[_Scalar_co]: ... + + @property + def chunks(self) -> list[Array[_Scalar_co]]: ... + + def iterchunks( + self: ArrayOrChunkedArray[_ScalarT], + ) -> Generator[Array, None, None]: ... + + def __iter__(self) -> Iterator[_Scalar_co]: ... + + def to_pylist( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: ... + + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: ... + + @property + def is_cpu(self) -> bool: ... + + +def chunked_array( + arrays: Iterable[NullableCollection[Any]] + | Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray] + | Iterable[Array[_ScalarT]] | Array[_ScalarT] + | SupportArrowArray | SupportArrowStream, + type: DataType | str | None = None, +) -> ChunkedArray[Scalar[Any]] | ChunkedArray[_ScalarT]: ... + + +_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) + + +class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: ... + + def __getitem__(self, key: int | str | slice) -> _ColumnT | Self: ... + + def __len__(self) -> int: ... + def column(self, i: int | str) -> _ColumnT: ... + + @property + def column_names(self) -> list[str]: ... + + @property + def columns(self) -> list[_ColumnT]: ... 
+ + def drop_null(self) -> Self: ... + + def field(self, i: int | str) -> Field: ... + + @classmethod + def from_pydict( + cls, + mapping: + Mapping[Any, ArrayOrChunkedArray[Any] | list[Any] | np.ndarray | range], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: ... + + @classmethod + def from_pylist( + cls, + mapping: Sequence[Mapping[str, Any]], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: ... + + def itercolumns(self) -> Generator[_ColumnT, None, None]: ... + + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def shape(self) -> tuple[int, int]: ... + + @property + def schema(self) -> Schema: ... + @property + def nbytes(self) -> int: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + + def filter( + self, + mask: Mask | Expression, + null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ... + + def to_pydict( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> dict[str, list[Any]]: ... + + def to_pylist( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> list[dict[str, Any]]: ... + + def to_string(self, *, show_metadata: bool = False, + preview_cols: int = 0) -> str: ... + + def remove_column(self, i: int) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: ... + + def add_column(self, i: int, field_: str | Field, + column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ... + + def append_column( + self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: ... + + +class RecordBatch(_Tabular[Array]): + def validate(self, *, full: bool = False) -> None: ... + + def replace_schema_metadata( + self, + metadata: dict[str, str] + | dict[bytes, bytes] + | dict[bytes, str] + | dict[str, bytes] + | None = None + ) -> Self: ... + + @property + def num_columns(self) -> int: ... + + @property + def num_rows(self) -> int: ... + + @property + def schema(self) -> Schema: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... + + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def equals(self, other: Self | Any, check_metadata: bool = False) -> bool: ... + + def select(self, columns: Iterable[str] | + Iterable[int] | NDArray[np.str_]) -> Self: ... + + def cast(self, target_schema: Schema, safe: bool | None = None, + options: CastOptions | None = None) -> Self: ... + + @classmethod + def from_arrays( + cls, + arrays: Iterable[Any], + names: list[str] | tuple[str, ...] | None = None, + schema: Schema | None = None, + metadata: Mapping[bytes, bytes] + | Mapping[str, str] + | Mapping[bytes, str] + | Mapping[str, bytes] + | None = None, + ) -> Self: ... 
+ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: Sequence[str | int] | None = None, + ) -> Self: ... + + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[StructScalar] + ) -> Self: ... + + def to_struct_array(self) -> StructArray: ... + + def to_tensor( + self, + null_to_nan: bool = False, + row_major: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Tensor: ... + + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): ... + + @classmethod + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: ... + + def __arrow_c_array__(self, requested_schema=None): ... + + def __arrow_c_stream__(self, requested_schema=None): ... + + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: ... + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): ... + + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + @property + def is_cpu(self) -> bool: ... + + def copy_to(self, destination: MemoryManager | Device) -> Self: ... + + +def table_to_blocks(options, table: Table, categories, extension_columns): ... + + +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] + + +class Table(_Tabular[ChunkedArray[Any]]): + def validate(self, *, full: bool = False) -> None: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def select(self, columns: Iterable[str] | + Iterable[int] | NDArray[np.str_]) -> Self: ... + + def replace_schema_metadata( + self, metadata: dict[str, str] + | dict[bytes, bytes] + | dict[bytes, str] + | dict[str, bytes] + | None = None + ) -> Self: ... + + def flatten(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def equals(self, other: Self | Any, check_metadata: bool = False) -> bool: ... + + def cast(self, target_schema: Schema, safe: bool | None = None, + options: CastOptions | None = None) -> Self: ... + + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: Sequence[str | int] | None = None, + safe: bool = True, + ) -> Self: ... + + @classmethod + def from_arrays( + cls, + arrays: + Collection[ArrayOrChunkedArray[Any] | Collection[NDArray[Any]] | list[Any]], + names: list[str] | tuple[str, ...] | None = None, + schema: Schema | None = None, + metadata: Mapping[bytes, bytes] + | Mapping[str, str] + | Mapping[bytes, str] + | Mapping[str, bytes] | None = None, + ) -> Self: ... + + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[StructScalar] + ) -> Self: ... + + def to_struct_array( + self, max_chunksize: int | None = None + ) -> ChunkedArray[StructScalar]: ... + + @classmethod + def from_batches(cls, batches: Iterable[RecordBatch], + schema: Schema | None = None) -> Self: ... 
+ + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: ... + + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: ... + + @property + def schema(self) -> Schema: ... + + @property + def num_columns(self) -> int: ... + + @property + def num_rows(self) -> int: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + + def add_column(self, i: int, field_: str | Field, + column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + + def set_column(self, i: int, field_: str | Field, + column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... + + def drop(self, columns: str | list[str]) -> Self: ... + + def group_by(self, keys: str | list[str], + use_threads: bool = True) -> TableGroupBy: ... + + def join( + self, + right_table: Self, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> Self: ... + + def join_asof( + self, + right_table: Self, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> Self: ... + + def __arrow_c_stream__(self, requested_schema=None): ... + + @property + def is_cpu(self) -> bool: ... + + +def record_batch( + data: Mapping[str, list[Any] | Array[Any]] + | Collection[Array[Any] | ChunkedArray[Any] | list[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowDeviceArray, + names: list[str] | Schema | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, +) -> RecordBatch: ... + + +def table( + data: Collection[ArrayOrChunkedArray[Any] | list[Any] | range | str] + | pd.DataFrame + | SupportArrowArray + | SupportArrowStream + | SupportArrowDeviceArray + | Mapping[str, list[Any] | Array[Any] | ChunkedArray[Any] | range] + | Mapping[str, Any], + names: list[str] | Schema | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + nthreads: int | None = None, +) -> Table: ... + + +def concat_tables( + tables: Iterable[Table], + memory_pool: MemoryPool | None = None, + promote_options: Literal["none", "default", "permissive"] = "none", + **kwargs: Any, +) -> Table: ... + + +class TableGroupBy: + + keys: str | list[str] + + def __init__(self, table: Table, keys: str | + list[str], use_threads: bool = True): ... + + def aggregate( + self, + aggregations: Iterable[ + tuple[ColumnSelector, Aggregation] + | tuple[ColumnSelector, Aggregation, AggregateOptions | None] + ], + ) -> Table: ... + + def _table(self) -> Table: ... + @property + def _use_threads(self) -> bool: ... + + +def concat_batches( + recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None +) -> RecordBatch: ... 
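A short sketch exercising the tabular annotations above (Table, ChunkedArray, group_by/aggregate, batching):

import pyarrow as pa

t = pa.table({"key": ["a", "a", "b"], "value": [1, 2, 3]})   # Table
col = t["value"]                                             # ChunkedArray
print(col.null_count, col.to_pylist())

summed = t.group_by("key").aggregate([("value", "sum")])     # Table
batches = t.to_batches(max_chunksize=2)                      # list[RecordBatch]
combined = pa.concat_tables([t, t]).combine_chunks()         # Table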
+
+
+__all__ = [
+    "ChunkedArray",
+    "chunked_array",
+    "_Tabular",
+    "RecordBatch",
+    "table_to_blocks",
+    "Table",
+    "record_batch",
+    "table",
+    "concat_tables",
+    "TableGroupBy",
+    "concat_batches",
+    "Aggregation",
+    "AggregateOptions",
+]
diff --git a/python/pyarrow-stubs/pyarrow/tensor.pyi b/python/pyarrow-stubs/pyarrow/tensor.pyi
new file mode 100644
index 000000000000..ba40c7b299db
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/tensor.pyi
@@ -0,0 +1,268 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+if sys.version_info >= (3, 11):
+    from typing import Self
+else:
+    from typing_extensions import Self
+
+from collections.abc import Sequence
+import numpy as np
+
+from pyarrow.lib import _Weakrefable
+from pyarrow._types import DataType
+from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
+from sparse import COO  # type: ignore[import-untyped, import-not-found]
+
+
+class Tensor(_Weakrefable):
+    @classmethod
+    def from_numpy(cls, obj: np.ndarray,
+                   dim_names: Sequence[str] | None = None) -> Self: ...
+
+    def to_numpy(self) -> np.ndarray: ...
+
+    def equals(self, other: Tensor) -> bool: ...
+
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+
+    @property
+    def is_contiguous(self) -> bool: ...
+
+    @property
+    def ndim(self) -> int: ...
+
+    @property
+    def size(self) -> int: ...
+
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+
+    @property
+    def strides(self) -> tuple[int, ...]: ...
+
+    @property
+    def type(self) -> DataType: ...
+
+
+class SparseCOOTensor(_Weakrefable):
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray,
+                         dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        coords: np.ndarray,
+        shape: Sequence[int],
+        dim_names: Sequence[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: coo_matrix,
+                   dim_names: Sequence[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_pydata_sparse(
+        cls, obj: COO, dim_names: Sequence[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> coo_matrix: ...
+
+    def to_pydata_sparse(self) -> COO: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+    @property
+    def has_canonical_format(self) -> bool: ...
+    @property
+    def type(self) -> DataType: ...
+
+
+class SparseCSRMatrix(_Weakrefable):
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray,
+                         dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: np.ndarray,
+        indices: np.ndarray,
+        shape: Sequence[int],
+        dim_names: Sequence[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: csr_matrix,
+                   dim_names: Sequence[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> csr_matrix: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+    @property
+    def type(self) -> DataType: ...
+
+
+class SparseCSCMatrix(_Weakrefable):
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray,
+                         dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: np.ndarray,
+        indices: np.ndarray,
+        shape: tuple[int, ...],
+        dim_names: list[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: csc_matrix,
+                   dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> csc_matrix: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+
+
+class SparseCSFTensor(_Weakrefable):
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray,
+                         dim_names: Sequence[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: Sequence[np.ndarray],
+        indices: Sequence[np.ndarray],
+        shape: tuple[int, ...],
+        axis_order: Sequence[int] | None = None,
+        dim_names: Sequence[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+    @property
+    def type(self) -> DataType: ...
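A minimal sketch of the dense/sparse tensor annotations above; it assumes numpy is available (scipy is only needed for the from_scipy/to_scipy round trips):

import numpy as np
import pyarrow as pa

dense = np.arange(6, dtype=np.float64).reshape(2, 3)
tensor = pa.Tensor.from_numpy(dense, dim_names=["row", "col"])
print(tensor.shape, tensor.ndim, tensor.type)        # (2, 3) 2 double

sparse = pa.SparseCOOTensor.from_dense_numpy(dense)
data, coords = sparse.to_numpy()                     # matches the tuple annotation
print(sparse.non_zero_length)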
+ + +__all__ = [ + "Tensor", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", +] diff --git a/python/pyarrow-stubs/pyarrow/tests/util.pyi b/python/pyarrow-stubs/pyarrow/tests/util.pyi new file mode 100644 index 000000000000..5ceb784588a7 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/tests/util.pyi @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable +from contextlib import AbstractContextManager +from decimal import Decimal +from os import PathLike +from typing import Any, Literal +import socket + +import pyarrow.fs + + +def randsign() -> int: ... +def random_seed(seed: int) -> AbstractContextManager[None]: ... +def randdecimal(precision: int, scale: int) -> Decimal: ... +def random_ascii(length: int) -> bytes: ... +def rands(nchars: int) -> str: ... +def get_modified_env_with_pythonpath() -> dict[str, str]: ... +def invoke_script(script_name: str, *args: str) -> None: ... +def changed_environ(name: str, value: str) -> AbstractContextManager[None]: ... +def change_cwd(path: str | PathLike[str]) -> AbstractContextManager[None]: ... +def disabled_gc() -> AbstractContextManager[None]: ... +def _filesystem_uri(path: str) -> str: ... + + +def memory_leak_check( + f: Callable[[], Any], + metric: Literal['rss', 'vms', 'shared'] = 'rss', + threshold: int = 131072, + iterations: int = 10, + check_interval: int = 1 +) -> None: ... + + +class FSProtocolClass: + def __init__(self, path: str | PathLike[str]) -> None: ... + def __fspath__(self) -> str: ... + + +class ProxyHandler(pyarrow.fs.FileSystemHandler): + _fs: pyarrow.fs.FileSystem + def __init__(self, fs: pyarrow.fs.FileSystem) -> None: ... + def __eq__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... + def get_type_name(self) -> str: ... + def normalize_path(self, path: str) -> str: ... + def get_file_info(self, paths: list[str]) -> list[pyarrow.fs.FileInfo]: ... + def get_file_info_selector( + self, selector: pyarrow.fs.FileSelector) -> list[pyarrow.fs.FileInfo]: ... + + def create_dir(self, path: str, recursive: bool) -> None: ... + def delete_dir(self, path: str) -> None: ... + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ... + def delete_root_dir_contents(self) -> None: ... + def delete_file(self, path: str) -> None: ... + def move(self, src: str, dest: str) -> None: ... + def copy_file(self, src: str, dest: str) -> None: ... + def open_input_stream(self, path: str) -> Any: ... + def open_input_file(self, path: str) -> Any: ... + def open_output_stream(self, path: str, metadata: dict[str, str]) -> Any: ... + def open_append_stream(self, path: str, metadata: dict[str, str]) -> Any: ... 
+ + +def _ensure_minio_component_version(component: str, minimum_year: int) -> bool: ... +def _run_mc_command(mcdir: str, *args: str) -> None: ... +def windows_has_tzdata() -> bool: ... +def running_on_musllinux() -> bool: ... + + +def signal_wakeup_fd( + *, warn_on_full_buffer: bool = False) -> AbstractContextManager[socket.socket]: ... + + +def _configure_s3_limited_user( + s3_server: dict[str, Any], policy: str, username: str, password: str) -> None: ... + + +def _wait_for_minio_startup( + mcdir: str, address: str, access_key: str, secret_key: str) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/types.pyi b/python/pyarrow-stubs/pyarrow/types.pyi new file mode 100644 index 000000000000..9e5a0568db05 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/types.pyi @@ -0,0 +1,227 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys +from enum import IntEnum + +from typing import Any + +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +import pyarrow.lib as lib + +from pyarrow.lib import ( + BinaryType, + BinaryViewType, + BoolType, + DataType, + Date32Type, + Date64Type, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + DenseUnionType, + DictionaryType, + DurationType, + FixedSizeBinaryType, + FixedSizeListType, + Float16Type, + Float32Type, + Float64Type, + Int8Type, + Int16Type, + Int32Type, + Int64Type, + LargeBinaryType, + LargeListType, + LargeListViewType, + LargeStringType, + ListType, + ListViewType, + MapType, + MonthDayNanoIntervalType, + NullType, + RunEndEncodedType, + SparseUnionType, + StringType, + StringViewType, + StructType, + Time32Type, + Time64Type, + TimestampType, + UInt8Type, + UInt16Type, + UInt32Type, + UInt64Type, +) + +_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type +_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | UInt32Type | UInt64Type +_Integer: TypeAlias = _SignedInteger | _UnsignedInteger +_Floating: TypeAlias = Float16Type | Float32Type | Float64Type +_Decimal: TypeAlias = ( + Decimal32Type[Any, Any] + | Decimal64Type[Any, Any] + | Decimal128Type[Any, Any] + | Decimal256Type[Any, Any] +) +_Date: TypeAlias = Date32Type | Date64Type +_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] +_Interval: TypeAlias = MonthDayNanoIntervalType +_Temporal: TypeAlias = (TimestampType[Any, Any] + | DurationType[Any] | _Time | _Date | _Interval) +_Union: TypeAlias = SparseUnionType | DenseUnionType +_Nested: TypeAlias = ( + ListType[Any] + | FixedSizeListType[Any, Any] + | LargeListType[Any] + | ListViewType[Any] + | LargeListViewType[Any] + | StructType + | MapType[Any, Any, Any] + | _Union +) + + +def 
is_null(t: DataType) -> TypeIs[NullType]: ... +def is_boolean(t: DataType) -> TypeIs[BoolType]: ... +def is_integer(t: DataType) -> TypeIs[_Integer]: ... +def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... +def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... +def is_int8(t: DataType) -> TypeIs[Int8Type]: ... +def is_int16(t: DataType) -> TypeIs[Int16Type]: ... +def is_int32(t: DataType) -> TypeIs[Int32Type]: ... +def is_int64(t: DataType) -> TypeIs[Int64Type]: ... +def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... +def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... +def is_uint32(t: DataType) -> TypeIs[UInt32Type]: ... +def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... +def is_floating(t: DataType) -> TypeIs[_Floating]: ... +def is_float16(t: DataType) -> TypeIs[Float16Type]: ... +def is_float32(t: DataType) -> TypeIs[Float32Type]: ... +def is_float64(t: DataType) -> TypeIs[Float64Type]: ... +def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... +def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... +def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... +def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... +def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... +def is_struct(t: DataType) -> TypeIs[StructType]: ... +def is_union(t: DataType) -> TypeIs[_Union]: ... +def is_nested(t: DataType) -> TypeIs[_Nested]: ... +def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... +def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... +def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... +def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... +def is_time(t: DataType) -> TypeIs[_Time]: ... +def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... +def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... +def is_binary(t: DataType) -> TypeIs[BinaryType]: ... +def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... +def is_unicode(t: DataType) -> TypeIs[StringType]: ... +def is_string(t: DataType) -> TypeIs[StringType]: ... +def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... +def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... +def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... +def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... +def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... +def is_date(t: DataType) -> TypeIs[_Date]: ... +def is_date32(t: DataType) -> TypeIs[Date32Type]: ... +def is_date64(t: DataType) -> TypeIs[Date64Type]: ... +def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... +def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... +def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... +def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... +def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... +def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... +def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... +def is_interval(t: DataType) -> TypeIs[_Interval]: ... +def is_primitive(t: DataType) -> bool: ... +def is_boolean_value(obj: Any) -> bool: ... +def is_integer_value(obj: Any) -> bool: ... +def is_float_value(obj: Any) -> bool: ... 
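The TypeIs return types above exist so that a checker can narrow a DataType before subtype-specific attributes are touched; a small sketch:

import pyarrow as pa
import pyarrow.types as pat

t: pa.DataType = pa.timestamp("ms", tz="UTC")
if pat.is_timestamp(t):
    print(t.unit, t.tz)            # narrowed to TimestampType
elif pat.is_decimal(t):
    print(t.precision, t.scale)    # narrowed to one of the decimal types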
+ + +__all__ = [ + "lib", + "is_binary", + "is_binary_view", + "is_boolean", + "is_date", + "is_date32", + "is_date64", + "is_decimal", + "is_decimal128", + "is_decimal256", + "is_decimal32", + "is_decimal64", + "is_dictionary", + "is_duration", + "is_fixed_size_binary", + "is_fixed_size_list", + "is_float16", + "is_float32", + "is_float64", + "is_floating", + "is_int16", + "is_int32", + "is_int64", + "is_int8", + "is_integer", + "is_interval", + "is_large_binary", + "is_large_list", + "is_large_list_view", + "is_large_string", + "is_large_unicode", + "is_list", + "is_list_view", + "is_map", + "is_nested", + "is_null", + "is_primitive", + "is_run_end_encoded", + "is_signed_integer", + "is_string", + "is_string_view", + "is_struct", + "is_temporal", + "is_time", + "is_time32", + "is_time64", + "is_timestamp", + "is_uint16", + "is_uint32", + "is_uint64", + "is_uint8", + "is_unicode", + "is_union", + "is_unsigned_integer", +] + + +class TypesEnum(IntEnum): + INTERVAL_MONTHS = 0 + INTERVAL_DAY_TIME = 1 + INTERVAL_MONTH_DAY_NANO = 2 diff --git a/python/pyarrow-stubs/pyarrow/util.pyi b/python/pyarrow-stubs/pyarrow/util.pyi new file mode 100644 index 000000000000..c3317960c81c --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/util.pyi @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable, Sequence +from os import PathLike +from typing import Any, Protocol, TypeVar + +_F = TypeVar("_F", bound=Callable) +_N = TypeVar("_N") + + +class _DocStringComponents(Protocol): + _docstring_components: list[str] + + +def doc( + *docstrings: str | _DocStringComponents | Callable | None, **params: Any +) -> Callable[[_F], _F]: ... +def _is_iterable(obj) -> bool: ... +def _is_path_like(path) -> bool: ... +def _stringify_path(path: str | PathLike) -> str: ... +def product(seq: Sequence[_N]) -> _N: ... + + +def get_contiguous_span( + shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int +) -> tuple[int, int]: ... +def find_free_port() -> int: ... +def guid() -> str: ... +def _download_urllib(url, out_path) -> None: ... +def _download_requests(url, out_path) -> None: ... +def download_tzdata_on_windows() -> None: ... +def _deprecate_api(old_name, new_name, api, next_version, type=...): ... +def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... +def _break_traceback_cycle_from_frame(frame) -> None: ... 
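Of the utilities annotated above, `doc` is the pandas-style docstring-templating decorator; a sketch of how it is applied (the decorated function here is illustrative, not part of pyarrow):

from pyarrow.util import doc

@doc(klass="Table")
def describe(obj):
    """Summarize a {klass}."""
    return repr(obj)

print(describe.__doc__)    # "Summarize a Table."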
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 18a40d877c34..39abd3ee5715 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -35,11 +35,11 @@ import sys as _sys try: - from ._generated_version import version as __version__ + from ._generated_version import version as __version__ # type: ignore[import-untyped, import-not-found] # noqa: E501 except ImportError: # Package is not installed, parse git tag at runtime try: - import setuptools_scm + import setuptools_scm # type: ignore[import-not-found, import-untyped] # Code duplicated from setup.py to avoid a dependency on each other def parse_git(root, **kwargs): @@ -47,14 +47,14 @@ def parse_git(root, **kwargs): Parse function for setuptools_scm that ignores tags for non-C++ subprojects, e.g. apache-arrow-js-XXX tags. """ - from setuptools_scm.git import parse + from setuptools_scm.git import parse # type: ignore[import-not-found, import-untyped] # noqa: E501 kwargs['describe_command'] = \ "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'" return parse(root, **kwargs) __version__ = setuptools_scm.get_version('../', parse=parse_git) except ImportError: - __version__ = None + __version__ = None # type: ignore[assignment] from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path, MonthDayNano, VersionInfo, build_info, cpp_build_info, @@ -150,6 +150,8 @@ def print_entry(label, value): print(f" {codec: <20}: {status: <8}") +from pyarrow.lib import ( + DataType, Array, MemoryPool) # type: ignore[reportAttributeAccessIssue] from pyarrow.lib import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, @@ -167,7 +169,7 @@ def print_entry(label, value): bool8, fixed_shape_tensor, json_, opaque, uuid, field, type_for_alias, - DataType, DictionaryType, StructType, + DictionaryType, StructType, ListType, LargeListType, FixedSizeListType, ListViewType, LargeListViewType, MapType, UnionType, SparseUnionType, DenseUnionType, @@ -184,8 +186,7 @@ def print_entry(label, value): Field, Schema, schema, - unify_schemas, - Array, Tensor, + unify_schemas, Tensor, array, chunked_array, record_batch, nulls, repeat, SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix, SparseCSFTensor, @@ -240,7 +241,7 @@ def print_entry(label, value): from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, Codec, compress, decompress, allocate_buffer) -from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, +from pyarrow.lib import (LoggingMemoryPool, ProxyMemoryPool, total_allocated_bytes, set_memory_pool, default_memory_pool, system_memory_pool, jemalloc_memory_pool, mimalloc_memory_pool, @@ -362,7 +363,7 @@ def create_library_symlinks(): if _sys.platform == 'linux': bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*')) - def get_symlink_path(hard_path): + def get_symlink_path(hard_path): # type: ignore[reportRedeclaration] return hard_path.rsplit('.', 1)[0] else: bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib')) diff --git a/python/pyarrow/acero.py b/python/pyarrow/acero.py index e475e8db5c24..cd99a1bbc531 100644 --- a/python/pyarrow/acero.py +++ b/python/pyarrow/acero.py @@ -22,7 +22,7 @@ # distutils: language = c++ # cython: language_level = 3 -from pyarrow.lib import Table, RecordBatch, array +from pyarrow.lib import Table, RecordBatch, array, Schema from pyarrow.compute import Expression, field try: @@ -49,11 +49,14 @@ except ImportError: class DatasetModuleStub: class Dataset: - pass + @property + def 
schema(self): + return Schema() class InMemoryDataset: - pass - ds = DatasetModuleStub + def __init__(self, source): + pass + ds = DatasetModuleStub # type: ignore[assignment] def _dataset_to_decl(dataset, use_threads=True, implicit_ordering=False): @@ -306,7 +309,7 @@ def _perform_join_asof(left_operand, left_on, left_by, # AsofJoin does not return on or by columns for right_operand. right_columns = [ col for col in right_operand.schema.names - if col not in [right_on] + right_by + if col not in [right_on] + right_by # type: ignore[reportOperatorIssue] ] columns_collisions = set(left_operand.schema.names) & set(right_columns) if columns_collisions: diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ec58ac727e59..47ce1ab9b81b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3632,7 +3632,7 @@ cdef class FixedSizeListArray(BaseListArray): Or create from a values array, list size and matching type: >>> typ = pa.list_(pa.field("values", pa.int64()), 2) - >>> arr = pa.FixedSizeListArray.from_arrays(values,type=typ) + >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) >>> arr [ diff --git a/python/pyarrow/benchmark.py b/python/pyarrow/benchmark.py index 25ee1141f08d..0ee9063a9a76 100644 --- a/python/pyarrow/benchmark.py +++ b/python/pyarrow/benchmark.py @@ -18,4 +18,4 @@ # flake8: noqa -from pyarrow.lib import benchmark_PandasObjectIsNull +from pyarrow.lib import benchmark_PandasObjectIsNull # type: ignore[attr-defined] diff --git a/python/pyarrow/cffi.py b/python/pyarrow/cffi.py index 1da1a9169140..e5a1c9c1d072 100644 --- a/python/pyarrow/cffi.py +++ b/python/pyarrow/cffi.py @@ -16,8 +16,15 @@ # under the License. from __future__ import absolute_import +from typing import TYPE_CHECKING -import cffi +if TYPE_CHECKING: + import cffi +else: + try: + import cffi + except ImportError: + pass c_source = """ struct ArrowSchema { diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 8177948aaebc..b8206a54fddb 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -108,7 +108,7 @@ import warnings import pyarrow as pa -from pyarrow import _compute_docstrings +from pyarrow import _compute_docstrings # type: ignore[reportAttributeAccessIssue] from pyarrow.vendored import docscrape @@ -243,7 +243,7 @@ def _handle_options(name, options_class, options, args, kwargs): def _make_generic_wrapper(func_name, func, options_class, arity): if options_class is None: - def wrapper(*args, memory_pool=None): + def wrapper(*args, memory_pool=None): # type: ignore[misc] if arity is not Ellipsis and len(args) != arity: raise TypeError( f"{func_name} takes {arity} positional argument(s), " @@ -253,7 +253,8 @@ def wrapper(*args, memory_pool=None): return Expression._call(func_name, list(args)) return func.call(args, None, memory_pool) else: - def wrapper(*args, memory_pool=None, options=None, **kwargs): + def wrapper( # type: ignore[misc] + *args, memory_pool=None, options=None, **kwargs): if arity is not Ellipsis: if len(args) < arity: raise TypeError( @@ -610,7 +611,7 @@ def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None): sort_keys.append(("dummy", "descending")) else: sort_keys = map(lambda key_name: (key_name, "descending"), sort_keys) - options = SelectKOptions(k, sort_keys) + options = SelectKOptions(k, sort_keys) # type: ignore[reportArgumentType] return call_function("select_k_unstable", [values], options, memory_pool) @@ -657,7 +658,7 @@ def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None): 
sort_keys.append(("dummy", "ascending")) else: sort_keys = map(lambda key_name: (key_name, "ascending"), sort_keys) - options = SelectKOptions(k, sort_keys) + options = SelectKOptions(k, sort_keys) # type: ignore[reportArgumentType] return call_function("select_k_unstable", [values], options, memory_pool) @@ -683,7 +684,8 @@ def random(n, *, initializer='system', options=None, memory_pool=None): memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. """ - options = RandomOptions(initializer=initializer) + options = RandomOptions( + initializer=initializer) # type: ignore[reportArgumentType] return call_function("random", [], options, memory_pool, length=n) @@ -725,7 +727,7 @@ def field(*name_or_index): if isinstance(name_or_index[0], (str, int)): return Expression._field(name_or_index[0]) elif isinstance(name_or_index[0], tuple): - return Expression._nested_field(name_or_index[0]) + return Expression._nested_field(name_or_index[0]) # type: ignore else: raise TypeError( "field reference should be str, multiple str, tuple or " @@ -733,7 +735,7 @@ def field(*name_or_index): ) # In case of multiple strings not supplied in a tuple else: - return Expression._nested_field(name_or_index) + return Expression._nested_field(name_or_index) # type: ignore def scalar(value): diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index 41beaa140419..0e8ef66485ec 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -114,13 +114,13 @@ defaults['timezone_data'] = os.path.exists("/usr/share/zoneinfo") try: - import cython # noqa + import cython # type: ignore[import-untyped, import-not-found] # noqa defaults['cython'] = True except ImportError: pass try: - import fastparquet # noqa + import fastparquet # type: ignore[import-untyped, import-not-found] # noqa defaults['fastparquet'] = True except ImportError: pass @@ -347,7 +347,7 @@ def func(ctx, x): pc.register_aggregate_function(func, func_name, - func_doc, + func_doc, # type: ignore { "x": pa.float64(), }, diff --git a/python/pyarrow/cuda.py b/python/pyarrow/cuda.py index 18c530d4afe4..eeb637f0ab41 100644 --- a/python/pyarrow/cuda.py +++ b/python/pyarrow/cuda.py @@ -18,7 +18,7 @@ # flake8: noqa -from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, +from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, # type: ignore[reportMissingModuleSource] HostBuffer, BufferReader, BufferWriter, new_host_buffer, serialize_record_batch, read_message, diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 039da8c0d567..967c4b475ddf 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -54,6 +54,9 @@ get_partition_keys as _get_partition_keys, # keep for backwards compatibility _filesystemdataset_write, ) + from pyarrow.fs import FileInfo + + except ImportError as exc: raise ImportError( f"The pyarrow installation is not built with support for 'dataset' ({str(exc)})" @@ -70,7 +73,8 @@ ) try: - from pyarrow._dataset_orc import OrcFileFormat + from pyarrow._dataset_orc import ( # type: ignore[import-not-found] + OrcFileFormat) _orc_available = True except ImportError: pass @@ -371,6 +375,7 @@ def _ensure_multiple_sources(paths, filesystem=None): # possible improvement is to group the file_infos by type and raise for # multiple paths per error category if is_local: + # type: ignore[reportGeneralTypeIssues] for info in filesystem.get_file_info(paths): file_type = info.type if file_type == FileType.File: @@ -422,16 +427,18 @@ def 
_ensure_single_source(path, filesystem=None): filesystem, path = _resolve_filesystem_and_path(path, filesystem) # ensure that the path is normalized before passing to dataset discovery + assert isinstance(path, str) path = filesystem.normalize_path(path) # retrieve the file descriptor file_info = filesystem.get_file_info(path) + assert isinstance(file_info, FileInfo) # depending on the path type either return with a recursive # directory selector or as a list containing a single file - if file_info.type == FileType.Directory: + if file_info.type == FileType.Directory: # type: ignore[reportAttributeAccessIssue] paths_or_selector = FileSelector(path, recursive=True) - elif file_info.type == FileType.File: + elif file_info.type == FileType.File: # type: ignore[reportAttributeAccessIssue] paths_or_selector = [path] else: raise FileNotFoundError(path) @@ -1035,6 +1042,7 @@ def file_visitor(written_file): _filesystemdataset_write( scanner, base_dir, basename_template, filesystem, partitioning, preserve_order, file_options, max_partitions, file_visitor, - existing_data_behavior, max_open_files, max_rows_per_file, - min_rows_per_group, max_rows_per_group, create_dir + existing_data_behavior, # type: ignore[reportArgumentType] + max_open_files, max_rows_per_file, min_rows_per_group, + max_rows_per_group, create_dir ) diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 241c27706a6f..4b0ecb9f18e0 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -183,6 +183,7 @@ def write_feather(df, dest, compression=None, compression_level=None, f'one of {_FEATHER_SUPPORTED_CODECS}') try: + assert version in (1, 2) _feather.write_feather(table, dest, compression=compression, compression_level=compression_level, chunksize=chunksize, version=version) @@ -269,7 +270,7 @@ def read_table(source, columns=None, memory_map=False, use_threads=True): f"Got columns {columns} of types {column_type_names}") # Feather v1 already respects the column selection - if reader.version < 3: + if int(reader.version) < 3: return table # Feather v2 reads with sorted / deduplicated selection elif sorted(set(columns)) == columns: diff --git a/python/pyarrow/flight.py b/python/pyarrow/flight.py index b1836907c674..ba5008c9ecf7 100644 --- a/python/pyarrow/flight.py +++ b/python/pyarrow/flight.py @@ -16,7 +16,7 @@ # under the License. 
try: - from pyarrow._flight import ( # noqa:F401 + from pyarrow._flight import ( # noqa:F401 # type: ignore[import-not-found] connect, Action, ActionType, diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 670ccaaf2455..e1aa9090d2d0 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -40,7 +40,7 @@ _not_imported = [] try: - from pyarrow._azurefs import AzureFileSystem # noqa + from pyarrow._azurefs import AzureFileSystem # noqa # type: ignore[reportMissingModuleSource] except ImportError: _not_imported.append("AzureFileSystem") @@ -50,12 +50,12 @@ _not_imported.append("HadoopFileSystem") try: - from pyarrow._gcsfs import GcsFileSystem # noqa + from pyarrow._gcsfs import GcsFileSystem # noqa # type: ignore[reportMissingModuleSource] except ImportError: _not_imported.append("GcsFileSystem") try: - from pyarrow._s3fs import ( # noqa + from pyarrow._s3fs import ( # noqa # type: ignore[reportMissingModuleSource] AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy, S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized, finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region) @@ -111,7 +111,7 @@ def _ensure_filesystem(filesystem, *, use_mmap=False): else: # handle fsspec-compatible filesystems try: - import fsspec + import fsspec # type: ignore[import-untyped] except ImportError: pass else: @@ -165,6 +165,7 @@ def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False): file_info = None exists_locally = False else: + assert isinstance(file_info, FileInfo) exists_locally = (file_info.type != FileType.NotFound) # if the file or directory doesn't exists locally, then assume that @@ -250,7 +251,9 @@ def copy_files(source, destination, destination, destination_filesystem ) + assert isinstance(source_fs, FileSystem) file_info = source_fs.get_file_info(source_path) + assert isinstance(file_info, FileInfo) if file_info.type == FileType.Directory: source_sel = FileSelector(source_path, recursive=True) _copy_files_selector(source_fs, source_sel, diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py index 4e0d66ec6659..222c289c8793 100644 --- a/python/pyarrow/orc.py +++ b/python/pyarrow/orc.py @@ -20,7 +20,7 @@ import warnings from pyarrow.lib import Table -import pyarrow._orc as _orc +import pyarrow._orc as _orc # type: ignore[reportMissingModuleSource] from pyarrow.fs import _resolve_filesystem_and_path @@ -255,9 +255,11 @@ def __init__(self, where, *, file_version=file_version, batch_size=batch_size, stripe_size=stripe_size, - compression=compression, + compression=compression, # type: ignore[reportArgumentType] compression_block_size=compression_block_size, - compression_strategy=compression_strategy, + compression_strategy=( + compression_strategy # type: ignore[reportArgumentType] + ), row_index_stride=row_index_stride, padding_tolerance=padding_tolerance, dictionary_key_size_threshold=dictionary_key_size_threshold, diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index dfca59cbf5f9..b9086ce4e86b 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -33,18 +33,18 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa -_logical_type_map = {} -_numpy_logical_type_map = {} -_pandas_logical_type_map = {} +_logical_type_map: dict[int, str] = {} +_numpy_logical_type_map: dict[int, str] = {} +_pandas_logical_type_map: dict[int, str] = {} def get_logical_type_map(): - 
global _logical_type_map + global _logical_type_map # noqa: F824 if not _logical_type_map: _logical_type_map.update({ @@ -90,9 +90,9 @@ def get_logical_type(arrow_type): def get_numpy_logical_type_map(): - global _numpy_logical_type_map + global _numpy_logical_type_map # noqa: F824 if not _numpy_logical_type_map: - _numpy_logical_type_map.update({ + _numpy_logical_type_map.update({ # type: ignore[reportCallIssue] np.bool_: 'bool', np.int8: 'int8', np.int16: 'int16', @@ -704,7 +704,7 @@ def get_datetimetz_type(values, dtype, type_): # If no user type passed, construct a tz-aware timestamp type tz = dtype.tz unit = dtype.unit - type_ = pa.timestamp(unit, tz) + type_ = pa.timestamp(unit, tz) # type: ignore[reportArgumentType] elif type_ is None: # Trust the NumPy dtype type_ = pa.from_numpy_dtype(values.dtype) @@ -743,7 +743,7 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= pandas Block """ - import pandas.core.internals as _int + import pandas.core.internals as _int # type: ignore[import-not-found] block_arr = item.get('block', None) placement = item['placement'] @@ -769,6 +769,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= # create ExtensionBlock arr = item['py_array'] assert len(placement) == 1 + assert isinstance(columns, list) + assert isinstance(extension_columns, dict) name = columns[placement[0]] pandas_dtype = extension_columns[name] if not hasattr(pandas_dtype, '__from_arrow__'): @@ -788,7 +790,7 @@ def make_datetimetz(unit, tz): if _pandas_api.is_v1(): unit = 'ns' # ARROW-3789: Coerce date/timestamp types to datetime64[ns] tz = pa.lib.string_to_tzinfo(tz) - return _pandas_api.datetimetz_type(unit, tz=tz) + return _pandas_api.datetimetz_type(unit, tz=tz) # type: ignore[reportArgumentType] def table_to_dataframe( @@ -822,7 +824,8 @@ def table_to_dataframe( result = pa.lib.table_to_blocks(options, table, categories, list(ext_columns_dtypes.keys())) if _pandas_api.is_ge_v3(): - from pandas.api.internals import create_dataframe_from_blocks + from pandas.api.internals import ( # type: ignore[import-not-found] + create_dataframe_from_blocks) blocks = [ _reconstruct_block( @@ -834,7 +837,8 @@ def table_to_dataframe( return df else: - from pandas.core.internals import BlockManager + from pandas.core.internals import ( # type: ignore[reportMissingImports] + BlockManager) from pandas import DataFrame blocks = [ @@ -844,7 +848,8 @@ def table_to_dataframe( axes = [columns, index] mgr = BlockManager(blocks, axes) if _pandas_api.is_ge_v21(): - df = DataFrame._from_mgr(mgr, mgr.axes) + df = DataFrame._from_mgr( # type: ignore[reportAttributeAccessIssue] + mgr, mgr.axes) else: df = DataFrame(mgr) @@ -1092,10 +1097,10 @@ def _is_generated_index_name(name): def get_pandas_logical_type_map(): - global _pandas_logical_type_map + global _pandas_logical_type_map # noqa: F824 if not _pandas_logical_type_map: - _pandas_logical_type_map.update({ + _pandas_logical_type_map.update({ # type: ignore[reportCallIssue] 'date': 'datetime64[D]', 'datetime': 'datetime64[ns]', 'datetimetz': 'datetime64[ns]', @@ -1162,12 +1167,14 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): labels = getattr(columns, 'codes', None) or [None] # Convert each level to the dtype provided in the metadata - levels_dtypes = [ - (level, col_index.get('pandas_type', str(level.dtype)), - col_index.get('numpy_type', None)) + levels_dtypes = [(level, col_index.get( + 'pandas_type', + str(level.dtype) # type: ignore[reportAttributeAccessIssue] + ), + 
col_index.get('numpy_type', None)) for level, col_index in zip_longest( levels, column_indexes, fillvalue={} - ) + ) ] new_levels = [] @@ -1179,7 +1186,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # bytes into unicode strings when json.loads-ing them. We need to # convert them back to bytes to preserve metadata. if dtype == np.bytes_: - level = level.map(encoder) + level = level.map(encoder) # type: ignore[reportAttributeAccessIssue] # ARROW-13756: if index is timezone aware DataTimeIndex elif pandas_dtype == "datetimetz": tz = pa.lib.string_to_tzinfo( @@ -1193,7 +1200,8 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): elif pandas_dtype == "decimal": level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level]) elif ( - level.dtype == "str" and numpy_dtype == "object" + level.dtype == "str" # type: ignore[reportAttributeAccessIssue] + and numpy_dtype == "object" and ("mixed" in pandas_dtype or pandas_dtype in ["unicode", "string"]) ): # the metadata indicate that the original dataframe used object dtype, @@ -1206,11 +1214,12 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # for pandas >= 3 we want to use the default string dtype for .columns new_levels.append(level) continue - elif level.dtype != dtype: - level = level.astype(dtype) + elif level.dtype != dtype: # type: ignore[reportAttributeAccessIssue] + level = level.astype(dtype) # type: ignore[reportAttributeAccessIssue] # ARROW-9096: if original DataFrame was upcast we keep that if level.dtype != numpy_dtype and pandas_dtype != "datetimetz": - level = level.astype(numpy_dtype) + level = level.astype( # type: ignore[reportAttributeAccessIssue] + numpy_dtype) new_levels.append(level) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 354f18124b53..639ae2a95c44 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -45,7 +45,7 @@ FileDecryptionProperties, SortingColumn) from pyarrow.fs import (LocalFileSystem, FileType, _resolve_filesystem_and_path, - _ensure_filesystem) + _ensure_filesystem, FileInfo) from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api @@ -1415,12 +1415,15 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, path_or_paths, filesystem, memory_map=memory_map ) finfo = filesystem.get_file_info(path_or_paths) + assert isinstance(finfo, FileInfo) if finfo.type == FileType.Directory: self._base_dir = path_or_paths else: single_file = path_or_paths - parquet_format = ds.ParquetFileFormat(**read_options) + parquet_format = ds.ParquetFileFormat( + **read_options # type: ignore[invalid-argument-type] + ) if single_file is not None: fragment = parquet_format.make_fragment(single_file, filesystem) @@ -1575,6 +1578,7 @@ def _get_common_pandas_metadata(self): for name in ["_common_metadata", "_metadata"]: metadata_path = os.path.join(str(self._base_dir), name) finfo = self.filesystem.get_file_info(metadata_path) + assert isinstance(finfo, FileInfo) if finfo.is_file: pq_meta = read_metadata( metadata_path, filesystem=self.filesystem) @@ -1673,6 +1677,7 @@ def files(self): >>> dataset.files ['dataset_v2_files/year=2019/...-0.parquet', ... """ + assert isinstance(self._dataset, pa.dataset.FileSystemDataset) return self._dataset.files @property @@ -1680,6 +1685,7 @@ def filesystem(self): """ The filesystem type of the Dataset source. 
""" + assert isinstance(self._dataset, pa.dataset.FileSystemDataset) return self._dataset.filesystem @property @@ -1687,6 +1693,7 @@ def partitioning(self): """ The partitioning of the Dataset source, if discovered. """ + assert isinstance(self._dataset, pa.dataset.FileSystemDataset) return self._dataset.partitioning @@ -1903,14 +1910,16 @@ def read_table(source, *, columns=None, use_threads=True, filesystem, path = _resolve_filesystem_and_path(source, filesystem) if filesystem is not None: - if not filesystem.get_file_info(path).is_file: + file_info = filesystem.get_file_info(path) + assert isinstance(file_info, FileInfo) + if not file_info.is_file: raise ValueError( "the 'source' argument should be " "an existing parquet file and not a directory " "when the pyarrow.dataset module is not available" ) - source = filesystem.open_input_file(path) + source = filesystem.open_input_file(path) # type: ignore dataset = ParquetFile( source, read_dictionary=read_dictionary, @@ -2083,7 +2092,8 @@ def write_table(table, where, row_group_size=None, version='2.6', def write_to_dataset(table, root_path, partition_cols=None, filesystem=None, schema=None, partitioning=None, basename_template=None, use_threads=None, - file_visitor=None, existing_data_behavior=None, + file_visitor=None, # type: ignore[reportRedeclaration] + existing_data_behavior=None, **kwargs): """Wrapper around dataset.write_dataset for writing a Table to Parquet format by partitions. @@ -2312,7 +2322,7 @@ def write_metadata(schema, where, metadata_collector=None, filesystem=None, filesystem, where = _resolve_filesystem_and_path(where, filesystem) if hasattr(where, "seek"): # file-like - cursor_position = where.tell() + cursor_position = where.tell() # type: ignore[reportAttributeAccessIssue] writer = ParquetWriter(where, schema, filesystem, **kwargs) writer.close() @@ -2321,8 +2331,8 @@ def write_metadata(schema, where, metadata_collector=None, filesystem=None, # ParquetWriter doesn't expose the metadata until it's written. Write # it and read it again. metadata = read_metadata(where, filesystem=filesystem) - if hasattr(where, "seek"): - where.seek(cursor_position) # file-like, set cursor back. + if hasattr(where, "seek"): # file-like, set cursor back. + where.seek(cursor_position) # type: ignore[reportAttributeAccessIssue] for m in metadata_collector: metadata.append_row_groups(m) diff --git a/python/pyarrow/parquet/encryption.py b/python/pyarrow/parquet/encryption.py index df6eed913fa5..1c6835d6acfe 100644 --- a/python/pyarrow/parquet/encryption.py +++ b/python/pyarrow/parquet/encryption.py @@ -20,4 +20,5 @@ EncryptionConfiguration, DecryptionConfiguration, KmsConnectionConfig, - KmsClient) + KmsClient, + FileSystemKeyMaterialStore) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 83cabcf447d4..16fed344e4d7 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1036,7 +1036,7 @@ cdef class StructScalar(Scalar, Mapping): Parameters ---------- - index : Union[int, str] + key : Union[int, str] Index / position or name of the field. 
Returns diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 575444c1cfc2..3f227d3101c7 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -64,7 +64,8 @@ if os.environ.get('TZDIR', None) is None: from importlib import resources try: - os.environ['TZDIR'] = os.path.join(resources.files('tzdata'), 'zoneinfo') + tzdata_path = resources.files('tzdata') + os.environ['TZDIR'] = os.path.join(str(tzdata_path), 'zoneinfo') except ModuleNotFoundError: print( 'Package "tzdata" not found. Not setting TZDIR environment variable.' @@ -191,6 +192,7 @@ def decorate(func): def wrapper(*args, **kwargs): remaining_attempts = attempts curr_delay = delay + last_exception = None while remaining_attempts > 0: try: return func(*args, **kwargs) @@ -201,6 +203,9 @@ def wrapper(*args, **kwargs): if max_delay: curr_delay = min(curr_delay, max_delay) time.sleep(curr_delay) + # At this point, we've exhausted all attempts and last_exception must be set + # (since we must have caught at least one exception to exit the loop) + assert last_exception is not None, "No attempts were made" raise last_exception return wrapper return decorate diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py index 50da6693afff..62da25f0af32 100644 --- a/python/pyarrow/tests/interchange/test_conversion.py +++ b/python/pyarrow/tests/interchange/test_conversion.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow.interchange as pi from pyarrow.interchange.column import ( @@ -163,8 +163,8 @@ def test_pandas_roundtrip_string(): result = pi.from_dataframe(pandas_df) assert result["a"].to_pylist() == table["a"].to_pylist() - assert pa.types.is_string(table["a"].type) - assert pa.types.is_large_string(result["a"].type) + assert pa.types.is_string(table.column("a").type) + assert pa.types.is_large_string(result.column("a").type) table_protocol = table.__dataframe__() result_protocol = result.__dataframe__() @@ -193,8 +193,8 @@ def test_pandas_roundtrip_large_string(): result = pi.from_dataframe(pandas_df) assert result["a_large"].to_pylist() == table["a_large"].to_pylist() - assert pa.types.is_large_string(table["a_large"].type) - assert pa.types.is_large_string(result["a_large"].type) + assert pa.types.is_large_string(table.column("a_large").type) + assert pa.types.is_large_string(result.column("a_large").type) table_protocol = table.__dataframe__() result_protocol = result.__dataframe__() @@ -231,12 +231,12 @@ def test_pandas_roundtrip_string_with_missing(): result = pi.from_dataframe(pandas_df) assert result["a"].to_pylist() == table["a"].to_pylist() - assert pa.types.is_string(table["a"].type) - assert pa.types.is_large_string(result["a"].type) + assert pa.types.is_string(table.column("a").type) + assert pa.types.is_large_string(result.column("a").type) assert result["a_large"].to_pylist() == table["a_large"].to_pylist() - assert pa.types.is_large_string(table["a_large"].type) - assert pa.types.is_large_string(result["a_large"].type) + assert pa.types.is_large_string(table.column("a_large").type) + assert pa.types.is_large_string(result.column("a_large").type) else: # older versions of pandas do not have bitmask support # https://github.com/pandas-dev/pandas/issues/49888 @@ -261,12 +261,16 @@ def test_pandas_roundtrip_categorical(): result = pi.from_dataframe(pandas_df) assert result["weekday"].to_pylist() == table["weekday"].to_pylist() - assert 
pa.types.is_dictionary(table["weekday"].type) - assert pa.types.is_dictionary(result["weekday"].type) - assert pa.types.is_string(table["weekday"].chunk(0).dictionary.type) - assert pa.types.is_large_string(result["weekday"].chunk(0).dictionary.type) - assert pa.types.is_int32(table["weekday"].chunk(0).indices.type) - assert pa.types.is_int8(result["weekday"].chunk(0).indices.type) + assert pa.types.is_dictionary(table.column("weekday").type) + assert pa.types.is_dictionary(result.column("weekday").type) + table_chunk_0 = table.column("weekday").chunk(0) + result_chunk_0 = result.column("weekday").chunk(0) + assert isinstance(table_chunk_0, pa.DictionaryArray) + assert isinstance(result_chunk_0, pa.DictionaryArray) + assert pa.types.is_string(table_chunk_0.dictionary.type) + assert pa.types.is_large_string(result_chunk_0.dictionary.type) + assert pa.types.is_int32(table_chunk_0.indices.type) + assert pa.types.is_int8(result_chunk_0.indices.type) table_protocol = table.__dataframe__() result_protocol = result.__dataframe__() @@ -289,6 +293,7 @@ def test_pandas_roundtrip_categorical(): assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"] assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"] + assert desc_cat_result["categories"] is not None assert isinstance(desc_cat_result["categories"]._col, pa.Array) @@ -450,6 +455,7 @@ def test_pyarrow_roundtrip_categorical(offset, length): assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"] assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"] + assert desc_cat_result["categories"] is not None assert isinstance(desc_cat_result["categories"]._col, pa.Array) @@ -464,8 +470,8 @@ def test_pyarrow_roundtrip_large_string(): col = result.__dataframe__().get_column(0) assert col.size() == 3*1024**2 - assert pa.types.is_large_string(table[0].type) - assert pa.types.is_large_string(result[0].type) + assert pa.types.is_large_string(table.column(0).type) + assert pa.types.is_large_string(result.column(0).type) assert table.equals(result) diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index cea694d1c1ee..3208b56c42df 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.strategies as past diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index 5390a24b90d2..3cbf5801dfc1 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -16,11 +16,12 @@ # under the License. 
import io +from typing import cast try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa from pyarrow.tests import util @@ -137,7 +138,7 @@ def make_sample_file(table_or_df): else: a_table = pa.Table.from_pandas(table_or_df) - buf = io.BytesIO() + buf = io.BytesIO() # type: ignore[attr-defined] _write_table(a_table, buf, compression='SNAPPY', version='2.6') buf.seek(0) @@ -161,12 +162,9 @@ def alltypes_sample(size=10000, seed=0, categorical=False): 'float32': np.arange(size, dtype=np.float32), 'float64': np.arange(size, dtype=np.float64), 'bool': np.random.randn(size) > 0, - 'datetime_ms': np.arange("2016-01-01T00:00:00.001", size, - dtype='datetime64[ms]'), - 'datetime_us': np.arange("2016-01-01T00:00:00.000001", size, - dtype='datetime64[us]'), - 'datetime_ns': np.arange("2016-01-01T00:00:00.000000001", size, - dtype='datetime64[ns]'), + 'datetime_ms': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='ms').values, + 'datetime_us': pd.date_range("2016-01-01T00:00:00.000001", periods=size, freq='us').values, + 'datetime_ns': pd.date_range("2016-01-01T00:00:00.000000001", periods=size, freq='ns').values, 'timedelta': np.arange(0, size, dtype="timedelta64[s]"), 'str': pd.Series([str(x) for x in range(size)]), 'empty_str': [''] * size, @@ -175,5 +173,6 @@ def alltypes_sample(size=10000, seed=0, categorical=False): 'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)], } if categorical: - arrays['str_category'] = arrays['str'].astype('category') + import pandas as pd + arrays['str_category'] = cast(pd.Series, arrays['str']).astype('category') return pd.DataFrame(arrays) diff --git a/python/pyarrow/tests/parquet/encryption.py b/python/pyarrow/tests/parquet/encryption.py index efaee1d08a93..7a6ef3de7bc1 100644 --- a/python/pyarrow/tests/parquet/encryption.py +++ b/python/pyarrow/tests/parquet/encryption.py @@ -30,7 +30,7 @@ def __init__(self, config): pe.KmsClient.__init__(self) self.master_keys_map = config.custom_kms_conf - def wrap_key(self, key_bytes, master_key_identifier): + def wrap_key(self, key_bytes, master_key_identifier): # type: ignore[override] """Not a secure cipher - the wrapped key is just the master key concatenated with key bytes""" master_key_bytes = self.master_keys_map[master_key_identifier].encode( @@ -39,7 +39,7 @@ def wrap_key(self, key_bytes, master_key_identifier): result = base64.b64encode(wrapped_key) return result - def unwrap_key(self, wrapped_key, master_key_identifier): + def unwrap_key(self, wrapped_key, master_key_identifier): # type: ignore[override] """Not a secure cipher - just extract the key from the wrapped key""" if master_key_identifier not in self.master_keys_map: diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 345aee3c4ef4..347d10cf76a1 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -35,7 +35,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pq = None + pass try: @@ -45,12 +45,12 @@ from pyarrow.tests.pandas_examples import dataframe_with_lists from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pass try: import numpy as np except ImportError: - np = None + pass # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not parquet' @@ -162,10 +162,10 @@ def test_invalid_source(): # Test that we provide an helpful error message pointing out # that None wasn't expected when trying to open a Parquet None file. with pytest.raises(TypeError, match="None"): - pq.read_table(None) + pq.read_table(None) # type: ignore[arg-type] with pytest.raises(TypeError, match="None"): - pq.ParquetFile(None) + pq.ParquetFile(None) # type: ignore[arg-type] def test_read_table_without_dataset(tempdir): @@ -755,7 +755,7 @@ def test_fastparquet_cross_compatibility(tempdir): # Arrow -> fastparquet file_arrow = str(tempdir / "cross_compat_arrow.parquet") - pq.write_table(table, file_arrow, compression=None) + pq.write_table(table, file_arrow, compression=None) # type: ignore[arg-type] fp_file = fp.ParquetFile(file_arrow) df_fp = fp_file.to_pandas() @@ -796,7 +796,7 @@ def test_buffer_contents( for col in table.columns: [chunk] = col.chunks buf = chunk.buffers()[1] - assert buf.to_pybytes() == buf.size * b"\0" + assert buf.to_pybytes() == buf.size * b"\0" # type: ignore[union-attr] def test_parquet_compression_roundtrip(tempdir): @@ -806,7 +806,7 @@ def test_parquet_compression_roundtrip(tempdir): # the stream due to auto-detecting the extension in the filename table = pa.table([pa.array(range(4))], names=["ints"]) path = tempdir / "arrow-10480.pyarrow.gz" - pq.write_table(table, path, compression="GZIP") + pq.write_table(table, path, compression="GZIP") # type: ignore[arg-type] result = pq.read_table(path) assert result.equals(table) @@ -831,7 +831,7 @@ def test_empty_row_groups(tempdir): def test_reads_over_batch(tempdir): data = [None] * (1 << 20) - data.append([1]) + data.append([1]) # type: ignore[reportArgumentType] # Large list with mostly nones and one final # value. This should force batched reads when # reading back. 
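The test edits above lean on two idioms to satisfy pyright and mypy: narrow with an assert where the runtime type is known, and fall back to a targeted # type: ignore[...] code where it is not. A minimal standalone sketch of the narrowing idiom, assuming get_file_info is annotated in the stubs to return either a FileInfo or a list of them (the path below is illustrative only):

# Illustrative sketch of the assert-based narrowing used throughout these tests; not part of the patch.
from pyarrow.fs import FileInfo, FileType, LocalFileSystem

fs = LocalFileSystem()
info = fs.get_file_info("/tmp")      # checker sees FileInfo | list[FileInfo]
assert isinstance(info, FileInfo)    # narrows to FileInfo; also a runtime sanity check
if info.type == FileType.Directory:  # attribute access now typechecks cleanly
    print(info.path)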
diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py index 2345855a3321..af418812be82 100644 --- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py +++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py @@ -24,15 +24,14 @@ from pyarrow.tests.parquet.common import (_read_table, _check_roundtrip) except ImportError: - pq = None + pass try: import pandas as pd - import pandas.testing as tm from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pd = tm = None + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index c546bc1532ac..bd48ffe71558 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -22,7 +22,7 @@ try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa @@ -33,7 +33,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pq = None + pass try: @@ -44,7 +44,7 @@ dataframe_with_lists) from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pass # Marks all of the tests in this module @@ -142,7 +142,7 @@ def test_direct_read_dictionary(): read_dictionary=['f0']) # Compute dictionary-encoded subfield - expected = pa.table([table[0].dictionary_encode()], names=['f0']) + expected = pa.table([table.column(0).dictionary_encode()], names=['f0']) assert result.equals(expected) @@ -174,7 +174,7 @@ def test_direct_read_dictionary_subfield(): expected = pa.table([expected_arr], names=['f0']) assert result.equals(expected) - assert result[0].num_chunks == 1 + assert result.column(0).num_chunks == 1 @pytest.mark.numpy @@ -260,8 +260,8 @@ def test_single_pylist_column_roundtrip(tempdir, dtype,): _write_table(table, filename) table_read = _read_table(filename) for i in range(table.num_columns): - col_written = table[i] - col_read = table_read[i] + col_written = table.column(i) + col_read = table_read.column(i) assert table.field(i).name == table_read.field(i).name assert col_read.num_chunks == 1 data_written = col_written.chunk(0) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index d3e9cda73018..14253ca7d6b2 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -20,35 +20,41 @@ import os import pathlib import sys +from typing import TYPE_CHECKING try: import numpy as np except ImportError: - np = None + pass import pytest import unittest.mock as mock import pyarrow as pa import pyarrow.compute as pc -from pyarrow.fs import (FileSelector, FileSystem, LocalFileSystem, +from pyarrow.fs import (FileSelector, FileSystem, LocalFileSystem, FileInfo, FileType, PyFileSystem, SubTreeFileSystem, FSSpecHandler) from pyarrow.tests import util from pyarrow.util import guid -try: +if TYPE_CHECKING: + import pandas as pd + import pandas.testing as tm import pyarrow.parquet as pq from pyarrow.tests.parquet.common import ( _read_table, _test_dataframe, _test_table, _write_table) -except ImportError: - pq = None +else: + try: + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import ( + _read_table, _test_dataframe, _test_table, _write_table) + except ImportError: + pass - -try: - import pandas as pd - import pandas.testing as tm - -except 
ImportError: - pd = tm = None + try: + import pandas as pd + import pandas.testing as tm + except ImportError: + pass # Marks all of the tests in this module @@ -70,8 +76,8 @@ def test_filesystem_uri(tempdir): assert result.equals(table) # filesystem URI - result = pq.read_table( - "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir)) + result = pq.read_table("data_dir/data.parquet", + filesystem=util._filesystem_uri(tempdir)) assert result.equals(table) @@ -553,7 +559,7 @@ def _generate_partition_directories(fs, base_dir, partition_spec, df): # ['bar', ['a', 'b', 'c']] # part_table : a pyarrow.Table to write to each partition if not isinstance(fs, FileSystem): - fs = PyFileSystem(FSSpecHandler(fs)) + fs = PyFileSystem(FSSpecHandler(fs)) # type: ignore[abstract] DEPTH = len(partition_spec) @@ -572,15 +578,15 @@ def _visit_level(base_dir, level, part_keys): if level == DEPTH - 1: # Generate example data - from pyarrow.fs import FileType - file_path = pathsep.join([level_dir, guid()]) filtered_df = _filter_partition(df, this_part_keys) part_table = pa.Table.from_pandas(filtered_df) with fs.open_output_stream(file_path) as f: _write_table(part_table, f) - assert fs.get_file_info(file_path).type != FileType.NotFound - assert fs.get_file_info(file_path).type == FileType.File + file_info = fs.get_file_info(file_path) + assert isinstance(file_info, FileInfo) + assert file_info.type != FileType.NotFound + assert file_info.type == FileType.File file_success = pathsep.join([level_dir, '_SUCCESS']) with fs.open_output_stream(file_success) as f: @@ -717,8 +723,8 @@ def test_dataset_read_pandas(tempdir): paths = [] for i in range(nfiles): df = _test_dataframe(size, seed=i) - df.index = np.arange(i * size, (i + 1) * size) - df.index.name = 'index' + df.index = np.arange(i * size, (i + 1) * size) # type: ignore[assignment] + df.index.name = 'index' # type: ignore[attr-defined] path = dirpath / f'{i}.parquet' @@ -931,8 +937,7 @@ def _test_write_to_dataset_with_partitions(base_path, 'group2': list('eefeffgeee'), 'num': list(range(10)), 'nan': [np.nan] * 10, - 'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype( - 'datetime64[ns]') + 'date': pd.date_range('2017-01-01', periods=10, freq='D').values.astype('datetime64[ns]') }) cols = output_df.columns.tolist() partition_by = ['group1', 'group2'] @@ -965,7 +970,7 @@ def _test_write_to_dataset_with_partitions(base_path, input_df_cols = input_df.columns.tolist() assert partition_by == input_df_cols[-1 * len(partition_by):] - input_df = input_df[cols] + input_df = input_df.loc[:, cols] # Partitioned columns become 'categorical' dtypes for col in partition_by: output_df[col] = output_df[col].astype('category') @@ -974,6 +979,7 @@ def _test_write_to_dataset_with_partitions(base_path, expected_date_type = schema.field('date').type.to_pandas_dtype() output_df["date"] = output_df["date"].astype(expected_date_type) + assert isinstance(input_df, pd.DataFrame) tm.assert_frame_equal(output_df, input_df) @@ -988,8 +994,7 @@ def _test_write_to_dataset_no_partitions(base_path, 'group1': list('aaabbbbccc'), 'group2': list('eefeffgeee'), 'num': list(range(10)), - 'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype( - 'datetime64[ns]') + 'date': pd.date_range('2017-01-01', periods=10, freq='D').values.astype('datetime64[ns]') }) cols = output_df.columns.tolist() output_table = pa.Table.from_pandas(output_df) @@ -997,7 +1002,7 @@ def _test_write_to_dataset_no_partitions(base_path, if filesystem is None: filesystem = 
LocalFileSystem() elif not isinstance(filesystem, FileSystem): - filesystem = PyFileSystem(FSSpecHandler(filesystem)) + filesystem = PyFileSystem(FSSpecHandler(filesystem)) # type: ignore[abstract] # Without partitions, append files to root_path n = 5 @@ -1009,8 +1014,10 @@ def _test_write_to_dataset_no_partitions(base_path, recursive=True) infos = filesystem.get_file_info(selector) - output_files = [info for info in infos if info.path.endswith(".parquet")] - assert len(output_files) == n + if isinstance(infos, list): + assert all(isinstance(info, FileInfo) for info in infos) + output_files = [info for info in infos if info.path.endswith(".parquet")] + assert len(output_files) == n # Deduplicated incoming DataFrame should match # original outgoing Dataframe @@ -1020,6 +1027,7 @@ def _test_write_to_dataset_no_partitions(base_path, input_df = input_table.to_pandas() input_df = input_df.drop_duplicates() input_df = input_df[cols] + assert isinstance(input_df, pd.DataFrame) tm.assert_frame_equal(output_df, input_df) @@ -1168,11 +1176,11 @@ def test_dataset_read_dictionary(tempdir): path, read_dictionary=['f0']).read() # The order of the chunks is non-deterministic - ex_chunks = [t1[0].chunk(0).dictionary_encode(), - t2[0].chunk(0).dictionary_encode()] + ex_chunks = [t1.column(0).chunk(0).dictionary_encode(), + t2.column(0).chunk(0).dictionary_encode()] - assert result[0].num_chunks == 2 - c0, c1 = result[0].chunk(0), result[0].chunk(1) + assert result.column(0).num_chunks == 2 + c0, c1 = result.column(0).chunk(0), result.column(0).chunk(1) if c0.equals(ex_chunks[0]): assert c1.equals(ex_chunks[1]) else: diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index b89fd97cb91e..a7652a01e64f 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -22,7 +22,7 @@ try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa @@ -32,7 +32,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pq = None + pass try: @@ -41,7 +41,7 @@ from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pd = tm = None + pass # Marks all of the tests in this module @@ -56,7 +56,7 @@ def test_pandas_parquet_datetime_tz(): # coerce to [ns] due to lack of non-[ns] support. 
s = pd.Series([datetime.datetime(2017, 9, 6)], dtype='datetime64[us]') s = s.dt.tz_localize('utc') - s.index = s + s.index = s # type: ignore[assignment] # Both a column and an index to hit both use cases df = pd.DataFrame({'tz_aware': s, @@ -287,7 +287,8 @@ def test_coerce_int96_timestamp_unit(unit): # For either Parquet version, coercing to nanoseconds is allowed # if Int96 storage is used - expected = pa.Table.from_arrays([arrays.get(unit)]*4, names) + array_for_unit = arrays.get(unit, a_ns) + expected = pa.Table.from_arrays([array_for_unit] * 4, names) read_table_kwargs = {"coerce_int96_timestamp_unit": unit} _check_roundtrip(table, expected, read_table_kwargs=read_table_kwargs, @@ -323,6 +324,7 @@ def get_table(pq_reader_method, filename, **kwargs): # with the default resolution of ns, we get wrong values for INT96 # that are out of bounds for nanosecond range tab_error = get_table(pq_reader_method, filename) + assert tab_error is not None with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Discarding nonzero nanoseconds in conversion", @@ -333,6 +335,7 @@ def get_table(pq_reader_method, filename, **kwargs): tab_correct = get_table( pq_reader_method, filename, coerce_int96_timestamp_unit="s" ) + assert tab_correct is not None df_correct = tab_correct.to_pandas(timestamp_as_object=True) df["a"] = df["a"].astype(object) tm.assert_frame_equal(df, df_correct) diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py index 4e2fb069bd06..82b934edf774 100644 --- a/python/pyarrow/tests/parquet/test_encryption.py +++ b/python/pyarrow/tests/parquet/test_encryption.py @@ -21,8 +21,7 @@ import pyarrow.parquet as pq import pyarrow.parquet.encryption as pe except ImportError: - pq = None - pe = None + pass else: from pyarrow.tests.parquet.encryption import (InMemoryKmsClient, MockVersioningKmsClient, @@ -131,7 +130,7 @@ def test_encrypted_parquet_write_read(tempdir, data_table): encryption_algorithm="AES_GCM_V1", cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - assert encryption_config.uniform_encryption is False + assert encryption_config.uniform_encryption is False # type: ignore[attr-defined] kms_connection_config, crypto_factory = write_encrypted_file( path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, @@ -154,11 +153,11 @@ def test_uniform_encrypted_parquet_write_read(tempdir, data_table): # Encrypt the footer and all columns with the footer key, encryption_config = pe.EncryptionConfiguration( footer_key=FOOTER_KEY_NAME, - uniform_encryption=True, + uniform_encryption=True, # type: ignore[call-arg] encryption_algorithm="AES_GCM_V1", cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - assert encryption_config.uniform_encryption is True + assert encryption_config.uniform_encryption is True # type: ignore[attr-defined] kms_connection_config, crypto_factory = write_encrypted_file( path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, b"", @@ -303,7 +302,7 @@ def test_encrypted_parquet_write_col_key_and_uniform_encryption(tempdir, data_ta column_keys={ COL_KEY_NAME: ["a", "b"], }, - uniform_encryption=True) + uniform_encryption=True) # type: ignore[call-arg] with pytest.raises(OSError, match=r"Cannot set both column_keys and uniform_encryption"): @@ -415,7 +414,7 @@ def unwrap_key(self, wrapped_key, master_key_identifier): def kms_factory(kms_connection_configuration): return WrongTypeKmsClient(kms_connection_configuration) - crypto_factory = pe.CryptoFactory(kms_factory) + 
crypto_factory = pe.CryptoFactory(kms_factory) # type: ignore[arg-type] with pytest.raises(TypeError): # Write with encryption properties write_encrypted_parquet(path, data_table, encryption_config, @@ -554,7 +553,7 @@ def test_encrypted_parquet_write_read_external(tempdir, data_table, result_table = read_encrypted_parquet( path, decryption_config, kms_connection_config, crypto_factory, internal_key_material=False) - store = pa._parquet_encryption.FileSystemKeyMaterialStore.for_file(path) + store = pe.FileSystemKeyMaterialStore.for_file(path) assert len(key_ids := store.get_key_id_set()) == ( len(external_encryption_config.column_keys[COL_KEY_NAME]) + 1) diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 148bfebaa67f..646873b3d4f1 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -19,11 +19,7 @@ import decimal from collections import OrderedDict import io - -try: - import numpy as np -except ImportError: - np = None +from typing import TYPE_CHECKING import pytest import pyarrow as pa @@ -31,20 +27,25 @@ from pyarrow.fs import LocalFileSystem from pyarrow.tests import util -try: - import pyarrow.parquet as pq - from pyarrow.tests.parquet.common import _write_table -except ImportError: - pq = None - - -try: +if TYPE_CHECKING: + import numpy as np import pandas as pd - import pandas.testing as tm - - from pyarrow.tests.parquet.common import alltypes_sample -except ImportError: - pd = tm = None + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import alltypes_sample, _write_table +else: + try: + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import _write_table, alltypes_sample + except ImportError: + pass + try: + import pandas as pd + except ImportError: + pass + try: + import numpy as np + except ImportError: + pass # Marks all of the tests in this module @@ -56,7 +57,7 @@ def test_parquet_metadata_api(): df = alltypes_sample(size=10000) df = df.reindex(columns=sorted(df.columns)) - df.index = np.random.randint(0, 1000000, size=len(df)) + df.index = np.random.randint(0, 1000000, size=len(df)) # type: ignore[assignment] fileh = make_sample_file(df) ncols = len(df.columns) @@ -80,15 +81,15 @@ def test_parquet_metadata_api(): col = schema[0] repr(col) - assert col.name == df.columns[0] - assert col.max_definition_level == 1 - assert col.max_repetition_level == 0 - assert col.max_repetition_level == 0 - assert col.physical_type == 'BOOLEAN' - assert col.converted_type == 'NONE' + assert col.name == df.columns[0] # type: ignore[attr-defined] + assert col.max_definition_level == 1 # type: ignore[attr-defined] + assert col.max_repetition_level == 0 # type: ignore[attr-defined] + assert col.max_repetition_level == 0 # type: ignore[attr-defined] + assert col.physical_type == 'BOOLEAN' # type: ignore[attr-defined] + assert col.converted_type == 'NONE' # type: ignore[attr-defined] col_float16 = schema[5] - assert col_float16.logical_type.type == 'FLOAT16' + assert col_float16.logical_type.type == 'FLOAT16' # type: ignore[attr-defined] with pytest.raises(IndexError): schema[ncols + 1] # +1 for index @@ -210,15 +211,16 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value, col_meta = rg_meta.column(0) stat = col_meta.statistics - assert stat.has_min_max - assert _close(type, stat.min, min_value) - assert _close(type, stat.max, max_value) - assert stat.null_count == null_count - assert stat.num_values == num_values + 
assert stat is not None + assert stat.has_min_max # type: ignore[attr-defined] + assert _close(type, stat.min, min_value) # type: ignore[attr-defined] + assert _close(type, stat.max, max_value) # type: ignore[attr-defined] + assert stat.null_count == null_count # type: ignore[attr-defined] + assert stat.num_values == num_values # type: ignore[attr-defined] # TODO(kszucs) until parquet-cpp API doesn't expose HasDistinctCount # method, missing distinct_count is represented as zero instead of None - assert stat.distinct_count == distinct_count - assert stat.physical_type == physical_type + assert stat.distinct_count == distinct_count # type: ignore[attr-defined] + assert stat.physical_type == physical_type # type: ignore[attr-defined] def _close(type, left, right): @@ -236,8 +238,10 @@ def test_parquet_raise_on_unset_statistics(): df = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")}) meta = make_sample_file(pa.Table.from_pandas(df)).metadata - assert not meta.row_group(0).column(0).statistics.has_min_max - assert meta.row_group(0).column(0).statistics.max is None + stat = meta.row_group(0).column(0).statistics + assert stat is not None + assert not stat.has_min_max + assert stat.max is None def test_statistics_convert_logical_types(tempdir): @@ -271,8 +275,9 @@ def test_statistics_convert_logical_types(tempdir): pq.write_table(t, path, version='2.6') pf = pq.ParquetFile(path) stats = pf.metadata.row_group(0).column(0).statistics - assert stats.min == min_val - assert stats.max == max_val + assert stats is not None + assert stats.min == min_val # type: ignore[attr-defined] + assert stats.max == max_val # type: ignore[attr-defined] def test_parquet_write_disable_statistics(tempdir): @@ -429,29 +434,36 @@ def test_field_id_metadata(): pf = pq.ParquetFile(pa.BufferReader(contents)) schema = pf.schema_arrow - assert schema[0].metadata[field_id] == b'1' - assert schema[0].metadata[b'other'] == b'abc' + assert schema[0].metadata is not None + assert schema[0].metadata[field_id] == b'1' # type: ignore[index] + assert schema[0].metadata[b'other'] == b'abc' # type: ignore[index] list_field = schema[1] - assert list_field.metadata[field_id] == b'11' + assert list_field.metadata is not None + assert list_field.metadata[field_id] == b'11' # type: ignore[index] list_item_field = list_field.type.value_field - assert list_item_field.metadata[field_id] == b'10' + assert list_item_field.metadata is not None + assert list_item_field.metadata[field_id] == b'10' # type: ignore[index] struct_field = schema[2] - assert struct_field.metadata[field_id] == b'102' + assert struct_field.metadata is not None + assert struct_field.metadata[field_id] == b'102' # type: ignore[index] struct_middle_field = struct_field.type[0] - assert struct_middle_field.metadata[field_id] == b'101' + assert struct_middle_field.metadata is not None + assert struct_middle_field.metadata[field_id] == b'101' # type: ignore[index] struct_inner_field = struct_middle_field.type[0] - assert struct_inner_field.metadata[field_id] == b'100' + assert struct_inner_field.metadata is not None + assert struct_inner_field.metadata[field_id] == b'100' # type: ignore[index] assert schema[3].metadata is None # Invalid input is passed through (ok) but does not # have field_id in parquet (not tested) - assert schema[4].metadata[field_id] == b'xyz' - assert schema[5].metadata[field_id] == b'-1000' + assert schema[4].metadata is not None + assert schema[4].metadata[field_id] == b'xyz' # type: ignore[index] + assert schema[5].metadata[field_id] == 
b'-1000' # type: ignore[index] def test_parquet_file_page_index(): @@ -495,13 +507,14 @@ def test_multi_dataset_metadata(tempdir): _meta.append_row_groups(meta[0]) # Write merged metadata-only file + assert _meta is not None with open(metapath, "wb") as f: - _meta.write_metadata_file(f) + _meta.write_metadata_file(f) # type: ignore[union-attr] # Read back the metadata meta = pq.read_metadata(metapath) md = meta.to_dict() - _md = _meta.to_dict() + _md = _meta.to_dict() # type: ignore[union-attr] for key in _md: if key != 'serialized_size': assert _md[key] == md[key] @@ -695,13 +708,14 @@ def test_metadata_schema_filesystem(tempdir): assert pq.read_metadata( file_path, filesystem=LocalFileSystem()).equals(metadata) assert pq.read_metadata( + # type: ignore[arg-type] fname, filesystem=f'file:///{tempdir}').equals(metadata) assert pq.read_schema(file_uri).equals(schema) assert pq.read_schema( file_path, filesystem=LocalFileSystem()).equals(schema) assert pq.read_schema( - fname, filesystem=f'file:///{tempdir}').equals(schema) + fname, filesystem=f'file:///{tempdir}').equals(schema) # type: ignore[arg-type] with util.change_cwd(tempdir): # Pass `filesystem` arg @@ -721,7 +735,7 @@ def test_metadata_equals(): original_metadata = pq.read_metadata(pa.BufferReader(buf)) match = "Argument 'other' has incorrect type" with pytest.raises(TypeError, match=match): - original_metadata.equals(None) + original_metadata.equals(None) # type: ignore[arg-type] @pytest.mark.parametrize("t1,t2,expected_error", ( @@ -810,7 +824,7 @@ def msg(c): pq.ColumnChunkMetaData() with pytest.raises(TypeError, match=msg("RowGroupMetaData")): - pq.RowGroupMetaData() + pq.RowGroupMetaData() # type: ignore[call-arg] with pytest.raises(TypeError, match=msg("FileMetaData")): - pq.FileMetaData() + pq.FileMetaData() # type: ignore[call-arg] diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 53864ff15ea2..91ae23857344 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -17,11 +17,12 @@ import io import json +from typing import TYPE_CHECKING, cast try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa @@ -29,22 +30,29 @@ from pyarrow.util import guid from pyarrow.vendored.version import Version -try: - import pyarrow.parquet as pq - from pyarrow.tests.parquet.common import (_read_table, _test_dataframe, - _write_table) -except ImportError: - pq = None - - -try: +if TYPE_CHECKING: import pandas as pd import pandas.testing as tm + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import ( + _read_table, _roundtrip_pandas_dataframe, _test_dataframe, + _write_table, alltypes_sample + ) +else: + try: + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import ( + _read_table, _test_dataframe, _write_table, alltypes_sample, + _roundtrip_pandas_dataframe + ) - from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe, - alltypes_sample) -except ImportError: - pd = tm = None + except ImportError: + pass + try: + import pandas as pd + import pandas.testing as tm + except ImportError: + pass # Marks all of the tests in this module @@ -58,11 +66,14 @@ def test_pandas_parquet_custom_metadata(tempdir): filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) + assert arrow_table.schema.metadata is not None assert b'pandas' in arrow_table.schema.metadata _write_table(arrow_table, filename) - metadata = 
pq.read_metadata(filename).metadata + file_metadata = pq.read_metadata(filename) + metadata = file_metadata.metadata + assert metadata is not None assert b'pandas' in metadata js = json.loads(metadata[b'pandas'].decode('utf8')) @@ -117,10 +128,13 @@ def test_attributes_metadata_persistence(tempdir): } table = pa.Table.from_pandas(df) + assert table.schema.metadata is not None assert b'attributes' in table.schema.metadata[b'pandas'] _write_table(table, filename) - metadata = pq.read_metadata(filename).metadata + file_metadata = pq.read_metadata(filename) + metadata = file_metadata.metadata + assert metadata is not None js = json.loads(metadata[b'pandas'].decode('utf8')) assert 'attributes' in js assert js['attributes'] == df.attrs @@ -297,8 +311,8 @@ def test_pandas_parquet_configuration_options(tempdir): @pytest.mark.pandas def test_spark_flavor_preserves_pandas_metadata(): df = _test_dataframe(size=100) - df.index = np.arange(0, 10 * len(df), 10) - df.index.name = 'foo' + df.index = np.arange(0, 10 * len(df), 10) # type: ignore[assignment] + df.index.name = 'foo' # type: ignore[attr-defined] result = _roundtrip_pandas_dataframe(df, {'flavor': 'spark'}) tm.assert_frame_equal(result, df) @@ -450,7 +464,9 @@ def test_backwards_compatible_column_metadata_handling(datadir): table = _read_table( path, columns=['a']) result = table.to_pandas() - tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) + expected_df = expected[['a']].reset_index(drop=True) + assert isinstance(expected_df, pd.DataFrame) + tm.assert_frame_equal(result, expected_df) @pytest.mark.pandas @@ -510,7 +526,7 @@ def test_pandas_categorical_roundtrip(): codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32') categories = ['foo', 'bar', 'baz'] df = pd.DataFrame({'x': pd.Categorical.from_codes( - codes, categories=categories)}) + codes, categories=categories)}) # type: ignore[arg-type] buf = pa.BufferOutputStream() pq.write_table(pa.table(df), buf) @@ -555,15 +571,18 @@ def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir): table, str(tempdir / "case1"), partition_cols=['part'], ) result = pq.read_table(str(tempdir / "case1")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) + tm.assert_frame_equal( + result[["col"]], df[["col"]]) pq.write_to_dataset(table, str(tempdir / "case2")) result = pq.read_table(str(tempdir / "case2")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) + tm.assert_frame_equal( + result[["col"]], df[["col"]]) pq.write_table(table, str(tempdir / "data.parquet")) result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) + tm.assert_frame_equal( + result[["col"]], df[["col"]]) @pytest.mark.pandas diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index a62b5c3298c9..3c5182dc56e9 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -30,15 +30,14 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _write_table except ImportError: - pq = None + pass try: - import pandas as pd import pandas.testing as tm from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pass # Marks all of the tests in this module @@ -172,7 +171,7 @@ def test_scan_contents(): pf = pq.ParquetFile(buf) assert pf.scan_contents() == 10000 - assert pf.scan_contents(df.columns[:4]) == 10000 + assert 
pf.scan_contents(list(df.columns[:4])) == 10000 def test_parquet_file_pass_directory_instead_of_file(tempdir): @@ -215,7 +214,7 @@ def test_iter_batches_columns_reader(tempdir, batch_size): chunk_size=chunk_size) file_ = pq.ParquetFile(filename) - for columns in [df.columns[:10], df.columns[10:]]: + for columns in [list(df.columns[:10]), list(df.columns[10:])]: batches = file_.iter_batches(batch_size=batch_size, columns=columns) batch_starts = range(0, total_size+batch_size, batch_size) for batch, start in zip(batches, batch_starts): @@ -263,9 +262,10 @@ def get_all_batches(f): tm.assert_frame_equal( batches[batch_no].to_pandas().reset_index(drop=True), - file_.read_row_groups([i]).to_pandas().iloc[900:].reset_index( - drop=True - ) + file_ + .read_row_groups([i]) + .to_pandas().iloc[900:] + .reset_index(drop=True) # type: ignore[arg-type] ) batch_no += 1 @@ -346,6 +346,7 @@ def test_read_statistics(): buf.seek(0) statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics + assert statistics is not None assert statistics.is_null_count_exact is True assert statistics.null_count == 1 assert statistics.distinct_count is None @@ -389,7 +390,8 @@ def test_parquet_file_fsspec_support(): def test_parquet_file_fsspec_support_through_filesystem_argument(): try: - from fsspec.implementations.memory import MemoryFileSystem + from fsspec.implementations.memory import ( # type: ignore[import-untyped] + MemoryFileSystem) except ImportError: pytest.skip("fsspec is not installed, skipping test") @@ -412,7 +414,7 @@ def test_parquet_file_hugginface_support(): pytest.skip("fsspec is not installed, skipping Hugging Face test") fake_hf_module = types.ModuleType("huggingface_hub") - fake_hf_module.HfFileSystem = MemoryFileSystem + fake_hf_module.HfFileSystem = MemoryFileSystem # type: ignore[attr-defined] with mock.patch.dict("sys.modules", {"huggingface_hub": fake_hf_module}): uri = "hf://datasets/apache/arrow/test.parquet" table = pa.table({"a": range(10)}) @@ -424,7 +426,7 @@ def test_parquet_file_hugginface_support(): def test_fsspec_uri_raises_if_fsspec_is_not_available(): # sadly cannot patch sys.modules because cython will still be able to import fsspec try: - import fsspec # noqa: F401 + import fsspec # type: ignore[import-untyped] # noqa: F401 except ImportError: pass else: diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index a49441f09f45..87787a0f3f00 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -23,9 +23,10 @@ try: import pyarrow.parquet as pq from pyarrow.tests.parquet.common import (_read_table, _test_dataframe, + # type: ignore[attr-defined] _test_table, _range_integers) except ImportError: - pq = None + pass try: @@ -33,7 +34,7 @@ import pandas.testing as tm except ImportError: - pd = tm = None + pass # Marks all of the tests in this module @@ -94,10 +95,10 @@ def test_parquet_invalid_writer(tempdir): # avoid segfaults with invalid construction with pytest.raises(TypeError): some_schema = pa.schema([pa.field("x", pa.int32())]) - pq.ParquetWriter(None, some_schema) + pq.ParquetWriter(None, some_schema) # type: ignore[arg-type] with pytest.raises(TypeError): - pq.ParquetWriter(tempdir / "some_path", None) + pq.ParquetWriter(tempdir / "some_path", None) # type: ignore[arg-type] @pytest.mark.pandas @@ -335,6 +336,7 @@ def test_parquet_writer_store_schema(tempdir): writer.write_table(table) meta = pq.read_metadata(path1) + assert 
meta.metadata is not None assert b'ARROW:schema' in meta.metadata assert meta.metadata[b'ARROW:schema'] @@ -357,6 +359,7 @@ def test_parquet_writer_append_key_value_metadata(tempdir): writer.add_key_value_metadata({'key2': '2', 'key3': '3'}) reader = pq.ParquetFile(path) metadata = reader.metadata.metadata + assert metadata is not None assert metadata[b'key1'] == b'1' assert metadata[b'key2'] == b'2' assert metadata[b'key3'] == b'3' diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 3c31650ddf94..9188d5d41cc2 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -17,31 +17,32 @@ import datetime import sys +from typing import Any -import pytest -import hypothesis as h -import hypothesis.strategies as st +import pytest # type: ignore[import-not-found] +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] try: - import hypothesis.extra.numpy as npst + import hypothesis.extra.numpy as npst # type: ignore[import-not-found] except ImportError: - npst = None + npst = None # type: ignore[assignment] try: - import hypothesis.extra.pytz as tzst + import hypothesis.extra.pytz as tzst # type: ignore[import-not-found] except ImportError: - tzst = None + tzst = None # type: ignore[assignment] try: import zoneinfo except ImportError: - zoneinfo = None + zoneinfo = None # type: ignore[assignment] if sys.platform == 'win32': try: - import tzdata # noqa:F401 + import tzdata # type: ignore[import-not-found, import-untyped] # noqa:F401 except ImportError: - zoneinfo = None + zoneinfo = None # type: ignore[assignment] try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa @@ -151,12 +152,12 @@ timezones = st.one_of(st.none(), st.timezones()) else: timezones = st.none() -timestamp_types = st.builds( +timestamp_types: Any = st.builds( pa.timestamp, unit=st.sampled_from(['s', 'ms', 'us', 'ns']), tz=timezones ) -duration_types = st.builds( +duration_types: Any = st.builds( pa.duration, st.sampled_from(['s', 'ms', 'us', 'ns']) ) @@ -253,13 +254,13 @@ def schemas(type_strategy=primitive_types, max_fields=None): all_types = st.deferred( lambda: ( - primitive_types | - list_types() | - struct_types() | - dictionary_types() | - map_types() | - list_types(all_types) | - struct_types(all_types) + primitive_types + | list_types() + | struct_types() + | dictionary_types() + | map_types() + | list_types(all_types) # type: ignore[has-type] + | struct_types(all_types) # type: ignore[has-type] ) ) all_fields = st.one_of( @@ -303,6 +304,7 @@ def arrays(draw, type, size=None, nullable=True): elif not isinstance(size, int): raise TypeError('Size must be an integer') + assert npst is not None if pa.types.is_null(ty): h.assume(nullable) value = st.none() @@ -315,6 +317,7 @@ def arrays(draw, type, size=None, nullable=True): values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,))) # Workaround ARROW-4952: no easy way to assert array equality # in a NaN-tolerant way. 
+ assert np is not None values[np.isnan(values)] = -42.0 return pa.array(values, type=ty) elif pa.types.is_decimal(ty): @@ -340,9 +343,11 @@ def arrays(draw, type, size=None, nullable=True): offset = ty.tz.split(":") offset_hours = int(offset[0]) offset_min = int(offset[1]) - tz = datetime.timedelta(hours=offset_hours, minutes=offset_min) + tz = datetime.timezone( + datetime.timedelta(hours=offset_hours, minutes=offset_min) + ) except ValueError: - tz = zoneinfo.ZoneInfo(ty.tz) + tz = zoneinfo.ZoneInfo(str(ty.tz)) value = st.datetimes(timezones=st.just(tz), min_value=min_datetime, max_value=max_datetime) elif pa.types.is_duration(ty): @@ -501,7 +506,9 @@ def pandas_compatible_list_types( dictionary_types( value_strategy=pandas_compatible_dictionary_value_types ), - pandas_compatible_list_types(pandas_compatible_types), - struct_types(pandas_compatible_types) + pandas_compatible_list_types( + pandas_compatible_types # type: ignore[has-type] + ), + struct_types(pandas_compatible_types) # type: ignore[has-type] ) ) diff --git a/python/pyarrow/tests/test_acero.py b/python/pyarrow/tests/test_acero.py index cb97e3849fd5..1285534d08aa 100644 --- a/python/pyarrow/tests/test_acero.py +++ b/python/pyarrow/tests/test_acero.py @@ -16,6 +16,7 @@ # under the License. import pytest +from typing import Literal, cast import pyarrow as pa import pyarrow.compute as pc @@ -37,9 +38,10 @@ try: import pyarrow.dataset as ds - from pyarrow.acero import ScanNodeOptions + from pyarrow._dataset import ScanNodeOptions except ImportError: - ds = None + ds = None # type: ignore[assignment] + ScanNodeOptions = None # type: ignore[assignment, misc] pytestmark = pytest.mark.acero @@ -53,7 +55,6 @@ def table_source(): def test_declaration(): - table = pa.table({'a': [1, 2, 3], 'b': [4, 5, 6]}) table_opts = TableSourceNodeOptions(table) filter_opts = FilterNodeOptions(field('a') > 1) @@ -89,7 +90,8 @@ def test_declaration_to_reader(table_source): def test_table_source(): with pytest.raises(TypeError): - TableSourceNodeOptions(pa.record_batch([pa.array([1, 2, 3])], ["a"])) + TableSourceNodeOptions(pa.record_batch( + [pa.array([1, 2, 3])], ["a"])) table_source = TableSourceNodeOptions(None) decl = Declaration("table_source", table_source) @@ -110,9 +112,9 @@ def test_filter(table_source): # requires a pyarrow Expression with pytest.raises(TypeError): - FilterNodeOptions(pa.array([True, False, True])) + FilterNodeOptions(pa.array([True, False, True])) # type: ignore[arg-type] with pytest.raises(TypeError): - FilterNodeOptions(None) + FilterNodeOptions(None) # type: ignore[arg-type] @pytest.mark.parametrize('source', [ @@ -267,19 +269,23 @@ def test_order_by(): table = pa.table({'a': [1, 2, 3, 4], 'b': [1, 3, None, 2]}) table_source = Declaration("table_source", TableSourceNodeOptions(table)) - ord_opts = OrderByNodeOptions([("b", "ascending")]) + sort_keys = [("b", "ascending")] + sort_keys = cast(list[tuple[str, Literal["ascending", "descending"]]], sort_keys) + ord_opts = OrderByNodeOptions(sort_keys) decl = Declaration.from_sequence([table_source, Declaration("order_by", ord_opts)]) result = decl.to_table() expected = pa.table({"a": [1, 4, 2, 3], "b": [1, 2, 3, None]}) assert result.equals(expected) - ord_opts = OrderByNodeOptions([(field("b"), "descending")]) + ord_opts = OrderByNodeOptions( + [(field("b"), "descending")]) # type: ignore[arg-type] decl = Declaration.from_sequence([table_source, Declaration("order_by", ord_opts)]) result = decl.to_table() expected = pa.table({"a": [2, 4, 1, 3], "b": [3, 2, 1, None]}) assert 
result.equals(expected) - ord_opts = OrderByNodeOptions([(1, "descending")], null_placement="at_start") + ord_opts = OrderByNodeOptions( + [(1, "descending")], null_placement="at_start") # type: ignore[arg-type] decl = Declaration.from_sequence([table_source, Declaration("order_by", ord_opts)]) result = decl.to_table() expected = pa.table({"a": [3, 2, 4, 1], "b": [None, 3, 2, 1]}) @@ -294,10 +300,12 @@ def test_order_by(): _ = decl.to_table() with pytest.raises(ValueError, match="\"decreasing\" is not a valid sort order"): - _ = OrderByNodeOptions([("b", "decreasing")]) + _ = OrderByNodeOptions([("b", "decreasing")]) # type: ignore[arg-type] with pytest.raises(ValueError, match="\"start\" is not a valid null placement"): - _ = OrderByNodeOptions([("b", "ascending")], null_placement="start") + _ = OrderByNodeOptions( + [("b", "ascending")], null_placement="start" # type: ignore[arg-type] + ) def test_hash_join(): @@ -382,7 +390,9 @@ def test_hash_join_with_residual_filter(): # test filter expression referencing columns from both side join_opts = HashJoinNodeOptions( "left outer", left_keys="key", right_keys="key", - filter_expression=pc.equal(pc.field("a"), 5) | pc.equal(pc.field("b"), 10) + filter_expression=( + pc.equal(pc.field("a"), 5) + | pc.equal(pc.field("b"), 10)) # type: ignore[reportOperatorIssue] ) joined = Declaration( "hashjoin", options=join_opts, inputs=[left_source, right_source]) @@ -462,6 +472,8 @@ def test_asof_join(): @pytest.mark.dataset def test_scan(tempdir): + assert ds is not None + assert ScanNodeOptions is not None table = pa.table({'a': [1, 2, 3], 'b': [4, 5, 6]}) ds.write_dataset(table, tempdir / "dataset", format="parquet") dataset = ds.dataset(tempdir / "dataset", format="parquet") @@ -486,11 +498,10 @@ def test_scan(tempdir): assert decl.to_table().num_rows == 0 # projection scan option - scan_opts = ScanNodeOptions(dataset, columns={"a2": pc.multiply(field("a"), 2)}) decl = Declaration("scan", scan_opts) result = decl.to_table() # "a" is included in the result (needed later on for the actual projection) assert result["a"].to_pylist() == [1, 2, 3] # "b" is still included, but without data as it will be removed by the projection - assert pc.all(result["b"].is_null()).as_py() + assert pc.all(result.column("b").is_null()).as_py() diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py index 76a766984dab..9f61bc7ddfea 100644 --- a/python/pyarrow/tests/test_adhoc_memory_leak.py +++ b/python/pyarrow/tests/test_adhoc_memory_leak.py @@ -20,7 +20,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.util as test_util diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d09d9f45c7d0..8a257ca48d64 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -18,19 +18,23 @@ from collections.abc import Iterable import datetime import decimal -import hypothesis as h -import hypothesis.strategies as st +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] import itertools -import pytest +import pytest # type: ignore[import-not-found] import struct import subprocess import sys import weakref +from typing import TYPE_CHECKING -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None +else: + try: + import numpy as np + except ImportError: + np = None import pyarrow as pa import pyarrow.tests.strategies as past 
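
The test_array.py hunk above converts the optional numpy import to the TYPE_CHECKING pattern this patch applies across the test suite: the type checker always resolves the real module, while the runtime fallback to None is kept for environments without the dependency, and tests that need it stay behind their pytest markers (e.g. @pytest.mark.numpy). A minimal, self-contained sketch of that pattern follows; the helper name and its body are illustrative only, not part of the patch.

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Static analysis always sees the real module, so attribute access
    # on `np` is fully typed and never reported as Optional.
    import numpy as np
else:
    try:
        import numpy as np
    except ImportError:
        # At runtime the dependency may be absent; dependent tests are
        # skipped via their markers rather than via this sentinel.
        np = None


def make_zeros(n: int):
    # Runtime guard mirroring the `assert np is not None` narrowing
    # used in the tests before touching the optional module.
    assert np is not None
    return np.zeros(n, dtype=np.int8)
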
@@ -71,7 +75,7 @@ def test_constructor_raises(): # This could happen by wrong capitalization. # ARROW-2638: prevent calling extension class constructors directly with pytest.raises(TypeError): - pa.Array([1, 2]) + pa.Array([1, 2]) # type: ignore[reportCallIssue] def test_list_format(): @@ -321,11 +325,11 @@ def test_asarray(): arr = pa.array(range(4)) - # The iterator interface gives back an array of Int64Value's + # The iterator interface gives back an array of Int64Type's np_arr = np.asarray([_ for _ in arr]) assert np_arr.tolist() == [0, 1, 2, 3] assert np_arr.dtype == np.dtype('O') - assert isinstance(np_arr[0], pa.lib.Int64Value) + assert isinstance(np_arr[0], pa.lib.Int64Type) # Calling with the arrow array gives back an array with 'int64' dtype np_arr = np.asarray(arr) @@ -649,8 +653,8 @@ def test_array_eq(): @pytest.mark.numpy def test_array_from_buffers(): - values_buf = pa.py_buffer(np.int16([4, 5, 6, 7])) - nulls_buf = pa.py_buffer(np.uint8([0b00001101])) + values_buf = pa.py_buffer(np.array([4, 5, 6, 7], dtype=np.int16())) + nulls_buf = pa.py_buffer(np.array([0b00001101], dtype=np.uint8())) arr = pa.Array.from_buffers(pa.int16(), 4, [nulls_buf, values_buf]) assert arr.type == pa.int16() assert arr.to_pylist() == [4, None, 6, 7] @@ -665,7 +669,9 @@ def test_array_from_buffers(): assert arr.to_pylist() == [None, 6, 7] with pytest.raises(TypeError): - pa.Array.from_buffers(pa.int16(), 3, ['', ''], offset=1) + pa.Array.from_buffers( + pa.int16(), 3, ['', ''], offset=1 # type: ignore[reportArgumentType] + ) def test_string_binary_from_buffers(): @@ -859,7 +865,8 @@ def test_struct_array_from_chunked(): chunked_arr = pa.chunked_array([[1, 2, 3], [4, 5, 6]]) with pytest.raises(TypeError, match="Expected Array"): - pa.StructArray.from_arrays([chunked_arr], ["foo"]) + pa.StructArray.from_arrays( + [chunked_arr], ["foo"]) # type: ignore[reportArgumentType] @pytest.mark.parametrize("offset", (0, 1)) @@ -1179,24 +1186,24 @@ def test_map_from_arrays(): keys = pa.array(pykeys, type='binary') items = pa.array(pyitems, type='i4') - result = pa.MapArray.from_arrays(offsets, keys, items) + result = pa.MapArray.from_arrays(offsets, keys, items) # type: ignore[arg-type] expected = pa.array(pyentries, type=pa.map_(pa.binary(), pa.int32())) assert result.equals(expected) # pass in the type explicitly - result = pa.MapArray.from_arrays(offsets, keys, items, pa.map_( - keys.type, - items.type - )) + result = pa.MapArray.from_arrays(offsets, keys, items, # type: ignore[arg-type] + pa.map_(keys.type, items.type)) assert result.equals(expected) # pass in invalid types with pytest.raises(pa.ArrowTypeError, match='Expected map type, got string'): - pa.MapArray.from_arrays(offsets, keys, items, pa.string()) + pa.MapArray.from_arrays( + offsets, keys, items, pa.string() # type: ignore[arg-type] + ) with pytest.raises(pa.ArrowTypeError, match='Mismatching map items type'): - pa.MapArray.from_arrays(offsets, keys, items, pa.map_( + pa.MapArray.from_arrays(offsets, keys, items, pa.map_( # type: ignore[arg-type] keys.type, # Larger than the original i4 pa.int64() @@ -1234,7 +1241,7 @@ def test_map_from_arrays(): # error if null bitmap and offsets with nulls passed msg1 = 'Ambiguous to specify both validity map and offsets with nulls' with pytest.raises(pa.ArrowInvalid, match=msg1): - pa.MapArray.from_arrays(offsets, keys, items, pa.map_( + pa.MapArray.from_arrays(offsets, keys, items, pa.map_( # type: ignore[arg-type] keys.type, items.type), mask=pa.array([False, True, False], type=pa.bool_()) @@ -2718,7 +2725,7 
@@ def test_interval_array_from_relativedelta(): assert arr.type == pa.month_day_nano_interval() expected_list = [ None, - pa.MonthDayNano([13, 8, + pa.MonthDayNano([13, 8, # type: ignore[arg-type] (datetime.timedelta(seconds=1, microseconds=1, minutes=1, hours=1) // datetime.timedelta(microseconds=1)) * 1000])] @@ -2751,7 +2758,7 @@ def test_interval_array_from_tuple(): assert arr.type == pa.month_day_nano_interval() expected_list = [ None, - pa.MonthDayNano([1, 2, -3])] + pa.MonthDayNano([1, 2, -3])] # type: ignore[arg-type] expected = pa.array(expected_list) assert arr.equals(expected) assert arr.to_pylist() == expected_list @@ -2772,8 +2779,8 @@ def test_interval_array_from_dateoffset(): assert arr.type == pa.month_day_nano_interval() expected_list = [ None, - pa.MonthDayNano([13, 8, 3661000001001]), - pa.MonthDayNano([0, 0, 0])] + pa.MonthDayNano([13, 8, 3661000001001]), # type: ignore[arg-type] + pa.MonthDayNano([0, 0, 0])] # type: ignore[arg-type] expected = pa.array(expected_list) assert arr.equals(expected) expected_from_pandas = [ @@ -2937,7 +2944,7 @@ def test_buffers_primitive(): # Slicing does not affect the buffers but the offset a_sliced = a[1:] buffers = a_sliced.buffers() - a_sliced.offset == 1 + assert a_sliced.offset == 1 assert len(buffers) == 2 null_bitmap = buffers[0].to_pybytes() assert 1 <= len(null_bitmap) <= 64 # XXX this is varying @@ -2945,7 +2952,7 @@ def test_buffers_primitive(): assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4) - a = pa.array(np.int8([4, 5, 6])) + a = pa.array(np.array([4, 5, 6], dtype=np.int8)) buffers = a.buffers() assert len(buffers) == 2 # No null bitmap from Numpy int array @@ -3031,7 +3038,7 @@ def test_nbytes_size(): def test_invalid_tensor_constructor_repr(): # ARROW-2638: prevent calling extension class constructors directly with pytest.raises(TypeError): - repr(pa.Tensor([1])) + repr(pa.Tensor([1])) # type: ignore[reportCallIssue] def test_invalid_tensor_construction(): @@ -3549,7 +3556,7 @@ def test_array_supported_masks(): with pytest.raises(pa.ArrowTypeError): arr = pa.array([4, None, 4, 3], - mask=[1.0, 2.0, 3.0, 4.0]) + mask=[1.0, 2.0, 3.0, 4.0]) # type: ignore[reportArgumentType] with pytest.raises(pa.ArrowTypeError): arr = pa.array([4, None, 4, 3], @@ -3836,11 +3843,11 @@ def test_concat_array_invalid_type(): # ARROW-9920 - do not segfault on non-array input with pytest.raises(TypeError, match="should contain Array objects"): - pa.concat_arrays([None]) + pa.concat_arrays([None]) # type: ignore[reportArgumentType] arr = pa.chunked_array([[0, 1], [3, 4]]) with pytest.raises(TypeError, match="should contain Array objects"): - pa.concat_arrays(arr) + pa.concat_arrays(arr) # type: ignore[reportArgumentType] @pytest.mark.pandas @@ -4369,7 +4376,7 @@ def test_non_cpu_array(): with pytest.raises(NotImplementedError): [i for i in iter(arr)] with pytest.raises(NotImplementedError): - arr == arr2 + _ = arr == arr2 with pytest.raises(NotImplementedError): arr.is_null() with pytest.raises(NotImplementedError): diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 481c387d5337..f8abec902694 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -24,7 +24,7 @@ try: from pyarrow.cffi import ffi except ImportError: - ffi = None + pass import pytest @@ -32,7 +32,7 @@ import pandas as pd import pandas.testing as tm except ImportError: - pd = tm = None + pd = None # type: ignore[assignment] needs_cffi = pytest.mark.skipif(ffi is None, @@ -148,7 +148,7 @@ def 
test_export_import_type(): # Invalid format string pa.int32()._export_to_c(ptr_schema) bad_format = ffi.new("char[]", b"zzz") - c_schema.format = bad_format + c_schema.format = bad_format # type: ignore[attr-defined] with pytest.raises(ValueError, match="Invalid or unsupported format string"): pa.DataType._import_from_c(ptr_schema) @@ -248,9 +248,9 @@ def test_export_import_device_array(): arr = pa.array([[1], [2, 42]], type=pa.list_(pa.int32())) arr._export_to_c_device(ptr_array) - assert c_array.device_type == 1 # ARROW_DEVICE_CPU 1 - assert c_array.device_id == -1 - assert c_array.array.length == 2 + assert c_array.device_type == 1 # type: ignore[attr-defined] # ARROW_DEVICE_CPU 1 + assert c_array.device_id == -1 # type: ignore[attr-defined] + assert c_array.array.length == 2 # type: ignore[attr-defined] def check_export_import_schema(schema_factory, expected_schema_factory=None): @@ -310,9 +310,10 @@ def test_export_import_schema_float_pointer(): match = "Passing a pointer value as a float is unsafe" with pytest.warns(UserWarning, match=match): - make_schema()._export_to_c(float(ptr_schema)) + make_schema()._export_to_c(float(ptr_schema)) # type: ignore[arg-type] with pytest.warns(UserWarning, match=match): - schema_new = pa.Schema._import_from_c(float(ptr_schema)) + schema_new = pa.Schema._import_from_c( + float(ptr_schema)) # type: ignore[arg-type] assert schema_new == make_schema() @@ -405,9 +406,9 @@ def test_export_import_device_batch(): ptr_array = int(ffi.cast("uintptr_t", c_array)) batch = make_batch() batch._export_to_c_device(ptr_array) - assert c_array.device_type == 1 # ARROW_DEVICE_CPU 1 - assert c_array.device_id == -1 - assert c_array.array.length == 2 + assert c_array.device_type == 1 # type: ignore[attr-defined] # ARROW_DEVICE_CPU 1 + assert c_array.device_id == -1 # type: ignore[attr-defined] + assert c_array.array.length == 2 # type: ignore[attr-defined] def _export_import_batch_reader(ptr_stream, reader_factory): @@ -764,7 +765,7 @@ def test_import_device_no_cuda(): # patch the device type of the struct, this results in an invalid ArrowDeviceArray # but this is just to test we raise am error before actually importing buffers - c_array.device_type = 2 # ARROW_DEVICE_CUDA + c_array.device_type = 2 # type: ignore[attr-defined] # ARROW_DEVICE_CUDA with pytest.raises(ImportError, match="Trying to import data on a CUDA device"): pa.Array._import_from_c_device(ptr_array, arr.type) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 2ef14ff39be2..1682409193b6 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -31,12 +31,12 @@ try: import numpy as np except ImportError: - np = None + pass try: import pandas as pd except ImportError: - pd = None + pass import pyarrow as pa import pyarrow.compute as pc @@ -45,7 +45,7 @@ try: import pyarrow.substrait as pas except ImportError: - pas = None + pas = None # type: ignore[assignment] exported_functions = [ func for (name, func) in sorted(pc.__dict__.items()) @@ -329,9 +329,11 @@ def test_function_attributes(): def test_input_type_conversion(): # Automatic array conversion from Python arr = pc.add([1, 2], [4, None]) + assert isinstance(arr, pa.Array) assert arr.to_pylist() == [5, None] # Automatic scalar conversion from Python arr = pc.add([1, 2], 4) + assert isinstance(arr, pa.Array) assert arr.to_pylist() == [5, 6] # Other scalar type assert pc.equal(["foo", "bar", None], @@ -779,9 +781,11 @@ def test_min_max(): assert s.as_py() == {'min': 1, 
'max': 6} s = pc.min_max(data, options=pc.ScalarAggregateOptions()) assert s.as_py() == {'min': 1, 'max': 6} - s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True)) + s = pc.min_max(data, options=pc.ScalarAggregateOptions( + skip_nulls=True)) assert s.as_py() == {'min': 1, 'max': 6} - s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False)) + s = pc.min_max(data, options=pc.ScalarAggregateOptions( + skip_nulls=False)) assert s.as_py() == {'min': None, 'max': None} # Options as dict of kwargs @@ -799,11 +803,11 @@ def test_min_max(): # Wrong options type options = pc.TakeOptions() with pytest.raises(TypeError): - s = pc.min_max(data, options=options) + s = pc.min_max(data, options=options) # type: ignore[arg-type] # Missing argument with pytest.raises(TypeError, match="min_max takes 1 positional"): - s = pc.min_max() + s = pc.min_max() # type: ignore[call-arg] def test_any(): @@ -844,12 +848,12 @@ def test_all(): assert pc.all(a, options=options).as_py() is None a = pa.chunked_array([[True], [True, None]]) - assert pc.all(a).as_py() is True - assert pc.all(a, options=options).as_py() is None + assert pc.all(a).as_py() is True # type: ignore[arg-type] + assert pc.all(a, options=options).as_py() is None # type: ignore[arg-type] a = pa.chunked_array([[True], [False]]) - assert pc.all(a).as_py() is False - assert pc.all(a, options=options).as_py() is False + assert pc.all(a).as_py() is False # type: ignore[arg-type] + assert pc.all(a, options=options).as_py() is False # type: ignore[arg-type] def test_is_valid(): @@ -858,7 +862,7 @@ def test_is_valid(): assert pc.is_valid(data).to_pylist() == [True, True, False] with pytest.raises(TypeError): - pc.is_valid(data, options=None) + pc.is_valid(data, options=None) # type: ignore[call-arg] def test_generated_docstrings(): @@ -1069,21 +1073,6 @@ def find_new_unicode_codepoints(): 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, } -# utf8proc does not store if a codepoint is numeric -numeric_info_missing = { - 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, - 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, - 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, - 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, - 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, - 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a, - 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, - 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, - 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, - 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, - 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, - 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, - 0x10fcb, } # utf8proc has no no digit/numeric information digit_info_missing = { 0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, @@ -1102,6 +1091,7 @@ def find_new_unicode_codepoints(): 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, } +# utf8proc does not store if a codepoint is numeric numeric_info_missing = { 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, @@ -1136,7 +1126,7 @@ def test_string_py_compat_boolean(function_name, variant): py_name = function_name.replace('_', '') ignore = codepoints_ignore.get(function_name, set()) | \ find_new_unicode_codepoints() - for i in range(128 if ascii else 0x11000): + for i in range(128 if ascii else 0x11000): # type: ignore[truthy-function] if i in 
range(0xD800, 0xE000): continue # bug? pyarrow doesn't allow utf16 surrogates # the issues we know of, we skip @@ -1657,10 +1647,10 @@ def test_scatter(): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_array(typ): if typ == "array": - def con(values): + def con(values): # type: ignore[no-redef] return pa.array(values) else: - def con(values): + def con(values): # type: ignore[no-redef] return pa.chunked_array([values]) arr1 = con([1, 2, 3, 4, None]) @@ -1688,10 +1678,10 @@ def con(values): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_string_scalar(typ): if typ == "array": - def con(values): + def con(values): # type: ignore[no-redef] return pa.array(values) else: - def con(values): + def con(values): # type: ignore[no-redef] return pa.chunked_array([values]) arr = con(['a', 'b', 'c', None]) @@ -1725,10 +1715,10 @@ def con(values): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_scalar(typ): if typ == "array": - def con(values): + def con(values): # type: ignore[no-redef] return pa.array(values) else: - def con(values): + def con(values): # type: ignore[no-redef] return pa.chunked_array([values]) arr = con([1, 2, 3, None]) @@ -1821,8 +1811,9 @@ def test_round_to_integer(ty): "half_to_odd": [3, 3, 4, 5, -3, -3, -4, None], } for round_mode, expected in rmode_and_expected.items(): - options = RoundOptions(round_mode=round_mode) - result = round(values, options=options) + options = RoundOptions( + round_mode=round_mode) # type: ignore[arg-type] + result = round(values, options=options) # type: ignore[arg-type] expected_array = pa.array(expected, type=pa.float64()) assert expected_array.equals(result) @@ -1840,7 +1831,9 @@ def test_round(): for ndigits, expected in ndigits_and_expected.items(): options = pc.RoundOptions(ndigits, "half_towards_infinity") result = pc.round(values, options=options) - np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) + assert isinstance(result, pa.Array) + np.testing.assert_allclose( + result, pa.array(expected), equal_nan=True) assert pc.round(values, ndigits, round_mode="half_towards_infinity") == result assert pc.round(values, ndigits, "half_towards_infinity") == result @@ -1860,6 +1853,7 @@ def test_round_to_multiple(): for multiple, expected in multiple_and_expected.items(): options = pc.RoundToMultipleOptions(multiple, "half_towards_infinity") result = pc.round_to_multiple(values, options=options) + assert isinstance(result, pa.Array) np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) assert pc.round_to_multiple(values, multiple, "half_towards_infinity") == result @@ -1867,11 +1861,11 @@ def test_round_to_multiple(): for multiple in [0, -2, pa.scalar(-10.4)]: with pytest.raises(pa.ArrowInvalid, match="Rounding multiple must be positive"): - pc.round_to_multiple(values, multiple=multiple) + pc.round_to_multiple(values, multiple=multiple) # type: ignore[arg-type] for multiple in [object, 99999999999999999999999]: with pytest.raises(TypeError, match="is not a valid multiple type"): - pc.round_to_multiple(values, multiple=multiple) + pc.round_to_multiple(values, multiple=multiple) # type: ignore[arg-type] def test_round_binary(): @@ -2056,7 +2050,8 @@ def test_logical(): def test_dictionary_decode(): array = pa.array(["a", "a", "b", "c", "b"]) dictionary_array = array.dictionary_encode() - dictionary_array_decode = pc.dictionary_decode(dictionary_array) + dictionary_array_decode = pc.dictionary_decode( + dictionary_array) assert array 
!= dictionary_array @@ -2236,7 +2231,7 @@ def check_cast_float_to_decimal(float_ty, float_val, decimal_ty, decimal_ctx, # Round `expected` to `scale` digits after the decimal point expected = expected.quantize(decimal.Decimal(1).scaleb(-decimal_ty.scale)) s = pa.scalar(float_val, type=float_ty) - actual = pc.cast(s, decimal_ty).as_py() + actual = pc.cast(s, decimal_ty).as_py() # type: ignore[union-attr] if actual != expected: # Allow the last digit to vary. The tolerance is higher for # very high precisions as rounding errors can accumulate in @@ -2328,8 +2323,9 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): expected = decimal.Decimal(mantissa) / 2**-float_exp expected_as_int = round(expected.scaleb(scale)) actual = pc.cast( - pa.scalar(float_val, type=float_ty), decimal_ty).as_py() - actual_as_int = round(actual.scaleb(scale)) + pa.scalar(float_val, type=float_ty), decimal_ty + ).as_py() # type: ignore[union-attr] + actual_as_int = round(actual.scaleb(scale)) # type: ignore[union-attr] # We allow for a minor rounding error between expected and actual assert abs(actual_as_int - expected_as_int) <= 1 @@ -2365,7 +2361,7 @@ def test_strptime(): @pytest.mark.pandas @pytest.mark.timezone_data def test_strftime(): - times = ["2018-03-10 09:00", "2038-01-31 12:23", None] + times: list[str | None] = ["2018-03-10 09:00", "2038-01-31 12:23", None] timezones = ["CET", "UTC", "Europe/Ljubljana"] formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H", "%I", @@ -2375,14 +2371,15 @@ def test_strftime(): formats.extend(["%c", "%x", "%X"]) for timezone in timezones: - ts = pd.to_datetime(times).tz_localize(timezone) + ts = pd.to_datetime(times).tz_localize(timezone) # type: ignore[no-matching-overload] for unit in ["s", "ms", "us", "ns"]: tsa = pa.array(ts, type=pa.timestamp(unit, timezone)) for fmt in formats: options = pc.StrftimeOptions(fmt) result = pc.strftime(tsa, options=options) + st = ts.strftime(fmt) # type: ignore[call-non-callable] # cast to the same type as result to ignore string vs large_string - expected = pa.array(ts.strftime(fmt)).cast(result.type) + expected = pa.array(st).cast(result.type) assert result.equals(expected) fmt = "%Y-%m-%dT%H:%M:%S" @@ -2390,42 +2387,48 @@ def test_strftime(): # Default format tsa = pa.array(ts, type=pa.timestamp("s", timezone)) result = pc.strftime(tsa, options=pc.StrftimeOptions()) - expected = pa.array(ts.strftime(fmt)).cast(result.type) + st = ts.strftime(fmt) # type: ignore[call-non-callable] + expected = pa.array(st).cast(result.type) assert result.equals(expected) # Default format plus timezone tsa = pa.array(ts, type=pa.timestamp("s", timezone)) result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) - expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type) + st = ts.strftime(fmt + "%Z") # type: ignore[call-non-callable] + expected = pa.array(st).cast(result.type) assert result.equals(expected) # Pandas %S is equivalent to %S in arrow for unit="s" tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions("%S") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime("%S")).cast(result.type) + st = ts.strftime("%S") # type: ignore[call-non-callable] + expected = pa.array(st).cast(result.type) assert result.equals(expected) # Pandas %S.%f is equivalent to %S in arrow for unit="us" tsa = pa.array(ts, type=pa.timestamp("us", timezone)) options = pc.StrftimeOptions("%S") result = pc.strftime(tsa, options=options) - expected = 
pa.array(ts.strftime("%S.%f")).cast(result.type) + st = ts.strftime("%S.%f") # type: ignore[call-non-callable] + expected = pa.array(st).cast(result.type) assert result.equals(expected) # Test setting locale tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions(fmt, locale="C") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime(fmt)).cast(result.type) + st = ts.strftime(fmt) # type: ignore[call-non-callable] + expected = pa.array(st).cast(result.type) assert result.equals(expected) # Test timestamps without timezone fmt = "%Y-%m-%dT%H:%M:%S" - ts = pd.to_datetime(times) + ts = pd.to_datetime(times) # type: ignore[no-matching-overload] tsa = pa.array(ts, type=pa.timestamp("s")) result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) - expected = pa.array(ts.strftime(fmt)).cast(result.type) + st = ts.strftime(fmt) # type: ignore[call-non-callable] + expected = pa.array(st).cast(result.type) # Positional format assert pc.strftime(tsa, fmt) == result @@ -2554,10 +2557,11 @@ def test_extract_datetime_components(request): def test_offset_timezone(): - arr = pc.strptime(["2012-12-12T12:12:12"], format="%Y-%m-%dT%H:%M:%S", unit="s") + arr = pc.strptime(pa.array(["2012-12-12T12:12:12"]), + format="%Y-%m-%dT%H:%M:%S", unit="s") zoned_arr = arr.cast(pa.timestamp("s", tz="+05:30")) - assert pc.hour(zoned_arr)[0].as_py() == 17 - assert pc.minute(zoned_arr)[0].as_py() == 42 + assert pc.hour(zoned_arr)[0].as_py() == 17 # type: ignore[index,arg-type] + assert pc.minute(zoned_arr)[0].as_py() == 42 # type: ignore[index,arg-type] @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) @@ -2654,12 +2658,14 @@ def test_assume_timezone(): f"timezone '{timezone}'"): pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise) - expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True]) + expected = ambiguous.tz_localize( + timezone, ambiguous=np.array([True, True, True])) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_earliest) result.equals(pa.array(expected)) - expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False]) + expected = ambiguous.tz_localize( + timezone, ambiguous=np.array([False, False, False])) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_latest) result.equals(pa.array(expected)) @@ -2748,7 +2754,9 @@ def _check_temporal_rounding(ts, values, unit): expected = np.where( expected == ts, - expected + pd.Timedelta(value, unit_shorthand[unit]), + expected + pd.Timedelta( + value, unit_shorthand[unit] # type: ignore[arg-type] + ), expected) np.testing.assert_array_equal(result, expected) @@ -2810,7 +2818,7 @@ def test_count(): with pytest.raises(ValueError, match='"something else" is not a valid count mode'): - pc.count(arr, 'something else') + pc.count(arr, 'something else') # type: ignore[arg-type] def test_index(): @@ -2860,7 +2868,7 @@ def test_partition_nth(): with pytest.raises( ValueError, match="'partition_nth_indices' cannot be called without options"): - pc.partition_nth_indices(data) + pc.partition_nth_indices(data) # type: ignore[call-arg] def test_partition_nth_null_placement(): @@ -2982,7 +2990,7 @@ def test_array_sort_indices(): assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="not a valid sort order"): - pc.array_sort_indices(arr, order="nonscending") + pc.array_sort_indices(arr, order="nonscending") # type: ignore[arg-type] def test_sort_indices_array(): @@ -3045,23 +3053,29 @@ def test_sort_indices_table(): 
pc.sort_indices(table, sort_keys=[("unknown", "ascending")]) with pytest.raises(ValueError, match="not a valid sort order"): - pc.sort_indices(table, sort_keys=[("a", "nonscending")]) + pc.sort_indices( + table, sort_keys=[("a", "nonscending")] # type: ignore[list-item] + ) def test_is_in(): arr = pa.array([1, 2, None, 1, 2, 3]) result = pc.is_in(arr, value_set=pa.array([1, 3, None])) - assert result.to_pylist() == [True, False, True, True, False, True] + assert result.to_pylist() == [True, False, True, True, + False, True] result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True) - assert result.to_pylist() == [True, False, False, True, False, True] + assert result.to_pylist() == [True, False, False, True, + False, True] result = pc.is_in(arr, value_set=pa.array([1, 3])) - assert result.to_pylist() == [True, False, False, True, False, True] + assert result.to_pylist() == [True, False, False, True, + False, True] result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) - assert result.to_pylist() == [True, False, False, True, False, True] + assert result.to_pylist() == [True, False, False, True, + False, True] def test_index_in(): @@ -3125,7 +3139,7 @@ def test_quantile(): with pytest.raises(ValueError, match="Quantile must be between 0 and 1"): pc.quantile(arr, q=1.1) with pytest.raises(ValueError, match="not a valid quantile interpolation"): - pc.quantile(arr, interpolation='zzz') + pc.quantile(arr, interpolation='zzz') # type: ignore[arg-type] def test_tdigest(): @@ -3234,12 +3248,13 @@ def test_cumulative_sum(start, skip_nulls): # Add `start` offset to expected array before comparing expected = pc.add(expected_arrays[i], strt if strt is not None else 0) + assert isinstance(expected, pa.Array) np.testing.assert_array_almost_equal(result.to_numpy( zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_sum([1, 2, 3], start=strt) + pc.cumulative_sum([1, 2, 3], start=strt) # type: ignore[arg-type] @pytest.mark.numpy @@ -3289,6 +3304,7 @@ def test_cumulative_prod(start, skip_nulls): # Multiply `start` offset to expected array before comparing expected = pc.multiply(expected_arrays[i], strt if strt is not None else 1) + assert isinstance(expected, pa.Array) np.testing.assert_array_almost_equal(result.to_numpy( zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) @@ -3347,8 +3363,10 @@ def test_cumulative_max(start, skip_nulls): expected = pc.max_element_wise( expected_arrays[i], strt if strt is not None else -1e9, skip_nulls=False) - np.testing.assert_array_almost_equal(result.to_numpy( - zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) + np.testing.assert_array_almost_equal( + result.to_numpy(zero_copy_only=False), + expected.to_numpy(zero_copy_only=False) + ) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): @@ -3405,8 +3423,10 @@ def test_cumulative_min(start, skip_nulls): expected = pc.min_element_wise( expected_arrays[i], strt if strt is not None else 1e9, skip_nulls=False) - np.testing.assert_array_almost_equal(result.to_numpy( - zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) + np.testing.assert_array_almost_equal( + result.to_numpy(zero_copy_only=False), + expected.to_numpy(zero_copy_only=False) + ) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): @@ -3484,7 +3504,7 @@ def test_struct_fields_options(): pc.struct_field(arr, '.a.foo') with 
pytest.raises(pa.ArrowInvalid, match="cannot be called without options"): - pc.struct_field(arr) + pc.struct_field(arr) # type: ignore[call-arg] def test_case_when(): @@ -3536,7 +3556,7 @@ def test_utf8_normalize(): with pytest.raises( ValueError, match='"NFZ" is not a valid Unicode normalization form'): - pc.utf8_normalize(arr, form="NFZ") + pc.utf8_normalize(arr, form="NFZ") # type: ignore[arg-type] def test_random(): @@ -3563,7 +3583,7 @@ def test_random(): with pytest.raises(TypeError, match=r"initializer should be 'system', an integer, " r"or a hashable object; got \[\]"): - pc.random(100, initializer=[]) + pc.random(100, initializer=[]) # type: ignore[arg-type] @pytest.mark.parametrize( @@ -3613,7 +3633,7 @@ def test_rank_options(): match=r'"NonExisting" is not a valid tiebreaker'): pc.RankOptions(sort_keys="descending", null_placement="at_end", - tiebreaker="NonExisting") + tiebreaker="NonExisting") # type: ignore[arg-type] def test_rank_quantile_options(): @@ -3643,7 +3663,7 @@ def test_rank_quantile_options(): assert result.equals(expected_descending) with pytest.raises(ValueError, match="not a valid sort order"): - pc.rank_quantile(arr, sort_keys="XXX") + pc.rank_quantile(arr, sort_keys="XXX") # type: ignore[arg-type] def test_rank_normal_options(): @@ -3829,21 +3849,21 @@ def test_expression_construction(): nested_field = pc.field(("nested", "field")) nested_field2 = pc.field("nested", "field") - zero | one == string - ~true == false + _ = zero | one == string + _ = ~true == false for typ in ("bool", pa.bool_()): - field.cast(typ) == true + _ = field.cast(typ) == true - field.isin([1, 2]) - nested_mixed_types.isin(["foo", "bar"]) + _ = field.isin([1, 2]) + _ = nested_mixed_types.isin(["foo", "bar"]) nested_field.isin(["foo", "bar"]) nested_field2.isin(["foo", "bar"]) with pytest.raises(TypeError): - field.isin(1) + field.isin(1) # type: ignore[arg-type] with pytest.raises(pa.ArrowInvalid): - field != object() + _ = field != object() def test_expression_boolean_operators(): @@ -3852,16 +3872,16 @@ def test_expression_boolean_operators(): false = pc.scalar(False) with pytest.raises(ValueError, match="cannot be evaluated to python True"): - true and false + _ = true and false with pytest.raises(ValueError, match="cannot be evaluated to python True"): - true or false + _ = true or false with pytest.raises(ValueError, match="cannot be evaluated to python True"): bool(true) with pytest.raises(ValueError, match="cannot be evaluated to python True"): - not true + _ = not true def test_expression_call_function(): @@ -3890,7 +3910,7 @@ def test_cast_table_raises(): table = pa.table({'a': [1, 2]}) with pytest.raises(pa.lib.ArrowTypeError): - pc.cast(table, pa.int64()) + pc.cast(table, pa.int64()) # type: ignore[arg-type] @pytest.mark.parametrize("start,stop,expected", ( diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index c10ae0f62b41..6e48a4ff0763 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -21,13 +21,18 @@ import itertools import math import re +from typing import TYPE_CHECKING, cast import hypothesis as h import pytest -try: + +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None +else: + try: + import numpy as np + except ImportError: + np = None from pyarrow.pandas_compat import _pandas_api # noqa import pyarrow as pa @@ -66,7 +71,7 @@ def __int__(self): class MyBrokenInt: def __int__(self): - 1/0 # MARKER + _ = 1/0 # MARKER def 
test_iterable_types(): @@ -137,7 +142,7 @@ def test_object_with_getitem(): # https://github.com/apache/arrow/issues/34944 # considered as sequence because of __getitem__, but has no length with pytest.raises(TypeError, match="has no len()"): - pa.array(ObjectWithOnlyGetitem()) + pa.array(ObjectWithOnlyGetitem()) # type: ignore[arg-type] def _as_list(xs): @@ -845,7 +850,7 @@ def test_large_binary_value(ty): assert isinstance(arr, pa.Array) assert arr.type == ty assert len(arr) == 4 - buf = arr[1].as_buffer() + buf = cast(pa.FixedSizeBinaryScalar, arr[1]).as_buffer() assert len(buf) == len(s) * nrepeats @@ -1091,11 +1096,11 @@ def expected_datetime_value(dt): ), ] utcdata = [ - pytz.utc.localize(data[0]), + pytz.utc.localize(cast(datetime.datetime, data[0])), data[1], None, - data[3].astimezone(pytz.utc), - data[4].astimezone(pytz.utc), + cast(datetime.datetime, data[3]).astimezone(pytz.utc), + cast(datetime.datetime, data[4]).astimezone(pytz.utc), ] ty = pa.timestamp(unit, tz=timezone) @@ -1223,9 +1228,9 @@ def test_sequence_timestamp_from_mixed_builtin_and_pandas_datetimes(): None, ] utcdata = [ - data[0].astimezone(pytz.utc), - pytz.utc.localize(data[1]), - data[2].astimezone(pytz.utc), + cast(datetime.datetime, data[0]).astimezone(pytz.utc), + pytz.utc.localize(cast(datetime.datetime, data[1])), + cast(datetime.datetime, data[2]).astimezone(pytz.utc), None, ] @@ -2103,8 +2108,8 @@ def test_map_from_dicts(): assert arr.to_pylist() == expected # With omitted values - data[1] = None - expected[1] = None + data[1] = None # type: ignore[call-overload] + expected[1] = None # type: ignore[call-overload] arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32())) @@ -2429,6 +2434,7 @@ def test_nested_auto_chunking(ty, char): } +@pytest.mark.numpy @pytest.mark.large_memory def test_array_from_pylist_data_overflow(): # Regression test for ARROW-12983 @@ -2451,6 +2457,7 @@ def test_array_from_pylist_data_overflow(): assert len(arr.chunks) > 1 +@pytest.mark.numpy @pytest.mark.slow @pytest.mark.large_memory def test_array_from_pylist_offset_overflow(): @@ -2475,6 +2482,7 @@ def test_array_from_pylist_offset_overflow(): assert len(arr.chunks) > 1 +@pytest.mark.numpy @parametrize_with_collections_types @pytest.mark.parametrize(('data', 'scalar_data', 'value_type'), [ ([True, False, None], [pa.scalar(True), pa.scalar(False), None], pa.bool_()), @@ -2512,8 +2520,10 @@ def test_array_from_pylist_offset_overflow(): pa.timestamp('us') ), ( - [pa.MonthDayNano([1, -1, -10100])], - [pa.scalar(pa.MonthDayNano([1, -1, -10100]))], + [pa.MonthDayNano([1, -1, -10100])], # type: ignore[call-arg, arg-type] + [pa.scalar( + pa.MonthDayNano([1, -1, -10100]) # type: ignore[call-arg, arg-type] + )], pa.month_day_nano_interval() ), (["a", "b"], [pa.scalar("a"), pa.scalar("b")], pa.string()), diff --git a/python/pyarrow/tests/test_cpp_internals.py b/python/pyarrow/tests/test_cpp_internals.py index 7508d8f0b981..7d652acf62f1 100644 --- a/python/pyarrow/tests/test_cpp_internals.py +++ b/python/pyarrow/tests/test_cpp_internals.py @@ -20,7 +20,8 @@ import pytest -from pyarrow._pyarrow_cpp_tests import get_cpp_tests +from pyarrow._pyarrow_cpp_tests import ( # type: ignore[import-not-found, import-untyped] # noqa: E501 + get_cpp_tests) def inject_cpp_tests(ns): diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index dce605c7156d..2c271fa9b1bf 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -178,6 +178,7 @@ def test_read_options(pickle_module): 
encoding='utf16', skip_rows_after_names=27) + assert opts.block_size is not None assert opts.block_size > 0 opts.block_size = 12345 assert opts.block_size == 12345 @@ -302,6 +303,7 @@ def test_convert_options(pickle_module): with pytest.raises(ValueError): opts.decimal_point = '..' + assert opts.auto_dict_max_cardinality is not None assert opts.auto_dict_max_cardinality > 0 opts.auto_dict_max_cardinality = 99999 assert opts.auto_dict_max_cardinality == 99999 @@ -323,7 +325,7 @@ def test_convert_options(pickle_module): with pytest.raises(TypeError, match='DataType expected'): opts.column_types = {'a': None} with pytest.raises(TypeError): - opts.column_types = 0 + opts.column_types = 0 # type: ignore[reportAttributeAccessIssue] assert isinstance(opts.null_values, list) assert '' in opts.null_values @@ -1158,10 +1160,14 @@ def test_auto_dict_encode(self): table = self.read_bytes(rows, convert_options=opts, validate_full=False) assert table.schema == schema - dict_values = table['a'].chunk(0).dictionary + column_chunk = table.column('a').chunk(0) + assert isinstance(column_chunk, pa.DictionaryArray) + dict_values = column_chunk.dictionary assert len(dict_values) == 2 assert dict_values[0].as_py() == "ab" - assert dict_values[1].as_buffer() == b"cd\xff" + dict_value = dict_values[1] + assert isinstance(dict_value, pa.StringScalar) + assert dict_value.as_buffer() == b"cd\xff" # With invalid UTF8, checked opts.check_utf8 = True @@ -1502,7 +1508,7 @@ def signal_from_thread(): # Interruption should have arrived timely assert last_duration <= 2.0 - e = exc_info.__context__ + e = exc_info.__context__ # type: ignore[possibly-missing-attribute, misc] assert isinstance(e, pa.ArrowCancelled) assert e.signum == signal.SIGINT @@ -1866,6 +1872,9 @@ def use_threads(self): class BaseTestCompressedCSVRead: + def write_file(self, path, contents): + pass + csv_filename = "" def setUp(self): self.tmpdir = tempfile.mkdtemp(prefix='arrow-csv-test-') @@ -1997,7 +2006,7 @@ def test_write_quoting_style(): except Exception as e: # This will trigger when we try to write a comma (,) # without quotes, which is invalid - assert isinstance(e, res) + assert isinstance(e, res) # type: ignore[invalid-argument-type] break assert buf.getvalue() == res buf.seek(0) diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py index e06f479987cb..9d03a3bbff2f 100644 --- a/python/pyarrow/tests/test_cuda.py +++ b/python/pyarrow/tests/test_cuda.py @@ -103,6 +103,7 @@ def make_random_buffer(size, target='host'): assert size >= 0 buf = pa.allocate_buffer(size) assert buf.size == size + assert isinstance(buf, pa.Buffer) arr = np.frombuffer(buf, dtype=np.uint8) assert arr.size == size arr[:] = np.random.randint(low=1, high=255, size=size, dtype=np.uint8) @@ -194,12 +195,14 @@ def test_context_device_buffer(size): np.testing.assert_equal(arr[soffset:soffset + ssize], arr2) # Creating a device buffer from a slice of an array - cudabuf = global_context.buffer_from_data(arr, offset=soffset, size=ssize) + cudabuf = global_context.buffer_from_data( + arr, offset=soffset, size=ssize) assert cudabuf.size == ssize arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8) np.testing.assert_equal(arr[soffset:soffset + ssize], arr2) - cudabuf = global_context.buffer_from_data(arr[soffset:soffset+ssize]) + cudabuf = global_context.buffer_from_data( + arr[soffset:soffset+ssize]) assert cudabuf.size == ssize arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8) np.testing.assert_equal(arr[soffset:soffset + ssize], arr2) @@ 
-235,7 +238,8 @@ def test_context_device_buffer(size): # Creating device buffer from HostBuffer slice - cudabuf = global_context.buffer_from_data(buf, offset=soffset, size=ssize) + cudabuf = global_context.buffer_from_data( + buf, offset=soffset, size=ssize) assert cudabuf.size == ssize arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8) np.testing.assert_equal(arr[soffset:soffset+ssize], arr2) @@ -384,7 +388,8 @@ def test_copy_from_to_host(size): device_buffer.copy_from_host(buf, position=0, nbytes=nbytes) # Copy back to host and compare contents - buf2 = device_buffer.copy_to_host(position=0, nbytes=nbytes) + buf2 = device_buffer.copy_to_host( + position=0, nbytes=nbytes) arr2 = np.frombuffer(buf2, dtype=dt) np.testing.assert_equal(arr, arr2) @@ -395,7 +400,8 @@ def test_copy_to_host(size): buf = dbuf.copy_to_host() assert buf.is_cpu - np.testing.assert_equal(arr, np.frombuffer(buf, dtype=np.uint8)) + np.testing.assert_equal(arr, np.frombuffer( + buf, dtype=np.uint8)) buf = dbuf.copy_to_host(position=size//4) assert buf.is_cpu @@ -437,11 +443,13 @@ def test_copy_to_host(size): np.frombuffer(buf, dtype=np.uint8)) dbuf.copy_to_host(buf=buf, nbytes=12) - np.testing.assert_equal(arr[:12], np.frombuffer(buf, dtype=np.uint8)[:12]) + np.testing.assert_equal(arr[:12], np.frombuffer( + buf, dtype=np.uint8)[:12]) dbuf.copy_to_host(buf=buf, nbytes=12, position=6) - np.testing.assert_equal(arr[6:6+12], - np.frombuffer(buf, dtype=np.uint8)[:12]) + np.testing.assert_equal( + arr[6:6+12], np.frombuffer(buf, dtype=np.uint8)[:12] + ) for (position, nbytes) in [ (0, size+10), (10, size-5), @@ -450,7 +458,8 @@ def test_copy_to_host(size): with pytest.raises(ValueError, match=('requested copy does not ' 'fit into host buffer')): - dbuf.copy_to_host(buf=buf, position=position, nbytes=nbytes) + dbuf.copy_to_host( + buf=buf, position=position, nbytes=nbytes) @pytest.mark.parametrize("dest_ctx", ['same', 'another']) @@ -460,7 +469,9 @@ def test_copy_from_device(dest_ctx, size): lst = arr.tolist() if dest_ctx == 'another': dest_ctx = global_context1 - if buf.context.device_number == dest_ctx.device_number: + if ( + buf.context.device_number == dest_ctx.device_number + ): pytest.skip("not a multi-GPU system") else: dest_ctx = buf.context @@ -563,7 +574,10 @@ def test_buffer_device(): _, buf = make_random_buffer(size=10, target='device') assert buf.device_type == pa.DeviceAllocationType.CUDA assert isinstance(buf.device, pa.Device) - assert buf.device == global_context.memory_manager.device + assert ( + buf.device == + global_context.memory_manager.device + ) assert isinstance(buf.memory_manager, pa.MemoryManager) assert not buf.is_cpu assert not buf.device.is_cpu @@ -807,8 +821,9 @@ def test_create_table_with_device_buffers(): def other_process_for_test_IPC(handle_buffer, expected_arr): - other_context = pa.cuda.Context(0) - ipc_handle = pa.cuda.IpcMemHandle.from_buffer(handle_buffer) + other_context = cuda.Context(0) + ipc_handle = cuda.IpcMemHandle.from_buffer( + handle_buffer) ipc_buf = other_context.open_ipc_buffer(ipc_handle) ipc_buf.context.synchronize() buf = ipc_buf.copy_to_host() @@ -848,7 +863,8 @@ def test_copy_to(): batch = pa.record_batch({"col": arr}) batch_cuda = batch.copy_to(dest) - buf_cuda = batch_cuda["col"].buffers()[1] + buf_cuda = batch_cuda.column("col").buffers()[1] + assert buf_cuda is not None assert not buf_cuda.is_cpu assert buf_cuda.device_type == pa.DeviceAllocationType.CUDA assert buf_cuda.device == mm_cuda.device @@ -949,7 +965,8 @@ def 
test_device_interface_batch_array(): cbatch._export_to_c_device(ptr_array, ptr_schema) # Delete and recreate C++ objects from exported pointers del cbatch - cbatch_new = pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema) + cbatch_new = pa.RecordBatch._import_from_c_device( + ptr_array, ptr_schema) assert cbatch_new.schema == schema batch_new = cbatch_new.copy_to(pa.default_cpu_memory_manager()) assert batch_new.equals(batch) @@ -957,13 +974,15 @@ def test_device_interface_batch_array(): del cbatch_new # Now released with pytest.raises(ValueError, match="Cannot import released ArrowSchema"): - pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema) + pa.RecordBatch._import_from_c_device( + ptr_array, ptr_schema) # Not a struct type pa.int32()._export_to_c(ptr_schema) with pytest.raises(ValueError, match="ArrowSchema describes non-struct type"): - pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema) + pa.RecordBatch._import_from_c_device( + ptr_array, ptr_schema) def test_print_array(): diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py b/python/pyarrow/tests/test_cuda_numba_interop.py index 876f3c7f761c..4a5bc7975333 100644 --- a/python/pyarrow/tests/test_cuda_numba_interop.py +++ b/python/pyarrow/tests/test_cuda_numba_interop.py @@ -28,7 +28,6 @@ from numba.cuda.cudadrv.devicearray import DeviceNDArray # noqa: E402 - context_choices = None context_choice_ids = ['pyarrow.cuda', 'numba.cuda'] @@ -62,17 +61,19 @@ def test_context(c): def make_random_buffer(size, target='host', dtype='uint8', ctx=None): """Return a host or device buffer with random data. """ - dtype = np.dtype(dtype) + assert np is not None + dtype_obj = np.dtype(dtype) if target == 'host': assert size >= 0 - buf = pa.allocate_buffer(size*dtype.itemsize) - arr = np.frombuffer(buf, dtype=dtype) + buf = pa.allocate_buffer(size*dtype_obj.itemsize) + arr = np.frombuffer(buf, dtype=dtype_obj) arr[:] = np.random.randint(low=0, high=255, size=size, dtype=np.uint8) return arr, buf elif target == 'device': arr, buf = make_random_buffer(size, target='host', dtype=dtype) - dbuf = ctx.new_buffer(size * dtype.itemsize) + assert ctx is not None + dbuf = ctx.new_buffer(size * dtype_obj.itemsize) dbuf.copy_from_host(buf, position=0, nbytes=buf.size) return arr, dbuf raise ValueError('invalid target value') @@ -161,8 +162,8 @@ def __cuda_array_interface__(self): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_numba_memalloc(c, dtype): + assert np is not None ctx, nb_ctx = context_choices[c] - dtype = np.dtype(dtype) # Allocate memory using numba context # Warning: this will not be reflected in pyarrow context manager # (e.g bytes_allocated does not change) @@ -198,6 +199,7 @@ def test_pyarrow_memalloc(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_numba_context(c, dtype): + assert np is not None ctx, nb_ctx = context_choices[c] size = 10 with nb_cuda.gpus[0]: @@ -209,7 +211,10 @@ def test_numba_context(c, dtype): np.testing.assert_equal(darr.copy_to_host(), arr) darr[0] = 99 cbuf.context.synchronize() - arr2 = np.frombuffer(cbuf.copy_to_host(), dtype=dtype) + arr2 = np.frombuffer( + cbuf.copy_to_host(), + dtype=np.dtype(dtype) + ) assert arr2[0] == 99 @@ -217,6 +222,7 @@ def test_numba_context(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_pyarrow_jit(c, dtype): + assert np is not None ctx, nb_ctx = context_choices[c] @nb_cuda.jit @@ -234,5 +240,8 @@ def 
increment_by_one(an_array): darr = DeviceNDArray(arr.shape, arr.strides, arr.dtype, gpu_data=mem) increment_by_one[blockspergrid, threadsperblock](darr) cbuf.context.synchronize() - arr1 = np.frombuffer(cbuf.copy_to_host(), dtype=arr.dtype) + arr1 = np.frombuffer( + cbuf.copy_to_host(), + dtype=arr.dtype + ) np.testing.assert_equal(arr1, arr + 1) diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py index a142e66db567..11ef01412a6f 100644 --- a/python/pyarrow/tests/test_cython.py +++ b/python/pyarrow/tests/test_cython.py @@ -89,7 +89,7 @@ def test_cython_api(tmpdir): Basic test for the Cython API. """ # Fail early if cython is not found - import cython # noqa + import cython # type: ignore[import-untyped, import-not-found] # noqa with tmpdir.as_cwd(): # Set up temporary workspace diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index d00c0c4b3eb9..ce913612bad5 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -32,7 +32,7 @@ try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa @@ -40,6 +40,7 @@ import pyarrow.csv import pyarrow.feather import pyarrow.fs as fs +from pyarrow.fs import FileInfo import pyarrow.json from pyarrow.lib import is_threading_enabled from pyarrow.tests.util import (FSProtocolClass, ProxyHandler, @@ -49,17 +50,17 @@ try: import pandas as pd except ImportError: - pd = None + pass try: import pyarrow.dataset as ds except ImportError: - ds = None + pass try: import pyarrow.parquet as pq except ImportError: - pq = None + pass # Marks all of the tests in this module # Ignore these with pytest ... -m 'not dataset' @@ -395,14 +396,16 @@ def test_filesystem_dataset(mockfs): # validation of required arguments with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset(fragments, file_format, schema) + ds.FileSystemDataset(fragments, file_format, schema) # type: ignore[arg-type] # validation of root_partition with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset(fragments, schema=schema, - format=file_format, root_partition=1) + ds.FileSystemDataset( + fragments, schema=schema, format=file_format, + root_partition=1) # type: ignore[arg-type] # missing required argument in from_paths with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset.from_paths(fragments, format=file_format) + ds.FileSystemDataset.from_paths( + fragments, format=file_format) # type: ignore[arg-type] def test_filesystem_dataset_no_filesystem_interaction(dataset_reader): @@ -827,7 +830,8 @@ def test_partitioning(): load_back = None with pytest.raises(ValueError, match="Expected Partitioning or PartitioningFactory"): - load_back = ds.dataset(tempdir, format='ipc', partitioning=int(0)) + load_back = ds.dataset( + tempdir, format='ipc', partitioning=int(0)) # type: ignore[arg-type] assert load_back is None @@ -859,8 +863,8 @@ def test_partitioning_pickling(pickle_module): ) def test_dataset_partitioning_format( flavor: str, - expected_defined_partition: tuple, - expected_undefined_partition: tuple, + expected_defined_partition: tuple[str], + expected_undefined_partition: tuple[str], ): partitioning_schema = pa.schema([("foo", pa.string()), ("bar", pa.string())]) @@ -1215,6 +1219,7 @@ def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() dataset = ds.dataset('/plain', filesystem=multisourcefs, format=parquet_format) + assert isinstance(dataset, ds.FileSystemDataset) 
for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) @@ -1252,7 +1257,9 @@ def test_make_fragment_with_size(s3_example_simple): assert tbl.equals(table) # true sizes -> works - sizes_true = [dataset.filesystem.get_file_info(x).size for x in dataset.files] + dataset_file_info = [dataset.filesystem.get_file_info(x) for x in dataset.files] + sizes_true = [x.size if isinstance( + x, FileInfo) else None for x in dataset_file_info] fragments_with_size = [file_format.make_fragment(path, fs, file_size=size) for path, size in zip(paths, sizes_true)] dataset_with_size = ds.FileSystemDataset( @@ -1943,6 +1950,7 @@ def test_fragments_repr(tempdir, dataset): # single-file parquet dataset (no partition information in repr) table, path = _create_single_file(tempdir) dataset = ds.dataset(path, format="parquet") + assert isinstance(dataset, ds.FileSystemDataset) fragment = list(dataset.get_fragments())[0] assert ( repr(fragment) == @@ -1954,6 +1962,7 @@ def test_fragments_repr(tempdir, dataset): path = tempdir / "data.feather" pa.feather.write_feather(table, path) dataset = ds.dataset(path, format="feather") + assert isinstance(dataset, ds.FileSystemDataset) fragment = list(dataset.get_fragments())[0] assert ( repr(fragment) == @@ -2065,7 +2074,7 @@ def test_partitioning_factory_segment_encoding(pickled, pickle_module): actual = factory.finish().to_table(columns={ "date_int": ds.field("date").cast(pa.int64()), }) - assert actual[0][0].as_py() == 1620086400 + assert actual.column(0).chunk(0)[0].as_py() == 1620086400 partitioning_factory = ds.DirectoryPartitioning.discover( ["date", "string"], segment_encoding="none") @@ -2105,7 +2114,7 @@ def test_partitioning_factory_segment_encoding(pickled, pickle_module): actual = factory.finish().to_table(columns={ "date_int": ds.field("date").cast(pa.int64()), }) - assert actual[0][0].as_py() == 1620086400 + assert actual.column(0).chunk(0)[0].as_py() == 1620086400 partitioning_factory = ds.HivePartitioning.discover( segment_encoding="none") @@ -2173,7 +2182,7 @@ def test_partitioning_factory_hive_segment_encoding_key_encoded(pickled, pickle_ actual = factory.finish().to_table(columns={ "date_int": ds.field("test'; date").cast(pa.int64()), }) - assert actual[0][0].as_py() == 1620086400 + assert actual.column(0).chunk(0)[0].as_py() == 1620086400 partitioning_factory = ds.HivePartitioning.discover( segment_encoding="uri") @@ -2231,7 +2240,7 @@ def test_dictionary_partitioning_outer_nulls_raises(tempdir): def test_positional_keywords_raises(tempdir): table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']}) with pytest.raises(TypeError): - ds.write_dataset(table, tempdir, "basename-{i}.arrow") + ds.write_dataset(table, tempdir, "basename-{i}.arrow") # type: ignore[arg-type] @pytest.mark.parquet @@ -2245,20 +2254,20 @@ def test_read_partition_keys_only(tempdir): 'key': pa.repeat(0, BATCH_SIZE + 1), 'value': np.arange(BATCH_SIZE + 1)}) pq.write_to_dataset( - table[:BATCH_SIZE], + table[:BATCH_SIZE], # type: ignore[arg-type] tempdir / 'one', partition_cols=['key']) pq.write_to_dataset( - table[:BATCH_SIZE + 1], + table[:BATCH_SIZE + 1], # type: ignore[arg-type] tempdir / 'two', partition_cols=['key']) table = pq.read_table(tempdir / 'one', columns=['key']) - assert table['key'].num_chunks == 1 + assert table.column('key').num_chunks == 1 table = pq.read_table(tempdir / 'two', columns=['key', 'value']) - assert table['key'].num_chunks == 2 + assert table.column('key').num_chunks == 2 table = pq.read_table(tempdir / 'two', 
columns=['key']) - assert table['key'].num_chunks == 2 + assert table.column('key').num_chunks == 2 def _has_subdirs(basedir): @@ -2319,9 +2328,9 @@ def test_partitioning_function(): with pytest.raises(ValueError): ds.partitioning() with pytest.raises(ValueError, match="Expected list"): - ds.partitioning(field_names=schema) + ds.partitioning(field_names=schema) # type: ignore[arg-type] with pytest.raises(ValueError, match="Cannot specify both"): - ds.partitioning(schema, field_names=schema) + ds.partitioning(schema, field_names=schema) # type: ignore[call-overload] # Hive partitioning part = ds.partitioning(schema, flavor="hive") @@ -2332,13 +2341,13 @@ def test_partitioning_function(): assert isinstance(part, ds.PartitioningFactory) # cannot pass list of names with pytest.raises(ValueError): - ds.partitioning(names, flavor="hive") + ds.partitioning(names, flavor="hive") # type: ignore[arg-type] with pytest.raises(ValueError, match="Cannot specify 'field_names'"): ds.partitioning(field_names=names, flavor="hive") # unsupported flavor with pytest.raises(ValueError): - ds.partitioning(schema, flavor="unsupported") + ds.partitioning(schema, flavor="unsupported") # type: ignore[arg-type] @pytest.mark.parquet @@ -2353,6 +2362,8 @@ def test_directory_partitioning_dictionary_key(mockfs): dataset = ds.dataset( "subdir", format="parquet", filesystem=mockfs, partitioning=part ) + assert isinstance(dataset, ds.FileSystemDataset) + assert dataset.partitioning is not None assert dataset.partitioning.schema == schema table = dataset.to_table() @@ -2373,6 +2384,8 @@ def test_hive_partitioning_dictionary_key(multisourcefs): dataset = ds.dataset( "hive", format="parquet", filesystem=multisourcefs, partitioning=part ) + assert isinstance(dataset, ds.FileSystemDataset) + assert dataset.partitioning is not None assert dataset.partitioning.schema == schema table = dataset.to_table() @@ -2380,11 +2393,13 @@ def test_hive_partitioning_dictionary_key(multisourcefs): month_dictionary = list(range(1, 13)) assert table.column('year').type.equals(schema.types[0]) for chunk in table.column('year').chunks: + assert isinstance(chunk, pa.DictionaryArray) actual = chunk.dictionary.to_pylist() actual.sort() assert actual == year_dictionary assert table.column('month').type.equals(schema.types[1]) for chunk in table.column('month').chunks: + assert isinstance(chunk, pa.DictionaryArray) actual = chunk.dictionary.to_pylist() actual.sort() assert actual == month_dictionary @@ -2574,6 +2589,8 @@ def test_construct_from_mixed_child_datasets(mockfs): 'subdir/2/yyy/file1.parquet'], filesystem=mockfs) b = ds.dataset('subdir', filesystem=mockfs) + assert isinstance(a, ds.FileSystemDataset) + assert isinstance(b, ds.FileSystemDataset) dataset = ds.dataset([a, b]) assert isinstance(dataset, ds.UnionDataset) @@ -2585,8 +2602,8 @@ def test_construct_from_mixed_child_datasets(mockfs): assert len(dataset.children) == 2 for child in dataset.children: - assert child.files == ['subdir/1/xxx/file0.parquet', - 'subdir/2/yyy/file1.parquet'] + assert child.files == [ # type: ignore[attr-defined] + 'subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet'] def test_construct_empty_dataset(): @@ -2620,7 +2637,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs): batch2 = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["b"]) with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'): - ds.dataset([child1, child2]) + ds.dataset([child1, child2]) # type: ignore[arg-type] expected = ( "Expected a list of path-like 
or dataset objects, or a list " @@ -2628,14 +2645,14 @@ def test_construct_from_invalid_sources_raise(multisourcefs): "types: int" ) with pytest.raises(TypeError, match=expected): - ds.dataset([1, 2, 3]) + ds.dataset([1, 2, 3]) # type: ignore[arg-type] expected = ( "Expected a path-like, list of path-likes or a list of Datasets " "instead of the given type: NoneType" ) with pytest.raises(TypeError, match=expected): - ds.dataset(None) + ds.dataset(None) # type: ignore[arg-type] expected = ( "Expected a path-like, list of path-likes or a list of Datasets " @@ -2662,7 +2679,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs): "batches or tables. The given list contains the following types:" ) with pytest.raises(TypeError, match=expected): - ds.dataset([batch1, 0]) + ds.dataset([batch1, 0]) # type: ignore[arg-type] expected = ( "Expected a list of tables or batches. The given list contains a int" @@ -2752,7 +2769,7 @@ def test_open_dataset_partitioned_directory(tempdir, dataset_reader, pickle_modu dataset = ds.dataset( str(path), partitioning=ds.partitioning( - pa.schema([("part", pa.int8())]), flavor="hive")) + schema=pa.schema([("part", pa.int8())]), flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) @@ -2797,7 +2814,7 @@ def test_open_union_dataset(tempdir, dataset_reader, pickle_module): _, path = _create_single_file(tempdir) dataset = ds.dataset(path) - union = ds.dataset([dataset, dataset]) + union = ds.dataset([dataset, dataset]) # type: ignore[arg-type] assert isinstance(union, ds.UnionDataset) pickled = pickle_module.loads(pickle_module.dumps(union)) @@ -2807,7 +2824,7 @@ def test_open_union_dataset(tempdir, dataset_reader, pickle_module): def test_open_union_dataset_with_additional_kwargs(multisourcefs): child = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') with pytest.raises(ValueError, match="cannot pass any additional"): - ds.dataset([child], format="parquet") + ds.dataset([child], format="parquet") # type: ignore[arg-type] def test_open_dataset_non_existing_file(): @@ -2894,7 +2911,7 @@ def expected_type(key): def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module): # https://issues.apache.org/jira/browse/ARROW-11400 table = pa.table({'part': np.repeat(['A', 'B'], 5), 'col': range(10)}) - part = ds.partitioning(table.select(['part']).schema, flavor="hive") + part = ds.partitioning(schema=table.select(['part']).schema, flavor="hive") ds.write_dataset(table, tempdir, partitioning=part, format="feather") dataset = ds.dataset( @@ -2902,7 +2919,7 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module) partitioning=ds.HivePartitioning.discover(infer_dictionary=True) ) expected = pa.table( - {'col': table['col'], 'part': table['part'].dictionary_encode()} + {'col': table.column('col'), 'part': table.column('part').dictionary_encode()} ) assert dataset.to_table().equals(expected) fragment = list(dataset.get_fragments())[0] @@ -2987,7 +3004,7 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): assert dataset.to_table().equals(table) # directly passing the fsspec-handler - fs = PyFileSystem(FSSpecHandler(fs)) + fs = PyFileSystem(FSSpecHandler(fs)) # type: ignore[abstract] dataset = ds.dataset(path, format="parquet", filesystem=fs) assert dataset.to_table().equals(table) @@ -3089,7 +3106,7 @@ def test_file_format_inspect_fsspec(tempdir): format = ds.ParquetFileFormat() # manually creating a PyFileSystem instead of 
using fs._ensure_filesystem # which would convert an fsspec local filesystem to a native one - filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) + filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) # type: ignore[abstract] schema = format.inspect(path, filesystem) assert schema.equals(table.schema) @@ -3107,11 +3124,11 @@ def test_filter_timestamp(tempdir, dataset_reader): "id": range(10)}) # write dataset partitioned on dates (as strings) - part = ds.partitioning(table.select(['dates']).schema, flavor="hive") + part = ds.partitioning(schema=table.select(['dates']).schema, flavor="hive") ds.write_dataset(table, path, partitioning=part, format="feather") # read dataset partitioned on dates (as timestamps) - part = ds.partitioning(pa.schema([("dates", pa.timestamp("s"))]), + part = ds.partitioning(schema=pa.schema([("dates", pa.timestamp("s"))]), flavor="hive") dataset = ds.dataset(path, format="feather", partitioning=part) @@ -3162,7 +3179,7 @@ def test_filter_compute_expression(tempdir, dataset_reader): filter_ = pc.is_in(ds.field('A'), pa.array(["a", "b"])) assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 3 - filter_ = pc.hour(ds.field('B')) >= 3 + filter_ = pc.hour(ds.field('B')) >= 3 # type: ignore[operator] assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 2 days = pc.days_between(ds.field('B'), ds.field("C")) @@ -3194,12 +3211,12 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): assert child1.schema != child2.schema != child3.schema - assembled = ds.dataset([child1, child2, child3]) + assembled = ds.dataset([child1, child2, child3]) # type: ignore[arg-type] assert isinstance(assembled, ds.UnionDataset) msg = 'cannot pass any additional arguments' with pytest.raises(ValueError, match=msg): - ds.dataset([child1, child2], filesystem=multisourcefs) + ds.dataset([child1, child2], filesystem=multisourcefs) # type: ignore[arg-type] expected_schema = pa.schema([ ('date', pa.date32()), @@ -3213,7 +3230,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) - assembled = ds.dataset([child1, child3]) + assembled = ds.dataset([child1, child3]) # type: ignore[arg-type] expected_schema = pa.schema([ ('date', pa.date32()), ('index', pa.int64()), @@ -3230,6 +3247,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): ('color', pa.string()), ('date', pa.date32()), ]) + # type: ignore[arg-type] assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) @@ -3238,6 +3256,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): ('color', pa.string()), ('unknown', pa.string()) # fill with nulls ]) + # type: ignore[arg-type] assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) @@ -3248,7 +3267,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): child4 = ds.dataset(path) with pytest.raises(pa.ArrowTypeError, match='Unable to merge'): - ds.dataset([child1, child4]) + ds.dataset([child1, child4]) # type: ignore[arg-type] def test_dataset_from_a_list_of_local_directories_raises(multisourcefs): @@ -3259,7 +3278,7 @@ def test_dataset_from_a_list_of_local_directories_raises(multisourcefs): def test_union_dataset_filesystem_datasets(multisourcefs): # without partitioning - dataset = ds.dataset([ + dataset = ds.dataset([ # type: 
ignore[arg-type] ds.dataset('/plain', filesystem=multisourcefs), ds.dataset('/schema', filesystem=multisourcefs), ds.dataset('/hive', filesystem=multisourcefs), @@ -3273,7 +3292,7 @@ def test_union_dataset_filesystem_datasets(multisourcefs): assert dataset.schema.equals(expected_schema) # with hive partitioning for two hive sources - dataset = ds.dataset([ + dataset = ds.dataset([ # type: ignore[arg-type] ds.dataset('/plain', filesystem=multisourcefs), ds.dataset('/schema', filesystem=multisourcefs), ds.dataset('/hive', filesystem=multisourcefs, partitioning='hive') @@ -3333,7 +3352,7 @@ def _check_dataset(schema, expected, expected_schema=None): # Specifying with differing field types schema = pa.schema([('a', 'int32'), ('b', 'float64')]) dataset = ds.dataset(str(tempdir / "data.parquet"), schema=schema) - expected = pa.table([table['a'].cast('int32'), + expected = pa.table([table['a'].cast('int32'), # type: ignore[arg-type] table['b']], names=['a', 'b']) _check_dataset(schema, expected) @@ -3834,7 +3853,7 @@ def test_parquet_dataset_factory_fsspec(tempdir): fsspec_fs = fsspec.filesystem("file") # manually creating a PyFileSystem, because passing the local fsspec # filesystem would internally be converted to native LocalFileSystem - filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) + filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) # type: ignore[abstract] dataset = ds.parquet_dataset(metadata_path, filesystem=filesystem) assert dataset.schema.equals(table.schema) assert len(dataset.files) == 4 @@ -4042,12 +4061,14 @@ def test_filter_mismatching_schema(tempdir, dataset_reader): # filtering on a column with such type mismatch should implicitly # cast the column filtered = dataset_reader.to_table(dataset, filter=ds.field("col") > 2) - assert filtered["col"].equals(table["col"].cast('int64').slice(2)) + assert filtered["col"].equals(table["col"].cast( + 'int64').slice(2)) # type: ignore[arg-type] fragment = list(dataset.get_fragments())[0] filtered = dataset_reader.to_table( fragment, filter=ds.field("col") > 2, schema=schema) - assert filtered["col"].equals(table["col"].cast('int64').slice(2)) + assert filtered["col"].equals(table["col"].cast( + 'int64').slice(2)) # type: ignore[arg-type] @pytest.mark.parquet @@ -4112,6 +4133,7 @@ def test_dataset_preserved_partitioning(tempdir): # through discovery, but without partitioning _, path = _create_single_file(tempdir) dataset = ds.dataset(path) + assert isinstance(dataset, ds.FileSystemDataset) assert isinstance(dataset.partitioning, ds.DirectoryPartitioning) # TODO(GH-34884) partitioning attribute not preserved in pickling # dataset_ = ds.dataset(path) @@ -4121,10 +4143,12 @@ def test_dataset_preserved_partitioning(tempdir): # through discovery, with hive partitioning but not specified full_table, path = _create_partitioned_dataset(tempdir) dataset = ds.dataset(path) + assert isinstance(dataset, ds.FileSystemDataset) assert isinstance(dataset.partitioning, ds.DirectoryPartitioning) # through discovery, with hive partitioning (from a partitioning factory) dataset = ds.dataset(path, partitioning="hive") + assert isinstance(dataset, ds.FileSystemDataset) part = dataset.partitioning assert part is not None assert isinstance(part, ds.HivePartitioning) @@ -4133,11 +4157,12 @@ def test_dataset_preserved_partitioning(tempdir): assert part.dictionaries[0] == pa.array([0, 1, 2], pa.int32()) # through discovery, with hive partitioning (from a partitioning object) - part = ds.partitioning(pa.schema([("part", pa.int32())]), flavor="hive") + 
part = ds.partitioning(schema=pa.schema([("part", pa.int32())]), flavor="hive") assert isinstance(part, ds.HivePartitioning) # not a factory assert len(part.dictionaries) == 1 assert all(x is None for x in part.dictionaries) dataset = ds.dataset(path, partitioning=part) + assert isinstance(dataset, ds.FileSystemDataset) part = dataset.partitioning assert isinstance(part, ds.HivePartitioning) assert part.schema == pa.schema([("part", pa.int32())]) @@ -4147,6 +4172,7 @@ def test_dataset_preserved_partitioning(tempdir): # through manual creation -> not available dataset = ds.dataset(path, partitioning="hive") + assert isinstance(dataset, ds.FileSystemDataset) dataset2 = ds.FileSystemDataset( list(dataset.get_fragments()), schema=dataset.schema, format=dataset.format, filesystem=dataset.filesystem @@ -4192,7 +4218,7 @@ def _sort_table(tab, sort_col): import pyarrow.compute as pc sorted_indices = pc.sort_indices( tab, options=pc.SortOptions([(sort_col, 'ascending')])) - return pc.take(tab, sorted_indices) + return pc.take(tab, sorted_indices) # type: ignore[arg-type] def _check_dataset_roundtrip(dataset, base_dir, expected_files, sort_col, @@ -4265,7 +4291,7 @@ def test_write_dataset_partitioned(tempdir): target / "part=b", target / "part=b" / "part-0.arrow" ] partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + schema=pa.schema([("part", pa.string())]), flavor="hive") _check_dataset_roundtrip( dataset, str(target), expected_paths, 'f1', target, partitioning=partitioning_schema) @@ -4277,7 +4303,7 @@ def test_write_dataset_partitioned(tempdir): target / "b", target / "b" / "part-0.arrow" ] partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())])) + schema=pa.schema([("part", pa.string())])) _check_dataset_roundtrip( dataset, str(target), expected_paths, 'f1', target, partitioning=partitioning_schema) @@ -4290,6 +4316,7 @@ def test_write_dataset_with_field_names(tempdir): partitioning=["b"]) load_back = ds.dataset(tempdir, format='ipc', partitioning=["b"]) + assert isinstance(load_back, ds.FileSystemDataset) files = load_back.files partitioning_dirs = { str(pathlib.Path(f).relative_to(tempdir).parent) for f in files @@ -4307,6 +4334,7 @@ def test_write_dataset_with_field_names_hive(tempdir): partitioning=["b"], partitioning_flavor="hive") load_back = ds.dataset(tempdir, format='ipc', partitioning="hive") + assert isinstance(load_back, ds.FileSystemDataset) files = load_back.files partitioning_dirs = { str(pathlib.Path(f).relative_to(tempdir).parent) for f in files @@ -4624,7 +4652,7 @@ def test_write_dataset_max_open_files(tempdir): record_batch_3, record_batch_4]) partitioning = ds.partitioning( - pa.schema([(column_names[partition_column_id], pa.string())]), + schema=pa.schema([(column_names[partition_column_id], pa.string())]), flavor="hive") data_source_1 = directory / "default" @@ -4638,7 +4666,8 @@ def test_write_dataset_max_open_files(tempdir): def _get_compare_pair(data_source, record_batch, file_format, col_id): num_of_files_generated = _get_num_of_files_generated( base_directory=data_source, file_format=file_format) - number_of_partitions = len(pa.compute.unique(record_batch[col_id])) + unique_vals = pa.compute.unique(record_batch[col_id]) + number_of_partitions = len(unique_vals) # type: ignore[arg-type] return num_of_files_generated, number_of_partitions # CASE 1: when max_open_files=default & max_open_files >= num_of_partitions @@ -4685,7 +4714,7 @@ def test_write_dataset_partitioned_dict(tempdir): target / "a", target / 
"a" / "part-0.arrow", target / "b", target / "b" / "part-0.arrow" ] - partitioning = ds.partitioning(pa.schema([ + partitioning = ds.partitioning(schema=pa.schema([ dataset.schema.field('part')]), dictionaries={'part': pa.array(['a', 'b'])}) # NB: dictionaries required here since we use partitioning to parse @@ -4704,7 +4733,7 @@ def test_write_dataset_use_threads(tempdir): dataset = ds.dataset(directory, partitioning="hive") partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + schema=pa.schema([("part", pa.string())]), flavor="hive") target1 = tempdir / 'partitioned1' paths_written = [] @@ -4744,7 +4773,7 @@ def test_write_dataset_use_threads_preserve_order(tempdir): batches = table.to_batches(max_chunksize=2) ds.write_dataset(batches, tempdir, format="parquet", use_threads=True, preserve_order=True) - seq = ds.dataset(tempdir).to_table(use_threads=False)['a'].to_numpy() + seq = ds.dataset(tempdir).to_table(use_threads=False).column('a').to_numpy() prev = -1 for item in seq: curr = int(item) @@ -4784,7 +4813,7 @@ def file_visitor(written_file): visited_sizes.append(written_file.size) partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + schema=pa.schema([("part", pa.string())]), flavor="hive") ds.write_dataset(table, base_dir, format="feather", basename_template='dat_{i}.arrow', partitioning=partitioning, file_visitor=file_visitor) @@ -4896,7 +4925,7 @@ def test_write_table_partitioned_dict(tempdir): pa.array(['a'] * 10 + ['b'] * 10).dictionary_encode(), ], names=['col', 'part']) - partitioning = ds.partitioning(table.select(["part"]).schema) + partitioning = ds.partitioning(schema=table.select(["part"]).schema) base_dir = tempdir / "dataset" ds.write_dataset( @@ -4917,8 +4946,7 @@ def test_write_table_partitioned_dict(tempdir): def test_write_dataset_parquet(tempdir): table = pa.table([ pa.array(range(20), type="uint32"), - pa.array(np.arange("2012-01-01", 20, dtype="datetime64[D]").astype( - "datetime64[ns]")), + pa.array(pd.date_range("2012-01-01", periods=20, freq='D').values.astype("datetime64[ns]")), pa.array(np.repeat(['a', 'b'], 10)) ], names=["f1", "f2", "part"]) @@ -5014,7 +5042,7 @@ def test_partition_dataset_parquet_file_visitor(tempdir): root_path = tempdir / 'partitioned' partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + schema=pa.schema([("part", pa.string())]), flavor="hive") paths_written = [] @@ -5047,11 +5075,11 @@ def test_write_dataset_arrow_schema_metadata(tempdir): # ensure we serialize ARROW schema in the parquet metadata, to have a # correct roundtrip (e.g. 
preserve non-UTC timezone) table = pa.table({"a": [pd.Timestamp("2012-01-01", tz="Europe/Brussels")]}) - assert table["a"].type.tz == "Europe/Brussels" + assert table.column("a").type.tz == "Europe/Brussels" ds.write_dataset(table, tempdir, format="parquet") result = pq.read_table(tempdir / "part-0.parquet") - assert result["a"].type.tz == "Europe/Brussels" + assert result.column("a").type.tz == "Europe/Brussels" def test_write_dataset_schema_metadata(tempdir): @@ -5092,7 +5120,7 @@ def test_write_dataset_s3(s3_example_simple): pa.array(['a'] * 10 + ['b'] * 10)], names=["f1", "f2", "part"] ) - part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + part = ds.partitioning(schema=pa.schema([("part", pa.string())]), flavor="hive") # writing with filesystem object ds.write_dataset( @@ -5171,7 +5199,7 @@ def test_write_dataset_s3_put_only(s3_server): pa.array(['a']*10 + ['b'] * 10)], names=["f1", "f2", "part"] ) - part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + part = ds.partitioning(schema=pa.schema([("part", pa.string())]), flavor="hive") # writing with filesystem object with create_dir flag set to false ds.write_dataset( @@ -5549,7 +5577,7 @@ def test_union_dataset_filter(tempdir, dstype): else: raise NotImplementedError - filtered_union_ds = ds.dataset((ds1, ds2)).filter( + filtered_union_ds = ds.dataset((ds1, ds2)).filter( # type: ignore[arg-type] (pc.field("colA") < 3) | (pc.field("colA") == 9) ) assert filtered_union_ds.to_table() == pa.table({ @@ -5571,7 +5599,7 @@ def test_union_dataset_filter(tempdir, dstype): filtered_ds2 = ds2.filter(pc.field("colA") < 10) with pytest.raises(ValueError, match="currently not supported"): - ds.dataset((filtered_ds1, filtered_ds2)) + ds.dataset((filtered_ds1, filtered_ds2)) # type: ignore[arg-type] def test_parquet_dataset_filter(tempdir): @@ -5672,8 +5700,9 @@ def test_dataset_partition_with_slash(tmpdir): assert dt_table == read_table.sort_by("exp_id") exp_meta = dt_table.column(1).to_pylist() - exp_meta = sorted(set(exp_meta)) # take unique - encoded_paths = ["exp_meta=" + quote(path, safe='') for path in exp_meta] + exp_meta = sorted(set(exp_meta), key=lambda x: ( + x is None, x)) # take unique, handle None + encoded_paths = ["exp_meta=" + quote(str(path), safe='') for path in exp_meta] file_paths = sorted(os.listdir(path)) assert encoded_paths == file_paths @@ -5756,6 +5785,7 @@ def test_write_dataset_write_page_index(tempdir): ) ds1 = ds.dataset(base_dir, format="parquet") + assert isinstance(ds1, ds.FileSystemDataset) for file in ds1.files: # Can retrieve sorting columns from metadata metadata = pq.read_metadata(file) @@ -5898,13 +5928,13 @@ def test_make_write_options_error(): "'pyarrow._dataset_parquet.ParquetFileFormat' objects " "doesn't apply to a 'int'") with pytest.raises(TypeError) as excinfo: - pa.dataset.ParquetFileFormat.make_write_options(43) + pa.dataset.ParquetFileFormat.make_write_options(43) # type: ignore assert msg_1 in str(excinfo.value) or msg_2 in str(excinfo.value) pformat = pa.dataset.ParquetFileFormat() msg = "make_write_options\\(\\) takes exactly 0 positional arguments" with pytest.raises(TypeError, match=msg): - pformat.make_write_options(43) + pformat.make_write_options(43) # type: ignore def test_scanner_from_substrait(dataset): @@ -5939,3 +5969,4 @@ def test_scanner_from_substrait(dataset): filter=ps.BoundExpressions.from_substrait(filtering) ).to_table() assert result.to_pydict() == {'str': ['4', '4']} +# Type stubs fixes applied diff --git 
a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py index 0ef3931a4cf6..3d6583523722 100644 --- a/python/pyarrow/tests/test_dataset_encryption.py +++ b/python/pyarrow/tests/test_dataset_encryption.py @@ -30,8 +30,8 @@ import pyarrow.parquet as pq import pyarrow.dataset as ds except ImportError: - pq = None - ds = None + pq = None # type: ignore[assignment] + ds = None # type: ignore[assignment] try: from pyarrow.tests.parquet.encryption import InMemoryKmsClient @@ -85,7 +85,7 @@ def create_encryption_config(footer_key=FOOTER_KEY_NAME, column_keys=COLUMN_KEYS def create_decryption_config(): - return pe.DecryptionConfiguration(cache_lifetime=300) + return pe.DecryptionConfiguration(cache_lifetime=timedelta(seconds=300)) def create_kms_connection_config(keys=KEYS): @@ -135,6 +135,8 @@ def assert_decrypts( encrypt_kms_connection_config = create_kms_connection_config(write_keys) decrypt_kms_connection_config = create_kms_connection_config(read_keys) + assert ds is not None + assert pe is not None crypto_factory = pe.CryptoFactory(kms_factory) parquet_encryption_cfg = ds.ParquetEncryptionConfig( crypto_factory, encrypt_kms_connection_config, encryption_config @@ -370,11 +372,12 @@ def test_large_row_encryption_decryption(): """Test encryption and decryption of a large number of rows.""" class NoOpKmsClient(pe.KmsClient): - def wrap_key(self, key_bytes: bytes, _: str) -> bytes: + def wrap_key(self, key_bytes: bytes, _: str) -> bytes: # type: ignore[override] b = base64.b64encode(key_bytes) return b - def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes: + def unwrap_key(self, wrapped_key: bytes, _: str # type: ignore[override] + ) -> bytes: b = base64.b64decode(wrapped_key) return b @@ -395,6 +398,9 @@ def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes: plaintext_footer=False, data_key_length_bits=128, ) + assert ds is not None + assert pe is not None + assert pq is not None pqe_config = ds.ParquetEncryptionConfig( crypto_factory, kms_config, encryption_config ) @@ -429,6 +435,9 @@ def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes: encryption_unavailable, reason="Parquet Encryption is not currently enabled" ) def test_dataset_encryption_with_selected_column_statistics(): + assert ds is not None + assert pq is not None + table = create_sample_table() encryption_config = create_encryption_config() @@ -472,7 +481,7 @@ def test_dataset_encryption_with_selected_column_statistics(): for fragment in dataset.get_fragments(): decryption_properties = crypto_factory.file_decryption_properties( - kms_connection_config, decryption_config, fragment.path, mockfs) + kms_connection_config, decryption_config, fragment.path, mockfs) # type: ignore[call-arg] with pq.ParquetFile( fragment.path, decryption_properties=decryption_properties, @@ -481,12 +490,14 @@ def test_dataset_encryption_with_selected_column_statistics(): for rg_idx in range(parquet_file.metadata.num_row_groups): row_group = parquet_file.metadata.row_group(rg_idx) - assert row_group.column(0).statistics is not None - assert row_group.column(0).statistics.min == 2019 - assert row_group.column(0).statistics.max == 2022 + stats0 = row_group.column(0).statistics + assert stats0 is not None + assert stats0.min == 2019 + assert stats0.max == 2022 - assert row_group.column(1).statistics is not None - assert row_group.column(1).statistics.min == 2 - assert row_group.column(1).statistics.max == 100 + stats1 = row_group.column(1).statistics + assert stats1 is not None + assert stats1.min == 2 
+ assert stats1.max == 100 assert row_group.column(2).statistics is None diff --git a/python/pyarrow/tests/test_device.py b/python/pyarrow/tests/test_device.py index dc1a51e6d009..00f8bbf720da 100644 --- a/python/pyarrow/tests/test_device.py +++ b/python/pyarrow/tests/test_device.py @@ -59,11 +59,15 @@ def test_copy_to(): batch_copied = batch.copy_to(dest) assert batch_copied.equals(batch) - assert batch_copied["col"].buffers()[1].device == mm.device - assert batch_copied["col"].buffers()[1].address != arr.buffers()[1].address + buffer = batch_copied.column("col").buffers()[1] + assert buffer is not None + assert buffer.device == mm.device + buffer_orig = arr.buffers()[1] + assert buffer_orig is not None + assert buffer.address != buffer_orig.address with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): - arr.copy_to(mm.device.device_type) + arr.copy_to(mm.device.device_type) # type: ignore[arg-type] with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): - batch.copy_to(mm.device.device_type) + batch.copy_to(mm.device.device_type) # type: ignore[arg-type] diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index ebac37e862b6..941e73c8167a 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -22,12 +22,13 @@ import weakref from uuid import uuid4, UUID import sys +from typing import cast import pytest try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa from pyarrow.vendored.version import Version @@ -79,12 +80,14 @@ def __init__(self): def __arrow_ext_serialize__(self): # XXX pa.BaseExtensionType should expose C++ serialization method + assert isinstance(self.storage_type, IntegerType) return self.storage_type.__arrow_ext_serialize__() @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): + assert isinstance(storage_type, IntegerType) deserialized_storage_type = storage_type.__arrow_ext_deserialize__( - serialized) + storage_type, serialized) assert deserialized_storage_type == storage_type return cls() @@ -160,7 +163,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): class MyStructType(pa.ExtensionType): - storage_type = pa.struct([('left', pa.int64()), + storage_type = pa.struct([('left', pa.int64()), # type: ignore[assignment] ('right', pa.int64())]) def __init__(self): @@ -221,7 +224,7 @@ def __arrow_ext_serialize__(self): @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): assert serialized == b'' - return cls(storage_type) + return cls(storage_type, annotation=None) def ipc_write_batch(batch): @@ -432,8 +435,8 @@ def test_ext_array_wrap_array(): arr.validate(full=True) assert isinstance(arr, pa.ChunkedArray) assert arr.type == ty - assert arr.chunk(0).storage == storage.chunk(0) - assert arr.chunk(1).storage == storage.chunk(1) + assert arr.chunk(0).storage == storage.chunk(0) # type: ignore[union-attr] + assert arr.chunk(1).storage == storage.chunk(1) # type: ignore[union-attr] # Wrong storage type storage = pa.array([b"foo", b"bar", None]) @@ -442,7 +445,7 @@ def test_ext_array_wrap_array(): # Not an array or chunked array with pytest.raises(TypeError, match="Expected array or chunked array"): - ty.wrap_array(None) + ty.wrap_array(None) # type: ignore[arg-type] def test_ext_scalar_from_array(): @@ -876,7 +879,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): def __eq__(self, other): if isinstance(other, 
pa.BaseExtensionType): return (isinstance(self, type(other)) and - self.freq == other.freq) + self.freq == other.freq) # type: ignore[attr-defined] else: return NotImplemented @@ -902,7 +905,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): storage_type, serialized).freq return PeriodTypeWithToPandasDtype(freq) - def to_pandas_dtype(self): + def to_pandas_dtype(self): # type: ignore[override] import pandas as pd return pd.PeriodDtype(freq=self.freq) @@ -1033,7 +1036,7 @@ def test_generic_ext_array_pickling(registered_period_type, pickle_module): def test_generic_ext_type_register(registered_period_type): # test that trying to register other type does not segfault with pytest.raises(TypeError): - pa.register_extension_type(pa.string()) + pa.register_extension_type(pa.string()) # type: ignore[arg-type] # register second time raises KeyError period_type = PeriodType('D') @@ -1058,11 +1061,13 @@ def test_parquet_period(tmpdir, registered_period_type): # in the serialized arrow schema meta = pq.read_metadata(filename) assert meta.schema.column(0).physical_type == "INT64" + assert meta.metadata is not None assert b"ARROW:schema" in meta.metadata import base64 decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"]) - schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema)) + schema = pa.ipc.read_schema(pa.BufferReader( + decoded_schema)) # Since the type could be reconstructed, the extension type metadata is # absent. assert schema.field("ext").metadata == {} @@ -1434,6 +1439,7 @@ def test_tensor_class_methods(np_type_str): storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], pa.list_(arrow_type, 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) + arr = cast(pa.FixedShapeTensorArray, arr) expected = np.array( [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.dtype(np_type_str) @@ -1442,7 +1448,7 @@ def test_tensor_class_methods(np_type_str): np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected) expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=np.dtype(np_type_str)) - result = arr[1:].to_numpy_ndarray() + result = arr[1:].to_numpy_ndarray() # type: ignore[union-attr] np.testing.assert_array_equal(result, expected) values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]] @@ -1452,35 +1458,43 @@ def test_tensor_class_methods(np_type_str): tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 1, 2]) result = pa.ExtensionArray.from_storage(tensor_type, storage) + result = cast(pa.FixedShapeTensorArray, result) expected = np.array( [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], dtype=np.dtype(np_type_str) ) np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) - result = flat_arr.reshape(1, 2, 3, 2) + result_reshaped = flat_arr.reshape(1, 2, 3, 2) expected = np.array( [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=np.dtype(np_type_str) ) - np.testing.assert_array_equal(result, expected) + np.testing.assert_array_equal(result_reshaped, expected) tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 2, 1]) result = pa.ExtensionArray.from_storage(tensor_type, storage) + result = cast(pa.FixedShapeTensorArray, result) expected = as_strided(flat_arr, shape=(1, 2, 3, 2), strides=(bw * 12, bw * 6, bw, bw * 3)) np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 0, 1]) - result = pa.ExtensionArray.from_storage(tensor_type, storage) + result = 
pa.ExtensionArray.from_storage( + tensor_type, storage) # type: ignore[assignment] expected = as_strided(flat_arr, shape=(1, 3, 2, 2), strides=(bw * 12, bw, bw * 6, bw * 2)) - np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) - - assert result.type.permutation == [2, 0, 1] - assert result.type.shape == [2, 2, 3] + np.testing.assert_array_equal( + result.to_numpy_ndarray(), expected) # type: ignore[union-attr] + + result_type = result.type + assert isinstance(result, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.permutation == [2, 0, 1] + assert result_type.shape == [2, 2, 3] assert result.to_tensor().shape == (1, 3, 2, 2) - assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw) + assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, + 2 * bw) @pytest.mark.numpy @@ -1508,17 +1522,23 @@ def test_tensor_array_from_numpy(np_type_str): arr = flat_arr.reshape(1, 3, 4) tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - assert tensor_array_from_numpy.type.shape == [3, 4] - assert tensor_array_from_numpy.type.permutation == [0, 1] - assert tensor_array_from_numpy.type.dim_names is None + result_type = tensor_array_from_numpy.type + assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.shape == [3, 4] + assert result_type.permutation == [0, 1] + assert result_type.dim_names is None assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) arr = as_strided(flat_arr, shape=(1, 2, 3, 2), strides=(bw * 12, bw * 6, bw, bw * 3)) tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - assert tensor_array_from_numpy.type.shape == [2, 2, 3] - assert tensor_array_from_numpy.type.permutation == [0, 2, 1] - assert tensor_array_from_numpy.type.dim_names is None + result_type = tensor_array_from_numpy.type + assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.shape == [2, 2, 3] + assert result_type.permutation == [0, 2, 1] + assert result_type.dim_names is None assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) arr = flat_arr.reshape(1, 2, 3, 2) @@ -1532,7 +1552,8 @@ def test_tensor_array_from_numpy(np_type_str): arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], dtype=np.dtype(np_type_str)) expected = arr[1:] - result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray() + result = cast(pa.FixedShapeTensorArray, pa.FixedShapeTensorArray.from_numpy_ndarray( + arr)[1:]).to_numpy_ndarray() np.testing.assert_array_equal(result, expected) arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.dtype(np_type_str)) @@ -1559,22 +1580,27 @@ def test_tensor_array_from_numpy(np_type_str): dim_names = ["a", "b"] tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray( arr, dim_names=dim_names) - assert tensor_array_from_numpy.type.value_type == arrow_type - assert tensor_array_from_numpy.type.shape == [2, 3] - assert tensor_array_from_numpy.type.dim_names == dim_names + result_type = tensor_array_from_numpy.type + assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.value_type == arrow_type + assert result_type.shape == [2, 3] + assert result_type.dim_names == dim_names with pytest.raises(ValueError, match="The 
length of dim_names"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=['only_one']) with pytest.raises(TypeError, match="dim_names must be a tuple or list"): - pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=123) + pa.FixedShapeTensorArray.from_numpy_ndarray( + arr, dim_names=123) # type: ignore[arg-type] with pytest.raises(TypeError, match="dim_names must be a tuple or list"): pa.FixedShapeTensorArray.from_numpy_ndarray( - arr, dim_names=(x for x in range(2))) + arr, dim_names=(x for x in range(2))) # type: ignore[arg-type] with pytest.raises(TypeError, match="Each element of dim_names must be a string"): - pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=[0, 1]) + pa.FixedShapeTensorArray.from_numpy_ndarray( + arr, dim_names=[0, 1]) # type: ignore[arg-type] @pytest.mark.numpy @@ -1845,14 +1871,18 @@ def test_bool8_to_numpy_conversion(): assert np.array_equal(arr_to_np, np_arr_no_nulls) # same underlying buffer - assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address + buffer = arr_no_nulls.buffers()[1] + assert buffer is not None + assert arr_to_np.ctypes.data == buffer.address # if the user requests a writable array, a copy should be performed arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False, writable=True) assert np.array_equal(arr_to_np_writable, np_arr_no_nulls) # different underlying buffer - assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address + buffer = arr_no_nulls.buffers()[1] + assert buffer is not None + assert arr_to_np_writable.ctypes.data != buffer.address @pytest.mark.numpy @@ -1867,7 +1897,9 @@ def test_bool8_from_numpy_conversion(): assert arr_from_np == canonical_bool8_arr_no_nulls # same underlying buffer - assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data + buffer = arr_from_np.buffers()[1] + assert buffer is not None + assert buffer.address == np_arr_no_nulls.ctypes.data # conversion only valid for 1-D arrays with pytest.raises( @@ -1882,7 +1914,7 @@ def test_bool8_from_numpy_conversion(): ValueError, match="Cannot convert 0-D array to bool8 array", ): - pa.Bool8Array.from_numpy(np.bool_()) + pa.Bool8Array.from_numpy(np.bool_(False)) # type: ignore[arg-type] # must use compatible storage type with pytest.raises( diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 054bf920b269..a84b343b3dd2 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -26,7 +26,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.strategies as past @@ -47,7 +47,7 @@ def datadir(base_datadir): def random_path(prefix='feather_'): - return tempfile.mktemp(prefix=prefix) + return tempfile.mktemp(prefix=prefix) # type: ignore[deprecated] @pytest.fixture(scope="module", params=[1, 2]) @@ -63,7 +63,7 @@ def compression(request): yield request.param -TEST_FILES = None +TEST_FILES: list[str] | None = None def setup_module(module): @@ -72,7 +72,7 @@ def setup_module(module): def teardown_module(module): - for path in TEST_FILES: + for path in TEST_FILES: # type: ignore[union-attr] try: os.remove(path) except os.error: @@ -95,6 +95,7 @@ def _check_pandas_roundtrip(df, expected=None, path=None, if version is None: version = 2 + assert TEST_FILES is not None TEST_FILES.append(path) write_feather(df, path, compression=compression, compression_level=compression_level, version=version) @@ -114,6 +115,7 @@ def _check_arrow_roundtrip(table, path=None, compression=None): 
if path is None: path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) write_feather(table, path, compression=compression) if not os.path.exists(path): @@ -126,10 +128,12 @@ def _check_arrow_roundtrip(table, path=None, compression=None): def _assert_error_on_write(df, exc, path=None, version=2): # check that we are raising the exception # on writing + assert version in (1, 2) if path is None: path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) def f(): @@ -149,6 +153,7 @@ def test_dataset(version): } table = pa.table(data) + assert TEST_FILES is not None TEST_FILES.extend(paths) for index, path in enumerate(paths): rows = ( @@ -156,7 +161,8 @@ def test_dataset(version): (index + 1) * (num_values[0] // num_files), ) - write_feather(table[rows[0]: rows[1]], path, version=version) + write_feather(table[rows[0]: rows[1]], path, + version=version) # type: ignore[arg-type] data = FeatherDataset(paths).read_table() assert data.equals(table) @@ -181,6 +187,7 @@ def test_read_table(version): num_values = (100, 100) path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) values = np.random.randint(0, 100, size=num_values) @@ -206,6 +213,7 @@ def test_use_threads(version): num_values = (10, 10) path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) values = np.random.randint(0, 10, size=num_values) @@ -231,6 +239,7 @@ def test_float_nulls(version): num_values = 100 path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) null_mask = np.random.randint(0, 10, size=num_values) < 3 @@ -292,6 +301,7 @@ def test_platform_numpy_integers(version): def test_integer_with_nulls(version): # pandas requires upcast to float dtype path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] @@ -330,6 +340,7 @@ def test_boolean_no_nulls(version): def test_boolean_nulls(version): # pandas requires upcast to object dtype path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) num_values = 100 @@ -348,6 +359,7 @@ def test_boolean_nulls(version): def test_buffer_bounds_error(version): # ARROW-1676 path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) for i in range(16, 256): @@ -360,6 +372,7 @@ def test_buffer_bounds_error(version): @pytest.mark.numpy def test_boolean_object_nulls(version): + assert np is not None repeats = 100 table = pa.Table.from_arrays( [np.array([False, None, True] * repeats, dtype=object)], @@ -426,7 +439,8 @@ def test_empty_strings(version): @pytest.mark.pandas def test_all_none(version): df = pd.DataFrame({'all_none': [None] * 10}) - if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype(): + if (version == 1 and pa.pandas_compat # type: ignore[attr-defined] + ._pandas_api.uses_string_dtype()): expected = df.astype("str") else: expected = df @@ -552,6 +566,7 @@ def test_read_columns(version): @pytest.mark.numpy def test_overwritten_file(version): path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) num_values = 100 @@ -585,12 +600,12 @@ def test_filelike_objects(version): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") def test_sparse_dataframe(version): - if not pa.pandas_compat._pandas_api.has_sparse: + if not pa.pandas_compat._pandas_api.has_sparse: # type: ignore[attr-defined] pytest.skip("version of pandas does not support SparseDataFrame") # GH 
#221 data = {'A': [0, 1, 2], 'B': [1, 0, 1]} - df = pd.DataFrame(data).to_sparse(fill_value=1) + df = pd.DataFrame(data).to_sparse(fill_value=1) # type: ignore[attr-defined] expected = df.to_dense() _check_pandas_roundtrip(df, expected, version=version) @@ -692,8 +707,9 @@ def test_v2_lz4_default_compression(): if not pa.Codec.is_available('lz4_frame'): pytest.skip("LZ4 compression support is not built in C++") + assert np is not None # some highly compressible data - t = pa.table([np.repeat(0, 100000)], names=['f0']) + t = pa.table([np.repeat(0, 100000)], names=['f0']) # type: ignore[arg-type] buf = io.BytesIO() write_feather(t, buf) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 9e7bb312398f..1294e681be45 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -28,19 +28,21 @@ import traceback import json from datetime import datetime +from typing import Any try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa from pyarrow.lib import IpcReadOptions, ReadStats, tobytes from pyarrow.util import find_free_port from pyarrow.tests import util +from typing import TYPE_CHECKING -try: +if TYPE_CHECKING: from pyarrow import flight from pyarrow.flight import ( FlightClient, FlightServerBase, @@ -49,13 +51,26 @@ ClientMiddleware, ClientMiddlewareFactory, FlightCallOptions, ) -except ImportError: - flight = None - FlightClient, FlightServerBase = object, object - ServerAuthHandler, ClientAuthHandler = object, object - ServerMiddleware, ServerMiddlewareFactory = object, object - ClientMiddleware, ClientMiddlewareFactory = object, object - FlightCallOptions = object +else: + try: + from pyarrow import flight + from pyarrow.flight import ( + FlightClient, FlightServerBase, + ServerAuthHandler, ClientAuthHandler, + ServerMiddleware, ServerMiddlewareFactory, + ClientMiddleware, ClientMiddlewareFactory, + FlightCallOptions, + ) + except ImportError: + flight = None # type: ignore[assignment] + FlightClient, FlightServerBase = object, object + ServerAuthHandler, ClientAuthHandler = ( # type: ignore[misc] + object, object) # type: ignore[assignment] + ServerMiddleware, ServerMiddlewareFactory = ( # type: ignore[misc] + object, object) # type: ignore[assignment] + ClientMiddleware, ClientMiddlewareFactory = ( # type: ignore[misc] + object, object) # type: ignore[assignment] + # FlightCallOptions = object # type: ignore[assignment, misc] # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not flight' @@ -196,7 +211,7 @@ def do_put(self, context, descriptor, reader, writer): assert buf is not None client_counter, = struct.unpack(' 0 key = 'arrow-datasets/nyc-taxi/year=2019/month=6/part-0.parquet' with fs.open_input_stream(key) as f: @@ -1931,6 +1940,8 @@ def test_s3_real_aws_region_selection(): # Taken from a registry of open S3-hosted datasets # at https://github.com/awslabs/open-data-registry fs, path = FileSystem.from_uri('s3://mf-nwp-models/README.txt') + from pyarrow.fs import S3FileSystem + assert isinstance(fs, S3FileSystem) assert fs.region == 'eu-west-1' with fs.open_input_stream(path) as f: assert b"Meteo-France Atmospheric models on AWS" in f.read(50) @@ -1938,6 +1949,8 @@ def test_s3_real_aws_region_selection(): # Passing an explicit region disables auto-selection fs, path = FileSystem.from_uri( 's3://mf-nwp-models/README.txt?region=us-east-2') + from pyarrow.fs import S3FileSystem + assert isinstance(fs, S3FileSystem) assert fs.region == 'us-east-2' # Reading from the wrong region may still work for public buckets... @@ -1948,6 +1961,8 @@ def test_s3_real_aws_region_selection(): with pytest.raises(IOError, match="Bucket '.*' not found"): FileSystem.from_uri('s3://x-arrow..nonexistent-bucket') fs, path = FileSystem.from_uri('s3://x-arrow-nonexistent-bucket?region=us-east-3') + from pyarrow.fs import S3FileSystem + assert isinstance(fs, S3FileSystem) assert fs.region == 'us-east-3' # allow_delayed_open has a side-effect of delaying errors until I/O is performed. @@ -2188,13 +2203,16 @@ def test_uwsgi_integration(): def test_fsspec_filesystem_from_uri(): try: - from fsspec.implementations.local import LocalFileSystem - from fsspec.implementations.memory import MemoryFileSystem + from fsspec.implementations.local import ( # type: ignore[import-untyped] + LocalFileSystem) + from fsspec.implementations.memory import ( # type: ignore[import-untyped] + MemoryFileSystem) except ImportError: pytest.skip("fsspec not installed") fs, path = FileSystem.from_uri("fsspec+memory://path/to/data.parquet") - expected_fs = PyFileSystem(FSSpecHandler(MemoryFileSystem())) + expected_fs = PyFileSystem(FSSpecHandler( + MemoryFileSystem())) # type: ignore[abstract] assert fs == expected_fs assert path == "/path/to/data.parquet" @@ -2202,7 +2220,8 @@ def test_fsspec_filesystem_from_uri(): # arrow local filesystem uri = "file:///tmp/my.file" fs, _ = FileSystem.from_uri(f"fsspec+{uri}") - expected_fs = PyFileSystem(FSSpecHandler(LocalFileSystem())) + expected_fs = PyFileSystem(FSSpecHandler( + LocalFileSystem())) # type: ignore[abstract] assert fs == expected_fs @@ -2212,7 +2231,7 @@ def test_fsspec_delete_root_dir_contents(): except ImportError: pytest.skip("fsspec not installed") - fs = FSSpecHandler(MemoryFileSystem()) + fs = FSSpecHandler(MemoryFileSystem()) # type: ignore[abstract] # Create some files and directories fs.create_dir("test_dir", recursive=True) @@ -2226,7 +2245,7 @@ def test_fsspec_delete_root_dir_contents(): # Verify files exist before deletion def get_type(path): - return fs.get_file_info([path])[0].type + return cast(list[FileInfo], fs.get_file_info([path]))[0].type assert get_type("test_file.txt") == FileType.File assert get_type("test_dir") == FileType.Directory @@ -2244,13 +2263,13 @@ def get_type(path): def test_huggingface_filesystem_from_uri(): pytest.importorskip("fsspec") try: - from huggingface_hub import HfFileSystem + from huggingface_hub import HfFileSystem # type: ignore[import-not-found] except ImportError: pytest.skip("huggingface_hub not installed") 
fs, path = FileSystem.from_uri( "hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet" ) - expected_fs = PyFileSystem(FSSpecHandler(HfFileSystem())) + expected_fs = PyFileSystem(FSSpecHandler(HfFileSystem())) # type: ignore[abstract] assert fs == expected_fs assert path == "datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet" diff --git a/python/pyarrow/tests/test_gandiva.py b/python/pyarrow/tests/test_gandiva.py index 80d119a48530..01fc6f032d5e 100644 --- a/python/pyarrow/tests/test_gandiva.py +++ b/python/pyarrow/tests/test_gandiva.py @@ -174,9 +174,12 @@ def test_in_expr_todo(): assert result.to_array().equals(pa.array([1, 2], type=pa.uint32())) # timestamp - datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877) - datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877) - datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877) + datetime_1 = datetime.datetime.fromtimestamp( + 1542238951.621877, tz=datetime.timezone.utc) + datetime_2 = datetime.datetime.fromtimestamp( + 1542238911.621877, tz=datetime.timezone.utc) + datetime_3 = datetime.datetime.fromtimestamp( + 1542238051.621877, tz=datetime.timezone.utc) arr = pa.array([datetime_1, datetime_2, datetime_3]) table = pa.Table.from_arrays([arr], ["a"]) diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 912953ae60d2..50d81b686aca 100644 --- a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -101,6 +101,8 @@ def wait_until_ready(self): Record output until the gdb prompt displays. Return recorded output. """ # TODO: add timeout? + assert self.proc is not None + assert self.proc.stdout is not None while (not self.last_stdout_line.startswith(b"(gdb) ") and self.proc.poll() is None): block = self.proc.stdout.read(4096) @@ -125,6 +127,8 @@ def wait_until_ready(self): return out def issue_command(self, line): + assert self.proc is not None + assert self.proc.stdin is not None line = line.encode('utf-8') + b"\n" if self.verbose: sys.stdout.buffer.write(line) @@ -158,6 +162,7 @@ def select_frame(self, func_name): m = re.search(pat, out) if m is None: pytest.fail(f"Could not select frame for function {func_name}") + return # Never reached, but helps type checker frame_num = int(m[1]) out = self.run_command(f"frame {frame_num}") @@ -165,6 +170,8 @@ def select_frame(self, func_name): def join(self): if self.proc is not None: + assert self.proc.stdin is not None + assert self.proc.stdout is not None self.proc.stdin.close() self.proc.stdout.close() # avoid ResourceWarning self.proc.kill() diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index a6d3546e57c6..3837b553b8be 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -24,16 +24,17 @@ import math import os import pathlib -import pytest +import pytest # type: ignore[import-not-found] import random import sys import tempfile +from typing import cast import weakref try: import numpy as np except ImportError: - np = None + pass from pyarrow.util import guid from pyarrow import Codec @@ -44,7 +45,7 @@ def check_large_seeks(file_factory, expected_error=None): if sys.platform in ('win32', 'darwin', 'emscripten'): pytest.skip("need sparse file support") try: - filename = tempfile.mktemp(prefix='test_io') + filename = tempfile.mkstemp(prefix='test_io')[1] with open(filename, 'wb') as f: f.truncate(2 ** 32 + 10) f.seek(2 ** 32 + 5) @@ -234,7 +235,7 @@ def read_buffer(self, nbytes): return memoryview(dst_buf)[:nbytes] 
duck_reader = DuckReader() - with pa.PythonFile(duck_reader, mode='r') as f: + with pa.PythonFile(duck_reader, mode='r') as f: # type: ignore[arg-type] buf = f.read_buffer(length) assert len(buf) == length assert memoryview(buf).tobytes() == dst_buf[:length] @@ -474,7 +475,7 @@ def test_buffer_to_numpy(): byte_array = bytearray(20) byte_array[0] = 42 buf = pa.py_buffer(byte_array) - array = np.frombuffer(buf, dtype="uint8") + array = np.frombuffer(buf, dtype="uint8") # type: ignore[arg-type] assert array[0] == byte_array[0] byte_array[0] += 1 assert array[0] == byte_array[0] @@ -557,7 +558,7 @@ def test_buffer_eq_bytes(): assert buf != b'some dat1' with pytest.raises(TypeError): - buf == 'some data' + _ = buf == 'some data' def test_buffer_getitem(): @@ -598,22 +599,22 @@ def test_buffer_slicing(): with pytest.raises(IndexError): buf.slice(len(buf) + 1) - assert buf[11:].to_pybytes() == b"" + assert cast(pa.Buffer, buf[11:]).to_pybytes() == b"" # Slice stop exceeds buffer length with pytest.raises(IndexError): buf.slice(1, len(buf)) - assert buf[1:11].to_pybytes() == buf.to_pybytes()[1:] + assert cast(pa.Buffer, buf[1:11]).to_pybytes() == buf.to_pybytes()[1:] # Negative length with pytest.raises(IndexError): buf.slice(1, -1) # Test slice notation - assert buf[2:].equals(buf.slice(2)) - assert buf[2:5].equals(buf.slice(2, 3)) - assert buf[-5:].equals(buf.slice(len(buf) - 5)) - assert buf[-5:-2].equals(buf.slice(len(buf) - 5, 3)) + assert cast(pa.Buffer, buf[2:]).equals(buf.slice(2)) + assert cast(pa.Buffer, buf[2:5]).equals(buf.slice(2, 3)) + assert cast(pa.Buffer, buf[-5:]).equals(buf.slice(len(buf) - 5)) + assert cast(pa.Buffer, buf[-5:-2]).equals(buf.slice(len(buf) - 5, 3)) with pytest.raises(IndexError): buf[::-1] @@ -623,7 +624,8 @@ def test_buffer_slicing(): n = len(buf) for start in range(-n * 2, n * 2): for stop in range(-n * 2, n * 2): - assert buf[start:stop].to_pybytes() == buf.to_pybytes()[start:stop] + assert cast(pa.Buffer, buf[start:stop]).to_pybytes( + ) == buf.to_pybytes()[start:stop] def test_buffer_hashing(): @@ -640,7 +642,7 @@ def test_buffer_protocol_respects_immutability(): # immutable a = b'12345' arrow_ref = pa.py_buffer(a) - numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8) + numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8) # type: ignore[arg-type] assert not numpy_ref.flags.writeable @@ -652,7 +654,8 @@ def test_foreign_buffer(): buf = pa.foreign_buffer(addr, size, obj) wr = weakref.ref(obj) del obj - assert np.frombuffer(buf, dtype=np.int32).tolist() == [1, 2] + assert (np.frombuffer(buf, dtype=np.int32).tolist() # type: ignore[arg-type] + == [1, 2]) assert wr() is not None del buf assert wr() is None @@ -688,6 +691,7 @@ def test_non_cpu_buffer(pickle_module): cuda_buf = ctx.buffer_from_data(data) arr = pa.FixedSizeBinaryArray.from_buffers(pa.binary(7), 1, [None, cuda_buf]) buf_on_gpu = arr.buffers()[1] + assert buf_on_gpu is not None assert buf_on_gpu.size == cuda_buf.size assert buf_on_gpu.address == cuda_buf.address @@ -708,7 +712,7 @@ def test_non_cpu_buffer(pickle_module): assert cuda_sliced.to_pybytes() == b'st' # Sliced buffers with same address - assert buf_on_gpu_sliced.equals(cuda_buf[2:4]) + assert cast(pa.Buffer, buf_on_gpu_sliced).equals(cuda_buf[2:4]) # Buffers on different devices msg_device = "Device on which the data resides differs between buffers" @@ -720,13 +724,14 @@ def test_non_cpu_buffer(pickle_module): arr_short = np.array([b'sting']) cuda_buf_short = ctx.buffer_from_data(arr_short) with pytest.raises(NotImplementedError, 
match=msg): - buf_on_gpu_sliced.equals(cuda_buf_short) + cast(pa.Buffer, buf_on_gpu_sliced).equals(cuda_buf_short) arr_short = pa.FixedSizeBinaryArray.from_buffers( pa.binary(5), 1, [None, cuda_buf_short] ) buf_on_gpu_short = arr_short.buffers()[1] + assert buf_on_gpu_short is not None with pytest.raises(NotImplementedError, match=msg): - buf_on_gpu_sliced.equals(buf_on_gpu_short) + cast(pa.Buffer, buf_on_gpu_sliced).equals(buf_on_gpu_short) with pytest.raises(NotImplementedError, match=msg): buf_on_gpu.hex() @@ -811,8 +816,9 @@ def test_cache_options_pickling(pickle_module): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ), "brotli", "gzip", @@ -843,6 +849,7 @@ def test_compress_decompress(compression): assert isinstance(decompressed_bytes, bytes) + assert isinstance(decompressed_buf, pa.Buffer) assert decompressed_buf.equals(test_buf) assert decompressed_bytes == test_data @@ -852,8 +859,9 @@ def test_compress_decompress(compression): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ), "brotli", "gzip", @@ -910,6 +918,7 @@ def test_compression_level(compression): assert isinstance(decompressed_bytes, bytes) + assert isinstance(decompressed_buf, pa.Buffer) assert decompressed_buf.equals(test_buf) assert decompressed_bytes == test_data @@ -951,12 +960,12 @@ def test_buffer_memoryview_is_immutable(): assert result.readonly with pytest.raises(TypeError) as exc: - result[0] = b'h' + result[0] = b'h' # type: ignore[index] assert 'cannot modify read-only' in str(exc.value) b = bytes(buf) with pytest.raises(TypeError) as exc: - b[0] = b'h' + b[0] = b'h' # type: ignore[index] assert 'cannot modify read-only' in str(exc.value) @@ -1748,9 +1757,9 @@ def test_unknown_compression_raises(): "gzip", "lz4", "zstd", - pytest.param( - "snappy", - marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("snappy", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ) ]) def test_compressed_roundtrip(compression): @@ -2021,7 +2030,7 @@ def test_input_stream_native_file(): def test_input_stream_errors(tmpdir): buf = memoryview(b"") with pytest.raises(ValueError): - pa.input_stream(buf, compression="foo") + pa.input_stream(buf, compression="foo") # type: ignore[reportArgumentType] for arg in [bytearray(), StringIO()]: with pytest.raises(TypeError): @@ -2198,7 +2207,7 @@ def check_data(data, **kwargs): def test_output_stream_errors(tmpdir): buf = memoryview(bytearray()) with pytest.raises(ValueError): - pa.output_stream(buf, compression="foo") + pa.output_stream(buf, compression="foo") # type: ignore[reportArgumentType] for arg in [bytearray(), StringIO()]: with pytest.raises(TypeError): diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 6813ed777234..93b9e7f1aa02 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -24,23 +24,27 @@ import socket import threading import weakref +from typing import TYPE_CHECKING, cast -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None + import pandas as pd + from pandas.testing 
import assert_frame_equal +else: + try: + import numpy as np + except ImportError: + pass + try: + from pandas.testing import assert_frame_equal + import pandas as pd + except ImportError: + pass import pyarrow as pa from pyarrow.tests.util import changed_environ, invoke_script -try: - from pandas.testing import assert_frame_equal - import pandas as pd -except ImportError: - pass - - class IpcFixture: write_stats = None @@ -48,6 +52,9 @@ def __init__(self, sink_factory=lambda: io.BytesIO()): self._sink_factory = sink_factory self.sink = self.get_sink() + def _get_writer(self, sink, schema): + ... # Implemented in subclasses + def get_sink(self): return self._sink_factory() @@ -59,6 +66,7 @@ def write_batches(self, num_batches=5, as_table=False): schema = pa.schema([('one', pa.float64()), ('two', pa.utf8())]) writer = self._get_writer(self.sink, schema) + assert writer is not None batches = [] for i in range(num_batches): @@ -385,7 +393,8 @@ def test_stream_write_table_batches(stream_fixture): 'one': np.random.randn(20), }) - b1 = pa.RecordBatch.from_pandas(df[:10], preserve_index=False) + b1 = pa.RecordBatch.from_pandas( + df[:10], preserve_index=False) # type: ignore[arg-type] b2 = pa.RecordBatch.from_pandas(df, preserve_index=False) table = pa.Table.from_batches([b1, b2, b1]) @@ -941,7 +950,7 @@ def test_ipc_file_stream_has_eos(): buffer = sink.getvalue() # skip the file magic - reader = pa.ipc.open_stream(buffer[8:]) + reader = pa.ipc.open_stream(cast(pa.Buffer, buffer[8:])) # will fail if encounters footer data instead of eos rdf = reader.read_pandas() @@ -980,7 +989,8 @@ def test_batches_with_custom_metadata_roundtrip(ipc_type): with file_factory(sink, batch.schema) as writer: for i in range(batch_count): - writer.write_batch(batch, custom_metadata={"batch_id": str(i)}) + writer.write_batch(batch, custom_metadata={ # type: ignore[arg-type] + "batch_id": str(i)}) # write a batch without custom metadata writer.write_batch(batch) diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index c3f9fe333bd0..c0b6b8ecd0d5 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -23,11 +23,16 @@ import json import string import unittest +from typing import TYPE_CHECKING -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None +else: + try: + import numpy as np + except ImportError: + pass + import pytest import pyarrow as pa @@ -317,6 +322,9 @@ def test_stress_block_sizes(self): class BaseTestJSONRead(BaseTestJSON): + def read_json(self, *args, **kwargs) -> pa.Table: # type: ignore[empty-body] + ... 
# Implemented in subclasses + def read_bytes(self, b, **kwargs): return self.read_json(pa.py_buffer(b), **kwargs) @@ -352,6 +360,8 @@ def test_reconcile_across_blocks(self): class BaseTestStreamingJSONRead(BaseTestJSON): + use_threads: bool = False # Set by subclasses + def open_json(self, json, *args, **kwargs): """ Reads the JSON file into memory using pyarrow's open_json diff --git a/python/pyarrow/tests/test_jvm.py b/python/pyarrow/tests/test_jvm.py index d2ba780efc7f..b5d4e74f126f 100644 --- a/python/pyarrow/tests/test_jvm.py +++ b/python/pyarrow/tests/test_jvm.py @@ -38,11 +38,13 @@ def root_allocator(): arrow_dir = os.path.join(os.path.dirname(__file__), '..', '..', '..') pom_path = os.path.join(arrow_dir, 'java', 'pom.xml') tree = ET.parse(pom_path) - version = tree.getroot().find( + version_element = tree.getroot().find( 'POM:version', namespaces={ 'POM': 'http://maven.apache.org/POM/4.0.0' - }).text + }) + assert version_element is not None + version = version_element.text jar_path = os.path.join( arrow_dir, 'java', 'tools', 'target', f'arrow-tools-{version}-jar-with-dependencies.jar') @@ -76,8 +78,8 @@ def test_jvm_buffer(root_allocator): def test_jvm_buffer_released(root_allocator): - import jpype.imports # noqa - from java.lang import IllegalArgumentException + import jpype.imports # type: ignore[import-untyped, import-not-found] # noqa + from java.lang import IllegalArgumentException # type: ignore[import-not-found] jvm_buffer = root_allocator.buffer(8) jvm_buffer.release() diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 27154a6f34f3..d0e61d758cb2 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -77,7 +77,7 @@ def fix_example_values(actual_cols, expected_cols): if not pd.isnull(v): exp = d.as_tuple().exponent factor = 10 ** -exp - converted_decimals[i] = ( + converted_decimals[i] = ( # type: ignore[call-overload,assignment] decimal.Decimal(round(v * factor)).scaleb(exp)) expected = pd.Series(converted_decimals) @@ -314,7 +314,7 @@ def test_buffer_readwrite(): # deprecated keyword order buffer_output_stream = pa.BufferOutputStream() with pytest.warns(FutureWarning): - orc.write_table(buffer_output_stream, table) + orc.write_table(buffer_output_stream, table) # type: ignore[arg-type] buffer_reader = pa.BufferReader(buffer_output_stream.getvalue()) orc_file = orc.ORCFile(buffer_reader) output_table = orc_file.read() @@ -356,8 +356,8 @@ def test_buffer_readwrite_with_writeoptions(): buffer_output_stream = pa.BufferOutputStream() with pytest.warns(FutureWarning): orc.write_table( - buffer_output_stream, - table, + buffer_output_stream, # type: ignore[reportArgumentType] + table, # type: ignore[reportArgumentType] compression='uncompressed', file_version='0.11', row_index_stride=20000, @@ -444,20 +444,20 @@ def test_buffer_readwrite_with_bad_writeoptions(): orc.write_table( table, buffer_output_stream, - compression=0, + compression=0, # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression='none', + compression='none', # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression='zlid', + compression='zlid', # type: ignore[reportArgumentType] ) # compression_block_size must be a positive integer @@ -487,20 +487,20 @@ def test_buffer_readwrite_with_bad_writeoptions(): orc.write_table( table, buffer_output_stream, - compression_strategy=0, + compression_strategy=0, # 
type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression_strategy='no', + compression_strategy='no', # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression_strategy='large', + compression_strategy='large', # type: ignore[reportArgumentType] ) # row_index_stride must be a positive integer diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index cecf10f21656..9a87c3dcbf8d 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -28,37 +28,34 @@ import hypothesis as h import hypothesis.strategies as st import pytest -try: - import numpy as np - import numpy.testing as npt - try: - _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning - except AttributeError: - from numpy.exceptions import ( - VisibleDeprecationWarning as _np_VisibleDeprecationWarning - ) -except ImportError: - np = None +import pyarrow as pa from pyarrow.pandas_compat import get_logical_type, _pandas_api from pyarrow.tests.util import invoke_script, random_ascii, rands import pyarrow.tests.strategies as past import pyarrow.tests.util as test_util from pyarrow.vendored.version import Version -import pyarrow as pa try: from pyarrow import parquet as pq except ImportError: pass -try: - import pandas as pd - import pandas.testing as tm - from .pandas_examples import dataframe_with_arrays, dataframe_with_lists -except ImportError: - pass +pd = pytest.importorskip("pandas") +np = pytest.importorskip("numpy") + +import numpy.testing as npt # noqa: E402 +import pandas.testing as tm # noqa: E402 +from .pandas_examples import dataframe_with_arrays, dataframe_with_lists # noqa: E402 +try: + _np_VisibleDeprecationWarning = ( + np.VisibleDeprecationWarning # type: ignore[attr-defined] + ) +except AttributeError: + from numpy.exceptions import ( + VisibleDeprecationWarning as _np_VisibleDeprecationWarning + ) # Marks all of the tests in this module pytestmark = pytest.mark.pandas @@ -77,14 +74,10 @@ def _alltypes_example(size=100): 'float32': np.arange(size, dtype=np.float32), 'float64': np.arange(size, dtype=np.float64), 'bool': np.random.randn(size) > 0, - 'datetime[s]': np.arange("2016-01-01T00:00:00.001", size, - dtype='datetime64[s]'), - 'datetime[ms]': np.arange("2016-01-01T00:00:00.001", size, - dtype='datetime64[ms]'), - 'datetime[us]': np.arange("2016-01-01T00:00:00.001", size, - dtype='datetime64[us]'), - 'datetime[ns]': np.arange("2016-01-01T00:00:00.001", size, - dtype='datetime64[ns]'), + 'datetime[s]': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='s').values, + 'datetime[ms]': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='ms').values, + 'datetime[us]': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='us').values, + 'datetime[ns]': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='ns').values, 'timedelta64[s]': np.arange(0, size, dtype='timedelta64[s]'), 'timedelta64[ms]': np.arange(0, size, dtype='timedelta64[ms]'), 'timedelta64[us]': np.arange(0, size, dtype='timedelta64[us]'), @@ -98,7 +91,7 @@ def _alltypes_example(size=100): def _check_pandas_roundtrip(df, expected=None, use_threads=False, expected_schema=None, check_dtype=True, schema=None, - preserve_index=False, + preserve_index: bool | None = False, as_batch=False): klass = pa.RecordBatch if as_batch else pa.Table table = klass.from_pandas(df, schema=schema, @@ -723,7 +716,7 @@ def 
test_mismatch_metadata_schema(self): # OPTION 1: casting after conversion table = pa.Table.from_pandas(df) # cast the "datetime" column to be tz-aware - new_col = table["datetime"].cast(pa.timestamp('ns', tz="UTC")) + new_col = table.column(0).cast(pa.timestamp('ns', tz="UTC")) new_table1 = table.set_column( 0, pa.field("datetime", new_col.type), new_col ) @@ -991,7 +984,7 @@ def test_float_with_null_as_integer(self): schema = pa.schema([pa.field('has_nulls', ty)]) result = pa.Table.from_pandas(df, schema=schema, preserve_index=False) - assert result[0].chunk(0).equals(expected) + assert result.column(0).chunk(0).equals(expected) def test_int_object_nulls(self): arr = np.array([None, 1, np.int64(3)] * 5, dtype=object) @@ -1153,7 +1146,7 @@ def test_python_datetime(self): }) table = pa.Table.from_pandas(df) - assert isinstance(table[0].chunk(0), pa.TimestampArray) + assert isinstance(table.column(0).chunk(0), pa.TimestampArray) result = table.to_pandas() # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units @@ -1210,7 +1203,7 @@ class MyDatetime(datetime): df = pd.DataFrame({"datetime": pd.Series(date_array, dtype=object)}) table = pa.Table.from_pandas(df) - assert isinstance(table[0].chunk(0), pa.TimestampArray) + assert isinstance(table.column(0).chunk(0), pa.TimestampArray) result = table.to_pandas() @@ -1234,7 +1227,7 @@ class MyDate(date): df = pd.DataFrame({"date": pd.Series(date_array, dtype=object)}) table = pa.Table.from_pandas(df) - assert isinstance(table[0].chunk(0), pa.Date32Array) + assert isinstance(table.column(0).chunk(0), pa.Date32Array) result = table.to_pandas() expected_df = pd.DataFrame( @@ -1746,7 +1739,7 @@ def test_bytes_to_binary(self): df = pd.DataFrame({'strings': values}) table = pa.Table.from_pandas(df) - assert table[0].type == pa.binary() + assert table.column(0).type == pa.binary() values2 = [b'qux', b'foo', None, b'barz', b'qux', None] expected = pd.DataFrame({'strings': values2}) @@ -1767,7 +1760,7 @@ def test_bytes_exceed_2gb(self): arr = None table = pa.Table.from_pandas(df) - assert table[0].num_chunks == 2 + assert table.column(0).num_chunks == 2 @pytest.mark.large_memory @pytest.mark.parametrize('char', ['x', b'x']) @@ -1909,13 +1902,13 @@ def test_table_str_to_categorical_without_na(self, string_type): zero_copy_only=True) # chunked array - result = table["strings"].to_pandas(strings_to_categorical=True) + result = table.column("strings").to_pandas(strings_to_categorical=True) expected = pd.Series(pd.Categorical(values), name="strings") tm.assert_series_equal(result, expected) with pytest.raises(pa.ArrowInvalid): - table["strings"].to_pandas(strings_to_categorical=True, - zero_copy_only=True) + table.column("strings").to_pandas(strings_to_categorical=True, + zero_copy_only=True) @pytest.mark.parametrize( "string_type", [pa.string(), pa.large_string(), pa.string_view()] @@ -1936,13 +1929,13 @@ def test_table_str_to_categorical_with_na(self, string_type): zero_copy_only=True) # chunked array - result = table["strings"].to_pandas(strings_to_categorical=True) + result = table.column("strings").to_pandas(strings_to_categorical=True) expected = pd.Series(pd.Categorical(values), name="strings") tm.assert_series_equal(result, expected) with pytest.raises(pa.ArrowInvalid): - table["strings"].to_pandas(strings_to_categorical=True, - zero_copy_only=True) + table.column("strings").to_pandas(strings_to_categorical=True, + zero_copy_only=True) # Regression test for ARROW-2101 def test_array_of_bytes_to_strings(self): @@ -2524,7 +2517,7 @@ def 
test_auto_chunking_on_list_overflow(self): table = pa.Table.from_pandas(df) table.validate(full=True) - column_a = table[0] + column_a = table.column(0) assert column_a.num_chunks == 2 assert len(column_a.chunk(0)) == 2**21 - 1 assert len(column_a.chunk(1)) == 1 @@ -3168,9 +3161,8 @@ def test_strided_data_import(self): boolean_objects[5] = None cases.append(boolean_objects) - cases.append(np.arange("2016-01-01T00:00:00.001", N * K, - dtype='datetime64[ms]') - .reshape(N, K).copy()) + cases.append(pd.date_range("2016-01-01T00:00:00.001", periods=N * K, freq='ms') + .values.reshape(N, K).copy()) strided_mask = (random_numbers > 0).astype(bool)[:, 0] @@ -3776,8 +3768,8 @@ def test_recordbatchlist_to_pandas(): def test_recordbatch_table_pass_name_to_pandas(): rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0']) t = pa.table([pa.array([1, 2, 3, 4])], names=['a0']) - assert rb[0].to_pandas().name == 'a0' - assert t[0].to_pandas().name == 'a0' + assert rb.column(0).to_pandas().name == 'a0' + assert t.column(0).to_pandas().name == 'a0' # ---------------------------------------------------------------------- @@ -4331,13 +4323,13 @@ def test_array_protocol(): # default conversion result = pa.table(df) expected = pa.array([1, 2, None], pa.int64()) - assert result[0].chunk(0).equals(expected) + assert result.column(0).chunk(0).equals(expected) # with specifying schema schema = pa.schema([('a', pa.float64())]) result = pa.table(df, schema=schema) expected2 = pa.array([1, 2, None], pa.float64()) - assert result[0].chunk(0).equals(expected2) + assert result.column(0).chunk(0).equals(expected2) # pass Series to pa.array result = pa.array(df['a']) @@ -4467,7 +4459,7 @@ def __init__(self): def __arrow_ext_serialize__(self): return b'' - def to_pandas_dtype(self): + def to_pandas_dtype(self): # type: ignore[override] return pd.Int64Dtype() @@ -4567,7 +4559,7 @@ def test_array_to_pandas(): expected = pd.Series(arr) tm.assert_series_equal(result, expected) - result = pa.table({"col": arr})["col"].to_pandas() + result = pa.table({"col": arr}).column("col").to_pandas() expected = pd.Series(arr, name="col") tm.assert_series_equal(result, expected) @@ -4626,7 +4618,6 @@ def test_array_to_pandas_types_mapper(): assert result.dtype == np.dtype("int64") -@pytest.mark.pandas def test_chunked_array_to_pandas_types_mapper(): # https://issues.apache.org/jira/browse/ARROW-9664 if Version(pd.__version__) < Version("1.2.0"): @@ -5117,7 +5108,7 @@ def test_roundtrip_nested_map_array_with_pydicts_sliced(): ty = pa.list_(pa.map_(pa.string(), pa.list_(pa.string()))) - def assert_roundtrip(series: pd.Series, data) -> None: + def assert_roundtrip(series, data): array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series, type=ty)) array_roundtrip.validate(full=True) assert data.equals(array_roundtrip) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 65f0c6081363..20a33a382e41 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -20,11 +20,12 @@ import pytest import weakref from collections.abc import Sequence, Mapping +from typing import cast try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.compute as pc @@ -68,7 +69,7 @@ pa.Time32Scalar), (datetime.datetime.now().time(), None, pa.Time64Scalar), (datetime.timedelta(days=1), None, pa.DurationScalar), - (pa.MonthDayNano([1, -1, -10100]), None, + (pa.MonthDayNano([1, -1, -10100]), None, # type: ignore[call-arg, arg-type] 
pa.MonthDayNanoIntervalScalar), ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar), ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar), @@ -360,7 +361,8 @@ def test_time_from_datetime_time(): def test_temporal_values(value, time_type: pa.DataType): time_scalar = pa.scalar(value, type=time_type) time_scalar.validate(full=True) - assert time_scalar.value == value + assert (time_scalar.value # type: ignore[union-attr, reportAttributeAccessIssue] + == value) def test_cast(): @@ -422,7 +424,9 @@ def test_timestamp(): expected = pd.Timestamp('2000-01-01 12:34:56') assert arrow_arr[0].as_py() == expected - assert arrow_arr[0].value * 1000**i == expected.value + value = cast(pa.TimestampScalar, arrow_arr[0]).value + assert value is not None + assert value * 1000**i == expected.value tz = 'America/New_York' arrow_type = pa.timestamp(unit, tz=tz) @@ -434,7 +438,9 @@ def test_timestamp(): .tz_convert(tz)) assert arrow_arr[0].as_py() == expected - assert arrow_arr[0].value * 1000**i == expected.value + value = cast(pa.TimestampScalar, arrow_arr[0]).value + assert value is not None + assert value * 1000**i == expected.value @pytest.mark.nopandas @@ -529,7 +535,7 @@ def test_duration_nanos_nopandas(): def test_month_day_nano_interval(): - triple = pa.MonthDayNano([-3600, 1800, -50]) + triple = pa.MonthDayNano([-3600, 1800, -50]) # type: ignore[invalid-argument-type] arr = pa.array([triple]) assert isinstance(arr[0].as_py(), pa.MonthDayNano) assert arr[0].as_py() == triple @@ -577,7 +583,7 @@ def test_binary(value, ty, scalar_typ): with pytest.raises(ValueError): memoryview(s) else: - assert buf.to_pybytes() == value + assert buf.to_pybytes() == value # type: ignore[union-attr] assert isinstance(buf, pa.Buffer) assert bytes(s) == value @@ -852,7 +858,7 @@ def test_dictionary(pickle_module): assert arr.to_pylist() == expected for j, (i, v) in enumerate(zip(indices, expected)): - s = arr[j] + s = cast(pa.DictionaryScalar, arr[j]) assert s.as_py() == v assert s.value.as_py() == v @@ -868,14 +874,14 @@ def test_run_end_encoded(): values = [1, 2, 1, None, 3] arr = pa.RunEndEncodedArray.from_arrays(run_ends, values) - scalar = arr[0] + scalar = cast(pa.RunEndEncodedScalar, arr[0]) assert isinstance(scalar, pa.RunEndEncodedScalar) assert isinstance(scalar.value, pa.Int64Scalar) assert scalar.value == pa.array(values)[0] assert scalar.as_py() == 1 # null -> .value is still a scalar, as_py returns None - scalar = arr[10] + scalar = cast(pa.RunEndEncodedScalar, arr[10]) assert isinstance(scalar.value, pa.Int64Scalar) assert scalar.as_py() is None @@ -901,13 +907,13 @@ def test_union(pickle_module): with pytest.raises(pa.ArrowNotImplementedError): pickle_module.loads(pickle_module.dumps(s)) - assert arr[0].type_code == 0 + assert cast(pa.UnionScalar, arr[0]).type_code == 0 assert arr[0].as_py() == "a" - assert arr[1].type_code == 0 + assert cast(pa.UnionScalar, arr[1]).type_code == 0 assert arr[1].as_py() == "b" - assert arr[2].type_code == 1 + assert cast(pa.UnionScalar, arr[2]).type_code == 1 assert arr[2].as_py() == 3 - assert arr[3].type_code == 1 + assert cast(pa.UnionScalar, arr[3]).type_code == 1 assert arr[3].as_py() == 4 # dense @@ -927,9 +933,9 @@ def test_union(pickle_module): with pytest.raises(pa.ArrowNotImplementedError): pickle_module.loads(pickle_module.dumps(s)) - assert arr[0].type_code == 0 + assert cast(pa.UnionScalar, arr[0]).type_code == 0 assert arr[0].as_py() == b'a' - assert arr[5].type_code == 1 + assert cast(pa.UnionScalar, arr[5]).type_code == 1 assert arr[5].as_py() == 3 
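Note on the scalar-test hunks above: they all apply the same narrowing idiom — indexing results are either checked with an isinstance assert or wrapped in typing.cast(pa.UnionScalar / pa.DictionaryScalar / ...) before subclass-only attributes such as .value or .type_code are accessed, so both mypy and pyright accept the access. A minimal, illustrative sketch of that idiom (standalone code, not part of the patch; it assumes only public pyarrow names such as pa.Int64Scalar):

    from typing import cast

    import pyarrow as pa

    arr = pa.array([1, 2, None], type=pa.int64())

    # An isinstance assert is a runtime check that also narrows the static
    # type, so subclass attributes can be used afterwards.
    first = arr[0]
    assert isinstance(first, pa.Int64Scalar)
    print(first.as_py())  # 1

    # typing.cast narrows without a runtime check, for places where the
    # concrete scalar type is already guaranteed by construction.
    second = cast(pa.Int64Scalar, arr[1])
    print(second.as_py())  # 2

The isinstance form is preferred in the hunks where the check doubles as a test assertion; cast is used where a runtime check would add nothing.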
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 029e14ca1628..5a7b9989358d 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.util as test_util @@ -259,7 +259,7 @@ def test_schema(): child 0, item: int8""" with pytest.raises(TypeError): - pa.schema([None]) + pa.schema([None]) # type: ignore[list-item] def test_schema_weakref(): @@ -548,7 +548,7 @@ def test_schema_equals_invalid_type(): for val in [None, 'string', pa.array([1, 2])]: with pytest.raises(TypeError): - schema.equals(val) + schema.equals(val) # type: ignore[invalid-argument-type] def test_schema_equality_operators(): @@ -594,7 +594,7 @@ def test_schema_get_fields(): with pytest.raises(KeyError): schema.field('other') with pytest.raises(TypeError): - schema.field(0.0) + schema.field(0.0) # type: ignore[arg-type] with pytest.raises(IndexError): schema.field(4) @@ -706,6 +706,7 @@ def test_empty_table(): assert table.schema == schema +@pytest.mark.numpy @pytest.mark.pandas def test_schema_from_pandas(): import pandas as pd @@ -782,7 +783,7 @@ def test_schema_merge(): # raise proper error when passing a non-Schema value with pytest.raises(TypeError): - pa.unify_schemas([a, 1]) + pa.unify_schemas([a, 1]) # type: ignore[list-item] def test_undecodable_metadata(): diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index eca8090d77a9..2ce48b651b14 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -26,15 +26,16 @@ import pyarrow as pa try: - from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix + from scipy.sparse import ( # type: ignore[reportMissingModuleSource] + csr_array, coo_array, csr_matrix, coo_matrix) except ImportError: - coo_matrix = None - csr_matrix = None - csr_array = None - coo_array = None + coo_matrix = None # type: ignore[assignment, misc] + csr_matrix = None # type: ignore[assignment, misc] + csr_array = None # type: ignore[assignment, misc] + coo_array = None # type: ignore[assignment, misc] try: - import sparse + import sparse # type: ignore[import-untyped, import-not-found] except ImportError: sparse = None @@ -401,7 +402,7 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type): assert np.array_equal(array, result_array) -@pytest.mark.skipif(not coo_matrix, reason="requires scipy") +@pytest.mark.skipif(coo_matrix is None, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, @@ -443,7 +444,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, assert out_scipy_matrix.has_canonical_format -@pytest.mark.skipif(not csr_matrix, reason="requires scipy") +@pytest.mark.skipif(csr_matrix is None, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type, @@ -483,7 +484,8 @@ def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type): shape = (4, 6) dim_names = ("x", "y") - sparse_array = sparse.COO(data=data, coords=coords, shape=shape) + sparse_array = sparse.COO( # type: ignore[reportOptionalMemberAccess] + 
data=data, coords=coords, shape=shape) sparse_tensor = pa.SparseCOOTensor.from_pydata_sparse(sparse_array, dim_names=dim_names) out_sparse_array = sparse_tensor.to_pydata_sparse() diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py index babb839b534e..9505b9a11b04 100644 --- a/python/pyarrow/tests/test_strategies.py +++ b/python/pyarrow/tests/test_strategies.py @@ -25,7 +25,7 @@ @h.given(past.all_types) def test_types(ty): - assert isinstance(ty, pa.lib.DataType) + assert isinstance(ty, pa.DataType) @h.given(past.all_fields) @@ -41,7 +41,7 @@ def test_schemas(schema): @pytest.mark.numpy @h.given(past.all_arrays) def test_arrays(array): - assert isinstance(array, pa.lib.Array) + assert isinstance(array, pa.Array) @pytest.mark.numpy diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index fcd1c8d48c5f..9ad65f0738d9 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -25,13 +25,10 @@ from pyarrow.lib import tobytes from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError -try: - import pyarrow.substrait as substrait -except ImportError: - substrait = None - # Marks all of the tests in this module # Ignore these with pytest ... -m 'not substrait' +substrait = pytest.importorskip('pyarrow.substrait') +_substrait = pytest.importorskip('pyarrow._substrait') pytestmark = pytest.mark.substrait @@ -85,7 +82,7 @@ def test_run_serialized_query(tmpdir, use_threads): query = tobytes(substrait_query.replace( "FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri())) - buf = pa._substrait._parse_json_plan(query) + buf = _substrait._parse_json_plan(query) reader = substrait.run_query(buf, use_threads=use_threads) res_tb = reader.read_all() @@ -116,7 +113,7 @@ def test_invalid_plan(): ] } """ - buf = pa._substrait._parse_json_plan(tobytes(query)) + buf = _substrait._parse_json_plan(tobytes(query)) exec_message = "Plan has no relations" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf) @@ -162,7 +159,7 @@ def test_binary_conversion_with_json_options(tmpdir, use_threads): path = _write_dummy_data_to_disk(tmpdir, file_name, table) query = tobytes(substrait_query.replace( "FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri())) - buf = pa._substrait._parse_json_plan(tobytes(query)) + buf = _substrait._parse_json_plan(tobytes(query)) reader = substrait.run_query(buf, use_threads=use_threads) res_tb = reader.read_all() @@ -181,7 +178,7 @@ def has_function(fns, ext_file, fn_name): def test_get_supported_functions(): - supported_functions = pa._substrait.get_supported_functions() + supported_functions = _substrait.get_supported_functions() # It probably doesn't make sense to exhaustively verify this list but # we can check a sample aggregate and a sample non-aggregate entry assert has_function(supported_functions, @@ -232,8 +229,8 @@ def table_provider(names, schema): } """ - buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) - reader = pa.substrait.run_query( + buf = _substrait._parse_json_plan(tobytes(substrait_query)) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() assert res_tb == test_table_1 @@ -275,7 +272,7 @@ def table_provider(names, _): } """ - buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) + buf = _substrait._parse_json_plan(tobytes(substrait_query)) exec_message = "Invalid NamedTable Source" with pytest.raises(ArrowInvalid, match=exec_message): 
substrait.run_query(buf, table_provider=table_provider) @@ -317,7 +314,7 @@ def table_provider(names, _): } """ query = tobytes(substrait_query) - buf = pa._substrait._parse_json_plan(tobytes(query)) + buf = _substrait._parse_json_plan(tobytes(query)) exec_message = "names for NamedTable not provided" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf, table_provider=table_provider) @@ -436,8 +433,8 @@ def table_provider(names, _): } """ - buf = pa._substrait._parse_json_plan(substrait_query) - reader = pa.substrait.run_query( + buf = _substrait._parse_json_plan(substrait_query) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -559,9 +556,9 @@ def table_provider(names, _): } """ - buf = pa._substrait._parse_json_plan(substrait_query) + buf = _substrait._parse_json_plan(substrait_query) with pytest.raises(pa.ArrowKeyError) as excinfo: - pa.substrait.run_query(buf, table_provider=table_provider) + substrait.run_query(buf, table_provider=table_provider) assert "No function registered" in str(excinfo.value) @@ -598,8 +595,8 @@ def table_provider(names, schema): } """ - buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) - reader = pa.substrait.run_query( + buf = _substrait._parse_json_plan(tobytes(substrait_query)) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -744,8 +741,8 @@ def table_provider(names, _): ], } """ - buf = pa._substrait._parse_json_plan(substrait_query) - reader = pa.substrait.run_query( + buf = _substrait._parse_json_plan(substrait_query) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=False) res_tb = reader.read_all() @@ -913,8 +910,8 @@ def table_provider(names, _): ], } """ - buf = pa._substrait._parse_json_plan(substrait_query) - reader = pa.substrait.run_query( + buf = _substrait._parse_json_plan(substrait_query) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=False) res_tb = reader.read_all() @@ -929,8 +926,8 @@ def table_provider(names, _): @pytest.mark.parametrize("expr", [ - pc.equal(pc.field("x"), 7), - pc.equal(pc.field("x"), pc.field("y")), + pc.equal(pc.field("x"), 7), # type: ignore[attr-defined] + pc.equal(pc.field("x"), pc.field("y")), # type: ignore[attr-defined] pc.field("x") > 50 ]) def test_serializing_expressions(expr): @@ -939,8 +936,8 @@ def test_serializing_expressions(expr): pa.field("y", pa.int32()) ]) - buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = substrait.serialize_expressions([expr], ["test_expr"], schema) + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 1 assert "test_expr" in returned.expressions @@ -958,8 +955,8 @@ def test_arrow_specific_types(): schema = pa.schema([pa.field(name, typ) for name, (typ, _) in fields.items()]) def check_round_trip(expr): - buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = substrait.serialize_expressions([expr], ["test_expr"], schema) + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema for name, (typ, val) in fields.items(): @@ -986,8 +983,8 @@ def test_arrow_one_way_types(): def check_one_way(field): expr = pc.is_null(pc.field(field.name)) - buf = pa.substrait.serialize_expressions([expr], 
["test_expr"], schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = substrait.serialize_expressions([expr], ["test_expr"], schema) + returned = substrait.deserialize_expressions(buf) assert alt_schema == returned.schema for field in schema: @@ -1003,14 +1000,14 @@ def test_invalid_expression_ser_des(): bad_expr = pc.equal(pc.field("z"), 7) # Invalid number of names with pytest.raises(ValueError) as excinfo: - pa.substrait.serialize_expressions([expr], [], schema) + substrait.serialize_expressions([expr], [], schema) assert 'need to have the same length' in str(excinfo.value) with pytest.raises(ValueError) as excinfo: - pa.substrait.serialize_expressions([expr], ["foo", "bar"], schema) + substrait.serialize_expressions([expr], ["foo", "bar"], schema) assert 'need to have the same length' in str(excinfo.value) # Expression doesn't match schema with pytest.raises(ValueError) as excinfo: - pa.substrait.serialize_expressions([bad_expr], ["expr"], schema) + substrait.serialize_expressions([bad_expr], ["expr"], schema) assert 'No match for FieldRef' in str(excinfo.value) @@ -1020,8 +1017,8 @@ def test_serializing_multiple_expressions(): pa.field("y", pa.int32()) ]) exprs = [pc.equal(pc.field("x"), 7), pc.equal(pc.field("x"), pc.field("y"))] - buf = pa.substrait.serialize_expressions(exprs, ["first", "second"], schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = substrait.serialize_expressions(exprs, ["first", "second"], schema) + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 2 @@ -1037,8 +1034,8 @@ def test_serializing_with_compute(): ]) expr = pc.equal(pc.field("x"), 7) expr_norm = pc.equal(pc.field(0), 7) - buf = expr.to_substrait(schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = expr.to_substrait(schema) # type: ignore[union-attr] + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 1 @@ -1046,13 +1043,13 @@ def test_serializing_with_compute(): assert str(returned.expressions["expression"]) == str(expr_norm) # Compute can't deserialize messages with multiple expressions - buf = pa.substrait.serialize_expressions([expr, expr], ["first", "second"], schema) + buf = substrait.serialize_expressions([expr, expr], ["first", "second"], schema) with pytest.raises(ValueError) as excinfo: pc.Expression.from_substrait(buf) assert 'contained multiple expressions' in str(excinfo.value) # Deserialization should be possible regardless of the expression name - buf = pa.substrait.serialize_expressions([expr], ["weirdname"], schema) + buf = substrait.serialize_expressions([expr], ["weirdname"], schema) expr2 = pc.Expression.from_substrait(buf) assert str(expr2) == str(expr_norm) @@ -1069,11 +1066,11 @@ def test_serializing_udfs(): exprs = [pc.shift_left(a, b)] with pytest.raises(ArrowNotImplementedError): - pa.substrait.serialize_expressions(exprs, ["expr"], schema) + substrait.serialize_expressions(exprs, ["expr"], schema) - buf = pa.substrait.serialize_expressions( + buf = substrait.serialize_expressions( exprs, ["expr"], schema, allow_arrow_extensions=True) - returned = pa.substrait.deserialize_expressions(buf) + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 1 assert str(returned.expressions["expr"]) == str(exprs[0]) @@ -1085,19 +1082,19 @@ def test_serializing_schema(): pa.field("x", pa.int32()), pa.field("y", pa.string()) ]) - returned = 
pa.substrait.deserialize_schema(substrait_schema) + returned = substrait.deserialize_schema(substrait_schema) assert expected_schema == returned - arrow_substrait_schema = pa.substrait.serialize_schema(returned) + arrow_substrait_schema = substrait.serialize_schema(returned) assert arrow_substrait_schema.schema == substrait_schema - returned = pa.substrait.deserialize_schema(arrow_substrait_schema) + returned = substrait.deserialize_schema(arrow_substrait_schema) assert expected_schema == returned - returned = pa.substrait.deserialize_schema(arrow_substrait_schema.schema) + returned = substrait.deserialize_schema(arrow_substrait_schema.schema) assert expected_schema == returned - returned = pa.substrait.deserialize_expressions(arrow_substrait_schema.expression) + returned = substrait.deserialize_expressions(arrow_substrait_schema.expression) assert returned.schema == expected_schema @@ -1114,7 +1111,7 @@ def SerializeToString(self): b'\x1a\x19\n\x06\x12\x04\n\x02\x12\x00\x1a\x0fproject_version' b'"0\n\x0fproject_version\n\x0fproject_release' b'\x12\x0c\n\x04:\x02\x10\x01\n\x04b\x02\x10\x01') - exprs = pa.substrait.BoundExpressions.from_substrait(FakeMessage(message)) + exprs = substrait.BoundExpressions.from_substrait(FakeMessage(message)) assert len(exprs.expressions) == 2 assert 'project_release' in exprs.expressions assert 'project_version' in exprs.expressions diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b65fb7d952c8..6263afd03a59 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -18,12 +18,13 @@ from collections import OrderedDict from collections.abc import Iterable import sys +from typing import cast import weakref try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa import pyarrow.compute as pc @@ -418,7 +419,8 @@ def test_to_pandas_empty_table(): table = pa.table(df) result = table.schema.empty_table().to_pandas() assert result.shape == (0, 2) - tm.assert_frame_equal(result, df.iloc[:0]) + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) @pytest.mark.pandas @@ -486,12 +488,25 @@ def test_chunked_array_unify_dictionaries(): pa.array(["foo", "bar", None, "foo"]).dictionary_encode(), pa.array(["quux", None, "foo"]).dictionary_encode(), ]) - assert arr.chunk(0).dictionary.equals(pa.array(["foo", "bar"])) - assert arr.chunk(1).dictionary.equals(pa.array(["quux", "foo"])) + chunk_0 = arr.chunk(0) + assert isinstance(chunk_0, pa.DictionaryArray) + assert chunk_0.dictionary.equals(pa.array(["foo", "bar"])) + + chunk_1 = arr.chunk(1) + assert isinstance(chunk_1, pa.DictionaryArray) + assert chunk_1.dictionary.equals(pa.array(["quux", "foo"])) + arr = arr.unify_dictionaries() expected_dict = pa.array(["foo", "bar", "quux"]) - assert arr.chunk(0).dictionary.equals(expected_dict) - assert arr.chunk(1).dictionary.equals(expected_dict) + + chunk_0 = arr.chunk(0) + assert isinstance(chunk_0, pa.DictionaryArray) + assert chunk_0.dictionary.equals(expected_dict) + + chunk_1 = arr.chunk(1) + assert isinstance(chunk_1, pa.DictionaryArray) + assert chunk_1.dictionary.equals(expected_dict) + assert arr.to_pylist() == ["foo", "bar", None, "foo", "quux", None, "foo"] @@ -716,7 +731,7 @@ def test_recordbatch_take(): def test_recordbatch_column_sets_private_name(): # ARROW-6429 rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0']) - assert rb[0]._name == 'a0' + assert rb.column(0)._name == 'a0' def test_recordbatch_from_arrays_validate_schema(): @@ 
-798,7 +813,7 @@ def test_recordbatch_get_field(): batch.field('d') with pytest.raises(TypeError): - batch.field(None) + batch.field(None) # type: ignore[arg-type] with pytest.raises(IndexError): batch.field(4) @@ -819,7 +834,7 @@ def test_recordbatch_select_column(): batch.column('d') with pytest.raises(TypeError): - batch.column(None) + batch.column(None) # type: ignore[arg-type] with pytest.raises(IndexError): batch.column(4) @@ -933,7 +948,10 @@ def test_table_from_struct_array_chunked_array(): [[{"ints": 1}, {"floats": 1.0}]], type=pa.struct([("ints", pa.int32()), ("floats", pa.float32())]), ) - result = pa.Table.from_struct_array(chunked_struct_array) + assert isinstance(chunked_struct_array.type, pa.StructType) + # Cast to the proper type for type checker + struct_chunked_array = cast(pa.ChunkedArray, chunked_struct_array) + result = pa.Table.from_struct_array(struct_chunked_array) assert result.equals(pa.Table.from_arrays( [ pa.array([1, None], type=pa.int32()), @@ -1189,7 +1207,7 @@ def test_recordbatch_to_tensor_null(): batch.to_tensor() result = batch.to_tensor(null_to_nan=True, row_major=False) - x = np.column_stack([arr1, arr2]).astype(np.float64, order="F") + x = np.column_stack([arr1, arr2]).astype(np.float64, order="F") # type: ignore[no-matching-overload] expected = pa.Tensor.from_numpy(x) np.testing.assert_equal(result.to_numpy(), x) @@ -1223,7 +1241,7 @@ def test_recordbatch_to_tensor_null(): ) result = batch.to_tensor(null_to_nan=True, row_major=False) - x = np.column_stack([arr1, arr2]).astype(np.float32, order="F") + x = np.column_stack([arr1, arr2]).astype(np.float32, order="F") # type: ignore[no-matching-overload] expected = pa.Tensor.from_numpy(x) np.testing.assert_equal(result.to_numpy(), x) @@ -1339,7 +1357,7 @@ def test_recordbatchlist_schema_equals(): def test_table_column_sets_private_name(): # ARROW-6429 t = pa.table([pa.array([1, 2, 3, 4])], names=['a0']) - assert t[0]._name == 'a0' + assert t.column(0)._name == 'a0' def test_table_equals(): @@ -1500,7 +1518,8 @@ def test_table_from_arrays_preserves_column_metadata(): field1 = pa.field('field2', pa.int64(), nullable=False) table = pa.Table.from_arrays([arr0, arr1], schema=pa.schema([field0, field1])) - assert b"a" in table.field(0).metadata + field0_metadata = table.field(0).metadata + assert field0_metadata is not None and b"a" in field0_metadata assert table.field(1).nullable is False @@ -1565,7 +1584,7 @@ def test_table_get_field(): table.field('d') with pytest.raises(TypeError): - table.field(None) + table.field(None) # type: ignore[arg-type] with pytest.raises(IndexError): table.field(4) @@ -1586,7 +1605,7 @@ def test_table_select_column(): table.column('d') with pytest.raises(TypeError): - table.column(None) + table.column(None) # type: ignore[arg-type] with pytest.raises(IndexError): table.column(4) @@ -1879,22 +1898,41 @@ def test_table_unify_dictionaries(): table = pa.Table.from_batches([batch1, batch2]) table = table.replace_schema_metadata({b"key1": b"value1"}) - assert table.column(0).chunk(0).dictionary.equals( - pa.array(["foo", "bar"])) - assert table.column(0).chunk(1).dictionary.equals( - pa.array(["quux", "foo"])) - assert table.column(1).chunk(0).dictionary.equals( - pa.array([123, 456, 789])) - assert table.column(1).chunk(1).dictionary.equals( - pa.array([456, 789])) + chunk_0_0 = table.column(0).chunk(0) + assert isinstance(chunk_0_0, pa.DictionaryArray) + assert chunk_0_0.dictionary.equals(pa.array(["foo", "bar"])) + + chunk_0_1 = table.column(0).chunk(1) + assert isinstance(chunk_0_1, 
pa.DictionaryArray) + assert chunk_0_1.dictionary.equals(pa.array(["quux", "foo"])) + + chunk_1_0 = table.column(1).chunk(0) + assert isinstance(chunk_1_0, pa.DictionaryArray) + assert chunk_1_0.dictionary.equals(pa.array([123, 456, 789])) + + chunk_1_1 = table.column(1).chunk(1) + assert isinstance(chunk_1_1, pa.DictionaryArray) + assert chunk_1_1.dictionary.equals(pa.array([456, 789])) table = table.unify_dictionaries(pa.default_memory_pool()) expected_dict_0 = pa.array(["foo", "bar", "quux"]) expected_dict_1 = pa.array([123, 456, 789]) - assert table.column(0).chunk(0).dictionary.equals(expected_dict_0) - assert table.column(0).chunk(1).dictionary.equals(expected_dict_0) - assert table.column(1).chunk(0).dictionary.equals(expected_dict_1) - assert table.column(1).chunk(1).dictionary.equals(expected_dict_1) + + chunk_0_0 = table.column(0).chunk(0) + assert isinstance(chunk_0_0, pa.DictionaryArray) + assert chunk_0_0.dictionary.equals(expected_dict_0) + + chunk_0_1 = table.column(0).chunk(1) + assert isinstance(chunk_0_1, pa.DictionaryArray) + assert chunk_0_1.dictionary.equals(expected_dict_0) + + chunk_1_0 = table.column(1).chunk(0) + assert isinstance(chunk_1_0, pa.DictionaryArray) + assert chunk_1_0.dictionary.equals(expected_dict_1) + + chunk_1_1 = table.column(1).chunk(1) + assert isinstance(chunk_1_1, pa.DictionaryArray) + assert chunk_1_1.dictionary.equals(expected_dict_1) assert table.to_pydict() == { 'a': ["foo", "bar", None, "foo", "quux", "foo", None, "quux"], @@ -1964,13 +2002,13 @@ def test_concat_tables_invalid_option(): t = pa.Table.from_arrays([list(range(10))], names=('a',)) with pytest.raises(ValueError, match="Invalid promote_options: invalid"): - pa.concat_tables([t, t], promote_options="invalid") + pa.concat_tables([t, t], promote_options="invalid") # type: ignore[arg-type] def test_concat_tables_none_table(): # ARROW-11997 with pytest.raises(AttributeError): - pa.concat_tables([None]) + pa.concat_tables([None]) # type: ignore[arg-type] @pytest.mark.pandas @@ -2113,7 +2151,7 @@ def test_concat_batches_different_schema(): def test_concat_batches_none_batches(): # ARROW-11997 with pytest.raises(AttributeError): - pa.concat_batches([None]) + pa.concat_batches([None]) # type: ignore[arg-type] @pytest.mark.parametrize( @@ -2264,7 +2302,7 @@ def test_from_arrays_schema(data, klass): # with different and incompatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))]) with pytest.raises((NotImplementedError, TypeError)): - pa.Table.from_pydict(data, schema=schema) + pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type] # Cannot pass both schema and metadata / names with pytest.raises(ValueError): @@ -2369,7 +2407,7 @@ def test_table_from_pydict_arrow_arrays(data, klass): # with different and incompatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))]) with pytest.raises((NotImplementedError, TypeError)): - pa.Table.from_pydict(data, schema=schema) + pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type] @pytest.mark.parametrize('data, klass', [ @@ -2386,7 +2424,7 @@ def test_table_from_pydict_schema(data, klass): schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64()), ('ints', pa.int64())]) with pytest.raises(KeyError, match='ints'): - pa.Table.from_pydict(data, schema=schema) + pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type] # data has columns not present in schema -> ignored schema = pa.schema([('strs', pa.utf8())]) @@ -2590,10 +2628,10 @@ def 
test_table_factory_function_args_pandas(): def test_factory_functions_invalid_input(): with pytest.raises(TypeError, match="Expected pandas DataFrame, python"): - pa.table("invalid input") + pa.table("invalid input") # type: ignore[arg-type] with pytest.raises(TypeError, match="Expected pandas DataFrame"): - pa.record_batch("invalid input") + pa.record_batch("invalid input") # type: ignore[arg-type] def test_table_repr_to_string(): @@ -2727,8 +2765,8 @@ def test_table_function_unicode_schema(): schema = pa.schema([(col_a, pa.int32()), (col_b, pa.string())]) result = pa.table(d, schema=schema) - assert result[0].chunk(0).equals(pa.array([1, 2, 3], type='int32')) - assert result[1].chunk(0).equals(pa.array(['a', 'b', 'c'], type='string')) + assert result.column(0).chunk(0).equals(pa.array([1, 2, 3], type='int32')) + assert result.column(1).chunk(0).equals(pa.array(['a', 'b', 'c'], type='string')) def test_table_take_vanilla_functionality(): @@ -3603,7 +3641,7 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr # equals() test with pytest.raises(NotImplementedError): - cuda_chunked_array == cuda_chunked_array + cuda_chunked_array == cuda_chunked_array # type: ignore[reportUnusedExpression] # to_pandas() test with pytest.raises(NotImplementedError): @@ -3860,7 +3898,7 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # __dataframe__() test with pytest.raises(NotImplementedError): - from_dataframe(cuda_recordbatch.__dataframe__()) + from_dataframe(cuda_recordbatch.__dataframe__()) # type: ignore[misc] def verify_cuda_table(table, expected_schema): @@ -4059,7 +4097,7 @@ def test_table_non_cpu(cuda_context, cpu_table, cuda_table, # __dataframe__() test with pytest.raises(NotImplementedError): - from_dataframe(cuda_table.__dataframe__()) + from_dataframe(cuda_table.__dataframe__()) # type: ignore[misc] # __reduce__() test with pytest.raises(NotImplementedError): diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py index debb1066280c..c3726fdbbf41 100644 --- a/python/pyarrow/tests/test_tensor.py +++ b/python/pyarrow/tests/test_tensor.py @@ -213,7 +213,7 @@ def test_tensor_memoryview(): dtype = data.dtype lst = data.tolist() tensor = pa.Tensor.from_numpy(data) - m = memoryview(tensor) + m = memoryview(tensor) # type: ignore[reportArgumentType] assert m.format == expected_format assert m.shape == data.shape assert m.strides == data.strides diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 539f01724542..c224392510d1 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -24,16 +24,22 @@ import pytest import hypothesis as h import hypothesis.strategies as st -try: - import hypothesis.extra.pytz as tzst -except ImportError: - tzst = None +from typing import Any, TYPE_CHECKING import weakref -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None + import hypothesis.extra.pytz as tzst +else: + try: + import numpy as np + except ImportError: + np = None + try: + import hypothesis.extra.pytz as tzst + except ImportError: + tzst = None + import pyarrow as pa import pyarrow.types as types import pyarrow.tests.strategies as past @@ -411,7 +417,7 @@ def test_tzinfo_to_string_errors(): if tzst: timezones = tzst.timezones() else: - timezones = st.none() + timezones = st.none() # type: ignore[assignment] @h.given(timezones) @@ -465,7 +471,7 @@ class BuggyTimezone2(datetime.tzinfo): def tzname(self, dt): return None - def 
utcoffset(self, dt): + def utcoffset(self, dt): # type: ignore[override] return "one hour" class BuggyTimezone3(datetime.tzinfo): @@ -473,7 +479,7 @@ class BuggyTimezone3(datetime.tzinfo): Wrong timezone name type """ - def tzname(self, dt): + def tzname(self, dt): # type: ignore[override] return 240 def utcoffset(self, dt): @@ -732,13 +738,13 @@ def test_struct_type(): # Neither integer nor string with pytest.raises(TypeError): - ty[None] + ty[None] # type: ignore[reportArgumentType] with pytest.raises(TypeError): - ty.field(None) + ty.field(None) # type: ignore[reportArgumentType] for a, b in zip(ty, fields): - a == b + assert a == b # Construct from list of tuples ty = pa.struct([('a', pa.int64()), @@ -746,7 +752,7 @@ def test_struct_type(): ('b', pa.int32())]) assert list(ty) == fields for a, b in zip(ty, fields): - a == b + assert a == b # Construct from mapping fields = [pa.field('a', pa.int64()), @@ -755,7 +761,7 @@ def test_struct_type(): ('b', pa.int32())])) assert list(ty) == fields for a, b in zip(ty, fields): - a == b + assert a == b # Invalid args with pytest.raises(TypeError): @@ -862,7 +868,7 @@ def test_dictionary_type(): # invalid index type raises with pytest.raises(TypeError): - pa.dictionary(pa.string(), pa.int64()) + pa.dictionary(pa.string(), pa.int64()) # type: ignore[reportArgumentType] def test_dictionary_ordered_equals(): @@ -951,7 +957,7 @@ def test_run_end_encoded_type(): pa.run_end_encoded(None, pa.utf8()) with pytest.raises(ValueError): - pa.run_end_encoded(pa.int8(), pa.utf8()) + pa.run_end_encoded(pa.int8(), pa.utf8()) # type: ignore[reportArgumentType] @pytest.mark.parametrize('t,check_func', [ @@ -1084,12 +1090,12 @@ def test_timedelta_overflow(): pa.scalar(d, type=pa.duration('ns')) # microsecond resolution, not overflow - pa.scalar(d, type=pa.duration('us')).as_py() == d + assert pa.scalar(d, type=pa.duration('us')).as_py() == d # second/millisecond resolution, not overflow for d in [datetime.timedelta.min, datetime.timedelta.max]: - pa.scalar(d, type=pa.duration('ms')).as_py() == d - pa.scalar(d, type=pa.duration('s')).as_py() == d + _ = pa.scalar(d, type=pa.duration('ms')).as_py() == d + _ = pa.scalar(d, type=pa.duration('s')).as_py() == d def test_type_equality_operators(): @@ -1127,11 +1133,11 @@ def test_key_value_metadata(): assert m1 != {'a': 'A', 'b': 'C'} with pytest.raises(TypeError): - pa.KeyValueMetadata({'a': 1}) + pa.KeyValueMetadata({'a': 1}) # type: ignore[reportArgumentType] with pytest.raises(TypeError): - pa.KeyValueMetadata({1: 'a'}) + pa.KeyValueMetadata({1: 'a'}) # type: ignore[reportArgumentType] with pytest.raises(TypeError): - pa.KeyValueMetadata(a=1) + pa.KeyValueMetadata(a=1) # type: ignore[reportArgumentType] expected = [(b'a', b'A'), (b'b', b'B')] result = [(k, v) for k, v in m3.items()] @@ -1258,6 +1264,7 @@ def test_field_metadata(): assert f1.metadata is None assert f2.metadata == {} + assert f3.metadata is not None assert f3.metadata[b'bizz'] == b'bazz' @@ -1394,7 +1401,7 @@ def __arrow_c_schema__(self): return self.schema.__arrow_c_schema__() -class SchemaMapping(Mapping): +class SchemaMapping(Mapping[Any, Any]): def __init__(self, schema): self.schema = schema diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 93004a30618a..e028f1c0484b 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -21,7 +21,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa from pyarrow import compute as pc @@ -35,7 +35,7 @@ try: import 
pyarrow.dataset as ds except ImportError: - ds = None + pass def mock_udf_context(batch_length=10): @@ -381,6 +381,7 @@ def check_scalar_function(func_fixture, func = pc.get_function(name) assert func.name == name + assert batch_length is not None result = pc.call_function(name, inputs, length=batch_length) expected_output = function(mock_udf_context(batch_length), *inputs) @@ -580,8 +581,8 @@ def identity(ctx, val): } with pytest.raises(TypeError, match="DataType expected, got "): - pc.register_scalar_function(identity, func_name, - doc, in_types, out_type) + pc.register_scalar_function( + identity, func_name, doc, in_types, out_type) # type: ignore[arg-type] def test_wrong_input_type_declaration(): @@ -597,8 +598,9 @@ def identity(ctx, val): } with pytest.raises(TypeError, match="DataType expected, got "): - pc.register_scalar_function(identity, func_name, doc, - in_types, out_type) + pc.register_scalar_function( + identity, func_name, doc, in_types, # type: ignore[arg-type] + out_type) def test_scalar_udf_context(unary_func_fixture): diff --git a/python/pyarrow/tests/test_without_numpy.py b/python/pyarrow/tests/test_without_numpy.py index 55c12602ce89..c5f5671aabc8 100644 --- a/python/pyarrow/tests/test_without_numpy.py +++ b/python/pyarrow/tests/test_without_numpy.py @@ -50,6 +50,7 @@ def test_tensor_to_np(): arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] storage = pa.array(arr, pa.list_(pa.int32(), 4)) tensor_array = pa.ExtensionArray.from_storage(tensor_type, storage) + assert isinstance(tensor_array, pa.FixedShapeTensorArray) tensor = tensor_array.to_tensor() msg = "Cannot return a numpy.ndarray if NumPy is not present" diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 7e3dd4324e93..fca0fec1122a 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -171,7 +171,8 @@ def get_modified_env_with_pythonpath(): existing_pythonpath = env.get('PYTHONPATH', '') module_path = os.path.abspath( - os.path.dirname(os.path.dirname(pa.__file__))) + os.path.dirname(os.path.dirname( # type: ignore[no-matching-overload] + pa.__file__))) if existing_pythonpath: new_pythonpath = os.pathsep.join((module_path, existing_pythonpath)) @@ -336,6 +337,7 @@ def _ensure_minio_component_version(component, minimum_year): stderr=subprocess.PIPE, encoding='utf-8') as proc: if proc.wait(10) != 0: return False + assert proc.stdout is not None stdout = proc.stdout.read() pattern = component + r' version RELEASE\.(\d+)-.*' version_match = re.search(pattern, stdout) @@ -367,6 +369,8 @@ def _run_mc_command(mcdir, *args): cmd_str = ' '.join(full_args) print(f'Cmd: {cmd_str}') print(f' Return: {retval}') + assert proc.stdout is not None + assert proc.stderr is not None print(f' Stdout: {proc.stdout.read()}') print(f' Stderr: {proc.stderr.read()}') if retval != 0: diff --git a/python/pyarrow/vendored/docscrape.py b/python/pyarrow/vendored/docscrape.py index 6c4d6e01400b..47aeeed40aed 100644 --- a/python/pyarrow/vendored/docscrape.py +++ b/python/pyarrow/vendored/docscrape.py @@ -18,7 +18,7 @@ import sys -def strip_blank_lines(l): +def strip_blank_lines(l): # noqa: E741 "Remove leading and trailing blank lines from a list of lines" while l and not l[0].strip(): del l[0] @@ -62,7 +62,7 @@ def read(self): return '' def seek_next_non_empty_line(self): - for l in self[self._l:]: + for l in self[self._l:]: # noqa: E741 if l.strip(): break else: @@ -185,8 +185,9 @@ def _is_at_section(self): l2 = self._doc.peek(1).strip() # ---------- or ========== if len(l2) >= 3 
and (set(l2) in ({'-'}, {'='})) and len(l2) != len(l1): snip = '\n'.join(self._doc._str[:2])+'...' - self._error_location("potentially wrong underline length... \n%s \n%s in \n%s" - % (l1, l2, snip), error=False) + self._error_location( + "potentially wrong underline length... \n%s \n%s in \n%s" + % (l1, l2, snip), error=False) return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1)) def _strip(self, doc): diff --git a/python/pyproject.toml b/python/pyproject.toml index 217dba81b873..19b2186e21ee 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -99,38 +99,26 @@ version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '24.0.0a0' -# TODO: Enable type checking once stubs are merged [tool.mypy] -files = ["pyarrow-stubs"] +files = ["pyarrow", "pyarrow-stubs"] mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs" -exclude = [ - "^pyarrow/", - "^benchmarks/", - "^examples/", - "^scripts/", -] +exclude = 'pyarrow/interchange/.*|pyarrow/tests/interchange/.*|pyarrow/vendored/.*|pyarrow/tests/test_cuda*' -# TODO: Enable type checking once stubs are merged [tool.pyright] pythonPlatform = "All" pythonVersion = "3.10" -include = ["pyarrow-stubs"] -exclude = [ - "pyarrow", - "benchmarks", - "examples", - "scripts", - "build", -] +include = ["pyarrow", "pyarrow-stubs"] +exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/interchange", "pyarrow/tests/test_cuda*"] stubPath = "pyarrow-stubs" typeCheckingMode = "basic" -# TODO: Enable type checking once stubs are merged [tool.ty.src] -include = ["pyarrow-stubs"] -exclude = [ - "pyarrow", - "benchmarks", - "examples", - "scripts", -] +include = ["pyarrow", "pyarrow-stubs"] +exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/interchange", "pyarrow/tests/test_cuda*"] + +[tool.ty.environment] +root = ["pyarrow"] + +[tool.ty.rules] +unresolved-import = "ignore" +unresolved-attribute = "ignore" diff --git a/python/scripts/run_emscripten_tests.py b/python/scripts/run_emscripten_tests.py index 406dfc54e4fc..e54f0c223ab4 100644 --- a/python/scripts/run_emscripten_tests.py +++ b/python/scripts/run_emscripten_tests.py @@ -114,7 +114,7 @@ def end_headers(self): def run_server_thread(dist_dir, q): - global _SERVER_ADDRESS + global _SERVER_ADDRESS # noqa: F824 os.chdir(dist_dir) server = http.server.HTTPServer(("", 0), TemplateOverrider) q.put(server.server_address)