From 9e4cbde2649f330f8fc57f43b43361a1a11b3eba Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Wed, 26 Nov 2025 15:53:20 -0500 Subject: [PATCH 1/2] split hdfs into extra (#36773) * split hdfs into extra * CHANGES * tox * try/catch * test fixes * add to coverage tasks --- CHANGES.md | 2 +- sdks/python/apache_beam/io/hadoopfilesystem.py | 11 +++++++++-- sdks/python/apache_beam/io/hadoopfilesystem_test.py | 7 +++++++ sdks/python/setup.py | 2 +- sdks/python/tox.ini | 9 +++++---- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 68af5a342d7d..5dd07aab92bf 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -81,7 +81,7 @@ Now Beam has full support for Milvus integration including Milvus enrichment and ## Breaking Changes -* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). +* (Python) Some Python dependencies have been split out into extras. To ensure all previously installed dependencies are installed, when installing Beam you can `pip install apache-beam[gcp,interactive,yaml,redis,hadoop,tfrecord]`, though most users will not need all of these extras ([#34554](https://github.com/apache/beam/issues/34554)). 
## Deprecations diff --git a/sdks/python/apache_beam/io/hadoopfilesystem.py b/sdks/python/apache_beam/io/hadoopfilesystem.py index cf488c228a28..3287644eed8c 100644 --- a/sdks/python/apache_beam/io/hadoopfilesystem.py +++ b/sdks/python/apache_beam/io/hadoopfilesystem.py @@ -26,8 +26,6 @@ import re from typing import BinaryIO # pylint: disable=unused-import -import hdfs - from apache_beam.io import filesystemio from apache_beam.io.filesystem import BeamIOError from apache_beam.io.filesystem import CompressedFile @@ -37,6 +35,11 @@ from apache_beam.options.pipeline_options import HadoopFileSystemOptions from apache_beam.options.pipeline_options import PipelineOptions +try: + import hdfs +except ImportError: + hdfs = None + __all__ = ['HadoopFileSystem'] _HDFS_PREFIX = 'hdfs:/' @@ -108,6 +111,10 @@ def __init__(self, pipeline_options): See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`. """ super().__init__(pipeline_options) + if hdfs is None: + raise ImportError( + 'Failed to import hdfs. 
You can ensure it is ' + 'installed by installing the hadoop beam extra') logging.getLogger('hdfs.client').setLevel(logging.WARN) if pipeline_options is None: raise ValueError('pipeline_options is not set') diff --git a/sdks/python/apache_beam/io/hadoopfilesystem_test.py b/sdks/python/apache_beam/io/hadoopfilesystem_test.py index 8c21effc8823..eb0925224dd3 100644 --- a/sdks/python/apache_beam/io/hadoopfilesystem_test.py +++ b/sdks/python/apache_beam/io/hadoopfilesystem_test.py @@ -32,6 +32,11 @@ from apache_beam.options.pipeline_options import HadoopFileSystemOptions from apache_beam.options.pipeline_options import PipelineOptions +try: + import hdfs as actual_hdfs +except ImportError: + actual_hdfs = None + class FakeFile(io.BytesIO): """File object for FakeHdfs""" @@ -201,6 +206,7 @@ def checksum(self, path): @parameterized_class(('full_urls', ), [(False, ), (True, )]) +@unittest.skipIf(actual_hdfs is None, "hdfs extra not installed") class HadoopFileSystemTest(unittest.TestCase): def setUp(self): self._fake_hdfs = FakeHdfs() @@ -607,6 +613,7 @@ def test_delete_error(self): self.assertFalse(self.fs.exists(url2)) +@unittest.skipIf(actual_hdfs is None, "hdfs extra not installed") class HadoopFileSystemRuntimeValueProviderTest(unittest.TestCase): """Tests pipeline_options, in the form of a RuntimeValueProvider.runtime_options object.""" diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 289433f9ea5b..b700d7969832 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -379,7 +379,6 @@ def get_portability_package_data(): # TODO(https://github.com/grpc/grpc/issues/37710): Unpin grpc 'grpcio>=1.33.1,<2,!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<1.66.0; python_version <= "3.12"', # pylint: disable=line-too-long 'grpcio>=1.67.0; python_version >= "3.13"', - 'hdfs>=2.1.0,<3.0.0', 'httplib2>=0.8,<0.23.0', 'jsonpickle>=3.0.0,<4.0.0', # numpy can have breaking changes in minor versions. 
@@ -563,6 +562,7 @@ def get_portability_package_data(): # `--update` / `-U` flag to replace the dask release brought in # by distributed. ], + 'hadoop': ['hdfs>=2.1.0,<3.0.0'], 'yaml': [ 'docstring-parser>=0.15,<1.0', 'jinja2>=3.0,<3.2', diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index da0932728b20..431cd186c1b9 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -33,7 +33,7 @@ pip_pre = True # allow apps that support color to use it. passenv=TERM,CLOUDSDK_CONFIG,DOCKER_*,TESTCONTAINERS_*,TC_*,ALLOYDB_PASSWORD # Set [] options for pip installation of apache-beam tarball. -extras = test,dataframe,redis,tfrecord,yaml +extras = test,dataframe,hadoop,redis,tfrecord,yaml # Don't warn that these commands aren't installed. allowlist_externals = false @@ -97,8 +97,8 @@ install_command = {envbindir}/python.exe {envbindir}/pip.exe install --retries 1 list_dependencies_command = {envbindir}/python.exe {envbindir}/pip.exe freeze [testenv:py{310,311,312,313}-cloud] -; extras = test,gcp,interactive,dataframe,aws,azure,redis -extras = test,gcp,interactive,dataframe,aws,azure +; extras = test,gcp,interactive,dataframe,aws,azure +extras = test,hadoop,gcp,interactive,dataframe,aws,azure commands = python apache_beam/examples/complete/autocomplete_test.py bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" @@ -173,7 +173,7 @@ setenv = TC_SLEEP_TIME = {env:TC_SLEEP_TIME:1} # NOTE: we could add ml_test to increase the collected code coverage metrics, but it would make the suite slower. -extras = test,gcp,interactive,dataframe,aws,redis +extras = test,hadoop,gcp,interactive,dataframe,aws,redis commands = bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" "--cov-report=xml --cov=. 
--cov-append" @@ -228,6 +228,7 @@ deps = holdup==1.8.0 extras = gcp + hadoop allowlist_externals = bash echo From c2303be0d1d28fba9dae0eb8e5314dd375ed31cc Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Wed, 26 Nov 2025 15:55:27 -0500 Subject: [PATCH 2/2] Update CHANGES to mention extras changes (#36875) --- CHANGES.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 5dd07aab92bf..33cf7070a5fe 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -123,7 +123,7 @@ Now Beam has full support for Milvus integration including Milvus enrichment and - This change only affects pipelines that explicitly use the `pickle_library=dill` pipeline option. - While `dill==0.3.1.1` is still pre-installed on the official Beam SDK base images, it is no longer a direct dependency of the apache-beam Python package. This means it can be overridden by other dependencies in your environment. - If your pipeline uses `pickle_library=dill`, you must manually ensure `dill==0.3.1.1` is installed in both your submission and runtime environments. - - Submission environment: Install the dill extra in your local environment `pip install apache-beam[gcpdill]`. + - Submission environment: Install the dill extra in your local environment `pip install apache-beam[gcp,dill]`. - Runtime (worker) environment: Your action depends on how you manage your worker's environment. - If using default containers or custom containers with the official Beam base image e.g. `FROM apache/beam_python3.10_sdk:2.69.0` - Add `dill==0.3.1.1` to your worker's requirements file (e.g., requirements.txt) @@ -137,6 +137,9 @@ Now Beam has full support for Milvus integration including Milvus enrichment and * (Python) The deterministic fallback coder for complex types like NamedTuple, Enum, and dataclasses now normalizes filepaths for better determinism guarantees. This affects streaming pipelines updating from 2.68 to 2.69 that utilize this fallback coder. 
If your pipeline is affected, you may see a warning like: "Using fallback deterministic coder for type X...". To update safely sepcify the pipeline option `--update_compatibility_version=2.68.0` ([#36345](https://github.com/apache/beam/pull/36345)). * (Python) Fixed transform naming conflict when executing DataTransform on a dictionary of PColls ([#30445](https://github.com/apache/beam/issues/30445)). This may break update compatibility if you don't provide a `--transform_name_mapping`. +* (Python) Split some extras out from the core Beam package. ([#34554](https://github.com/apache/beam/issues/34554)). - If you use Enrichment with redis, Hadoop FileSystem, TFRecord, or some other packages, you may need to install some extras. - To retain identical behavior to before, instead of `pip install apache-beam`, use `pip install apache-beam[hadoop,gcp,interactive,redis,test,tfrecord]`. * Removed deprecated Hadoop versions (2.10.2 and 3.2.4) that are no longer supported for [Iceberg](https://github.com/apache/iceberg/issues/10940) from IcebergIO ([#36282](https://github.com/apache/beam/issues/36282)). * (Go) Coder construction on SDK side is more faithful to the specs from runners without stripping length-prefix. This may break streaming pipeline update as the underlying coder could be changed ([#36387](https://github.com/apache/beam/issues/36387)). * Minimum Go version for Beam Go updated to 1.25.2 ([#36461](https://github.com/apache/beam/issues/36461)).