From a2a9207b13e3b16a0a33b5e18392581fd1e292ae Mon Sep 17 00:00:00 2001
From: Danny Mccormick
Date: Mon, 10 Nov 2025 11:29:30 -0500
Subject: [PATCH 1/6] split hdfs into extra

---
 sdks/python/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdks/python/setup.py b/sdks/python/setup.py
index 074d64ae8921..e44f09505273 100644
--- a/sdks/python/setup.py
+++ b/sdks/python/setup.py
@@ -379,7 +379,6 @@ def get_portability_package_data():
         # TODO(https://github.com/grpc/grpc/issues/37710): Unpin grpc
         'grpcio>=1.33.1,<2,!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<1.66.0; python_version <= "3.12"',  # pylint: disable=line-too-long
         'grpcio>=1.67.0; python_version >= "3.13"',
-        'hdfs>=2.1.0,<3.0.0',
         'httplib2>=0.8,<0.23.0',
         'jsonpickle>=3.0.0,<4.0.0',
         # numpy can have breaking changes in minor versions.
@@ -564,6 +563,7 @@ def get_portability_package_data():
             # `--update` / `-U` flag to replace the dask release brought in
             # by distributed.
         ],
+        'hadoop': ['hdfs>=2.1.0,<3.0.0'],
         'yaml': [
             'docstring-parser>=0.15,<1.0',
             'jinja2>=3.0,<3.2',

From 99c2bbb2d7cfe2e987380cf4b0d7982c412fc183 Mon Sep 17 00:00:00 2001
From: Danny Mccormick
Date: Mon, 10 Nov 2025 11:35:10 -0500
Subject: [PATCH 2/6] CHANGES

---
 CHANGES.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index 222d4b82cb25..dbad2d3a0d67 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -78,7 +78,7 @@

 ## Breaking Changes

-* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)).
+* (Python) Some Python dependencies have been split out into extras. To ensure all previously installed dependencies are installed, when installing Beam you can `pip install apache-beam[gcp,interactive,yaml,redis,hadoop,tfrecord]`, though most users will not need all of these extras ([#34554](https://github.com/apache/beam/issues/34554)).

 ## Deprecations

From 0b5e65642bf3b1c91b718331805b7eb03511dc2a Mon Sep 17 00:00:00 2001
From: Danny Mccormick
Date: Fri, 21 Nov 2025 10:27:51 -0500
Subject: [PATCH 3/6] tox

---
 sdks/python/tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini
index 7d84ca7a2c62..9618bc7ce3d4 100644
--- a/sdks/python/tox.ini
+++ b/sdks/python/tox.ini
@@ -33,7 +33,7 @@ pip_pre = True
 # allow apps that support color to use it.
 passenv=TERM,CLOUDSDK_CONFIG,DOCKER_*,TESTCONTAINERS_*,TC_*,ALLOYDB_PASSWORD
 # Set [] options for pip installation of apache-beam tarball.
-extras = test,dataframe,tfrecord,yaml
+extras = test,dataframe,hdfs,tfrecord,yaml
 # Don't warn that these commands aren't installed.
 allowlist_externals =
   false
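For context on how the new extra is meant to be consumed: the short sketch below is not part of the patch series; it assumes Beam was installed following the CHANGES.md guidance above (for example `pip install "apache-beam[hadoop]"`) and uses only the standard library to confirm that the optional hdfs client is importable.

    # Illustrative only, not from the patches: check whether the optional hdfs
    # client that the new 'hadoop' extra pulls in is importable here.
    import importlib.util

    if importlib.util.find_spec('hdfs') is None:
      print('hdfs client not found; install apache-beam[hadoop] to use HadoopFileSystem')
    else:
      print('hdfs client found; the hadoop extra is installed')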
From 9c3500c91611107a60be8e78efda9239d130c80e Mon Sep 17 00:00:00 2001
From: Danny Mccormick
Date: Fri, 21 Nov 2025 10:49:21 -0500
Subject: [PATCH 4/6] try/catch

---
 sdks/python/apache_beam/io/hadoopfilesystem.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/sdks/python/apache_beam/io/hadoopfilesystem.py b/sdks/python/apache_beam/io/hadoopfilesystem.py
index cf488c228a28..3287644eed8c 100644
--- a/sdks/python/apache_beam/io/hadoopfilesystem.py
+++ b/sdks/python/apache_beam/io/hadoopfilesystem.py
@@ -26,8 +26,6 @@
 import re
 from typing import BinaryIO  # pylint: disable=unused-import

-import hdfs
-
 from apache_beam.io import filesystemio
 from apache_beam.io.filesystem import BeamIOError
 from apache_beam.io.filesystem import CompressedFile
@@ -37,6 +35,11 @@
 from apache_beam.options.pipeline_options import HadoopFileSystemOptions
 from apache_beam.options.pipeline_options import PipelineOptions

+try:
+  import hdfs
+except ImportError:
+  hdfs = None
+
 __all__ = ['HadoopFileSystem']

 _HDFS_PREFIX = 'hdfs:/'
@@ -108,6 +111,10 @@ def __init__(self, pipeline_options):
       See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`.
     """
     super().__init__(pipeline_options)
+    if hdfs is None:
+      raise ImportError(
+          'Failed to import hdfs. You can ensure it is '
+          'installed by installing the hadoop beam extra')
     logging.getLogger('hdfs.client').setLevel(logging.WARN)
     if pipeline_options is None:
       raise ValueError('pipeline_options is not set')

From 0d1563efac6b6793f52f952c3c9dc2947377af48 Mon Sep 17 00:00:00 2001
From: Danny Mccormick
Date: Fri, 21 Nov 2025 16:26:25 -0500
Subject: [PATCH 5/6] test fixes

---
 sdks/python/apache_beam/io/hadoopfilesystem_test.py | 7 +++++++
 sdks/python/tox.ini                                 | 1 +
 2 files changed, 8 insertions(+)

diff --git a/sdks/python/apache_beam/io/hadoopfilesystem_test.py b/sdks/python/apache_beam/io/hadoopfilesystem_test.py
index 8c21effc8823..eb0925224dd3 100644
--- a/sdks/python/apache_beam/io/hadoopfilesystem_test.py
+++ b/sdks/python/apache_beam/io/hadoopfilesystem_test.py
@@ -32,6 +32,11 @@
 from apache_beam.options.pipeline_options import HadoopFileSystemOptions
 from apache_beam.options.pipeline_options import PipelineOptions

+try:
+  import hdfs as actual_hdfs
+except ImportError:
+  actual_hdfs = None
+

 class FakeFile(io.BytesIO):
   """File object for FakeHdfs"""
@@ -201,6 +206,7 @@ def checksum(self, path):


 @parameterized_class(('full_urls', ), [(False, ), (True, )])
+@unittest.skipIf(actual_hdfs is None, "hdfs extra not installed")
 class HadoopFileSystemTest(unittest.TestCase):
   def setUp(self):
     self._fake_hdfs = FakeHdfs()
@@ -607,6 +613,7 @@ def test_delete_error(self):
     self.assertFalse(self.fs.exists(url2))


+@unittest.skipIf(actual_hdfs is None, "hdfs extra not installed")
 class HadoopFileSystemRuntimeValueProviderTest(unittest.TestCase):
   """Tests pipeline_options, in the form of a RuntimeValueProvider.runtime_options
   object."""
diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini
index 9618bc7ce3d4..4093970f7d32 100644
--- a/sdks/python/tox.ini
+++ b/sdks/python/tox.ini
@@ -228,6 +228,7 @@ deps =
   holdup==1.8.0
 extras =
   gcp
+  hdfs
 allowlist_externals =
   bash
   echo
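The test changes in PATCH 5 follow a common optional-dependency pattern: probe for the import once at module load, then skip the dependent test cases when the package is absent instead of failing at collection time. A self-contained, runnable distillation of that pattern (the test class name below is hypothetical, not Beam code):

    # Minimal sketch of the skip-if-missing pattern used in PATCH 5.
    import unittest

    try:
      import hdfs as actual_hdfs
    except ImportError:
      actual_hdfs = None


    @unittest.skipIf(actual_hdfs is None, "hdfs extra not installed")
    class NeedsHdfsTest(unittest.TestCase):
      def test_hdfs_module_is_importable(self):
        self.assertIsNotNone(actual_hdfs)


    if __name__ == '__main__':
      unittest.main()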
From 7eab4e89d0faf58213049cc956c2b5dd0a648bba Mon Sep 17 00:00:00 2001
From: Danny Mccormick
Date: Fri, 21 Nov 2025 16:52:45 -0500
Subject: [PATCH 6/6] add to coverage tasks

---
 sdks/python/tox.ini | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini
index 4093970f7d32..3fd322b41b52 100644
--- a/sdks/python/tox.ini
+++ b/sdks/python/tox.ini
@@ -98,7 +98,7 @@ list_dependencies_command = {envbindir}/python.exe {envbindir}/pip.exe freeze

 [testenv:py{310,311,312,313}-cloud]
 ; extras = test,gcp,interactive,dataframe,aws,azure
-extras = test,gcp,interactive,dataframe,aws,azure
+extras = test,hdfs,gcp,interactive,dataframe,aws,azure
 commands =
   python apache_beam/examples/complete/autocomplete_test.py
   bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}"
@@ -173,7 +173,7 @@
 setenv =
   TC_SLEEP_TIME = {env:TC_SLEEP_TIME:1}

 # NOTE: we could add ml_test to increase the collected code coverage metrics, but it would make the suite slower.
-extras = test,gcp,interactive,dataframe,aws
+extras = test,hdfs,gcp,interactive,dataframe,aws
 commands =
   bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" "--cov-report=xml --cov=. --cov-append"
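Taken together, the series turns hdfs from a hard dependency into the optional hadoop extra, defers the failure from import time to construction time with an actionable message, and keeps the test and coverage tox environments exercising the installed-client path. A generic sketch of that deferred-import guard, assuming the hdfs package exposes InsecureClient as in the HdfsCLI library (the class below is hypothetical, not Beam code):

    # Generic illustration of the guard from PATCH 4; HdfsBackedStore and its
    # constructor are hypothetical and only demonstrate the deferred error.
    try:
      import hdfs  # optional client, pulled in by the 'hadoop' extra
    except ImportError:
      hdfs = None


    class HdfsBackedStore:
      def __init__(self, host, port):
        # Fail only when the feature is actually used, not at import time.
        if hdfs is None:
          raise ImportError(
              'Failed to import hdfs. You can ensure it is '
              'installed by installing the hadoop beam extra')
        self._client = hdfs.InsecureClient('http://%s:%s' % (host, port))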