From e8697e00a09077f5b9c34ea7a69fb70a5c76c243 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 30 Dec 2025 19:00:44 +0200 Subject: [PATCH 01/26] Add block_simultaneous_read to DefaultStream --- .../concurrent_read_processor.py | 161 +++++- .../concurrent_source/concurrent_source.py | 4 +- .../declarative_component_schema.yaml | 11 + .../models/declarative_component_schema.py | 7 +- .../parsers/model_to_component_factory.py | 1 + .../streams/concurrent/abstract_stream.py | 11 + .../streams/concurrent/default_stream.py | 7 + .../test_model_to_component_factory.py | 138 ++++++ .../test_concurrent_read_processor.py | 469 ++++++++++++++++++ 9 files changed, 793 insertions(+), 16 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 905999a4d..6873130c1 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -66,17 +66,49 @@ def __init__( self._streams_done: Set[str] = set() self._exceptions_per_stream_name: dict[str, List[Exception]] = {} + # Track which streams (by name) are currently active + # A stream is "active" if it's generating partitions or has partitions being read + self._active_stream_names: Set[str] = set() + + # Store which streams require blocking simultaneous reads + self._stream_block_simultaneous_read: Dict[str, bool] = { + stream.name: stream.block_simultaneous_read for stream in stream_instances_to_read_from + } + + for stream in stream_instances_to_read_from: + if stream.block_simultaneous_read: + self._logger.info( + f"Stream '{stream.name}' has block_simultaneous_read=True. " + f"Will defer starting this stream if it or its parents are active." 
+ ) + def on_partition_generation_completed( self, sentinel: PartitionGenerationCompletedSentinel ) -> Iterable[AirbyteMessage]: """ This method is called when a partition generation is completed. 1. Remove the stream from the list of streams currently generating partitions - 2. If the stream is done, mark it as such and return a stream status message - 3. If there are more streams to read from, start the next partition generator + 2. Deactivate parent streams (they were only needed for partition generation) + 3. If the stream is done, mark it as such and return a stream status message + 4. If there are more streams to read from, start the next partition generator """ stream_name = sentinel.stream.name self._streams_currently_generating_partitions.remove(sentinel.stream.name) + + # Deactivate all parent streams now that partition generation is complete + # Parents were only needed to generate slices, they can now be reused + parent_streams = self._collect_all_parent_stream_names(stream_name) + for parent_stream_name in parent_streams: + if parent_stream_name in self._active_stream_names: + self._logger.debug(f"Removing '{parent_stream_name}' from active streams") + self._active_stream_names.discard(parent_stream_name) + if self._stream_block_simultaneous_read.get(parent_stream_name, False): + self._logger.info( + f"Parent stream '{parent_stream_name}' deactivated after " + f"partition generation completed for child '{stream_name}'. " + f"Blocked streams in the queue will be retried on next start_next_partition_generator call." + ) + # It is possible for the stream to already be done if no partitions were generated # If the partition generation process was completed and there are no partitions left to process, the stream is done if ( @@ -181,24 +213,81 @@ def _flag_exception(self, stream_name: str, exception: Exception) -> None: def start_next_partition_generator(self) -> Optional[AirbyteMessage]: """ - Start the next partition generator. - 1. 
Pop the next stream to read from - 2. Submit the partition generator to the thread pool manager - 3. Add the stream to the list of streams currently generating partitions - 4. Return a stream status message + Submits the next partition generator to the thread pool. + + A stream will be deferred (moved to end of queue) if: + 1. The stream itself has block_simultaneous_read=True AND is already active + 2. Any parent stream has block_simultaneous_read=True AND is currently active + + This prevents simultaneous reads of streams that shouldn't be accessed concurrently. + + :return: A status message if a partition generator was started, otherwise None """ - if self._stream_instances_to_start_partition_generation: + if not self._stream_instances_to_start_partition_generation: + return None + + # Remember initial queue size to avoid infinite loops if all streams are blocked + max_attempts = len(self._stream_instances_to_start_partition_generation) + attempts = 0 + + while self._stream_instances_to_start_partition_generation and attempts < max_attempts: + attempts += 1 + + # Pop the first stream from the queue stream = self._stream_instances_to_start_partition_generation.pop(0) + stream_name = stream.name + + # Check if this stream has block_simultaneous_read and is already active + if self._stream_block_simultaneous_read.get(stream_name, False) and stream_name in self._active_stream_names: + # Add back to the END of the queue for retry later + self._stream_instances_to_start_partition_generation.append(stream) + self._logger.info( + f"Deferring stream '{stream_name}' because it's already active " + f"(block_simultaneous_read=True). Trying next stream." 
+ ) + continue # Try the next stream in the queue + + # Check if any parent streams have block_simultaneous_read and are currently active + parent_streams = self._collect_all_parent_stream_names(stream_name) + blocked_by_parents = [ + p for p in parent_streams + if self._stream_block_simultaneous_read.get(p, False) and p in self._active_stream_names + ] + + if blocked_by_parents: + # Add back to the END of the queue for retry later + self._stream_instances_to_start_partition_generation.append(stream) + self._logger.info( + f"Deferring stream '{stream_name}' because parent stream(s) " + f"{blocked_by_parents} are active and have block_simultaneous_read=True. Trying next stream." + ) + continue # Try the next stream in the queue + + # No blocking - start this stream + # Mark stream as active before starting + self._active_stream_names.add(stream_name) + self._streams_currently_generating_partitions.append(stream_name) + + # Also mark all parent streams as active (they will be read from during partition generation) + parent_streams = self._collect_all_parent_stream_names(stream_name) + for parent_stream_name in parent_streams: + if self._stream_block_simultaneous_read.get(parent_stream_name, False): + self._active_stream_names.add(parent_stream_name) + self._logger.info( + f"Marking parent stream '{parent_stream_name}' as active " + f"(will be read during partition generation for '{stream_name}')" + ) + self._thread_pool_manager.submit(self._partition_enqueuer.generate_partitions, stream) - self._streams_currently_generating_partitions.append(stream.name) - self._logger.info(f"Marking stream {stream.name} as STARTED") - self._logger.info(f"Syncing stream: {stream.name} ") + self._logger.info(f"Marking stream {stream_name} as STARTED") + self._logger.info(f"Syncing stream: {stream_name}") return stream_status_as_airbyte_message( stream.as_airbyte_stream(), AirbyteStreamStatus.STARTED, ) - else: - return None + + # All streams in the queue are currently blocked + return 
None def is_done(self) -> bool: """ @@ -230,6 +319,43 @@ def is_done(self) -> bool: def _is_stream_done(self, stream_name: str) -> bool: return stream_name in self._streams_done + def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: + """ + Recursively collect all parent stream names for a given stream. + For example, if we have: epics -> issues -> comments + Then for comments, this returns {issues, epics} + + :param stream_name: The stream to collect parents for + :return: Set of all parent stream names (recursively) + """ + parent_names: Set[str] = set() + stream = self._stream_name_to_instance.get(stream_name) + + if not stream: + return parent_names + + # Get partition router if it exists (this is where parent streams are defined) + partition_router = None + + # Try DefaultStream path first (_stream_partition_generator._stream_slicer._partition_router) + if hasattr(stream, "_stream_partition_generator") and hasattr(stream._stream_partition_generator, "_stream_slicer") and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router"): + partition_router = stream._stream_partition_generator._stream_slicer._partition_router + # Fallback to legacy path (retriever.partition_router) for backward compatibility and test mocks + elif hasattr(stream, "retriever") and hasattr(stream.retriever, "partition_router"): + partition_router = stream.retriever.partition_router + + # SubstreamPartitionRouter has parent_stream_configs + if partition_router and hasattr(partition_router, "parent_stream_configs"): + for parent_config in partition_router.parent_stream_configs: + parent_stream = parent_config.stream + parent_name = parent_stream.name + parent_names.add(parent_name) + + # Recursively collect grandparents, great-grandparents, etc. 
+                parent_names.update(self._collect_all_parent_stream_names(parent_name))
+
+        return parent_names
+
     def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]:
         self._logger.info(
             f"Read {self._record_counter[stream_name]} records from {stream_name} stream"
@@ -246,3 +372,12 @@ def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]:
             else AirbyteStreamStatus.COMPLETE
         )
         yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), stream_status)
+
+        # Remove only this stream from active set (NOT parents)
+        if stream_name in self._active_stream_names:
+            self._active_stream_names.discard(stream_name)
+            if self._stream_block_simultaneous_read.get(stream_name, False):
+                self._logger.info(
+                    f"Stream '{stream_name}' is no longer active. "
+                    f"Blocked streams in the queue will be retried on next start_next_partition_generator call."
+                )
diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte_cdk/sources/concurrent_source/concurrent_source.py
index de2d93523..241282fa5 100644
--- a/airbyte_cdk/sources/concurrent_source/concurrent_source.py
+++ b/airbyte_cdk/sources/concurrent_source/concurrent_source.py
@@ -77,7 +77,7 @@ def __init__(
         slice_logger: SliceLogger = DebugSliceLogger(),
         queue: Optional[Queue[QueueItem]] = None,
         message_repository: MessageRepository = InMemoryMessageRepository(),
-        initial_number_partitions_to_generate: int = 1,
+        initial_number_partitions_to_generate: int = 10,
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
     ) -> None:
         """
@@ -92,7 +92,7 @@ def __init__(
         self._logger = logger
         self._slice_logger = slice_logger
         self._message_repository = message_repository
-        self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
+        self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
         self._timeout_seconds = timeout_seconds
         # We set a maxsize to for the main thread to process record items when the queue size grows.
This assumes that there are less diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index e04a82c0d..7ebe777eb 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1553,6 +1553,17 @@ definitions: default: "" example: - "Users" + block_simultaneous_read: + title: Block Simultaneous Read + description: > + When true, prevents simultaneous reading of this stream from multiple contexts + (e.g., as both a parent stream and a standalone stream). If the stream OR any + of its parent streams are currently active, this stream will be deferred until + they finish. This is useful for APIs that don't allow concurrent access to the + same endpoint. Default is false for backward compatibility. + Only applies to ConcurrentDeclarativeSource. + type: boolean + default: false retriever: title: Retriever description: Component used to coordinate how records are extracted across stream slices and request pages. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index b78a07021..5aaed138d 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -2497,6 +2497,11 @@ class Config: type: Literal["DeclarativeStream"] name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") + block_simultaneous_read: Optional[bool] = Field( + False, + description="When true, prevents simultaneous reading of this stream from multiple contexts (e.g., as both a parent stream and a standalone stream). If the stream OR any of its parent streams are currently active, this stream will be deferred until they finish. 
This is useful for APIs that don't allow concurrent access to the same endpoint. Default is false for backward compatibility. Only applies to ConcurrentDeclarativeSource.\n", + title="Block Simultaneous Read", + ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages.", @@ -2741,7 +2746,7 @@ class HttpRequester(BaseModelWithDeprecations): ) use_cache: Optional[bool] = Field( False, - description="Enables stream requests caching. This field is automatically set by the CDK.", + description="Enables stream requests caching. When set to true, repeated requests to the same URL will return cached responses. Parent streams automatically have caching enabled. Only set this to false if you are certain that caching should be disabled, as it may negatively impact performance when the same data is needed multiple times (e.g., for scroll-based pagination APIs where caching causes duplicate records).", title="Use Cache", ) parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3a772b691..3b29e30d6 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2118,6 +2118,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), + block_simultaneous_read=model.block_simultaneous_read or False, ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 667d088ab..0052eafa3 100644 
--- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -85,6 +85,17 @@ def cursor(self) -> Cursor: :return: The cursor associated with this stream. """ + @property + def block_simultaneous_read(self) -> bool: + """ + Override to return True if this stream should block simultaneous reads. + When True, prevents starting partition generation for this stream if it + OR any of its parent streams are already active. + + :return: True if simultaneous reads should be blocked, False otherwise + """ + return False # Default: allow concurrent reading + @abstractmethod def check_availability(self) -> StreamAvailability: """ diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index f5d4ccf2e..84d955105 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -26,6 +26,7 @@ def __init__( cursor: Cursor, namespace: Optional[str] = None, supports_file_transfer: bool = False, + block_simultaneous_read: bool = False, ) -> None: self._stream_partition_generator = partition_generator self._name = name @@ -36,6 +37,7 @@ def __init__( self._cursor = cursor self._namespace = namespace self._supports_file_transfer = supports_file_transfer + self._block_simultaneous_read = block_simultaneous_read def generate_partitions(self) -> Iterable[Partition]: yield from self._stream_partition_generator.generate() @@ -94,6 +96,11 @@ def log_stream_sync_configuration(self) -> None: def cursor(self) -> Cursor: return self._cursor + @property + def block_simultaneous_read(self) -> bool: + """Returns whether this stream should block simultaneous reads""" + return self._block_simultaneous_read + def check_availability(self) -> StreamAvailability: """ Check stream availability by attempting to read the first record of the stream. 
diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index dcdc2bcff..a83734a51 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5214,6 +5214,144 @@ def test_catalog_defined_cursor_field_stream_missing(): assert stream._cursor_field.supports_catalog_defined_cursor_field == True +def test_block_simultaneous_read_from_manifest(): + """Test that block_simultaneous_read flows through from manifest to DefaultStream""" + content = """ + parent_stream: + type: DeclarativeStream + name: "parent" + primary_key: "id" + block_simultaneous_read: true + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/parent" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + + child_stream: + type: DeclarativeStream + name: "child" + primary_key: "id" + block_simultaneous_read: true + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/child" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + partition_router: + type: SubstreamPartitionRouter + parent_stream_configs: + - type: ParentStreamConfig + stream: "#/parent_stream" + parent_key: "id" + partition_field: "parent_id" + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + parent_id: + type: string + + no_block_stream: + type: 
DeclarativeStream + name: "no_block" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/no_block" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + """ + + config = {"api_key": "test_key"} + + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + + # Test parent stream with block_simultaneous_read: true + parent_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["parent_stream"], {} + ) + parent_stream: DefaultStream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config + ) + + assert isinstance(parent_stream, DefaultStream) + assert parent_stream.name == "parent" + assert parent_stream.block_simultaneous_read is True + + # Test child stream with block_simultaneous_read: true + child_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["child_stream"], {} + ) + child_stream: DefaultStream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config + ) + + assert isinstance(child_stream, DefaultStream) + assert child_stream.name == "child" + assert child_stream.block_simultaneous_read is True + + # Test stream without block_simultaneous_read (should default to False) + no_block_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["no_block_stream"], {} + ) + no_block_stream: DefaultStream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config + ) + + assert isinstance(no_block_stream, DefaultStream) 
+ assert no_block_stream.name == "no_block" + assert no_block_stream.block_simultaneous_read is False + + def get_schema_loader(stream: DefaultStream): assert isinstance( stream._stream_partition_generator._partition_factory._schema_loader, diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index a681f75eb..f1fad0799 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -792,3 +792,472 @@ def test_start_next_partition_generator(self): self._thread_pool_manager.submit.assert_called_with( self._partition_enqueuer.generate_partitions, self._stream ) + + +class TestBlockSimultaneousRead(unittest.TestCase): + """Tests for block_simultaneous_read functionality""" + + def setUp(self): + self._partition_enqueuer = Mock(spec=PartitionEnqueuer) + self._thread_pool_manager = Mock(spec=ThreadPoolManager) + self._logger = Mock(spec=logging.Logger) + self._slice_logger = Mock(spec=SliceLogger) + self._message_repository = Mock(spec=MessageRepository) + self._message_repository.consume_queue.return_value = [] + self._partition_reader = Mock(spec=PartitionReader) + + def _create_mock_stream(self, name: str, block_simultaneous_read: bool = False): + """Helper to create a mock stream""" + stream = Mock(spec=AbstractStream) + stream.name = name + stream.block_simultaneous_read = block_simultaneous_read + stream.as_airbyte_stream.return_value = AirbyteStream( + name=name, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + stream.cursor.ensure_at_least_one_state_emitted = Mock() + return stream + + def _create_mock_stream_with_parent( + self, name: str, parent_stream, block_simultaneous_read: bool = False + ): + """Helper to create a mock stream with a parent stream""" + stream = self._create_mock_stream(name, block_simultaneous_read) + + # Mock the 
retriever and partition router for parent relationship + mock_retriever = Mock() + mock_partition_router = Mock() + mock_parent_config = Mock() + mock_parent_config.stream = parent_stream + + mock_partition_router.parent_stream_configs = [mock_parent_config] + mock_retriever.partition_router = mock_partition_router + stream.retriever = mock_retriever + + return stream + + def test_defer_stream_when_self_active(self): + """Test that a stream is deferred when it's already active""" + stream = self._create_mock_stream("stream1", block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark stream as active + handler._active_stream_names.add("stream1") + + # Try to start the stream again + result = handler.start_next_partition_generator() + + # Should return None (no stream started) + assert result is None + + # Stream should be back in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert handler._stream_instances_to_start_partition_generation[0] == stream + + # Logger should have been called to log deferral + assert any( + "Deferring stream 'stream1' because it's already active" in str(call) + for call in self._logger.info.call_args_list + ) + + def test_defer_stream_when_parent_active(self): + """Test that a stream is deferred when its parent is active""" + parent_stream = self._create_mock_stream("parent", block_simultaneous_read=True) + child_stream = self._create_mock_stream_with_parent( + "child", parent_stream, block_simultaneous_read=True + ) + + handler = ConcurrentReadProcessor( + [parent_stream, child_stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark parent as active + handler._active_stream_names.add("parent") + + # 
Remove parent from queue (simulate it's already started) + handler._stream_instances_to_start_partition_generation = [child_stream] + + # Try to start child + result = handler.start_next_partition_generator() + + # Should return None (child deferred) + assert result is None + + # Child should be back in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert handler._stream_instances_to_start_partition_generation[0] == child_stream + + # Logger should have been called + assert any( + "Deferring stream 'child' because parent stream(s)" in str(call) + for call in self._logger.info.call_args_list + ) + + def test_defer_stream_when_grandparent_active(self): + """Test that a stream is deferred when its grandparent is active""" + grandparent = self._create_mock_stream("grandparent", block_simultaneous_read=True) + parent = self._create_mock_stream_with_parent( + "parent", grandparent, block_simultaneous_read=True + ) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [grandparent, parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark grandparent as active + handler._active_stream_names.add("grandparent") + + # Only child in queue + handler._stream_instances_to_start_partition_generation = [child] + + # Try to start child + result = handler.start_next_partition_generator() + + # Should return None (child deferred because grandparent is active) + assert result is None + + # Child should be back in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + def test_retry_blocked_stream_after_blocker_done(self): + """Test that blocked stream is retried after blocker finishes""" + stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) + stream2 = self._create_mock_stream("stream2", 
block_simultaneous_read=True)
+
+        handler = ConcurrentReadProcessor(
+            [stream1, stream2],
+            self._partition_enqueuer,
+            self._thread_pool_manager,
+            self._logger,
+            self._slice_logger,
+            self._message_repository,
+            self._partition_reader,
+        )
+
+        # Start stream1
+        handler.start_next_partition_generator()
+        assert "stream1" in handler._active_stream_names
+
+        # stream1 and stream2 are independent streams (no parent/child link),
+        # so an active stream1 only blocks re-starting stream1 itself;
+        # stream2 remains startable.
+
+        # Mark stream1 as active to simulate it's running
+        handler._active_stream_names.add("stream1")
+        handler._stream_instances_to_start_partition_generation = [stream1, stream2]
+
+        # Try to start stream1 again (should be deferred because already active)
+        result = handler.start_next_partition_generator()
+
+        # Should start stream2 instead (stream1 was deferred)
+        assert result is not None
+        assert "stream2" in handler._active_stream_names
+        assert len(handler._stream_instances_to_start_partition_generation) == 1
+
+    def test_retry_blocked_stream_after_partition_generation(self):
+        """Test that blocked stream is retried after partition generation completes"""
+        parent = self._create_mock_stream("parent", block_simultaneous_read=True)
+        child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True)
+
+        handler = ConcurrentReadProcessor(
+            [parent, child],
+            self._partition_enqueuer,
+            self._thread_pool_manager,
+            self._logger,
+            self._slice_logger,
+            self._message_repository,
+            self._partition_reader,
+        )
+
+        # Start parent
+        handler.start_next_partition_generator()
+        assert "parent" in handler._active_stream_names
+
+        # Mark parent as generating partitions and having no partitions
+        handler._streams_currently_generating_partitions.append("parent")
+        handler._streams_to_running_partitions["parent"] = set()
+
+        # Complete partition generation for parent (parent has no partitions, so
it's done) + sentinel = PartitionGenerationCompletedSentinel(parent) + messages = list(handler.on_partition_generation_completed(sentinel)) + + # Child should have been started automatically by on_partition_generation_completed + # (it calls start_next_partition_generator internally) + assert "child" in handler._active_stream_names + + # Parent should be RE-ACTIVATED because child needs to read from it during partition generation + # This is the correct behavior - prevents simultaneous reads of parent + assert "parent" in handler._active_stream_names + + # Verify the queue is now empty (both streams were started) + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + def test_blocked_stream_added_to_end_of_queue(self): + """Test that blocked streams are added to the end of the queue""" + stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) + stream2 = self._create_mock_stream("stream2", block_simultaneous_read=False) + stream3 = self._create_mock_stream("stream3", block_simultaneous_read=False) + + handler = ConcurrentReadProcessor( + [stream1, stream2, stream3], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark stream1 as active + handler._active_stream_names.add("stream1") + + # Try to start streams in order: stream1, stream2, stream3 + result1 = handler.start_next_partition_generator() + + # stream1 should be deferred, stream2 should start + assert result1 is not None + assert "stream2" in handler._active_stream_names + + # Queue should now be [stream3, stream1] (stream1 moved to end) + assert len(handler._stream_instances_to_start_partition_generation) == 2 + assert handler._stream_instances_to_start_partition_generation[0] == stream3 + assert handler._stream_instances_to_start_partition_generation[1] == stream1 + + def test_no_defer_when_flag_false(self): + """Test that blocking doesn't occur when 
block_simultaneous_read=False""" + stream = self._create_mock_stream("stream1", block_simultaneous_read=False) + + handler = ConcurrentReadProcessor( + [stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark stream as active + handler._active_stream_names.add("stream1") + + # Try to start the stream again (should succeed because flag is False) + result = handler.start_next_partition_generator() + + # Should return a status message (stream started) + assert result is not None + assert isinstance(result, AirbyteMessage) + + # Queue should be empty + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + def test_collect_parent_streams_multi_level(self): + """Test that _collect_all_parent_stream_names works recursively""" + grandparent = self._create_mock_stream("grandparent") + parent = self._create_mock_stream_with_parent("parent", grandparent) + child = self._create_mock_stream_with_parent("child", parent) + + handler = ConcurrentReadProcessor( + [grandparent, parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Collect parents for child + parents = handler._collect_all_parent_stream_names("child") + + # Should include both parent and grandparent + assert "parent" in parents + assert "grandparent" in parents + assert len(parents) == 2 + + def test_deactivate_parents_when_partition_generation_completes(self): + """Test that parent streams are deactivated when partition generation completes""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, 
+ self._message_repository, + self._partition_reader, + ) + + # Manually mark both as active (simulating partition generation for child) + handler._active_stream_names.add("parent") + handler._active_stream_names.add("child") + handler._streams_currently_generating_partitions.append("child") + + # Ensure child has running partitions (so it doesn't trigger _on_stream_is_done) + mock_partition = Mock(spec=Partition) + mock_partition.stream_name.return_value = "child" + handler._streams_to_running_partitions["child"] = {mock_partition} + + # Remove both streams from the queue so start_next_partition_generator doesn't start them + # This simulates the scenario where both streams have already been started + handler._stream_instances_to_start_partition_generation = [] + + # Complete partition generation for child + sentinel = PartitionGenerationCompletedSentinel(child) + + list(handler.on_partition_generation_completed(sentinel)) + + # Parent should be deactivated (it was only needed for partition generation) + assert "parent" not in handler._active_stream_names + + # Child should still be active (it's reading records) + assert "child" in handler._active_stream_names + + def test_deactivate_only_stream_when_done(self): + """Test that only the stream itself is deactivated when done, not parents""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark both as active + handler._active_stream_names.add("parent") + handler._active_stream_names.add("child") + + # Start child and mark it as done + handler._stream_instances_to_start_partition_generation = [] + handler._streams_currently_generating_partitions = [] + 
handler._streams_to_running_partitions["child"] = set() + + # Call _on_stream_is_done for child + list(handler._on_stream_is_done("child")) + + # Child should be deactivated + assert "child" not in handler._active_stream_names + + # Parent should still be active (not deactivated) + assert "parent" in handler._active_stream_names + + def test_multiple_blocked_streams_retry_in_order(self): + """Test that multiple blocked streams are retried in order""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child1 = self._create_mock_stream_with_parent( + "child1", parent, block_simultaneous_read=True + ) + child2 = self._create_mock_stream_with_parent( + "child2", parent, block_simultaneous_read=True + ) + + handler = ConcurrentReadProcessor( + [parent, child1, child2], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + result = handler.start_next_partition_generator() + assert result is not None + assert "parent" in handler._active_stream_names + + # Try to start child1 (should be deferred) + result = handler.start_next_partition_generator() + # child1 is deferred, but child2 might start if it's not blocked + # Let me check the queue state + + # Both children should be deferred (parent is active) + assert len(handler._stream_instances_to_start_partition_generation) >= 1 + + def test_child_without_flag_blocked_by_parent_with_flag(self): + """Test that a child WITHOUT block_simultaneous_read is blocked by parent WITH the flag""" + # Parent has the flag, child does NOT + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=False) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + 
self._partition_reader, + ) + + # Mark parent as active and already started (remove from queue) + handler._active_stream_names.add("parent") + handler._stream_instances_to_start_partition_generation.remove(parent) + + # Try to start child (should be deferred even though child doesn't have the flag) + result = handler.start_next_partition_generator() + + # Child should be deferred because parent has block_simultaneous_read=True and is active + assert result is None # No stream started + assert "child" not in handler._active_stream_names + # Child should be moved to end of queue (still 1 stream in queue) + assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert handler._stream_instances_to_start_partition_generation[0] == child + + def test_child_with_flag_not_blocked_by_parent_without_flag(self): + """Test that a child WITH block_simultaneous_read is NOT blocked by parent WITHOUT the flag""" + # Parent does NOT have the flag, child does + parent = self._create_mock_stream("parent", block_simultaneous_read=False) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark parent as active and already started (remove from queue) + handler._active_stream_names.add("parent") + handler._stream_instances_to_start_partition_generation.remove(parent) + + # Try to start child (should succeed even though parent is active) + result = handler.start_next_partition_generator() + + # Child should start successfully because parent doesn't have block_simultaneous_read + assert result is not None # Stream started + assert "child" in handler._active_stream_names + # Queue should now be empty (both streams started) + assert len(handler._stream_instances_to_start_partition_generation) == 0 From 
1b4a9f01b0cf001bea019dfbe038e41825e8dcca Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 6 Jan 2026 17:16:31 +0200 Subject: [PATCH 02/26] Change `block_simultaneous_read` to string --- .../concurrent_read_processor.py | 92 ++++++++++--- .../concurrent_source/concurrent_source.py | 4 +- .../declarative_component_schema.yaml | 25 ++-- .../models/declarative_component_schema.py | 8 +- .../parsers/model_to_component_factory.py | 2 +- .../streams/concurrent/abstract_stream.py | 16 ++- .../streams/concurrent/default_stream.py | 6 +- .../test_model_to_component_factory.py | 14 +- .../test_concurrent_read_processor.py | 127 ++++++++++++------ 9 files changed, 201 insertions(+), 93 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 6873130c1..1901d0d0d 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -70,16 +70,21 @@ def __init__( # A stream is "active" if it's generating partitions or has partitions being read self._active_stream_names: Set[str] = set() - # Store which streams require blocking simultaneous reads - self._stream_block_simultaneous_read: Dict[str, bool] = { + # Store blocking group names for streams that require blocking simultaneous reads + # Maps stream name -> group name (empty string means no blocking) + self._stream_block_simultaneous_read: Dict[str, str] = { stream.name: stream.block_simultaneous_read for stream in stream_instances_to_read_from } + # Track which groups are currently active + # Maps group name -> set of stream names in that group + self._active_groups: Dict[str, Set[str]] = {} + for stream in stream_instances_to_read_from: if stream.block_simultaneous_read: self._logger.info( - f"Stream '{stream.name}' has block_simultaneous_read=True. " - f"Will defer starting this stream if it or its parents are active." 
+ f"Stream '{stream.name}' is in blocking group '{stream.block_simultaneous_read}'. " + f"Will defer starting this stream if another stream in the same group or its parents are active." ) def on_partition_generation_completed( @@ -102,9 +107,16 @@ def on_partition_generation_completed( if parent_stream_name in self._active_stream_names: self._logger.debug(f"Removing '{parent_stream_name}' from active streams") self._active_stream_names.discard(parent_stream_name) - if self._stream_block_simultaneous_read.get(parent_stream_name, False): + + # Remove from active groups + parent_group = self._stream_block_simultaneous_read.get(parent_stream_name, "") + if parent_group: + if parent_group in self._active_groups: + self._active_groups[parent_group].discard(parent_stream_name) + if not self._active_groups[parent_group]: + del self._active_groups[parent_group] self._logger.info( - f"Parent stream '{parent_stream_name}' deactivated after " + f"Parent stream '{parent_stream_name}' (group '{parent_group}') deactivated after " f"partition generation completed for child '{stream_name}'. " f"Blocked streams in the queue will be retried on next start_next_partition_generator call." ) @@ -236,30 +248,50 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: # Pop the first stream from the queue stream = self._stream_instances_to_start_partition_generation.pop(0) stream_name = stream.name + stream_group = self._stream_block_simultaneous_read.get(stream_name, "") + + # Check if this stream has a blocking group and is already active + if stream_group and stream_name in self._active_stream_names: + # Add back to the END of the queue for retry later + self._stream_instances_to_start_partition_generation.append(stream) + self._logger.info( + f"Deferring stream '{stream_name}' (group '{stream_group}') because it's already active. Trying next stream." 
+ ) + continue # Try the next stream in the queue - # Check if this stream has block_simultaneous_read and is already active - if self._stream_block_simultaneous_read.get(stream_name, False) and stream_name in self._active_stream_names: + # Check if this stream's group is already active (another stream in the same group is running) + if ( + stream_group + and stream_group in self._active_groups + and self._active_groups[stream_group] + ): # Add back to the END of the queue for retry later self._stream_instances_to_start_partition_generation.append(stream) + active_streams_in_group = self._active_groups[stream_group] self._logger.info( - f"Deferring stream '{stream_name}' because it's already active " - f"(block_simultaneous_read=True). Trying next stream." + f"Deferring stream '{stream_name}' (group '{stream_group}') because other stream(s) " + f"{active_streams_in_group} in the same group are active. Trying next stream." ) continue # Try the next stream in the queue - # Check if any parent streams have block_simultaneous_read and are currently active + # Check if any parent streams have a blocking group and are currently active parent_streams = self._collect_all_parent_stream_names(stream_name) blocked_by_parents = [ - p for p in parent_streams - if self._stream_block_simultaneous_read.get(p, False) and p in self._active_stream_names + p + for p in parent_streams + if self._stream_block_simultaneous_read.get(p, "") + and p in self._active_stream_names ] if blocked_by_parents: # Add back to the END of the queue for retry later self._stream_instances_to_start_partition_generation.append(stream) + parent_groups = { + self._stream_block_simultaneous_read.get(p, "") for p in blocked_by_parents + } self._logger.info( f"Deferring stream '{stream_name}' because parent stream(s) " - f"{blocked_by_parents} are active and have block_simultaneous_read=True. Trying next stream." + f"{blocked_by_parents} (groups {parent_groups}) are active. Trying next stream." 
) continue # Try the next stream in the queue @@ -268,13 +300,24 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: self._active_stream_names.add(stream_name) self._streams_currently_generating_partitions.append(stream_name) + # Track this stream in its group if it has one + if stream_group: + if stream_group not in self._active_groups: + self._active_groups[stream_group] = set() + self._active_groups[stream_group].add(stream_name) + self._logger.debug(f"Added '{stream_name}' to active group '{stream_group}'") + # Also mark all parent streams as active (they will be read from during partition generation) parent_streams = self._collect_all_parent_stream_names(stream_name) for parent_stream_name in parent_streams: - if self._stream_block_simultaneous_read.get(parent_stream_name, False): + parent_group = self._stream_block_simultaneous_read.get(parent_stream_name, "") + if parent_group: self._active_stream_names.add(parent_stream_name) + if parent_group not in self._active_groups: + self._active_groups[parent_group] = set() + self._active_groups[parent_group].add(parent_stream_name) self._logger.info( - f"Marking parent stream '{parent_stream_name}' as active " + f"Marking parent stream '{parent_stream_name}' (group '{parent_group}') as active " f"(will be read during partition generation for '{stream_name}')" ) @@ -338,7 +381,11 @@ def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: partition_router = None # Try DefaultStream path first (_stream_partition_generator._stream_slicer._partition_router) - if hasattr(stream, "_stream_partition_generator") and hasattr(stream._stream_partition_generator, "_stream_slicer") and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router"): + if ( + hasattr(stream, "_stream_partition_generator") + and hasattr(stream._stream_partition_generator, "_stream_slicer") + and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router") + ): 
partition_router = stream._stream_partition_generator._stream_slicer._partition_router # Fallback to legacy path (retriever.partition_router) for backward compatibility and test mocks elif hasattr(stream, "retriever") and hasattr(stream.retriever, "partition_router"): @@ -376,8 +423,15 @@ def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]: # Remove only this stream from active set (NOT parents) if stream_name in self._active_stream_names: self._active_stream_names.discard(stream_name) - if self._stream_block_simultaneous_read.get(stream_name, False): + + # Remove from active groups + stream_group = self._stream_block_simultaneous_read.get(stream_name, "") + if stream_group: + if stream_group in self._active_groups: + self._active_groups[stream_group].discard(stream_name) + if not self._active_groups[stream_group]: + del self._active_groups[stream_group] self._logger.info( - f"Stream '{stream_name}' is no longer active. " + f"Stream '{stream_name}' (group '{stream_group}') is no longer active. " f"Blocked streams in the queue will be retried on next start_next_partition_generator call." 
) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte_cdk/sources/concurrent_source/concurrent_source.py index 241282fa5..de2d93523 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_source.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_source.py @@ -77,7 +77,7 @@ def __init__( slice_logger: SliceLogger = DebugSliceLogger(), queue: Optional[Queue[QueueItem]] = None, message_repository: MessageRepository = InMemoryMessageRepository(), - initial_number_partitions_to_generate: int = 10, + initial_number_partitions_to_generate: int = 1, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, ) -> None: """ @@ -92,7 +92,7 @@ def __init__( self._logger = logger self._slice_logger = slice_logger self._message_repository = message_repository - self._initial_number_partitions_to_generate = 10 + self._initial_number_partitions_to_generate = initial_number_partitions_to_generate self._timeout_seconds = timeout_seconds # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 7ebe777eb..3fceae9c7 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1556,14 +1556,23 @@ definitions: block_simultaneous_read: title: Block Simultaneous Read description: > - When true, prevents simultaneous reading of this stream from multiple contexts - (e.g., as both a parent stream and a standalone stream). If the stream OR any - of its parent streams are currently active, this stream will be deferred until - they finish. This is useful for APIs that don't allow concurrent access to the - same endpoint. Default is false for backward compatibility. - Only applies to ConcurrentDeclarativeSource. 
- type: boolean - default: false + Optional group name for blocking simultaneous reads. Streams with the same + block_simultaneous_read value will not be read concurrently. This prevents + duplicate API calls when a stream is used as both a standalone stream and a + parent stream, or when multiple streams share the same endpoint/session. + + If set to a non-empty string, the stream will be deferred if: + 1. Another stream in the same group is currently active + 2. Any parent stream is in an active group + + Examples: + - "issues_endpoint" - All streams with this value block each other + - "" or null - No blocking (default) + + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. + type: string + default: "" retriever: title: Retriever description: Component used to coordinate how records are extracted across stream slices and request pages. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 5aaed138d..675fa1216 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -1,5 +1,3 @@ -# Copyright (c) 2025 Airbyte, Inc., all rights reserved. - # generated by datamodel-codegen: # filename: declarative_component_schema.yaml @@ -2497,9 +2495,9 @@ class Config: type: Literal["DeclarativeStream"] name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") - block_simultaneous_read: Optional[bool] = Field( - False, - description="When true, prevents simultaneous reading of this stream from multiple contexts (e.g., as both a parent stream and a standalone stream). If the stream OR any of its parent streams are currently active, this stream will be deferred until they finish. 
This is useful for APIs that don't allow concurrent access to the same endpoint. Default is false for backward compatibility. Only applies to ConcurrentDeclarativeSource.\n", + block_simultaneous_read: Optional[str] = Field( + "", + description='Optional group name for blocking simultaneous reads. Streams with the same block_simultaneous_read value will not be read concurrently. This prevents duplicate API calls when a stream is used as both a standalone stream and a parent stream, or when multiple streams share the same endpoint/session.\nIf set to a non-empty string, the stream will be deferred if: 1. Another stream in the same group is currently active 2. Any parent stream is in an active group\nExamples: - "issues_endpoint" - All streams with this value block each other - "" or null - No blocking (default)\nThis is useful for APIs that don\'t allow concurrent access to the same endpoint or session. Only applies to ConcurrentDeclarativeSource.\n', title="Block Simultaneous Read", ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3b29e30d6..229654940 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2118,7 +2118,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=model.block_simultaneous_read or False, + block_simultaneous_read=model.block_simultaneous_read or "", ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 0052eafa3..e7b24f614 
100644 --- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -86,15 +86,19 @@ def cursor(self) -> Cursor: """ @property - def block_simultaneous_read(self) -> bool: + def block_simultaneous_read(self) -> str: """ - Override to return True if this stream should block simultaneous reads. - When True, prevents starting partition generation for this stream if it - OR any of its parent streams are already active. + Override to return a non-empty group name if this stream should block simultaneous reads. + When a non-empty string is returned, prevents starting partition generation for this stream if: + - Another stream with the same group name is already active + - Any of its parent streams are in an active group - :return: True if simultaneous reads should be blocked, False otherwise + This allows grouping multiple streams that share the same resource (e.g., API endpoint or session) + to prevent them from running concurrently, even if they don't have a parent-child relationship. 
+ + :return: Group name for blocking (non-empty string), or "" to allow concurrent reading """ - return False # Default: allow concurrent reading + return "" # Default: allow concurrent reading @abstractmethod def check_availability(self) -> StreamAvailability: diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 84d955105..5ca11eaf8 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -26,7 +26,7 @@ def __init__( cursor: Cursor, namespace: Optional[str] = None, supports_file_transfer: bool = False, - block_simultaneous_read: bool = False, + block_simultaneous_read: str = "", ) -> None: self._stream_partition_generator = partition_generator self._name = name @@ -97,8 +97,8 @@ def cursor(self) -> Cursor: return self._cursor @property - def block_simultaneous_read(self) -> bool: - """Returns whether this stream should block simultaneous reads""" + def block_simultaneous_read(self) -> str: + """Returns the blocking group name for this stream, or empty string if no blocking""" return self._block_simultaneous_read def check_availability(self) -> StreamAvailability: diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index a83734a51..88134d29c 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5221,7 +5221,7 @@ def test_block_simultaneous_read_from_manifest(): type: DeclarativeStream name: "parent" primary_key: "id" - block_simultaneous_read: true + block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5249,7 +5249,7 @@ def test_block_simultaneous_read_from_manifest(): type: DeclarativeStream name: "child" primary_key: "id" - 
block_simultaneous_read: true + block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5325,9 +5325,9 @@ def test_block_simultaneous_read_from_manifest(): assert isinstance(parent_stream, DefaultStream) assert parent_stream.name == "parent" - assert parent_stream.block_simultaneous_read is True + assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream with block_simultaneous_read: true + # Test child stream with block_simultaneous_read: "issues_endpoint" child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["child_stream"], {} ) @@ -5337,9 +5337,9 @@ def test_block_simultaneous_read_from_manifest(): assert isinstance(child_stream, DefaultStream) assert child_stream.name == "child" - assert child_stream.block_simultaneous_read is True + assert child_stream.block_simultaneous_read == "issues_endpoint" - # Test stream without block_simultaneous_read (should default to False) + # Test stream without block_simultaneous_read (should default to empty string) no_block_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["no_block_stream"], {} ) @@ -5349,7 +5349,7 @@ def test_block_simultaneous_read_from_manifest(): assert isinstance(no_block_stream, DefaultStream) assert no_block_stream.name == "no_block" - assert no_block_stream.block_simultaneous_read is False + assert no_block_stream.block_simultaneous_read == "" def get_schema_loader(stream: DefaultStream): diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index f1fad0799..e76f0576c 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -806,7 +806,7 @@ def setUp(self): self._message_repository.consume_queue.return_value = [] self._partition_reader = Mock(spec=PartitionReader) - def 
_create_mock_stream(self, name: str, block_simultaneous_read: bool = False): + def _create_mock_stream(self, name: str, block_simultaneous_read: str = ""): """Helper to create a mock stream""" stream = Mock(spec=AbstractStream) stream.name = name @@ -820,7 +820,7 @@ def _create_mock_stream(self, name: str, block_simultaneous_read: bool = False): return stream def _create_mock_stream_with_parent( - self, name: str, parent_stream, block_simultaneous_read: bool = False + self, name: str, parent_stream, block_simultaneous_read: str = "" ): """Helper to create a mock stream with a parent stream""" stream = self._create_mock_stream(name, block_simultaneous_read) @@ -839,7 +839,7 @@ def _create_mock_stream_with_parent( def test_defer_stream_when_self_active(self): """Test that a stream is deferred when it's already active""" - stream = self._create_mock_stream("stream1", block_simultaneous_read=True) + stream = self._create_mock_stream("stream1", block_simultaneous_read="api_group") handler = ConcurrentReadProcessor( [stream], @@ -866,15 +866,16 @@ def test_defer_stream_when_self_active(self): # Logger should have been called to log deferral assert any( - "Deferring stream 'stream1' because it's already active" in str(call) + "Deferring stream 'stream1' (group 'api_group') because it's already active" + in str(call) for call in self._logger.info.call_args_list ) def test_defer_stream_when_parent_active(self): """Test that a stream is deferred when its parent is active""" - parent_stream = self._create_mock_stream("parent", block_simultaneous_read=True) + parent_stream = self._create_mock_stream("parent", block_simultaneous_read="api_group") child_stream = self._create_mock_stream_with_parent( - "child", parent_stream, block_simultaneous_read=True + "child", parent_stream, block_simultaneous_read="api_group" ) handler = ConcurrentReadProcessor( @@ -911,11 +912,13 @@ def test_defer_stream_when_parent_active(self): def test_defer_stream_when_grandparent_active(self): """Test 
that a stream is deferred when its grandparent is active""" - grandparent = self._create_mock_stream("grandparent", block_simultaneous_read=True) + grandparent = self._create_mock_stream("grandparent", block_simultaneous_read="api_group") parent = self._create_mock_stream_with_parent( - "parent", grandparent, block_simultaneous_read=True + "parent", grandparent, block_simultaneous_read="api_group" + ) + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" ) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) handler = ConcurrentReadProcessor( [grandparent, parent, child], @@ -943,9 +946,9 @@ def test_defer_stream_when_grandparent_active(self): assert len(handler._stream_instances_to_start_partition_generation) == 1 def test_retry_blocked_stream_after_blocker_done(self): - """Test that blocked stream is retried after blocker finishes""" - stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) - stream2 = self._create_mock_stream("stream2", block_simultaneous_read=True) + """Test that independent streams with different groups don't block each other""" + stream1 = self._create_mock_stream("stream1", block_simultaneous_read="group1") + stream2 = self._create_mock_stream("stream2", block_simultaneous_read="group2") handler = ConcurrentReadProcessor( [stream1, stream2], @@ -961,26 +964,21 @@ def test_retry_blocked_stream_after_blocker_done(self): handler.start_next_partition_generator() assert "stream1" in handler._active_stream_names - # Try to start stream2 (should be deferred since stream1 is active and they share block flag) - # But wait - they're not parent-child, so stream2 should start successfully - # Let me fix the test logic - - # Mark stream1 as active to simulate it's running - handler._active_stream_names.add("stream1") - handler._stream_instances_to_start_partition_generation = [stream1, stream2] - - # Try to start stream1 again (should be deferred 
because already active) + # Stream2 should start successfully even though stream1 is active + # because they're in different groups result = handler.start_next_partition_generator() - # Should start stream2 instead (stream1 was deferred) + # Should start stream2 (different group, no blocking) assert result is not None assert "stream2" in handler._active_stream_names - assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert len(handler._stream_instances_to_start_partition_generation) == 0 def test_retry_blocked_stream_after_partition_generation(self): """Test that blocked stream is retried after partition generation completes""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1017,9 +1015,9 @@ def test_retry_blocked_stream_after_partition_generation(self): def test_blocked_stream_added_to_end_of_queue(self): """Test that blocked streams are added to the end of the queue""" - stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) - stream2 = self._create_mock_stream("stream2", block_simultaneous_read=False) - stream3 = self._create_mock_stream("stream3", block_simultaneous_read=False) + stream1 = self._create_mock_stream("stream1", block_simultaneous_read="api_group") + stream2 = self._create_mock_stream("stream2", block_simultaneous_read="") + stream3 = self._create_mock_stream("stream3", block_simultaneous_read="") handler = ConcurrentReadProcessor( [stream1, stream2, stream3], @@ -1047,8 +1045,8 @@ def test_blocked_stream_added_to_end_of_queue(self): assert handler._stream_instances_to_start_partition_generation[1] == stream1 def 
test_no_defer_when_flag_false(self): - """Test that blocking doesn't occur when block_simultaneous_read=False""" - stream = self._create_mock_stream("stream1", block_simultaneous_read=False) + """Test that blocking doesn't occur when block_simultaneous_read=""" "" + stream = self._create_mock_stream("stream1", block_simultaneous_read="") handler = ConcurrentReadProcessor( [stream], @@ -1099,8 +1097,10 @@ def test_collect_parent_streams_multi_level(self): def test_deactivate_parents_when_partition_generation_completes(self): """Test that parent streams are deactivated when partition generation completes""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1139,8 +1139,10 @@ def test_deactivate_parents_when_partition_generation_completes(self): def test_deactivate_only_stream_when_done(self): """Test that only the stream itself is deactivated when done, not parents""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1172,12 +1174,12 @@ def test_deactivate_only_stream_when_done(self): def test_multiple_blocked_streams_retry_in_order(self): """Test that multiple blocked streams are retried in order""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) + parent = self._create_mock_stream("parent", 
block_simultaneous_read="api_group") child1 = self._create_mock_stream_with_parent( - "child1", parent, block_simultaneous_read=True + "child1", parent, block_simultaneous_read="api_group" ) child2 = self._create_mock_stream_with_parent( - "child2", parent, block_simultaneous_read=True + "child2", parent, block_simultaneous_read="api_group" ) handler = ConcurrentReadProcessor( @@ -1206,8 +1208,8 @@ def test_multiple_blocked_streams_retry_in_order(self): def test_child_without_flag_blocked_by_parent_with_flag(self): """Test that a child WITHOUT block_simultaneous_read is blocked by parent WITH the flag""" # Parent has the flag, child does NOT - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=False) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read="") handler = ConcurrentReadProcessor( [parent, child], @@ -1226,7 +1228,7 @@ def test_child_without_flag_blocked_by_parent_with_flag(self): # Try to start child (should be deferred even though child doesn't have the flag) result = handler.start_next_partition_generator() - # Child should be deferred because parent has block_simultaneous_read=True and is active + # Child should be deferred because parent has block_simultaneous_read="api_group" and is active assert result is None # No stream started assert "child" not in handler._active_stream_names # Child should be moved to end of queue (still 1 stream in queue) @@ -1236,8 +1238,10 @@ def test_child_without_flag_blocked_by_parent_with_flag(self): def test_child_with_flag_not_blocked_by_parent_without_flag(self): """Test that a child WITH block_simultaneous_read is NOT blocked by parent WITHOUT the flag""" # Parent does NOT have the flag, child does - parent = self._create_mock_stream("parent", block_simultaneous_read=False) - child = 
self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1261,3 +1265,42 @@ def test_child_with_flag_not_blocked_by_parent_without_flag(self): assert "child" in handler._active_stream_names # Queue should now be empty (both streams started) assert len(handler._stream_instances_to_start_partition_generation) == 0 + + def test_unrelated_streams_in_same_group_block_each_other(self): + """Test that multiple unrelated streams with the same group name block each other""" + # Create three unrelated streams (no parent-child relationship) in the same group + stream1 = self._create_mock_stream("stream1", block_simultaneous_read="shared_endpoint") + stream2 = self._create_mock_stream("stream2", block_simultaneous_read="shared_endpoint") + stream3 = self._create_mock_stream("stream3", block_simultaneous_read="shared_endpoint") + + handler = ConcurrentReadProcessor( + [stream1, stream2, stream3], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start stream1 + result = handler.start_next_partition_generator() + assert result is not None + assert "stream1" in handler._active_stream_names + assert "shared_endpoint" in handler._active_groups + assert "stream1" in handler._active_groups["shared_endpoint"] + + # Try to start stream2 (should be deferred because it's in the same group) + result = handler.start_next_partition_generator() + # stream2 should be deferred, stream3 should also be deferred + # All three are in same group, only stream1 is active + assert result is None # No stream started + + # Both stream2 and stream3 should be in the queue + assert 
len(handler._stream_instances_to_start_partition_generation) == 2 + + # Verify logger was called with deferral message + assert any( + "Deferring stream 'stream2'" in str(call) and "shared_endpoint" in str(call) + for call in self._logger.info.call_args_list + ) From ec8dd6f5a587fd93a020b06dc67dc9f6192d1a78 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 6 Jan 2026 17:59:47 +0200 Subject: [PATCH 03/26] Fix StreamFacade --- airbyte_cdk/sources/streams/concurrent/adapters.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte_cdk/sources/streams/concurrent/adapters.py index 41674bdae..a9bc47e0d 100644 --- a/airbyte_cdk/sources/streams/concurrent/adapters.py +++ b/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -196,6 +196,11 @@ def cursor_field(self) -> Union[str, List[str]]: def cursor(self) -> Optional[Cursor]: # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor return self._cursor + @property + def block_simultaneous_read(self) -> str: + """Returns the blocking group name from the underlying stream""" + return self._abstract_stream.block_simultaneous_read + # FIXME the lru_cache seems to be mostly there because of typing issue @lru_cache(maxsize=None) def get_json_schema(self) -> Mapping[str, Any]: From 450d7cf772b24fa12aa11b78f6ec58feb7e8c80d Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Fri, 9 Jan 2026 18:37:35 +0200 Subject: [PATCH 04/26] Fix NoneType error when all streams are blocked --- .../sources/concurrent_source/concurrent_read_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 1901d0d0d..ec331e05d 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ 
b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -129,7 +129,9 @@ def on_partition_generation_completed( ): yield from self._on_stream_is_done(stream_name) if self._stream_instances_to_start_partition_generation: - yield self.start_next_partition_generator() # type:ignore # None may be yielded + status_message = self.start_next_partition_generator() + if status_message: + yield status_message def on_partition(self, partition: Partition) -> None: """ From 3e7de2f575619160b49d018bb619dabe5d4ea368 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 12 Jan 2026 23:10:50 +0200 Subject: [PATCH 05/26] Fix unit tests --- .../test_concurrent_read_processor.py | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index e76f0576c..7d9c52e26 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1045,7 +1045,7 @@ def test_blocked_stream_added_to_end_of_queue(self): assert handler._stream_instances_to_start_partition_generation[1] == stream1 def test_no_defer_when_flag_false(self): - """Test that blocking doesn't occur when block_simultaneous_read=""" "" + """Test that blocking doesn't occur when block_simultaneous_read="" """ stream = self._create_mock_stream("stream1", block_simultaneous_read="") handler = ConcurrentReadProcessor( @@ -1196,14 +1196,55 @@ def test_multiple_blocked_streams_retry_in_order(self): result = handler.start_next_partition_generator() assert result is not None assert "parent" in handler._active_stream_names + assert "api_group" in handler._active_groups + assert "parent" in handler._active_groups["api_group"] - # Try to start child1 (should be deferred) + # Try to start next stream (child1) - should be deferred because parent is active 
result = handler.start_next_partition_generator() - # child1 is deferred, but child2 might start if it's not blocked - # Let me check the queue state + assert result is None # child1 was deferred - # Both children should be deferred (parent is active) - assert len(handler._stream_instances_to_start_partition_generation) >= 1 + # After first deferral, we should still have 2 streams in queue (child1 moved to end) + assert len(handler._stream_instances_to_start_partition_generation) == 2 + # child1 was moved to the back, so the queue has the other child first + queue_streams = handler._stream_instances_to_start_partition_generation + assert child1 in queue_streams + assert child2 in queue_streams + + # Try to start next stream (child2) - should also be deferred + result = handler.start_next_partition_generator() + assert result is None # child2 was deferred + + # Both streams still in queue, but order may have changed + assert len(handler._stream_instances_to_start_partition_generation) == 2 + + # Verify neither child is active yet (both blocked by parent) + assert "child1" not in handler._active_stream_names + assert "child2" not in handler._active_stream_names + + # Verify deferral was logged for both children + logger_calls = [str(call) for call in self._logger.info.call_args_list] + assert any("Deferring stream 'child1'" in call for call in logger_calls) + assert any("Deferring stream 'child2'" in call for call in logger_calls) + + # Simulate parent completing partition generation (parent has no partitions, so it's done) + handler._streams_currently_generating_partitions.append("parent") + handler._streams_to_running_partitions["parent"] = set() + sentinel = PartitionGenerationCompletedSentinel(parent) + list(handler.on_partition_generation_completed(sentinel)) + + # After parent completes, one of the children should start (whichever was first in queue) + # We know at least one child started because the queue shrunk + assert 
len(handler._stream_instances_to_start_partition_generation) == 1 + + # Verify that exactly one child is now active + children_active = [ + name for name in ["child1", "child2"] + if name in handler._active_stream_names + ] + assert len(children_active) == 1, f"Expected exactly one child active, got: {children_active}" + + # Parent should be re-activated because the active child needs to read from it + assert "parent" in handler._active_stream_names def test_child_without_flag_blocked_by_parent_with_flag(self): """Test that a child WITHOUT block_simultaneous_read is blocked by parent WITH the flag""" From b7fa9a52bb4636c4aa592cda965ba9f6155103b4 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Mon, 12 Jan 2026 21:13:12 +0000 Subject: [PATCH 06/26] Auto-fix lint and format issues --- .../streams/concurrent/test_concurrent_read_processor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 7d9c52e26..d608a823f 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1238,10 +1238,11 @@ def test_multiple_blocked_streams_retry_in_order(self): # Verify that exactly one child is now active children_active = [ - name for name in ["child1", "child2"] - if name in handler._active_stream_names + name for name in ["child1", "child2"] if name in handler._active_stream_names ] - assert len(children_active) == 1, f"Expected exactly one child active, got: {children_active}" + assert len(children_active) == 1, ( + f"Expected exactly one child active, got: {children_active}" + ) # Parent should be re-activated because the active child needs to read from it assert "parent" in handler._active_stream_names From 314ded1dacdcab1b3635a392d8ec9ead00f43edb Mon Sep 17 00:00:00 2001 From: Anatolii 
Yatsuk Date: Tue, 13 Jan 2026 16:53:33 +0200 Subject: [PATCH 07/26] Add retry deferred streams on stream completion --- .../concurrent_read_processor.py | 6 + .../test_concurrent_read_processor.py | 112 ++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index ec331e05d..372430be1 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -159,6 +159,7 @@ def on_partition_complete_sentinel( 1. Close the partition 2. If the stream is done, mark it as such and return a stream status message 3. Emit messages that were added to the message repository + 4. If there are more streams to read from, start the next partition generator """ partition = sentinel.partition @@ -171,6 +172,11 @@ def on_partition_complete_sentinel( and len(partitions_running) == 0 ): yield from self._on_stream_is_done(partition.stream_name()) + # Try to start the next stream in the queue (may be a deferred stream) + if self._stream_instances_to_start_partition_generation: + status_message = self.start_next_partition_generator() + if status_message: + yield status_message yield from self._message_repository.consume_queue() def on_record(self, record: Record) -> Iterable[AirbyteMessage]: diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index d608a823f..9da12fc1c 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1346,3 +1346,115 @@ def test_unrelated_streams_in_same_group_block_each_other(self): "Deferring stream 'stream2'" in str(call) and "shared_endpoint" in str(call) for call in self._logger.info.call_args_list ) + + def 
test_child_starts_after_parent_completes_via_partition_complete_sentinel(self): + """Test that child stream starts after parent completes via on_partition_complete_sentinel""" + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + handler.start_next_partition_generator() + assert "parent" in handler._active_stream_names + + # Try to start child (should be deferred) + result = handler.start_next_partition_generator() + assert result is None + assert "child" not in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + # Create a partition for parent and add it to running partitions + # (parent is already in _streams_currently_generating_partitions from start_next_partition_generator) + mock_partition = Mock(spec=Partition) + mock_partition.stream_name.return_value = "parent" + handler._streams_to_running_partitions["parent"].add(mock_partition) + + # Complete partition generation for parent + sentinel_gen = PartitionGenerationCompletedSentinel(parent) + list(handler.on_partition_generation_completed(sentinel_gen)) + + # Now complete the partition (this triggers stream done) + sentinel_complete = PartitionCompleteSentinel(mock_partition) + messages = list(handler.on_partition_complete_sentinel(sentinel_complete)) + + # Child should have been started automatically + assert "child" in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + # Verify a STARTED message was emitted for child + started_messages = [ + msg + for msg in messages + if msg.type == MessageType.TRACE + and msg.trace.stream_status + 
and msg.trace.stream_status.status == AirbyteStreamStatus.STARTED + ] + assert len(started_messages) == 1 + assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" + + def test_child_starts_after_parent_completes_via_partition_complete_sentinel(self): + """Test that child stream starts after parent completes via on_partition_complete_sentinel""" + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + handler.start_next_partition_generator() + assert "parent" in handler._active_stream_names + + # Try to start child (should be deferred) + result = handler.start_next_partition_generator() + assert result is None + assert "child" not in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + # Create a partition for parent and add it to running partitions + # (parent is already in _streams_currently_generating_partitions from start_next_partition_generator) + mock_partition = Mock(spec=Partition) + mock_partition.stream_name.return_value = "parent" + handler._streams_to_running_partitions["parent"].add(mock_partition) + + # Complete partition generation for parent + sentinel_gen = PartitionGenerationCompletedSentinel(parent) + list(handler.on_partition_generation_completed(sentinel_gen)) + + # Now complete the partition (this triggers stream done) + sentinel_complete = PartitionCompleteSentinel(mock_partition) + messages = list(handler.on_partition_complete_sentinel(sentinel_complete)) + + # Child should have been started automatically + assert "child" in handler._active_stream_names + assert 
len(handler._stream_instances_to_start_partition_generation) == 0 + + # Verify a STARTED message was emitted for child + started_messages = [ + msg + for msg in messages + if msg.type == MessageType.TRACE + and msg.trace.stream_status + and msg.trace.stream_status.status == AirbyteStreamStatus.STARTED + ] + assert len(started_messages) == 1 + assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" From 80d8b2bff53913e6fe23e3bd7fbe2c5313e970f2 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 13 Jan 2026 17:44:32 +0200 Subject: [PATCH 08/26] Fix unit tests --- .../test_concurrent_read_processor.py | 58 +------------------ 1 file changed, 1 insertion(+), 57 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 9da12fc1c..eecf74245 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -945,7 +945,7 @@ def test_defer_stream_when_grandparent_active(self): # Child should be back in the queue assert len(handler._stream_instances_to_start_partition_generation) == 1 - def test_retry_blocked_stream_after_blocker_done(self): + def test_different_groups_do_not_block_each_other(self): """Test that independent streams with different groups don't block each other""" stream1 = self._create_mock_stream("stream1", block_simultaneous_read="group1") stream2 = self._create_mock_stream("stream2", block_simultaneous_read="group2") @@ -1402,59 +1402,3 @@ def test_child_starts_after_parent_completes_via_partition_complete_sentinel(sel ] assert len(started_messages) == 1 assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" - - def test_child_starts_after_parent_completes_via_partition_complete_sentinel(self): - """Test that child stream starts after parent completes via 
on_partition_complete_sentinel""" - parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") - child = self._create_mock_stream_with_parent( - "child", parent, block_simultaneous_read="api_group" - ) - - handler = ConcurrentReadProcessor( - [parent, child], - self._partition_enqueuer, - self._thread_pool_manager, - self._logger, - self._slice_logger, - self._message_repository, - self._partition_reader, - ) - - # Start parent - handler.start_next_partition_generator() - assert "parent" in handler._active_stream_names - - # Try to start child (should be deferred) - result = handler.start_next_partition_generator() - assert result is None - assert "child" not in handler._active_stream_names - assert len(handler._stream_instances_to_start_partition_generation) == 1 - - # Create a partition for parent and add it to running partitions - # (parent is already in _streams_currently_generating_partitions from start_next_partition_generator) - mock_partition = Mock(spec=Partition) - mock_partition.stream_name.return_value = "parent" - handler._streams_to_running_partitions["parent"].add(mock_partition) - - # Complete partition generation for parent - sentinel_gen = PartitionGenerationCompletedSentinel(parent) - list(handler.on_partition_generation_completed(sentinel_gen)) - - # Now complete the partition (this triggers stream done) - sentinel_complete = PartitionCompleteSentinel(mock_partition) - messages = list(handler.on_partition_complete_sentinel(sentinel_complete)) - - # Child should have been started automatically - assert "child" in handler._active_stream_names - assert len(handler._stream_instances_to_start_partition_generation) == 0 - - # Verify a STARTED message was emitted for child - started_messages = [ - msg - for msg in messages - if msg.type == MessageType.TRACE - and msg.trace.stream_status - and msg.trace.stream_status.status == AirbyteStreamStatus.STARTED - ] - assert len(started_messages) == 1 - assert 
started_messages[0].trace.stream_status.stream_descriptor.name == "child" From 8c06ce6f8d2fd10ac27779d5cb93658b3a8044a7 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 13 Jan 2026 18:26:18 +0200 Subject: [PATCH 09/26] More fixes for unit tests --- .../streams/concurrent/test_concurrent_read_processor.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index eecf74245..788478ae6 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -994,10 +994,6 @@ def test_retry_blocked_stream_after_partition_generation(self): handler.start_next_partition_generator() assert "parent" in handler._active_stream_names - # Mark parent as generating partitions and having no partitions - handler._streams_currently_generating_partitions.append("parent") - handler._streams_to_running_partitions["parent"] = set() - # Complete partition generation for parent (parent has no partitions, so it's done) sentinel = PartitionGenerationCompletedSentinel(parent) messages = list(handler.on_partition_generation_completed(sentinel)) @@ -1227,8 +1223,6 @@ def test_multiple_blocked_streams_retry_in_order(self): assert any("Deferring stream 'child2'" in call for call in logger_calls) # Simulate parent completing partition generation (parent has no partitions, so it's done) - handler._streams_currently_generating_partitions.append("parent") - handler._streams_to_running_partitions["parent"] = set() sentinel = PartitionGenerationCompletedSentinel(parent) list(handler.on_partition_generation_completed(sentinel)) From 49b0174908ffe84eefaf4555c7ba5cfbea40fa41 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:48:05 +0000 Subject: [PATCH 10/26] refactor: replace 
per-stream block_simultaneous_read with top-level stream_groups - Remove block_simultaneous_read property from DeclarativeStream schema - Add top-level stream_groups with StreamGroup and BlockSimultaneousSyncsAction - ConcurrentDeclarativeSource parses stream_groups and injects block_simultaneous_read into stream configs before factory processing - Internal blocking logic in ConcurrentReadProcessor unchanged - Update tests for new interface Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 42 ++++++++++- .../declarative_component_schema.yaml | 66 +++++++++++------ .../models/declarative_component_schema.py | 32 +++++++-- .../parsers/model_to_component_factory.py | 2 +- .../test_model_to_component_factory.py | 14 ++-- .../test_concurrent_declarative_source.py | 71 +++++++++++++++++++ 6 files changed, 196 insertions(+), 31 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 45fe6aa2d..cdd86fda9 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -405,6 +405,14 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) + stream_name_to_group = self._build_stream_name_to_group(self._source_config) + + prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) + for stream_config in prepared_configs: + stream_name = stream_config.get("name", "") + if stream_name in stream_name_to_group: + stream_config["block_simultaneous_read"] = stream_name_to_group[stream_name] + source_streams = [ self._constructor.create_component( ( @@ -416,7 +424,7 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i self._config, emit_connector_builder_messages=self._emit_connector_builder_messages, ) - 
for stream_config in self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) + for stream_config in prepared_configs ] return source_streams @@ -526,6 +534,38 @@ def dynamic_streams(self) -> List[Dict[str, Any]]: with_dynamic_stream_name=True, ) + @staticmethod + def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: + """Build a mapping from stream name to group name based on the stream_groups manifest config. + + After manifest reference resolution, each stream reference in stream_groups.streams + is resolved to the full stream definition dict containing a 'name' field. + + Returns: + A dict mapping stream name -> group name for streams that belong to a group. + """ + stream_name_to_group: Dict[str, str] = {} + stream_groups = manifest.get("stream_groups", {}) + if not stream_groups: + return stream_name_to_group + + for group_name, group_config in stream_groups.items(): + streams = group_config.get("streams", []) + for stream_ref in streams: + if isinstance(stream_ref, dict): + # After reference resolution, stream_ref is a full stream definition dict + stream_name = stream_ref.get("name", "") + if stream_name: + stream_name_to_group[stream_name] = group_name + elif isinstance(stream_ref, str): + # If not resolved (shouldn't happen normally), extract name from ref path + # e.g., "#/definitions/my_stream" -> "my_stream" + if stream_ref.startswith("#/definitions/"): + stream_name = stream_ref.split("/")[-1] + stream_name_to_group[stream_name] = group_name + + return stream_name_to_group + def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config stream_configs = [] diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 3fceae9c7..1cf4a7e73 100644 --- 
a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -45,6 +45,15 @@ properties: "$ref": "#/definitions/ConcurrencyLevel" api_budget: "$ref": "#/definitions/HTTPAPIBudget" + stream_groups: + title: Stream Groups + description: > + Groups of streams that share a common resource and should not be read simultaneously. + Each group defines a set of stream references and an action that controls how concurrent + reads are managed. Only applies to ConcurrentDeclarativeSource. + type: object + additionalProperties: + "$ref": "#/definitions/StreamGroup" max_concurrent_async_job_count: title: Maximum Concurrent Asynchronous Jobs description: Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information. @@ -63,6 +72,43 @@ properties: description: A description of the connector. It will be presented on the Source documentation page. additionalProperties: false definitions: + StreamGroup: + title: Stream Group + description: > + A group of streams that share a common resource and should not be read simultaneously. + Streams in the same group will be blocked from concurrent reads based on the specified action. + type: object + required: + - streams + - action + properties: + streams: + title: Streams + description: > + List of references to streams that belong to this group. Use JSON references + to stream definitions (e.g., "#/definitions/my_stream"). + type: array + items: + type: string + action: + title: Action + description: The action to apply to streams in this group. 
+ "$ref": "#/definitions/BlockSimultaneousSyncsAction" + BlockSimultaneousSyncsAction: + title: Block Simultaneous Syncs Action + description: > + Action that prevents streams in the same group from being read concurrently. + When applied to a stream group, streams with this action will be deferred if + another stream in the same group is currently active. + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. + type: object + required: + - type + properties: + type: + type: string + enum: [BlockSimultaneousSyncsAction] AddedFieldDefinition: title: Definition Of Field To Add description: Defines the field to add on a record. @@ -1553,26 +1599,6 @@ definitions: default: "" example: - "Users" - block_simultaneous_read: - title: Block Simultaneous Read - description: > - Optional group name for blocking simultaneous reads. Streams with the same - block_simultaneous_read value will not be read concurrently. This prevents - duplicate API calls when a stream is used as both a standalone stream and a - parent stream, or when multiple streams share the same endpoint/session. - - If set to a non-empty string, the stream will be deferred if: - 1. Another stream in the same group is currently active - 2. Any parent stream is in an active group - - Examples: - - "issues_endpoint" - All streams with this value block each other - - "" or null - No blocking (default) - - This is useful for APIs that don't allow concurrent access to the same - endpoint or session. Only applies to ConcurrentDeclarativeSource. - type: string - default: "" retriever: title: Retriever description: Component used to coordinate how records are extracted across stream slices and request pages. 
diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 675fa1216..fabec77e5 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -2305,6 +2305,23 @@ class Config: ) +class BlockSimultaneousSyncsAction(BaseModel): + type: Literal["BlockSimultaneousSyncsAction"] + + +class StreamGroup(BaseModel): + streams: List[str] = Field( + ..., + description='List of references to streams that belong to this group. Use JSON references to stream definitions (e.g., "#/definitions/my_stream").', + title="Streams", + ) + action: BlockSimultaneousSyncsAction = Field( + ..., + description="The action to apply to streams in this group.", + title="Action", + ) + + class Spec(BaseModel): type: Literal["Spec"] connection_specification: Dict[str, Any] = Field( @@ -2345,6 +2362,11 @@ class Config: spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None api_budget: Optional[HTTPAPIBudget] = None + stream_groups: Optional[Dict[str, StreamGroup]] = Field( + None, + description="Groups of streams that share a common resource and should not be read simultaneously. Each group defines a set of stream references and an action that controls how concurrent reads are managed. Only applies to ConcurrentDeclarativeSource.", + title="Stream Groups", + ) max_concurrent_async_job_count: Optional[Union[int, str]] = Field( None, description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. 
Refer to the API's documentation for this information.", @@ -2380,6 +2402,11 @@ class Config: spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None api_budget: Optional[HTTPAPIBudget] = None + stream_groups: Optional[Dict[str, StreamGroup]] = Field( + None, + description="Groups of streams that share a common resource and should not be read simultaneously. Each group defines a set of stream references and an action that controls how concurrent reads are managed. Only applies to ConcurrentDeclarativeSource.", + title="Stream Groups", + ) max_concurrent_async_job_count: Optional[Union[int, str]] = Field( None, description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.", @@ -2495,11 +2522,6 @@ class Config: type: Literal["DeclarativeStream"] name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") - block_simultaneous_read: Optional[str] = Field( - "", - description='Optional group name for blocking simultaneous reads. Streams with the same block_simultaneous_read value will not be read concurrently. This prevents duplicate API calls when a stream is used as both a standalone stream and a parent stream, or when multiple streams share the same endpoint/session.\nIf set to a non-empty string, the stream will be deferred if: 1. Another stream in the same group is currently active 2. Any parent stream is in an active group\nExamples: - "issues_endpoint" - All streams with this value block each other - "" or null - No blocking (default)\nThis is useful for APIs that don\'t allow concurrent access to the same endpoint or session. 
Only applies to ConcurrentDeclarativeSource.\n', - title="Block Simultaneous Read", - ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages.", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 229654940..5b64e0419 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2118,7 +2118,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=model.block_simultaneous_read or "", + block_simultaneous_read=getattr(model, "block_simultaneous_read", "") or "", ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 88134d29c..7b7698dce 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5214,8 +5214,14 @@ def test_catalog_defined_cursor_field_stream_missing(): assert stream._cursor_field.supports_catalog_defined_cursor_field == True -def test_block_simultaneous_read_from_manifest(): - """Test that block_simultaneous_read flows through from manifest to DefaultStream""" +def test_block_simultaneous_read_from_stream_groups(): + """Test that block_simultaneous_read flows through from stream_groups to DefaultStream. 
+ + The stream_groups config is processed by ConcurrentDeclarativeSource which injects + block_simultaneous_read into individual stream configs before passing them to the factory. + This test verifies that the factory correctly reads block_simultaneous_read from the + extra fields on the stream config dict. + """ content = """ parent_stream: type: DeclarativeStream @@ -5315,7 +5321,7 @@ def test_block_simultaneous_read_from_manifest(): parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Test parent stream with block_simultaneous_read: true + # Test parent stream with block_simultaneous_read injected (as ConcurrentDeclarativeSource would do) parent_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["parent_stream"], {} ) @@ -5327,7 +5333,7 @@ def test_block_simultaneous_read_from_manifest(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream with block_simultaneous_read: "issues_endpoint" + # Test child stream with block_simultaneous_read injected child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["child_stream"], {} ) diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index bde6c35b1..3f10b9eb2 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5150,3 +5150,74 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): with pytest.raises(ValueError): list(source.read(logger=source.logger, config=input_config, catalog=catalog, state=[])) + + +@pytest.mark.parametrize( + "manifest,expected", + [ + pytest.param( + {}, + {}, + id="no_stream_groups", + ), + pytest.param( + {"stream_groups": {}}, + {}, + id="empty_stream_groups", + ), + 
pytest.param( + { + "stream_groups": { + "crm_objects": { + "streams": [ + {"name": "deals", "type": "DeclarativeStream"}, + {"name": "companies", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + }, + {"deals": "crm_objects", "companies": "crm_objects"}, + id="resolved_stream_refs", + ), + pytest.param( + { + "stream_groups": { + "group_a": { + "streams": [ + {"name": "stream1", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + "group_b": { + "streams": [ + {"name": "stream2", "type": "DeclarativeStream"}, + {"name": "stream3", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + } + }, + {"stream1": "group_a", "stream2": "group_b", "stream3": "group_b"}, + id="multiple_groups", + ), + pytest.param( + { + "stream_groups": { + "fallback_group": { + "streams": [ + "#/definitions/my_stream", + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + }, + {"my_stream": "fallback_group"}, + id="unresolved_string_refs_fallback", + ), + ], +) +def test_build_stream_name_to_group(manifest, expected): + """Test _build_stream_name_to_group correctly maps stream names to group names.""" + result = ConcurrentDeclarativeSource._build_stream_name_to_group(manifest) + assert result == expected From 219f7df6d0a743170d74a5245ae7647b066f6157 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 15:23:18 +0000 Subject: [PATCH 11/26] refactor: move stream_name_to_group into ModelToComponentFactory - Add stream_name_to_group parameter to ModelToComponentFactory.__init__() - Add set_stream_name_to_group() method for post-init configuration - Factory now looks up block_simultaneous_read from its own mapping - Remove config injection hack from ConcurrentDeclarativeSource.streams() - Update tests to use factory-based approach instead of extra fields Co-Authored-By: unknown 
<> --- .../concurrent_declarative_source.py | 8 +- .../parsers/model_to_component_factory.py | 8 +- .../test_model_to_component_factory.py | 82 ++++++++++++++++--- 3 files changed, 80 insertions(+), 18 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index cdd86fda9..c0ab9e910 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -405,13 +405,11 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) - stream_name_to_group = self._build_stream_name_to_group(self._source_config) + self._constructor.set_stream_name_to_group( + self._build_stream_name_to_group(self._source_config) + ) prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) - for stream_config in prepared_configs: - stream_name = stream_config.get("name", "") - if stream_name in stream_name_to_group: - stream_config["block_simultaneous_read"] = stream_name_to_group[stream_name] source_streams = [ self._constructor.create_component( diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 5b64e0419..18a7c37f8 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -682,6 +682,7 @@ def __init__( max_concurrent_async_job_count: Optional[int] = None, configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, api_budget: Optional[APIBudget] = None, + stream_name_to_group: Optional[Dict[str, str]] = None, ): self._init_mappings() self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice @@ -698,9 +699,14 @@ def __init__( 
self._connector_state_manager = connector_state_manager or ConnectorStateManager() self._api_budget: Optional[Union[APIBudget]] = api_budget self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) + self._stream_name_to_group: Dict[str, str] = stream_name_to_group or {} # placeholder for deprecation warnings self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] + def set_stream_name_to_group(self, stream_name_to_group: Dict[str, str]) -> None: + """Set the mapping from stream name to group name for block_simultaneous_read.""" + self._stream_name_to_group = stream_name_to_group + def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { AddedFieldDefinitionModel: self.create_added_field_definition, @@ -2118,7 +2124,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=getattr(model, "block_simultaneous_read", "") or "", + block_simultaneous_read=self._stream_name_to_group.get(stream_name, ""), ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 7b7698dce..184983d35 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5215,19 +5215,17 @@ def test_catalog_defined_cursor_field_stream_missing(): def test_block_simultaneous_read_from_stream_groups(): - """Test that block_simultaneous_read flows through from stream_groups to DefaultStream. + """Test that block_simultaneous_read flows through from stream_name_to_group to DefaultStream. 
- The stream_groups config is processed by ConcurrentDeclarativeSource which injects - block_simultaneous_read into individual stream configs before passing them to the factory. - This test verifies that the factory correctly reads block_simultaneous_read from the - extra fields on the stream config dict. + The stream_groups config is parsed by ConcurrentDeclarativeSource into a stream_name_to_group + mapping, which is then set on the ModelToComponentFactory. The factory uses this mapping to + look up the group for each stream it creates. """ content = """ parent_stream: type: DeclarativeStream name: "parent" primary_key: "id" - block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5255,7 +5253,6 @@ def test_block_simultaneous_read_from_stream_groups(): type: DeclarativeStream name: "child" primary_key: "id" - block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5318,14 +5315,19 @@ def test_block_simultaneous_read_from_stream_groups(): config = {"api_key": "test_key"} + # Create a factory with stream_name_to_group mapping (as ConcurrentDeclarativeSource would do) + factory_with_groups = ModelToComponentFactory( + stream_name_to_group={"parent": "issues_endpoint", "child": "issues_endpoint"} + ) + parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Test parent stream with block_simultaneous_read injected (as ConcurrentDeclarativeSource would do) + # Test parent stream gets block_simultaneous_read from the factory's stream_name_to_group parent_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["parent_stream"], {} ) - parent_stream: DefaultStream = factory.create_component( + parent_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config ) @@ -5333,11 +5335,11 @@ def 
test_block_simultaneous_read_from_stream_groups(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream with block_simultaneous_read injected + # Test child stream gets block_simultaneous_read from the factory's stream_name_to_group child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["child_stream"], {} ) - child_stream: DefaultStream = factory.create_component( + child_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config ) @@ -5349,7 +5351,7 @@ def test_block_simultaneous_read_from_stream_groups(): no_block_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["no_block_stream"], {} ) - no_block_stream: DefaultStream = factory.create_component( + no_block_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config ) @@ -5358,6 +5360,62 @@ def test_block_simultaneous_read_from_stream_groups(): assert no_block_stream.block_simultaneous_read == "" +def test_set_stream_name_to_group(): + """Test that set_stream_name_to_group updates the factory's stream_name_to_group mapping.""" + content = """ + test_stream: + type: DeclarativeStream + name: "test" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/test" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + """ + + config = {"api_key": "test_key"} + + # Create factory without stream_name_to_group + test_factory = ModelToComponentFactory() + + parsed_manifest = 
YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + stream_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["test_stream"], {} + ) + + # Without stream_name_to_group, block_simultaneous_read should be empty + stream: DefaultStream = test_factory.create_component( + model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=config + ) + assert stream.block_simultaneous_read == "" + + # After setting stream_name_to_group, block_simultaneous_read should be populated + test_factory.set_stream_name_to_group({"test": "my_group"}) + stream = test_factory.create_component( + model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=config + ) + assert stream.block_simultaneous_read == "my_group" + + def get_schema_loader(stream: DefaultStream): assert isinstance( stream._stream_partition_generator._partition_factory._schema_loader, From 0390f4e8416318ebb2fd9badbbfc76aff2ecc22d Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 16:26:09 +0000 Subject: [PATCH 12/26] refactor: use stream_groups manifest in factory test instead of hardcoded dict - Test now defines stream_groups with references in the manifest YAML - Uses _build_stream_name_to_group() to derive the mapping from manifest - Removed test_set_stream_name_to_group (redundant with the manifest-based test) - Added ConcurrentDeclarativeSource import for _build_stream_name_to_group Co-Authored-By: unknown <> --- .../test_model_to_component_factory.py | 273 +++++++----------- 1 file changed, 111 insertions(+), 162 deletions(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 184983d35..93b58c36d 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ 
b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -45,6 +45,9 @@ from airbyte_cdk.sources.declarative.auth.token_provider import SessionTokenProvider from airbyte_cdk.sources.declarative.checks import CheckStream from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime from airbyte_cdk.sources.declarative.decoders import JsonDecoder, PaginationDecoderDecorator from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordFilter, RecordSelector @@ -5215,117 +5218,119 @@ def test_catalog_defined_cursor_field_stream_missing(): def test_block_simultaneous_read_from_stream_groups(): - """Test that block_simultaneous_read flows through from stream_name_to_group to DefaultStream. - - The stream_groups config is parsed by ConcurrentDeclarativeSource into a stream_name_to_group - mapping, which is then set on the ModelToComponentFactory. The factory uses this mapping to - look up the group for each stream it creates. 
- """ + """Test that stream_groups in the manifest flow through to DefaultStream.block_simultaneous_read.""" content = """ - parent_stream: - type: DeclarativeStream - name: "parent" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/parent" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - - child_stream: - type: DeclarativeStream - name: "child" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/child" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - partition_router: - type: SubstreamPartitionRouter - parent_stream_configs: - - type: ParentStreamConfig - stream: "#/parent_stream" - parent_key: "id" - partition_field: "parent_id" - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - parent_id: - type: string - - no_block_stream: - type: DeclarativeStream - name: "no_block" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/no_block" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string + definitions: + parent_stream: + type: DeclarativeStream + name: "parent" + primary_key: "id" + retriever: + type: 
SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/parent" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + + child_stream: + type: DeclarativeStream + name: "child" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/child" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + partition_router: + type: SubstreamPartitionRouter + parent_stream_configs: + - type: ParentStreamConfig + stream: "#/definitions/parent_stream" + parent_key: "id" + partition_field: "parent_id" + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + parent_id: + type: string + + no_block_stream: + type: DeclarativeStream + name: "no_block" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/no_block" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + + stream_groups: + issues_endpoint: + streams: + - "#/definitions/parent_stream" + - "#/definitions/child_stream" + action: BlockSimultaneousSyncsAction """ config = {"api_key": "test_key"} - # Create a factory with stream_name_to_group mapping (as ConcurrentDeclarativeSource would do) - factory_with_groups = 
ModelToComponentFactory( - stream_name_to_group={"parent": "issues_endpoint", "child": "issues_endpoint"} - ) - parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Test parent stream gets block_simultaneous_read from the factory's stream_name_to_group + # Build stream_name_to_group from the manifest's stream_groups (as ConcurrentDeclarativeSource does) + stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group(resolved_manifest) + factory_with_groups = ModelToComponentFactory(stream_name_to_group=stream_name_to_group) + + # Test parent stream gets block_simultaneous_read from stream_groups parent_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["parent_stream"], {} + "", resolved_manifest["definitions"]["parent_stream"], {} ) parent_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config @@ -5335,9 +5340,9 @@ def test_block_simultaneous_read_from_stream_groups(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream gets block_simultaneous_read from the factory's stream_name_to_group + # Test child stream gets block_simultaneous_read from stream_groups child_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["child_stream"], {} + "", resolved_manifest["definitions"]["child_stream"], {} ) child_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config @@ -5347,9 +5352,9 @@ def test_block_simultaneous_read_from_stream_groups(): assert child_stream.name == "child" assert child_stream.block_simultaneous_read == "issues_endpoint" - # Test stream without block_simultaneous_read (should default to empty string) + # Test stream not in any group defaults to empty string 
no_block_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["no_block_stream"], {} + "", resolved_manifest["definitions"]["no_block_stream"], {} ) no_block_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config @@ -5360,62 +5365,6 @@ def test_block_simultaneous_read_from_stream_groups(): assert no_block_stream.block_simultaneous_read == "" -def test_set_stream_name_to_group(): - """Test that set_stream_name_to_group updates the factory's stream_name_to_group mapping.""" - content = """ - test_stream: - type: DeclarativeStream - name: "test" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/test" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - """ - - config = {"api_key": "test_key"} - - # Create factory without stream_name_to_group - test_factory = ModelToComponentFactory() - - parsed_manifest = YamlDeclarativeSource._parse(content) - resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - stream_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["test_stream"], {} - ) - - # Without stream_name_to_group, block_simultaneous_read should be empty - stream: DefaultStream = test_factory.create_component( - model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=config - ) - assert stream.block_simultaneous_read == "" - - # After setting stream_name_to_group, block_simultaneous_read should be populated - test_factory.set_stream_name_to_group({"test": "my_group"}) - stream = test_factory.create_component( - model_type=DeclarativeStreamModel, 
component_definition=stream_manifest, config=config - ) - assert stream.block_simultaneous_read == "my_group" - - def get_schema_loader(stream: DefaultStream): assert isinstance( stream._stream_partition_generator._partition_factory._schema_loader, From cd55bfd59f29f0200f8f9f3b508243159ad7fd97 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:09:17 +0000 Subject: [PATCH 13/26] fix: only include parent stream in stream_groups to avoid deadlock Child streams that depend on parent streams should not be in the same group, as this would cause a deadlock (child needs to read parent). Co-Authored-By: unknown <> --- .../declarative/parsers/test_model_to_component_factory.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 93b58c36d..c629d43a6 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5315,7 +5315,6 @@ def test_block_simultaneous_read_from_stream_groups(): issues_endpoint: streams: - "#/definitions/parent_stream" - - "#/definitions/child_stream" action: BlockSimultaneousSyncsAction """ @@ -5340,7 +5339,7 @@ def test_block_simultaneous_read_from_stream_groups(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream gets block_simultaneous_read from stream_groups + # Test child stream is NOT in the group (to avoid deadlock with parent) child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["definitions"]["child_stream"], {} ) @@ -5350,7 +5349,7 @@ def test_block_simultaneous_read_from_stream_groups(): assert isinstance(child_stream, DefaultStream) assert child_stream.name == "child" - assert 
child_stream.block_simultaneous_read == "issues_endpoint" + assert child_stream.block_simultaneous_read == "" # Test stream not in any group defaults to empty string no_block_manifest = transformer.propagate_types_and_parameters( From 5066ec7f7e850aee8eccf069f070e997e0dd77da Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:20:54 +0000 Subject: [PATCH 14/26] style: fix ruff format for long line Co-Authored-By: unknown <> --- .../declarative/parsers/test_model_to_component_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index c629d43a6..79bce00c5 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5324,7 +5324,9 @@ def test_block_simultaneous_read_from_stream_groups(): resolved_manifest = resolver.preprocess_manifest(parsed_manifest) # Build stream_name_to_group from the manifest's stream_groups (as ConcurrentDeclarativeSource does) - stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group(resolved_manifest) + stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group( + resolved_manifest + ) factory_with_groups = ModelToComponentFactory(stream_name_to_group=stream_name_to_group) # Test parent stream gets block_simultaneous_read from stream_groups From 61562c4263aff1460a9bb64192fdf10743bee52f Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 27 Feb 2026 15:07:31 +0000 Subject: [PATCH 15/26] refactor: move _build_stream_name_to_group into ModelToComponentFactory - Factory now owns the stream_groups resolution via set_stream_groups(manifest) - ConcurrentDeclarativeSource just calls 
factory.set_stream_groups(manifest) - Removed _build_stream_name_to_group from ConcurrentDeclarativeSource - Updated tests to use factory's _build_stream_name_to_group directly Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 36 +-------------- .../parsers/model_to_component_factory.py | 45 ++++++++++++++++--- .../test_model_to_component_factory.py | 11 ++--- .../test_concurrent_declarative_source.py | 5 ++- 4 files changed, 48 insertions(+), 49 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index c0ab9e910..2eeee093e 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -405,9 +405,7 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) - self._constructor.set_stream_name_to_group( - self._build_stream_name_to_group(self._source_config) - ) + self._constructor.set_stream_groups(self._source_config) prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) @@ -532,38 +530,6 @@ def dynamic_streams(self) -> List[Dict[str, Any]]: with_dynamic_stream_name=True, ) - @staticmethod - def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: - """Build a mapping from stream name to group name based on the stream_groups manifest config. - - After manifest reference resolution, each stream reference in stream_groups.streams - is resolved to the full stream definition dict containing a 'name' field. - - Returns: - A dict mapping stream name -> group name for streams that belong to a group. 
- """ - stream_name_to_group: Dict[str, str] = {} - stream_groups = manifest.get("stream_groups", {}) - if not stream_groups: - return stream_name_to_group - - for group_name, group_config in stream_groups.items(): - streams = group_config.get("streams", []) - for stream_ref in streams: - if isinstance(stream_ref, dict): - # After reference resolution, stream_ref is a full stream definition dict - stream_name = stream_ref.get("name", "") - if stream_name: - stream_name_to_group[stream_name] = group_name - elif isinstance(stream_ref, str): - # If not resolved (shouldn't happen normally), extract name from ref path - # e.g., "#/definitions/my_stream" -> "my_stream" - if stream_ref.startswith("#/definitions/"): - stream_name = stream_ref.split("/")[-1] - stream_name_to_group[stream_name] = group_name - - return stream_name_to_group - def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config stream_configs = [] diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 18a7c37f8..4241d0125 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -682,7 +682,6 @@ def __init__( max_concurrent_async_job_count: Optional[int] = None, configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, api_budget: Optional[APIBudget] = None, - stream_name_to_group: Optional[Dict[str, str]] = None, ): self._init_mappings() self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice @@ -699,13 +698,49 @@ def __init__( self._connector_state_manager = connector_state_manager or ConnectorStateManager() self._api_budget: Optional[Union[APIBudget]] = api_budget self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) - 
self._stream_name_to_group: Dict[str, str] = stream_name_to_group or {} + self._stream_name_to_group: Dict[str, str] = {} # placeholder for deprecation warnings self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] - def set_stream_name_to_group(self, stream_name_to_group: Dict[str, str]) -> None: - """Set the mapping from stream name to group name for block_simultaneous_read.""" - self._stream_name_to_group = stream_name_to_group + def set_stream_groups(self, manifest: Mapping[str, Any]) -> None: + """Build and set the stream-name-to-group mapping from the manifest's stream_groups config. + + After manifest reference resolution, each stream reference in stream_groups.streams + is resolved to the full stream definition dict containing a 'name' field. + """ + self._stream_name_to_group = self._build_stream_name_to_group(manifest) + + @staticmethod + def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: + """Build a mapping from stream name to group name based on the stream_groups manifest config. + + After manifest reference resolution, each stream reference in stream_groups.streams + is resolved to the full stream definition dict containing a 'name' field. + + Returns: + A dict mapping stream name -> group name for streams that belong to a group. 
+ """ + stream_name_to_group: Dict[str, str] = {} + stream_groups = manifest.get("stream_groups", {}) + if not stream_groups: + return stream_name_to_group + + for group_name, group_config in stream_groups.items(): + streams = group_config.get("streams", []) + for stream_ref in streams: + if isinstance(stream_ref, dict): + # After reference resolution, stream_ref is a full stream definition dict + stream_name = stream_ref.get("name", "") + if stream_name: + stream_name_to_group[stream_name] = group_name + elif isinstance(stream_ref, str): + # If not resolved (shouldn't happen normally), extract name from ref path + # e.g., "#/definitions/my_stream" -> "my_stream" + if stream_ref.startswith("#/definitions/"): + stream_name = stream_ref.split("/")[-1] + stream_name_to_group[stream_name] = group_name + + return stream_name_to_group def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 79bce00c5..7071ffe77 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -45,9 +45,6 @@ from airbyte_cdk.sources.declarative.auth.token_provider import SessionTokenProvider from airbyte_cdk.sources.declarative.checks import CheckStream from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel -from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( - ConcurrentDeclarativeSource, -) from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime from airbyte_cdk.sources.declarative.decoders import JsonDecoder, PaginationDecoderDecorator from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordFilter, RecordSelector @@ -5323,11 +5320,9 @@ def 
test_block_simultaneous_read_from_stream_groups(): parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Build stream_name_to_group from the manifest's stream_groups (as ConcurrentDeclarativeSource does) - stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group( - resolved_manifest - ) - factory_with_groups = ModelToComponentFactory(stream_name_to_group=stream_name_to_group) + # Use the factory's set_stream_groups to resolve stream_groups from the manifest + factory_with_groups = ModelToComponentFactory() + factory_with_groups.set_stream_groups(resolved_manifest) # Test parent stream gets block_simultaneous_read from stream_groups parent_manifest = transformer.propagate_types_and_parameters( diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 3f10b9eb2..7676f8e2d 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -56,6 +56,9 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import ( ClientSideIncrementalRecordFilterDecorator, ) +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( + ModelToComponentFactory, +) from airbyte_cdk.sources.declarative.partition_routers import AsyncJobPartitionRouter from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( @@ -5219,5 +5222,5 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): ) def test_build_stream_name_to_group(manifest, expected): """Test _build_stream_name_to_group correctly maps stream names to group names.""" - result = ConcurrentDeclarativeSource._build_stream_name_to_group(manifest) + result = 
ModelToComponentFactory._build_stream_name_to_group(manifest) assert result == expected From ed82738f069ac7bd82b25e7e22402d8d97e9d30e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:56:59 +0000 Subject: [PATCH 16/26] refactor: resolve stream_groups from actual stream instances instead of factory - Removed _build_stream_name_to_group, set_stream_groups, _stream_name_to_group from factory - Factory no longer knows about stream_groups at all - Added _apply_stream_groups to ConcurrentDeclarativeSource: creates streams first, then sets block_simultaneous_read on matching DefaultStream instances - Added block_simultaneous_read setter on DefaultStream - Replaced mock-based tests with parametrized tests using real DefaultStream instances Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 30 ++++- .../parsers/model_to_component_factory.py | 42 ------- .../streams/concurrent/default_stream.py | 4 + .../test_model_to_component_factory.py | 109 ++---------------- .../test_concurrent_declarative_source.py | 69 ++++++----- 5 files changed, 81 insertions(+), 173 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 2eeee093e..fae94c7b7 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -82,6 +82,7 @@ from airbyte_cdk.sources.message.concurrent_repository import ConcurrentMessageRepository from airbyte_cdk.sources.message.repository import InMemoryMessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem from airbyte_cdk.sources.utils.slice_logger import ( AlwaysLogSliceLogger, 
@@ -405,8 +406,6 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) - self._constructor.set_stream_groups(self._source_config) - prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) source_streams = [ @@ -422,8 +421,35 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i ) for stream_config in prepared_configs ] + + self._apply_stream_groups(source_streams) + return source_streams + def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: + """Set block_simultaneous_read on streams based on the manifest's stream_groups config. + + Iterates over the resolved manifest's stream_groups and matches group membership + against actual created stream instances by name. + """ + stream_groups = self._source_config.get("stream_groups", {}) + if not stream_groups: + return + + # Build stream_name -> group_name mapping from the resolved manifest + stream_name_to_group: Dict[str, str] = {} + for group_name, group_config in stream_groups.items(): + for stream_ref in group_config.get("streams", []): + if isinstance(stream_ref, dict): + stream_name = stream_ref.get("name", "") + if stream_name: + stream_name_to_group[stream_name] = group_name + + # Apply group to matching stream instances + for stream in streams: + if isinstance(stream, DefaultStream) and stream.name in stream_name_to_group: + stream.block_simultaneous_read = stream_name_to_group[stream.name] + @staticmethod def _initialize_cache_for_parent_streams( stream_configs: List[Dict[str, Any]], diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 4241d0125..3a772b691 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ 
-698,50 +698,9 @@ def __init__( self._connector_state_manager = connector_state_manager or ConnectorStateManager() self._api_budget: Optional[Union[APIBudget]] = api_budget self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) - self._stream_name_to_group: Dict[str, str] = {} # placeholder for deprecation warnings self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] - def set_stream_groups(self, manifest: Mapping[str, Any]) -> None: - """Build and set the stream-name-to-group mapping from the manifest's stream_groups config. - - After manifest reference resolution, each stream reference in stream_groups.streams - is resolved to the full stream definition dict containing a 'name' field. - """ - self._stream_name_to_group = self._build_stream_name_to_group(manifest) - - @staticmethod - def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: - """Build a mapping from stream name to group name based on the stream_groups manifest config. - - After manifest reference resolution, each stream reference in stream_groups.streams - is resolved to the full stream definition dict containing a 'name' field. - - Returns: - A dict mapping stream name -> group name for streams that belong to a group. 
- """ - stream_name_to_group: Dict[str, str] = {} - stream_groups = manifest.get("stream_groups", {}) - if not stream_groups: - return stream_name_to_group - - for group_name, group_config in stream_groups.items(): - streams = group_config.get("streams", []) - for stream_ref in streams: - if isinstance(stream_ref, dict): - # After reference resolution, stream_ref is a full stream definition dict - stream_name = stream_ref.get("name", "") - if stream_name: - stream_name_to_group[stream_name] = group_name - elif isinstance(stream_ref, str): - # If not resolved (shouldn't happen normally), extract name from ref path - # e.g., "#/definitions/my_stream" -> "my_stream" - if stream_ref.startswith("#/definitions/"): - stream_name = stream_ref.split("/")[-1] - stream_name_to_group[stream_name] = group_name - - return stream_name_to_group - def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { AddedFieldDefinitionModel: self.create_added_field_definition, @@ -2159,7 +2118,6 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=self._stream_name_to_group.get(stream_name, ""), ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 5ca11eaf8..d679fe0ee 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -101,6 +101,10 @@ def block_simultaneous_read(self) -> str: """Returns the blocking group name for this stream, or empty string if no blocking""" return self._block_simultaneous_read + @block_simultaneous_read.setter + def block_simultaneous_read(self, value: str) -> None: + self._block_simultaneous_read = 
value + def check_availability(self) -> StreamAvailability: """ Check stream availability by attempting to read the first record of the stream. diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 7071ffe77..7b7763cdb 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5215,7 +5215,12 @@ def test_catalog_defined_cursor_field_stream_missing(): def test_block_simultaneous_read_from_stream_groups(): - """Test that stream_groups in the manifest flow through to DefaultStream.block_simultaneous_read.""" + """Test that factory-created streams default to empty block_simultaneous_read. + + The factory no longer handles stream_groups — that's done by + ConcurrentDeclarativeSource._apply_stream_groups after stream creation. + This test verifies the factory creates streams without group info. 
+ """ content = """ definitions: parent_stream: @@ -5244,75 +5249,6 @@ def test_block_simultaneous_read_from_stream_groups(): properties: id: type: string - - child_stream: - type: DeclarativeStream - name: "child" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/child" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - partition_router: - type: SubstreamPartitionRouter - parent_stream_configs: - - type: ParentStreamConfig - stream: "#/definitions/parent_stream" - parent_key: "id" - partition_field: "parent_id" - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - parent_id: - type: string - - no_block_stream: - type: DeclarativeStream - name: "no_block" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/no_block" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - - stream_groups: - issues_endpoint: - streams: - - "#/definitions/parent_stream" - action: BlockSimultaneousSyncsAction """ config = {"api_key": "test_key"} @@ -5320,45 +5256,18 @@ def test_block_simultaneous_read_from_stream_groups(): parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Use the factory's set_stream_groups to resolve stream_groups from the manifest - factory_with_groups = ModelToComponentFactory() - factory_with_groups.set_stream_groups(resolved_manifest) + factory = 
ModelToComponentFactory() - # Test parent stream gets block_simultaneous_read from stream_groups parent_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["definitions"]["parent_stream"], {} ) - parent_stream: DefaultStream = factory_with_groups.create_component( + parent_stream: DefaultStream = factory.create_component( model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config ) assert isinstance(parent_stream, DefaultStream) assert parent_stream.name == "parent" - assert parent_stream.block_simultaneous_read == "issues_endpoint" - - # Test child stream is NOT in the group (to avoid deadlock with parent) - child_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["definitions"]["child_stream"], {} - ) - child_stream: DefaultStream = factory_with_groups.create_component( - model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config - ) - - assert isinstance(child_stream, DefaultStream) - assert child_stream.name == "child" - assert child_stream.block_simultaneous_read == "" - - # Test stream not in any group defaults to empty string - no_block_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["definitions"]["no_block_stream"], {} - ) - no_block_stream: DefaultStream = factory_with_groups.create_component( - model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config - ) - - assert isinstance(no_block_stream, DefaultStream) - assert no_block_stream.name == "no_block" - assert no_block_stream.block_simultaneous_read == "" + assert parent_stream.block_simultaneous_read == "" def get_schema_loader(stream: DefaultStream): diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 7676f8e2d..0e07b20f0 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ 
b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -56,14 +56,12 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import ( ClientSideIncrementalRecordFilterDecorator, ) -from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( - ModelToComponentFactory, -) from airbyte_cdk.sources.declarative.partition_routers import AsyncJobPartitionRouter from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( StreamSlicerPartitionGenerator, ) +from airbyte_cdk.sources.message.repository import InMemoryMessageRepository from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.checkpoint import Cursor from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor @@ -5155,17 +5153,37 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): list(source.read(logger=source.logger, config=input_config, catalog=catalog, state=[])) +def _make_default_stream(name: str) -> DefaultStream: + """Create a minimal DefaultStream instance for testing.""" + from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor + + cursor = FinalStateCursor( + stream_name=name, stream_namespace=None, message_repository=InMemoryMessageRepository() + ) + return DefaultStream( + partition_generator=Mock(), + name=name, + json_schema={}, + primary_key=[], + cursor_field=None, + logger=logging.getLogger(f"test.{name}"), + cursor=cursor, + ) + + @pytest.mark.parametrize( - "manifest,expected", + "source_config,stream_names,expected_groups", [ pytest.param( {}, - {}, + ["my_stream"], + {"my_stream": ""}, id="no_stream_groups", ), pytest.param( {"stream_groups": {}}, - {}, + ["my_stream"], + {"my_stream": ""}, id="empty_stream_groups", ), pytest.param( @@ -5180,16 +5198,15 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): } } }, - 
{"deals": "crm_objects", "companies": "crm_objects"}, - id="resolved_stream_refs", + ["deals", "companies", "no_group"], + {"deals": "crm_objects", "companies": "crm_objects", "no_group": ""}, + id="single_group_with_unmatched_stream", ), pytest.param( { "stream_groups": { "group_a": { - "streams": [ - {"name": "stream1", "type": "DeclarativeStream"}, - ], + "streams": [{"name": "stream1", "type": "DeclarativeStream"}], "action": {"type": "BlockSimultaneousSyncsAction"}, }, "group_b": { @@ -5201,26 +5218,20 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): }, } }, + ["stream1", "stream2", "stream3"], {"stream1": "group_a", "stream2": "group_b", "stream3": "group_b"}, id="multiple_groups", ), - pytest.param( - { - "stream_groups": { - "fallback_group": { - "streams": [ - "#/definitions/my_stream", - ], - "action": {"type": "BlockSimultaneousSyncsAction"}, - } - } - }, - {"my_stream": "fallback_group"}, - id="unresolved_string_refs_fallback", - ), ], ) -def test_build_stream_name_to_group(manifest, expected): - """Test _build_stream_name_to_group correctly maps stream names to group names.""" - result = ModelToComponentFactory._build_stream_name_to_group(manifest) - assert result == expected +def test_apply_stream_groups(source_config, stream_names, expected_groups): + """Test _apply_stream_groups sets block_simultaneous_read on matching stream instances.""" + streams = [_make_default_stream(name) for name in stream_names] + + source = Mock() + source._source_config = source_config + + ConcurrentDeclarativeSource._apply_stream_groups(source, streams) + + for stream in streams: + assert stream.block_simultaneous_read == expected_groups[stream.name] From 7ba206f6a5e75113d572f0e3946613b04205d63c Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 4 Mar 2026 18:40:21 +0200 Subject: [PATCH 17/26] Fix stream format in schema --- .../sources/concurrent_source/concurrent_read_processor.py | 4 ++-- 
.../sources/declarative/declarative_component_schema.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 372430be1..5be8afaa3 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -258,7 +258,8 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: stream_name = stream.name stream_group = self._stream_block_simultaneous_read.get(stream_name, "") - # Check if this stream has a blocking group and is already active + # Check if this stream has a blocking group and is already active as parent stream + # (i.e. being read from during partition generation for another stream) if stream_group and stream_name in self._active_stream_names: # Add back to the END of the queue for retry later self._stream_instances_to_start_partition_generation.append(stream) @@ -316,7 +317,6 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: self._logger.debug(f"Added '{stream_name}' to active group '{stream_group}'") # Also mark all parent streams as active (they will be read from during partition generation) - parent_streams = self._collect_all_parent_stream_names(stream_name) for parent_stream_name in parent_streams: parent_group = self._stream_block_simultaneous_read.get(parent_stream_name, "") if parent_group: diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 1cf4a7e73..1ef8b5fcf 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -85,11 +85,11 @@ definitions: streams: title: Streams description: > - List of references to streams that belong to this group. 
Use JSON references - to stream definitions (e.g., "#/definitions/my_stream"). + List of references to streams that belong to this group. type: array items: - type: string + anyOf: + - "$ref": "#/definitions/DeclarativeStream" action: title: Action description: The action to apply to streams in this group. From d09ee9b6143c511e7b1627e4fbf93ccb004974c0 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 16:44:09 +0000 Subject: [PATCH 18/26] refactor: add get_partition_router() helper to DefaultStream Replace hasattr chain in ConcurrentReadProcessor._collect_all_parent_stream_names with DefaultStream.get_partition_router() that safely traverses the internal partition_generator -> stream_slicer -> partition_router chain using isinstance checks. Co-Authored-By: unknown <> --- .../concurrent_read_processor.py | 38 +++++++------------ .../streams/concurrent/default_stream.py | 21 +++++++++- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 5be8afaa3..69bbf8af2 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -371,42 +371,30 @@ def _is_stream_done(self, stream_name: str) -> bool: return stream_name in self._streams_done def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: - """ - Recursively collect all parent stream names for a given stream. - For example, if we have: epics -> issues -> comments - Then for comments, this returns {issues, epics} + """Recursively collect all parent stream names for a given stream. 
- :param stream_name: The stream to collect parents for - :return: Set of all parent stream names (recursively) + For example, if we have: epics -> issues -> comments + Then for comments, this returns {issues, epics}. """ + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, + ) + from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream + parent_names: Set[str] = set() stream = self._stream_name_to_instance.get(stream_name) if not stream: return parent_names - # Get partition router if it exists (this is where parent streams are defined) - partition_router = None - - # Try DefaultStream path first (_stream_partition_generator._stream_slicer._partition_router) - if ( - hasattr(stream, "_stream_partition_generator") - and hasattr(stream._stream_partition_generator, "_stream_slicer") - and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router") - ): - partition_router = stream._stream_partition_generator._stream_slicer._partition_router - # Fallback to legacy path (retriever.partition_router) for backward compatibility and test mocks - elif hasattr(stream, "retriever") and hasattr(stream.retriever, "partition_router"): - partition_router = stream.retriever.partition_router + partition_router = ( + stream.get_partition_router() if isinstance(stream, DefaultStream) else None + ) - # SubstreamPartitionRouter has parent_stream_configs - if partition_router and hasattr(partition_router, "parent_stream_configs"): + if isinstance(partition_router, SubstreamPartitionRouter): for parent_config in partition_router.parent_stream_configs: - parent_stream = parent_config.stream - parent_name = parent_stream.name + parent_name = parent_config.stream.name parent_names.add(parent_name) - - # Recursively collect grandparents, great-grandparents, etc. 
parent_names.update(self._collect_all_parent_stream_names(parent_name)) return parent_names diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index d679fe0ee..4c7267315 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -3,7 +3,10 @@ # from logging import Logger -from typing import Any, Callable, Iterable, List, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Mapping, Optional, Union + +if TYPE_CHECKING: + from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream @@ -105,6 +108,22 @@ def block_simultaneous_read(self) -> str: def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value + def get_partition_router(self) -> "PartitionRouter | None": + """Return the partition router for this stream, or None if not available.""" + from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentPerPartitionCursor, + ) + from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + StreamSlicerPartitionGenerator, + ) + + if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): + return None + stream_slicer = self._stream_partition_generator._stream_slicer + if not isinstance(stream_slicer, ConcurrentPerPartitionCursor): + return None + return stream_slicer._partition_router + def check_availability(self) -> StreamAvailability: """ Check stream availability by attempting to read the first record of the stream. 
From 94c4b82dfe61bcddf875211db829d6bb79ebddf2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 16:52:33 +0000 Subject: [PATCH 19/26] feat: validate no parent-child streams share a group to prevent deadlock _apply_stream_groups now checks that no stream shares a group with any of its parent streams (via get_partition_router). Raises ValueError at config time if a deadlock-causing configuration is detected. Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 24 +++- .../test_concurrent_declarative_source.py | 116 ++++++++++++++++++ 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index fae94c7b7..15fa40fe4 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -430,8 +430,13 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: """Set block_simultaneous_read on streams based on the manifest's stream_groups config. Iterates over the resolved manifest's stream_groups and matches group membership - against actual created stream instances by name. + against actual created stream instances by name. Validates that no stream shares a + group with any of its parent streams, which would cause a deadlock. 
""" + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, + ) + stream_groups = self._source_config.get("stream_groups", {}) if not stream_groups: return @@ -445,6 +450,23 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: if stream_name: stream_name_to_group[stream_name] = group_name + # Validate no stream shares a group with its parent streams + for stream in streams: + if not isinstance(stream, DefaultStream) or stream.name not in stream_name_to_group: + continue + partition_router = stream.get_partition_router() + if not isinstance(partition_router, SubstreamPartitionRouter): + continue + group_name = stream_name_to_group[stream.name] + for parent_config in partition_router.parent_stream_configs: + parent_name = parent_config.stream.name + if stream_name_to_group.get(parent_name) == group_name: + raise ValueError( + f"Stream '{stream.name}' and its parent stream '{parent_name}' " + f"are both in group '{group_name}'. " + f"A child stream must not share a group with its parent to avoid deadlock." 
+ ) + # Apply group to matching stream instances for stream in streams: if isinstance(stream, DefaultStream) and stream.name in stream_name_to_group: diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 0e07b20f0..f1b4143a3 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5171,6 +5171,75 @@ def _make_default_stream(name: str) -> DefaultStream: ) +def _make_child_stream_with_parent(child_name: str, parent_stream: DefaultStream) -> DefaultStream: + """Create a DefaultStream that has a SubstreamPartitionRouter pointing to parent_stream.""" + from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentCursorFactory, + ConcurrentPerPartitionCursor, + ) + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + ParentStreamConfig, + SubstreamPartitionRouter, + ) + from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartitionFactory, + StreamSlicerPartitionGenerator, + ) + from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor + from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + EpochValueConcurrentStreamStateConverter, + ) + + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=parent_stream, + parent_key="id", + partition_field="parent_id", + config={}, + parameters={}, + ) + ], + config={}, + parameters={}, + ) + + cursor_factory = ConcurrentCursorFactory(lambda *args, **kwargs: Mock()) + message_repository = InMemoryMessageRepository() + state_converter = EpochValueConcurrentStreamStateConverter() + + per_partition_cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory, + 
partition_router=partition_router, + stream_name=child_name, + stream_namespace=None, + stream_state={}, + message_repository=message_repository, + connector_state_manager=Mock(), + connector_state_converter=state_converter, + cursor_field=Mock(cursor_field_key="updated_at"), + ) + + partition_factory = Mock(spec=DeclarativePartitionFactory) + partition_generator = StreamSlicerPartitionGenerator( + partition_factory=partition_factory, + stream_slicer=per_partition_cursor, + ) + + cursor = FinalStateCursor( + stream_name=child_name, stream_namespace=None, message_repository=message_repository + ) + return DefaultStream( + partition_generator=partition_generator, + name=child_name, + json_schema={}, + primary_key=[], + cursor_field=None, + logger=logging.getLogger(f"test.{child_name}"), + cursor=cursor, + ) + + @pytest.mark.parametrize( "source_config,stream_names,expected_groups", [ @@ -5235,3 +5304,50 @@ def test_apply_stream_groups(source_config, stream_names, expected_groups): for stream in streams: assert stream.block_simultaneous_read == expected_groups[stream.name] + + +def test_apply_stream_groups_raises_on_parent_child_in_same_group(): + """Test _apply_stream_groups raises ValueError when a child and its parent are in the same group.""" + parent = _make_default_stream("parent_stream") + child = _make_child_stream_with_parent("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "my_group": { + "streams": [ + {"name": "parent_stream", "type": "DeclarativeStream"}, + {"name": "child_stream", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + } + + with pytest.raises(ValueError, match="child stream must not share a group with its parent"): + ConcurrentDeclarativeSource._apply_stream_groups(source, [parent, child]) + + +def test_apply_stream_groups_allows_parent_child_in_different_groups(): + """Test _apply_stream_groups allows a child and its parent in different groups.""" 
+ parent = _make_default_stream("parent_stream") + child = _make_child_stream_with_parent("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "group_a": { + "streams": [{"name": "parent_stream", "type": "DeclarativeStream"}], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + "group_b": { + "streams": [{"name": "child_stream", "type": "DeclarativeStream"}], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + } + } + + ConcurrentDeclarativeSource._apply_stream_groups(source, [parent, child]) + + assert parent.block_simultaneous_read == "group_a" + assert child.block_simultaneous_read == "group_b" From 0874f122649257ffcafab43b9b4ce00756478dde Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:09:13 +0000 Subject: [PATCH 20/26] feat: assert partition generation queue is empty when all streams are done Adds a safety check in is_done() that raises AirbyteTracedException (system_error) if streams remain in the partition generation queue after all streams are marked done. Also moves inline imports to module level and updates test mocks to use DefaultStream with get_partition_router(). 
Co-Authored-By: unknown <> --- .../concurrent_read_processor.py | 18 ++++-- .../test_concurrent_read_processor.py | 60 ++++++++++++++++--- 2 files changed, 65 insertions(+), 13 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 69bbf8af2..9a8f7d942 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -13,8 +13,12 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -354,6 +358,15 @@ def is_done(self) -> bool: for stream_name in self._stream_name_to_instance.keys() ] ) + if is_done and self._stream_instances_to_start_partition_generation: + stuck_stream_names = [ + s.name for s in self._stream_instances_to_start_partition_generation + ] + raise AirbyteTracedException( + message="Partition generation queue is not empty after all streams completed.", + internal_message=f"Streams {stuck_stream_names} remained in the partition generation queue after all streams were marked done.", + failure_type=FailureType.system_error, + ) if is_done and self._exceptions_per_stream_name: error_message = 
generate_failed_streams_error_message(self._exceptions_per_stream_name) self._logger.info(error_message) @@ -376,11 +389,6 @@ def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: For example, if we have: epics -> issues -> comments Then for comments, this returns {issues, epics}. """ - from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( - SubstreamPartitionRouter, - ) - from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream - parent_names: Set[str] = set() stream = self._stream_name_to_instance.get(stream_name) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 788478ae6..4cdc72d45 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -28,8 +28,12 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.message import LogMessage, MessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -822,18 +826,22 @@ def _create_mock_stream(self, name: str, block_simultaneous_read: str = ""): def _create_mock_stream_with_parent( self, name: str, parent_stream, block_simultaneous_read: str = "" ): - """Helper to create a mock 
stream with a parent stream""" - stream = self._create_mock_stream(name, block_simultaneous_read) + """Helper to create a mock stream with a parent stream.""" + stream = Mock(spec=DefaultStream) + stream.name = name + stream.block_simultaneous_read = block_simultaneous_read + stream.as_airbyte_stream.return_value = AirbyteStream( + name=name, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + stream.cursor.ensure_at_least_one_state_emitted = Mock() - # Mock the retriever and partition router for parent relationship - mock_retriever = Mock() - mock_partition_router = Mock() + mock_partition_router = Mock(spec=SubstreamPartitionRouter) mock_parent_config = Mock() mock_parent_config.stream = parent_stream - mock_partition_router.parent_stream_configs = [mock_parent_config] - mock_retriever.partition_router = mock_partition_router - stream.retriever = mock_retriever + stream.get_partition_router.return_value = mock_partition_router return stream @@ -1396,3 +1404,39 @@ def test_child_starts_after_parent_completes_via_partition_complete_sentinel(sel ] assert len(started_messages) == 1 assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" + + +def test_is_done_raises_when_partition_generation_queue_not_empty(): + """Test is_done raises AirbyteTracedException if streams remain in the partition generation queue.""" + partition_enqueuer = Mock(spec=PartitionEnqueuer) + thread_pool_manager = Mock(spec=ThreadPoolManager) + logger = Mock(spec=logging.Logger) + slice_logger = Mock(spec=SliceLogger) + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [] + partition_reader = Mock(spec=PartitionReader) + + stream = Mock(spec=AbstractStream) + stream.name = "stuck_stream" + stream.block_simultaneous_read = "" + stream.as_airbyte_stream.return_value = AirbyteStream( + name="stuck_stream", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + handler = 
ConcurrentReadProcessor( + [stream], + partition_enqueuer, + thread_pool_manager, + logger, + slice_logger, + message_repository, + partition_reader, + ) + + # Artificially mark the stream as done without removing it from the partition generation queue + handler._streams_done.add("stuck_stream") + + with pytest.raises(AirbyteTracedException, match="remained in the partition generation queue"): + handler.is_done() From c868fdbfec695258fa3b8ae8b39b070bd58ace3d Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:14:55 +0000 Subject: [PATCH 21/26] refactor: move inline imports to module level in default_stream.py and concurrent_declarative_source.py Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 7 +++---- .../streams/concurrent/default_stream.py | 21 ++++++++----------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 15fa40fe4..5ae1d232d 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -76,6 +76,9 @@ from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( ModelToComponentFactory, ) +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.declarative.resolvers import COMPONENTS_RESOLVER_TYPE_MAPPING from airbyte_cdk.sources.declarative.spec.spec import Spec from airbyte_cdk.sources.declarative.types import Config, ConnectionDefinition @@ -433,10 +436,6 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: against actual created stream instances by name. Validates that no stream shares a group with any of its parent streams, which would cause a deadlock. 
""" - from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( - SubstreamPartitionRouter, - ) - stream_groups = self._source_config.get("stream_groups", {}) if not stream_groups: return diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 4c7267315..6cc6e44d4 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -3,12 +3,16 @@ # from logging import Logger -from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Mapping, Optional, Union - -if TYPE_CHECKING: - from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from typing import Any, Callable, Iterable, List, Mapping, Optional, Union from airbyte_cdk.models import AirbyteStream, SyncMode +from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentPerPartitionCursor, +) +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + StreamSlicerPartitionGenerator, +) from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField @@ -108,15 +112,8 @@ def block_simultaneous_read(self) -> str: def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value - def get_partition_router(self) -> "PartitionRouter | None": + def get_partition_router(self) -> PartitionRouter | None: """Return the partition router for this stream, or None if not available.""" - from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( - ConcurrentPerPartitionCursor, - ) - from 
airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( - StreamSlicerPartitionGenerator, - ) - if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): return None stream_slicer = self._stream_partition_generator._stream_slicer From 1fffc692f7a208c77c880c2a28ed340ec2d36873 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:26:23 +0000 Subject: [PATCH 22/26] fix: unwrap GroupingPartitionRouter in get_partition_router() to detect parent streams Co-Authored-By: unknown <> --- .../sources/streams/concurrent/default_stream.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 6cc6e44d4..a896cfbad 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -9,6 +9,9 @@ from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( ConcurrentPerPartitionCursor, ) +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( StreamSlicerPartitionGenerator, @@ -113,13 +116,20 @@ def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value def get_partition_router(self) -> PartitionRouter | None: - """Return the partition router for this stream, or None if not available.""" + """Return the partition router for this stream, or None if not available. + + If the router is a GroupingPartitionRouter, unwraps it to return the + underlying router so callers can inspect parent stream relationships. 
+ """ if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): return None stream_slicer = self._stream_partition_generator._stream_slicer if not isinstance(stream_slicer, ConcurrentPerPartitionCursor): return None - return stream_slicer._partition_router + router = stream_slicer._partition_router + if isinstance(router, GroupingPartitionRouter): + return router.underlying_partition_router + return router def check_availability(self) -> StreamAvailability: """ From 5911051961d5c4671d6611670d3b84fac2365dd7 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:29:42 +0000 Subject: [PATCH 23/26] fix: handle GroupingPartitionRouter at call sites instead of in get_partition_router() Co-Authored-By: unknown <> --- .../concurrent_source/concurrent_read_processor.py | 5 +++++ .../declarative/concurrent_declarative_source.py | 5 +++++ .../sources/streams/concurrent/default_stream.py | 14 ++------------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 9a8f7d942..53ddf4994 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -13,6 +13,9 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( SubstreamPartitionRouter, ) @@ -398,6 +401,8 @@ def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: partition_router = ( stream.get_partition_router() if isinstance(stream, 
DefaultStream) else None ) + if isinstance(partition_router, GroupingPartitionRouter): + partition_router = partition_router.underlying_partition_router if isinstance(partition_router, SubstreamPartitionRouter): for parent_config in partition_router.parent_stream_configs: diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 5ae1d232d..6d3b570ab 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -76,6 +76,9 @@ from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( ModelToComponentFactory, ) +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( SubstreamPartitionRouter, ) @@ -454,6 +457,8 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: if not isinstance(stream, DefaultStream) or stream.name not in stream_name_to_group: continue partition_router = stream.get_partition_router() + if isinstance(partition_router, GroupingPartitionRouter): + partition_router = partition_router.underlying_partition_router if not isinstance(partition_router, SubstreamPartitionRouter): continue group_name = stream_name_to_group[stream.name] diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index a896cfbad..6cc6e44d4 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -9,9 +9,6 @@ from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( ConcurrentPerPartitionCursor, ) -from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( - GroupingPartitionRouter, -) from 
airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( StreamSlicerPartitionGenerator, @@ -116,20 +113,13 @@ def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value def get_partition_router(self) -> PartitionRouter | None: - """Return the partition router for this stream, or None if not available. - - If the router is a GroupingPartitionRouter, unwraps it to return the - underlying router so callers can inspect parent stream relationships. - """ + """Return the partition router for this stream, or None if not available.""" if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): return None stream_slicer = self._stream_partition_generator._stream_slicer if not isinstance(stream_slicer, ConcurrentPerPartitionCursor): return None - router = stream_slicer._partition_router - if isinstance(router, GroupingPartitionRouter): - return router.underlying_partition_router - return router + return stream_slicer._partition_router def check_availability(self) -> StreamAvailability: """ From d01ee315e4d69128d88747e9a3205fb9a787a1bd Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 19:34:21 +0000 Subject: [PATCH 24/26] feat: check active_groups is empty in is_done() safety check Co-Authored-By: unknown <> --- .../sources/concurrent_source/concurrent_read_processor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 53ddf4994..cc39f888f 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -370,6 +370,12 @@ def is_done(self) -> bool: internal_message=f"Streams 
{stuck_stream_names} remained in the partition generation queue after all streams were marked done.", failure_type=FailureType.system_error, ) + if is_done and self._active_groups: + raise AirbyteTracedException( + message="Active stream groups are not empty after all streams completed.", + internal_message=f"Groups {dict(self._active_groups)} still active after all streams were marked done.", + failure_type=FailureType.system_error, + ) if is_done and self._exceptions_per_stream_name: error_message = generate_failed_streams_error_message(self._exceptions_per_stream_name) self._logger.info(error_message) From 902904955d17f937c4dd3f89e5256f2bb2764580 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 19:38:37 +0000 Subject: [PATCH 25/26] test: add missing unit tests for GroupingPartitionRouter, active_groups check, and get_partition_router Co-Authored-By: unknown <> --- .../test_concurrent_declarative_source.py | 144 ++++++++++++++++++ .../test_concurrent_read_processor.py | 85 +++++++++++ 2 files changed, 229 insertions(+) diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index f1b4143a3..ca8e474f5 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5351,3 +5351,147 @@ def test_apply_stream_groups_allows_parent_child_in_different_groups(): assert parent.block_simultaneous_read == "group_a" assert child.block_simultaneous_read == "group_b" + + +def _make_child_stream_with_grouping_router( + child_name: str, parent_stream: DefaultStream +) -> DefaultStream: + """Create a DefaultStream with GroupingPartitionRouter wrapping SubstreamPartitionRouter.""" + from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentCursorFactory, + 
ConcurrentPerPartitionCursor, + ) + from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, + ) + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + ParentStreamConfig, + SubstreamPartitionRouter, + ) + from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartitionFactory, + StreamSlicerPartitionGenerator, + ) + from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor + from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + EpochValueConcurrentStreamStateConverter, + ) + + substream_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=parent_stream, + parent_key="id", + partition_field="parent_id", + config={}, + parameters={}, + ) + ], + config={}, + parameters={}, + ) + + grouping_router = GroupingPartitionRouter( + group_size=10, + underlying_partition_router=substream_router, + config={}, + ) + + cursor_factory = ConcurrentCursorFactory(lambda *args, **kwargs: Mock()) + message_repository = InMemoryMessageRepository() + state_converter = EpochValueConcurrentStreamStateConverter() + + per_partition_cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory, + partition_router=grouping_router, + stream_name=child_name, + stream_namespace=None, + stream_state={}, + message_repository=message_repository, + connector_state_manager=Mock(), + connector_state_converter=state_converter, + cursor_field=Mock(cursor_field_key="updated_at"), + ) + + partition_factory = Mock(spec=DeclarativePartitionFactory) + partition_generator = StreamSlicerPartitionGenerator( + partition_factory=partition_factory, + stream_slicer=per_partition_cursor, + ) + + cursor = FinalStateCursor( + stream_name=child_name, stream_namespace=None, message_repository=message_repository + ) + return DefaultStream( + 
partition_generator=partition_generator, + name=child_name, + json_schema={}, + primary_key=[], + cursor_field=None, + logger=logging.getLogger(f"test.{child_name}"), + cursor=cursor, + ) + + +def test_apply_stream_groups_raises_on_parent_child_in_same_group_with_grouping_router(): + """Test _apply_stream_groups detects deadlock when GroupingPartitionRouter wraps SubstreamPartitionRouter.""" + parent = _make_default_stream("parent_stream") + child = _make_child_stream_with_grouping_router("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "my_group": { + "streams": [ + {"name": "parent_stream", "type": "DeclarativeStream"}, + {"name": "child_stream", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + } + + with pytest.raises(ValueError, match="child stream must not share a group with its parent"): + ConcurrentDeclarativeSource._apply_stream_groups(source, [parent, child]) + + +@pytest.mark.parametrize( + "stream_factory,expected_type", + [ + pytest.param( + lambda: _make_default_stream("plain_stream"), + type(None), + id="no_partition_router_returns_none", + ), + pytest.param( + lambda: _make_child_stream_with_parent("child", _make_default_stream("parent")), + "SubstreamPartitionRouter", + id="substream_returns_substream_router", + ), + pytest.param( + lambda: _make_child_stream_with_grouping_router( + "child", _make_default_stream("parent") + ), + "GroupingPartitionRouter", + id="grouping_returns_grouping_router", + ), + ], +) +def test_get_partition_router(stream_factory, expected_type): + """Test DefaultStream.get_partition_router returns the correct router type.""" + from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, + ) + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, + ) + + stream = stream_factory() + router = 
stream.get_partition_router() + + if expected_type is type(None): + assert router is None + elif expected_type == "SubstreamPartitionRouter": + assert isinstance(router, SubstreamPartitionRouter) + elif expected_type == "GroupingPartitionRouter": + assert isinstance(router, GroupingPartitionRouter) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 4cdc72d45..acfa03129 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -28,6 +28,9 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( SubstreamPartitionRouter, ) @@ -1440,3 +1443,85 @@ def test_is_done_raises_when_partition_generation_queue_not_empty(): with pytest.raises(AirbyteTracedException, match="remained in the partition generation queue"): handler.is_done() + + +def test_is_done_raises_when_active_groups_not_empty(): + """Test is_done raises AirbyteTracedException if active groups remain after all streams complete.""" + partition_enqueuer = Mock(spec=PartitionEnqueuer) + thread_pool_manager = Mock(spec=ThreadPoolManager) + logger = Mock(spec=logging.Logger) + slice_logger = Mock(spec=SliceLogger) + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [] + partition_reader = Mock(spec=PartitionReader) + + stream = Mock(spec=AbstractStream) + stream.name = "stuck_stream" + stream.block_simultaneous_read = "my_group" + stream.as_airbyte_stream.return_value = AirbyteStream( + name="stuck_stream", + 
json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + handler = ConcurrentReadProcessor( + [stream], + partition_enqueuer, + thread_pool_manager, + logger, + slice_logger, + message_repository, + partition_reader, + ) + + # Mark stream as done but leave the group active (simulating a bug) + handler._streams_done.add("stuck_stream") + handler._stream_instances_to_start_partition_generation.clear() + handler._active_groups["my_group"] = {"stuck_stream"} + + with pytest.raises( + AirbyteTracedException, match="still active after all streams were marked done" + ): + handler.is_done() + + +def test_collect_parent_stream_names_unwraps_grouping_partition_router(): + """Test _collect_all_parent_stream_names unwraps GroupingPartitionRouter to find parents.""" + partition_enqueuer = Mock(spec=PartitionEnqueuer) + thread_pool_manager = Mock(spec=ThreadPoolManager) + logger = Mock(spec=logging.Logger) + slice_logger = Mock(spec=SliceLogger) + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [] + partition_reader = Mock(spec=PartitionReader) + + parent_stream = Mock(spec=AbstractStream) + parent_stream.name = "parent" + parent_stream.block_simultaneous_read = "" + + # Child has a GroupingPartitionRouter wrapping a SubstreamPartitionRouter + child_stream = Mock(spec=DefaultStream) + child_stream.name = "child" + child_stream.block_simultaneous_read = "" + + mock_substream_router = Mock(spec=SubstreamPartitionRouter) + mock_parent_config = Mock() + mock_parent_config.stream = parent_stream + mock_substream_router.parent_stream_configs = [mock_parent_config] + + mock_grouping_router = Mock(spec=GroupingPartitionRouter) + mock_grouping_router.underlying_partition_router = mock_substream_router + child_stream.get_partition_router.return_value = mock_grouping_router + + handler = ConcurrentReadProcessor( + [parent_stream, child_stream], + partition_enqueuer, + thread_pool_manager, + logger, + slice_logger, + 
message_repository, + partition_reader, + ) + + parent_names = handler._collect_all_parent_stream_names("child") + assert parent_names == {"parent"} From 756a9667dbaf1b80de18ab84a6353d02b8fa50b1 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 20:04:25 +0000 Subject: [PATCH 26/26] fix: make deadlock validation check all ancestors, not just direct parents Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 33 +++++++++++++------ .../test_concurrent_declarative_source.py | 23 +++++++++++++ 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 6d3b570ab..292615692 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -452,21 +452,34 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: if stream_name: stream_name_to_group[stream_name] = group_name - # Validate no stream shares a group with its parent streams + # Validate no stream shares a group with any of its ancestor streams + stream_name_to_instance: Dict[str, AbstractStream] = {s.name: s for s in streams} + + def _collect_all_ancestor_names(stream_name: str) -> Set[str]: + """Recursively collect all ancestor stream names.""" + ancestors: Set[str] = set() + inst = stream_name_to_instance.get(stream_name) + if not isinstance(inst, DefaultStream): + return ancestors + router = inst.get_partition_router() + if isinstance(router, GroupingPartitionRouter): + router = router.underlying_partition_router + if not isinstance(router, SubstreamPartitionRouter): + return ancestors + for parent_config in router.parent_stream_configs: + parent_name = parent_config.stream.name + ancestors.add(parent_name) + ancestors.update(_collect_all_ancestor_names(parent_name)) + return 
ancestors + for stream in streams: if not isinstance(stream, DefaultStream) or stream.name not in stream_name_to_group: continue - partition_router = stream.get_partition_router() - if isinstance(partition_router, GroupingPartitionRouter): - partition_router = partition_router.underlying_partition_router - if not isinstance(partition_router, SubstreamPartitionRouter): - continue group_name = stream_name_to_group[stream.name] - for parent_config in partition_router.parent_stream_configs: - parent_name = parent_config.stream.name - if stream_name_to_group.get(parent_name) == group_name: + for ancestor_name in _collect_all_ancestor_names(stream.name): + if stream_name_to_group.get(ancestor_name) == group_name: raise ValueError( - f"Stream '{stream.name}' and its parent stream '{parent_name}' " + f"Stream '{stream.name}' and its parent stream '{ancestor_name}' " f"are both in group '{group_name}'. " f"A child stream must not share a group with its parent to avoid deadlock." ) diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index ca8e474f5..bf1f61610 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5433,6 +5433,29 @@ def _make_child_stream_with_grouping_router( ) +def test_apply_stream_groups_raises_on_grandparent_child_in_same_group(): + """Test _apply_stream_groups detects deadlock when a grandchild and grandparent share a group.""" + grandparent = _make_default_stream("grandparent_stream") + parent = _make_child_stream_with_parent("parent_stream", grandparent) + child = _make_child_stream_with_parent("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "my_group": { + "streams": [ + {"name": "grandparent_stream", "type": "DeclarativeStream"}, + {"name": "child_stream", "type": "DeclarativeStream"}, + ], + 
"action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + } + + with pytest.raises(ValueError, match="child stream must not share a group with its parent"): + ConcurrentDeclarativeSource._apply_stream_groups(source, [grandparent, parent, child]) + + def test_apply_stream_groups_raises_on_parent_child_in_same_group_with_grouping_router(): """Test _apply_stream_groups detects deadlock when GroupingPartitionRouter wraps SubstreamPartitionRouter.""" parent = _make_default_stream("parent_stream")