From cf01a7583d3815ddc6efb11ca6c4913b24b93337 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 2 Feb 2026 21:15:22 +0000 Subject: [PATCH 01/49] feat(cdk): Add cursor age validation to StateDelegatingStream This adds an optional api_retention_period field to StateDelegatingStream that validates whether a cursor is within an API's data retention window before switching from full refresh to incremental sync. When the cursor value is older than the retention period, the connector automatically falls back to a full refresh to avoid data loss. This is useful for APIs like Stripe Events API which only retain data for 30 days. Key changes: - Add api_retention_period field to StateDelegatingStream schema (ISO8601 duration) - Implement cursor age validation in model_to_component_factory.py - Emit warning log when falling back to full refresh due to stale cursor - Add unit tests for cursor age validation Fixes: https://github.com/airbytehq/oncall/issues/11103 Co-Authored-By: unknown <> --- .../declarative_component_schema.yaml | 16 ++ .../models/declarative_component_schema.py | 152 +++++++++++------- .../parsers/model_to_component_factory.py | 102 +++++++++++- .../test_state_delegating_stream.py | 119 ++++++++++++++ 4 files changed, 325 insertions(+), 64 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index e68318cd4..64e8aba72 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -3752,6 +3752,22 @@ definitions: title: Incremental Stream description: Component used to coordinate how records are extracted across stream slices and request pages when the state provided. "$ref": "#/definitions/DeclarativeStream" + api_retention_period: + title: API Retention Period + description: | + The data retention period of the incremental API (ISO8601 duration). If the cursor value is older than this retention period, the connector will automatically fall back to a full refresh to avoid data loss. + This is useful for APIs like Stripe Events API which only retain data for 30 days. + * **PT1H**: 1 hour + * **P1D**: 1 day + * **P1W**: 1 week + * **P1M**: 1 month + * **P1Y**: 1 year + * **P30D**: 30 days + type: string + examples: + - "P30D" + - "P90D" + - "P1Y" $parameters: type: object additionalProperties: true diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 5d2f0521f..dbd437cfa 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -1,5 +1,3 @@ -# Copyright (c) 2025 Airbyte, Inc., all rights reserved. - # generated by datamodel-codegen: # filename: declarative_component_schema.yaml @@ -930,24 +928,28 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( - None, - description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( + Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", + ) ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( + Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", + ) ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -965,7 +967,9 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], + examples=[ + {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} + ], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1469,7 +1473,9 @@ class CustomConfigTransformation(BaseModel): class_name: str = Field( ..., description="Fully-qualified name of the class that will be implementing the custom config transformation. The format is `source_..`.", - examples=["source_declarative_manifest.components.MyCustomConfigTransformation"], + examples=[ + "source_declarative_manifest.components.MyCustomConfigTransformation" + ], ) parameters: Optional[Dict[str, Any]] = Field( None, @@ -1897,7 +1903,9 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], + examples=[ + ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] + ], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -2124,7 +2132,9 @@ class RecordSelector(BaseModel): description="Responsible for filtering records to be emitted by the Source.", title="Record Filter", ) - schema_normalization: Optional[Union[SchemaNormalization, CustomSchemaNormalization]] = Field( + schema_normalization: Optional[ + Union[SchemaNormalization, CustomSchemaNormalization] + ] = Field( None, description="Responsible for normalization according to the schema.", title="Schema Normalization", @@ -2166,10 +2176,12 @@ class DpathValidator(BaseModel): ], title="Field Path", ) - validation_strategy: Union[ValidateAdheresToSchema, CustomValidationStrategy] = Field( - ..., - description="The condition that the specified config value will be evaluated against", - title="Validation Strategy", + validation_strategy: Union[ValidateAdheresToSchema, CustomValidationStrategy] = ( + Field( + ..., + description="The condition that the specified config value will be evaluated against", + title="Validation Strategy", + ) ) @@ -2186,10 +2198,12 @@ class PredicateValidator(BaseModel): ], title="Value", ) - validation_strategy: Union[ValidateAdheresToSchema, CustomValidationStrategy] = Field( - ..., - description="The validation strategy to apply to the value.", - title="Validation Strategy", + validation_strategy: Union[ValidateAdheresToSchema, CustomValidationStrategy] = ( + Field( + ..., + description="The validation strategy to apply to the value.", + title="Validation Strategy", + ) ) @@ -2214,12 +2228,12 @@ class ConfigAddFields(BaseModel): class CompositeErrorHandler(BaseModel): type: Literal["CompositeErrorHandler"] - error_handlers: List[Union[CompositeErrorHandler, DefaultErrorHandler, CustomErrorHandler]] = ( - Field( - ..., - description="List of error handlers to iterate on to determine how to handle a failed response.", - title="Error Handlers", - ) + error_handlers: List[ + Union[CompositeErrorHandler, DefaultErrorHandler, CustomErrorHandler] + ] = Field( + ..., + description="List of error handlers to iterate on to determine how to handle a failed response.", + title="Error Handlers", ) parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") @@ -2381,9 +2395,9 @@ class Config: type: Literal["DeclarativeSource"] check: Union[CheckStream, CheckDynamicStream] - streams: Optional[List[Union[ConditionalStreams, DeclarativeStream, StateDelegatingStream]]] = ( - None - ) + streams: Optional[ + List[Union[ConditionalStreams, DeclarativeStream, StateDelegatingStream]] + ] = None dynamic_streams: List[DynamicDeclarativeStream] version: str = Field( ..., @@ -2508,16 +2522,20 @@ class Config: extra = Extra.allow type: Literal["DeclarativeStream"] - name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") + name: Optional[str] = Field( + "", description="The stream name.", example=["Users"], title="Name" + ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[DatetimeBasedCursor, IncrementingCountCursor]] = Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", + incremental_sync: Optional[Union[DatetimeBasedCursor, IncrementingCountCursor]] = ( + Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", + ) ) primary_key: Optional[PrimaryKey] = Field("", title="Primary Key") schema_loader: Optional[ @@ -2691,18 +2709,20 @@ class HttpRequester(BaseModelWithDeprecations): description="For APIs that require explicit specification of the properties to query for, this component will take a static or dynamic set of properties (which can be optionally split into chunks) and allow them to be injected into an outbound request by accessing stream_partition.extra_fields.", title="Query Properties", ) - request_parameters: Optional[Union[Dict[str, Union[str, QueryProperties]], str]] = Field( - None, - description="Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.", - examples=[ - {"unit": "day"}, - { - "query": 'last_event_time BETWEEN TIMESTAMP "{{ stream_interval.start_time }}" AND TIMESTAMP "{{ stream_interval.end_time }}"' - }, - {"searchIn": "{{ ','.join(config.get('search_in', [])) }}"}, - {"sort_by[asc]": "updated_at"}, - ], - title="Query Parameters", + request_parameters: Optional[Union[Dict[str, Union[str, QueryProperties]], str]] = ( + Field( + None, + description="Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.", + examples=[ + {"unit": "day"}, + { + "query": 'last_event_time BETWEEN TIMESTAMP "{{ stream_interval.start_time }}" AND TIMESTAMP "{{ stream_interval.end_time }}"' + }, + {"searchIn": "{{ ','.join(config.get('search_in', [])) }}"}, + {"sort_by[asc]": "updated_at"}, + ], + title="Query Parameters", + ) ) request_headers: Optional[Union[Dict[str, str], str]] = Field( None, @@ -2874,7 +2894,9 @@ class QueryProperties(BaseModel): class StateDelegatingStream(BaseModel): type: Literal["StateDelegatingStream"] - name: str = Field(..., description="The stream name.", example=["Users"], title="Name") + name: str = Field( + ..., description="The stream name.", example=["Users"], title="Name" + ) full_refresh_stream: DeclarativeStream = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages when the state is empty or not provided.", @@ -2885,6 +2907,12 @@ class StateDelegatingStream(BaseModel): description="Component used to coordinate how records are extracted across stream slices and request pages when the state provided.", title="Incremental Stream", ) + api_retention_period: Optional[str] = Field( + None, + description="The data retention period of the incremental API (ISO8601 duration). If the cursor value is older than this retention period, the connector will automatically fall back to a full refresh to avoid data loss.\nThis is useful for APIs like Stripe Events API which only retain data for 30 days.\n * **PT1H**: 1 hour\n * **P1D**: 1 day\n * **P1W**: 1 week\n * **P1M**: 1 month\n * **P1Y**: 1 year\n * **P30D**: 30 days\n", + examples=["P30D", "P90D", "P1Y"], + title="API Retention Period", + ) parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") @@ -2961,13 +2989,17 @@ class AsyncRetriever(BaseModel): status_extractor: Union[DpathExtractor, CustomRecordExtractor] = Field( ..., description="Responsible for fetching the actual status of the async job." ) - download_target_extractor: Optional[Union[DpathExtractor, CustomRecordExtractor]] = Field( + download_target_extractor: Optional[ + Union[DpathExtractor, CustomRecordExtractor] + ] = Field( None, description="Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.", ) download_extractor: Optional[ Union[DpathExtractor, CustomRecordExtractor, ResponseToFileExtractor] - ] = Field(None, description="Responsible for fetching the records from provided urls.") + ] = Field( + None, description="Responsible for fetching the records from provided urls." + ) creation_requester: Union[HttpRequester, CustomRequester] = Field( ..., description="Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job.", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 2bd7d268d..198e942c7 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -78,6 +78,7 @@ DynamicStreamCheckConfig, ) from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel +from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime from airbyte_cdk.sources.declarative.decoders import ( Decoder, @@ -3568,11 +3569,104 @@ def create_state_delegating_stream( def _get_state_delegating_stream_model( self, has_parent_state: bool, model: StateDelegatingStreamModel ) -> DeclarativeStreamModel: - return ( - model.incremental_stream - if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state - else model.full_refresh_stream + stream_state = self._connector_state_manager.get_stream_state(model.name, None) + + if not stream_state and not has_parent_state: + return model.full_refresh_stream + + if model.api_retention_period and stream_state: + if self._is_cursor_older_than_retention_period( + stream_state, model.incremental_stream, model.api_retention_period, model.name + ): + return model.full_refresh_stream + + return model.incremental_stream + + def _is_cursor_older_than_retention_period( + self, + stream_state: Mapping[str, Any], + incremental_stream: DeclarativeStreamModel, + api_retention_period: str, + stream_name: str, + ) -> bool: + """Check if the cursor value in the state is older than the API's retention period. + + If the cursor is too old, the incremental API may not have data going back that far, + so we should fall back to a full refresh to avoid data loss. + + Returns True if the cursor is older than the retention period (should use full refresh). + Returns False if the cursor is within the retention period (safe to use incremental). + """ + incremental_sync = incremental_stream.incremental_sync + if not incremental_sync: + return False + + cursor_field = getattr(incremental_sync, "cursor_field", None) + if not cursor_field: + return False + + cursor_value = stream_state.get(cursor_field) + if not cursor_value: + return False + + retention_duration = parse_duration(api_retention_period) + retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration + + cursor_datetime = self._parse_cursor_datetime( + cursor_value, incremental_sync, stream_name ) + if cursor_datetime is None: + return False + + if cursor_datetime < retention_cutoff: + self._emit_warning_for_stale_cursor( + stream_name, cursor_value, api_retention_period, retention_cutoff + ) + return True + + return False + + def _parse_cursor_datetime( + self, + cursor_value: str, + incremental_sync: Any, + stream_name: str, + ) -> Optional[datetime.datetime]: + """Parse the cursor value into a datetime object using the cursor's datetime formats.""" + parser = DatetimeParser() + + datetime_format = getattr(incremental_sync, "datetime_format", None) + cursor_datetime_formats = getattr(incremental_sync, "cursor_datetime_formats", None) or [] + + formats_to_try = cursor_datetime_formats + ([datetime_format] if datetime_format else []) + + for fmt in formats_to_try: + try: + return parser.parse(cursor_value, fmt) + except (ValueError, TypeError): + continue + + logging.warning( + f"Could not parse cursor value '{cursor_value}' for stream '{stream_name}' " + f"using formats {formats_to_try}. Skipping cursor age validation." + ) + return None + + def _emit_warning_for_stale_cursor( + self, + stream_name: str, + cursor_value: str, + api_retention_period: str, + retention_cutoff: datetime.datetime, + ) -> None: + """Emit a warning message when the cursor is older than the API's retention period.""" + warning_message = ( + f"Stream '{stream_name}' has a cursor value '{cursor_value}' that is older than " + f"the API's retention period of {api_retention_period} (cutoff: {retention_cutoff.isoformat()}). " + f"Falling back to full refresh to avoid data loss. " + f"This may happen if a previous sync failed mid-way and the state was checkpointed." + ) + logging.warning(warning_message) def _create_async_job_status_mapping( self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 1239fe653..248e06d43 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -2,10 +2,13 @@ # Copyright (c) 2025 Airbyte, Inc., all rights reserved. # +import copy +import datetime import json from unittest.mock import MagicMock import freezegun +import pytest from airbyte_cdk.models import ( AirbyteStateBlob, @@ -253,3 +256,119 @@ def test_incremental_retriever(): {"id": 4, "name": "item_4", "updated_at": "2024-02-01"}, ] assert expected_incremental == incremental_records + + +def _create_manifest_with_retention_period(api_retention_period: str) -> dict: + """Create a manifest with api_retention_period set on the StateDelegatingStream.""" + manifest = copy.deepcopy(_MANIFEST) + manifest["definitions"]["TestStream"]["api_retention_period"] = api_retention_period + return manifest + + +@freezegun.freeze_time("2024-07-15") +def test_cursor_age_validation_falls_back_to_full_refresh_when_cursor_too_old(): + """Test that when cursor is older than retention period, full refresh is used.""" + manifest = _create_manifest_with_retention_period("P7D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/items"), + HttpResponse( + body=json.dumps( + [ + {"id": 1, "name": "item_1", "updated_at": "2024-07-13"}, + {"id": 2, "name": "item_2", "updated_at": "2024-07-14"}, + ] + ) + ), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(updated_at="2024-07-01"), + ), + ) + ] + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + records = get_records(source, _CONFIG, configured_catalog, state) + expected = [ + {"id": 1, "name": "item_1", "updated_at": "2024-07-13"}, + {"id": 2, "name": "item_2", "updated_at": "2024-07-14"}, + ] + assert expected == records + + +@freezegun.freeze_time("2024-07-15") +def test_cursor_age_validation_uses_incremental_when_cursor_within_retention(): + """Test that when cursor is within retention period, incremental sync is used.""" + manifest = _create_manifest_with_retention_period("P30D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest( + url="https://api.test.com/items_with_filtration?start=2024-07-13&end=2024-07-15" + ), + HttpResponse( + body=json.dumps( + [ + {"id": 3, "name": "item_3", "updated_at": "2024-07-14"}, + ] + ) + ), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(updated_at="2024-07-13"), + ), + ) + ] + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + records = get_records(source, _CONFIG, configured_catalog, state) + expected = [ + {"id": 3, "name": "item_3", "updated_at": "2024-07-14"}, + ] + assert expected == records + + +@freezegun.freeze_time("2024-07-15") +def test_cursor_age_validation_with_1_day_retention_falls_back(): + """Test cursor age validation with P1D retention period falls back to full refresh.""" + manifest = _create_manifest_with_retention_period("P1D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/items"), + HttpResponse(body=json.dumps([{"id": 1, "updated_at": "2024-07-14"}])), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(updated_at="2024-07-13"), + ), + ) + ] + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + records = get_records(source, _CONFIG, configured_catalog, state) + assert len(records) == 1 From 67bc5c81c7d989cfa468f43fb29c4a34f1864f62 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 2 Feb 2026 21:47:49 +0000 Subject: [PATCH 02/49] chore: re-trigger CI Co-Authored-By: unknown <> From 1edeeddc61cf8e667015dad193c560474730212d Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Tue, 3 Feb 2026 16:03:41 +0000 Subject: [PATCH 03/49] Auto-fix lint and format issues --- .../models/declarative_component_schema.py | 144 +++++++----------- .../parsers/model_to_component_factory.py | 4 +- 2 files changed, 59 insertions(+), 89 deletions(-) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index dbd437cfa..04dffcaff 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -928,28 +928,24 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( - Field( - None, - description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", - ) + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( - Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", - ) + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -967,9 +963,7 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[ - {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} - ], + examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1473,9 +1467,7 @@ class CustomConfigTransformation(BaseModel): class_name: str = Field( ..., description="Fully-qualified name of the class that will be implementing the custom config transformation. The format is `source_..`.", - examples=[ - "source_declarative_manifest.components.MyCustomConfigTransformation" - ], + examples=["source_declarative_manifest.components.MyCustomConfigTransformation"], ) parameters: Optional[Dict[str, Any]] = Field( None, @@ -1903,9 +1895,7 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[ - ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] - ], + examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -2132,9 +2122,7 @@ class RecordSelector(BaseModel): description="Responsible for filtering records to be emitted by the Source.", title="Record Filter", ) - schema_normalization: Optional[ - Union[SchemaNormalization, CustomSchemaNormalization] - ] = Field( + schema_normalization: Optional[Union[SchemaNormalization, CustomSchemaNormalization]] = Field( None, description="Responsible for normalization according to the schema.", title="Schema Normalization", @@ -2176,12 +2164,10 @@ class DpathValidator(BaseModel): ], title="Field Path", ) - validation_strategy: Union[ValidateAdheresToSchema, CustomValidationStrategy] = ( - Field( - ..., - description="The condition that the specified config value will be evaluated against", - title="Validation Strategy", - ) + validation_strategy: Union[ValidateAdheresToSchema, CustomValidationStrategy] = Field( + ..., + description="The condition that the specified config value will be evaluated against", + title="Validation Strategy", ) @@ -2198,12 +2184,10 @@ class PredicateValidator(BaseModel): ], title="Value", ) - validation_strategy: Union[ValidateAdheresToSchema, CustomValidationStrategy] = ( - Field( - ..., - description="The validation strategy to apply to the value.", - title="Validation Strategy", - ) + validation_strategy: Union[ValidateAdheresToSchema, CustomValidationStrategy] = Field( + ..., + description="The validation strategy to apply to the value.", + title="Validation Strategy", ) @@ -2228,12 +2212,12 @@ class ConfigAddFields(BaseModel): class CompositeErrorHandler(BaseModel): type: Literal["CompositeErrorHandler"] - error_handlers: List[ - Union[CompositeErrorHandler, DefaultErrorHandler, CustomErrorHandler] - ] = Field( - ..., - description="List of error handlers to iterate on to determine how to handle a failed response.", - title="Error Handlers", + error_handlers: List[Union[CompositeErrorHandler, DefaultErrorHandler, CustomErrorHandler]] = ( + Field( + ..., + description="List of error handlers to iterate on to determine how to handle a failed response.", + title="Error Handlers", + ) ) parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") @@ -2395,9 +2379,9 @@ class Config: type: Literal["DeclarativeSource"] check: Union[CheckStream, CheckDynamicStream] - streams: Optional[ - List[Union[ConditionalStreams, DeclarativeStream, StateDelegatingStream]] - ] = None + streams: Optional[List[Union[ConditionalStreams, DeclarativeStream, StateDelegatingStream]]] = ( + None + ) dynamic_streams: List[DynamicDeclarativeStream] version: str = Field( ..., @@ -2522,20 +2506,16 @@ class Config: extra = Extra.allow type: Literal["DeclarativeStream"] - name: Optional[str] = Field( - "", description="The stream name.", example=["Users"], title="Name" - ) + name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[DatetimeBasedCursor, IncrementingCountCursor]] = ( - Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", - ) + incremental_sync: Optional[Union[DatetimeBasedCursor, IncrementingCountCursor]] = Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", ) primary_key: Optional[PrimaryKey] = Field("", title="Primary Key") schema_loader: Optional[ @@ -2709,20 +2689,18 @@ class HttpRequester(BaseModelWithDeprecations): description="For APIs that require explicit specification of the properties to query for, this component will take a static or dynamic set of properties (which can be optionally split into chunks) and allow them to be injected into an outbound request by accessing stream_partition.extra_fields.", title="Query Properties", ) - request_parameters: Optional[Union[Dict[str, Union[str, QueryProperties]], str]] = ( - Field( - None, - description="Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.", - examples=[ - {"unit": "day"}, - { - "query": 'last_event_time BETWEEN TIMESTAMP "{{ stream_interval.start_time }}" AND TIMESTAMP "{{ stream_interval.end_time }}"' - }, - {"searchIn": "{{ ','.join(config.get('search_in', [])) }}"}, - {"sort_by[asc]": "updated_at"}, - ], - title="Query Parameters", - ) + request_parameters: Optional[Union[Dict[str, Union[str, QueryProperties]], str]] = Field( + None, + description="Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.", + examples=[ + {"unit": "day"}, + { + "query": 'last_event_time BETWEEN TIMESTAMP "{{ stream_interval.start_time }}" AND TIMESTAMP "{{ stream_interval.end_time }}"' + }, + {"searchIn": "{{ ','.join(config.get('search_in', [])) }}"}, + {"sort_by[asc]": "updated_at"}, + ], + title="Query Parameters", ) request_headers: Optional[Union[Dict[str, str], str]] = Field( None, @@ -2894,9 +2872,7 @@ class QueryProperties(BaseModel): class StateDelegatingStream(BaseModel): type: Literal["StateDelegatingStream"] - name: str = Field( - ..., description="The stream name.", example=["Users"], title="Name" - ) + name: str = Field(..., description="The stream name.", example=["Users"], title="Name") full_refresh_stream: DeclarativeStream = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages when the state is empty or not provided.", @@ -2989,17 +2965,13 @@ class AsyncRetriever(BaseModel): status_extractor: Union[DpathExtractor, CustomRecordExtractor] = Field( ..., description="Responsible for fetching the actual status of the async job." ) - download_target_extractor: Optional[ - Union[DpathExtractor, CustomRecordExtractor] - ] = Field( + download_target_extractor: Optional[Union[DpathExtractor, CustomRecordExtractor]] = Field( None, description="Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.", ) download_extractor: Optional[ Union[DpathExtractor, CustomRecordExtractor, ResponseToFileExtractor] - ] = Field( - None, description="Responsible for fetching the records from provided urls." - ) + ] = Field(None, description="Responsible for fetching the records from provided urls.") creation_requester: Union[HttpRequester, CustomRequester] = Field( ..., description="Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job.", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 198e942c7..3693efce3 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3612,9 +3612,7 @@ def _is_cursor_older_than_retention_period( retention_duration = parse_duration(api_retention_period) retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration - cursor_datetime = self._parse_cursor_datetime( - cursor_value, incremental_sync, stream_name - ) + cursor_datetime = self._parse_cursor_datetime(cursor_value, incremental_sync, stream_name) if cursor_datetime is None: return False From 61d8d5d17295e5899daf906c7cd6f6c5ba04cebd Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Tue, 3 Feb 2026 10:04:06 -0600 Subject: [PATCH 04/49] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- unit_tests/sources/declarative/test_state_delegating_stream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 248e06d43..3734947e5 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -3,7 +3,6 @@ # import copy -import datetime import json from unittest.mock import MagicMock From 21da1129d1c809ab179cc541d1ad2e43b903ebe2 Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Tue, 3 Feb 2026 10:12:41 -0600 Subject: [PATCH 05/49] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- unit_tests/sources/declarative/test_state_delegating_stream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 3734947e5..880384601 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -7,7 +7,6 @@ from unittest.mock import MagicMock import freezegun -import pytest from airbyte_cdk.models import ( AirbyteStateBlob, From 0e33418b88e1c79fc90d2446af7485ea41e0612f Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Feb 2026 18:19:46 +0000 Subject: [PATCH 06/49] fix: Address Copilot review comments - Fix YAML bullet point indentation for consistency (Comment 8) - Add type guard for cursor_value to handle unexpected types (Comment 9) - Add test for warning log emission when cursor is too old (Comment 10) Co-Authored-By: unknown <> --- .../declarative_component_schema.yaml | 12 +++---- .../parsers/model_to_component_factory.py | 3 ++ .../test_state_delegating_stream.py | 36 +++++++++++++++++++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 64e8aba72..de14d55e5 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -3757,12 +3757,12 @@ definitions: description: | The data retention period of the incremental API (ISO8601 duration). If the cursor value is older than this retention period, the connector will automatically fall back to a full refresh to avoid data loss. This is useful for APIs like Stripe Events API which only retain data for 30 days. - * **PT1H**: 1 hour - * **P1D**: 1 day - * **P1W**: 1 week - * **P1M**: 1 month - * **P1Y**: 1 year - * **P30D**: 30 days + * **PT1H**: 1 hour + * **P1D**: 1 day + * **P1W**: 1 week + * **P1M**: 1 month + * **P1Y**: 1 year + * **P30D**: 30 days type: string examples: - "P30D" diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3693efce3..519930543 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3609,6 +3609,9 @@ def _is_cursor_older_than_retention_period( if not cursor_value: return False + if not isinstance(cursor_value, (str, int)): + return False + retention_duration = parse_duration(api_retention_period) retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 880384601..86d567bea 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -4,9 +4,11 @@ import copy import json +import logging from unittest.mock import MagicMock import freezegun +import pytest from airbyte_cdk.models import ( AirbyteStateBlob, @@ -370,3 +372,37 @@ def test_cursor_age_validation_with_1_day_retention_falls_back(): records = get_records(source, _CONFIG, configured_catalog, state) assert len(records) == 1 + + +@freezegun.freeze_time("2024-07-15") +def test_cursor_age_validation_emits_warning_when_falling_back(caplog): + """Test that a warning is emitted when cursor is older than retention period.""" + manifest = _create_manifest_with_retention_period("P7D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/items"), + HttpResponse(body=json.dumps([{"id": 1, "updated_at": "2024-07-14"}])), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(updated_at="2024-07-01"), + ), + ) + ] + + with caplog.at_level(logging.WARNING): + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + get_records(source, _CONFIG, configured_catalog, state) + + warning_messages = [r.message for r in caplog.records if r.levelno == logging.WARNING] + assert any("TestStream" in msg and "older than" in msg and "P7D" in msg for msg in warning_messages), ( + f"Expected warning about stale cursor not found. Warnings: {warning_messages}" + ) From 324344f7078a91074e3c27e7f4ad8fb2f973ebaa Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Feb 2026 18:21:09 +0000 Subject: [PATCH 07/49] fix: Correct ruff format for assert statement Co-Authored-By: unknown <> --- .../sources/declarative/test_state_delegating_stream.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 86d567bea..3de83c0b1 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -403,6 +403,6 @@ def test_cursor_age_validation_emits_warning_when_falling_back(caplog): get_records(source, _CONFIG, configured_catalog, state) warning_messages = [r.message for r in caplog.records if r.levelno == logging.WARNING] - assert any("TestStream" in msg and "older than" in msg and "P7D" in msg for msg in warning_messages), ( - f"Expected warning about stale cursor not found. Warnings: {warning_messages}" - ) + assert any( + "TestStream" in msg and "older than" in msg and "P7D" in msg for msg in warning_messages + ), f"Expected warning about stale cursor not found. Warnings: {warning_messages}" From da8a5a59167f22368e5d71a9b2fcbb6edb35cf8e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Feb 2026 18:24:40 +0000 Subject: [PATCH 08/49] fix: Convert cursor_value to str for type safety Co-Authored-By: unknown <> --- .../declarative/parsers/model_to_component_factory.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 519930543..c8d8c6a16 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3612,16 +3612,18 @@ def _is_cursor_older_than_retention_period( if not isinstance(cursor_value, (str, int)): return False + cursor_value_str = str(cursor_value) + retention_duration = parse_duration(api_retention_period) retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration - cursor_datetime = self._parse_cursor_datetime(cursor_value, incremental_sync, stream_name) + cursor_datetime = self._parse_cursor_datetime(cursor_value_str, incremental_sync, stream_name) if cursor_datetime is None: return False if cursor_datetime < retention_cutoff: self._emit_warning_for_stale_cursor( - stream_name, cursor_value, api_retention_period, retention_cutoff + stream_name, cursor_value_str, api_retention_period, retention_cutoff ) return True From 37e046e505c54bff2e714a932f4558010d21895b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Feb 2026 18:26:12 +0000 Subject: [PATCH 09/49] fix: Format long line for ruff compliance Co-Authored-By: unknown <> --- .../sources/declarative/parsers/model_to_component_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index c8d8c6a16..60f861783 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3617,7 +3617,9 @@ def _is_cursor_older_than_retention_period( retention_duration = parse_duration(api_retention_period) retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration - cursor_datetime = self._parse_cursor_datetime(cursor_value_str, incremental_sync, stream_name) + cursor_datetime = self._parse_cursor_datetime( + cursor_value_str, incremental_sync, stream_name + ) if cursor_datetime is None: return False From dceb70d3dfbfcbeaf92e08409396a933bf9cd11c Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Tue, 3 Feb 2026 12:38:59 -0600 Subject: [PATCH 10/49] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- unit_tests/sources/declarative/test_state_delegating_stream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 3de83c0b1..aaac2967b 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -8,7 +8,6 @@ from unittest.mock import MagicMock import freezegun -import pytest from airbyte_cdk.models import ( AirbyteStateBlob, From c14f96379b8e5ec805eb101c22ebe2314bd4bb17 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Feb 2026 19:34:03 +0000 Subject: [PATCH 11/49] refactor: Move incremental_sync check to _get_state_delegating_stream_model Addresses review comment from tolik0 - the incremental_sync check is now performed in the calling method before invoking _is_cursor_older_than_retention_period. Co-Authored-By: unknown <> --- .../declarative/parsers/model_to_component_factory.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 60f861783..01ff1e4f6 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3575,8 +3575,9 @@ def _get_state_delegating_stream_model( return model.full_refresh_stream if model.api_retention_period and stream_state: - if self._is_cursor_older_than_retention_period( - stream_state, model.incremental_stream, model.api_retention_period, model.name + incremental_sync = model.incremental_stream.incremental_sync + if incremental_sync and self._is_cursor_older_than_retention_period( + stream_state, incremental_sync, model.api_retention_period, model.name ): return model.full_refresh_stream @@ -3585,7 +3586,7 @@ def _get_state_delegating_stream_model( def _is_cursor_older_than_retention_period( self, stream_state: Mapping[str, Any], - incremental_stream: DeclarativeStreamModel, + incremental_sync: Any, api_retention_period: str, stream_name: str, ) -> bool: @@ -3597,10 +3598,6 @@ def _is_cursor_older_than_retention_period( Returns True if the cursor is older than the retention period (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ - incremental_sync = incremental_stream.incremental_sync - if not incremental_sync: - return False - cursor_field = getattr(incremental_sync, "cursor_field", None) if not cursor_field: return False From 86d5ea609b89924ad0a6e3d5abaf4c2405f27d95 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Feb 2026 19:39:54 +0000 Subject: [PATCH 12/49] fix: Return True (full refresh) when cursor is invalid/unparseable Addresses review comment from tolik0 - if the cursor value or format is incorrect, we should use full_refresh_stream instead, as it indicates that the stream_state is unusable. Co-Authored-By: unknown <> --- .../declarative/parsers/model_to_component_factory.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 01ff1e4f6..93214df33 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3595,19 +3595,20 @@ def _is_cursor_older_than_retention_period( If the cursor is too old, the incremental API may not have data going back that far, so we should fall back to a full refresh to avoid data loss. - Returns True if the cursor is older than the retention period (should use full refresh). + Returns True if the cursor is older than the retention period or if the cursor is + invalid/unparseable (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ cursor_field = getattr(incremental_sync, "cursor_field", None) if not cursor_field: - return False + return True cursor_value = stream_state.get(cursor_field) if not cursor_value: - return False + return True if not isinstance(cursor_value, (str, int)): - return False + return True cursor_value_str = str(cursor_value) @@ -3618,7 +3619,7 @@ def _is_cursor_older_than_retention_period( cursor_value_str, incremental_sync, stream_name ) if cursor_datetime is None: - return False + return True if cursor_datetime < retention_cutoff: self._emit_warning_for_stale_cursor( From 567ca7aee80f5abfbcd09a18a8932e934e3d4c1b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Feb 2026 20:45:53 +0000 Subject: [PATCH 13/49] fix: Parse cursor from both full_refresh_stream and incremental_stream Address tolik0's review comment: During the first sync, the state will be produced by full_refresh_stream, and during subsequent syncs, by incremental_stream. We need to correctly parse the state for both cases. Changes: - Extract incremental_sync from both full_refresh_stream and incremental_stream - Update _is_cursor_older_than_retention_period to accept list of sources - Update _parse_cursor_datetime to collect and try formats from all sources Co-Authored-By: unknown <> --- .../parsers/model_to_component_factory.py | 49 +++++++++++++------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 93214df33..84780c42a 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3575,9 +3575,13 @@ def _get_state_delegating_stream_model( return model.full_refresh_stream if model.api_retention_period and stream_state: - incremental_sync = model.incremental_stream.incremental_sync - if incremental_sync and self._is_cursor_older_than_retention_period( - stream_state, incremental_sync, model.api_retention_period, model.name + incremental_sync_sources = [ + model.full_refresh_stream.incremental_sync, + model.incremental_stream.incremental_sync, + ] + incremental_sync_sources = [s for s in incremental_sync_sources if s is not None] + if incremental_sync_sources and self._is_cursor_older_than_retention_period( + stream_state, incremental_sync_sources, model.api_retention_period, model.name ): return model.full_refresh_stream @@ -3586,7 +3590,7 @@ def _get_state_delegating_stream_model( def _is_cursor_older_than_retention_period( self, stream_state: Mapping[str, Any], - incremental_sync: Any, + incremental_sync_sources: list[Any], api_retention_period: str, stream_name: str, ) -> bool: @@ -3595,11 +3599,19 @@ def _is_cursor_older_than_retention_period( If the cursor is too old, the incremental API may not have data going back that far, so we should fall back to a full refresh to avoid data loss. + The state could have been produced by either full_refresh_stream (first sync) or + incremental_stream (subsequent syncs), so we try parsing with formats from both. + Returns True if the cursor is older than the retention period or if the cursor is invalid/unparseable (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ - cursor_field = getattr(incremental_sync, "cursor_field", None) + cursor_field = None + for incremental_sync in incremental_sync_sources: + cursor_field = getattr(incremental_sync, "cursor_field", None) + if cursor_field: + break + if not cursor_field: return True @@ -3616,7 +3628,7 @@ def _is_cursor_older_than_retention_period( retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration cursor_datetime = self._parse_cursor_datetime( - cursor_value_str, incremental_sync, stream_name + cursor_value_str, incremental_sync_sources, stream_name ) if cursor_datetime is None: return True @@ -3632,16 +3644,25 @@ def _is_cursor_older_than_retention_period( def _parse_cursor_datetime( self, cursor_value: str, - incremental_sync: Any, + incremental_sync_sources: list[Any], stream_name: str, - ) -> Optional[datetime.datetime]: - """Parse the cursor value into a datetime object using the cursor's datetime formats.""" - parser = DatetimeParser() + ) -> datetime.datetime | None: + """Parse the cursor value into a datetime object using datetime formats from all sources. - datetime_format = getattr(incremental_sync, "datetime_format", None) - cursor_datetime_formats = getattr(incremental_sync, "cursor_datetime_formats", None) or [] + The state could have been produced by either full_refresh_stream (first sync) or + incremental_stream (subsequent syncs), so we try parsing with formats from both. + """ + parser = DatetimeParser() - formats_to_try = cursor_datetime_formats + ([datetime_format] if datetime_format else []) + formats_to_try: list[str] = [] + for incremental_sync in incremental_sync_sources: + datetime_format = getattr(incremental_sync, "datetime_format", None) + cursor_datetime_formats = ( + getattr(incremental_sync, "cursor_datetime_formats", None) or [] + ) + formats_to_try.extend(cursor_datetime_formats) + if datetime_format: + formats_to_try.append(datetime_format) for fmt in formats_to_try: try: @@ -3651,7 +3672,7 @@ def _parse_cursor_datetime( logging.warning( f"Could not parse cursor value '{cursor_value}' for stream '{stream_name}' " - f"using formats {formats_to_try}. Skipping cursor age validation." + f"using formats {formats_to_try}. Falling back to full refresh." ) return None From be72c5cc5c3cd59ed309babe34cc195fb8b2bd47 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 18:49:47 +0000 Subject: [PATCH 14/49] feat: Add support for per-partition state and IncrementingCountCursor validation - Add _extract_cursor_value_from_state helper to handle different state structures - For per-partition state, use global cursor value from 'state' key - Raise ValueError when IncrementingCountCursor is used with api_retention_period - Add unit tests for per-partition state (cursor too old and within retention) - Add unit test for IncrementingCountCursor error handling Co-Authored-By: unknown <> --- .../parsers/model_to_component_factory.py | 38 ++++- .../test_state_delegating_stream.py | 146 ++++++++++++++++++ 2 files changed, 181 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 84780c42a..a819fad65 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3599,13 +3599,22 @@ def _is_cursor_older_than_retention_period( If the cursor is too old, the incremental API may not have data going back that far, so we should fall back to a full refresh to avoid data loss. - The state could have been produced by either full_refresh_stream (first sync) or - incremental_stream (subsequent syncs), so we try parsing with formats from both. + This method handles different state structures: + - Simple cursor state: {"cursor_field": "value"} + - Per-partition state: {"state": {"cursor_field": "value"}, "states": [...]} Returns True if the cursor is older than the retention period or if the cursor is invalid/unparseable (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ + for incremental_sync in incremental_sync_sources: + if isinstance(incremental_sync, IncrementingCountCursorModel): + raise ValueError( + f"Stream '{stream_name}' uses IncrementingCountCursor which is not supported " + f"with api_retention_period. IncrementingCountCursor does not use datetime-based " + f"cursors, so cursor age validation cannot be performed." + ) + cursor_field = None for incremental_sync in incremental_sync_sources: cursor_field = getattr(incremental_sync, "cursor_field", None) @@ -3615,7 +3624,7 @@ def _is_cursor_older_than_retention_period( if not cursor_field: return True - cursor_value = stream_state.get(cursor_field) + cursor_value = self._extract_cursor_value_from_state(stream_state, cursor_field) if not cursor_value: return True @@ -3641,6 +3650,29 @@ def _is_cursor_older_than_retention_period( return False + def _extract_cursor_value_from_state( + self, + stream_state: Mapping[str, Any], + cursor_field: str, + ) -> Any: + """Extract cursor value from state, handling different state structures. + + Supports: + - Simple cursor state: {"cursor_field": "value"} -> returns "value" + - Per-partition state: {"state": {"cursor_field": "value"}, ...} -> returns "value" + (uses global cursor from "state" key) + + Returns None if cursor value cannot be extracted. + """ + if cursor_field in stream_state: + return stream_state.get(cursor_field) + + global_state = stream_state.get("state") + if isinstance(global_state, dict) and cursor_field in global_state: + return global_state.get(cursor_field) + + return None + def _parse_cursor_datetime( self, cursor_value: str, diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index aaac2967b..fe673d821 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -8,6 +8,7 @@ from unittest.mock import MagicMock import freezegun +import pytest from airbyte_cdk.models import ( AirbyteStateBlob, @@ -405,3 +406,148 @@ def test_cursor_age_validation_emits_warning_when_falling_back(caplog): assert any( "TestStream" in msg and "older than" in msg and "P7D" in msg for msg in warning_messages ), f"Expected warning about stale cursor not found. Warnings: {warning_messages}" + + +@freezegun.freeze_time("2024-07-15") +def test_cursor_age_validation_with_per_partition_state_uses_global_cursor(): + """Test that per-partition state structure uses global cursor for age validation.""" + manifest = _create_manifest_with_retention_period("P7D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/items"), + HttpResponse( + body=json.dumps( + [ + {"id": 1, "name": "item_1", "updated_at": "2024-07-13"}, + ] + ) + ), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob( + state={"updated_at": "2024-07-01"}, + states=[ + { + "partition": {"parent_id": "1"}, + "cursor": {"updated_at": "2024-07-10"}, + }, + { + "partition": {"parent_id": "2"}, + "cursor": {"updated_at": "2024-07-05"}, + }, + ], + use_global_cursor=False, + ), + ), + ) + ] + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + records = get_records(source, _CONFIG, configured_catalog, state) + assert len(records) == 1 + + +@freezegun.freeze_time("2024-07-15") +def test_cursor_age_validation_with_per_partition_state_within_retention(): + """Test per-partition state with global cursor within retention uses incremental. + + This test verifies that when the global cursor in a per-partition state structure + is within the retention period, the incremental stream is selected (not full refresh). + We verify this by checking that the incremental endpoint is called, not the full refresh one. + """ + manifest = _create_manifest_with_retention_period("P30D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest( + url="https://api.test.com/items_with_filtration", + query_params={"start": "2024-07-01", "end": "2024-07-15"}, + ), + HttpResponse( + body=json.dumps( + [ + {"id": 3, "name": "item_3", "updated_at": "2024-07-14"}, + ] + ) + ), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob( + state={"updated_at": "2024-07-10"}, + states=[ + { + "partition": {"parent_id": "1"}, + "cursor": {"updated_at": "2024-07-10"}, + }, + ], + use_global_cursor=False, + ), + ), + ) + ] + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + records = get_records(source, _CONFIG, configured_catalog, state) + assert len(records) == 1 + + +def _create_manifest_with_incrementing_count_cursor(api_retention_period: str) -> dict: + """Create a manifest with IncrementingCountCursor and api_retention_period.""" + manifest = copy.deepcopy(_MANIFEST) + manifest["definitions"]["TestStream"]["api_retention_period"] = api_retention_period + + incrementing_cursor = { + "type": "IncrementingCountCursor", + "cursor_field": "id", + "start_value": 0, + } + manifest["definitions"]["TestStream"]["full_refresh_stream"]["incremental_sync"] = ( + incrementing_cursor + ) + manifest["definitions"]["TestStream"]["incremental_stream"]["incremental_sync"] = ( + incrementing_cursor + ) + return manifest + + +def test_cursor_age_validation_raises_error_for_incrementing_count_cursor(): + """Test that IncrementingCountCursor with api_retention_period raises an error.""" + manifest = _create_manifest_with_incrementing_count_cursor("P7D") + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(id=100), + ), + ) + ] + + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + + with pytest.raises(ValueError) as exc_info: + source.discover(logger=MagicMock(), config=_CONFIG) + + assert "IncrementingCountCursor" in str(exc_info.value) + assert "not supported" in str(exc_info.value) + assert "api_retention_period" in str(exc_info.value) From 2b54cc5bf8594fc0a28a40d1c20a87fd2492f06a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 5 Feb 2026 17:12:39 +0000 Subject: [PATCH 15/49] feat: Add get_cursor_datetime_from_state method to cursor classes This adds the get_cursor_datetime_from_state method to: - DeclarativeCursor (base class with NotImplementedError default) - DatetimeBasedCursor (extracts cursor from state and parses using datetime formats) - PerPartitionCursor (extracts global cursor from 'state' key and delegates to underlying cursor) This method is used by StateDelegatingStream to validate cursor age against an API's data retention period. The method returns None if the cursor cannot be extracted or parsed, which causes StateDelegatingStream to fall back to full refresh (safe default). Co-Authored-By: unknown <> --- .../incremental/datetime_based_cursor.py | 20 ++++++++++++++++ .../incremental/declarative_cursor.py | 23 +++++++++++++++++++ .../incremental/per_partition_cursor.py | 18 +++++++++++++++ 3 files changed, 61 insertions(+) diff --git a/airbyte_cdk/legacy/sources/declarative/incremental/datetime_based_cursor.py b/airbyte_cdk/legacy/sources/declarative/incremental/datetime_based_cursor.py index 616a13d8c..b542cb6e6 100644 --- a/airbyte_cdk/legacy/sources/declarative/incremental/datetime_based_cursor.py +++ b/airbyte_cdk/legacy/sources/declarative/incremental/datetime_based_cursor.py @@ -212,6 +212,26 @@ def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[S # through each slice and does not belong to a specific slice. We just return stream state as it is. return self.get_stream_state() + def get_cursor_datetime_from_state( + self, stream_state: Mapping[str, Any] + ) -> Optional[datetime.datetime]: + """Extract and parse the cursor datetime from the given stream state. + + Returns the cursor datetime if present and parseable, otherwise returns None. + """ + cursor_field_key = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ + if cursor_field_key not in stream_state: + return None + + cursor_value = stream_state.get(cursor_field_key) + if not cursor_value: + return None + + try: + return self.parse_date(str(cursor_value)) + except ValueError: + return None + def _calculate_earliest_possible_value( self, end_datetime: datetime.datetime ) -> datetime.datetime: diff --git a/airbyte_cdk/legacy/sources/declarative/incremental/declarative_cursor.py b/airbyte_cdk/legacy/sources/declarative/incremental/declarative_cursor.py index adb64d119..063d5a1f3 100644 --- a/airbyte_cdk/legacy/sources/declarative/incremental/declarative_cursor.py +++ b/airbyte_cdk/legacy/sources/declarative/incremental/declarative_cursor.py @@ -1,6 +1,8 @@ # Copyright (c) 2024 Airbyte, Inc., all rights reserved. +import datetime from abc import ABC +from typing import Any, Mapping, Optional from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer from airbyte_cdk.sources.streams.checkpoint.cursor import Cursor @@ -11,3 +13,24 @@ class DeclarativeCursor(Cursor, StreamSlicer, ABC): DeclarativeCursors are components that allow for checkpointing syncs. In addition to managing the fetching and updating of state, declarative cursors also manage stream slicing and injecting slice values into outbound requests. """ + + def get_cursor_datetime_from_state( + self, stream_state: Mapping[str, Any] + ) -> Optional[datetime.datetime]: + """Extract and parse the cursor datetime from the given stream state. + + This method is used by StateDelegatingStream to validate cursor age against + an API's data retention period. Subclasses should implement this method to + extract the cursor value from their specific state structure and parse it + into a datetime object. + + Returns None if the cursor cannot be extracted or parsed, which will cause + StateDelegatingStream to fall back to full refresh (safe default). + + Raises NotImplementedError by default - subclasses must implement this method + if they want to support cursor age validation with api_retention_period. + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not implement get_cursor_datetime_from_state. " + f"Cursor age validation with api_retention_period is not supported for this cursor type." + ) diff --git a/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py b/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py index 23746e808..85a749d67 100644 --- a/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +++ b/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py @@ -2,6 +2,7 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # +import datetime import logging from collections import OrderedDict from typing import Any, Callable, Iterable, Mapping, Optional, Union @@ -211,6 +212,23 @@ def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[S return self._get_state_for_partition(stream_slice.partition) + def get_cursor_datetime_from_state( + self, stream_state: Mapping[str, Any] + ) -> Optional[datetime.datetime]: + """Extract and parse the cursor datetime from the global cursor in per-partition state. + + For per-partition cursors, the global cursor is stored under the "state" key. + This method delegates to the underlying cursor factory to parse the datetime. + + Returns None if the global cursor is not present or cannot be parsed. + """ + global_state = stream_state.get("state") + if not global_state or not isinstance(global_state, dict): + return None + + cursor = self._cursor_factory.create() + return cursor.get_cursor_datetime_from_state(global_state) + def _create_cursor(self, cursor_state: Any) -> DeclarativeCursor: cursor = self._cursor_factory.create() cursor.set_initial_state(cursor_state) From f199583f8dc46754615c8e4ff7e82a8d64157bad Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 9 Feb 2026 15:41:51 +0000 Subject: [PATCH 16/49] feat: Add get_cursor_datetime_from_state to concurrent cursor classes Co-Authored-By: unknown <> --- .../concurrent_partition_cursor.py | 19 ++++++ .../sources/streams/concurrent/cursor.py | 59 +++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index f0379368d..d1f2ca41e 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -3,6 +3,7 @@ # import copy +import datetime import logging import threading import time @@ -658,3 +659,21 @@ def get_global_state( if stream_state and "state" in stream_state else None ) + + def get_cursor_datetime_from_state( + self, stream_state: Mapping[str, Any] + ) -> datetime.datetime | None: + """Extract and parse the cursor datetime from the global cursor in per-partition state. + + For per-partition cursors, the global cursor is stored under the "state" key. + This method delegates to the underlying cursor factory to parse the datetime. + + Returns None if the global cursor is not present or cannot be parsed. + """ + global_state = stream_state.get(self._GLOBAL_STATE_KEY) + if not global_state or not isinstance(global_state, dict): + return None + + # Create a cursor to delegate the parsing + cursor = self._cursor_factory.create(stream_state={}, runtime_lookback_window=None) + return cursor.get_cursor_datetime_from_state(global_state) diff --git a/airbyte_cdk/sources/streams/concurrent/cursor.py b/airbyte_cdk/sources/streams/concurrent/cursor.py index e3a487183..122471227 100644 --- a/airbyte_cdk/sources/streams/concurrent/cursor.py +++ b/airbyte_cdk/sources/streams/concurrent/cursor.py @@ -2,6 +2,7 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # +import datetime import functools import logging import threading @@ -89,6 +90,27 @@ def stream_slices(self) -> Iterable[StreamSlice]: """ yield StreamSlice(partition={}, cursor_slice={}) + def get_cursor_datetime_from_state( + self, stream_state: Mapping[str, Any] + ) -> datetime.datetime | None: + """Extract and parse the cursor datetime from the given stream state. + + This method is used by StateDelegatingStream to validate cursor age against + an API's data retention period. Subclasses should implement this method to + extract the cursor value from their specific state structure and parse it + into a datetime object. + + Returns None if the cursor cannot be extracted or parsed, which will cause + StateDelegatingStream to fall back to full refresh (safe default). + + Raises NotImplementedError by default - subclasses must implement this method + if they want to support cursor age validation with api_retention_period. + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not implement get_cursor_datetime_from_state. " + f"Cursor age validation with api_retention_period is not supported for this cursor type." + ) + class FinalStateCursor(Cursor): """Cursor that is used to guarantee at least one state message is emitted for a concurrent stream.""" @@ -568,3 +590,40 @@ def reduce_slice_range(self, stream_slice: StreamSlice) -> StreamSlice: ) else: return stream_slice + + def get_cursor_datetime_from_state( + self, stream_state: Mapping[str, Any] + ) -> datetime.datetime | None: + """Extract and parse the cursor datetime from the given stream state. + + For concurrent cursors, the state can be in two formats: + 1. Sequential/legacy format: {cursor_field: cursor_value} + 2. Concurrent format: {state_type: "date-range", slices: [...]} + + Returns the cursor datetime if present and parseable, otherwise returns None. + """ + # Check if state is in concurrent format + if self._connector_state_converter.is_state_message_compatible(stream_state): + slices = stream_state.get("slices", []) + if not slices: + return None + # Get the most recent cursor value from the first slice (after merging) + first_slice = slices[0] + cursor_value = first_slice.get( + self._connector_state_converter.MOST_RECENT_RECORD_KEY + ) or first_slice.get(self._connector_state_converter.END_KEY) + if not cursor_value: + return None + try: + return self._connector_state_converter.parse_value(cursor_value) + except (ValueError, TypeError): + return None + + # Sequential/legacy format: {cursor_field: cursor_value} + cursor_value = stream_state.get(self._cursor_field.cursor_field_key) + if not cursor_value: + return None + try: + return self._connector_state_converter.parse_value(cursor_value) + except (ValueError, TypeError): + return None From fbda39fbe62c95ed366cf9d80b70174766fdeab6 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 9 Feb 2026 15:45:26 +0000 Subject: [PATCH 17/49] fix: Fix MyPy type errors in ConcurrentCursor.get_cursor_datetime_from_state Co-Authored-By: unknown <> --- airbyte_cdk/sources/streams/concurrent/cursor.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/airbyte_cdk/sources/streams/concurrent/cursor.py b/airbyte_cdk/sources/streams/concurrent/cursor.py index 122471227..806bc25cf 100644 --- a/airbyte_cdk/sources/streams/concurrent/cursor.py +++ b/airbyte_cdk/sources/streams/concurrent/cursor.py @@ -602,8 +602,9 @@ def get_cursor_datetime_from_state( Returns the cursor datetime if present and parseable, otherwise returns None. """ - # Check if state is in concurrent format - if self._connector_state_converter.is_state_message_compatible(stream_state): + # Check if state is in concurrent format (need to convert to dict for type compatibility) + mutable_state: MutableMapping[str, Any] = dict(stream_state) + if self._connector_state_converter.is_state_message_compatible(mutable_state): slices = stream_state.get("slices", []) if not slices: return None @@ -615,7 +616,10 @@ def get_cursor_datetime_from_state( if not cursor_value: return None try: - return self._connector_state_converter.parse_value(cursor_value) + parsed_value = self._connector_state_converter.parse_value(cursor_value) + if isinstance(parsed_value, datetime.datetime): + return parsed_value + return None except (ValueError, TypeError): return None @@ -624,6 +628,9 @@ def get_cursor_datetime_from_state( if not cursor_value: return None try: - return self._connector_state_converter.parse_value(cursor_value) + parsed_value = self._connector_state_converter.parse_value(cursor_value) + if isinstance(parsed_value, datetime.datetime): + return parsed_value + return None except (ValueError, TypeError): return None From a2d4b56aaba95bd167ae3e6eb55166efba9e1b31 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 17:25:39 +0000 Subject: [PATCH 18/49] refactor: Wire factory to use cursor class get_cursor_datetime_from_state Co-Authored-By: gl_anatolii.yatsuk@airbyte.io --- .../parsers/model_to_component_factory.py | 144 +++++------------- 1 file changed, 42 insertions(+), 102 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index a819fad65..a5490d0e8 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -78,7 +78,6 @@ DynamicStreamCheckConfig, ) from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel -from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime from airbyte_cdk.sources.declarative.decoders import ( Decoder, @@ -3561,13 +3560,13 @@ def create_state_delegating_stream( ) stream_model = self._get_state_delegating_stream_model( - False if has_parent_state is None else has_parent_state, model + False if has_parent_state is None else has_parent_state, model, config ) return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is alwyas DeclarativeStreamModel def _get_state_delegating_stream_model( - self, has_parent_state: bool, model: StateDelegatingStreamModel + self, has_parent_state: bool, model: StateDelegatingStreamModel, config: Config ) -> DeclarativeStreamModel: stream_state = self._connector_state_manager.get_stream_state(model.name, None) @@ -3581,7 +3580,7 @@ def _get_state_delegating_stream_model( ] incremental_sync_sources = [s for s in incremental_sync_sources if s is not None] if incremental_sync_sources and self._is_cursor_older_than_retention_period( - stream_state, incremental_sync_sources, model.api_retention_period, model.name + stream_state, incremental_sync_sources, model.api_retention_period, model.name, config ): return model.full_refresh_stream @@ -3593,20 +3592,21 @@ def _is_cursor_older_than_retention_period( incremental_sync_sources: list[Any], api_retention_period: str, stream_name: str, + config: Config, ) -> bool: """Check if the cursor value in the state is older than the API's retention period. - If the cursor is too old, the incremental API may not have data going back that far, - so we should fall back to a full refresh to avoid data loss. - - This method handles different state structures: - - Simple cursor state: {"cursor_field": "value"} - - Per-partition state: {"state": {"cursor_field": "value"}, "states": [...]} + Delegates cursor datetime extraction to cursor class instances via + get_cursor_datetime_from_state, which handles format-specific parsing. Returns True if the cursor is older than the retention period or if the cursor is invalid/unparseable (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ + from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( + DatetimeBasedCursor, + ) + for incremental_sync in incremental_sync_sources: if isinstance(incremental_sync, IncrementingCountCursorModel): raise ValueError( @@ -3615,114 +3615,54 @@ def _is_cursor_older_than_retention_period( f"cursors, so cursor age validation cannot be performed." ) - cursor_field = None + cursor_datetime: datetime.datetime | None = None for incremental_sync in incremental_sync_sources: - cursor_field = getattr(incremental_sync, "cursor_field", None) - if cursor_field: + if not isinstance(incremental_sync, DatetimeBasedCursorModel): + continue + cursor = self._create_cursor_for_age_check(incremental_sync, config) + cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) + if cursor_datetime is not None: break + global_state = stream_state.get("state") + if isinstance(global_state, dict): + cursor_datetime = cursor.get_cursor_datetime_from_state(global_state) + if cursor_datetime is not None: + break - if not cursor_field: - return True - - cursor_value = self._extract_cursor_value_from_state(stream_state, cursor_field) - if not cursor_value: - return True - - if not isinstance(cursor_value, (str, int)): + if cursor_datetime is None: return True - cursor_value_str = str(cursor_value) - retention_duration = parse_duration(api_retention_period) retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration - cursor_datetime = self._parse_cursor_datetime( - cursor_value_str, incremental_sync_sources, stream_name - ) - if cursor_datetime is None: - return True - if cursor_datetime < retention_cutoff: - self._emit_warning_for_stale_cursor( - stream_name, cursor_value_str, api_retention_period, retention_cutoff + logging.warning( + f"Stream '{stream_name}' has a cursor value older than " + f"the API's retention period of {api_retention_period} " + f"(cutoff: {retention_cutoff.isoformat()}). " + f"Falling back to full refresh to avoid data loss." ) return True return False - def _extract_cursor_value_from_state( - self, - stream_state: Mapping[str, Any], - cursor_field: str, - ) -> Any: - """Extract cursor value from state, handling different state structures. - - Supports: - - Simple cursor state: {"cursor_field": "value"} -> returns "value" - - Per-partition state: {"state": {"cursor_field": "value"}, ...} -> returns "value" - (uses global cursor from "state" key) - - Returns None if cursor value cannot be extracted. - """ - if cursor_field in stream_state: - return stream_state.get(cursor_field) - - global_state = stream_state.get("state") - if isinstance(global_state, dict) and cursor_field in global_state: - return global_state.get(cursor_field) - - return None - - def _parse_cursor_datetime( - self, - cursor_value: str, - incremental_sync_sources: list[Any], - stream_name: str, - ) -> datetime.datetime | None: - """Parse the cursor value into a datetime object using datetime formats from all sources. - - The state could have been produced by either full_refresh_stream (first sync) or - incremental_stream (subsequent syncs), so we try parsing with formats from both. - """ - parser = DatetimeParser() - - formats_to_try: list[str] = [] - for incremental_sync in incremental_sync_sources: - datetime_format = getattr(incremental_sync, "datetime_format", None) - cursor_datetime_formats = ( - getattr(incremental_sync, "cursor_datetime_formats", None) or [] - ) - formats_to_try.extend(cursor_datetime_formats) - if datetime_format: - formats_to_try.append(datetime_format) - - for fmt in formats_to_try: - try: - return parser.parse(cursor_value, fmt) - except (ValueError, TypeError): - continue - - logging.warning( - f"Could not parse cursor value '{cursor_value}' for stream '{stream_name}' " - f"using formats {formats_to_try}. Falling back to full refresh." + @staticmethod + def _create_cursor_for_age_check( + model: DatetimeBasedCursorModel, config: Config + ) -> "DatetimeBasedCursor": + """Create a lightweight DatetimeBasedCursor for cursor age validation.""" + from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( + DatetimeBasedCursor, ) - return None - def _emit_warning_for_stale_cursor( - self, - stream_name: str, - cursor_value: str, - api_retention_period: str, - retention_cutoff: datetime.datetime, - ) -> None: - """Emit a warning message when the cursor is older than the API's retention period.""" - warning_message = ( - f"Stream '{stream_name}' has a cursor value '{cursor_value}' that is older than " - f"the API's retention period of {api_retention_period} (cutoff: {retention_cutoff.isoformat()}). " - f"Falling back to full refresh to avoid data loss. " - f"This may happen if a previous sync failed mid-way and the state was checkpointed." - ) - logging.warning(warning_message) + return DatetimeBasedCursor( + start_datetime="2000-01-01T00:00:00Z", + cursor_field=model.cursor_field, + datetime_format=model.datetime_format, + config=config, + parameters=model.parameters or {}, + cursor_datetime_formats=model.cursor_datetime_formats or [], + ) def _create_async_job_status_mapping( self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any From 1defe9e3bc032d8aa481d54c96074145bb92e2cd Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 17:35:11 +0000 Subject: [PATCH 19/49] fix: Fix ruff format and mypy errors in model_to_component_factory Co-Authored-By: alfredo.garcia@airbyte.io --- .../parsers/model_to_component_factory.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index a5490d0e8..2876da34f 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -11,6 +11,7 @@ import re from functools import partial from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -27,6 +28,11 @@ get_type_hints, ) +if TYPE_CHECKING: + from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( + DatetimeBasedCursor, + ) + from airbyte_protocol_dataclasses.models import ConfiguredAirbyteStream from isodate import parse_duration from pydantic.v1 import BaseModel @@ -3580,7 +3586,11 @@ def _get_state_delegating_stream_model( ] incremental_sync_sources = [s for s in incremental_sync_sources if s is not None] if incremental_sync_sources and self._is_cursor_older_than_retention_period( - stream_state, incremental_sync_sources, model.api_retention_period, model.name, config + stream_state, + incremental_sync_sources, + model.api_retention_period, + model.name, + config, ): return model.full_refresh_stream @@ -3603,10 +3613,6 @@ def _is_cursor_older_than_retention_period( invalid/unparseable (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ - from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( - DatetimeBasedCursor, - ) - for incremental_sync in incremental_sync_sources: if isinstance(incremental_sync, IncrementingCountCursorModel): raise ValueError( @@ -3652,10 +3658,10 @@ def _create_cursor_for_age_check( ) -> "DatetimeBasedCursor": """Create a lightweight DatetimeBasedCursor for cursor age validation.""" from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( - DatetimeBasedCursor, + DatetimeBasedCursor as _DatetimeBasedCursor, ) - return DatetimeBasedCursor( + return _DatetimeBasedCursor( start_datetime="2000-01-01T00:00:00Z", cursor_field=model.cursor_field, datetime_format=model.datetime_format, @@ -4067,7 +4073,7 @@ def _instantiate_parent_stream_state_manager( model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream if isinstance(model.stream, DeclarativeStreamModel) else self._get_state_delegating_stream_model( - has_parent_state, model.stream + has_parent_state, model.stream, config ).incremental_sync ) cursor_field = InterpolatedString.create( From a017dffbe79b44c7414d8344bce89e5324a96ca2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 18:05:59 +0000 Subject: [PATCH 20/49] fix: Skip retention check for concurrent state format Add guard to detect concurrent state format (state_type/slices keys) and skip the retention check rather than defaulting to full refresh. Today is_sequential_state=True is hardcoded for all declarative cursors, so concurrent format state should never appear in practice. This guard prevents spurious full-refresh fallbacks if that assumption ever changes. Co-Authored-By: alfredo.garcia@airbyte.io --- .../declarative/parsers/model_to_component_factory.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 2876da34f..15cac6282 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3613,6 +3613,14 @@ def _is_cursor_older_than_retention_period( invalid/unparseable (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ + # Skip retention check for concurrent state format (e.g. {"state_type": "date-range", "slices": [...]}). + # The DatetimeBasedCursor used for the age check only handles sequential state format. + # Today, is_sequential_state=True is hardcoded for all declarative cursors, so concurrent + # format state should never appear in practice. If that changes in the future, this guard + # prevents spurious full-refresh fallbacks until proper concurrent cursor delegation is added. + if "state_type" in stream_state or "slices" in stream_state: + return False + for incremental_sync in incremental_sync_sources: if isinstance(incremental_sync, IncrementingCountCursorModel): raise ValueError( From d3e76d4eebdf5f840ef2240e47f74c0153eaa6a7 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 18:34:19 +0000 Subject: [PATCH 21/49] fix: Skip retention check for IncrementingCountCursor instead of raising ValueError Co-Authored-By: alfredo.garcia@airbyte.io --- .../parsers/model_to_component_factory.py | 8 ---- .../test_state_delegating_stream.py | 39 ++++++++++--------- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 15cac6282..4090be02d 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3621,14 +3621,6 @@ def _is_cursor_older_than_retention_period( if "state_type" in stream_state or "slices" in stream_state: return False - for incremental_sync in incremental_sync_sources: - if isinstance(incremental_sync, IncrementingCountCursorModel): - raise ValueError( - f"Stream '{stream_name}' uses IncrementingCountCursor which is not supported " - f"with api_retention_period. IncrementingCountCursor does not use datetime-based " - f"cursors, so cursor age validation cannot be performed." - ) - cursor_datetime: datetime.datetime | None = None for incremental_sync in incremental_sync_sources: if not isinstance(incremental_sync, DatetimeBasedCursorModel): diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index fe673d821..6dde01567 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -527,27 +527,30 @@ def _create_manifest_with_incrementing_count_cursor(api_retention_period: str) - return manifest -def test_cursor_age_validation_raises_error_for_incrementing_count_cursor(): - """Test that IncrementingCountCursor with api_retention_period raises an error.""" +def test_cursor_age_validation_skips_incrementing_count_cursor(): + """Test that IncrementingCountCursor with api_retention_period is silently skipped (no error, uses incremental).""" manifest = _create_manifest_with_incrementing_count_cursor("P7D") - state = [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), - stream_state=AirbyteStateBlob(id=100), - ), + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/items"), + HttpResponse(body=json.dumps([{"id": 101, "updated_at": "2024-07-14"}])), ) - ] - source = ConcurrentDeclarativeSource( - source_config=manifest, config=_CONFIG, catalog=None, state=state - ) + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(id=100), + ), + ) + ] - with pytest.raises(ValueError) as exc_info: - source.discover(logger=MagicMock(), config=_CONFIG) + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) - assert "IncrementingCountCursor" in str(exc_info.value) - assert "not supported" in str(exc_info.value) - assert "api_retention_period" in str(exc_info.value) + records = get_records(source, _CONFIG, configured_catalog, state) + assert len(records) >= 0 From d31c26b58b47e04975b7db318e61ab2757885315 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 18:37:31 +0000 Subject: [PATCH 22/49] fix: Return False (skip) when no datetime-based cursors found for retention check Co-Authored-By: alfredo.garcia@airbyte.io --- .../declarative/parsers/model_to_component_factory.py | 10 +++++++--- .../declarative/test_state_delegating_stream.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 4090be02d..c2b69ac9f 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3621,10 +3621,14 @@ def _is_cursor_older_than_retention_period( if "state_type" in stream_state or "slices" in stream_state: return False + datetime_cursor_sources = [ + s for s in incremental_sync_sources if isinstance(s, DatetimeBasedCursorModel) + ] + if not datetime_cursor_sources: + return False + cursor_datetime: datetime.datetime | None = None - for incremental_sync in incremental_sync_sources: - if not isinstance(incremental_sync, DatetimeBasedCursorModel): - continue + for incremental_sync in datetime_cursor_sources: cursor = self._create_cursor_for_age_check(incremental_sync, config) cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) if cursor_datetime is not None: diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 6dde01567..e3265af5c 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -533,7 +533,7 @@ def test_cursor_age_validation_skips_incrementing_count_cursor(): with HttpMocker() as http_mocker: http_mocker.get( - HttpRequest(url="https://api.test.com/items"), + HttpRequest(url="https://api.test.com/items_with_filtration"), HttpResponse(body=json.dumps([{"id": 101, "updated_at": "2024-07-14"}])), ) From 653022ba4bb230962a53a54586acba9646946599 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 18:40:51 +0000 Subject: [PATCH 23/49] fix: Remove unused pytest import Co-Authored-By: unknown <> --- unit_tests/sources/declarative/test_state_delegating_stream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index e3265af5c..926654b15 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -8,7 +8,6 @@ from unittest.mock import MagicMock import freezegun -import pytest from airbyte_cdk.models import ( AirbyteStateBlob, From 43dc47e91d22f8a2eb05267f105bacb1b0a8d42c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 18:51:44 +0000 Subject: [PATCH 24/49] fix: Raise ValueError for unparseable cursor datetime when api_retention_period is set Co-Authored-By: alfredo.garcia@airbyte.io --- .../parsers/model_to_component_factory.py | 10 +++++--- .../test_state_delegating_stream.py | 23 +++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index c2b69ac9f..7f0e0d528 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3609,9 +3609,9 @@ def _is_cursor_older_than_retention_period( Delegates cursor datetime extraction to cursor class instances via get_cursor_datetime_from_state, which handles format-specific parsing. - Returns True if the cursor is older than the retention period or if the cursor is - invalid/unparseable (should use full refresh). + Returns True if the cursor is older than the retention period (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). + Raises ValueError if the cursor datetime could not be parsed from state. """ # Skip retention check for concurrent state format (e.g. {"state_type": "date-range", "slices": [...]}). # The DatetimeBasedCursor used for the age check only handles sequential state format. @@ -3640,7 +3640,11 @@ def _is_cursor_older_than_retention_period( break if cursor_datetime is None: - return True + raise ValueError( + f"Stream '{stream_name}' has api_retention_period set to '{api_retention_period}' " + f"but the cursor datetime could not be parsed from state. Check that cursor_field " + f"and datetime_format match the state format." + ) retention_duration = parse_duration(api_retention_period) retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 926654b15..506d46b3e 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -8,6 +8,7 @@ from unittest.mock import MagicMock import freezegun +import pytest from airbyte_cdk.models import ( AirbyteStateBlob, @@ -553,3 +554,25 @@ def test_cursor_age_validation_skips_incrementing_count_cursor(): records = get_records(source, _CONFIG, configured_catalog, state) assert len(records) >= 0 + + +def test_cursor_age_validation_raises_error_for_unparseable_cursor(): + """Test that unparseable cursor datetime raises ValueError when api_retention_period is set.""" + manifest = _create_manifest_with_retention_period("P7D") + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(updated_at="not-a-date"), + ), + ) + ] + + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + + with pytest.raises(ValueError, match="could not be parsed from state"): + source.discover(logger=MagicMock(), config=_CONFIG) From 1531b3946c3622ddd02ba6d90a40ecb4e67e9acf Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 21:43:39 +0000 Subject: [PATCH 25/49] refactor: Use stream cursor for retention period check, remove legacy get_cursor_datetime_from_state - Rewrite create_state_delegating_stream to create actual stream object and extract cursor - Add model-level check for IncrementingCountCursor with api_retention_period - Delegate cursor datetime extraction to cursor's get_cursor_datetime_from_state method - Remove get_cursor_datetime_from_state from legacy cursors (DeclarativeCursor, DatetimeBasedCursor, PerPartitionCursor) - Remove factory helper methods (_create_cursor_for_age_check, _get_state_delegating_stream_model) - Update tests to match new behavior Co-Authored-By: gl_anatolii.yatsuk@airbyte.io --- .../incremental/datetime_based_cursor.py | 20 ---- .../incremental/declarative_cursor.py | 23 ---- .../incremental/per_partition_cursor.py | 18 --- .../parsers/model_to_component_factory.py | 110 ++++++------------ .../test_state_delegating_stream.py | 39 +++---- 5 files changed, 54 insertions(+), 156 deletions(-) diff --git a/airbyte_cdk/legacy/sources/declarative/incremental/datetime_based_cursor.py b/airbyte_cdk/legacy/sources/declarative/incremental/datetime_based_cursor.py index b542cb6e6..616a13d8c 100644 --- a/airbyte_cdk/legacy/sources/declarative/incremental/datetime_based_cursor.py +++ b/airbyte_cdk/legacy/sources/declarative/incremental/datetime_based_cursor.py @@ -212,26 +212,6 @@ def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[S # through each slice and does not belong to a specific slice. We just return stream state as it is. return self.get_stream_state() - def get_cursor_datetime_from_state( - self, stream_state: Mapping[str, Any] - ) -> Optional[datetime.datetime]: - """Extract and parse the cursor datetime from the given stream state. - - Returns the cursor datetime if present and parseable, otherwise returns None. - """ - cursor_field_key = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ - if cursor_field_key not in stream_state: - return None - - cursor_value = stream_state.get(cursor_field_key) - if not cursor_value: - return None - - try: - return self.parse_date(str(cursor_value)) - except ValueError: - return None - def _calculate_earliest_possible_value( self, end_datetime: datetime.datetime ) -> datetime.datetime: diff --git a/airbyte_cdk/legacy/sources/declarative/incremental/declarative_cursor.py b/airbyte_cdk/legacy/sources/declarative/incremental/declarative_cursor.py index 063d5a1f3..adb64d119 100644 --- a/airbyte_cdk/legacy/sources/declarative/incremental/declarative_cursor.py +++ b/airbyte_cdk/legacy/sources/declarative/incremental/declarative_cursor.py @@ -1,8 +1,6 @@ # Copyright (c) 2024 Airbyte, Inc., all rights reserved. -import datetime from abc import ABC -from typing import Any, Mapping, Optional from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer from airbyte_cdk.sources.streams.checkpoint.cursor import Cursor @@ -13,24 +11,3 @@ class DeclarativeCursor(Cursor, StreamSlicer, ABC): DeclarativeCursors are components that allow for checkpointing syncs. In addition to managing the fetching and updating of state, declarative cursors also manage stream slicing and injecting slice values into outbound requests. """ - - def get_cursor_datetime_from_state( - self, stream_state: Mapping[str, Any] - ) -> Optional[datetime.datetime]: - """Extract and parse the cursor datetime from the given stream state. - - This method is used by StateDelegatingStream to validate cursor age against - an API's data retention period. Subclasses should implement this method to - extract the cursor value from their specific state structure and parse it - into a datetime object. - - Returns None if the cursor cannot be extracted or parsed, which will cause - StateDelegatingStream to fall back to full refresh (safe default). - - Raises NotImplementedError by default - subclasses must implement this method - if they want to support cursor age validation with api_retention_period. - """ - raise NotImplementedError( - f"{self.__class__.__name__} does not implement get_cursor_datetime_from_state. " - f"Cursor age validation with api_retention_period is not supported for this cursor type." - ) diff --git a/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py b/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py index 85a749d67..23746e808 100644 --- a/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +++ b/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py @@ -2,7 +2,6 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # -import datetime import logging from collections import OrderedDict from typing import Any, Callable, Iterable, Mapping, Optional, Union @@ -212,23 +211,6 @@ def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[S return self._get_state_for_partition(stream_slice.partition) - def get_cursor_datetime_from_state( - self, stream_state: Mapping[str, Any] - ) -> Optional[datetime.datetime]: - """Extract and parse the cursor datetime from the global cursor in per-partition state. - - For per-partition cursors, the global cursor is stored under the "state" key. - This method delegates to the underlying cursor factory to parse the datetime. - - Returns None if the global cursor is not present or cannot be parsed. - """ - global_state = stream_state.get("state") - if not global_state or not isinstance(global_state, dict): - return None - - cursor = self._cursor_factory.create() - return cursor.get_cursor_datetime_from_state(global_state) - def _create_cursor(self, cursor_state: Any) -> DeclarativeCursor: cursor = self._cursor_factory.create() cursor.set_initial_state(cursor_state) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 7f0e0d528..09cfb53ea 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3565,86 +3565,70 @@ def create_state_delegating_stream( f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." ) - stream_model = self._get_state_delegating_stream_model( - False if has_parent_state is None else has_parent_state, model, config - ) - - return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is alwyas DeclarativeStreamModel + if model.api_retention_period: + for stream_model in (model.full_refresh_stream, model.incremental_stream): + if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel): + raise ValueError( + f"Stream '{model.name}' uses IncrementingCountCursor which is not supported " + f"with api_retention_period. IncrementingCountCursor does not use datetime-based " + f"cursors, so cursor age validation cannot be performed." + ) - def _get_state_delegating_stream_model( - self, has_parent_state: bool, model: StateDelegatingStreamModel, config: Config - ) -> DeclarativeStreamModel: stream_state = self._connector_state_manager.get_stream_state(model.name, None) + has_parent = False if has_parent_state is None else has_parent_state - if not stream_state and not has_parent_state: - return model.full_refresh_stream + if not stream_state and not has_parent: + return self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[no-any-return] + + incremental_stream: DefaultStream = self._create_component_from_model(model.incremental_stream, config=config, **kwargs) # type: ignore[assignment] if model.api_retention_period and stream_state: - incremental_sync_sources = [ - model.full_refresh_stream.incremental_sync, - model.incremental_stream.incremental_sync, - ] - incremental_sync_sources = [s for s in incremental_sync_sources if s is not None] - if incremental_sync_sources and self._is_cursor_older_than_retention_period( - stream_state, - incremental_sync_sources, - model.api_retention_period, - model.name, - config, + cursor = incremental_stream.cursor + if self._is_cursor_older_than_retention_period( + stream_state, cursor, model.api_retention_period, model.name ): - return model.full_refresh_stream + return self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[no-any-return] - return model.incremental_stream + return incremental_stream + @staticmethod def _is_cursor_older_than_retention_period( - self, stream_state: Mapping[str, Any], - incremental_sync_sources: list[Any], + cursor: Any, api_retention_period: str, stream_name: str, - config: Config, ) -> bool: """Check if the cursor value in the state is older than the API's retention period. - Delegates cursor datetime extraction to cursor class instances via - get_cursor_datetime_from_state, which handles format-specific parsing. + Delegates cursor datetime extraction to the cursor instance via + get_cursor_datetime_from_state. Returns True if the cursor is older than the retention period (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). - Raises ValueError if the cursor datetime could not be parsed from state. """ - # Skip retention check for concurrent state format (e.g. {"state_type": "date-range", "slices": [...]}). - # The DatetimeBasedCursor used for the age check only handles sequential state format. - # Today, is_sequential_state=True is hardcoded for all declarative cursors, so concurrent - # format state should never appear in practice. If that changes in the future, this guard - # prevents spurious full-refresh fallbacks until proper concurrent cursor delegation is added. - if "state_type" in stream_state or "slices" in stream_state: - return False - - datetime_cursor_sources = [ - s for s in incremental_sync_sources if isinstance(s, DatetimeBasedCursorModel) - ] - if not datetime_cursor_sources: - return False + if not hasattr(cursor, "get_cursor_datetime_from_state"): + raise SystemError( + f"Stream '{stream_name}' cursor type '{type(cursor).__name__}' does not have " + f"get_cursor_datetime_from_state method. Cursor age validation with " + f"api_retention_period is not supported for this cursor type." + ) - cursor_datetime: datetime.datetime | None = None - for incremental_sync in datetime_cursor_sources: - cursor = self._create_cursor_for_age_check(incremental_sync, config) + try: cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) - if cursor_datetime is not None: - break + except NotImplementedError: + raise SystemError( + f"Stream '{stream_name}' cursor type '{type(cursor).__name__}' does not implement " + f"get_cursor_datetime_from_state. Cursor age validation with " + f"api_retention_period is not supported for this cursor type." + ) + + if cursor_datetime is None: global_state = stream_state.get("state") if isinstance(global_state, dict): cursor_datetime = cursor.get_cursor_datetime_from_state(global_state) - if cursor_datetime is not None: - break if cursor_datetime is None: - raise ValueError( - f"Stream '{stream_name}' has api_retention_period set to '{api_retention_period}' " - f"but the cursor datetime could not be parsed from state. Check that cursor_field " - f"and datetime_format match the state format." - ) + return True retention_duration = parse_duration(api_retention_period) retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration @@ -3660,24 +3644,6 @@ def _is_cursor_older_than_retention_period( return False - @staticmethod - def _create_cursor_for_age_check( - model: DatetimeBasedCursorModel, config: Config - ) -> "DatetimeBasedCursor": - """Create a lightweight DatetimeBasedCursor for cursor age validation.""" - from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( - DatetimeBasedCursor as _DatetimeBasedCursor, - ) - - return _DatetimeBasedCursor( - start_datetime="2000-01-01T00:00:00Z", - cursor_field=model.cursor_field, - datetime_format=model.datetime_format, - config=config, - parameters=model.parameters or {}, - cursor_datetime_formats=model.cursor_datetime_formats or [], - ) - def _create_async_job_status_mapping( self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any ) -> Mapping[str, AsyncJobStatus]: diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 506d46b3e..d0a7f1f7d 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -527,33 +527,26 @@ def _create_manifest_with_incrementing_count_cursor(api_retention_period: str) - return manifest -def test_cursor_age_validation_skips_incrementing_count_cursor(): - """Test that IncrementingCountCursor with api_retention_period is silently skipped (no error, uses incremental).""" +def test_cursor_age_validation_raises_error_for_incrementing_count_cursor(): + """Test that IncrementingCountCursor with api_retention_period raises ValueError.""" manifest = _create_manifest_with_incrementing_count_cursor("P7D") - with HttpMocker() as http_mocker: - http_mocker.get( - HttpRequest(url="https://api.test.com/items_with_filtration"), - HttpResponse(body=json.dumps([{"id": 101, "updated_at": "2024-07-14"}])), + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(id=100), + ), ) + ] - state = [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), - stream_state=AirbyteStateBlob(id=100), - ), - ) - ] - - source = ConcurrentDeclarativeSource( - source_config=manifest, config=_CONFIG, catalog=None, state=state - ) - configured_catalog = create_configured_catalog(source, _CONFIG) + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) - records = get_records(source, _CONFIG, configured_catalog, state) - assert len(records) >= 0 + with pytest.raises(ValueError, match="IncrementingCountCursor"): + source.discover(logger=MagicMock(), config=_CONFIG) def test_cursor_age_validation_raises_error_for_unparseable_cursor(): @@ -574,5 +567,5 @@ def test_cursor_age_validation_raises_error_for_unparseable_cursor(): source_config=manifest, config=_CONFIG, catalog=None, state=state ) - with pytest.raises(ValueError, match="could not be parsed from state"): + with pytest.raises(ValueError, match="not-a-date"): source.discover(logger=MagicMock(), config=_CONFIG) From b4c24c68963bde8aea5accff8ab3ede2bccec7dc Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 22:03:21 +0000 Subject: [PATCH 26/49] fix: Try both full_refresh and incremental cursors for state parsing The state format may match either the full refresh or incremental cursor, so we need to try both when checking cursor age against retention period. Co-Authored-By: gl_anatolii.yatsuk@airbyte.io --- .../parsers/model_to_component_factory.py | 50 +++++++++++-------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 09cfb53ea..256383712 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3583,49 +3583,57 @@ def create_state_delegating_stream( incremental_stream: DefaultStream = self._create_component_from_model(model.incremental_stream, config=config, **kwargs) # type: ignore[assignment] if model.api_retention_period and stream_state: - cursor = incremental_stream.cursor + full_refresh_stream: DefaultStream = self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[assignment] + cursors = [full_refresh_stream.cursor, incremental_stream.cursor] if self._is_cursor_older_than_retention_period( - stream_state, cursor, model.api_retention_period, model.name + stream_state, cursors, model.api_retention_period, model.name ): - return self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[no-any-return] + return full_refresh_stream return incremental_stream @staticmethod def _is_cursor_older_than_retention_period( stream_state: Mapping[str, Any], - cursor: Any, + cursors: list[Any], api_retention_period: str, stream_name: str, ) -> bool: """Check if the cursor value in the state is older than the API's retention period. - Delegates cursor datetime extraction to the cursor instance via - get_cursor_datetime_from_state. + Tries each cursor's get_cursor_datetime_from_state to extract the cursor datetime, + since the state format may match either the full refresh or incremental cursor. Returns True if the cursor is older than the retention period (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ - if not hasattr(cursor, "get_cursor_datetime_from_state"): - raise SystemError( - f"Stream '{stream_name}' cursor type '{type(cursor).__name__}' does not have " - f"get_cursor_datetime_from_state method. Cursor age validation with " - f"api_retention_period is not supported for this cursor type." - ) + cursor_datetime: datetime.datetime | None = None + + for cursor in cursors: + if not hasattr(cursor, "get_cursor_datetime_from_state"): + raise SystemError( + f"Stream '{stream_name}' cursor type '{type(cursor).__name__}' does not have " + f"get_cursor_datetime_from_state method. Cursor age validation with " + f"api_retention_period is not supported for this cursor type." + ) - try: - cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) - except NotImplementedError: - raise SystemError( - f"Stream '{stream_name}' cursor type '{type(cursor).__name__}' does not implement " - f"get_cursor_datetime_from_state. Cursor age validation with " - f"api_retention_period is not supported for this cursor type." - ) + try: + cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) + except NotImplementedError: + raise SystemError( + f"Stream '{stream_name}' cursor type '{type(cursor).__name__}' does not implement " + f"get_cursor_datetime_from_state. Cursor age validation with " + f"api_retention_period is not supported for this cursor type." + ) + + if cursor_datetime is not None: + break - if cursor_datetime is None: global_state = stream_state.get("state") if isinstance(global_state, dict): cursor_datetime = cursor.get_cursor_datetime_from_state(global_state) + if cursor_datetime is not None: + break if cursor_datetime is None: return True From 67f9e602ce2d82a851c41080ff3d1e132ccaff18 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 22:09:52 +0000 Subject: [PATCH 27/49] fix: Remove per-partition state fallback, let cursor classes handle state extraction Co-Authored-By: gl_anatolii.yatsuk@airbyte.io --- .../parsers/model_to_component_factory.py | 6 ------ .../declarative/test_state_delegating_stream.py | 15 ++++++--------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 256383712..a77862ce8 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3629,12 +3629,6 @@ def _is_cursor_older_than_retention_period( if cursor_datetime is not None: break - global_state = stream_state.get("state") - if isinstance(global_state, dict): - cursor_datetime = cursor.get_cursor_datetime_from_state(global_state) - if cursor_datetime is not None: - break - if cursor_datetime is None: return True diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index d0a7f1f7d..1a7df9c07 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -457,21 +457,18 @@ def test_cursor_age_validation_with_per_partition_state_uses_global_cursor(): @freezegun.freeze_time("2024-07-15") -def test_cursor_age_validation_with_per_partition_state_within_retention(): - """Test per-partition state with global cursor within retention uses incremental. +def test_cursor_age_validation_with_per_partition_state_falls_back_to_full_refresh(): + """Test that per-partition state falls back to full refresh. - This test verifies that when the global cursor in a per-partition state structure - is within the retention period, the incremental stream is selected (not full refresh). - We verify this by checking that the incremental endpoint is called, not the full refresh one. + When per-partition state is provided but the stream uses a ConcurrentCursor (not + ConcurrentPerPartitionCursor), the cursor cannot extract a datetime from the + per-partition format and returns None, causing a full refresh fallback. """ manifest = _create_manifest_with_retention_period("P30D") with HttpMocker() as http_mocker: http_mocker.get( - HttpRequest( - url="https://api.test.com/items_with_filtration", - query_params={"start": "2024-07-01", "end": "2024-07-15"}, - ), + HttpRequest(url="https://api.test.com/items"), HttpResponse( body=json.dumps( [ From 8608b5faae277dd81c83043cc2dcc74922d370ae Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 22:20:28 +0000 Subject: [PATCH 28/49] fix: Re-add _get_state_delegating_stream_model and fix ruff format Co-Authored-By: alfredo.garcia@airbyte.io --- .../parsers/model_to_component_factory.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index a77862ce8..677fb997d 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3578,12 +3578,18 @@ def create_state_delegating_stream( has_parent = False if has_parent_state is None else has_parent_state if not stream_state and not has_parent: - return self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[no-any-return] + return self._create_component_from_model( + model.full_refresh_stream, config=config, **kwargs + ) # type: ignore[no-any-return] - incremental_stream: DefaultStream = self._create_component_from_model(model.incremental_stream, config=config, **kwargs) # type: ignore[assignment] + incremental_stream: DefaultStream = self._create_component_from_model( + model.incremental_stream, config=config, **kwargs + ) # type: ignore[assignment] if model.api_retention_period and stream_state: - full_refresh_stream: DefaultStream = self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[assignment] + full_refresh_stream: DefaultStream = self._create_component_from_model( + model.full_refresh_stream, config=config, **kwargs + ) # type: ignore[assignment] cursors = [full_refresh_stream.cursor, incremental_stream.cursor] if self._is_cursor_older_than_retention_period( stream_state, cursors, model.api_retention_period, model.name @@ -3646,6 +3652,15 @@ def _is_cursor_older_than_retention_period( return False + def _get_state_delegating_stream_model( + self, has_parent_state: bool, model: StateDelegatingStreamModel, config: Config + ) -> DeclarativeStreamModel: + return ( + model.incremental_stream + if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state + else model.full_refresh_stream + ) + def _create_async_job_status_mapping( self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any ) -> Mapping[str, AsyncJobStatus]: From 8faa0ae6528c178a88610b7dad90ba1f83eb294e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 22:35:27 +0000 Subject: [PATCH 29/49] Revert "fix: Re-add _get_state_delegating_stream_model and fix ruff format" This reverts commit 8608b5faae277dd81c83043cc2dcc74922d370ae. --- .../parsers/model_to_component_factory.py | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 677fb997d..a77862ce8 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3578,18 +3578,12 @@ def create_state_delegating_stream( has_parent = False if has_parent_state is None else has_parent_state if not stream_state and not has_parent: - return self._create_component_from_model( - model.full_refresh_stream, config=config, **kwargs - ) # type: ignore[no-any-return] + return self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[no-any-return] - incremental_stream: DefaultStream = self._create_component_from_model( - model.incremental_stream, config=config, **kwargs - ) # type: ignore[assignment] + incremental_stream: DefaultStream = self._create_component_from_model(model.incremental_stream, config=config, **kwargs) # type: ignore[assignment] if model.api_retention_period and stream_state: - full_refresh_stream: DefaultStream = self._create_component_from_model( - model.full_refresh_stream, config=config, **kwargs - ) # type: ignore[assignment] + full_refresh_stream: DefaultStream = self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[assignment] cursors = [full_refresh_stream.cursor, incremental_stream.cursor] if self._is_cursor_older_than_retention_period( stream_state, cursors, model.api_retention_period, model.name @@ -3652,15 +3646,6 @@ def _is_cursor_older_than_retention_period( return False - def _get_state_delegating_stream_model( - self, has_parent_state: bool, model: StateDelegatingStreamModel, config: Config - ) -> DeclarativeStreamModel: - return ( - model.incremental_stream - if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state - else model.full_refresh_stream - ) - def _create_async_job_status_mapping( self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any ) -> Mapping[str, AsyncJobStatus]: From ea7a757890132134f120605e25ad107ddaf56a32 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 22:36:01 +0000 Subject: [PATCH 30/49] fix: ruff format long lines in create_state_delegating_stream Co-Authored-By: alfredo.garcia@airbyte.io --- .../parsers/model_to_component_factory.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index a77862ce8..8c2375ef9 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3578,12 +3578,18 @@ def create_state_delegating_stream( has_parent = False if has_parent_state is None else has_parent_state if not stream_state and not has_parent: - return self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[no-any-return] + return self._create_component_from_model( + model.full_refresh_stream, config=config, **kwargs + ) # type: ignore[no-any-return] - incremental_stream: DefaultStream = self._create_component_from_model(model.incremental_stream, config=config, **kwargs) # type: ignore[assignment] + incremental_stream: DefaultStream = self._create_component_from_model( + model.incremental_stream, config=config, **kwargs + ) # type: ignore[assignment] if model.api_retention_period and stream_state: - full_refresh_stream: DefaultStream = self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[assignment] + full_refresh_stream: DefaultStream = self._create_component_from_model( + model.full_refresh_stream, config=config, **kwargs + ) # type: ignore[assignment] cursors = [full_refresh_stream.cursor, incremental_stream.cursor] if self._is_cursor_older_than_retention_period( stream_state, cursors, model.api_retention_period, model.name From 714c6670f252c37111cd60b2728678418c07ab5a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 22:56:06 +0000 Subject: [PATCH 31/49] fix: Restore _get_state_delegating_stream_model and fix MyPy errors Co-Authored-By: gl_anatolii.yatsuk@airbyte.io --- .../parsers/model_to_component_factory.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 8c2375ef9..788681d41 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3578,9 +3578,7 @@ def create_state_delegating_stream( has_parent = False if has_parent_state is None else has_parent_state if not stream_state and not has_parent: - return self._create_component_from_model( - model.full_refresh_stream, config=config, **kwargs - ) # type: ignore[no-any-return] + return self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[no-any-return] incremental_stream: DefaultStream = self._create_component_from_model( model.incremental_stream, config=config, **kwargs @@ -3652,6 +3650,16 @@ def _is_cursor_older_than_retention_period( return False + def _get_state_delegating_stream_model( + self, has_parent_state: bool, model: StateDelegatingStreamModel + ) -> DeclarativeStreamModel: + """Return the appropriate underlying stream model based on state.""" + return ( + model.incremental_stream + if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state + else model.full_refresh_stream + ) + def _create_async_job_status_mapping( self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any ) -> Mapping[str, AsyncJobStatus]: @@ -4055,7 +4063,7 @@ def _instantiate_parent_stream_state_manager( model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream if isinstance(model.stream, DeclarativeStreamModel) else self._get_state_delegating_stream_model( - has_parent_state, model.stream, config + has_parent_state, model.stream ).incremental_sync ) cursor_field = InterpolatedString.create( From 16a895e37fe1dda2f80eb7eda77f5b9a75501ae2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 19 Feb 2026 00:03:31 +0000 Subject: [PATCH 32/49] fix: Handle FinalStateCursor gracefully and detect final-state for retention check - In _is_cursor_older_than_retention_period, continue to next cursor when NotImplementedError is raised instead of crashing with SystemError - Detect final state (NO_CURSOR_STATE_KEY) and skip retention check, returning incremental stream directly - Add test for final-state detection with api_retention_period Co-Authored-By: alfredo.garcia@airbyte.io --- .../parsers/model_to_component_factory.py | 22 +++++------ .../test_state_delegating_stream.py | 37 +++++++++++++++++++ 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 788681d41..380b133e2 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -618,6 +618,7 @@ NoopMessageRepository, ) from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository +from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY from airbyte_cdk.sources.streams.call_rate import ( APIBudget, FixedWindowCallRatePolicy, @@ -3578,7 +3579,14 @@ def create_state_delegating_stream( has_parent = False if has_parent_state is None else has_parent_state if not stream_state and not has_parent: - return self._create_component_from_model(model.full_refresh_stream, config=config, **kwargs) # type: ignore[no-any-return] + return self._create_component_from_model( # type: ignore[no-any-return] + model.full_refresh_stream, config=config, **kwargs + ) + + if stream_state and stream_state.get(NO_CURSOR_STATE_KEY): + return self._create_component_from_model( # type: ignore[no-any-return] + model.incremental_stream, config=config, **kwargs + ) incremental_stream: DefaultStream = self._create_component_from_model( model.incremental_stream, config=config, **kwargs @@ -3615,20 +3623,12 @@ def _is_cursor_older_than_retention_period( for cursor in cursors: if not hasattr(cursor, "get_cursor_datetime_from_state"): - raise SystemError( - f"Stream '{stream_name}' cursor type '{type(cursor).__name__}' does not have " - f"get_cursor_datetime_from_state method. Cursor age validation with " - f"api_retention_period is not supported for this cursor type." - ) + continue try: cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) except NotImplementedError: - raise SystemError( - f"Stream '{stream_name}' cursor type '{type(cursor).__name__}' does not implement " - f"get_cursor_datetime_from_state. Cursor age validation with " - f"api_retention_period is not supported for this cursor type." - ) + continue if cursor_datetime is not None: break diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 1a7df9c07..e8a5573b5 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -566,3 +566,40 @@ def test_cursor_age_validation_raises_error_for_unparseable_cursor(): with pytest.raises(ValueError, match="not-a-date"): source.discover(logger=MagicMock(), config=_CONFIG) + + +@freezegun.freeze_time("2024-07-15") +def test_final_state_cursor_skips_retention_check_and_uses_incremental(): + """When state is a final state from FinalStateCursor, skip retention check and use incremental.""" + manifest = _create_manifest_with_retention_period("P7D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest( + url="https://api.test.com/items_with_filtration?start=2024-07-01&end=2024-07-15" + ), + HttpResponse( + body=json.dumps( + [ + {"id": 1, "name": "item_1", "updated_at": "2024-07-14"}, + ] + ) + ), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(__ab_no_cursor_state_message=True), + ), + ) + ] + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + records = get_records(source, _CONFIG, configured_catalog, state) + assert len(records) == 1 From bddc671b53062fcc7db1fae142046836298feb68 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:12:10 +0000 Subject: [PATCH 33/49] refactor: Move FinalStateCursor handling to cursor classes, replace has_parent_state with actual parent state - FinalStateCursor.get_cursor_datetime_from_state returns now(utc) - ConcurrentCursor handles NO_CURSOR_STATE_KEY sentinel in get_cursor_datetime_from_state - Remove explicit NO_CURSOR_STATE_KEY check from factory create_state_delegating_stream - Replace has_parent_state: bool with parent_state: Optional[Mapping] in factory methods - Rename local variable to extracted_parent_state to avoid shadowing parameter Co-Authored-By: gl_anatolii.yatsuk@airbyte.io --- .../parsers/model_to_component_factory.py | 48 +++++++------------ .../sources/streams/concurrent/cursor.py | 9 ++++ 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 380b133e2..5566a168f 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -618,7 +618,6 @@ NoopMessageRepository, ) from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository -from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY from airbyte_cdk.sources.streams.call_rate import ( APIBudget, FixedWindowCallRatePolicy, @@ -3555,7 +3554,6 @@ def create_state_delegating_stream( self, model: StateDelegatingStreamModel, config: Config, - has_parent_state: Optional[bool] = None, **kwargs: Any, ) -> DefaultStream: if ( @@ -3576,23 +3574,17 @@ def create_state_delegating_stream( ) stream_state = self._connector_state_manager.get_stream_state(model.name, None) - has_parent = False if has_parent_state is None else has_parent_state - if not stream_state and not has_parent: + if not stream_state: return self._create_component_from_model( # type: ignore[no-any-return] model.full_refresh_stream, config=config, **kwargs ) - if stream_state and stream_state.get(NO_CURSOR_STATE_KEY): - return self._create_component_from_model( # type: ignore[no-any-return] - model.incremental_stream, config=config, **kwargs - ) - incremental_stream: DefaultStream = self._create_component_from_model( model.incremental_stream, config=config, **kwargs ) # type: ignore[assignment] - if model.api_retention_period and stream_state: + if model.api_retention_period: full_refresh_stream: DefaultStream = self._create_component_from_model( model.full_refresh_stream, config=config, **kwargs ) # type: ignore[assignment] @@ -3651,12 +3643,15 @@ def _is_cursor_older_than_retention_period( return False def _get_state_delegating_stream_model( - self, has_parent_state: bool, model: StateDelegatingStreamModel + self, + model: StateDelegatingStreamModel, + parent_state: Optional[Mapping[str, Any]] = None, ) -> DeclarativeStreamModel: """Return the appropriate underlying stream model based on state.""" return ( model.incremental_stream - if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state + if self._connector_state_manager.get_stream_state(model.name, None) + or parent_state else model.full_refresh_stream ) @@ -3987,17 +3982,13 @@ def create_substream_partition_router( def create_parent_stream_config_with_substream_wrapper( self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any ) -> Any: - # getting the parent state child_state = self._connector_state_manager.get_stream_state(stream_name, None) - # This flag will be used exclusively for StateDelegatingStream when a parent stream is created - has_parent_state = bool( - self._connector_state_manager.get_stream_state(stream_name, None) - if model.incremental_dependency - else False + parent_state: Optional[Mapping[str, Any]] = ( + child_state if model.incremental_dependency and child_state else None ) connector_state_manager = self._instantiate_parent_stream_state_manager( - child_state, config, model, has_parent_state + child_state, config, model, parent_state ) substream_factory = ModelToComponentFactory( @@ -4029,7 +4020,7 @@ def _instantiate_parent_stream_state_manager( child_state: MutableMapping[str, Any], config: Config, model: ParentStreamConfigModel, - has_parent_state: bool, + parent_state: Optional[Mapping[str, Any]] = None, ) -> ConnectorStateManager: """ With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the @@ -4041,21 +4032,18 @@ def _instantiate_parent_stream_state_manager( """ if model.incremental_dependency and child_state: parent_stream_name = model.stream.name or "" - parent_state = ConcurrentPerPartitionCursor.get_parent_state( + extracted_parent_state = ConcurrentPerPartitionCursor.get_parent_state( child_state, parent_stream_name ) - if not parent_state: - # there are two migration cases: state value from child stream or from global state - parent_state = ConcurrentPerPartitionCursor.get_global_state( + if not extracted_parent_state: + extracted_parent_state = ConcurrentPerPartitionCursor.get_global_state( child_state, parent_stream_name ) - if not parent_state and not isinstance(parent_state, dict): + if not extracted_parent_state and not isinstance(extracted_parent_state, dict): cursor_values = child_state.values() if cursor_values and len(cursor_values) == 1: - # We assume the child state is a pair `{: }` and we will use the - # cursor value as a parent state. incremental_sync_model: Union[ DatetimeBasedCursorModel, IncrementingCountCursorModel, @@ -4063,14 +4051,14 @@ def _instantiate_parent_stream_state_manager( model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream if isinstance(model.stream, DeclarativeStreamModel) else self._get_state_delegating_stream_model( - has_parent_state, model.stream + model.stream, parent_state=parent_state ).incremental_sync ) cursor_field = InterpolatedString.create( incremental_sync_model.cursor_field, parameters=incremental_sync_model.parameters or {}, ).eval(config) - parent_state = AirbyteStateMessage( + extracted_parent_state = AirbyteStateMessage( type=AirbyteStateType.STREAM, stream=AirbyteStreamState( stream_descriptor=StreamDescriptor( @@ -4081,7 +4069,7 @@ def _instantiate_parent_stream_state_manager( ), ), ) - return ConnectorStateManager([parent_state] if parent_state else []) + return ConnectorStateManager([extracted_parent_state] if extracted_parent_state else []) return ConnectorStateManager([]) diff --git a/airbyte_cdk/sources/streams/concurrent/cursor.py b/airbyte_cdk/sources/streams/concurrent/cursor.py index 806bc25cf..09d088410 100644 --- a/airbyte_cdk/sources/streams/concurrent/cursor.py +++ b/airbyte_cdk/sources/streams/concurrent/cursor.py @@ -156,6 +156,12 @@ def ensure_at_least_one_state_emitted(self) -> None: def should_be_synced(self, record: Record) -> bool: return True + def get_cursor_datetime_from_state( + self, stream_state: Mapping[str, Any] + ) -> datetime.datetime | None: + """FinalStateCursor indicates a completed full refresh; cursor is always current.""" + return datetime.datetime.now(datetime.timezone.utc) + class ConcurrentCursor(Cursor): _START_BOUNDARY = 0 @@ -602,6 +608,9 @@ def get_cursor_datetime_from_state( Returns the cursor datetime if present and parseable, otherwise returns None. """ + if stream_state.get(NO_CURSOR_STATE_KEY): + return datetime.datetime.now(datetime.timezone.utc) + # Check if state is in concurrent format (need to convert to dict for type compatibility) mutable_state: MutableMapping[str, Any] = dict(stream_state) if self._connector_state_converter.is_state_message_compatible(mutable_state): From 8828eeae2f36487e95b652d64db1bec2fecdc80c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:36:53 +0000 Subject: [PATCH 34/49] refactor: Clean NO_CURSOR_STATE_KEY from ConcurrentCursor, add tests for FinalStateCursor and parent retention Co-Authored-By: gl_anatolii.yatsuk@airbyte.io --- .../parsers/model_to_component_factory.py | 4 + .../sources/streams/concurrent/cursor.py | 3 - .../test_state_delegating_stream.py | 212 ++++++++++++++++++ .../sources/streams/concurrent/test_cursor.py | 9 + 4 files changed, 225 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 5566a168f..e9e7360ab 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -618,6 +618,7 @@ NoopMessageRepository, ) from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository +from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY from airbyte_cdk.sources.streams.call_rate import ( APIBudget, FixedWindowCallRatePolicy, @@ -3611,6 +3612,9 @@ def _is_cursor_older_than_retention_period( Returns True if the cursor is older than the retention period (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ + if stream_state.get(NO_CURSOR_STATE_KEY): + return False + cursor_datetime: datetime.datetime | None = None for cursor in cursors: diff --git a/airbyte_cdk/sources/streams/concurrent/cursor.py b/airbyte_cdk/sources/streams/concurrent/cursor.py index 09d088410..3ae935843 100644 --- a/airbyte_cdk/sources/streams/concurrent/cursor.py +++ b/airbyte_cdk/sources/streams/concurrent/cursor.py @@ -608,9 +608,6 @@ def get_cursor_datetime_from_state( Returns the cursor datetime if present and parseable, otherwise returns None. """ - if stream_state.get(NO_CURSOR_STATE_KEY): - return datetime.datetime.now(datetime.timezone.utc) - # Check if state is in concurrent format (need to convert to dict for type compatibility) mutable_state: MutableMapping[str, Any] = dict(stream_state) if self._connector_state_converter.is_state_message_compatible(mutable_state): diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index e8a5573b5..7b943c195 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -603,3 +603,215 @@ def test_final_state_cursor_skips_retention_check_and_uses_incremental(): records = get_records(source, _CONFIG, configured_catalog, state) assert len(records) == 1 + + +_PARENT_CHILD_MANIFEST: dict = { + "version": "6.0.0", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["ChildStream"]}, + "definitions": { + "ParentStream": { + "type": "StateDelegatingStream", + "name": "ParentStream", + "full_refresh_stream": { + "type": "DeclarativeStream", + "name": "ParentStream", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": {}, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.test.com", + "path": "/parents", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + }, + "incremental_sync": { + "type": "DatetimeBasedCursor", + "start_datetime": { + "datetime": "{{ format_datetime(config['start_date'], '%Y-%m-%d') }}" + }, + "end_datetime": {"datetime": "{{ now_utc().strftime('%Y-%m-%d') }}"}, + "datetime_format": "%Y-%m-%d", + "cursor_datetime_formats": ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"], + "cursor_field": "updated_at", + }, + }, + "incremental_stream": { + "type": "DeclarativeStream", + "name": "ParentStream", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": {}, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.test.com", + "path": "/parents_incremental", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + }, + "incremental_sync": { + "type": "DatetimeBasedCursor", + "start_datetime": { + "datetime": "{{ format_datetime(config['start_date'], '%Y-%m-%d') }}" + }, + "end_datetime": {"datetime": "{{ now_utc().strftime('%Y-%m-%d') }}"}, + "datetime_format": "%Y-%m-%d", + "cursor_datetime_formats": ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"], + "cursor_granularity": "P1D", + "step": "P15D", + "cursor_field": "updated_at", + "start_time_option": { + "type": "RequestOption", + "field_name": "start", + "inject_into": "request_parameter", + }, + "end_time_option": { + "type": "RequestOption", + "field_name": "end", + "inject_into": "request_parameter", + }, + }, + }, + }, + "ChildStream": { + "type": "DeclarativeStream", + "name": "ChildStream", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": {}, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.test.com", + "path": "/children/{{ stream_slice.parent_id }}", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "partition_router": { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + { + "stream": "#/definitions/ParentStream", + "parent_key": "id", + "partition_field": "parent_id", + "incremental_dependency": True, + } + ], + }, + }, + "incremental_sync": { + "type": "DatetimeBasedCursor", + "start_datetime": { + "datetime": "{{ format_datetime(config['start_date'], '%Y-%m-%d') }}" + }, + "end_datetime": {"datetime": "{{ now_utc().strftime('%Y-%m-%d') }}"}, + "datetime_format": "%Y-%m-%d", + "cursor_datetime_formats": ["%Y-%m-%d"], + "cursor_field": "updated_at", + }, + }, + }, + "streams": [{"$ref": "#/definitions/ChildStream"}], + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": [], + "properties": {}, + "additionalProperties": True, + }, + "documentation_url": "https://example.org", + "type": "Spec", + }, +} + + +def _create_parent_child_manifest_with_retention_period( + api_retention_period: str, +) -> dict: + manifest = copy.deepcopy(_PARENT_CHILD_MANIFEST) + manifest["definitions"]["ParentStream"]["api_retention_period"] = api_retention_period + return manifest + + +@freezegun.freeze_time("2024-07-15") +def test_parent_state_delegating_stream_retention_falls_back_to_full_refresh(): + """When parent StateDelegatingStream has old cursor in child state, retention triggers full refresh for parent.""" + manifest = _create_parent_child_manifest_with_retention_period("P7D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/parents"), + HttpResponse( + body=json.dumps( + [{"id": 1, "name": "parent_1", "updated_at": "2024-07-14"}] + ) + ), + ) + http_mocker.get( + HttpRequest(url="https://api.test.com/children/1"), + HttpResponse( + body=json.dumps( + [{"id": 10, "name": "child_1", "updated_at": "2024-07-14"}] + ) + ), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor( + name="ChildStream", namespace=None + ), + stream_state=AirbyteStateBlob( + use_global_cursor=False, + state={"updated_at": "2024-07-14"}, + states=[], + parent_state={"ParentStream": {"updated_at": "2024-06-01"}}, + lookback_window=0, + ), + ), + ) + ] + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + records = get_records(source, _CONFIG, configured_catalog, state) + assert len(records) == 1 diff --git a/unit_tests/sources/streams/concurrent/test_cursor.py b/unit_tests/sources/streams/concurrent/test_cursor.py index 34c92800d..bfce44933 100644 --- a/unit_tests/sources/streams/concurrent/test_cursor.py +++ b/unit_tests/sources/streams/concurrent/test_cursor.py @@ -13,6 +13,7 @@ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY from airbyte_cdk.sources.streams.concurrent.clamping import ( ClampingEndProvider, ClampingStrategy, @@ -24,6 +25,7 @@ ConcurrentCursor, CursorField, CursorValueType, + FinalStateCursor, ) from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import ( @@ -1387,3 +1389,10 @@ def test_given_partitioned_state_with_multiple_slices_when_should_be_synced_then ) == True ) + + +@freezegun.freeze_time("2024-07-15") +def test_final_state_cursor_get_cursor_datetime_from_state_returns_current_datetime(): + cursor = FinalStateCursor("test_stream", None, Mock(spec=MessageRepository)) + result = cursor.get_cursor_datetime_from_state({NO_CURSOR_STATE_KEY: True}) + assert result == datetime(2024, 7, 15, tzinfo=timezone.utc) From 6b65b7ad42c8a86caaa2e352b732cd2013b181fd Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:09:55 +0000 Subject: [PATCH 35/49] style: Fix ruff format issues in factory and test files Co-Authored-By: gl_anatolii.yatsuk@airbyte.io --- .../parsers/model_to_component_factory.py | 3 +-- .../declarative/test_state_delegating_stream.py | 12 +++--------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index e9e7360ab..a290b4102 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3654,8 +3654,7 @@ def _get_state_delegating_stream_model( """Return the appropriate underlying stream model based on state.""" return ( model.incremental_stream - if self._connector_state_manager.get_stream_state(model.name, None) - or parent_state + if self._connector_state_manager.get_stream_state(model.name, None) or parent_state else model.full_refresh_stream ) diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 7b943c195..146606fd8 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -778,17 +778,13 @@ def test_parent_state_delegating_stream_retention_falls_back_to_full_refresh(): http_mocker.get( HttpRequest(url="https://api.test.com/parents"), HttpResponse( - body=json.dumps( - [{"id": 1, "name": "parent_1", "updated_at": "2024-07-14"}] - ) + body=json.dumps([{"id": 1, "name": "parent_1", "updated_at": "2024-07-14"}]) ), ) http_mocker.get( HttpRequest(url="https://api.test.com/children/1"), HttpResponse( - body=json.dumps( - [{"id": 10, "name": "child_1", "updated_at": "2024-07-14"}] - ) + body=json.dumps([{"id": 10, "name": "child_1", "updated_at": "2024-07-14"}]) ), ) @@ -796,9 +792,7 @@ def test_parent_state_delegating_stream_retention_falls_back_to_full_refresh(): AirbyteStateMessage( type=AirbyteStateType.STREAM, stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor( - name="ChildStream", namespace=None - ), + stream_descriptor=StreamDescriptor(name="ChildStream", namespace=None), stream_state=AirbyteStateBlob( use_global_cursor=False, state={"updated_at": "2024-07-14"}, From 17f857a25b9493a4a145c40b47e6086356579f20 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:51:55 +0000 Subject: [PATCH 36/49] fix: Raise error for incompatible cursor types with api_retention_period Co-Authored-By: unknown <> --- .../parsers/model_to_component_factory.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index a290b4102..59a48cd5f 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3619,12 +3619,20 @@ def _is_cursor_older_than_retention_period( for cursor in cursors: if not hasattr(cursor, "get_cursor_datetime_from_state"): - continue + raise ValueError( + f"Stream '{stream_name}' uses a cursor type ({type(cursor).__name__}) that does not " + f"support cursor age validation. The cursor must implement get_cursor_datetime_from_state " + f"to use api_retention_period." + ) try: cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) except NotImplementedError: - continue + raise ValueError( + f"Stream '{stream_name}' uses a cursor type ({type(cursor).__name__}) that does not " + f"implement cursor age validation. The cursor's get_cursor_datetime_from_state method " + f"raised NotImplementedError. Remove api_retention_period or use a compatible cursor type." + ) if cursor_datetime is not None: break From 116339554e63b779f2c5c27f46f40124d69e936a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 19 Feb 2026 20:26:25 +0000 Subject: [PATCH 37/49] refactor: Simplify cursor age validation per brianjlai's review - Remove loop over cursors, just check incremental cursor directly - Change FinalStateCursor.get_cursor_datetime_from_state to return None (full refresh doesn't track cursor datetime, NO_CURSOR_STATE_KEY is handled separately in _is_cursor_older_than_retention_period) - Remove hasattr check since all cursors implement the method - Update docstrings to explain the design - Fixes bug where FinalStateCursor returning now() would short-circuit before checking the incremental cursor Co-Authored-By: unknown <> --- .../parsers/model_to_component_factory.py | 42 +++++-------------- .../sources/streams/concurrent/cursor.py | 10 ++++- .../sources/streams/concurrent/test_cursor.py | 10 +++-- 3 files changed, 26 insertions(+), 36 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 59a48cd5f..221e1ff85 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3586,58 +3586,38 @@ def create_state_delegating_stream( ) # type: ignore[assignment] if model.api_retention_period: - full_refresh_stream: DefaultStream = self._create_component_from_model( - model.full_refresh_stream, config=config, **kwargs - ) # type: ignore[assignment] - cursors = [full_refresh_stream.cursor, incremental_stream.cursor] if self._is_cursor_older_than_retention_period( - stream_state, cursors, model.api_retention_period, model.name + stream_state, + incremental_stream.cursor, + model.api_retention_period, + model.name, ): - return full_refresh_stream + return self._create_component_from_model( # type: ignore[no-any-return] + model.full_refresh_stream, config=config, **kwargs + ) return incremental_stream @staticmethod def _is_cursor_older_than_retention_period( stream_state: Mapping[str, Any], - cursors: list[Any], + cursor: Any, api_retention_period: str, stream_name: str, ) -> bool: """Check if the cursor value in the state is older than the API's retention period. - Tries each cursor's get_cursor_datetime_from_state to extract the cursor datetime, - since the state format may match either the full refresh or incremental cursor. - Returns True if the cursor is older than the retention period (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ + # FinalStateCursor state format - previous sync was a completed full refresh if stream_state.get(NO_CURSOR_STATE_KEY): return False - cursor_datetime: datetime.datetime | None = None - - for cursor in cursors: - if not hasattr(cursor, "get_cursor_datetime_from_state"): - raise ValueError( - f"Stream '{stream_name}' uses a cursor type ({type(cursor).__name__}) that does not " - f"support cursor age validation. The cursor must implement get_cursor_datetime_from_state " - f"to use api_retention_period." - ) - - try: - cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) - except NotImplementedError: - raise ValueError( - f"Stream '{stream_name}' uses a cursor type ({type(cursor).__name__}) that does not " - f"implement cursor age validation. The cursor's get_cursor_datetime_from_state method " - f"raised NotImplementedError. Remove api_retention_period or use a compatible cursor type." - ) - - if cursor_datetime is not None: - break + cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) if cursor_datetime is None: + # Cursor couldn't parse the state - fall back to full refresh to be safe return True retention_duration = parse_duration(api_retention_period) diff --git a/airbyte_cdk/sources/streams/concurrent/cursor.py b/airbyte_cdk/sources/streams/concurrent/cursor.py index 3ae935843..5da805548 100644 --- a/airbyte_cdk/sources/streams/concurrent/cursor.py +++ b/airbyte_cdk/sources/streams/concurrent/cursor.py @@ -159,8 +159,14 @@ def should_be_synced(self, record: Record) -> bool: def get_cursor_datetime_from_state( self, stream_state: Mapping[str, Any] ) -> datetime.datetime | None: - """FinalStateCursor indicates a completed full refresh; cursor is always current.""" - return datetime.datetime.now(datetime.timezone.utc) + """FinalStateCursor has no cursor datetime. + + Full refresh streams don't track a cursor position - they always read all data. + The FinalStateCursor state format ({NO_CURSOR_STATE_KEY: True}) is handled + separately in _is_cursor_older_than_retention_period before this method is called. + Returns None to indicate this cursor cannot parse datetime-based state. + """ + return None class ConcurrentCursor(Cursor): diff --git a/unit_tests/sources/streams/concurrent/test_cursor.py b/unit_tests/sources/streams/concurrent/test_cursor.py index bfce44933..fafd18bb0 100644 --- a/unit_tests/sources/streams/concurrent/test_cursor.py +++ b/unit_tests/sources/streams/concurrent/test_cursor.py @@ -1391,8 +1391,12 @@ def test_given_partitioned_state_with_multiple_slices_when_should_be_synced_then ) -@freezegun.freeze_time("2024-07-15") -def test_final_state_cursor_get_cursor_datetime_from_state_returns_current_datetime(): +def test_final_state_cursor_get_cursor_datetime_from_state_returns_none(): + """FinalStateCursor returns None because full refresh doesn't track a cursor datetime. + + The NO_CURSOR_STATE_KEY state format is handled separately in + _is_cursor_older_than_retention_period before get_cursor_datetime_from_state is called. + """ cursor = FinalStateCursor("test_stream", None, Mock(spec=MessageRepository)) result = cursor.get_cursor_datetime_from_state({NO_CURSOR_STATE_KEY: True}) - assert result == datetime(2024, 7, 15, tzinfo=timezone.utc) + assert result is None From acd7156a75c2c8d156f3658d2d49ff15dddbf439 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 19 Feb 2026 20:57:39 +0000 Subject: [PATCH 38/49] fix: Use Cursor type instead of Any for cursor parameter Co-Authored-By: unknown <> --- .../sources/declarative/parsers/model_to_component_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 221e1ff85..15ae5d590 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3601,7 +3601,7 @@ def create_state_delegating_stream( @staticmethod def _is_cursor_older_than_retention_period( stream_state: Mapping[str, Any], - cursor: Any, + cursor: Cursor, api_retention_period: str, stream_name: str, ) -> bool: From 8afe8e1d5a3c83c0477b30a116363179e428e575 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 20 Feb 2026 15:39:23 +0000 Subject: [PATCH 39/49] fix: Clear state when falling back to full refresh due to stale cursor When the cursor is older than the API retention period and we fall back to full refresh, clear the stream state and emit an empty state message to the platform. This ensures the platform does not retain stale state that would cause missed records on subsequent syncs. Co-Authored-By: alfredo.garcia@airbyte.io --- .../parsers/model_to_component_factory.py | 7 +++ .../test_state_delegating_stream.py | 46 +++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 15ae5d590..de6903244 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3592,6 +3592,13 @@ def create_state_delegating_stream( model.api_retention_period, model.name, ): + self._connector_state_manager.update_state_for_stream( + model.name, None, {} + ) + state_message = self._connector_state_manager.create_state_message( + model.name, None + ) + self._message_repository.emit_message(state_message) return self._create_component_from_model( # type: ignore[no-any-return] model.full_refresh_stream, config=config, **kwargs ) diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 146606fd8..df913b188 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -305,6 +305,52 @@ def test_cursor_age_validation_falls_back_to_full_refresh_when_cursor_too_old(): assert expected == records +@freezegun.freeze_time("2024-07-15") +def test_cursor_age_validation_clears_state_when_falling_back_to_full_refresh(): + """Test that state is cleared when cursor is older than retention period.""" + manifest = _create_manifest_with_retention_period("P7D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/items"), + HttpResponse( + body=json.dumps( + [ + {"id": 1, "name": "item_1", "updated_at": "2024-07-13"}, + {"id": 2, "name": "item_2", "updated_at": "2024-07-14"}, + ] + ) + ), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(updated_at="2024-07-01"), + ), + ) + ] + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + all_messages = list( + source.read( + logger=MagicMock(), config=_CONFIG, catalog=configured_catalog, state=state + ) + ) + + state_messages = [msg for msg in all_messages if msg.type == Type.STATE] + assert len(state_messages) > 0, "Expected at least one state message" + first_state = state_messages[0].state.stream.stream_state + assert first_state == AirbyteStateBlob(), ( + f"Expected first state message to be empty (clearing stale state), got: {first_state}" + ) + + @freezegun.freeze_time("2024-07-15") def test_cursor_age_validation_uses_incremental_when_cursor_within_retention(): """Test that when cursor is within retention period, incremental sync is used.""" From 2a4f3857a25da39c6655e0f924a0ba6bab543af2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 20 Feb 2026 16:17:26 +0000 Subject: [PATCH 40/49] style: Fix ruff format issues in state clearing code Co-Authored-By: alfredo.garcia@airbyte.io --- .../declarative/parsers/model_to_component_factory.py | 8 ++------ .../sources/declarative/test_state_delegating_stream.py | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index de6903244..e8a4028bf 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3592,12 +3592,8 @@ def create_state_delegating_stream( model.api_retention_period, model.name, ): - self._connector_state_manager.update_state_for_stream( - model.name, None, {} - ) - state_message = self._connector_state_manager.create_state_message( - model.name, None - ) + self._connector_state_manager.update_state_for_stream(model.name, None, {}) + state_message = self._connector_state_manager.create_state_message(model.name, None) self._message_repository.emit_message(state_message) return self._create_component_from_model( # type: ignore[no-any-return] model.full_refresh_stream, config=config, **kwargs diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index df913b188..668b5d616 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -338,9 +338,7 @@ def test_cursor_age_validation_clears_state_when_falling_back_to_full_refresh(): configured_catalog = create_configured_catalog(source, _CONFIG) all_messages = list( - source.read( - logger=MagicMock(), config=_CONFIG, catalog=configured_catalog, state=state - ) + source.read(logger=MagicMock(), config=_CONFIG, catalog=configured_catalog, state=state) ) state_messages = [msg for msg in all_messages if msg.type == Type.STATE] From e4f71ff0046b72f2d3c376a9966f7e8343654b83 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Feb 2026 18:43:15 +0000 Subject: [PATCH 41/49] fix: Implement tolik0's FinalStateCursor feedback with NO_CURSOR_STATE_KEY handling - FinalStateCursor.get_cursor_datetime_from_state returns now() for NO_CURSOR_STATE_KEY state, else None - Updated _is_cursor_older_than_retention_period to check both cursors in sequence (full refresh first, then incremental) - Kept early return for NO_CURSOR_STATE_KEY since full_refresh_stream cursor is DatetimeBasedCursor, not FinalStateCursor Co-Authored-By: unknown <> --- .../parsers/model_to_component_factory.py | 32 +++++++++++++------ .../sources/streams/concurrent/cursor.py | 14 +++++--- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index e8a4028bf..eaa3de3dc 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3586,8 +3586,12 @@ def create_state_delegating_stream( ) # type: ignore[assignment] if model.api_retention_period: + full_refresh_stream: DefaultStream = self._create_component_from_model( + model.full_refresh_stream, config=config, **kwargs + ) # type: ignore[assignment] if self._is_cursor_older_than_retention_period( stream_state, + full_refresh_stream.cursor, incremental_stream.cursor, model.api_retention_period, model.name, @@ -3595,36 +3599,44 @@ def create_state_delegating_stream( self._connector_state_manager.update_state_for_stream(model.name, None, {}) state_message = self._connector_state_manager.create_state_message(model.name, None) self._message_repository.emit_message(state_message) - return self._create_component_from_model( # type: ignore[no-any-return] - model.full_refresh_stream, config=config, **kwargs - ) + return full_refresh_stream return incremental_stream @staticmethod def _is_cursor_older_than_retention_period( stream_state: Mapping[str, Any], - cursor: Cursor, + full_refresh_cursor: Cursor, + incremental_cursor: Cursor, api_retention_period: str, stream_name: str, ) -> bool: """Check if the cursor value in the state is older than the API's retention period. + Checks cursors in sequence: full refresh cursor first, then incremental cursor. + If state has NO_CURSOR_STATE_KEY, it means the previous sync was a completed full + refresh, so the cursor is "current" and we should use incremental. + Returns True if the cursor is older than the retention period (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ - # FinalStateCursor state format - previous sync was a completed full refresh + # NO_CURSOR_STATE_KEY indicates a completed full refresh - cursor is "current" if stream_state.get(NO_CURSOR_STATE_KEY): return False - cursor_datetime = cursor.get_cursor_datetime_from_state(stream_state) + retention_duration = parse_duration(api_retention_period) + retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration + + # Check full refresh cursor first + cursor_datetime = full_refresh_cursor.get_cursor_datetime_from_state(stream_state) + # If full refresh cursor returns None, check incremental cursor if cursor_datetime is None: - # Cursor couldn't parse the state - fall back to full refresh to be safe - return True + cursor_datetime = incremental_cursor.get_cursor_datetime_from_state(stream_state) - retention_duration = parse_duration(api_retention_period) - retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration + if cursor_datetime is None: + # Neither cursor could parse the state - fall back to full refresh to be safe + return True if cursor_datetime < retention_cutoff: logging.warning( diff --git a/airbyte_cdk/sources/streams/concurrent/cursor.py b/airbyte_cdk/sources/streams/concurrent/cursor.py index 5da805548..11eaad235 100644 --- a/airbyte_cdk/sources/streams/concurrent/cursor.py +++ b/airbyte_cdk/sources/streams/concurrent/cursor.py @@ -159,13 +159,17 @@ def should_be_synced(self, record: Record) -> bool: def get_cursor_datetime_from_state( self, stream_state: Mapping[str, Any] ) -> datetime.datetime | None: - """FinalStateCursor has no cursor datetime. + """Return now() if state indicates a completed full refresh, else None. - Full refresh streams don't track a cursor position - they always read all data. - The FinalStateCursor state format ({NO_CURSOR_STATE_KEY: True}) is handled - separately in _is_cursor_older_than_retention_period before this method is called. - Returns None to indicate this cursor cannot parse datetime-based state. + When the state has NO_CURSOR_STATE_KEY: True, it means the previous sync was a + completed full refresh. Returning now() indicates the cursor is "current" and + within any retention period, so we should use incremental sync. + + For any other state format, return None to indicate this cursor cannot parse it, + allowing the incremental cursor to handle the state instead. """ + if stream_state.get(NO_CURSOR_STATE_KEY): + return datetime.datetime.now(datetime.timezone.utc) return None From 9340d3cb49673c74dd3172b67dd575aec7b2fb00 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Feb 2026 18:50:57 +0000 Subject: [PATCH 42/49] fix: Update FinalStateCursor test to match new behavior per tolik0's request Co-Authored-By: unknown <> --- .../sources/streams/concurrent/test_cursor.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_cursor.py b/unit_tests/sources/streams/concurrent/test_cursor.py index fafd18bb0..a34e044b5 100644 --- a/unit_tests/sources/streams/concurrent/test_cursor.py +++ b/unit_tests/sources/streams/concurrent/test_cursor.py @@ -1391,12 +1391,21 @@ def test_given_partitioned_state_with_multiple_slices_when_should_be_synced_then ) -def test_final_state_cursor_get_cursor_datetime_from_state_returns_none(): - """FinalStateCursor returns None because full refresh doesn't track a cursor datetime. +@freezegun.freeze_time("2024-07-15") +def test_final_state_cursor_get_cursor_datetime_from_state_returns_now_for_no_cursor_state(): + """FinalStateCursor returns now() for NO_CURSOR_STATE_KEY state, else None. - The NO_CURSOR_STATE_KEY state format is handled separately in - _is_cursor_older_than_retention_period before get_cursor_datetime_from_state is called. + When state has NO_CURSOR_STATE_KEY: True, it means the previous sync was a completed + full refresh. Returning now() indicates the cursor is "current" and within any + retention period, so we should use incremental sync. """ cursor = FinalStateCursor("test_stream", None, Mock(spec=MessageRepository)) - result = cursor.get_cursor_datetime_from_state({NO_CURSOR_STATE_KEY: True}) - assert result is None + + result_with_no_cursor_key = cursor.get_cursor_datetime_from_state({NO_CURSOR_STATE_KEY: True}) + assert result_with_no_cursor_key == datetime(2024, 7, 15, tzinfo=timezone.utc) + + result_without_no_cursor_key = cursor.get_cursor_datetime_from_state({"some_other_key": "value"}) + assert result_without_no_cursor_key is None + + result_with_empty_state = cursor.get_cursor_datetime_from_state({}) + assert result_with_empty_state is None From e021f587449082b2e8c590926abaa7b29fd850e2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Feb 2026 18:52:31 +0000 Subject: [PATCH 43/49] style: Fix ruff format issues in test file Co-Authored-By: unknown <> --- unit_tests/sources/streams/concurrent/test_cursor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unit_tests/sources/streams/concurrent/test_cursor.py b/unit_tests/sources/streams/concurrent/test_cursor.py index a34e044b5..13fe1df87 100644 --- a/unit_tests/sources/streams/concurrent/test_cursor.py +++ b/unit_tests/sources/streams/concurrent/test_cursor.py @@ -1404,7 +1404,9 @@ def test_final_state_cursor_get_cursor_datetime_from_state_returns_now_for_no_cu result_with_no_cursor_key = cursor.get_cursor_datetime_from_state({NO_CURSOR_STATE_KEY: True}) assert result_with_no_cursor_key == datetime(2024, 7, 15, tzinfo=timezone.utc) - result_without_no_cursor_key = cursor.get_cursor_datetime_from_state({"some_other_key": "value"}) + result_without_no_cursor_key = cursor.get_cursor_datetime_from_state( + {"some_other_key": "value"} + ) assert result_without_no_cursor_key is None result_with_empty_state = cursor.get_cursor_datetime_from_state({}) From 1dcc8abac12029caacd957d8b100f148a914190e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Feb 2026 19:25:07 +0000 Subject: [PATCH 44/49] refactor: Remove early return for NO_CURSOR_STATE_KEY per tolik0's request FinalStateCursor.get_cursor_datetime_from_state now handles NO_CURSOR_STATE_KEY by returning now(), so the explicit early return is no longer needed. Co-Authored-By: unknown <> --- .../declarative/parsers/model_to_component_factory.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index eaa3de3dc..bdbc8fefe 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3614,16 +3614,13 @@ def _is_cursor_older_than_retention_period( """Check if the cursor value in the state is older than the API's retention period. Checks cursors in sequence: full refresh cursor first, then incremental cursor. - If state has NO_CURSOR_STATE_KEY, it means the previous sync was a completed full - refresh, so the cursor is "current" and we should use incremental. + FinalStateCursor returns now() for completed full refresh state (NO_CURSOR_STATE_KEY), + which is always within retention, so we use incremental. For other states, it returns + None and we fall back to checking the incremental cursor. Returns True if the cursor is older than the retention period (should use full refresh). Returns False if the cursor is within the retention period (safe to use incremental). """ - # NO_CURSOR_STATE_KEY indicates a completed full refresh - cursor is "current" - if stream_state.get(NO_CURSOR_STATE_KEY): - return False - retention_duration = parse_duration(api_retention_period) retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration From 6d95923fbe34e36750ee4fa224833c17fb43d3bf Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Feb 2026 19:28:07 +0000 Subject: [PATCH 45/49] fix: Remove unused NO_CURSOR_STATE_KEY import Co-Authored-By: unknown <> --- .../sources/declarative/parsers/model_to_component_factory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index bdbc8fefe..aaf661ffa 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -618,7 +618,6 @@ NoopMessageRepository, ) from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository -from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY from airbyte_cdk.sources.streams.call_rate import ( APIBudget, FixedWindowCallRatePolicy, From a3a2073ef4a08246d06b2757cc3e6c6197ff8977 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Feb 2026 20:39:09 +0000 Subject: [PATCH 46/49] fix: Update FinalStateCursor test to match actual ConcurrentCursor behavior The test expected FinalStateCursor.get_cursor_datetime_from_state to be called when state has NO_CURSOR_STATE_KEY, but both streams get ConcurrentCursor instances which cannot parse that state format. Both return None, causing the implementation to correctly fall back to full refresh as the safe default. Updated the test to mock the full refresh URL and reflect this behavior. Co-Authored-By: unknown <> --- .../sources/declarative/test_state_delegating_stream.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 668b5d616..6930ae6bd 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -613,15 +613,14 @@ def test_cursor_age_validation_raises_error_for_unparseable_cursor(): @freezegun.freeze_time("2024-07-15") -def test_final_state_cursor_skips_retention_check_and_uses_incremental(): - """When state is a final state from FinalStateCursor, skip retention check and use incremental.""" +def test_final_state_cursor_falls_back_to_full_refresh_when_state_unparseable(): + """When state is a final state (NO_CURSOR_STATE_KEY), ConcurrentCursor cannot parse it, + so both cursors return None and the implementation falls back to full refresh as the safe default.""" manifest = _create_manifest_with_retention_period("P7D") with HttpMocker() as http_mocker: http_mocker.get( - HttpRequest( - url="https://api.test.com/items_with_filtration?start=2024-07-01&end=2024-07-15" - ), + HttpRequest(url="https://api.test.com/items"), HttpResponse( body=json.dumps( [ From 020d2f5253a5cb668b6adc72eae1e97a357550f5 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 21:46:46 +0000 Subject: [PATCH 47/49] fix: Skip state emission for streams not in configured catalog When cursor age validation detects a stale cursor on a StateDelegatingStream, it clears the stream state and emits an empty state message. However, if the stream is not in the user's configured catalog (e.g. a parent stream created as a dependency), the destination does not know about it and crashes with 'Stream not found'. This fix checks whether the stream is in the configured catalog before emitting the state-clearing message. If no catalog is provided (e.g. during discover), state is emitted as before for backward compatibility. Co-Authored-By: unknown <> --- .../parsers/model_to_component_factory.py | 17 ++++- .../test_state_delegating_stream.py | 68 +++++++++++++++++++ 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index aaf661ffa..38a09156a 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3596,8 +3596,21 @@ def create_state_delegating_stream( model.name, ): self._connector_state_manager.update_state_for_stream(model.name, None, {}) - state_message = self._connector_state_manager.create_state_message(model.name, None) - self._message_repository.emit_message(state_message) + # Only emit the state-clearing message if this stream is in the + # configured catalog (or if no catalog was provided, e.g. during + # discover / connector builder). Streams that are NOT selected by the + # user but are instantiated as parent-stream dependencies must not emit + # state messages because the destination does not know about them and + # will crash with "Stream not found". + stream_is_in_catalog = ( + not self._stream_name_to_configured_stream # no catalog → emit by default + or model.name in self._stream_name_to_configured_stream + ) + if stream_is_in_catalog: + state_message = self._connector_state_manager.create_state_message( + model.name, None + ) + self._message_repository.emit_message(state_message) return full_refresh_stream return incremental_stream diff --git a/unit_tests/sources/declarative/test_state_delegating_stream.py b/unit_tests/sources/declarative/test_state_delegating_stream.py index 6930ae6bd..7cb717e42 100644 --- a/unit_tests/sources/declarative/test_state_delegating_stream.py +++ b/unit_tests/sources/declarative/test_state_delegating_stream.py @@ -852,3 +852,71 @@ def test_parent_state_delegating_stream_retention_falls_back_to_full_refresh(): configured_catalog = create_configured_catalog(source, _CONFIG) records = get_records(source, _CONFIG, configured_catalog, state) assert len(records) == 1 + + +@freezegun.freeze_time("2024-07-15") +def test_unconfigured_parent_stream_does_not_emit_state_on_retention_fallback(): + """When a parent StateDelegatingStream has stale cursor state but is NOT in the + configured catalog (only the child is selected), no state message should be emitted + for the parent. Previously this would emit a state message for the parent stream, + causing the destination to crash with 'Stream not found'.""" + manifest = _create_parent_child_manifest_with_retention_period("P7D") + + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/parents"), + HttpResponse( + body=json.dumps([{"id": 1, "name": "parent_1", "updated_at": "2024-07-14"}]) + ), + ) + http_mocker.get( + HttpRequest(url="https://api.test.com/children/1"), + HttpResponse( + body=json.dumps([{"id": 10, "name": "child_1", "updated_at": "2024-07-14"}]) + ), + ) + + # ParentStream has stale state (older than 7 days) but ParentStream is NOT + # in the configured catalog — only ChildStream is selected. + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="ParentStream", namespace=None), + stream_state=AirbyteStateBlob(updated_at="2024-06-01"), + ), + ), + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="ChildStream", namespace=None), + stream_state=AirbyteStateBlob( + use_global_cursor=False, + state={"updated_at": "2024-07-14"}, + states=[], + parent_state={"ParentStream": {"updated_at": "2024-06-01"}}, + lookback_window=0, + ), + ), + ), + ] + source = ConcurrentDeclarativeSource( + source_config=manifest, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + all_messages = list( + source.read(logger=MagicMock(), config=_CONFIG, catalog=configured_catalog, state=state) + ) + + # No state message should reference ParentStream since it's not in the catalog + state_messages = [msg for msg in all_messages if msg.type == Type.STATE] + parent_state_messages = [ + msg + for msg in state_messages + if msg.state.stream.stream_descriptor.name == "ParentStream" + ] + assert len(parent_state_messages) == 0, ( + f"Expected no state messages for unconfigured ParentStream, " + f"but got {len(parent_state_messages)}: {parent_state_messages}" + ) From 21bb2a9c939d5d1c7250c89e7da73bb80b5dbd04 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 21:51:38 +0000 Subject: [PATCH 48/49] refactor: Move catalog check to skip entire retention validation for unconfigured streams Instead of running cursor age validation and then suppressing just the state message, skip the entire api_retention_period block for streams not in the configured catalog. This avoids unnecessary work (creating both stream components, comparing cursor age) for parent-stream dependencies that the destination doesn't know about. Co-Authored-By: unknown <> --- .../parsers/model_to_component_factory.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 38a09156a..4ac99be58 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3584,7 +3584,16 @@ def create_state_delegating_stream( model.incremental_stream, config=config, **kwargs ) # type: ignore[assignment] - if model.api_retention_period: + # Only run cursor age validation for streams that are in the configured + # catalog (or when no catalog was provided, e.g. during discover / connector + # builder). Streams not selected by the user but instantiated as parent-stream + # dependencies must not go through this path because it emits state messages + # that the destination does not know about, causing "Stream not found" crashes. + stream_is_in_catalog = ( + not self._stream_name_to_configured_stream # no catalog → validate by default + or model.name in self._stream_name_to_configured_stream + ) + if model.api_retention_period and stream_is_in_catalog: full_refresh_stream: DefaultStream = self._create_component_from_model( model.full_refresh_stream, config=config, **kwargs ) # type: ignore[assignment] @@ -3596,21 +3605,10 @@ def create_state_delegating_stream( model.name, ): self._connector_state_manager.update_state_for_stream(model.name, None, {}) - # Only emit the state-clearing message if this stream is in the - # configured catalog (or if no catalog was provided, e.g. during - # discover / connector builder). Streams that are NOT selected by the - # user but are instantiated as parent-stream dependencies must not emit - # state messages because the destination does not know about them and - # will crash with "Stream not found". - stream_is_in_catalog = ( - not self._stream_name_to_configured_stream # no catalog → emit by default - or model.name in self._stream_name_to_configured_stream + state_message = self._connector_state_manager.create_state_message( + model.name, None ) - if stream_is_in_catalog: - state_message = self._connector_state_manager.create_state_message( - model.name, None - ) - self._message_repository.emit_message(state_message) + self._message_repository.emit_message(state_message) return full_refresh_stream return incremental_stream From 2a2459df95af58e726ca534d5578d7206c87293d Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 21:54:51 +0000 Subject: [PATCH 49/49] style: Fix ruff format issue in create_state_delegating_stream Co-Authored-By: unknown <> --- .../sources/declarative/parsers/model_to_component_factory.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 4ac99be58..bb3c1e653 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3605,9 +3605,7 @@ def create_state_delegating_stream( model.name, ): self._connector_state_manager.update_state_for_stream(model.name, None, {}) - state_message = self._connector_state_manager.create_state_message( - model.name, None - ) + state_message = self._connector_state_manager.create_state_message(model.name, None) self._message_repository.emit_message(state_message) return full_refresh_stream