From c7dee62b392052baffd067edcc4fddbcc2c34322 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 20:18:52 +0000 Subject: [PATCH 1/3] fix: use flexible datetime parsing in filter_files_by_globs_and_start_date Replace strict datetime.strptime with ab_datetime_parse in filter_files_by_globs_and_start_date to accept valid ISO8601 dates without microseconds (e.g. 2025-01-01T00:00:00Z). Also handle timezone-naive file.last_modified by assuming UTC for comparison, preventing TypeError on mixed naive/aware comparisons. Closes #920 Co-Authored-By: AJ Steers --- .../file_based/file_based_stream_reader.py | 12 ++++++-- .../test_file_based_stream_reader.py | 28 +++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 7443dccd6..6e3bd1651 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -5,7 +5,7 @@ import logging import time from abc import ABC, abstractmethod -from datetime import datetime +from datetime import datetime, timezone from enum import Enum from io import IOBase from os import makedirs, path @@ -24,6 +24,7 @@ from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError from airbyte_cdk.sources.file_based.file_record_data import FileRecordData from airbyte_cdk.sources.file_based.remote_file import RemoteFile, UploadableRemoteFile +from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse class FileReadMode(Enum): @@ -105,7 +106,7 @@ def filter_files_by_globs_and_start_date( Utility method for filtering files based on globs. """ start_date = ( - datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT) + ab_datetime_parse(self.config.start_date) if self.config and self.config.start_date else None ) @@ -113,7 +114,12 @@ def filter_files_by_globs_and_start_date( for file in files: if self.file_matches_globs(file, globs): - if file.uri not in seen and (not start_date or file.last_modified >= start_date): + last_modified = ( + file.last_modified + if file.last_modified.tzinfo is not None + else file.last_modified.replace(tzinfo=timezone.utc) + ) + if file.uri not in seen and (not start_date or last_modified >= start_date): seen.add(file.uri) yield file diff --git a/unit_tests/sources/file_based/test_file_based_stream_reader.py b/unit_tests/sources/file_based/test_file_based_stream_reader.py index 13fa1025c..210cf8b87 100644 --- a/unit_tests/sources/file_based/test_file_based_stream_reader.py +++ b/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -408,6 +408,34 @@ def documentation_url(cls) -> AnyUrl: set(), id="all_csvs_modified_exactly_on_start_date", ), + pytest.param( + ["**/*.csv"], + {"start_date": "2023-06-01T00:00:00Z", "streams": []}, + {"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"}, + set(), + id="start_date_without_microseconds", + ), + pytest.param( + ["**/*.csv"], + {"start_date": "2023-06-10T00:00:00Z", "streams": []}, + set(), + set(), + id="start_date_without_microseconds_modified_before", + ), + pytest.param( + ["**/*.csv"], + {"start_date": "2023-06-01T00:00:00+00:00", "streams": []}, + {"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"}, + set(), + id="start_date_with_utc_offset", + ), + pytest.param( + ["**/*.csv"], + {"start_date": "2023-06-01", "streams": []}, + {"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"}, + set(), + id="start_date_date_only", + ), ], ) def test_globs_and_prefixes_from_globs( From 1f52c101d768f33a77c3b0856465cea57bf13a38 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 20:22:01 +0000 Subject: [PATCH 2/3] fix: remove unused datetime import Co-Authored-By: AJ Steers --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 6e3bd1651..826d89112 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -5,7 +5,7 @@ import logging import time from abc import ABC, abstractmethod -from datetime import datetime, timezone +from datetime import timezone from enum import Enum from io import IOBase from os import makedirs, path From 432976eebd199e16f2bdc96eeffe4391c8c20883 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 20:29:39 +0000 Subject: [PATCH 3/3] test: add non-zero timezone offset test case per CodeRabbit review Co-Authored-By: AJ Steers --- .../sources/file_based/test_file_based_stream_reader.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unit_tests/sources/file_based/test_file_based_stream_reader.py b/unit_tests/sources/file_based/test_file_based_stream_reader.py index 210cf8b87..7058545bd 100644 --- a/unit_tests/sources/file_based/test_file_based_stream_reader.py +++ b/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -429,6 +429,13 @@ def documentation_url(cls) -> AnyUrl: set(), id="start_date_with_utc_offset", ), + pytest.param( + ["**/*.csv"], + {"start_date": "2023-06-05T08:54:07+05:00", "streams": []}, + {"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"}, + set(), + id="start_date_with_non_zero_offset", + ), pytest.param( ["**/*.csv"], {"start_date": "2023-06-01", "streams": []},