Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions airbyte_cdk/sources/file_based/file_based_stream_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import logging
import time
from abc import ABC, abstractmethod
from datetime import datetime
from datetime import timezone
Copy link

Copilot AI Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

datetime is no longer used in this module after switching to ab_datetime_parse; consider removing it from the from datetime import ... import to avoid dead code / lint noise.

Suggested change
from datetime import timezone

Copilot uses AI. Check for mistakes.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already addressed in the second commit (1f52c10) — datetime was removed, only timezone is imported now.


Devin session

from enum import Enum
from io import IOBase
from os import makedirs, path
Expand All @@ -24,6 +24,7 @@
from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
from airbyte_cdk.sources.file_based.remote_file import RemoteFile, UploadableRemoteFile
from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse


class FileReadMode(Enum):
Expand Down Expand Up @@ -105,15 +106,20 @@ def filter_files_by_globs_and_start_date(
Utility method for filtering files based on globs.
"""
start_date = (
datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)
ab_datetime_parse(self.config.start_date)
if self.config and self.config.start_date
else None
)
seen = set()

for file in files:
if self.file_matches_globs(file, globs):
if file.uri not in seen and (not start_date or file.last_modified >= start_date):
last_modified = (
file.last_modified
if file.last_modified.tzinfo is not None
else file.last_modified.replace(tzinfo=timezone.utc)
)
Comment on lines +117 to +121
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "1) Inspect RemoteFile definition (last_modified type/contract):"
fd 'remote_file.py$' --exec sed -n '1,260p' {}

echo
echo "2) Find call sites constructing RemoteFile / UploadableRemoteFile with last_modified:"
rg -nP --type=py -C3 'RemoteFile\(|UploadableRemoteFile\('

echo
echo "3) Find last_modified assignments and datetime parsing sources:"
rg -nP --type=py -C3 'last_modified\s*=|strptime\(|fromtimestamp\(|parser\.parse\(|ab_datetime_parse\('

Repository: airbytehq/airbyte-python-cdk

Length of output: 50385


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Production code (non-test) that constructs RemoteFile with last_modified ==="
rg -t py 'RemoteFile\(' --glob '!unit_tests' --glob '!test_' -B2 -A2 'last_modified' | head -100

echo
echo "=== Datetime parsing in production (non-test) sources ==="
rg -t py '(strptime|fromtimestamp|parser\.parse|datetime\.now)' --glob '!unit_tests' --glob '!test_' -B2 -A2 | head -150

echo
echo "=== Check for validators on RemoteFile or last_modified ==="
rg -t py 'validator|@root_validator' airbyte_cdk/sources/file_based/

echo
echo "=== Search for actual S3/GCS/HTTPS remote file implementations ==="
fd -t f -e py --glob '*remote*file*' --glob '!test' | xargs rg -l 'RemoteFile\|UploadableRemoteFile' 2>/dev/null | head -20

Repository: airbytehq/airbyte-python-cdk

Length of output: 223


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Production code RemoteFile/UploadableRemoteFile implementations ==="
find airbyte_cdk/sources/file_based -name '*.py' -type f ! -path '*/test*' -exec grep -l 'class.*RemoteFile' {} \;

echo
echo "=== Search for datetime creation in these implementation files ==="
find airbyte_cdk/sources/file_based -name '*.py' -type f ! -path '*/test*' -exec grep -H 'last_modified.*=' {} \; | head -50

echo
echo "=== Check for datetime/timezone imports in file_based_stream_reader.py ==="
sed -n '1,50p' airbyte_cdk/sources/file_based/file_based_stream_reader.py

echo
echo "=== Look at code around line 117-121 to see imports and context ==="
sed -n '1,15p' airbyte_cdk/sources/file_based/file_based_stream_reader.py
rg -t py 'from datetime|import.*timezone' airbyte_cdk/sources/file_based/file_based_stream_reader.py

echo
echo "=== Search for test that explicitly tests UTC-naive assumption ==="
rg -t py 'UTC|utc.*naive|naive.*utc|tzinfo' --glob '*test*' -B2 -A2 | head -80

Repository: airbytehq/airbyte-python-cdk

Length of output: 10704


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Tests in file_based_stream_reader test file ==="
rg -t py 'def test.*last_modified|def test.*utc|def test.*naive' unit_tests/sources/file_based/test_file_based_stream_reader.py -A 5

echo
echo "=== Look at the actual code lines 117-121 context ==="
sed -n '110,130p' airbyte_cdk/sources/file_based/file_based_stream_reader.py

echo
echo "=== Check if RemoteFile has any validators ==="
cat airbyte_cdk/sources/file_based/remote_file.py

Repository: airbytehq/airbyte-python-cdk

Length of output: 123


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Code context around lines 117-121 ==="
sed -n '110,130p' airbyte_cdk/sources/file_based/file_based_stream_reader.py

echo
echo "=== Full RemoteFile definition ==="
cat airbyte_cdk/sources/file_based/remote_file.py

Repository: airbytehq/airbyte-python-cdk

Length of output: 2412


Verify that all RemoteFile.last_modified producers generate UTC-naive datetimes.

The concern is valid: the code assumes naive datetimes are UTC (line 120: .replace(tzinfo=timezone.utc)), but evidence shows datetime.now() is used in production code (e.g., unstructured_parser.py), which generates local-naive datetimes, not UTC-naive. If the system runs in a non-UTC timezone, this would cause incorrect time-based filtering.

Could we ensure all RemoteFile producers either (a) use timezone-aware UTC datetimes, or (b) consistently use a UTC-aware datetime parser? Adding a guard test or validator on RemoteFile.last_modified to document/enforce this contract would also help prevent regressions.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@airbyte_cdk/sources/file_based/file_based_stream_reader.py` around lines 117
- 121, RemoteFile.last_modified is sometimes produced as a local-naive datetime
(e.g., via datetime.now()) but the reader assumes naive means UTC; fix by making
producers emit timezone-aware UTC datetimes (replace datetime.now() with
datetime.now(timezone.utc) in producers such as unstructured_parser.py), and add
a defensive validator/normalizer on the RemoteFile model (e.g.,
RemoteFile.__post_init__ or factory) that asserts or converts last_modified to
an aware UTC datetime (use .astimezone(timezone.utc) if tzinfo present or attach
timezone.utc if you choose to treat naive as UTC); also add a unit test that
constructs RemoteFile with naive and aware datetimes to ensure the validator
enforces/normalizes to UTC so file_based_stream_reader.py no longer needs to
assume naive==UTC.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the non-zero timezone offset test case in 432976e. Good suggestion — this exercises the full offset normalization path.


Devin session

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Valid observation about datetime.now() in producers like unstructured_parser.py — that's a pre-existing issue where naive-local datetimes could be produced in non-UTC environments. However, fixing all RemoteFile producers and adding a model-level validator is out of scope for this PR, which is focused on the start_date parsing bug (#920).

The defensive normalization here (assuming naive = UTC) preserves the existing behavior from before this change — the old strptime also produced naive datetimes that were implicitly treated as UTC. This PR doesn't make that situation worse; it just makes the assumption explicit.

A follow-up to enforce UTC-aware datetimes at the RemoteFile model level would be a good improvement but should be a separate effort.


Devin session

if file.uri not in seen and (not start_date or last_modified >= start_date):
seen.add(file.uri)
yield file

Expand Down
35 changes: 35 additions & 0 deletions unit_tests/sources/file_based/test_file_based_stream_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,41 @@ def documentation_url(cls) -> AnyUrl:
set(),
id="all_csvs_modified_exactly_on_start_date",
),
pytest.param(
["**/*.csv"],
{"start_date": "2023-06-01T00:00:00Z", "streams": []},
{"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"},
set(),
id="start_date_without_microseconds",
),
pytest.param(
["**/*.csv"],
{"start_date": "2023-06-10T00:00:00Z", "streams": []},
set(),
set(),
id="start_date_without_microseconds_modified_before",
),
pytest.param(
["**/*.csv"],
{"start_date": "2023-06-01T00:00:00+00:00", "streams": []},
{"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"},
set(),
id="start_date_with_utc_offset",
),
pytest.param(
["**/*.csv"],
{"start_date": "2023-06-05T08:54:07+05:00", "streams": []},
{"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"},
set(),
id="start_date_with_non_zero_offset",
),
pytest.param(
["**/*.csv"],
{"start_date": "2023-06-01", "streams": []},
{"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"},
set(),
id="start_date_date_only",
),
],
)
def test_globs_and_prefixes_from_globs(
Expand Down
Loading