1 change: 0 additions & 1 deletion src/post_processing/dataclass/data_aplose.py
@@ -393,7 +393,6 @@ def plot(
color = kwargs.get("color")
season = kwargs.get("season")
effort = kwargs.get("effort")

if not bin_size:
msg = "'bin_size' missing for histogram plot."
raise ValueError(msg)
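A minimal sketch of the keyword-validation pattern this hunk touches, with the body reduced to the relevant checks (illustrative, not the full plot() implementation):

def plot(**kwargs) -> None:
    # Optional options are read with .get(), so absent keys yield None.
    color = kwargs.get("color")
    season = kwargs.get("season")
    effort = kwargs.get("effort")
    bin_size = kwargs.get("bin_size")
    # Fail fast with a precise message instead of a later TypeError.
    if not bin_size:
        msg = "'bin_size' missing for histogram plot."
        raise ValueError(msg)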
8 changes: 7 additions & 1 deletion src/post_processing/dataclass/detection_filter.py
@@ -7,7 +7,7 @@

from __future__ import annotations

from dataclasses import dataclass
from dataclasses import dataclass, fields
from pathlib import Path
from typing import TYPE_CHECKING, Literal

@@ -44,6 +44,12 @@ class DetectionFilter:
box: bool = False
filename_format: str | None = None

def __getitem__(self, key: str):
"""Return the value of the given key."""
if key in {f.name for f in fields(self)}:
return getattr(self, key)
raise KeyError(key)

@classmethod
def from_yaml(
cls,
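The new __getitem__ gives the dataclass dict-style read access limited to its declared fields. A self-contained sketch of the same pattern (the stand-in class and values are illustrative, not the full DetectionFilter):

from dataclasses import dataclass, fields

@dataclass(frozen=True)
class Config:  # stand-in for DetectionFilter
    box: bool = False
    filename_format: str | None = None

    def __getitem__(self, key: str):
        """Return the value of the given key."""
        if key in {f.name for f in fields(self)}:
            return getattr(self, key)
        raise KeyError(key)

cfg = Config(box=True)
assert cfg["box"] is True
# cfg["unknown"] raises KeyError: 'unknown'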
153 changes: 120 additions & 33 deletions src/post_processing/dataclass/recording_period.py
@@ -8,19 +8,15 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

from osekit.config import TIMESTAMP_FORMATS_EXPORTED_FILES
from osekit.utils.timestamp_utils import strptime_from_text
from pandas import (
Series,
Timedelta,
cut,
date_range,
interval_range,
read_csv,
to_datetime,
)

from post_processing.utils.core_utils import (
get_time_range_and_bin_size,
localize_timestamps,
)
from post_processing.utils.filtering_utils import (
find_delimiter,
)
@@ -33,7 +29,7 @@

@dataclass(frozen=True)
class RecordingPeriod:
"""A class to handle recording periods."""
"""Represents recording effort over time, aggregated into bins."""

counts: Series
timebin_origin: Timedelta
@@ -42,33 +38,124 @@ def from_path(
def from_path(
cls,
config: DetectionFilter,
date_format: str = TIMESTAMP_FORMATS_EXPORTED_FILES,
*,
bin_size: Timedelta | BaseOffset,
) -> RecordingPeriod:
"""Return a list of Timestamps corresponding to recording periods."""
"""Vectorized creation of recording coverage from CSV with start/end datetimes.

This method reads a CSV with columns:
- "start_recording"
- "end_recording"
- "start_deployment"
- "end_deployment"

It computes the **effective recording interval** as the intersection between
recording and deployment periods, builds a fine-grained timeline at
`timebin_origin` resolution, and aggregates effort into `bin_size` bins.

Parameters
----------
config : DetectionFilter
Configuration object containing at least:
- `timestamp_file`: path to CSV
- `timebin_origin`: Timedelta resolution of detections
bin_size : Timedelta or BaseOffset
Size of the aggregation bin (e.g. `Timedelta("1h")` or a calendar offset such as `MonthBegin()`).

Returns
-------
RecordingPeriod
Object containing `counts` (Series indexed by IntervalIndex) and
`timebin_origin`.

"""
# Read CSV and parse datetime columns
timestamp_file = config.timestamp_file
delim = find_delimiter(timestamp_file)
timestamp_df = read_csv(timestamp_file, delimiter=delim)

if "timestamp" in timestamp_df.columns:
msg = "Parsing 'timestamp' column not implemented yet."
raise NotImplementedError(msg)

if "filename" in timestamp_df.columns:
timestamps = [
strptime_from_text(ts, date_format)
for ts in timestamp_df["filename"]
]
timestamps = localize_timestamps(timestamps, config.timezone)
time_vector, bin_size = get_time_range_and_bin_size(timestamps, bin_size)

binned = cut(timestamps, time_vector)
max_annot = bin_size / config.timebin_origin

return cls(counts=binned.value_counts().sort_index().clip(upper=max_annot),
timebin_origin=config.timebin_origin,
)

msg = "Could not parse timestamps."
raise ValueError(msg)
df = read_csv(
config.timestamp_file,
parse_dates=[
"start_recording",
"end_recording",
"start_deployment",
"end_deployment",
],
delimiter=delim,
)

if df.empty:
msg = "CSV is empty."
raise ValueError(msg)

# Ensure all required columns are present
required_columns = {
"start_recording",
"end_recording",
"start_deployment",
"end_deployment",
}

missing = required_columns - set(df.columns)

if missing:
msg = f"CSV is missing required columns: {', '.join(sorted(missing))}"
raise ValueError(msg)

# Normalize timezones: convert to UTC, then remove tz info (naive)
for col in [
"start_recording",
"end_recording",
"start_deployment",
"end_deployment",
]:
df[col] = to_datetime(df[col], utc=True).dt.tz_convert(None)

# Compute effective recording intervals (intersection)
df["effective_start_recording"] = df[
["start_recording", "start_deployment"]
].max(axis=1)

df["effective_end_recording"] = df[
["end_recording", "end_deployment"]
].min(axis=1)

# Remove rows with no actual recording interval
df = df.loc[df["effective_start_recording"] < df["effective_end_recording"]].copy()

if df.empty:
msg = "No valid recording intervals after deployment intersection."
raise ValueError(msg)

# Build fine-grained timeline at `timebin_origin` resolution
origin = config.timebin_origin
time_index = date_range(
start=df["effective_start_recording"].min(),
end=df["effective_end_recording"].max(),
freq=origin,
)

# Initialize effort vector (0 = no recording, 1 = recording)
# Compare each timestamp to all intervals in a vectorized manner
effort = Series(0, index=time_index)

# Vectorized interval coverage
t_vals = time_index.to_numpy()[:, None]
start_vals = df["effective_start_recording"].to_numpy()
end_vals = df["effective_end_recording"].to_numpy()

# Boolean matrix: covered[i, j] is True if timestamp i falls within interval j
covered = (t_vals >= start_vals) & (t_vals < end_vals)
effort[:] = covered.any(axis=1).astype(int)

# Aggregate effort into user-defined bin_size
counts = effort.resample(bin_size).sum()

# Replace index with IntervalIndex for downstream compatibility
counts.index = interval_range(
start=counts.index[0],
periods=len(counts),
freq=bin_size,
closed="left",
)

return cls(counts=counts, timebin_origin=origin)
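A runnable sketch of the pipeline from_path now implements: intersect recording and deployment periods, mark coverage on a fine timeline by broadcasting, then aggregate into bins. Column names follow the diff; the sample data and the 10-minute/1-hour resolutions are illustrative:

import pandas as pd

df = pd.DataFrame({
    "start_recording": pd.to_datetime(["2024-01-01 00:00", "2024-01-01 02:00"]),
    "end_recording": pd.to_datetime(["2024-01-01 01:30", "2024-01-01 03:00"]),
    "start_deployment": pd.to_datetime(["2024-01-01 00:30", "2024-01-01 00:30"]),
    "end_deployment": pd.to_datetime(["2024-01-01 02:30", "2024-01-01 02:30"]),
})

# Effective interval = intersection of recording and deployment periods.
df["effective_start"] = df[["start_recording", "start_deployment"]].max(axis=1)
df["effective_end"] = df[["end_recording", "end_deployment"]].min(axis=1)
df = df.loc[df["effective_start"] < df["effective_end"]]

# Fine-grained timeline at the detection resolution (here 10 minutes).
idx = pd.date_range(df["effective_start"].min(), df["effective_end"].max(), freq="10min")

# Broadcast timestamps (n, 1) against intervals (m,) -> (n, m) boolean matrix.
t_vals = idx.to_numpy()[:, None]
covered = (t_vals >= df["effective_start"].to_numpy()) & (t_vals < df["effective_end"].to_numpy())
effort = pd.Series(covered.any(axis=1).astype(int), index=idx)

# Aggregate into 1-hour bins (at most 6 covered 10-minute slots per bin)
# and expose the result on a left-closed IntervalIndex.
counts = effort.resample("1h").sum()
counts.index = pd.interval_range(start=counts.index[0], periods=len(counts), freq="1h", closed="left")
print(counts)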
10 changes: 4 additions & 6 deletions src/post_processing/utils/core_utils.py
@@ -11,7 +11,7 @@
from astral.sun import sunrise, sunset
from matplotlib import pyplot as plt
from osekit.config import TIMESTAMP_FORMAT_AUDIO_FILE
from osekit.utils.timestamp_utils import strptime_from_text, strftime_osmose_format
from osekit.utils.timestamp_utils import strftime_osmose_format, strptime_from_text
from pandas import (
DataFrame,
DatetimeIndex,
@@ -255,7 +255,6 @@ def add_weak_detection(
new_line.append(np.nan)
df.loc[df.index.max() + 1] = new_line


return df.sort_values(by=["start_datetime", "annotator"]).reset_index(drop=True)


@@ -509,11 +508,10 @@ def get_time_range_and_bin_size(

if isinstance(bin_size, Timedelta):
return timestamp_range, bin_size
elif isinstance(bin_size, BaseOffset):
if isinstance(bin_size, BaseOffset):
return timestamp_range, timestamp_range[1] - timestamp_range[0]
else:
msg = "bin_size must be a Timedelta or BaseOffset."
raise TypeError(msg)
msg = "bin_size must be a Timedelta or BaseOffset."
raise TypeError(msg)


def round_begin_end_timestamps(
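The restructured branches above behave identically to the old if/elif/else; the dispatch they implement is: fixed-width Timedelta bins keep their size, while calendar offsets take the bin size from the generated range. A sketch (the helper name and example values are illustrative):

from pandas import Timedelta, date_range
from pandas.tseries.offsets import BaseOffset, MonthBegin

def bin_width(timestamp_range, bin_size):
    if isinstance(bin_size, Timedelta):
        return bin_size  # fixed-width bin: the size is the Timedelta itself
    if isinstance(bin_size, BaseOffset):
        # Calendar offsets (month, year) have no fixed width; derive it
        # from the first step of the generated range instead.
        return timestamp_range[1] - timestamp_range[0]
    msg = "bin_size must be a Timedelta or BaseOffset."
    raise TypeError(msg)

rng = date_range("2024-01-01", periods=3, freq=MonthBegin())
print(bin_width(rng, MonthBegin()))  # 31 days: the width of January 2024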
26 changes: 14 additions & 12 deletions src/post_processing/utils/filtering_utils.py
@@ -8,6 +8,7 @@
from typing import TYPE_CHECKING

import pytz
from osekit.utils.timestamp_utils import strptime_from_text
from pandas import (
DataFrame,
Timedelta,
@@ -509,8 +510,8 @@ def reshape_timebin(
timebin_new: Timedelta
The size of the new time bin.
timestamp_audio: list[Timestamp]
A list of Timestamp objects corresponding to the shape
in which the data should be reshaped.
A list of Timestamp objects giving the start time of each WAV file
that contains a detection.

Returns
-------
@@ -570,16 +571,17 @@ def get_filename_timestamps(df: DataFrame, date_parser: str) -> list[Timestamp]:

"""
tz = get_timezone(df)
try:
return [
to_datetime(
ts,
format=date_parser,
).tz_localize(tz) for ts in df["filename"]
]
except ValueError:
msg = """Could not parse timestamps from `df["filename"]`."""
raise ValueError(msg) from None
timestamps = [
strptime_from_text(
ts,
datetime_template=date_parser,
) for ts in df["filename"]
]

if all(t.tz is None for t in timestamps):
timestamps = [t.tz_localize(tz) for t in timestamps]

return timestamps


def ensure_in_list(value: str, candidates: list[str], label: str) -> None:
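Since strptime_from_text may yield tz-aware timestamps when the template itself encodes a timezone, the new guard localizes only when every parsed timestamp is still naive. A sketch of that rule with plain pandas objects (values are illustrative):

import pytz
from pandas import Timestamp

timestamps = [Timestamp("2024-06-01 12:00:00"), Timestamp("2024-06-02 12:00:00")]
tz = pytz.timezone("Europe/Paris")

# Localize only if no timestamp already carries tz info, so tz-aware
# filenames are never localized twice.
if all(t.tz is None for t in timestamps):
    timestamps = [t.tz_localize(tz) for t in timestamps]

print(timestamps[0])  # 2024-06-01 12:00:00+02:00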