diff --git a/pyproject.toml b/pyproject.toml index 0fe1ae6..2a8d810 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "tqdm >=4.67.1", "gpxpy >=1.6.2", "notebook>=7.4.5", + "scikit-learn>=1.8.0", ] [dependency-groups] diff --git a/src/post_processing/dataclass/data_aplose.py b/src/post_processing/dataclass/data_aplose.py index c4736d1..91ceb90 100644 --- a/src/post_processing/dataclass/data_aplose.py +++ b/src/post_processing/dataclass/data_aplose.py @@ -434,6 +434,7 @@ def plot( time_range=time, show_rise_set=show_rise_set, season=season, + effort=effort, coordinates=self.coordinates, ) diff --git a/src/post_processing/utils/core_utils.py b/src/post_processing/utils/core_utils.py index 25ea173..60f0a69 100644 --- a/src/post_processing/utils/core_utils.py +++ b/src/post_processing/utils/core_utils.py @@ -484,4 +484,4 @@ def timedelta_to_str(td: Timedelta) -> str: return f"{seconds // 3600}h" if seconds % 60 == 0: return f"{seconds // 60}min" - return f"{seconds}s" + return f"{seconds}s" \ No newline at end of file diff --git a/src/post_processing/utils/filtering_utils.py b/src/post_processing/utils/filtering_utils.py index 25fcd5a..accefdb 100644 --- a/src/post_processing/utils/filtering_utils.py +++ b/src/post_processing/utils/filtering_utils.py @@ -16,6 +16,7 @@ Timedelta, Timestamp, concat, + cut, date_range, read_csv, to_datetime, @@ -436,9 +437,10 @@ def _create_result_dataframe( dataset: str, label: str, annotator: str, + dpm_count: list[int] | None = None, ) -> DataFrame: """Create result DataFrame for one annotator-label combination.""" - return DataFrame({ + df = DataFrame({ "dataset": [dataset] * len(file_vector), "filename": file_vector, "start_time": [0] * len(file_vector), @@ -451,6 +453,9 @@ def _create_result_dataframe( "end_datetime": [t + timebin_new for t in start_datetime], "type": ["WEAK"] * len(file_vector), }) + if dpm_count is not None: + df["dpm_count"] = dpm_count + return df def _normalize_timezones(df: DataFrame) -> DataFrame: @@ -510,6 +515,15 @@ def _process_annotator_label_pair( if not start_datetime: return None + if annotator.lower() in {"fpod", "cpod"}: + bins = list(time_vector) + [time_vector[-1] + timebin_new] + counts = cut(ts_detect_beg, bins=bins, right=False).value_counts().sort_index() + dpm_count = [ + counts.iloc[i] for i, detected in enumerate(detect_vec) if detected + ] + else: + dpm_count = None + return _create_result_dataframe( file_vector, start_datetime, @@ -518,6 +532,7 @@ def _process_annotator_label_pair( dataset, label, annotator, + dpm_count=dpm_count, ) diff --git a/src/post_processing/utils/fpod_utils.py b/src/post_processing/utils/fpod_utils.py index 20fd904..9c15fae 100644 --- a/src/post_processing/utils/fpod_utils.py +++ b/src/post_processing/utils/fpod_utils.py @@ -1,649 +1,493 @@ +"""FPOD/ CPOD processing functions.""" + from __future__ import annotations -from pathlib import Path -from typing import TYPE_CHECKING +import logging +from typing import TYPE_CHECKING, Any -import pytz +import matplotlib.dates as mdates +import matplotlib.pyplot as plt import seaborn as sns -from matplotlib import pyplot as plt -from osekit.config import TIMESTAMP_FORMAT_AUDIO_FILE -from osekit.utils.timestamp_utils import strftime_osmose_format, strptime_from_text +from matplotlib import patches +from numpy import ( + argsort, + dtype, + exp, + float64, + linspace, + log, + nan, + ndarray, + sort, + sqrt, + zeros, +) +from osekit.utils.timestamp import strftime_osmose_format, strptime_from_text from pandas import ( DataFrame, + DateOffset, Series, Timedelta, Timestamp, concat, - date_range, notna, read_csv, - read_excel, to_datetime, + to_numeric, ) +from scipy import stats +from sklearn import mixture -from post_processing import logger -from post_processing.utils.core_utils import get_coordinates, get_sun_times +from post_processing.utils.filtering_utils import find_delimiter +from user_case.config import season_color, site_colors if TYPE_CHECKING: + from pathlib import Path import pytz + from sklearn.mixture import GaussianMixture + +logger = logging.getLogger(__name__) -def fpod2aplose( +def pod2aplose( df: DataFrame, tz: pytz.timezone, dataset_name: str, annotation: str, - bin_size: int = 60, + annotator: str, + bin_size: Timedelta, ) -> DataFrame: - """Format FPOD DataFrame to match APLOSE format. + """Format PODs DataFrame to match an APLOSE format. Parameters ---------- df: DataFrame FPOD result dataframe tz: pytz.timezone - Timezone object to get non-naïve datetimes + Timezone object to get non-naïve datetime. dataset_name: str - dataset name + dataset name. annotation: str - annotation name - bin_size: int - Duration of the detections in seconds + annotation name. + annotator: str + annotator name. + bin_size: Timedelta + Duration of the detections in seconds. Returns ------- DataFrame - An APLOSE formatted DataFrame + An APLOSE formatted DataFrame. """ - fpod_start_dt = sorted( - [ - tz.localize(strptime_from_text(entry, "%d/%m/%Y %H:%M")) - for entry in df["Date heure"] - ], - ) - - fpod_end_dt = sorted( - [entry + Timedelta(seconds=bin_size) for entry in fpod_start_dt], - ) + fpod_start_dt = [tz.localize(entry) for entry in df["Datetime"]] data = { "dataset": [dataset_name] * len(df), - "filename": [""] * len(df), + "filename": list(fpod_start_dt), "start_time": [0] * len(df), - "end_time": [bin_size] * len(df), + "end_time": [bin_size.total_seconds()] * len(df), "start_frequency": [0] * len(df), "end_frequency": [0] * len(df), "annotation": [annotation] * len(df), - "annotator": ["FPOD"] * len(df), - "start_datetime": [strftime_osmose_format(entry) for entry in fpod_start_dt], - "end_datetime": [strftime_osmose_format(entry) for entry in fpod_end_dt], - "is_box": [0] * len(df), + "annotator": [annotator] * len(df), + "start_datetime": [ + strftime_osmose_format(entry.floor(bin_size)) for entry in fpod_start_dt + ], + "end_datetime": [ + strftime_osmose_format(entry.ceil(bin_size)) for entry in fpod_start_dt + ], + "type": ["WEAK"] * len(df), + "deploy": df["Deploy"].tolist(), } return DataFrame(data) -def cpod2aplose( - df: DataFrame, - tz: pytz.BaseTzInfo, - dataset_name: str, - annotation: str, - bin_size: int = 60, - extra_columns: list | None = None, +def load_pod_folder( + folder: Path, + ext: str, ) -> DataFrame: - """Format CPOD DataFrame to match APLOSE format. + """Read POD's result files from a folder. Parameters ---------- - df: DataFrame - CPOD result dataframe - tz: pytz.BaseTzInfo - Timezone object to get non-naïve datetimes - dataset_name: str - dataset name - annotation: str - annotation name - bin_size: int, optional - Duration of the detections in seconds - extra_columns: list, optional - Additional columns added from df to data + folder: Path + Folder's place. + ext: str + File extension of result files. Returns ------- DataFrame - An APLOSE formatted DataFrame + Concatenated data. + + Raises + ------ + ValueError + If no result files are found. """ - df_cpod = df.rename(columns={"ChunkEnd": "Date heure"}) + if ext not in {"csv", "txt"}: + msg = f"Invalid file extension: {ext}" + raise ValueError(msg) - # remove lines where the C-POD stopped working - df_cpod = df_cpod.drop( - df_cpod.loc[df_cpod["Date heure"] == " at minute "].index, - ) - data = fpod2aplose(df_cpod, tz, dataset_name, annotation, bin_size) - data["annotator"] = data.loc[data["annotator"] == "FPOD"] = "CPOD" - if extra_columns: - for col in extra_columns: - if col in df_cpod.columns: - data[col] = df_cpod[col].tolist() - else: - msg = f"Column '{col}' does not exist and will be ignored." - logger.warning(msg) + all_files = sorted(folder.rglob(f"*.{ext}")) - return DataFrame(data) + if not all_files: + msg = f"No .{ext} files found in {folder}" + raise ValueError(msg) + all_data = [] + for file in all_files: + sep = find_delimiter(file) + df = read_csv( + file, + sep=sep, + dtype={"microsec": "Int32"}, + usecols=lambda col: col not in {"SmoothedICI", "ICIslope"}, + ).dropna() -def usable_data_phase( - d_meta: DataFrame, - df: DataFrame, - dpl: str, -) -> DataFrame: - """Calculate the percentage of usable data. + df["Deploy"] = file.stem.strip().lower().replace(" ", "_") + all_data.append(df) - Considering the deployment dates and the collected data. + data = concat(all_data, ignore_index=True) - Parameters - ---------- - df: DataFrame - CPOD result DataFrame - d_meta: DataFrame - Metadata DataFrame with deployments information (previously exported as json) - dpl: str - Deployment of interest where percentage of usable data will be calculated + if ext == "csv": + return _process_csv_data(data) + if ext == "txt": + return _process_txt_data(data) - Returns - ------- - DataFrame - Returns the percentage of usable datas in the chosen phase + msg = f"Could not load {ext} result folder" + raise ValueError(msg) - """ - d_meta.loc[:, ["deployment_date", "recovery_date"]] = d_meta[ - ["deployment_date", "recovery_date"] - ].apply( - to_datetime, - ) - df["start_datetime"] = to_datetime(df["start_datetime"]) - phase = d_meta.loc[d_meta["name"] == dpl].reset_index() - data = df.loc[df["name"] == dpl].reset_index() - start_date = phase.loc[0, "deployment_date"] - end_date = phase.loc[0, "recovery_date"] +def _process_csv_data(data: DataFrame) -> DataFrame: + """Process CSV data with filtering and datetime conversion.""" + data_filtered = _filter_csv_data(data) + data_filtered["Datetime"] = [ + strptime_from_text(dt, "%d/%m/%Y %H:%M") for dt in data_filtered["ChunkEnd"] + ] + return data_filtered.sort_values(by=["Datetime"]).reset_index(drop=True) + - # Calculate the percentage of collected data on the phase length of time - if data.empty: - percentage_data = 0 - msg = "No data for this phase" +def _filter_csv_data(data: DataFrame) -> DataFrame: + """Filter CSV data based on available columns.""" + if "%TimeLost" in data.columns: + data_filtered = data[data["File"].notna()].copy() + data_filtered = data_filtered[data_filtered["Nall/m"].notna()] else: - df_end = data.loc[data.index[-1], "start_datetime"] - df_start = data.loc[data.index[0], "start_datetime"] - act_length = df_end - df_start - p_length = end_date - start_date - percentage_data = act_length * 100 / p_length - msg = f"Percentage of usable data : {percentage_data}%" + data_filtered = data[data["DPM"] > 0].copy() + data_filtered = data_filtered[data_filtered["MinsOn"].notna()] - logger.info(msg) - return percentage_data + return data_filtered -def meta_cut_aplose( - d_meta: DataFrame, - df: DataFrame, -) -> DataFrame: - """From APLOSE DataFrame with all rows to filtered DataFrame. +def _process_txt_data(data: DataFrame) -> DataFrame: + """Process TXT data with datetime conversion.""" + data["Datetime"] = data.apply(get_feeding_buzz_datetime, axis=1) + return data.drop_duplicates().sort_values(by=["Datetime"]).reset_index(drop=True) - Parameters - ---------- - df: DataFrame - CPOD result dataframe - d_meta: DataFrame - Metadata dataframe with deployments information (previously exported as json) - Returns - ------- - DataFrame - An APLOSE DataFrame with data from beginning to end of each deployment. - Returns the percentage of usable datas. +def get_feeding_buzz_datetime(row: Series) -> Timestamp: + """Convert feeding buzz timestamp into a standard Timestamp. + The conversion method differs based on the POD type. """ - d_meta.loc[:, ["deployment_date", "recovery_date"]] = d_meta[ - ["deployment_date", "recovery_date"] - ].apply(to_datetime) - df["start_datetime"] = to_datetime( - df["start_datetime"], - format=TIMESTAMP_FORMAT_AUDIO_FILE, - ) - - # Add DPM column - df["DPM"] = (df["Nfiltered"] > 0).astype(int) - - # Extract corresponding line - campaign = df.iloc[0]["dataset"] - phase = d_meta.loc[d_meta["name"] == campaign].reset_index() - start_date = phase.loc[0, "deployment_date"] - end_date = phase.loc[0, "recovery_date"] - df = df[ - (df["start_datetime"] >= start_date) & (df["start_datetime"] <= end_date) - ].copy() - - # Calculate the percentage of collected data on the phase length of time - if df.empty: - msg = "No data for this phase" - else: - df_end = df.loc[df.index[-1], "start_datetime"] - df_start = df.loc[df.index[0], "start_datetime"] - act_length = df_end - df_start - p_length = end_date - start_date - percentage_data = act_length * 100 / p_length - on = int(df.loc[df.MinsOn == 1, "MinsOn"].count()) - percentage_on = percentage_data * (on / len(df)) - msg = f"Percentage of usable data : {percentage_on}%" - - logger.info(msg) - return df - - -def format_calendar(path: Path) -> DataFrame: - """Format calendar. + exceptions = [] + try: + return ( + Timestamp("1899-12-30") + + Timedelta(minutes=row["Minute"]) + + Timedelta(microseconds=row["microsec"]) + ) + except (KeyError, TypeError, ValueError) as e: + exceptions.append(e) - Parameters - ---------- - path: Path - Excel calendar path + try: + return (strptime_from_text(row["Minute"], "%-d/%-m/%Y %H:%M") + + Timedelta(microseconds=row["microsec"])) + except (KeyError, TypeError, ValueError) as e: + exceptions.append(e) - """ - df_calendar = read_excel(path) - df_calendar = df_calendar[df_calendar["Site group"] == "Data"].copy() - - return df_calendar.rename( - columns={ - "Start": "start_datetime", - "Stop": "end_datetime", - "Site": "site.name", - }, - ) + msg = "Could not convert feeding buzz timestamp." + raise ExceptionGroup(msg, exceptions) -def dpm_to_dph( +def process_feeding_buzz( df: DataFrame, - tz: pytz.BaseTzInfo, - dataset_name: str, - annotation: str, - bin_size: int = 3600, - extra_columns: list | None = None, + species: str, ) -> DataFrame: - """From CPOD result DataFrame to APLOSE formatted DataFrame. + """Process a POD feeding buzz detection DataFrame. + + Give the feeding buzz duration, depending on the studied species + (`delphinid`, `porpoise` or `commerson`). Parameters ---------- df: DataFrame - CPOD result DataFrame - tz: pytz.BaseTzInfo - Timezone object to get timezone-aware datetimes - dataset_name: str - dataset name - annotation: str - annotation name - bin_size: int - Duration of the detections in seconds - extra_columns: list, optional - Additional columns added from df to data + Path to cpod.exe feeding buzz file + species: str + Select the species to use between porpoise and Commerson's dolphin Returns ------- DataFrame - An APLOSE DataFrame + Containing all ICIs for every positive minute to click """ - df["start_datetime"] = to_datetime(df["start_datetime"], utc=True) - df["end_datetime"] = to_datetime(df["end_datetime"], utc=True) - df["Date heure"] = df["start_datetime"].dt.floor("h") - dph = df.groupby(["Date heure"])["DPM"].sum().reset_index() - dph["Date heure"] = dph["Date heure"].apply( - lambda x: Timestamp(x).strftime(format="%d/%m/%Y %H:%M:%S"), - ) - - return cpod2aplose(dph, tz, dataset_name, annotation, bin_size, extra_columns) + df["ICI"] = df["Datetime"].diff() + df["Datetime"] = df["Datetime"].dt.floor("min") + + if species.lower() == "delphinid": # Herzing et al., 2014 + df["Buzz"] = ( + df["ICI"] + .between( + Timedelta(0), + Timedelta(seconds=0.02), + ) + .astype(int) + ) + elif species.lower() == "porpoise": # Nuuttila et al., 2013 + df["Buzz"] = ( + df["ICI"] + .between( + Timedelta(0), + Timedelta(seconds=0.01), + ) + .astype(int) + ) + elif species.lower() == "commerson": # Reyes Reyes et al., 2015 + df["Buzz"] = ( + df["ICI"] + .between( + Timedelta(0), + Timedelta(seconds=0.005), + ) + .astype(int) + ) + else: + msg = "This species is not supported" + raise ValueError(msg) + df_buzz = df.groupby(["Datetime"])["Buzz"].sum().reset_index() + df_buzz["Foraging"] = to_numeric( + df_buzz["Buzz"] != 0, + downcast="integer", + ).astype(int) -def assign_phase( - meta: DataFrame, - data: DataFrame, - site: str, -) -> DataFrame: - """Add a column to an APLOSE DataFrame to specify the name of the phase. + return df_buzz - The name of the phase is attributed according to metadata. - Parameters - ---------- - meta: DataFrame - Metadata dataframe with deployments information (previously exported as json). - data: DataFrame - Contain positive hours to detections. - site: str - Name of the site you wish to assign phases to. +def compute_ici(df: DataFrame) -> DataFrame: + """Calculate Inter-Click Intervals (in minutes) from feeding buzz timestamps.""" + df = df.copy() + df["ICI_minutes"] = df["Datetime"].diff().dt.total_seconds() / 60 + return df[df["ICI_minutes"] > 0].dropna(subset=["ICI_minutes"]) - Returns - ------- - DataFrame - The same dataframe with the column Phase. - """ - data["start_datetime"] = to_datetime(data["start_datetime"], utc=True) - meta["deployment_date"] = to_datetime(meta["deployment_date"], utc=True) - meta["recovery_date"] = to_datetime(meta["recovery_date"], utc=True) - - meta = meta[meta["site.name"] == site].copy() - - data["name"] = None - for _, meta_row in meta.iterrows(): - j = 0 - while j < len(data): - if ( - meta_row["deployment_date"] - <= data.loc[j, "start_datetime"] - < meta_row["recovery_date"] - ): - data.loc[j, "name"] = meta_row["name"] - j += 1 - return data - - -def assign_phase_simple( - meta: DataFrame, - data: DataFrame, -) -> DataFrame: - """Add column to an Aplose DataFrame to specify the phase, according to metadata. +def fit_gmm(df: DataFrame, comp: int) -> tuple[DataFrame, ndarray, GaussianMixture]: + """Fit a GMM on log-transformed ICIs and label clusters by ascending mean. Parameters ---------- - meta: DataFrame - Metadata dataframe with deployments information (previously exported as json). - data: DataFrame - Contain positive hours to detections. + df: DataFrame + POD loaded dataframe + comp: int + Number of components to apply to the GMM. Returns ------- - DataFrame - The same dataframe with the column Phase. + tuple + Returns the enriched DataFrame, the log-ICI array, and the fitted GMM. """ - data["start_datetime"] = to_datetime(data["start_datetime"], utc=True) - data["end_datetime"] = to_datetime(data["end_datetime"], dayfirst=True, utc=True) - meta["deployment_date"] = to_datetime(meta["deployment_date"], utc=True) - meta["recovery_date"] = to_datetime(meta["recovery_date"], utc=True) - meta["deployment_date"] = meta["deployment_date"].dt.floor("d") - meta["recovery_date"] = meta["recovery_date"].dt.floor("d") - - data["name"] = None - for site in data["site.name"].unique(): - site_meta = meta[meta["site.name"] == site] - site_data = data[data["site.name"] == site] - - for _, meta_row in site_meta.iterrows(): - time_filter = ( - meta_row["deployment_date"] <= site_data["start_datetime"] - ) & (site_data["start_datetime"] < meta_row["recovery_date"]) - data.loc[site_data.index[time_filter], "name"] = meta_row["name"] + df = compute_ici(df) + ici_log = log(df["ICI_minutes"].to_numpy()).reshape(-1, 1) - return data + gmm = mixture.GaussianMixture( + n_components=comp, covariance_type="full", random_state=42, n_init=20, + ) + labels = gmm.fit_predict(ici_log) + rank = argsort(argsort(gmm.means_.flatten())) + df["cluster"] = rank[labels] -def generate_hourly_detections(meta: DataFrame, site: str) -> DataFrame: - """Create a DataFrame with one line per hour between start and end dates. + return df, ici_log, gmm - Keep the number of detections per hour between these dates. - Parameters - ---------- - meta: DataFrame - Metadata dataframe with deployments information (previously exported as json) - site: str - A way to isolate the site you want to work on. +def cluster_info(gmm: GaussianMixture) -> list[dict]: + """Extract per-component statistics from a fitted GMM, sorted by ascending mean.""" + component_names = ["Buzz ICIs", "Regular ICIs", "Long ICIs"] + sorted_means = sort(gmm.means_, axis=0) - Returns - ------- - DataFrame - A full period of time with positive and negative hours to detections. - - """ - df_meta = meta[meta["site.name"] == site].copy() - df_meta["deployment_date"] = to_datetime(df_meta["deployment_date"]) - df_meta["recovery_date"] = to_datetime(df_meta["recovery_date"]) - df_meta["deployment_date"] = df_meta["deployment_date"].dt.floor("h") - df_meta["recovery_date"] = df_meta["recovery_date"].dt.floor("h") - df_meta = df_meta.sort_values(by=["deployment_date"]) - - records = [ - {"name": row["name"], "start_datetime": date} - for _, row in df_meta.iterrows() - for date in date_range( - start=row["deployment_date"], end=row["recovery_date"], freq="h", - ) + return [ + { + "name": component_names[i], + "id": i, + "mean_log": sorted_means[i][0], + "std_log": sqrt(gmm.covariances_[i][0][0]), + "mean_minutes": exp(sorted_means[i][0]), + "mean_ms": exp(sorted_means[i][0]) * 60 * 1000, + } + for i in range(gmm.n_components) ] - return DataFrame(records) +def _mixture_density(gmm: GaussianMixture, x_range: ndarray) -> ndarray: + """Compute the total GMM mixture density over x_range.""" + density = zeros(len(x_range)) + for idx in range(gmm.n_components): + mean = gmm.means_[idx][0] + std = sqrt(gmm.covariances_[idx][0][0]) + density += gmm.weights_[idx] * stats.norm.pdf(x_range, mean, std) + return density -def merging_tab(meta: DataFrame, data: DataFrame) -> DataFrame: - """Create a DataFrame with one line per hour between start and end dates. - Keep the number of detections per hour between these dates. +def gmm_feeding_buzz(df: DataFrame, comp: int) -> DataFrame: + """Categorize ICIs with a GMM and aggregate foraging activity per minute. Parameters ---------- - meta: DataFrame - Metadata with deployments information (previously exported as json) - data: DataFrame - Contain positive hours to detections + df: DataFrame + POD loaded dataframe + comp: int + Number of components to apply to the GMM. Returns ------- DataFrame - A full period of time with positive and negative hours to detections. + A DataFrame of two columns : minute positive to feeding buzz or not and number of buzzes. """ - data["start_datetime"] = to_datetime(data["start_datetime"], utc=True) - meta["start_datetime"] = to_datetime(meta["start_datetime"], utc=True) - - deploy_detec = data["name"].unique() - df_filtered = meta[meta["name"].isin(deploy_detec)] - - output = df_filtered.merge( - data[["name", "start_datetime", "DPM", "Nfiltered"]], - on=["name", "start_datetime"], - how="outer", - ) - output["DPM"] = output["DPM"].fillna(0) - output["Nfiltered"] = output["Nfiltered"].fillna(0) - - output["Day"] = output["start_datetime"].dt.day - output["Month"] = output["start_datetime"].dt.month - output["Year"] = output["start_datetime"].dt.year - output["hour"] = output["start_datetime"].dt.hour - - return output - - -def feeding_buzz(df: DataFrame, species: str) -> DataFrame: - """Process a CPOD/FPOD feeding buzz detection file. - - Gives the feeding buzz duration, depending on the studied species. + df, _, _ = fit_gmm(df, comp) - Parameters - ---------- - df: DataFrame - Path to cpod.exe feeding buzz file - species: str - Select the species to use between porpoise and Commerson's dolphin + df["Buzz"] = nan + df.loc[df["cluster"] == 0, "Buzz"] = 1 + df["start_datetime"] = df["Datetime"].dt.floor("min") - Returns - ------- - DataFrame - Containing all ICIs for every positive minutes to clicks + df_buzz = df.groupby("start_datetime")["Buzz"].sum().reset_index() + df_buzz["Foraging"] = to_numeric(df_buzz["Buzz"] != 0, downcast="integer").astype(int) + return df_buzz - """ - df.columns = df.columns.str.upper() - df["MICROSEC"] = df["MICROSEC"] / 1e6 - col = "DATE HEURE MINUTE" - col2 = "HEURE MINUTE" - if col in df.columns: - df[["DATE", "HEURE", "MINUTE"]] = df[col].str.split(" ", expand=True) - df["Time"] = (df["DATE"].astype(str) + " " + - df["HEURE"].astype(str) + ":" + - df["MINUTE"].astype(str) + ":" + - df["MICROSEC"].astype(str)) - df["Time"] = to_datetime(df["Time"], dayfirst=True) - elif col2 in df.columns: - df[["HEURE", "MINUTE"]] = df[col2].str.split(" ", expand=True) - df["Time"] = (df["DATE"].astype(str) + " " + - df["HEURE"].astype(str) + ":" + - df["MINUTE"].astype(str) + ":" + - df["MICROSEC"].astype(str)) - df["Time"] = to_datetime(df["Time"], dayfirst=True) - else: - df["Time"] = (df["MINUTE"].astype(str) + ":" + df["MICROSEC"].astype(str)) - df["Time"] = to_datetime(df["Time"], dayfirst=True) - df = df.sort_values(by="Time").reset_index(drop=True) - df["ICI"] = df["Time"].diff().dt.total_seconds() +def plot_gmm_ici(df: DataFrame, comp: int) -> tuple[plt.Figure, plt.Axes]: + """Plot a histogram of log ICIs overlaid with GMM components and total mixture.""" + df, ici_log, gmm = fit_gmm(df, comp) - df["Buzz"] = 0 - if species == "Porpoise": - feeding_idx = df.index[df["ICI"] < 0.01] - else: - feeding_idx = df.index[df["ICI"] >= 0.005] + x_flat = sort(ici_log.flatten()) + x_range = linspace(ici_log.min(), ici_log.max(), 2000) - df.loc[feeding_idx, "Buzz"] = 1 - df.loc[feeding_idx - 1, "Buzz"] = 1 - df.loc[df.index < 0, "Buzz"] = 0 + fig, ax = plt.subplots(figsize=(12, 7)) + ax.hist( + ici_log, bins=200, histtype="bar", density=True, + alpha=0.6, color="lightgray", edgecolor="black", linewidth=0.5, + ) - df["start_datetime"] = df["Time"].dt.floor("min") - df["start_datetime"] = to_datetime(df["start_datetime"], dayfirst=False, utc=True) - f = df.groupby(["start_datetime"])["Buzz"].sum().reset_index() + lines = [] + for idx in range(comp): + mean, std, weight = gmm.means_[idx, 0], sqrt(gmm.covariances_[idx, 0, 0]), gmm.weights_[idx] + (line,) = ax.plot( + x_flat, weight * stats.norm.pdf(x_flat, mean, std), + label=f"(μ={mean:.2f}, σ={std:.2f})", + ) + lines.append(line) - f["Foraging"] = (f["Buzz"] != 0).astype(int) + (mix_line,) = ax.plot( + x_range, _mixture_density(gmm, x_range), + linewidth=2, color="black", linestyle="--", label="Total mixture", alpha=0.7, + ) + lines.append(mix_line) - return f + ax.set(xlabel="Log ICI (log minutes)", ylabel="Density", title="GMM clustering of Inter-Click Intervals") + ax.legend(handles=lines) + ax.grid(True, alpha=0.3, linestyle="--") + plt.tight_layout() + plt.show() + return fig, ax -def assign_daytime( - df: DataFrame, -) -> DataFrame: - """Assign datetime categories to events. +def process_timelost(df: DataFrame, threshold: int = 0) -> Series[Any]: + """Process TimeLost DataFrame. - Categorize daytime of the detection (among 4 categories). + Returns relevant columns and reshape into hourly data. Parameters ---------- df: DataFrame - Contains positive hours to detections. + All your Environmental data files. + threshold: float + TimeLost threshold. Returns ------- - DataFrame - The same dataframe with the column daytime. + %TimeLost DataFrame. """ - start = df.iloc[0]["Time"] - stop = df.iloc[-1]["Time"] - lat, lon = get_coordinates() - _, _, dawn, day, dusk, night = get_sun_times(start, stop, lat, lon) - dawn = Series(dawn, name="dawn") - day = Series(day, name="day") - dusk = Series(dusk, name="dusk") - night = Series(night, name="night") - jour = concat([day, night, dawn, dusk], axis=1) - - for i, row in df.iterrows(): - dpm_i = row["Time"] - if notna(dpm_i): # Check if time is not NaN - jour_i = jour[ - (jour["dusk"].dt.year == dpm_i.year) & - (jour["dusk"].dt.month == dpm_i.month) & - (jour["dusk"].dt.day == dpm_i.day) - ] - if not jour_i.empty: # Ensure there"s a matching row - jour_i = jour_i.iloc[0] # Extract first match - if dpm_i <= jour_i["day"]: - df.loc[i, "REGIME"] = 1 - elif dpm_i < jour_i["dawn"]: - df.loc[i, "REGIME"] = 2 - elif dpm_i < jour_i["dusk"]: - df.loc[i, "REGIME"] = 3 - elif dpm_i > jour_i["night"]: - df.loc[i, "REGIME"] = 1 - elif dpm_i > jour_i["dusk"]: - df.loc[i, "REGIME"] = 4 - else: - df.loc[i, "REGIME"] = 1 - - return df - + if threshold not in range(101): + msg = "Threshold must integer between 0 and 100." + raise ValueError(msg) + + df["Datetime"] = df["Datetime"].dt.floor("h") + cols_to_drop = [ + col + for col in df.columns + if col + not in { + "File", + "Datetime", + "Temp", + "Angle", + "%TimeLost", + "Deploy", + } + ] + return ( + df[df["%TimeLost"] <= threshold] + .drop( + columns=cols_to_drop, + ) + .sort_values(["Datetime"]) + .reset_index(drop=True) + ) -def process_files_in_folder(folder_path: Path, species: str) -> DataFrame: - """Process a folder containing all CPOD/FPOD feeding buzz detection files. - Apply the feeding buzz function to these files. +def create_matrix( + df: DataFrame, + group_cols: list, + agg_cols: list, +) -> DataFrame: + """Create a stats matrix (mean & std). Parameters ---------- - folder_path: Path - Path to the folder. - species: str - Select the species to use between porpoise and Commerson's dolphin + df : DataFrame + Extended frame with raw data to calculate stats for + group_cols : list + Additional columns to group by + agg_cols : list + Columns to aggregate Returns ------- - DataFrame - Compiled feeding buzz detection positive minutes. + Give a matrix of the data in [agg_cols] grouped by [group_cols]. """ - all_files = list(Path(folder_path).rglob("*.txt")) - all_data = [] - - for file in all_files: - file_path = folder_path / file - df = read_csv(file_path, sep="\t") - processed_df = feeding_buzz(df, species) - processed_df["file"] = file - all_data.append(processed_df) - - return concat(all_data, ignore_index=True) + matrix = df.groupby(group_cols).agg({col: ["mean", "std"] for col in agg_cols}) + matrix = matrix.reset_index() - -colors = { - "DY1": "#118B50", - "DY2": "#5DB996", - "DY3": "#B0DB9C", - "DY4": "#E3F0AF", - "CA4": "#5EABD6", - "Walde": "#FFB4B4", -} - - -def extract_site(df: DataFrame) -> DataFrame: - """Create new columns: site.name and campaign.name, in order to match the metadata. - - Parameters - ---------- - df: DataFrame - All values concatenated - - Returns - ------- - DataFrame - The same dataframe with two additional columns. - - """ - df[["site.name", "campaign.name"]] = df["name"].str.split("_", expand=True) - return df + matrix.columns = group_cols + [ + f"{col}_{stat}" for col in agg_cols for stat in ["mean", "std"] + ] + return matrix -def percent_calc(data: DataFrame, time_unit: str | None = None) -> DataFrame: - """Calculate percentage of clicks, feeding buzzes and positive hours to detection. +def percent_calc( + data: DataFrame, + time_unit: str | None = None, +) -> DataFrame: + """Calculate the percentage of clicks, feeding buzzes and positive hours to detection. Computed on the entire effort and for every site. @@ -660,164 +504,157 @@ def percent_calc(data: DataFrame, time_unit: str | None = None) -> DataFrame: DataFrame """ - group_cols = ["site.name"] - if time_unit is not None: - group_cols.insert(0, time_unit) - - # Aggregate and compute metrics - df = data.groupby(group_cols).agg({ - "DPH": "sum", - "DPM": "sum", - "Day": "size", - "Foraging": "sum", - }).reset_index() - - df["%click"] = df["DPM"] * 100 / (df["Day"] * 60) - df["%DPH"] = df["DPH"] * 100 / df["Day"] - df["FBR"] = df["Foraging"] * 100 / df["DPM"] - df["%buzz"] = df["Foraging"] * 100 / (df["Day"] * 60) + df = ( + data + .groupby(time_unit) + .agg( + { + "DPh": "sum", + "dpm_count": "sum", + "Day": "size", + "Foraging": "sum", + }, + ) + .reset_index() + ) + + df["%click"] = df["dpm_count"] * 100 / (df["Day"] * 60) + df["%DPh"] = df["DPh"] * 100 / df["Day"] + df["FBR"] = df.apply( + lambda row: (row["Foraging"] * 100 / row["dpm_count"]) + if row["dpm_count"] > 0 + else 0, + axis=1, + ) + df["%buzzes"] = df["Foraging"] * 100 / (df["Day"] * 60) return df -def site_percent(df: DataFrame, metric: str) -> None: - """Plot a graph with percentage of minutes positive to detection for every site. +def percent_barplot(df: DataFrame, unit: str, metric: str) -> None: + """Plot a graph with the percentage of minutes positive to detection for every site. Parameters ---------- df: DataFrame All percentages grouped by site + unit: str + Time unit the data are grouped in metric: str - Type of percentage you want to show on the graph + Type of percentage shown on the graph """ - ax = sns.barplot(data=df, x="site.name", - y=metric, - hue="site.name", - dodge=False, - palette=colors, - ) - ax.set_title(f"{metric} per site") + fig, ax = plt.subplots() + ax.bar(df[unit].astype(str), df[metric], color="#0072b2") + ax.set_title(f"{metric} per {unit}") ax.set_ylabel(f"{metric}") - if metric == "%buzzes": + ax.set_xlabel(f"{unit}") + if metric in {"%buzzes", "FBR"}: for _, bar in enumerate(ax.patches): bar.set_hatch("/") + plt.setp(ax.get_xticklabels(), rotation=45) plt.show() -def year_percent(df: DataFrame, metric: str) -> None: - """Plot a graph with the percentage of minutes positive to detection per site/year. +def calendar( + data: DataFrame, +) -> None: + """Produce the calendar of the given data. Deployments and actual collection of data. Parameters ---------- - df: DataFrame - All percentages grouped by site and year - metric: str - Type of percentage you want to show on the graph + data: DataFrame + Custom file containing all beginning and end of deployment and recordings. """ - sites = df["site.name"].unique() - n_sites = len(sites) - fig, axs = plt.subplots(n_sites, 1, figsize=(14, 2.5 * n_sites), sharex=True) - if n_sites == 1: - axs = [axs] - for i, site in enumerate(sorted(sites)): - site_data = df[df["site.name"] == site] - ax = axs[i] - ax.bar(site_data["Year"], - site_data[metric], - label=f"Site {site}", - color=colors.get(site, "gray"), - ) - ax.set_title(f"Site {site}") - ax.set_ylim(0, max(df[metric]) + 0.2) - ax.set_ylabel(metric) - if i != 3: - ax.set_xlabel("") - else: - ax.set_xlabel("Year") - if metric == "%buzzes": - for _, bar in enumerate(ax.patches): - bar.set_hatch("/") - fig.suptitle(f"{metric} per year", fontsize=16) - plt.show() - - -def month_percent(df: DataFrame, metric: str) -> None: - """Plot a graph with the percentage of minutes positive to detection per site/month. - - Parameters - ---------- - df: DataFrame - All percentages grouped by site and month - metric: str - Type of percentage you want to show on the graph + for i in data["Site"].unique(): + mask = data["Site"] == i + data["start_recording"] = to_datetime(data["start_recording"]) + data["end_recording"] = to_datetime(data["end_recording"]) + data["start_deployment"] = to_datetime(data["start_deployment"]) + data["end_deployment"] = to_datetime(data["end_deployment"]) + + data.loc[ + mask & (data["start_recording"] < data["start_deployment"]), + "start_recording", + ] = data.loc[ + mask & (data["start_recording"] < data["start_deployment"]), + "start_deployment", + ] + + data.loc[ + mask & (data["end_recording"] > data["end_deployment"]), + "end_recording"] = data.loc[ + mask & (data["end_recording"] > data["end_deployment"]), "end_deployment"] + + data.loc[mask & (data["start_recording"] > data["end_recording"]), + ["start_recording", "end_recording"]] = None + data = data.sort_values(["Phase", "start_deployment"]).reset_index(drop=True) + + data["color"] = data["Site"].map(site_colors) + + # Create the figure + fig, ax = plt.subplots(figsize=(14, 4)) + + sites = sorted(data["Site"].unique(), reverse=True) + site_mapping = {site: idx for idx, site in enumerate(sites)} + + for _, row in data.iterrows(): + y_pos = site_mapping[row["Site"]] + ax.broken_barh( + [ + ( + row["start_deployment"], + row["end_deployment"] - row["start_deployment"], + ), + ], + (y_pos - 0.3, 0.6), + facecolors="#F5F5F5", + edgecolors="black", + linewidth=0.8, + ) - """ - sites = df["site.name"].unique() - n_sites = len(sites) - fig, axs = plt.subplots(n_sites, 1, figsize=(14, 2.5 * n_sites), sharex=True) - if n_sites == 1: - axs = [axs] - for i, site in enumerate(sorted(sites)): - site_data = df[df["site.name"] == site] - ax = axs[i] - ax.bar(site_data["Month"], - site_data[metric], - label=f"Site {site}", - color=colors.get(site, "gray"), - ) - ax.set_title(f"{site} - Percentage of postitive to detection minutes per month") - ax.set_ylim(0, max(df[metric]) + 0.2) - ax.set_ylabel(metric) - ax.set_xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - ["Jan", "Feb", "Mar", "Apr", "May", "Jun", - "Jul", "Agu", "Sep", "Oct", "Nov", "Dec", - ], - ) - if i != 3: - ax.set_xlabel("") - else: - ax.set_xlabel("Months") - if metric == "%buzzes": - for _, bar in enumerate(ax.patches): - bar.set_hatch("/") - fig.suptitle(f"{metric} per month", fontsize=16) + if (notna(row["start_recording"]) and notna(row["end_recording"]) and + row["end_recording"] > row["start_recording"]): + ax.broken_barh( + [(row["start_recording"], + row["end_recording"] - row["start_recording"])], + (y_pos - 0.15, 0.3), + facecolors=row["color"], + edgecolors="black", + linewidth=0.8, + ) + + ax.set_yticks(range(len(sites))) + ax.set_yticklabels(sites, fontsize=15) + + plt.xticks(fontsize=15) + plt.tight_layout() plt.show() -def hour_percent(df: DataFrame, metric: str) -> None: - """Plot a graph with the percentage of minutes positive to detection per site/hour. +def matrice_hist(df: DataFrame, unit: str, metric: str) -> None: + """Plot a graph with the percentage of minutes positive to detection for every site. Parameters ---------- df: DataFrame - All percentages grouped by site and hour + All percentages grouped by site + unit: str + Time unit you want to group your data in metric: str Type of percentage you want to show on the graph """ - sites = df["site.name"].unique() - n_sites = len(sites) - fig, axs = plt.subplots(n_sites, 1, figsize=(14, 2.5 * n_sites), sharex=True) - if n_sites == 1: - axs = [axs] - for i, site in enumerate(sorted(sites)): - site_data = df[df["site.name"] == site] - ax = axs[i] - ax.bar(site_data["hour"], - site_data[metric], - label=f"Site {site}", - color=colors.get(site, "gray"), - ) - ax.set_title(f"Site {site} - Percentage of positive to detection per hour") - ax.set_ylim(0, max(df[metric]) + 0.2) - ax.set_ylabel(metric) - if i != 3: - ax.set_xlabel("") - else: - ax.set_xlabel("Hour") - if metric == "%buzzes": - for _, bar in enumerate(ax.patches): - bar.set_hatch("/") - fig.suptitle(f"{metric} per hour", fontsize=16) + fig, ax = plt.subplots() + ax.bar(df[unit], df[f"{metric}_mean"], color="#0072b2") + ax.set_xlabel(f"{unit}") + ax.set_ylabel(f"{metric}") + plt.errorbar(df[unit], df[f"{metric}_mean"], df[f"{metric}_std"], + fmt=".", color="Black", elinewidth=2, capthick=10, + errorevery=1, alpha=0.5, ms=4, capsize=2) + ax.set_ylim(0, max(df[f"{metric}_mean"] + df[f"{metric}_std"]) * 1.1) + if metric in {"%buzzes", "FBR"}: + for _, bar in enumerate(ax.patches): + bar.set_hatch("/") + plt.setp(ax.get_xticklabels(), rotation=45, ha="right") plt.show() diff --git a/src/post_processing/utils/glider_utils.py b/src/post_processing/utils/glider_utils.py index 626371c..aaa0224 100644 --- a/src/post_processing/utils/glider_utils.py +++ b/src/post_processing/utils/glider_utils.py @@ -175,7 +175,7 @@ def load_glider_nav(directory: Path) -> DataFrame: msg = f"Directory '{directory}' does not exist." raise FileNotFoundError(msg) - file = [f for f in directory.glob("*.gz") if "gli" in f.name] + file = [f for f in directory.rglob("*.gz") if "gli" in f.name] if not len(file) > 0: msg = f"Directory '{directory}' does not contain '.gz' files." diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 8f84334..35f3a98 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -4,7 +4,7 @@ import logging from collections import Counter -from itertools import cycle +from itertools import cycle, pairwise from typing import TYPE_CHECKING import matplotlib.pyplot as plt @@ -31,7 +31,8 @@ get_labels_and_annotators, get_sun_times, get_time_range_and_bin_size, - timedelta_to_str, round_begin_end_timestamps, + round_begin_end_timestamps, + timedelta_to_str, ) from post_processing.utils.filtering_utils import ( filter_by_annotator, @@ -237,6 +238,7 @@ def scatter( season = kwargs.get("season", False) coordinates = kwargs.get("coordinates", False) effort = kwargs.get("effort", False) + legend = kwargs.get("legend", False) _prepare_timeline_plot( df=df, @@ -280,6 +282,7 @@ def scatter( shade_no_effort( ax=ax, observed=effort, + legend=legend, ) @@ -321,6 +324,7 @@ def heatmap(df: DataFrame, show_rise_set = kwargs.get("show_rise_set", False) season = kwargs.get("season", False) coordinates = kwargs.get("coordinates", False) + effort = kwargs.get("effort", False) begin = time_range[0] end = time_range[-1] @@ -355,13 +359,40 @@ def heatmap(df: DataFrame, if 0 <= c_idx < len(cell_bins) - 1: mat[dt.hour, c_idx] += 1 + if effort is not None: + sampled_dates = { + interval.left.date() + for interval in effort.counts.index + if effort.counts[interval] > 0 + } + + sampled = np.zeros((24, len(cell_bins) - 1), dtype=bool) + for col, (cell_start, _cell_end) in enumerate(pairwise(cell_bins)): + if cell_start.date() in sampled_dates: + sampled[:, col] = True + + unsampled_mask = ~sampled + else: + unsampled_mask = np.zeros((24, len(cell_bins) - 1), dtype=bool) + masked_mat = np.ma.array(mat, mask=unsampled_mask) + + base_cmap = ( + ax.get_figure().get_axes()[0].images[0].cmap + if ax.get_figure().get_axes()[0].images + else plt.cm.viridis + ) + + cmap = base_cmap.copy() + cmap.set_bad(color="white") + im = ax.imshow( - mat, + masked_mat, extent=(begin, end, 0, 24), vmin=0, vmax=mat.max(), aspect="auto", origin="lower", + cmap=cmap, ) if coordinates and season: @@ -579,7 +610,7 @@ def timeline( ax.grid(color="k", linestyle="-", linewidth=0.2) ax.set_yticks(np.arange(0, len(labels), 1)) - ax.set_yticklabels(labels[::-1]) + ax.set_yticklabels(labels) ax.set_xlabel("Date") ax.set_xlim( df["start_datetime"].min().floor("1d"), diff --git a/tests/test_fpod_utils.py b/tests/test_fpod_utils.py new file mode 100644 index 0000000..aec01a5 --- /dev/null +++ b/tests/test_fpod_utils.py @@ -0,0 +1,669 @@ +"""FPOD/ CPOD processing functions tests.""" +from pathlib import Path + +import pytest +import pytz +from pandas import DataFrame + +from post_processing.utils.fpod_utils import ( + load_pod_folder, + pod2aplose, +) + +CLICKS_CPOD = """Minute,microsec,cycles,SPL_Pa,kHz,Bandwidth,end kHz,Qn,TrN +25/1/2019 11:45,55643215,7,38,130,0,121,2,38 +25/1/2019 11:45,55707365,7,44,130,0,125,2,38 +25/1/2019 11:45,55770865,7,36,132,0,131,2,38 +25/1/2019 11:45,55830500,11,34,136,1,108,2,38 +25/1/2019 11:45,55890495,10,33,135,1,131,2,38 +""" + +CLICKS_FPOD = """File,Minute,microsec,ICI,TrnAvPRF,Ncyc,ClkKHZ,IPIbefore,IPIatMax,IPIplus1,IPIplus2,EndIPI,ClkIPIrange,maxPk,maxPkE,Pkminus1%,Pkplus1%,PkAt,AmpReversals,tRateScore,Qn,TrnIDn,ClassID,Log(PRF)*10 +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,64358756,40266515,10595,98,11,121,256,33,34,34,31,3,78,78,98,91,6,1,10,2,1,0,19 +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,64358756,40276675,10160,98,11,121,256,33,33,33,33,3,79,79,98,91,5,1,10,2,1,0,19 +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,64358756,40286600,9925,98,11,121,256,33,33,33,33,2,84,84,88,94,4,1,10,2,1,0,20 +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,64358756,40296440,9840,98,10,121,256,33,33,34,33,3,79,79,91,100,4,1,10,2,1,0,20 +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,64358756,40306520,10080,98,11,121,256,33,33,34,33,3,76,76,92,96,4,1,10,2,1,0,19 +""" + +TIMELOST = """File podN,ChunkEnd,Minute,Temp,Angle,MinutesON,NBHF_DPM,DPM,Nfiltered/m,kHz_continuous_noise,NBHFclx,DOL_DPM,DOLclx,SONAR_DPM,SONARclx,Nall/m,%TimeLost,%m SonarRisk,%mSediment noise,LandmarkSeq_total,avOpThreshold +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,6669,05/05/2022 10:59,64348499,21.4,0,0m ON,0,108,14,0,0,0,0,0,0,,,0,0,0,0 +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,6669,05/05/2022 11:59,64348559,21.4,0,0m ON,0,108,14,0,0,0,0,0,0,548.9,100,0,0,0,0 +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,6669,05/05/2022 12:59,64348619,22.4,0,0,0,81.6,60,0,0,0,0,0,0,0.2,100,0,0,0,0 +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,6669,05/05/2022 13:59,64348679,23,4,1.62,20,78,60,0,0,0,0,0,0,0,100,0,0,0,0 +CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3,6669,05/05/2022 14:59,64348739,23,3,0.28,0,78,60,0,0,0,0,0,0,0,100,0,0,0,0 +""" + + +@pytest.fixture +def pod_dataframe() -> DataFrame: + return DataFrame({ + "File": [ + "Site A ile Haute 2019 01 25 POD3055 file01.CP3", + "Site A ile Haute 2019 01 25 POD3055 file01.CP3", + "Site A ile Haute 2019 01 25 POD3055 file01.CP3", + "Site A ile Haute 2019 01 25 POD3055 file01.CP3", + "Site A ile Haute 2019 01 25 POD3055 file01.CP3", + ], + "podN": [6669, 6669, 6669, 6669, 6669], + "ChunkEnd": [ + "24/01/2019 06:17", + "24/01/2019 06:18", + "24/01/2019 06:19", + "24/01/2019 06:20", + "24/01/2019 06:21", + ], + "Minute": [64348546, 64348547, 64348548, 64348549, 64348550], + "DPM": [0, 1, 1, 0, 0], + "Nall": [0, 216, 75, 0, 28], + "MinsOn": [0, 1, 1, 1, 1], + }) + + +@pytest.fixture +def click_dataframe() -> DataFrame: + return DataFrame({ + "File": [ + "CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3", + "CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3", + "CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3", + "CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3", + "CETIROISEPHASE1POINTB 2022 05 05 FPOD_6669 file0.FP3", + ], + "microsec": [40255920, 40266515, 40276675, 40286600, 40296440], + "Minute": [64348546, 64348547, 64348548, 64348549, 64348550], + }) + + +@pytest.fixture +def pod_aplose(sample_df: DataFrame) -> DataFrame: + """Create a POD Dataframe for testing.""" + sample_df["type"] = "WEAK" + return sample_df + + +# csv_folder +def test_folder_single_csv(pod_dataframe: DataFrame, tmp_path: Path) -> None: + """Test processing a single CSV file.""" + csv_file = tmp_path / "pod_folder" / "pod_dataframe.csv" + csv_file.parent.mkdir(parents=True, exist_ok=True) + pod_dataframe.to_csv(csv_file, index=False) + result = load_pod_folder(csv_file.parent, ext="csv") + + assert isinstance(result, DataFrame) + assert "Deploy" in result.columns + assert all(result["Deploy"] == "pod_dataframe") + assert list(result.columns) == ["File", "podN", "ChunkEnd", "Minute", "DPM", + "Nall", "MinsOn", "Deploy", "Datetime"] + + +def test_folder_single_txt( + monkeypatch: pytest.MonkeyPatch, + click_dataframe: DataFrame, + tmp_path: Path) -> None: + """Test processing a single CSV file.""" + monkeypatch.setattr("post_processing.utils.fpod_utils.process_feeding_buzz", + lambda df, species: df) + txt_file = tmp_path / "click_folder" / "click_dataframe.txt" + txt_file.parent.mkdir(parents=True, exist_ok=True) + click_dataframe.to_csv(txt_file, index=False) + result = load_pod_folder(txt_file.parent, ext="txt") + + assert isinstance(result, DataFrame) + assert "Deploy" in result.columns + assert all(result["Deploy"] == "click_dataframe") + assert list(result.columns) == [ + "File", + "microsec", + "Minute", + "Deploy", + "Datetime", + ] + + +def test_folder_multiple(pod_dataframe: DataFrame, tmp_path: Path) -> None: + """Test processing multiple CSV files.""" + csv_file = tmp_path / "pod_folder" / "pod_dataframe1.csv", "pod_dataframe2.csv" + + +@pytest.mark.parametrize( + ("mocked_df", "should_raise"), + [ + pytest.param( + DataFrame({ + "ChunkEnd": ["01/01/2024 12:00"], + "DPM": [1], + "MinsOn": [30.0], + "microsec": [100], + }), + False, + id="valid-dpm-columns", + ), + pytest.param( + DataFrame({ + "ChunkEnd": ["01/01/2024 12:00"], + "%TimeLost": [0.1], + "Nall/m": [1.0], + "File": ["f1"], + "microsec": [100], + }), + False, + id="valid-timelost-columns", + ), + pytest.param( + DataFrame({ + "ChunkEnd": ["01/01/2024 12:00"], + "col1": [0.1], + "Nall/m": [1.0], + "File": ["f1"], + "microsec": [100], + }), + True, + id="invalid-missing-timelost", + ), + pytest.param( + DataFrame({ + "ChunkEnd": ["01/01/2024 12:00"], + "%TimeLost": [0.1], + "col1": [1.0], + "File": ["f1"], + "microsec": [100], + }), + True, + id="invalid-missing-nall", + ), + pytest.param( + DataFrame({ + "ChunkEnd": ["01/01/2024 12:00"], + "File": ["f1"], + "col1": [1], + "MinsOn": ["x"], + "microsec": [100], + }), + True, + id="invalid-missing-dpm", + ), + pytest.param( + DataFrame({ + "ChunkEnd": ["01/01/2024 12:00"], + "File": ["f1"], + "DPM": [1], + "col3": ["x"], + "microsec": [100], + }), + True, + id="invalid-missing-minson", + ), + pytest.param( + DataFrame({"col1": [1], "col2": [2], "col3": [3]}), + True, + id="invalid-no-required-columns", + ), + ], +) +def test_right_csv_format( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + mocked_df: DataFrame, + should_raise: bool, + ) -> None: + """Mocked read_csv to test load_pod_folder column validation.""" + fake_path = Path("fake/deploy_01.csv") + + monkeypatch.setattr(Path, "rglob", lambda self, pattern: [fake_path]) + monkeypatch.setattr("post_processing.utils.fpod_utils.find_delimiter", lambda f: ";") + monkeypatch.setattr("post_processing.utils.fpod_utils.read_csv", lambda *args, **kwargs: mocked_df) + + if should_raise: + with pytest.raises((ValueError, KeyError)): + load_pod_folder(Path("fake/folder"), "csv") + else: + result = load_pod_folder(Path("fake/folder"), "csv") + assert isinstance(result, DataFrame) + + +# pod2aplose +@pytest.fixture +def sample_df(): + """Create a sample POD DataFrame for testing.""" + return DataFrame({ + "ChunkEnd": ["15/01/2024 10:30", "15/01/2024 11:00", "15/01/2024 09:45"], + "deploy.name": ["deploy1", "deploy2", "deploy1"], + }) + + +@pytest.fixture +def timezone(): + """Return UTC timezone for testing.""" + return pytz.UTC + + +def test_pod2aplose_basic_structure(sample_df, timezone) -> None: + """Test that basic structure and required columns are present.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="test_dataset", + annotation="test_annotation", + annotator="test_annotator", + ) + + expected_columns = [ + "dataset", + "filename", + "start_time", + "end_time", + "start_frequency", + "end_frequency", + "annotation", + "annotator", + "start_datetime", + "end_datetime", + "deploy", + ] + + assert isinstance(result, DataFrame) + assert list(result.columns) == expected_columns + assert len(result) == len(sample_df) + + +def test_pod2aplose_dataset_propagation(sample_df, timezone) -> None: + """Test that dataset name is propagated to all rows.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="my_dataset", + annotation="click", + annotator="john", + ) + + assert all(result["dataset"] == "my_dataset") + + +def test_pod2aplose_annotation_propagation(sample_df, timezone) -> None: + """Test that annotation is propagated to all rows.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="dataset", + annotation="porpoise_click", + annotator="john", + ) + + assert all(result["annotation"] == "porpoise_click") + + +def test_pod2aplose_annotator_propagation(sample_df, timezone) -> None: + """Test that annotator is propagated to all rows.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="alice", + ) + + assert all(result["annotator"] == "alice") + + +def test_pod2aplose_default_bin_size(sample_df, timezone) -> None: + """Test default bin_size of 60 seconds.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john", + ) + + assert all(result["start_time"] == 0) + assert all(result["end_time"] == 60) + + +def test_pod2aplose_custom_bin_size(sample_df, timezone) -> None: + """Test custom bin_size parameter.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john", + bin_size=120, + ) + + assert all(result["start_time"] == 0) + assert all(result["end_time"] == 120) + + +def test_pod2aplose_frequency_values(sample_df, timezone) -> None: + """Test that frequency values are set to 0.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john", + ) + + assert all(result["start_frequency"] == 0) + assert all(result["end_frequency"] == 0) + + +def test_pod2aplose_is_box_values(sample_df, timezone) -> None: + """Test that is_box values are set to 0.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john", + ) + + assert all(result["is_box"] == 0) + + +def test_pod2aplose_deploy_name_preserved(sample_df, timezone) -> None: + """Test that deploy.name values are preserved from input.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john", + ) + + # After sorting, deploy.name should still be present + assert "deploy.name" in result.columns + assert len(result["deploy.name"]) == len(sample_df) + assert set(result["deploy.name"]) == {"deploy1", "deploy2"} + + +def test_pod2aplose_sorting_by_datetime(timezone) -> None: + """Test that rows are sorted by datetime.""" + df = DataFrame({ + "ChunkEnd": ["15/01/2024 12:00", "15/01/2024 10:00", "15/01/2024 11:00"], + "deploy.name": ["d1", "d2", "d3"], + }) + + result = pod2aplose( + df=df, tz=timezone, dataset_name="dataset", annotation="click", annotator="john" + ) + + # Check that deploy.name follows the sorted order (by time) + assert result["deploy.name"].tolist() == ["d2", "d3", "d1"] + + +def test_pod2aplose_datetime_formatting() -> None: + """Test that datetime strings are properly formatted.""" + df = DataFrame({"ChunkEnd": ["01/02/2024 14:30"], "deploy.name": ["deploy1"]}) + + result = pod2aplose( + df=df, + tz=pytz.UTC, + dataset_name="dataset", + annotation="click", + annotator="john", + bin_size=60, + ) + + # Check that datetime strings are present and not empty + assert len(result["start_datetime"].iloc[0]) > 0 + assert len(result["end_datetime"].iloc[0]) > 0 + assert len(result["filename"].iloc[0]) > 0 + + +def test_pod2aplose_end_datetime_offset(timezone) -> None: + """Test that end_datetime is offset by bin_size from start_datetime.""" + df = DataFrame({"ChunkEnd": ["15/01/2024 10:00"], "deploy.name": ["deploy1"]}) + + result = pod2aplose( + df=df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john", + bin_size=120, + ) + + # Both should be valid datetime strings + assert result["start_datetime"].iloc[0] != result["end_datetime"].iloc[0] + + +def test_pod2aplose_different_timezones() -> None: + """Test with different timezone.""" + df = DataFrame({"ChunkEnd": ["15/01/2024 10:00"], "deploy.name": ["deploy1"]}) + + tz_paris = pytz.timezone("Europe/Paris") + + result = pod2aplose( + df=df, tz=tz_paris, dataset_name="dataset", annotation="click", annotator="john" + ) + + assert len(result) == 1 + assert result["dataset"].iloc[0] == "dataset" + + +def test_pod2aplose_empty_dataframe(timezone) -> None: + """Test handling of empty DataFrame.""" + df = DataFrame({"ChunkEnd": [], "deploy.name": []}) + + result = pod2aplose( + df=df, tz=timezone, dataset_name="dataset", annotation="click", annotator="john" + ) + + assert len(result) == 0 + assert list(result.columns) == [ + "dataset", + "filename", + "start_time", + "end_time", + "start_frequency", + "end_frequency", + "annotation", + "annotator", + "start_datetime", + "end_datetime", + "is_box", + "deploy.name", + ] + + +def test_pod2aplose_single_row(timezone) -> None: + """Test with single row DataFrame.""" + df = DataFrame({"ChunkEnd": ["20/03/2024 15:45"], "deploy.name": ["single_deploy"]}) + + result = pod2aplose( + df=df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john", + bin_size=90, + ) + + assert len(result) == 1 + assert result["deploy.name"].iloc[0] == "single_deploy" + assert result["end_time"].iloc[0] == 90 + + +def test_pod2aplose_does_not_modify_original(sample_df, timezone) -> None: + """Test that the original DataFrame is not modified.""" + original_columns = sample_df.columns.tolist() + original_len = len(sample_df) + + pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john", + ) + + # Original DataFrame should be unchanged + assert sample_df.columns.tolist() == original_columns + assert len(sample_df) == original_len + assert "_temp_dt" not in sample_df.columns + + +def test_pod2aplose_large_bin_size(sample_df, timezone) -> None: + """Test with large bin_size value.""" + result = pod2aplose( + df=sample_df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john", + bin_size=3600, # 1 hour + ) + + assert all(result["end_time"] == 3600) + + +def test_pod2aplose_index_reset(timezone) -> None: + """Test that index is properly reset after sorting.""" + df = DataFrame({ + "ChunkEnd": ["15/01/2024 12:00", "15/01/2024 10:00"], + "deploy.name": ["d1", "d2"] + }) + + result = pod2aplose( + df=df, + tz=timezone, + dataset_name="dataset", + annotation="click", + annotator="john" + ) + + # Index should be 0, 1 after reset + assert result.index.tolist() == [0, 1] + +# meta_cut_aplose + + +# build_range + + +# feeding_buzz + + +# assign_daytime + + +# fb_folder +# def test_fb_folder_non_existent() -> None: +# with pytest.raises(FileNotFoundError): +# txt_folder(Path("/non/existent/folder")) +# +# def test_fb_folder_no_files(tmp_path: pytest.fixture) -> None: +# with pytest.raises(ValueError, match="No .txt files found"): +# txt_folder(tmp_path) + +# extract_site +# def test_extract_site(self) -> None: +# input_data = [ +# {"deploy.name":"Walde_Phase46"}, +# {"deploy.name":"Site A Ile Haute_Phase8"}, +# {"deploy.name":"Site B Ile Heugh_Phase9"}, +# {"deploy.name":"Point E_Phase 4"}, +# ] +# expected_site = [ +# "Walde", +# "Site A Ile Haute", +# "Site B Ile Heugh", +# "Point E", +# ] +# expected_campaign = [ +# "Phase46", +# "Phase8", +# "Phase9", +# "Phase 4", +# ] +# +# for variant, (input_row, site, campaign) in enumerate( +# zip(input_data, expected_site, expected_campaign, strict=False), start=1): +# with self.subTest( +# f"variation #{variant}", +# deploy_name=input_row["deploy.name"], +# expected_site=site, +# expected_campaign=campaign, +# ): +# df = DataFrame([input_row]) +# result = extract_site(df) +# actual_site = result["site.name"].iloc[0] +# actual_campaign = result["campaign.name"].iloc[0] +# +# error_message_site = ( +# f'Called extract_site() with deploy.name="{input_row["deploy.name"]}". ' +# f'The function returned site.name="{actual_site}", but the test ' +# f'expected "{expected_site}".' +# ) +# +# error_message_campaign = ( +# f'Called extract_site() with deploy.name="{input_row["deploy.name"]}". ' +# f'The function returned campaign.name="{actual_campaign}", but the test' +# f'expected "{expected_campaign}".' +# ) +# +# assert actual_site == expected_site, error_message_site +# assert actual_campaign == expected_campaign, error_message_campaign +# +# assert "deploy.name" in result.columns +# assert "value" in result.columns + +# csv_folder +# def test_csv_folder_non_existent() -> None: +# with pytest.raises(FileNotFoundError): +# csv_folder(Path("/non/existent/folder")) +# +# def test_csv_folder_no_files(tmp_path: pytest.fixture) -> None: +# with pytest.raises(ValueError, match="No .csv files found"): +# csv_folder(tmp_path) + +# is_dpm_col + + +# pf_datetime + + +# build_aggregation_dict + + +# resample_dpm + + +# parse_timestamps +# def test_parse_timestamps() -> None: +# df = DataFrame({"date": ["2024-01-01T10:00:00", "06/01/2025 08:35"]}) +# result = parse_timestamps(df, "date") +# expected = DataFrame({"date": ["2024-01-01 10:00:00", +# "2025-01-06 08:35:00"]}).astype("datetime64[ns]") +# assert_frame_equal(result, expected) + +# deploy_period +# def test_deploy_period() -> None: +# df = DataFrame( +# { +# "deploy.name": ["A", "A", "B"], +# "start_datetime": [ +# datetime(2024, 1, 1, 10, 0, tzinfo=datetime.timezone.utc), +# datetime(2024, 1, 2, 15, 30, tzinfo=datetime.timezone.utc), +# datetime(2024, 1, 3, 8, 0, tzinfo=datetime.timezone.utc), +# ], +# }) +# +# expected = DataFrame( +# { +# "deploy.name": ["A", "B"], +# "Début": [ +# datetime(2024, 1, 1, 10, 0, tzinfo=datetime.timezone.utc), +# datetime(2024, 1, 3, 8, 0, tzinfo=datetime.timezone.utc), +# ], +# "Fin": [ +# datetime(2024, 1, 2, 15, 30, tzinfo=datetime.timezone.utc), +# datetime(2024, 1, 3, 8, 0, tzinfo=datetime.timezone.utc), +# ], +# }) +# result = deploy_period(df) +# assert_frame_equal(result, expected) + +# actual_data \ No newline at end of file diff --git a/user_case/config.py b/user_case/config.py new file mode 100644 index 0000000..bf74b37 --- /dev/null +++ b/user_case/config.py @@ -0,0 +1,11 @@ +from pathlib import Path + +import yaml + +config_file = Path(r"C:\Users\fouinel\PycharmProjects\OSmOSE_post_processing\user_case\config.yaml") + +config = yaml.safe_load(config_file.read_text()) if config_file.exists() else {} + +site_colors = config.get("site_colors", {"Site A Haute": "#118B50", "Site B Heugh": "#5DB996", "Site C Chat": "#B0DB9C", "Site D Simone": "#E3F0AF", "CA4": "#80D8C3", "Walde": "#4DA8DA", "Point C": "#932F67", "Point D": "#D92C54", "Point E": "#DDDEAB", "Point F": "#8ABB6C", "Point G": "#456882"}) + +season_color = config.get("season_color", {"spring": "green", "summer": "orange", "autumn": "brown", "winter": "blue"}) \ No newline at end of file diff --git a/user_case/config.yaml b/user_case/config.yaml new file mode 100644 index 0000000..6da6e08 --- /dev/null +++ b/user_case/config.yaml @@ -0,0 +1,19 @@ +site_colors: + CA4: '#80D8C3' + Point C: '#932F67' + Point D: '#D92C54' + Point E: '#DDDEAB' + Point F: '#4E61D3' + Point G: '#456882' + Site A Haute: '#118B50' + Site B Heugh: '#5DB996' + Site C Chat: '#B0DB9C' + Site D Simone: '#E3F0AF' + Walde: '#4DA8DA' + 02Mn Sud Cotentin: '#FB4141' + +season_color : + spring: "green" + summer: "orange" + autumn: "brown" + winter: "blue" \ No newline at end of file diff --git a/uv.lock b/uv.lock index e26f84b..27f35d3 100644 --- a/uv.lock +++ b/uv.lock @@ -600,6 +600,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, + { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -610,6 +611,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, + { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -620,6 +622,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -805,6 +808,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "joblib" +version = "1.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, +] + [[package]] name = "json5" version = "0.12.1" @@ -1627,6 +1639,7 @@ dependencies = [ { name = "pypamguard" }, { name = "pytz" }, { name = "pyyaml" }, + { name = "scikit-learn" }, { name = "scipy" }, { name = "seaborn" }, { name = "soundfile" }, @@ -1660,6 +1673,7 @@ requires-dist = [ { name = "pypamguard", specifier = ">=1.0.0" }, { name = "pytz", specifier = ">=2025.1" }, { name = "pyyaml", specifier = ">=6.0.2" }, + { name = "scikit-learn", specifier = ">=1.8.0" }, { name = "scipy", specifier = ">=1.15.2" }, { name = "seaborn", specifier = ">=0.12.2" }, { name = "soundfile", specifier = ">=0.12.1" }, @@ -2128,6 +2142,50 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/a8/001d4a7c2b37623a3fd7463208267fb906df40ff31db496157549cfd6e72/ruff-0.12.11-py3-none-win_arm64.whl", hash = "sha256:bae4d6e6a2676f8fb0f98b74594a048bae1b944aab17e9f5d504062303c6dbea", size = 12135290, upload-time = "2025-08-28T13:59:06.933Z" }, ] +[[package]] +name = "scikit-learn" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/74/e6a7cc4b820e95cc38cf36cd74d5aa2b42e8ffc2d21fe5a9a9c45c1c7630/scikit_learn-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5fb63362b5a7ddab88e52b6dbb47dac3fd7dafeee740dc6c8d8a446ddedade8e", size = 8548242, upload-time = "2025-12-10T07:07:51.568Z" }, + { url = "https://files.pythonhosted.org/packages/49/d8/9be608c6024d021041c7f0b3928d4749a706f4e2c3832bbede4fb4f58c95/scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:5025ce924beccb28298246e589c691fe1b8c1c96507e6d27d12c5fadd85bfd76", size = 8079075, upload-time = "2025-12-10T07:07:53.697Z" }, + { url = "https://files.pythonhosted.org/packages/dd/47/f187b4636ff80cc63f21cd40b7b2d177134acaa10f6bb73746130ee8c2e5/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4496bb2cf7a43ce1a2d7524a79e40bc5da45cf598dbf9545b7e8316ccba47bb4", size = 8660492, upload-time = "2025-12-10T07:07:55.574Z" }, + { url = "https://files.pythonhosted.org/packages/97/74/b7a304feb2b49df9fafa9382d4d09061a96ee9a9449a7cbea7988dda0828/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0bcfe4d0d14aec44921545fd2af2338c7471de9cb701f1da4c9d85906ab847a", size = 8931904, upload-time = "2025-12-10T07:07:57.666Z" }, + { url = "https://files.pythonhosted.org/packages/9f/c4/0ab22726a04ede56f689476b760f98f8f46607caecff993017ac1b64aa5d/scikit_learn-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:35c007dedb2ffe38fe3ee7d201ebac4a2deccd2408e8621d53067733e3c74809", size = 8019359, upload-time = "2025-12-10T07:07:59.838Z" }, + { url = "https://files.pythonhosted.org/packages/24/90/344a67811cfd561d7335c1b96ca21455e7e472d281c3c279c4d3f2300236/scikit_learn-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:8c497fff237d7b4e07e9ef1a640887fa4fb765647f86fbe00f969ff6280ce2bb", size = 7641898, upload-time = "2025-12-10T07:08:01.36Z" }, + { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770, upload-time = "2025-12-10T07:08:03.251Z" }, + { url = "https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458, upload-time = "2025-12-10T07:08:05.336Z" }, + { url = "https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341, upload-time = "2025-12-10T07:08:07.732Z" }, + { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022, upload-time = "2025-12-10T07:08:09.862Z" }, + { url = "https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409, upload-time = "2025-12-10T07:08:12.028Z" }, + { url = "https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760, upload-time = "2025-12-10T07:08:13.688Z" }, + { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045, upload-time = "2025-12-10T07:08:15.215Z" }, + { url = "https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324, upload-time = "2025-12-10T07:08:17.561Z" }, + { url = "https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651, upload-time = "2025-12-10T07:08:19.952Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045, upload-time = "2025-12-10T07:08:22.11Z" }, + { url = "https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994, upload-time = "2025-12-10T07:08:23.943Z" }, + { url = "https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518, upload-time = "2025-12-10T07:08:25.71Z" }, + { url = "https://files.pythonhosted.org/packages/24/05/1af2c186174cc92dcab2233f327336058c077d38f6fe2aceb08e6ab4d509/scikit_learn-1.8.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c22a2da7a198c28dd1a6e1136f19c830beab7fdca5b3e5c8bba8394f8a5c45b3", size = 8528667, upload-time = "2025-12-10T07:08:27.541Z" }, + { url = "https://files.pythonhosted.org/packages/a8/25/01c0af38fe969473fb292bba9dc2b8f9b451f3112ff242c647fee3d0dfe7/scikit_learn-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:6b595b07a03069a2b1740dc08c2299993850ea81cce4fe19b2421e0c970de6b7", size = 8066524, upload-time = "2025-12-10T07:08:29.822Z" }, + { url = "https://files.pythonhosted.org/packages/be/ce/a0623350aa0b68647333940ee46fe45086c6060ec604874e38e9ab7d8e6c/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:29ffc74089f3d5e87dfca4c2c8450f88bdc61b0fc6ed5d267f3988f19a1309f6", size = 8657133, upload-time = "2025-12-10T07:08:31.865Z" }, + { url = "https://files.pythonhosted.org/packages/b8/cb/861b41341d6f1245e6ca80b1c1a8c4dfce43255b03df034429089ca2a2c5/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fb65db5d7531bccf3a4f6bec3462223bea71384e2cda41da0f10b7c292b9e7c4", size = 8923223, upload-time = "2025-12-10T07:08:34.166Z" }, + { url = "https://files.pythonhosted.org/packages/76/18/a8def8f91b18cd1ba6e05dbe02540168cb24d47e8dcf69e8d00b7da42a08/scikit_learn-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:56079a99c20d230e873ea40753102102734c5953366972a71d5cb39a32bc40c6", size = 8096518, upload-time = "2025-12-10T07:08:36.339Z" }, + { url = "https://files.pythonhosted.org/packages/d1/77/482076a678458307f0deb44e29891d6022617b2a64c840c725495bee343f/scikit_learn-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:3bad7565bc9cf37ce19a7c0d107742b320c1285df7aab1a6e2d28780df167242", size = 7754546, upload-time = "2025-12-10T07:08:38.128Z" }, + { url = "https://files.pythonhosted.org/packages/2d/d1/ef294ca754826daa043b2a104e59960abfab4cf653891037d19dd5b6f3cf/scikit_learn-1.8.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:4511be56637e46c25721e83d1a9cea9614e7badc7040c4d573d75fbe257d6fd7", size = 8848305, upload-time = "2025-12-10T07:08:41.013Z" }, + { url = "https://files.pythonhosted.org/packages/5b/e2/b1f8b05138ee813b8e1a4149f2f0d289547e60851fd1bb268886915adbda/scikit_learn-1.8.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:a69525355a641bf8ef136a7fa447672fb54fe8d60cab5538d9eb7c6438543fb9", size = 8432257, upload-time = "2025-12-10T07:08:42.873Z" }, + { url = "https://files.pythonhosted.org/packages/26/11/c32b2138a85dcb0c99f6afd13a70a951bfdff8a6ab42d8160522542fb647/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2656924ec73e5939c76ac4c8b026fc203b83d8900362eb2599d8aee80e4880f", size = 8678673, upload-time = "2025-12-10T07:08:45.362Z" }, + { url = "https://files.pythonhosted.org/packages/c7/57/51f2384575bdec454f4fe4e7a919d696c9ebce914590abf3e52d47607ab8/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15fc3b5d19cc2be65404786857f2e13c70c83dd4782676dd6814e3b89dc8f5b9", size = 8922467, upload-time = "2025-12-10T07:08:47.408Z" }, + { url = "https://files.pythonhosted.org/packages/35/4d/748c9e2872637a57981a04adc038dacaa16ba8ca887b23e34953f0b3f742/scikit_learn-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:00d6f1d66fbcf4eba6e356e1420d33cc06c70a45bb1363cd6f6a8e4ebbbdece2", size = 8774395, upload-time = "2025-12-10T07:08:49.337Z" }, + { url = "https://files.pythonhosted.org/packages/60/22/d7b2ebe4704a5e50790ba089d5c2ae308ab6bb852719e6c3bd4f04c3a363/scikit_learn-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f28dd15c6bb0b66ba09728cf09fd8736c304be29409bd8445a080c1280619e8c", size = 8002647, upload-time = "2025-12-10T07:08:51.601Z" }, +] + [[package]] name = "scipy" version = "1.16.0" @@ -2442,6 +2500,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/9e/2064975477fdc887e47ad42157e214526dcad8f317a948dee17e1659a62f/terminado-0.18.1-py3-none-any.whl", hash = "sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0", size = 14154, upload-time = "2024-03-12T14:34:36.569Z" }, ] +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + [[package]] name = "tinycss2" version = "1.4.0"