From 9bf82a9602c9036239f55863b90abdcb65214d6f Mon Sep 17 00:00:00 2001 From: Adil Date: Tue, 24 Feb 2026 00:49:04 +0530 Subject: [PATCH] feat: warn when cohort downsampling occurs (fixes #912) When n_samples > max_cohort_size, the dataset is randomly downsampled without notification. This adds a UserWarning explaining the original and new sample counts, and how to disable downsampling. No change to logic, defaults, or returned data. --- malariagen_data/anoph/snp_data.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py index da000dd96..0da42d55e 100644 --- a/malariagen_data/anoph/snp_data.py +++ b/malariagen_data/anoph/snp_data.py @@ -1,3 +1,4 @@ +import warnings from functools import lru_cache from typing import Any, Dict, List, Optional, Tuple, Union @@ -1253,6 +1254,12 @@ def _snp_calls( if max_cohort_size is not None: n_samples = ds.sizes["samples"] if n_samples > max_cohort_size: + warnings.warn( + f"Cohort downsampled from {n_samples} to {max_cohort_size} " + "samples. Set max_cohort_size=None to disable downsampling.", + UserWarning, + stacklevel=2, + ) rng = np.random.default_rng(seed=random_seed) loc_downsample = rng.choice( n_samples, size=max_cohort_size, replace=False