From 75271e44aa5051b8f9d80b4b0eeeed1a2d2ea936 Mon Sep 17 00:00:00 2001 From: Hamidreza Keshavarz <32555614+hamidkm9@users.noreply.github.com> Date: Tue, 6 Jan 2026 23:23:25 +0100 Subject: [PATCH 1/4] Scalable Kernel and Stochastic Boosting --- README.md | 117 ++++- src/linearboost/linear_boost.py | 766 ++++++++++++++++++++++++++++++-- 2 files changed, 850 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index eb7ec38..eaf33bc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # LinearBoost Classifier -![Lastest Release](https://img.shields.io/badge/release-v0.1.3-green) +![Lastest Release](https://img.shields.io/badge/release-v0.1.5-green) [![PyPI Version](https://img.shields.io/pypi/v/linearboost)](https://pypi.org/project/linearboost/) ![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue) [![PyPI Downloads](https://static.pepy.tech/badge/linearboost)](https://pepy.tech/projects/linearboost) @@ -30,6 +30,81 @@ Key Features: - Exceptional Speed: Blazing fast training and inference times - Resource Efficient: Low memory usage, ideal for large datasets +--- + +## 🚀 New in Version 0.1.5 + +The latest release introduces major architectural improvements designed for **scalability**, **robustness on imbalanced data**, and **training speed**. + +### ⚡ Scalable Kernel Approximation + +LinearBoost now supports **Kernel Approximation** via `kernel_approx='rff'` or `kernel_approx='nystrom'`. + +**Why it matters:** Previously, non-linear kernels required computing a full \(O(n^2)\) kernel matrix, which is memory-intensive for large datasets. + +**New Capability:** You can now map inputs to a lower-dimensional feature space using: +- **Random Fourier Features (RFF)** — for RBF kernels +- **Nyström Approximation** — for any kernel type + +This enables linear time complexity while retaining non-linear decision boundaries. + +```python +# Example: Using kernel approximation for scalable non-linear classification +clf = LinearBoostClassifier( + kernel='rbf', + kernel_approx='rff', # or 'nystrom' + n_components=256 +) +``` + +### 🎯 Stochastic Boosting & Regularization + +Advanced regularization techniques to prevent overfitting and reduce variance: + +- **Subsampling (`subsample`)**: Enables Stochastic Gradient Boosting by training each estimator on a random fraction of the training data. +- **Shrinkage (`shrinkage`)**: Scales the contribution of each new estimator (learning rate decay), effectively "slowing down" learning for better generalization. + +```python +clf = LinearBoostClassifier( + subsample=0.8, # Use 80% of data per iteration + shrinkage=0.9 # Scale each estimator's contribution by 0.9 +) +``` + +### ⚖️ Optimized for Imbalanced Data + +The internal boosting logic has been overhauled to prioritize **F1-Score optimization**: + +- **Adaptive Class Weighting**: The algorithm dynamically adjusts sample weights based on class frequencies within the boosting loop, aggressively correcting errors on minority classes. +- **F1-Based Estimator Weighting**: Estimators are rewarded not just for accuracy, but specifically for their F1 performance. + +### ⏱️ Early Stopping + +Training can now stop automatically when validation scores plateau: + +- **Standard validation splits** via `validation_fraction` +- **Out-of-Bag (OOB) Evaluation**: When using subsampling (`subsample < 1.0`), LinearBoost utilizes unused samples for validation without reducing training set size. 
+ +```python +clf = LinearBoostClassifier( + n_estimators=500, + early_stopping=True, + validation_fraction=0.1, # 10% held out for validation + n_iter_no_change=5, # Stop after 5 iterations with no improvement + tol=1e-4 +) + +# Or with OOB evaluation (automatic when subsampling) +clf = LinearBoostClassifier( + n_estimators=500, + subsample=0.8, # Enables OOB evaluation + early_stopping=True, + n_iter_no_change=5 +) +``` + +--- + ## 🚀 New Major Release (v0.1.3) The `LinearBoost` and `SEFR` classifiers use kernels to solve non-linear problems. Kernels work by projecting data into a different perspective, allowing a simple linear model to capture complex, curved patterns. @@ -92,7 +167,7 @@ The documentation is available at https://linearboost.readthedocs.io/. The following parameters yielded optimal results during testing. All results are based on 10-fold Cross-Validation: - **`n_estimators`**: - A range of 10 to 200 is suggested, with higher values potentially improving performance at the cost of longer training times. + A range of 10 to 200 is suggested, with higher values potentially improving performance at the cost of longer training times. When using `early_stopping=True`, you can set a higher value (e.g., 500) and let training stop automatically. - **`learning_rate`**: Values between 0.01 and 1 typically perform well. Adjust based on the dataset's complexity and noise. @@ -111,6 +186,28 @@ The following parameters yielded optimal results during testing. All results are - `quantile-uniform`: Normalizes features to a uniform distribution. - `quantile-normal`: Normalizes features to a normal (Gaussian) distribution. +- **`kernel`** *(new in v0.1.3)*: + Choose based on data complexity: + - `linear`: Fastest, for linearly separable data. + - `rbf`: Most flexible, works well for complex non-linear patterns. + - `poly`: For polynomial relationships. + - `sigmoid`: For sigmoid-like decision boundaries. + +- **`kernel_approx`** *(new in v0.1.5)*: + For large datasets with non-linear kernels: + - `None`: Use full kernel matrix (default, exact but \(O(n^2)\) memory). + - `'rff'`: Random Fourier Features (only with `kernel='rbf'`). + - `'nystrom'`: Nyström approximation (works with any kernel). + +- **`subsample`** *(new in v0.1.5)*: + Values in (0, 1] control stochastic boosting. Use `0.8` for variance reduction while maintaining speed. + +- **`shrinkage`** *(new in v0.1.5)*: + Values in (0, 1] scale each estimator's contribution. Use `0.8-0.95` to improve generalization. + +- **`early_stopping`** *(new in v0.1.5)*: + Set to `True` with `n_iter_no_change=5` and `tol=1e-4` to automatically stop training when validation performance plateaus. + These parameters should serve as a solid starting point for most datasets. For fine-tuning, consider using hyperparameter optimization tools like [Optuna](https://optuna.org/). 
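Putting the tuning guidance above together, the sketch below shows one way to combine the v0.1.5 options on a synthetic, mildly imbalanced dataset. The data comes from scikit-learn's `make_classification`, and the parameter values are illustrative starting points rather than tuned results from the benchmarks below.

```python
from sklearn.datasets import make_classification
from linearboost import LinearBoostClassifier

# Synthetic, mildly imbalanced binary problem (illustrative only)
X, y = make_classification(
    n_samples=5000, n_features=20, weights=[0.85, 0.15], random_state=0
)

clf = LinearBoostClassifier(
    n_estimators=500,         # generous upper bound; early stopping trims it
    learning_rate=0.1,
    kernel='rbf',
    kernel_approx='nystrom',  # scalable approximation instead of the full Gram matrix
    n_components=256,
    subsample=0.8,            # stochastic boosting; also enables OOB evaluation
    shrinkage=0.9,
    early_stopping=True,
    n_iter_no_change=5,
    tol=1e-4,
)
clf.fit(X, y)
print(len(clf.estimators_), "estimators kept after early stopping")
```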
Results @@ -235,10 +332,15 @@ params = { #### LinearBoost ```python params = { - 'n_estimators': trial.suggest_int('n_estimators', 10, 200), + 'n_estimators': trial.suggest_int('n_estimators', 10, 500), 'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1), 'algorithm': trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']), - 'scaler': trial.suggest_categorical('scaler', ['minmax', 'robust', 'quantile-uniform', 'quantile-normal']) + 'scaler': trial.suggest_categorical('scaler', ['minmax', 'robust', 'quantile-uniform', 'quantile-normal']), + 'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']), + 'subsample': trial.suggest_float('subsample', 0.6, 1.0), + 'shrinkage': trial.suggest_float('shrinkage', 0.7, 1.0), + 'early_stopping': True, + 'n_iter_no_change': 5, } ``` @@ -253,9 +355,10 @@ LinearBoost's combination of **runtime efficiency** and **high accuracy** makes Future Developments ----------------------------- -These are not supported in this current version, but are in the future plans: -- Supporting categorical variables -- Adding regression +These are not yet supported in this current version, but are in the future plans: +- Supporting categorical variables natively +- Adding regression support (`LinearBoostRegressor`) +- Multi-output classification Reference Paper ----------------------------- diff --git a/src/linearboost/linear_boost.py b/src/linearboost/linear_boost.py index 93d8918..6ec8e5d 100644 --- a/src/linearboost/linear_boost.py +++ b/src/linearboost/linear_boost.py @@ -41,6 +41,9 @@ from sklearn.utils._param_validation import Interval, StrOptions from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.validation import check_is_fitted +from sklearn.kernel_approximation import RBFSampler, Nystroem +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, f1_score, roc_auc_score from ._utils import SKLEARN_V1_6_OR_LATER, check_X_y, validate_data from .sefr import SEFR @@ -304,6 +307,22 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): kernel : {'linear', 'poly', 'rbf', 'sigmoid'} or callable, default='linear' Specifies the kernel type to be used in the algorithm. If a callable is given, it is used to pre-compute the kernel matrix. + + kernel_approx : {'rff', 'nystrom'} or None, default=None + Optional kernel approximation strategy for non-linear kernels. + + - 'rff': Use Random Fourier Features (RBFSampler). Only valid when + ``kernel='rbf'``. Approximates the RBF kernel via an explicit + low-dimensional feature map. + - 'nystrom': Use Nyström approximation (Nystroem). Can be used with + 'rbf', 'poly', or 'sigmoid' kernels. + - None: Use exact kernel with full Gram matrix (O(n^2) memory). + + n_components : int, default=256 + Dimensionality of the kernel feature map when using kernel approximation. + Acts as the number of random features (for 'rff') or the rank of the + approximation (for 'nystrom'). Must be >= 1. + gamma : float, default=None Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. If None, then it is @@ -336,6 +355,52 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): - y_pred: Estimated target values. - sample_weight: Sample weights (optional). + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If True, it requires ``n_iter_no_change`` to be set. 
+ + If ``subsample < 1.0`` (subsampling is enabled), Out-of-Bag (OOB) evaluation + is automatically used instead of a fixed validation split. This is more + data-efficient as it uses all training data while still providing validation + feedback. OOB evaluation uses samples not included in each iteration's + subsample for validation. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. Only used if ``early_stopping`` is True + and ``subsample >= 1.0`` (no subsampling). When subsampling is enabled, + OOB evaluation is used instead and this parameter is ignored. + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before early stopping. + Only used if ``early_stopping`` is True. Must be >= 1. + + tol : float, default=1e-4 + Tolerance for the optimization. When the loss or score is not improving + by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, + convergence is considered to be reached and training stops. + Only used if ``early_stopping`` is True. Must be >= 0. + + subsample : float, default=1.0 + The fraction of samples to be used for fitting the individual base + learners. If smaller than 1.0 this results in Stochastic Gradient + Boosting. `subsample` interacts with the parameter `n_estimators`. + Choosing `subsample < 1.0` leads to a reduction of variance + and an increase in bias. Values must be in the range `(0, 1]`. + + shrinkage : float, default=1.0 + Shrinkage parameter for regularization. Each estimator weight is + multiplied by this factor. Values < 1.0 reduce the contribution of + each base learner, helping to prevent overfitting and improve + generalization. This is similar to the shrinkage used in gradient + boosting methods. + + - If `shrinkage = 1.0`: no shrinkage (full weight) + - If `shrinkage < 1.0`: apply shrinkage (e.g., 0.8 means 80% weight) + + Values must be in the range `(0, 1]`. Typical values are in the range + `[0.8, 1.0]` for moderate regularization or `1.0` for no regularization. 
+ Attributes ---------- estimator_ : estimator @@ -420,6 +485,14 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): None, ], "loss_function": [None, callable], + "kernel_approx": [StrOptions({"rff", "nystrom"}), None], + "n_components": [Interval(Integral, 1, None, closed="left")], + "early_stopping": ["boolean"], + "validation_fraction": [Interval(Real, 0, 1, closed="neither")], + "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], + "tol": [Interval(Real, 0, None, closed="left")], + "subsample": [Interval(Real, 0, 1, closed="right")], + "shrinkage": [Interval(Real, 0, 1, closed="right")], } def __init__( @@ -435,17 +508,45 @@ def __init__( gamma=None, degree=3, coef0=1, + kernel_approx=None, + n_components=256, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + tol=1e-4, + subsample=1.0, + shrinkage=1.0, ): - # Create SEFR estimator with 'precomputed' kernel if we're using kernels - # Use string comparison that's safe for arrays (will raise TypeError for arrays) + self.algorithm = algorithm + self.scaler = scaler + self.class_weight = class_weight + self.loss_function = loss_function + self.kernel = kernel + self.gamma = gamma + self.degree = degree + self.coef0 = coef0 + self.kernel_approx = kernel_approx + self.n_components = n_components + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.tol = tol + self.subsample = subsample + self.shrinkage = shrinkage + + # Decide how SEFR sees the input: + # - If we use a kernel approximation, the base estimator should work + # on explicit features (linear kernel). + # - Otherwise: + # - 'linear' -> use SEFR with linear kernel + # - non-linear -> SEFR expects a precomputed Gram matrix try: - if kernel == "linear": + if self.kernel_approx is not None or kernel == "linear": base_estimator = SEFR(kernel="linear") else: base_estimator = SEFR(kernel="precomputed") except (ValueError, TypeError): # If kernel is an array or invalid type, default to linear - # Parameter validation will catch this later in fit() base_estimator = SEFR(kernel="linear") super().__init__( @@ -461,6 +562,12 @@ def __init__( self.gamma = gamma self.degree = degree self.coef0 = coef0 + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.tol = tol + self.subsample = subsample + self.shrinkage = shrinkage if SKLEARN_V1_6_OR_LATER: @@ -541,6 +648,9 @@ def _get_kernel_matrix(self, X, Y=None): degree=self.degree, coef0=self.coef0, ) + def _use_kernel_approx(self) -> bool: + """Return True if we should use kernel approximation.""" + return self.kernel != "linear" and self.kernel_approx is not None def fit(self, X, y, sample_weight=None) -> Self: """Build a LinearBoost classifier from the training set (X, y). 
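The base-estimator selection above relies on the standard kernel-approximation trick: when `kernel_approx` is set, inputs are mapped through an explicit low-dimensional feature map so that a purely linear learner (SEFR) can capture a non-linear boundary, instead of handing SEFR a precomputed Gram matrix. Below is a minimal standalone illustration of that idea, using scikit-learn's `RBFSampler` with `LogisticRegression` standing in for SEFR; the `gamma` and `n_components` values are arbitrary.

```python
from sklearn.datasets import make_moons
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression

X, y = make_moons(n_samples=2000, noise=0.2, random_state=0)

# Explicit random feature map approximating the RBF kernel:
# O(n * n_components) memory instead of the O(n^2) Gram matrix.
rff = RBFSampler(gamma=1.0, n_components=256, random_state=0)
Z = rff.fit_transform(X)

# A purely linear model on the mapped features recovers the non-linear boundary.
linear_clf = LogisticRegression(max_iter=1000).fit(Z, y)
print("train accuracy:", linear_clf.score(Z, y))
```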
@@ -597,14 +707,57 @@ def fit(self, X, y, sample_weight=None) -> Self: self.classes_ = np.unique(y) self.n_classes_ = self.classes_.shape[0] - # Store training data for kernel computation during prediction - if self.kernel != "linear": + # ----- Kernel handling & approximation ----- + self.kernel_approx_ = None # will be set if approximation is used + self.X_fit_ = None + self.K_train_ = None + + if self.kernel == "linear": + # Pure linear: no kernel, no approximation + training_data = X_transformed + + elif self._use_kernel_approx(): + # Use kernel approximation instead of full Gram matrix + if self.kernel_approx == "rff": + if self.kernel != "rbf": + raise ValueError( + "kernel_approx='rff' is only supported with kernel='rbf'. " + f"Got kernel='{self.kernel}'." + ) + # Ensure gamma is set + gamma = self.gamma + if gamma is None: + gamma = 1.0 / X_transformed.shape[1] + + self.kernel_approx_ = RBFSampler( + gamma=gamma, + n_components=self.n_components, + # random_state can be None; AdaBoost's randomness is separate + ) + elif self.kernel_approx == "nystrom": + self.kernel_approx_ = Nystroem( + kernel=self.kernel, + gamma=self.gamma, + degree=self.degree, + coef0=self.coef0, + n_components=self.n_components, + # random_state can be None + ) + else: + raise ValueError( + f"Unknown kernel_approx='{self.kernel_approx}'. " + "Valid options are 'rff', 'nystrom', or None." + ) + + training_data = self.kernel_approx_.fit_transform(X_transformed) + + else: + # Exact kernel with full Gram matrix (original behavior) self.X_fit_ = X_transformed # Precompute kernel matrix ONCE for all estimators self.K_train_ = self._get_kernel_matrix(X_transformed) training_data = self.K_train_ - else: - training_data = X_transformed + # ----- end kernel handling ----- if self.class_weight is not None: if isinstance(self.class_weight, str) and self.class_weight != "balanced": @@ -618,6 +771,97 @@ def fit(self, X, y, sample_weight=None) -> Self: else: sample_weight = expanded_class_weight + # Handle early stopping with validation split or OOB evaluation + validation_data = None + y_val = None + training_data_val = None + X_val_transformed = None # Store original features for validation (needed for exact kernels) + use_oob = False # Flag to use OOB evaluation instead of fixed validation split + + # Use OOB evaluation if subsampling is enabled and early stopping is requested + if (self.early_stopping and self.n_iter_no_change is not None and + self.subsample < 1.0): + # Check if we can use OOB (skip for exact kernels) + is_exact_kernel = (not self._use_kernel_approx() and self.kernel != "linear") + if not is_exact_kernel: + use_oob = True + # Store full data for OOB evaluation + # For exact kernels, we need to store original features, not kernel matrix + if not self._use_kernel_approx() and self.kernel != "linear": + # This shouldn't happen since we check is_exact_kernel above, but just in case + validation_data = (X_transformed, y, sample_weight) + else: + validation_data = (training_data, y, sample_weight) + + if self.early_stopping and self.n_iter_no_change is not None and not use_oob: + # Split BEFORE kernel computation for exact kernels + # For exact kernels, we need to split X_transformed, not the kernel matrix + if not self._use_kernel_approx() and self.kernel != "linear": + # For exact kernels, split the original features + n_samples = X_transformed.shape[0] + n_val_samples = max(1, int(self.validation_fraction * n_samples)) + + from sklearn.model_selection import StratifiedShuffleSplit + splitter = 
StratifiedShuffleSplit( + n_splits=1, + test_size=n_val_samples, + random_state=42 + ) + train_idx, val_idx = next(splitter.split(X_transformed, y)) + + # Split original features + X_train_transformed = X_transformed[train_idx] + X_val_transformed = X_transformed[val_idx] + y_train = y[train_idx] + y_val = y[val_idx] + + # Recompute kernel matrix for training only + self.X_fit_ = X_train_transformed + self.K_train_ = self._get_kernel_matrix(X_train_transformed) + training_data = self.K_train_ + + # Split sample weights if provided + if sample_weight is not None: + sample_weight_val = sample_weight[val_idx] + sample_weight = sample_weight[train_idx] + else: + sample_weight_val = None + + # Store validation data (original features for kernel computation) + validation_data = (X_val_transformed, y_val, sample_weight_val) + y = y_train + else: + # For linear or approximate kernels, split after transformation + n_samples = training_data.shape[0] + n_val_samples = max(1, int(self.validation_fraction * n_samples)) + + from sklearn.model_selection import StratifiedShuffleSplit + splitter = StratifiedShuffleSplit( + n_splits=1, + test_size=n_val_samples, + random_state=42 + ) + train_idx, val_idx = next(splitter.split(training_data, y)) + + # Split training data + training_data_val = training_data[val_idx] + y_val = y[val_idx] + training_data = training_data[train_idx] + y_train = y[train_idx] + + # Split sample weights if provided + if sample_weight is not None: + sample_weight_val = sample_weight[val_idx] + sample_weight = sample_weight[train_idx] + else: + sample_weight_val = None + + # Store validation data for checking + validation_data = (training_data_val, y_val, sample_weight_val) + y = y_train + else: + y_train = y + with warnings.catch_warnings(): if SKLEARN_V1_6_OR_LATER: warnings.filterwarnings( @@ -630,8 +874,316 @@ def fit(self, X, y, sample_weight=None) -> Self: category=FutureWarning, message=".*parameter 'algorithm' is deprecated.*", ) - # Pass the precomputed kernel matrix (or raw features for linear) - return super().fit(training_data, y, sample_weight) + + # If early stopping is enabled, use custom boosting loop + if self.early_stopping and self.n_iter_no_change is not None and validation_data is not None: + return self._fit_with_early_stopping(training_data, y_train, sample_weight, validation_data, use_oob=use_oob) + else: + # Pass the precomputed kernel matrix (or raw features for linear) + return super().fit(training_data, y_train, sample_weight) + + def _fit_with_early_stopping(self, X, y, sample_weight, validation_data, use_oob=False): + """Fit with early stopping based on validation error or OOB evaluation. 
+ + Parameters + ---------- + X : array-like + Training data (features or kernel matrix) + y : array-like + Training labels + sample_weight : array-like + Sample weights + validation_data : tuple + If use_oob=False: (X_val, y_val, sample_weight_val) + If use_oob=True: (X_full, y_full, sample_weight_full) - full dataset for OOB + use_oob : bool + If True, use OOB samples for validation instead of fixed split + """ + if use_oob: + # For OOB, validation_data contains the full dataset + X_full, y_full, sample_weight_full = validation_data + # We'll track OOB samples per iteration + oob_indices_history = [] + else: + # Traditional validation split + X_val, y_val, sample_weight_val = validation_data + + # Initialize from parent class + from sklearn.utils import check_random_state + + # Initialize attributes needed for boosting + # Ensure estimator_ is set (needed by _make_estimator) + if not hasattr(self, 'estimator_') or self.estimator_ is None: + # Reuse the same logic from __init__ to create base estimator + try: + if self.kernel_approx is not None or self.kernel == "linear": + from .sefr import SEFR + self.estimator_ = SEFR(kernel="linear") + else: + from .sefr import SEFR + self.estimator_ = SEFR(kernel="precomputed") + except (ValueError, TypeError): + from .sefr import SEFR + self.estimator_ = SEFR(kernel="linear") + + self.estimators_ = [] + self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) + self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) + + # Initialize sample weights + if sample_weight is None: + sample_weight = np.ones(X.shape[0], dtype=np.float64) + sample_weight /= sample_weight.sum() + + random_state = check_random_state(None) + + # Track best validation score and iterations without improvement + best_val_score = -np.inf + n_no_improvement = 0 + best_n_estimators = 0 + + # For OOB, we need to store X_fit_ reference for exact kernels + if use_oob and hasattr(self, 'X_fit_') and self.X_fit_ is not None: + # Store reference to training features for kernel computation + pass # Already stored + + # Early stopping loop + for iboost in range(self.n_estimators): + # Perform a single boost + # For OOB, we need to track which samples were used + if use_oob: + boost_result = self._boost( + iboost, X, y, sample_weight, random_state, return_oob_indices=True + ) + if len(boost_result) == 4: + sample_weight, estimator_weight, estimator_error, oob_indices = boost_result + oob_indices_history.append(oob_indices) + else: + sample_weight, estimator_weight, estimator_error = boost_result + oob_indices_history.append(None) + else: + sample_weight, estimator_weight, estimator_error = self._boost( + iboost, X, y, sample_weight, random_state + ) + + if sample_weight is None: + break + + # Store results + self.estimator_weights_[iboost] = estimator_weight + self.estimator_errors_[iboost] = estimator_error + + # Evaluate on validation set or OOB samples using F1/ROC-AUC + if use_oob and len(oob_indices_history) > 0 and oob_indices_history[-1] is not None: + # Use OOB samples from current iteration + oob_idx = oob_indices_history[-1] + if len(oob_idx) > 0: + # Get OOB data (X_full is already transformed features/kernel matrix) + X_oob = X_full[oob_idx] + y_oob = y_full[oob_idx] + + # Get predictions and probabilities for F1/ROC-AUC + val_pred = self._staged_predict_single(X_oob, iboost + 1) + val_proba = self._staged_predict_proba_single(X_oob, iboost + 1) + + # Compute F1 score (primary metric) + f1_val = f1_score(y_oob, val_pred, average='weighted', 
zero_division=0.0) + + # Compute ROC-AUC if possible (requires probabilities) + try: + if val_proba is not None and val_proba.shape[1] >= 2: + roc_auc_val = roc_auc_score(y_oob, val_proba[:, 1], average='weighted') + # Combined metric: 70% F1, 30% ROC-AUC + val_score = 0.7 * f1_val + 0.3 * roc_auc_val + else: + val_score = f1_val + except (ValueError, IndexError): + # Fallback to F1 only if ROC-AUC fails + val_score = f1_val + else: + # No OOB samples (shouldn't happen with subsample < 1.0), skip validation + val_score = best_val_score + else: + # Traditional validation split + val_pred = self._staged_predict_single(X_val, iboost + 1) + val_proba = self._staged_predict_proba_single(X_val, iboost + 1) + + # Compute F1 score (primary metric) + f1_val = f1_score(y_val, val_pred, average='weighted', zero_division=0.0) + + # Compute ROC-AUC if possible + try: + if val_proba is not None and val_proba.shape[1] >= 2: + roc_auc_val = roc_auc_score(y_val, val_proba[:, 1], average='weighted') + # Combined metric: 70% F1, 30% ROC-AUC + val_score = 0.7 * f1_val + 0.3 * roc_auc_val + else: + val_score = f1_val + except (ValueError, IndexError): + # Fallback to F1 only if ROC-AUC fails + val_score = f1_val + + # Check for improvement + if val_score > best_val_score + self.tol: + best_val_score = val_score + n_no_improvement = 0 + best_n_estimators = iboost + 1 + else: + n_no_improvement += 1 + + # Early stopping check + if n_no_improvement >= self.n_iter_no_change: + # Trim estimators to best point + if best_n_estimators > 0: + self.estimators_ = self.estimators_[:best_n_estimators] + self.estimator_weights_ = self.estimator_weights_[:best_n_estimators] + self.estimator_errors_ = self.estimator_errors_[:best_n_estimators] + break + + return self + + def _staged_predict_single(self, X, n_estimators): + """Predict using first n_estimators for validation. 
+ + X can be either: + - Transformed features (for linear/approximate kernels) + - Kernel matrix (for exact kernels) + - Original features (for exact kernels - will compute kernel) + """ + if n_estimators == 0: + # Return majority class + return np.full(X.shape[0], self.classes_[0]) + + # For exact kernels, if X is original features, compute kernel matrix + if (not self._use_kernel_approx() and self.kernel != "linear" and + hasattr(self, 'X_fit_') and self.X_fit_ is not None and + X.shape[1] == self.X_fit_.shape[1] and X.shape[1] != self.X_fit_.shape[0]): + # X appears to be original features, compute kernel matrix + X = self._get_kernel_matrix(X, self.X_fit_) + + if self.algorithm == "SAMME.R": + classes = self.classes_ + n_classes = len(classes) + + pred = sum( + self._samme_proba(estimator, n_classes, X) + for estimator in self.estimators_[:n_estimators] + ) + if n_estimators > 0: + weights_sum = self.estimator_weights_[:n_estimators].sum() + if weights_sum > 0: + pred /= weights_sum + if n_classes == 2: + pred[:, 0] *= -1 + decision = pred.sum(axis=1) + else: + decision = pred + else: + # SAMME algorithm + classes = self.classes_ + pred = np.zeros((X.shape[0], n_classes)) + + for i, estimator in enumerate(self.estimators_[:n_estimators]): + predictions = estimator.predict(X) + for j, class_label in enumerate(classes): + pred[:, j] += ( + self.estimator_weights_[i] * (predictions == class_label) + ) + + decision = pred + + if self.n_classes_ == 2: + return self.classes_.take((decision > 0).astype(int), axis=0) + else: + return self.classes_.take(np.argmax(decision, axis=1), axis=0) + + def _staged_predict_proba_single(self, X, n_estimators): + """Predict probabilities using first n_estimators for validation. + + Similar to _staged_predict_single but returns probabilities instead of predictions. 
+ + Parameters + ---------- + X : array-like + Validation data (features or kernel matrix) + n_estimators : int + Number of estimators to use + + Returns + ------- + proba : ndarray of shape (n_samples, n_classes) + Class probabilities + """ + if n_estimators == 0: + # Return uniform probabilities + return np.ones((X.shape[0], self.n_classes_)) / self.n_classes_ + + # For exact kernels, if X is original features, compute kernel matrix + if (not self._use_kernel_approx() and self.kernel != "linear" and + hasattr(self, 'X_fit_') and self.X_fit_ is not None and + X.shape[1] == self.X_fit_.shape[1] and X.shape[1] != self.X_fit_.shape[0]): + # X appears to be original features, compute kernel matrix + X = self._get_kernel_matrix(X, self.X_fit_) + + if self.algorithm == "SAMME.R": + # Use decision function and convert to probabilities + # This matches how predict_proba works in the parent class + classes = self.classes_ + n_classes = len(classes) + + pred = sum( + self._samme_proba(estimator, n_classes, X) + for estimator in self.estimators_[:n_estimators] + ) + if n_estimators > 0: + weights_sum = self.estimator_weights_[:n_estimators].sum() + if weights_sum > 0: + pred /= weights_sum + else: + # No valid weights, return uniform + return np.ones((X.shape[0], n_classes)) / n_classes + + # Convert SAMME.R output to probabilities + # _samme_proba returns log-probability-like values (n_samples, n_classes) + if n_classes == 2: + # For binary: pred is 2D (n_samples, 2) + # Convert to probabilities using softmax + exp_pred = np.exp(pred - np.max(pred, axis=1, keepdims=True)) + proba = exp_pred / np.sum(exp_pred, axis=1, keepdims=True) + # Ensure probabilities are in correct order [class_0, class_1] + # and sum to 1 + proba = np.clip(proba, 1e-9, 1 - 1e-9) + proba = proba / np.sum(proba, axis=1, keepdims=True) + else: + # Multi-class: use softmax + exp_pred = np.exp(pred - np.max(pred, axis=1, keepdims=True)) + proba = exp_pred / np.sum(exp_pred, axis=1, keepdims=True) + + return proba + else: + # SAMME algorithm: use weighted voting + classes = self.classes_ + n_classes = len(classes) + proba = np.zeros((X.shape[0], n_classes)) + + for i, estimator in enumerate(self.estimators_[:n_estimators]): + if hasattr(estimator, 'predict_proba'): + estimator_proba = estimator.predict_proba(X) + weight = self.estimator_weights_[i] + proba += weight * estimator_proba + else: + # Fallback: use predictions + predictions = estimator.predict(X) + weight = self.estimator_weights_[i] + for j, class_label in enumerate(classes): + proba[:, j] += weight * (predictions == class_label) + + # Normalize + proba_sum = np.sum(proba, axis=1, keepdims=True) + proba_sum[proba_sum == 0] = 1.0 # Avoid division by zero + proba /= proba_sum + + return proba @staticmethod def _samme_proba(estimator, n_classes, X): @@ -654,7 +1206,41 @@ def _samme_proba(estimator, n_classes, X): log_proba - (1.0 / n_classes) * log_proba.sum(axis=1)[:, np.newaxis] ) - def _boost(self, iboost, X, y, sample_weight, random_state): + def _compute_adaptive_learning_rate(self, iboost, estimator_error, base_learning_rate): + """ + Compute adaptive learning rate based on iteration and estimator error. 
+ + Parameters + ---------- + iboost : int + Current boosting iteration index (0-based) + estimator_error : float + Classification error of the current estimator (0-0.5) + base_learning_rate : float + Base learning rate from user parameter + + Returns + ------- + adaptive_lr : float + Adaptive learning rate adjusted for iteration and error + """ + # Exponential decay: reduce learning rate as we progress + # Factor starts at 1.0 and decays to ~0.7 over all iterations + iteration_decay = 1.0 - (iboost / max(self.n_estimators, 1)) * 0.3 + + # Error-based adjustment: lower rate for high error estimators + # High error (0.5) -> factor ~0.57, Low error (0.0) -> factor 1.0 + error_factor = 1.0 / (1.0 + estimator_error * 1.5) + + # Combine factors + adaptive_lr = base_learning_rate * iteration_decay * error_factor + + # Clamp to reasonable range: at least 0.01, at most base_learning_rate + adaptive_lr = np.clip(adaptive_lr, 0.01, base_learning_rate) + + return adaptive_lr + + def _boost(self, iboost, X, y, sample_weight, random_state, return_oob_indices=False): """ Implement a single boost using precomputed kernel matrix or raw features. @@ -663,10 +1249,53 @@ def _boost(self, iboost, X, y, sample_weight, random_state): X : ndarray For kernel methods, this is the precomputed kernel matrix. For linear methods, this is the raw feature matrix. + return_oob_indices : bool, default=False + If True, return OOB indices along with other results. """ estimator = self._make_estimator(random_state=random_state) - estimator.fit(X, y, sample_weight=sample_weight) + oob_indices = None + + # Apply subsampling if enabled + # Note: For exact kernels (precomputed kernel matrices), subsampling is skipped + # because it would require tracking subsample indices per estimator for correct prediction + is_exact_kernel = (X.shape[0] == X.shape[1] and X.shape[0] == y.shape[0] and + not self._use_kernel_approx() and self.kernel != "linear") + + if self.subsample < 1.0 and not is_exact_kernel: + n_samples = X.shape[0] + n_subsample = max(1, int(self.subsample * n_samples)) + + # Use stratified sampling to maintain class distribution + from sklearn.model_selection import StratifiedShuffleSplit + splitter = StratifiedShuffleSplit( + n_splits=1, + train_size=n_subsample, + random_state=random_state.randint(0, 2**31 - 1) + ) + subsample_idx, _ = next(splitter.split(X, y)) + + # Track OOB indices if requested + if return_oob_indices: + all_indices = np.arange(n_samples) + oob_indices = np.setdiff1d(all_indices, subsample_idx) + + # Subsample data and weights (for feature matrices, subsample rows only) + X_subsample = X[subsample_idx] + y_subsample = y[subsample_idx] + if sample_weight is not None: + sample_weight_subsample = sample_weight[subsample_idx].copy() + # Normalize subsampled weights + sample_weight_subsample /= sample_weight_subsample.sum() + else: + sample_weight_subsample = None + + # Fit estimator on subsampled data + estimator.fit(X_subsample, y_subsample, sample_weight=sample_weight_subsample) + else: + # No subsampling - use all data + estimator.fit(X, y, sample_weight=sample_weight) + # Always evaluate on full dataset for proper error computation if self.algorithm == "SAMME.R": y_pred = estimator.predict(X) @@ -676,31 +1305,69 @@ def _boost(self, iboost, X, y, sample_weight, random_state): ) if estimator_error <= 0: + if return_oob_indices: + return sample_weight, 1.0, 0.0, oob_indices return sample_weight, 1.0, 0.0 elif estimator_error >= 0.5: if len(self.estimators_) > 1: self.estimators_.pop(-1) + if 
return_oob_indices: + return None, None, None, None return None, None, None - # Compute SEFR-specific weight update - estimator_weight = self.learning_rate * np.log( - (1 - estimator_error) / estimator_error + # Compute adaptive learning rate + adaptive_lr = self._compute_adaptive_learning_rate( + iboost, estimator_error, self.learning_rate ) + + # Compute F1 score for this estimator to inform weight calculation + # This aligns estimator weighting with F1 optimization target + f1 = f1_score(y, y_pred, sample_weight=sample_weight, average='weighted') + + # F1 bonus: reward estimators with good F1 performance + # Scale: 0.5 F1 -> 1.0x multiplier, 1.0 F1 -> 1.2x multiplier + # This ensures estimators contributing to F1 get higher weights + f1_bonus = 1.0 + (f1 - 0.5) * 0.6 + + # Compute base weight from error rate + base_weight = np.log((1 - estimator_error) / max(estimator_error, 1e-10)) + + # Apply F1 bonus to estimator weight + estimator_weight = self.shrinkage * adaptive_lr * base_weight * f1_bonus if iboost < self.n_estimators - 1: - sample_weight = np.exp( - np.log(sample_weight) - + estimator_weight * incorrect * (sample_weight > 0) - ) + # Compute class frequencies for imbalance handling + # This gives higher weight boosts to minority class samples when misclassified + unique_classes, class_counts = np.unique(y, return_counts=True) + class_freq = class_counts / len(y) + class_weights = {cls: 1.0 / freq for cls, freq in zip(unique_classes, class_freq)} + + # Apply class-aware weight updates (minority class gets higher boost) + for cls in unique_classes: + cls_mask = y == cls + cls_weight = class_weights[cls] # Inverse frequency weighting + sample_weight[cls_mask] = np.exp( + np.log(sample_weight[cls_mask] + 1e-10) + + estimator_weight * incorrect[cls_mask] * cls_weight + * (sample_weight[cls_mask] > 0) + ) + + # Normalize to prevent numerical issues + sample_weight /= np.sum(sample_weight) + if return_oob_indices: + return sample_weight, estimator_weight, estimator_error, oob_indices return sample_weight, estimator_weight, estimator_error else: # standard SAMME + # Always evaluate on full dataset for proper error computation y_pred = estimator.predict(X) incorrect = y_pred != y estimator_error = np.mean(np.average(incorrect, weights=sample_weight)) if estimator_error <= 0: + if return_oob_indices: + return sample_weight, 1.0, 0.0, oob_indices return sample_weight, 1.0, 0.0 if estimator_error >= 0.5: self.estimators_.pop(-1) @@ -708,17 +1375,49 @@ def _boost(self, iboost, X, y, sample_weight, random_state): raise ValueError( "BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble cannot be fit." 
) + if return_oob_indices: + return None, None, None, None return None, None, None - estimator_weight = self.learning_rate * np.log( - (1.0 - estimator_error) / max(estimator_error, 1e-10) + # Compute adaptive learning rate + adaptive_lr = self._compute_adaptive_learning_rate( + iboost, estimator_error, self.learning_rate ) - - sample_weight *= np.exp(estimator_weight * incorrect) + + # Compute F1 score for this estimator to inform weight calculation + # This aligns estimator weighting with F1 optimization target + f1 = f1_score(y, y_pred, sample_weight=sample_weight, average='weighted') + + # F1 bonus: reward estimators with good F1 performance + # Scale: 0.5 F1 -> 1.0x multiplier, 1.0 F1 -> 1.2x multiplier + # This ensures estimators contributing to F1 get higher weights + f1_bonus = 1.0 + (f1 - 0.5) * 0.6 + + # Compute base weight from error rate + base_weight = np.log((1.0 - estimator_error) / max(estimator_error, 1e-10)) + + # Apply F1 bonus to estimator weight + estimator_weight = self.shrinkage * adaptive_lr * base_weight * f1_bonus + + # Compute class frequencies for imbalance handling + # This gives higher weight boosts to minority class samples when misclassified + unique_classes, class_counts = np.unique(y, return_counts=True) + class_freq = class_counts / len(y) + class_weights = {cls: 1.0 / freq for cls, freq in zip(unique_classes, class_freq)} + + # Apply class-aware weight updates (minority class gets higher boost) + for cls in unique_classes: + cls_mask = y == cls + cls_weight = class_weights[cls] # Inverse frequency weighting + sample_weight[cls_mask] *= np.exp( + estimator_weight * incorrect[cls_mask] * cls_weight + ) # Normalize sample weights sample_weight /= np.sum(sample_weight) + if return_oob_indices: + return sample_weight, estimator_weight, estimator_error, oob_indices return sample_weight, estimator_weight, estimator_error def decision_function(self, X): @@ -742,11 +1441,26 @@ class in ``classes_``, respectively. check_is_fitted(self) X_transformed = self.scaler_.transform(X) + # Decide which representation to use at prediction time: if self.kernel == "linear": - # For linear kernel, pass raw features test_data = X_transformed + + elif self._use_kernel_approx(): + # Apply the same feature map as during training + if self.kernel_approx_ is None: + raise RuntimeError( + "Kernel approximation object is not fitted. " + "This should not happen if 'fit' completed successfully." + ) + test_data = self.kernel_approx_.transform(X_transformed) + else: - # For kernel methods, compute kernel matrix between test and training data + # Exact kernel: compute kernel matrix between test and training data + if self.X_fit_ is None: + raise RuntimeError( + "Training data for exact kernel is not stored. " + "This should not happen if 'fit' completed successfully." 
+ ) test_data = self._get_kernel_matrix(X_transformed, self.X_fit_) if self.algorithm == "SAMME.R": @@ -789,4 +1503,4 @@ def predict(self, X): if self.n_classes_ == 2: return self.classes_.take(pred > 0, axis=0) - return self.classes_.take(np.argmax(pred, axis=1), axis=0) + return self.classes_.take(np.argmax(pred, axis=1), axis=0) \ No newline at end of file From 83b37745fc4113c506f21f3856a7892a7c5a1d60 Mon Sep 17 00:00:00 2001 From: Hamidreza Keshavarz <32555614+hamidkm9@users.noreply.github.com> Date: Sun, 15 Feb 2026 21:38:14 +0100 Subject: [PATCH 2/4] Added Gradient Boosting to LinearBoost --- README.md | 72 +++- pyproject.toml | 1 - src/linearboost/__init__.py | 2 +- src/linearboost/linear_boost.py | 569 +++++++++++++++++++++++++++++++- src/linearboost/sefr.py | 3 +- 5 files changed, 624 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index eaf33bc..8bf0dc5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # LinearBoost Classifier -![Lastest Release](https://img.shields.io/badge/release-v0.1.5-green) +![Latest Release](https://img.shields.io/badge/release-v0.1.7-green) [![PyPI Version](https://img.shields.io/pypi/v/linearboost)](https://pypi.org/project/linearboost/) ![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue) [![PyPI Downloads](https://static.pepy.tech/badge/linearboost)](https://pepy.tech/projects/linearboost) @@ -32,9 +32,45 @@ Key Features: --- +## 🚀 New in Version 0.1.7 + +### Gradient Boosting Mode + +LinearBoost now supports **gradient boosting** in addition to AdaBoost via the `boosting_type` parameter: + +- **`boosting_type='adaboost'`** (default): Classic AdaBoost (SAMME or SAMME.R) that reweights samples by classification error. +- **`boosting_type='gradient'`**: Fits each base estimator to pseudo-residuals (negative gradient of log-loss). Often better for highly non-linear or XOR-like patterns and smoother decision boundaries. + +```python +# Gradient boosting for complex non-linear patterns +clf = LinearBoostClassifier( + boosting_type='gradient', + n_estimators=200, + kernel='rbf' +) +``` + +### Class Weighting & Custom Loss + +- **`class_weight`**: Use `'balanced'` or a dict of class weights for imbalanced data. Weights are applied in the boosting loop. +- **`loss_function`**: Optional callable `(y_true, y_pred, sample_weight) -> float` for custom optimization objectives. + +```python +clf = LinearBoostClassifier( + class_weight='balanced', # Adjust for imbalanced classes + n_estimators=200 +) +``` + +### Default Algorithm + +The default **`algorithm`** is now **`'SAMME.R'`** for faster convergence and typically lower test error with fewer iterations (when using `boosting_type='adaboost'`). + +--- + ## 🚀 New in Version 0.1.5 -The latest release introduces major architectural improvements designed for **scalability**, **robustness on imbalanced data**, and **training speed**. +Version 0.1.5 introduced major architectural improvements designed for **scalability**, **robustness on imbalanced data**, and **training speed**. ### ⚡ Scalable Kernel Approximation @@ -157,8 +193,7 @@ Version 0.1.2 of **LinearBoost Classifier** is released. Here are the changes: - Improved Scikit-learn compatibility. -Get Started and Documentation ------------------------------ +## Get Started and Documentation The documentation is available at https://linearboost.readthedocs.io/. @@ -172,13 +207,20 @@ The following parameters yielded optimal results during testing. 
All results are - **`learning_rate`**: Values between 0.01 and 1 typically perform well. Adjust based on the dataset's complexity and noise. -- **`algorithm`**: - Use either `SAMME` or `SAMME.R`. The choice depends on the specific problem: +- **`algorithm`** (when `boosting_type='adaboost'`): + Use either `SAMME` or `SAMME.R` (default). SAMME.R typically converges faster with lower test error. - `SAMME`: May be better for datasets with clearer separations between classes. - - `SAMME.R`: Can handle more nuanced class probabilities. + - `SAMME.R`: Uses class probabilities; often better for nuanced boundaries. **Note:** As of scikit-learn v1.6, the `algorithm` parameter is deprecated and will be removed in v1.8. LinearBoostClassifier will only implement the 'SAMME' algorithm in newer versions. +- **`boosting_type`** *(new in v0.1.7)*: + - `'adaboost'`: Classic AdaBoost (default). + - `'gradient'`: Gradient boosting on pseudo-residuals; try for highly non-linear or XOR-like data. + +- **`class_weight`** *(new in v0.1.7)*: + Use `'balanced'` for imbalanced datasets so class weights are adjusted automatically. + - **`scaler`**: The following scaling methods are recommended based on dataset characteristics: - `minmax`: Best for datasets where features are on different scales but bounded. @@ -210,8 +252,7 @@ The following parameters yielded optimal results during testing. All results are These parameters should serve as a solid starting point for most datasets. For fine-tuning, consider using hyperparameter optimization tools like [Optuna](https://optuna.org/). -Results -------- +## Results All of the results are reported based on 10-fold Cross-Validation. The weighted F1 score is reported, i.e. f1_score(y_valid, y_pred, average = 'weighted'). @@ -337,6 +378,8 @@ params = { 'algorithm': trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']), 'scaler': trial.suggest_categorical('scaler', ['minmax', 'robust', 'quantile-uniform', 'quantile-normal']), 'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']), + 'boosting_type': trial.suggest_categorical('boosting_type', ['adaboost', 'gradient']), + 'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']), 'subsample': trial.suggest_float('subsample', 0.6, 1.0), 'shrinkage': trial.suggest_float('shrinkage', 0.7, 1.0), 'early_stopping': True, @@ -353,19 +396,18 @@ LinearBoost's combination of **runtime efficiency** and **high accuracy** makes *Discusses how LinearBoost outperforms traditional boosting frameworks in terms of speed while maintaining accuracy.* -Future Developments ------------------------------ +## Future Developments + These are not yet supported in this current version, but are in the future plans: - Supporting categorical variables natively - Adding regression support (`LinearBoostRegressor`) - Multi-output classification -Reference Paper ------------------------------ +## Reference Paper + The paper is written by Hamidreza Keshavarz (Independent Researcher based in Berlin, Germany) and Reza Rawassizadeh (Department of Computer Science, Metropolitan college, Boston University, United States). It will be available soon. -License -------- +## License This project is licensed under the terms of the MIT license. See [LICENSE](https://github.com/LinearBoost/linearboost-classifier/blob/main/LICENSE) for additional details. 
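As a follow-up to the Optuna search space listed in the hyperparameter section above, the sketch below wires it into a complete study. The dataset, cross-validation setup, and trial count are placeholders to adapt; the scoring mirrors how results are reported in this README (10-fold CV, weighted F1), and `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.

```python
import optuna
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from linearboost import LinearBoostClassifier

# Placeholder data; substitute your own training set
X, y = make_classification(n_samples=2000, n_features=20, random_state=0)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1, log=True),
        'scaler': trial.suggest_categorical('scaler', ['minmax', 'robust', 'quantile-uniform', 'quantile-normal']),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
        'boosting_type': trial.suggest_categorical('boosting_type', ['adaboost', 'gradient']),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'shrinkage': trial.suggest_float('shrinkage', 0.7, 1.0),
        'early_stopping': True,
        'n_iter_no_change': 5,
    }
    clf = LinearBoostClassifier(**params)
    # Weighted F1 over 10-fold CV, matching the reporting used in Results
    return cross_val_score(clf, X, y, cv=10, scoring='f1_weighted').mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(study.best_params)
```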
diff --git a/pyproject.toml b/pyproject.toml index 8adb6a0..e2638b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,6 @@ authors = [ ] description = "LinearBoost Classifier is a rapid and accurate classification algorithm that builds upon a very fast, linear classifier." readme = "README.md" -readme-content-type = "text/markdown" keywords = [ "classification", "classifier", "linear", "adaboost", "boosting", "boost" ] diff --git a/src/linearboost/__init__.py b/src/linearboost/__init__.py index 4445701..6c2c0a8 100644 --- a/src/linearboost/__init__.py +++ b/src/linearboost/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.4" +__version__ = "0.1.7" from .linear_boost import LinearBoostClassifier from .sefr import SEFR diff --git a/src/linearboost/linear_boost.py b/src/linearboost/linear_boost.py index 6ec8e5d..494df76 100644 --- a/src/linearboost/linear_boost.py +++ b/src/linearboost/linear_boost.py @@ -401,6 +401,23 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): Values must be in the range `(0, 1]`. Typical values are in the range `[0.8, 1.0]` for moderate regularization or `1.0` for no regularization. + boosting_type : {'adaboost', 'gradient'}, default='adaboost' + The type of boosting algorithm to use: + + - 'adaboost': Use the AdaBoost algorithm (SAMME or SAMME.R) which + reweights samples based on classification errors. This is the + original LinearBoost approach. + - 'gradient': Use gradient boosting which fits each new estimator + to the pseudo-residuals (negative gradient of log-loss). This can + be more effective for complex non-linear patterns and provides + smoother decision boundaries. + + When ``boosting_type='gradient'``: + - The ``algorithm`` parameter is ignored + - Each estimator predicts pseudo-residuals instead of class labels + - The ensemble prediction is the sum of estimator predictions + - Better suited for XOR-like and highly non-linear patterns + Attributes ---------- estimator_ : estimator @@ -450,6 +467,14 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): The precomputed kernel matrix on training data, stored when kernel != 'linear'. + F_ : ndarray of shape (n_samples,) + The raw prediction scores (log-odds) from gradient boosting. + Only present when ``boosting_type='gradient'``. + + init_score_ : float + The initial score (log-odds of class prior) for gradient boosting. + Only present when ``boosting_type='gradient'``. + Notes ----- This classifier only supports binary classification tasks. 
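To make the `boosting_type='gradient'` description and the `F_`/`init_score_` attributes above concrete, the snippet below reproduces the underlying quantities with plain NumPy on toy labels. It mirrors the formulas only (log-odds initialization, sigmoid, pseudo-residuals) and does not call into the class itself.

```python
import numpy as np

# Toy labels already mapped to {0, 1}, as done internally for binary problems
y_binary = np.array([0.0, 0.0, 0.0, 1.0, 1.0])

# init_score_: log-odds of the positive-class prior
pos_rate = np.clip(y_binary.mean(), 1e-10, 1 - 1e-10)
init_score = np.log(pos_rate / (1 - pos_rate))

# F_: raw ensemble scores in log-odds space, initialised to init_score_
F = np.full_like(y_binary, init_score)

# Pseudo-residuals = negative gradient of the log-loss = y - sigmoid(F)
p = 1.0 / (1.0 + np.exp(-F))
residuals = y_binary - p

# Each base learner is fit toward these residuals, and its output h (in [-1, 1])
# is added back as: F += learning_rate * shrinkage * h
print(init_score, residuals)
```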
@@ -493,6 +518,7 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): "tol": [Interval(Real, 0, None, closed="left")], "subsample": [Interval(Real, 0, 1, closed="right")], "shrinkage": [Interval(Real, 0, 1, closed="right")], + "boosting_type": [StrOptions({"adaboost", "gradient"})], } def __init__( @@ -516,6 +542,7 @@ def __init__( tol=1e-4, subsample=1.0, shrinkage=1.0, + boosting_type="adaboost", ): self.algorithm = algorithm self.scaler = scaler @@ -533,6 +560,7 @@ def __init__( self.tol = tol self.subsample = subsample self.shrinkage = shrinkage + self.boosting_type = boosting_type # Decide how SEFR sees the input: # - If we use a kernel approximation, the base estimator should work @@ -568,6 +596,7 @@ def __init__( self.tol = tol self.subsample = subsample self.shrinkage = shrinkage + self.boosting_type = boosting_type if SKLEARN_V1_6_OR_LATER: @@ -875,12 +904,16 @@ def fit(self, X, y, sample_weight=None) -> Self: message=".*parameter 'algorithm' is deprecated.*", ) + # Use gradient boosting if specified + if self.boosting_type == "gradient": + return self._fit_gradient_boosting(training_data, y_train, sample_weight, validation_data, use_oob=use_oob) + # If early stopping is enabled, use custom boosting loop if self.early_stopping and self.n_iter_no_change is not None and validation_data is not None: return self._fit_with_early_stopping(training_data, y_train, sample_weight, validation_data, use_oob=use_oob) - else: - # Pass the precomputed kernel matrix (or raw features for linear) - return super().fit(training_data, y_train, sample_weight) + + # Pass the precomputed kernel matrix (or raw features for linear) + return super().fit(training_data, y_train, sample_weight) def _fit_with_early_stopping(self, X, y, sample_weight, validation_data, use_oob=False): """Fit with early stopping based on validation error or OOB evaluation. @@ -1041,6 +1074,466 @@ def _fit_with_early_stopping(self, X, y, sample_weight, validation_data, use_oob break return self + + def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_oob=False): + """Fit using gradient boosting instead of AdaBoost. + + Gradient boosting fits each new estimator to the pseudo-residuals + (negative gradient of the log-loss), which can handle non-linear + patterns more effectively than sample reweighting. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data (features or kernel matrix) + y : array-like of shape (n_samples,) + Target labels (0 or 1) + sample_weight : array-like of shape (n_samples,) or None + Sample weights + validation_data : tuple or None + Validation data for early stopping + use_oob : bool + Whether to use OOB evaluation + + Returns + ------- + self : object + Fitted estimator + """ + from sklearn.utils import check_random_state + + n_samples = X.shape[0] + + # Determine if we're using exact (precomputed) kernels + # Exact kernel: non-linear kernel without approximation + self._gradient_exact_kernel = ( + not self._use_kernel_approx() and self.kernel != "linear" + ) + + # For exact kernels, X is the kernel matrix K_train_ + # We also have X_fit_ which contains the original transformed features + if self._gradient_exact_kernel: + # Store all training indices for computing prediction kernels + self._gradient_train_indices = np.arange(n_samples) + + # Initialize estimator list and weights + self.estimators_ = [] + self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) + self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) + + # For exact kernels, store estimator training info + self._gradient_estimator_info = [] + + # Map labels to 0/1 if needed + y_binary = np.where(y == self.classes_[0], 0, 1).astype(np.float64) + + # Initialize with log-odds of class prior + pos_rate = np.clip(y_binary.mean(), 1e-10, 1 - 1e-10) + self.init_score_ = np.log(pos_rate / (1 - pos_rate)) + + # Current predictions (log-odds space) + F = np.full(n_samples, self.init_score_, dtype=np.float64) + self.F_ = F # Store for reference + + # Sample weights + if sample_weight is None: + sample_weight = np.ones(n_samples, dtype=np.float64) + sample_weight = sample_weight / sample_weight.sum() + + random_state = check_random_state(None) + + # Early stopping tracking + best_val_score = -np.inf + n_no_improvement = 0 + best_n_estimators = 0 + + # Validation data setup + if validation_data is not None: + X_val, y_val, _ = validation_data + if use_oob: + y_val_binary = np.where(y_val == self.classes_[0], 0, 1).astype(np.float64) + else: + y_val_binary = np.where(y_val == self.classes_[0], 0, 1).astype(np.float64) + + for iboost in range(self.n_estimators): + # Compute probabilities from current predictions + p = 1 / (1 + np.exp(-F)) + p = np.clip(p, 1e-10, 1 - 1e-10) + + # Compute pseudo-residuals (negative gradient of log-loss) + # For log-loss: gradient = p - y, so negative gradient = y - p + residuals = y_binary - p + + # Apply subsample if enabled + if self.subsample < 1.0: + n_subsample = max(1, int(self.subsample * n_samples)) + subsample_idx = random_state.choice( + n_samples, size=n_subsample, replace=False + ) + X_train = X[subsample_idx] + residuals_train = residuals[subsample_idx] + weights_train = sample_weight[subsample_idx] + oob_idx = np.setdiff1d(np.arange(n_samples), subsample_idx) + else: + X_train = X + residuals_train = residuals + weights_train = sample_weight + oob_idx = None + + # Convert residuals to binary labels for SEFR + # Positive residual (y > p) -> class 1 (need to increase prediction) + # Negative residual (y < p) -> class 0 (need to decrease prediction) + residual_labels = (residuals_train > 0).astype(int) + + # Use magnitude of residuals as sample weights (larger residuals = more important) + residual_weights = np.abs(residuals_train) * weights_train + residual_weights = residual_weights / 
(residual_weights.sum() + 1e-10) + + # Create SEFR estimator with appropriate kernel settings + # For exact kernels, we need to handle the kernel matrix properly + if self._gradient_exact_kernel: + estimator = SEFR(kernel="precomputed") + # For subsampling, extract the relevant submatrix of the kernel + if self.subsample < 1.0: + # X_train is already indexed from subsample_idx + # But for precomputed kernel, we need K[subsample_idx][:, subsample_idx] + X_train_kernel = X[np.ix_(subsample_idx, subsample_idx)] + else: + X_train_kernel = X # Full kernel matrix + else: + estimator = SEFR(kernel="linear") + X_train_kernel = X_train # Original features or approximated features + + try: + # Check if we have both classes in the residual labels + unique_labels = np.unique(residual_labels) + if len(unique_labels) < 2: + # All residuals have the same sign - use a constant prediction + # Store a dummy estimator and skip + self.estimators_.append(None) + self.estimator_weights_[iboost] = 0.0 + self._gradient_estimator_info.append(None) + continue + + estimator.fit(X_train_kernel, residual_labels, sample_weight=residual_weights) + except Exception: + # If fitting fails, stop boosting + break + + # Store info about which training samples were used (for exact kernel prediction) + if self._gradient_exact_kernel: + if self.subsample < 1.0: + estimator_info = {'train_idx': subsample_idx.copy()} + else: + estimator_info = {'train_idx': np.arange(n_samples)} + else: + estimator_info = None + + # Get continuous predictions from SEFR using predict_proba + # Use the probability of class 1 (positive residual direction) + # Transform to [-1, 1] range: 2 * proba - 1 + if self._gradient_exact_kernel: + # For exact kernels, compute kernel between all training and this estimator's training + if self.subsample < 1.0: + # K_pred[i, j] = kernel(X[i], X_train[j]) where X_train are the subsampled points + K_pred = X[:, subsample_idx] # X is K_train_, get relevant columns + else: + K_pred = X # Full kernel matrix + proba = estimator.predict_proba(K_pred) + else: + proba = estimator.predict_proba(X) + h = 2 * proba[:, 1] - 1 # Maps [0, 1] to [-1, 1] + + # Line search for optimal step size (simplified Newton step) + # For log-loss, optimal step is approximately residuals / (p * (1-p)) + # We use a simplified approach with learning_rate * shrinkage + step_size = self.learning_rate * self.shrinkage + + # Update predictions + F = F + step_size * h + + # Store estimator and its info + self.estimators_.append(estimator) + self.estimator_weights_[iboost] = step_size + self._gradient_estimator_info.append(estimator_info) + + # Compute training error (log-loss) + p_new = 1 / (1 + np.exp(-F)) + p_new = np.clip(p_new, 1e-10, 1 - 1e-10) + train_loss = -np.mean( + y_binary * np.log(p_new) + (1 - y_binary) * np.log(1 - p_new) + ) + self.estimator_errors_[iboost] = train_loss + + # Early stopping check + if self.early_stopping and validation_data is not None: + if use_oob and oob_idx is not None and len(oob_idx) > 0: + # Use OOB samples + # For exact kernels, we need kernel between OOB samples and training + if self._gradient_exact_kernel: + # Pass the kernel submatrix for OOB samples + val_pred = self._gradient_predict_internal(X, oob_idx, iboost + 1) + val_proba = self._gradient_predict_proba_internal(X, oob_idx, iboost + 1) + else: + val_pred = self._gradient_predict(X[oob_idx], iboost + 1) + val_proba = self._gradient_predict_proba(X[oob_idx], iboost + 1) + y_oob_binary = y_binary[oob_idx] + + f1_val = f1_score( + (y_oob_binary > 
0.5).astype(int), + val_pred, + average='weighted', + zero_division=0.0 + ) + try: + roc_auc_val = roc_auc_score(y_oob_binary, val_proba[:, 1]) + val_score = 0.7 * f1_val + 0.3 * roc_auc_val + except (ValueError, IndexError): + val_score = f1_val + else: + # Use validation set + val_pred = self._gradient_predict(X_val, iboost + 1) + val_proba = self._gradient_predict_proba(X_val, iboost + 1) + + f1_val = f1_score( + y_val_binary.astype(int), + val_pred, + average='weighted', + zero_division=0.0 + ) + try: + roc_auc_val = roc_auc_score(y_val_binary, val_proba[:, 1]) + val_score = 0.7 * f1_val + 0.3 * roc_auc_val + except (ValueError, IndexError): + val_score = f1_val + + if val_score > best_val_score + self.tol: + best_val_score = val_score + n_no_improvement = 0 + best_n_estimators = iboost + 1 + else: + n_no_improvement += 1 + + if self.n_iter_no_change is not None and n_no_improvement >= self.n_iter_no_change: + if best_n_estimators > 0: + self.estimators_ = self.estimators_[:best_n_estimators] + self.estimator_weights_ = self.estimator_weights_[:best_n_estimators] + self.estimator_errors_ = self.estimator_errors_[:best_n_estimators] + self._gradient_estimator_info = self._gradient_estimator_info[:best_n_estimators] + break + + # Trim arrays to actual number of estimators + n_fitted = len(self.estimators_) + self.estimator_weights_ = self.estimator_weights_[:n_fitted] + self.estimator_errors_ = self.estimator_errors_[:n_fitted] + self._gradient_estimator_info = self._gradient_estimator_info[:n_fitted] + + return self + + def _gradient_predict_proba_internal(self, K_train, sample_idx, n_estimators=None): + """Internal method for exact kernel prediction during training. + + Used for OOB evaluation where we have the full training kernel matrix. + + Parameters + ---------- + K_train : ndarray of shape (n_train, n_train) + Full training kernel matrix + sample_idx : ndarray + Indices of samples to predict (rows to use) + n_estimators : int or None + Number of estimators to use + + Returns + ------- + proba : ndarray of shape (len(sample_idx), 2) + """ + if n_estimators is None: + n_estimators = len(self.estimators_) + + n_samples = len(sample_idx) + F = np.full(n_samples, self.init_score_, dtype=np.float64) + + for i in range(min(n_estimators, len(self.estimators_))): + estimator = self.estimators_[i] + if estimator is None: + continue + + info = self._gradient_estimator_info[i] + if info is not None: + # Exact kernel: get K[sample_idx, train_idx] + train_idx = info['train_idx'] + K_pred = K_train[np.ix_(sample_idx, train_idx)] + else: + K_pred = K_train[sample_idx] + + proba_est = estimator.predict_proba(K_pred) + h = 2 * proba_est[:, 1] - 1 + F = F + self.estimator_weights_[i] * h + + p = 1 / (1 + np.exp(-F)) + p = np.clip(p, 1e-10, 1 - 1e-10) + return np.column_stack([1 - p, p]) + + def _gradient_predict_internal(self, K_train, sample_idx, n_estimators=None): + """Internal method for exact kernel prediction during training.""" + proba = self._gradient_predict_proba_internal(K_train, sample_idx, n_estimators) + return self.classes_.take(np.argmax(proba, axis=1), axis=0) + + def _gradient_predict_proba(self, X, n_estimators=None): + """Predict class probabilities using gradient boosting ensemble. 
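As a minimal, standalone illustration of the blended early-stopping score used in the loop above (weighted F1 plus ROC-AUC, falling back to F1 alone when AUC is undefined), with invented toy arrays in place of the ensemble's real outputs:

```python
# Illustrative sketch only; not part of the patch itself.
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score

y_true = np.array([0, 1, 1, 0, 1, 0])
proba_pos = np.array([0.2, 0.8, 0.6, 0.4, 0.9, 0.1])  # P(class 1) from the ensemble
y_pred = (proba_pos >= 0.5).astype(int)

f1_val = f1_score(y_true, y_pred, average="weighted", zero_division=0)
try:
    roc_auc_val = roc_auc_score(y_true, proba_pos)
    val_score = 0.7 * f1_val + 0.3 * roc_auc_val   # blended validation score
except ValueError:
    val_score = f1_val                             # AUC undefined (single class) -> F1 only
```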
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input samples (original features - will be transformed) + n_estimators : int or None + Number of estimators to use (None = all) + + Returns + ------- + proba : ndarray of shape (n_samples, 2) + Class probabilities + """ + if n_estimators is None: + n_estimators = len(self.estimators_) + + n_samples = X.shape[0] + + # Start with initial score + F = np.full(n_samples, self.init_score_, dtype=np.float64) + + # For exact kernels, we need to compute kernel between X and training samples + # X here is the transformed (scaled) features + is_exact_kernel = getattr(self, '_gradient_exact_kernel', False) + + # Add contributions from estimators + for i in range(min(n_estimators, len(self.estimators_))): + estimator = self.estimators_[i] + if estimator is None: + # Skip None estimators (from failed fits or single-class residuals) + continue + + if is_exact_kernel: + # Get estimator info to know which training samples were used + info = self._gradient_estimator_info[i] + if info is not None: + train_idx = info['train_idx'] + # Compute kernel between X and the training samples used by this estimator + X_train_subset = self.X_fit_[train_idx] + else: + X_train_subset = self.X_fit_ + + # Compute kernel matrix between test and training samples + K_pred = self._compute_kernel_matrix(X, X_train_subset) + proba_est = estimator.predict_proba(K_pred) + else: + # For linear or approximated kernels, X is already the right format + proba_est = estimator.predict_proba(X) + + h = 2 * proba_est[:, 1] - 1 # Maps [0, 1] to [-1, 1] + F = F + self.estimator_weights_[i] * h + + # Convert to probabilities + p = 1 / (1 + np.exp(-F)) + p = np.clip(p, 1e-10, 1 - 1e-10) + + proba = np.column_stack([1 - p, p]) + return proba + + def _gradient_predict(self, X, n_estimators=None): + """Predict class labels using gradient boosting ensemble. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input samples + n_estimators : int or None + Number of estimators to use (None = all) + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Predicted class labels + """ + proba = self._gradient_predict_proba(X, n_estimators) + return self.classes_.take(np.argmax(proba, axis=1), axis=0) + + def _gradient_decision_function(self, X): + """Compute decision function using gradient boosting. + + Returns the raw log-odds scores. 
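A small self-contained sketch of the prediction path implemented above: per-estimator outputs are accumulated in log-odds space and the total is passed through a sigmoid. The step sizes and h values below are invented toy numbers, not the fitted attributes:

```python
# Toy illustration of accumulating log-odds and converting to probabilities.
import numpy as np

init_score = np.log(0.3 / 0.7)            # log-odds of a 30% positive-class prior
step_sizes = [0.5, 0.5, 0.5]              # stand-ins for estimator_weights_
h_outputs = [np.array([0.4, -0.2]),       # each h lives in [-1, 1]
             np.array([0.1, -0.6]),
             np.array([0.3, -0.1])]

F = np.full(2, init_score)
for step, h in zip(step_sizes, h_outputs):
    F += step * h                          # accumulate log-odds contributions

p = 1.0 / (1.0 + np.exp(-F))               # sigmoid -> P(class 1)
proba = np.column_stack([1 - p, p])
```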
+ """ + n_samples = X.shape[0] + + # Start with initial score + F = np.full(n_samples, self.init_score_, dtype=np.float64) + + # For exact kernels, we need to compute kernel between X and training samples + is_exact_kernel = getattr(self, '_gradient_exact_kernel', False) + + # Add contributions from estimators + for i in range(len(self.estimators_)): + estimator = self.estimators_[i] + if estimator is None: + # Skip None estimators (from failed fits or single-class residuals) + continue + + if is_exact_kernel: + # Get estimator info to know which training samples were used + info = self._gradient_estimator_info[i] + if info is not None: + train_idx = info['train_idx'] + X_train_subset = self.X_fit_[train_idx] + else: + X_train_subset = self.X_fit_ + + # Compute kernel matrix between test and training samples + K_pred = self._compute_kernel_matrix(X, X_train_subset) + proba_est = estimator.predict_proba(K_pred) + else: + proba_est = estimator.predict_proba(X) + + h = 2 * proba_est[:, 1] - 1 # Maps [0, 1] to [-1, 1] + F = F + self.estimator_weights_[i] * h + + return F + + def _compute_kernel_matrix(self, X, Y=None): + """Compute kernel matrix with appropriate parameters for the kernel type. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) + First input + Y : array-like of shape (n_samples_Y, n_features), optional + Second input. If None, compute K(X, X). + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel matrix + """ + gamma = self.gamma if self.gamma is not None else 1.0 / X.shape[1] + + # Build kernel parameters based on kernel type + if self.kernel == 'rbf': + return pairwise_kernels(X, Y, metric='rbf', gamma=gamma) + elif self.kernel == 'poly': + return pairwise_kernels( + X, Y, metric='poly', + gamma=gamma, degree=self.degree, coef0=self.coef0 + ) + elif self.kernel == 'sigmoid': + return pairwise_kernels( + X, Y, metric='sigmoid', + gamma=gamma, coef0=self.coef0 + ) + elif self.kernel == 'linear': + return pairwise_kernels(X, Y, metric='linear') + else: + # Custom or callable kernel + return pairwise_kernels(X, Y, metric=self.kernel) def _staged_predict_single(self, X, n_estimators): """Predict using first n_estimators for validation. @@ -1441,7 +1934,18 @@ class in ``classes_``, respectively. check_is_fitted(self) X_transformed = self.scaler_.transform(X) - # Decide which representation to use at prediction time: + # For gradient boosting, handle kernels differently + if self.boosting_type == "gradient": + if self.kernel == "linear": + test_data = X_transformed + elif self._use_kernel_approx(): + test_data = self.kernel_approx_.transform(X_transformed) + else: + # For exact kernels, pass transformed features + test_data = X_transformed + return self._gradient_decision_function(test_data) + + # Decide which representation to use at prediction time (for AdaBoost): if self.kernel == "linear": test_data = X_transformed @@ -1498,9 +2002,64 @@ def predict(self, X): y : ndarray of shape (n_samples,) The predicted classes. 
""" + # For gradient boosting, use sigmoid threshold + if self.boosting_type == "gradient": + check_is_fitted(self) + X_transformed = self.scaler_.transform(X) + + # Transform data based on kernel type + if self.kernel == "linear": + test_data = X_transformed + elif self._use_kernel_approx(): + test_data = self.kernel_approx_.transform(X_transformed) + else: + # For exact kernels, pass transformed features - kernel will be computed + # inside _gradient_predict for each estimator + test_data = X_transformed + + return self._gradient_predict(test_data) + pred = self.decision_function(X) if self.n_classes_ == 2: return self.classes_.take(pred > 0, axis=0) - return self.classes_.take(np.argmax(pred, axis=1), axis=0) \ No newline at end of file + return self.classes_.take(np.argmax(pred, axis=1), axis=0) + + def predict_proba(self, X): + """Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the weighted mean predicted class probabilities of the classifiers + in the ensemble. + + Parameters + ---------- + X : {array-like} of shape (n_samples, n_features) + The training input samples. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of + outputs is the same of that of the :term:`classes_` attribute. + """ + # For gradient boosting, use sigmoid of log-odds + if self.boosting_type == "gradient": + check_is_fitted(self) + X_transformed = self.scaler_.transform(X) + + # Transform data based on kernel type + if self.kernel == "linear": + test_data = X_transformed + elif self._use_kernel_approx(): + test_data = self.kernel_approx_.transform(X_transformed) + else: + # For exact kernels, pass transformed features - kernel will be computed + # inside _gradient_predict_proba for each estimator + test_data = X_transformed + + return self._gradient_predict_proba(test_data) + + # For AdaBoost, use parent implementation + return super().predict_proba(X) \ No newline at end of file diff --git a/src/linearboost/sefr.py b/src/linearboost/sefr.py index 0b4e723..69a0c42 100644 --- a/src/linearboost/sefr.py +++ b/src/linearboost/sefr.py @@ -280,9 +280,10 @@ def fit(self, X, y, sample_weight=None) -> Self: else: K = self._get_kernel_matrix(X) + # Validate sample weights pos_labels = y_ == 1 neg_labels = y_ == 0 - + pos_sample_weight, neg_sample_weight = None, None if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32) From b9ad7a95a6a86e104bab2ed6e13ff8edf65d8d59 Mon Sep 17 00:00:00 2001 From: Hamidreza Keshavarz <32555614+hamidkm9@users.noreply.github.com> Date: Sun, 15 Feb 2026 21:52:37 +0100 Subject: [PATCH 3/4] Update linear_boost.py --- src/linearboost/linear_boost.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/linearboost/linear_boost.py b/src/linearboost/linear_boost.py index 494df76..76ae56f 100644 --- a/src/linearboost/linear_boost.py +++ b/src/linearboost/linear_boost.py @@ -42,8 +42,7 @@ from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.validation import check_is_fitted from sklearn.kernel_approximation import RBFSampler, Nystroem -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, f1_score, roc_auc_score +from sklearn.metrics import f1_score, roc_auc_score from ._utils import SKLEARN_V1_6_OR_LATER, check_X_y, validate_data from .sefr import SEFR From 88d090e7ab2ddd1dd4c88d7a33ae7eb70c712eb4 Mon Sep 17 
00:00:00 2001 From: Hamidreza Keshavarz <32555614+hamidkm9@users.noreply.github.com> Date: Sun, 15 Feb 2026 21:55:09 +0100 Subject: [PATCH 4/4] Spacing issues fixed --- src/linearboost/linear_boost.py | 563 ++++++++++++++++++-------------- src/linearboost/sefr.py | 2 +- 2 files changed, 326 insertions(+), 239 deletions(-) diff --git a/src/linearboost/linear_boost.py b/src/linearboost/linear_boost.py index 76ae56f..4088de8 100644 --- a/src/linearboost/linear_boost.py +++ b/src/linearboost/linear_boost.py @@ -306,7 +306,7 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): kernel : {'linear', 'poly', 'rbf', 'sigmoid'} or callable, default='linear' Specifies the kernel type to be used in the algorithm. If a callable is given, it is used to pre-compute the kernel matrix. - + kernel_approx : {'rff', 'nystrom'} or None, default=None Optional kernel approximation strategy for non-linear kernels. @@ -357,7 +357,7 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): early_stopping : bool, default=False Whether to use early stopping to terminate training when validation score is not improving. If True, it requires ``n_iter_no_change`` to be set. - + If ``subsample < 1.0`` (subsampling is enabled), Out-of-Bag (OOB) evaluation is automatically used instead of a fixed validation split. This is more data-efficient as it uses all training data while still providing validation @@ -393,16 +393,16 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): each base learner, helping to prevent overfitting and improve generalization. This is similar to the shrinkage used in gradient boosting methods. - + - If `shrinkage = 1.0`: no shrinkage (full weight) - If `shrinkage < 1.0`: apply shrinkage (e.g., 0.8 means 80% weight) - + Values must be in the range `(0, 1]`. Typical values are in the range `[0.8, 1.0]` for moderate regularization or `1.0` for no regularization. boosting_type : {'adaboost', 'gradient'}, default='adaboost' The type of boosting algorithm to use: - + - 'adaboost': Use the AdaBoost algorithm (SAMME or SAMME.R) which reweights samples based on classification errors. This is the original LinearBoost approach. @@ -410,7 +410,7 @@ class LinearBoostClassifier(_DenseAdaBoostClassifier): to the pseudo-residuals (negative gradient of log-loss). This can be more effective for complex non-linear patterns and provides smoother decision boundaries. 
- + When ``boosting_type='gradient'``: - The ``algorithm`` parameter is ignored - Each estimator predicts pseudo-residuals instead of class labels @@ -676,6 +676,7 @@ def _get_kernel_matrix(self, X, Y=None): degree=self.degree, coef0=self.coef0, ) + def _use_kernel_approx(self) -> bool: """Return True if we should use kernel approximation.""" return self.kernel != "linear" and self.kernel_approx is not None @@ -803,14 +804,19 @@ def fit(self, X, y, sample_weight=None) -> Self: validation_data = None y_val = None training_data_val = None - X_val_transformed = None # Store original features for validation (needed for exact kernels) + X_val_transformed = ( + None # Store original features for validation (needed for exact kernels) + ) use_oob = False # Flag to use OOB evaluation instead of fixed validation split - + # Use OOB evaluation if subsampling is enabled and early stopping is requested - if (self.early_stopping and self.n_iter_no_change is not None and - self.subsample < 1.0): + if ( + self.early_stopping + and self.n_iter_no_change is not None + and self.subsample < 1.0 + ): # Check if we can use OOB (skip for exact kernels) - is_exact_kernel = (not self._use_kernel_approx() and self.kernel != "linear") + is_exact_kernel = not self._use_kernel_approx() and self.kernel != "linear" if not is_exact_kernel: use_oob = True # Store full data for OOB evaluation @@ -820,7 +826,7 @@ def fit(self, X, y, sample_weight=None) -> Self: validation_data = (X_transformed, y, sample_weight) else: validation_data = (training_data, y, sample_weight) - + if self.early_stopping and self.n_iter_no_change is not None and not use_oob: # Split BEFORE kernel computation for exact kernels # For exact kernels, we need to split X_transformed, not the kernel matrix @@ -828,33 +834,32 @@ def fit(self, X, y, sample_weight=None) -> Self: # For exact kernels, split the original features n_samples = X_transformed.shape[0] n_val_samples = max(1, int(self.validation_fraction * n_samples)) - + from sklearn.model_selection import StratifiedShuffleSplit + splitter = StratifiedShuffleSplit( - n_splits=1, - test_size=n_val_samples, - random_state=42 + n_splits=1, test_size=n_val_samples, random_state=42 ) train_idx, val_idx = next(splitter.split(X_transformed, y)) - + # Split original features X_train_transformed = X_transformed[train_idx] X_val_transformed = X_transformed[val_idx] y_train = y[train_idx] y_val = y[val_idx] - + # Recompute kernel matrix for training only self.X_fit_ = X_train_transformed self.K_train_ = self._get_kernel_matrix(X_train_transformed) training_data = self.K_train_ - + # Split sample weights if provided if sample_weight is not None: sample_weight_val = sample_weight[val_idx] sample_weight = sample_weight[train_idx] else: sample_weight_val = None - + # Store validation data (original features for kernel computation) validation_data = (X_val_transformed, y_val, sample_weight_val) y = y_train @@ -862,28 +867,27 @@ def fit(self, X, y, sample_weight=None) -> Self: # For linear or approximate kernels, split after transformation n_samples = training_data.shape[0] n_val_samples = max(1, int(self.validation_fraction * n_samples)) - + from sklearn.model_selection import StratifiedShuffleSplit + splitter = StratifiedShuffleSplit( - n_splits=1, - test_size=n_val_samples, - random_state=42 + n_splits=1, test_size=n_val_samples, random_state=42 ) train_idx, val_idx = next(splitter.split(training_data, y)) - + # Split training data training_data_val = training_data[val_idx] y_val = y[val_idx] training_data = 
training_data[train_idx] y_train = y[train_idx] - + # Split sample weights if provided if sample_weight is not None: sample_weight_val = sample_weight[val_idx] sample_weight = sample_weight[train_idx] else: sample_weight_val = None - + # Store validation data for checking validation_data = (training_data_val, y_val, sample_weight_val) y = y_train @@ -902,21 +906,39 @@ def fit(self, X, y, sample_weight=None) -> Self: category=FutureWarning, message=".*parameter 'algorithm' is deprecated.*", ) - + # Use gradient boosting if specified if self.boosting_type == "gradient": - return self._fit_gradient_boosting(training_data, y_train, sample_weight, validation_data, use_oob=use_oob) - + return self._fit_gradient_boosting( + training_data, + y_train, + sample_weight, + validation_data, + use_oob=use_oob, + ) + # If early stopping is enabled, use custom boosting loop - if self.early_stopping and self.n_iter_no_change is not None and validation_data is not None: - return self._fit_with_early_stopping(training_data, y_train, sample_weight, validation_data, use_oob=use_oob) - + if ( + self.early_stopping + and self.n_iter_no_change is not None + and validation_data is not None + ): + return self._fit_with_early_stopping( + training_data, + y_train, + sample_weight, + validation_data, + use_oob=use_oob, + ) + # Pass the precomputed kernel matrix (or raw features for linear) return super().fit(training_data, y_train, sample_weight) - def _fit_with_early_stopping(self, X, y, sample_weight, validation_data, use_oob=False): + def _fit_with_early_stopping( + self, X, y, sample_weight, validation_data, use_oob=False + ): """Fit with early stopping based on validation error or OOB evaluation. - + Parameters ---------- X : array-like @@ -939,46 +961,49 @@ def _fit_with_early_stopping(self, X, y, sample_weight, validation_data, use_oob else: # Traditional validation split X_val, y_val, sample_weight_val = validation_data - + # Initialize from parent class from sklearn.utils import check_random_state - + # Initialize attributes needed for boosting # Ensure estimator_ is set (needed by _make_estimator) - if not hasattr(self, 'estimator_') or self.estimator_ is None: + if not hasattr(self, "estimator_") or self.estimator_ is None: # Reuse the same logic from __init__ to create base estimator try: if self.kernel_approx is not None or self.kernel == "linear": from .sefr import SEFR + self.estimator_ = SEFR(kernel="linear") else: from .sefr import SEFR + self.estimator_ = SEFR(kernel="precomputed") except (ValueError, TypeError): from .sefr import SEFR + self.estimator_ = SEFR(kernel="linear") - + self.estimators_ = [] self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) - + # Initialize sample weights if sample_weight is None: sample_weight = np.ones(X.shape[0], dtype=np.float64) sample_weight /= sample_weight.sum() - + random_state = check_random_state(None) - + # Track best validation score and iterations without improvement best_val_score = -np.inf n_no_improvement = 0 best_n_estimators = 0 - + # For OOB, we need to store X_fit_ reference for exact kernels - if use_oob and hasattr(self, 'X_fit_') and self.X_fit_ is not None: + if use_oob and hasattr(self, "X_fit_") and self.X_fit_ is not None: # Store reference to training features for kernel computation pass # Already stored - + # Early stopping loop for iboost in range(self.n_estimators): # Perform a single boost @@ -988,7 +1013,9 @@ def _fit_with_early_stopping(self, X, y, 
sample_weight, validation_data, use_oob iboost, X, y, sample_weight, random_state, return_oob_indices=True ) if len(boost_result) == 4: - sample_weight, estimator_weight, estimator_error, oob_indices = boost_result + sample_weight, estimator_weight, estimator_error, oob_indices = ( + boost_result + ) oob_indices_history.append(oob_indices) else: sample_weight, estimator_weight, estimator_error = boost_result @@ -997,34 +1024,42 @@ def _fit_with_early_stopping(self, X, y, sample_weight, validation_data, use_oob sample_weight, estimator_weight, estimator_error = self._boost( iboost, X, y, sample_weight, random_state ) - + if sample_weight is None: break - + # Store results self.estimator_weights_[iboost] = estimator_weight self.estimator_errors_[iboost] = estimator_error - + # Evaluate on validation set or OOB samples using F1/ROC-AUC - if use_oob and len(oob_indices_history) > 0 and oob_indices_history[-1] is not None: + if ( + use_oob + and len(oob_indices_history) > 0 + and oob_indices_history[-1] is not None + ): # Use OOB samples from current iteration oob_idx = oob_indices_history[-1] if len(oob_idx) > 0: # Get OOB data (X_full is already transformed features/kernel matrix) X_oob = X_full[oob_idx] y_oob = y_full[oob_idx] - + # Get predictions and probabilities for F1/ROC-AUC val_pred = self._staged_predict_single(X_oob, iboost + 1) val_proba = self._staged_predict_proba_single(X_oob, iboost + 1) - + # Compute F1 score (primary metric) - f1_val = f1_score(y_oob, val_pred, average='weighted', zero_division=0.0) - + f1_val = f1_score( + y_oob, val_pred, average="weighted", zero_division=0.0 + ) + # Compute ROC-AUC if possible (requires probabilities) try: if val_proba is not None and val_proba.shape[1] >= 2: - roc_auc_val = roc_auc_score(y_oob, val_proba[:, 1], average='weighted') + roc_auc_val = roc_auc_score( + y_oob, val_proba[:, 1], average="weighted" + ) # Combined metric: 70% F1, 30% ROC-AUC val_score = 0.7 * f1_val + 0.3 * roc_auc_val else: @@ -1039,14 +1074,18 @@ def _fit_with_early_stopping(self, X, y, sample_weight, validation_data, use_oob # Traditional validation split val_pred = self._staged_predict_single(X_val, iboost + 1) val_proba = self._staged_predict_proba_single(X_val, iboost + 1) - + # Compute F1 score (primary metric) - f1_val = f1_score(y_val, val_pred, average='weighted', zero_division=0.0) - + f1_val = f1_score( + y_val, val_pred, average="weighted", zero_division=0.0 + ) + # Compute ROC-AUC if possible try: if val_proba is not None and val_proba.shape[1] >= 2: - roc_auc_val = roc_auc_score(y_val, val_proba[:, 1], average='weighted') + roc_auc_val = roc_auc_score( + y_val, val_proba[:, 1], average="weighted" + ) # Combined metric: 70% F1, 30% ROC-AUC val_score = 0.7 * f1_val + 0.3 * roc_auc_val else: @@ -1054,7 +1093,7 @@ def _fit_with_early_stopping(self, X, y, sample_weight, validation_data, use_oob except (ValueError, IndexError): # Fallback to F1 only if ROC-AUC fails val_score = f1_val - + # Check for improvement if val_score > best_val_score + self.tol: best_val_score = val_score @@ -1062,25 +1101,29 @@ def _fit_with_early_stopping(self, X, y, sample_weight, validation_data, use_oob best_n_estimators = iboost + 1 else: n_no_improvement += 1 - + # Early stopping check if n_no_improvement >= self.n_iter_no_change: # Trim estimators to best point if best_n_estimators > 0: self.estimators_ = self.estimators_[:best_n_estimators] - self.estimator_weights_ = self.estimator_weights_[:best_n_estimators] + self.estimator_weights_ = self.estimator_weights_[ + 
:best_n_estimators + ] self.estimator_errors_ = self.estimator_errors_[:best_n_estimators] break - + return self - def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_oob=False): + def _fit_gradient_boosting( + self, X, y, sample_weight, validation_data=None, use_oob=False + ): """Fit using gradient boosting instead of AdaBoost. - + Gradient boosting fits each new estimator to the pseudo-residuals (negative gradient of the log-loss), which can handle non-linear patterns more effectively than sample reweighting. - + Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -1093,76 +1136,80 @@ def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_ Validation data for early stopping use_oob : bool Whether to use OOB evaluation - + Returns ------- self : object Fitted estimator """ from sklearn.utils import check_random_state - + n_samples = X.shape[0] - + # Determine if we're using exact (precomputed) kernels # Exact kernel: non-linear kernel without approximation self._gradient_exact_kernel = ( not self._use_kernel_approx() and self.kernel != "linear" ) - + # For exact kernels, X is the kernel matrix K_train_ # We also have X_fit_ which contains the original transformed features if self._gradient_exact_kernel: # Store all training indices for computing prediction kernels self._gradient_train_indices = np.arange(n_samples) - + # Initialize estimator list and weights self.estimators_ = [] self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) - + # For exact kernels, store estimator training info self._gradient_estimator_info = [] - + # Map labels to 0/1 if needed y_binary = np.where(y == self.classes_[0], 0, 1).astype(np.float64) - + # Initialize with log-odds of class prior pos_rate = np.clip(y_binary.mean(), 1e-10, 1 - 1e-10) self.init_score_ = np.log(pos_rate / (1 - pos_rate)) - + # Current predictions (log-odds space) F = np.full(n_samples, self.init_score_, dtype=np.float64) self.F_ = F # Store for reference - + # Sample weights if sample_weight is None: sample_weight = np.ones(n_samples, dtype=np.float64) sample_weight = sample_weight / sample_weight.sum() - + random_state = check_random_state(None) - + # Early stopping tracking best_val_score = -np.inf n_no_improvement = 0 best_n_estimators = 0 - + # Validation data setup if validation_data is not None: X_val, y_val, _ = validation_data if use_oob: - y_val_binary = np.where(y_val == self.classes_[0], 0, 1).astype(np.float64) + y_val_binary = np.where(y_val == self.classes_[0], 0, 1).astype( + np.float64 + ) else: - y_val_binary = np.where(y_val == self.classes_[0], 0, 1).astype(np.float64) - + y_val_binary = np.where(y_val == self.classes_[0], 0, 1).astype( + np.float64 + ) + for iboost in range(self.n_estimators): # Compute probabilities from current predictions p = 1 / (1 + np.exp(-F)) p = np.clip(p, 1e-10, 1 - 1e-10) - + # Compute pseudo-residuals (negative gradient of log-loss) # For log-loss: gradient = p - y, so negative gradient = y - p residuals = y_binary - p - + # Apply subsample if enabled if self.subsample < 1.0: n_subsample = max(1, int(self.subsample * n_samples)) @@ -1178,16 +1225,16 @@ def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_ residuals_train = residuals weights_train = sample_weight oob_idx = None - + # Convert residuals to binary labels for SEFR # Positive residual (y > p) -> class 1 (need to increase prediction) # Negative 
residual (y < p) -> class 0 (need to decrease prediction) residual_labels = (residuals_train > 0).astype(int) - + # Use magnitude of residuals as sample weights (larger residuals = more important) residual_weights = np.abs(residuals_train) * weights_train residual_weights = residual_weights / (residual_weights.sum() + 1e-10) - + # Create SEFR estimator with appropriate kernel settings # For exact kernels, we need to handle the kernel matrix properly if self._gradient_exact_kernel: @@ -1202,7 +1249,7 @@ def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_ else: estimator = SEFR(kernel="linear") X_train_kernel = X_train # Original features or approximated features - + try: # Check if we have both classes in the residual labels unique_labels = np.unique(residual_labels) @@ -1213,21 +1260,23 @@ def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_ self.estimator_weights_[iboost] = 0.0 self._gradient_estimator_info.append(None) continue - - estimator.fit(X_train_kernel, residual_labels, sample_weight=residual_weights) + + estimator.fit( + X_train_kernel, residual_labels, sample_weight=residual_weights + ) except Exception: # If fitting fails, stop boosting break - + # Store info about which training samples were used (for exact kernel prediction) if self._gradient_exact_kernel: if self.subsample < 1.0: - estimator_info = {'train_idx': subsample_idx.copy()} + estimator_info = {"train_idx": subsample_idx.copy()} else: - estimator_info = {'train_idx': np.arange(n_samples)} + estimator_info = {"train_idx": np.arange(n_samples)} else: estimator_info = None - + # Get continuous predictions from SEFR using predict_proba # Use the probability of class 1 (positive residual direction) # Transform to [-1, 1] range: 2 * proba - 1 @@ -1242,20 +1291,20 @@ def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_ else: proba = estimator.predict_proba(X) h = 2 * proba[:, 1] - 1 # Maps [0, 1] to [-1, 1] - + # Line search for optimal step size (simplified Newton step) # For log-loss, optimal step is approximately residuals / (p * (1-p)) # We use a simplified approach with learning_rate * shrinkage step_size = self.learning_rate * self.shrinkage - + # Update predictions F = F + step_size * h - + # Store estimator and its info self.estimators_.append(estimator) self.estimator_weights_[iboost] = step_size self._gradient_estimator_info.append(estimator_info) - + # Compute training error (log-loss) p_new = 1 / (1 + np.exp(-F)) p_new = np.clip(p_new, 1e-10, 1 - 1e-10) @@ -1263,7 +1312,7 @@ def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_ y_binary * np.log(p_new) + (1 - y_binary) * np.log(1 - p_new) ) self.estimator_errors_[iboost] = train_loss - + # Early stopping check if self.early_stopping and validation_data is not None: if use_oob and oob_idx is not None and len(oob_idx) > 0: @@ -1271,18 +1320,22 @@ def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_ # For exact kernels, we need kernel between OOB samples and training if self._gradient_exact_kernel: # Pass the kernel submatrix for OOB samples - val_pred = self._gradient_predict_internal(X, oob_idx, iboost + 1) - val_proba = self._gradient_predict_proba_internal(X, oob_idx, iboost + 1) + val_pred = self._gradient_predict_internal( + X, oob_idx, iboost + 1 + ) + val_proba = self._gradient_predict_proba_internal( + X, oob_idx, iboost + 1 + ) else: val_pred = self._gradient_predict(X[oob_idx], iboost + 1) val_proba = 
self._gradient_predict_proba(X[oob_idx], iboost + 1) y_oob_binary = y_binary[oob_idx] - + f1_val = f1_score( - (y_oob_binary > 0.5).astype(int), - val_pred, - average='weighted', - zero_division=0.0 + (y_oob_binary > 0.5).astype(int), + val_pred, + average="weighted", + zero_division=0.0, ) try: roc_auc_val = roc_auc_score(y_oob_binary, val_proba[:, 1]) @@ -1293,47 +1346,56 @@ def _fit_gradient_boosting(self, X, y, sample_weight, validation_data=None, use_ # Use validation set val_pred = self._gradient_predict(X_val, iboost + 1) val_proba = self._gradient_predict_proba(X_val, iboost + 1) - + f1_val = f1_score( - y_val_binary.astype(int), - val_pred, - average='weighted', - zero_division=0.0 + y_val_binary.astype(int), + val_pred, + average="weighted", + zero_division=0.0, ) try: roc_auc_val = roc_auc_score(y_val_binary, val_proba[:, 1]) val_score = 0.7 * f1_val + 0.3 * roc_auc_val except (ValueError, IndexError): val_score = f1_val - + if val_score > best_val_score + self.tol: best_val_score = val_score n_no_improvement = 0 best_n_estimators = iboost + 1 else: n_no_improvement += 1 - - if self.n_iter_no_change is not None and n_no_improvement >= self.n_iter_no_change: + + if ( + self.n_iter_no_change is not None + and n_no_improvement >= self.n_iter_no_change + ): if best_n_estimators > 0: self.estimators_ = self.estimators_[:best_n_estimators] - self.estimator_weights_ = self.estimator_weights_[:best_n_estimators] - self.estimator_errors_ = self.estimator_errors_[:best_n_estimators] - self._gradient_estimator_info = self._gradient_estimator_info[:best_n_estimators] + self.estimator_weights_ = self.estimator_weights_[ + :best_n_estimators + ] + self.estimator_errors_ = self.estimator_errors_[ + :best_n_estimators + ] + self._gradient_estimator_info = self._gradient_estimator_info[ + :best_n_estimators + ] break - + # Trim arrays to actual number of estimators n_fitted = len(self.estimators_) self.estimator_weights_ = self.estimator_weights_[:n_fitted] self.estimator_errors_ = self.estimator_errors_[:n_fitted] self._gradient_estimator_info = self._gradient_estimator_info[:n_fitted] - + return self - + def _gradient_predict_proba_internal(self, K_train, sample_idx, n_estimators=None): """Internal method for exact kernel prediction during training. - + Used for OOB evaluation where we have the full training kernel matrix. 
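For intuition, a rough standalone sketch of one pseudo-residual iteration like the boosting loop above, with scikit-learn's LogisticRegression as a hypothetical stand-in for SEFR and made-up data:

```python
# One gradient-boosting step on pseudo-residuals (illustrative sketch only).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, random_state=0)
pos_rate = np.clip(y.mean(), 1e-10, 1 - 1e-10)
F = np.full(len(y), np.log(pos_rate / (1 - pos_rate)))   # init: prior log-odds

p = 1.0 / (1.0 + np.exp(-F))
residuals = y - p                                        # negative gradient of log-loss

labels = (residuals > 0).astype(int)                     # sign of residual -> binary target
weights = np.abs(residuals)                              # magnitude -> sample weight
weights = weights / (weights.sum() + 1e-10)

weak = LogisticRegression(max_iter=1000).fit(X, labels, sample_weight=weights)
h = 2 * weak.predict_proba(X)[:, 1] - 1                  # map [0, 1] to [-1, 1]
F = F + 0.5 * 0.9 * h                                    # learning_rate * shrinkage step
```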
- + Parameters ---------- K_train : ndarray of shape (n_train, n_train) @@ -1342,38 +1404,38 @@ def _gradient_predict_proba_internal(self, K_train, sample_idx, n_estimators=Non Indices of samples to predict (rows to use) n_estimators : int or None Number of estimators to use - + Returns ------- proba : ndarray of shape (len(sample_idx), 2) """ if n_estimators is None: n_estimators = len(self.estimators_) - + n_samples = len(sample_idx) F = np.full(n_samples, self.init_score_, dtype=np.float64) - + for i in range(min(n_estimators, len(self.estimators_))): estimator = self.estimators_[i] if estimator is None: continue - + info = self._gradient_estimator_info[i] if info is not None: # Exact kernel: get K[sample_idx, train_idx] - train_idx = info['train_idx'] + train_idx = info["train_idx"] K_pred = K_train[np.ix_(sample_idx, train_idx)] else: K_pred = K_train[sample_idx] - + proba_est = estimator.predict_proba(K_pred) h = 2 * proba_est[:, 1] - 1 F = F + self.estimator_weights_[i] * h - + p = 1 / (1 + np.exp(-F)) p = np.clip(p, 1e-10, 1 - 1e-10) return np.column_stack([1 - p, p]) - + def _gradient_predict_internal(self, K_train, sample_idx, n_estimators=None): """Internal method for exact kernel prediction during training.""" proba = self._gradient_predict_proba_internal(K_train, sample_idx, n_estimators) @@ -1381,14 +1443,14 @@ def _gradient_predict_internal(self, K_train, sample_idx, n_estimators=None): def _gradient_predict_proba(self, X, n_estimators=None): """Predict class probabilities using gradient boosting ensemble. - + Parameters ---------- X : array-like of shape (n_samples, n_features) Input samples (original features - will be transformed) n_estimators : int or None Number of estimators to use (None = all) - + Returns ------- proba : ndarray of shape (n_samples, 2) @@ -1396,60 +1458,60 @@ def _gradient_predict_proba(self, X, n_estimators=None): """ if n_estimators is None: n_estimators = len(self.estimators_) - + n_samples = X.shape[0] - + # Start with initial score F = np.full(n_samples, self.init_score_, dtype=np.float64) - + # For exact kernels, we need to compute kernel between X and training samples # X here is the transformed (scaled) features - is_exact_kernel = getattr(self, '_gradient_exact_kernel', False) - + is_exact_kernel = getattr(self, "_gradient_exact_kernel", False) + # Add contributions from estimators for i in range(min(n_estimators, len(self.estimators_))): estimator = self.estimators_[i] if estimator is None: # Skip None estimators (from failed fits or single-class residuals) continue - + if is_exact_kernel: # Get estimator info to know which training samples were used info = self._gradient_estimator_info[i] if info is not None: - train_idx = info['train_idx'] + train_idx = info["train_idx"] # Compute kernel between X and the training samples used by this estimator X_train_subset = self.X_fit_[train_idx] else: X_train_subset = self.X_fit_ - + # Compute kernel matrix between test and training samples K_pred = self._compute_kernel_matrix(X, X_train_subset) proba_est = estimator.predict_proba(K_pred) else: # For linear or approximated kernels, X is already the right format proba_est = estimator.predict_proba(X) - + h = 2 * proba_est[:, 1] - 1 # Maps [0, 1] to [-1, 1] F = F + self.estimator_weights_[i] * h - + # Convert to probabilities p = 1 / (1 + np.exp(-F)) p = np.clip(p, 1e-10, 1 - 1e-10) - + proba = np.column_stack([1 - p, p]) return proba def _gradient_predict(self, X, n_estimators=None): """Predict class labels using gradient boosting ensemble. 
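The np.ix_ indexing used above selects the kernel block between the samples being scored (e.g., OOB rows) and the samples each estimator was trained on; a toy illustration with a made-up kernel matrix:

```python
# Submatrix extraction for a precomputed kernel (illustrative sketch only).
import numpy as np

K_train = np.arange(36, dtype=float).reshape(6, 6)   # pretend 6x6 training kernel
sample_idx = np.array([0, 3, 5])                     # rows: samples to score (e.g., OOB)
train_idx = np.array([1, 2, 4])                      # columns: subsample used to fit

K_pred = K_train[np.ix_(sample_idx, train_idx)]      # shape (3, 3): K[i, j] over the two index sets
```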
- + Parameters ---------- X : array-like of shape (n_samples, n_features) Input samples n_estimators : int or None Number of estimators to use (None = all) - + Returns ------- y_pred : ndarray of shape (n_samples,) @@ -1460,83 +1522,81 @@ def _gradient_predict(self, X, n_estimators=None): def _gradient_decision_function(self, X): """Compute decision function using gradient boosting. - + Returns the raw log-odds scores. """ n_samples = X.shape[0] - + # Start with initial score F = np.full(n_samples, self.init_score_, dtype=np.float64) - + # For exact kernels, we need to compute kernel between X and training samples - is_exact_kernel = getattr(self, '_gradient_exact_kernel', False) - + is_exact_kernel = getattr(self, "_gradient_exact_kernel", False) + # Add contributions from estimators for i in range(len(self.estimators_)): estimator = self.estimators_[i] if estimator is None: # Skip None estimators (from failed fits or single-class residuals) continue - + if is_exact_kernel: # Get estimator info to know which training samples were used info = self._gradient_estimator_info[i] if info is not None: - train_idx = info['train_idx'] + train_idx = info["train_idx"] X_train_subset = self.X_fit_[train_idx] else: X_train_subset = self.X_fit_ - + # Compute kernel matrix between test and training samples K_pred = self._compute_kernel_matrix(X, X_train_subset) proba_est = estimator.predict_proba(K_pred) else: proba_est = estimator.predict_proba(X) - + h = 2 * proba_est[:, 1] - 1 # Maps [0, 1] to [-1, 1] F = F + self.estimator_weights_[i] * h - + return F def _compute_kernel_matrix(self, X, Y=None): """Compute kernel matrix with appropriate parameters for the kernel type. - + Parameters ---------- X : array-like of shape (n_samples_X, n_features) First input Y : array-like of shape (n_samples_Y, n_features), optional Second input. If None, compute K(X, X). - + Returns ------- K : ndarray of shape (n_samples_X, n_samples_Y) Kernel matrix """ gamma = self.gamma if self.gamma is not None else 1.0 / X.shape[1] - + # Build kernel parameters based on kernel type - if self.kernel == 'rbf': - return pairwise_kernels(X, Y, metric='rbf', gamma=gamma) - elif self.kernel == 'poly': + if self.kernel == "rbf": + return pairwise_kernels(X, Y, metric="rbf", gamma=gamma) + elif self.kernel == "poly": return pairwise_kernels( - X, Y, metric='poly', - gamma=gamma, degree=self.degree, coef0=self.coef0 + X, Y, metric="poly", gamma=gamma, degree=self.degree, coef0=self.coef0 ) - elif self.kernel == 'sigmoid': + elif self.kernel == "sigmoid": return pairwise_kernels( - X, Y, metric='sigmoid', - gamma=gamma, coef0=self.coef0 + X, Y, metric="sigmoid", gamma=gamma, coef0=self.coef0 ) - elif self.kernel == 'linear': - return pairwise_kernels(X, Y, metric='linear') + elif self.kernel == "linear": + return pairwise_kernels(X, Y, metric="linear") else: # Custom or callable kernel return pairwise_kernels(X, Y, metric=self.kernel) - + def _staged_predict_single(self, X, n_estimators): """Predict using first n_estimators for validation. 
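A brief standalone example of building a test-versus-train kernel block with pairwise_kernels, using the same 1 / n_features fallback for gamma as the helper above (toy data, RBF chosen for illustration):

```python
# Test-vs-train kernel block via scikit-learn (illustrative sketch only).
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

rng = np.random.default_rng(0)
X_test = rng.normal(size=(4, 3))
X_train = rng.normal(size=(5, 3))

gamma = 1.0 / X_test.shape[1]                                      # fallback when gamma is None
K = pairwise_kernels(X_test, X_train, metric="rbf", gamma=gamma)   # shape (4, 5)
```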
- + X can be either: - Transformed features (for linear/approximate kernels) - Kernel matrix (for exact kernels) @@ -1545,18 +1605,23 @@ def _staged_predict_single(self, X, n_estimators): if n_estimators == 0: # Return majority class return np.full(X.shape[0], self.classes_[0]) - + # For exact kernels, if X is original features, compute kernel matrix - if (not self._use_kernel_approx() and self.kernel != "linear" and - hasattr(self, 'X_fit_') and self.X_fit_ is not None and - X.shape[1] == self.X_fit_.shape[1] and X.shape[1] != self.X_fit_.shape[0]): + if ( + not self._use_kernel_approx() + and self.kernel != "linear" + and hasattr(self, "X_fit_") + and self.X_fit_ is not None + and X.shape[1] == self.X_fit_.shape[1] + and X.shape[1] != self.X_fit_.shape[0] + ): # X appears to be original features, compute kernel matrix X = self._get_kernel_matrix(X, self.X_fit_) - + if self.algorithm == "SAMME.R": classes = self.classes_ n_classes = len(classes) - + pred = sum( self._samme_proba(estimator, n_classes, X) for estimator in self.estimators_[:n_estimators] @@ -1574,33 +1639,33 @@ def _staged_predict_single(self, X, n_estimators): # SAMME algorithm classes = self.classes_ pred = np.zeros((X.shape[0], n_classes)) - + for i, estimator in enumerate(self.estimators_[:n_estimators]): predictions = estimator.predict(X) for j, class_label in enumerate(classes): - pred[:, j] += ( - self.estimator_weights_[i] * (predictions == class_label) + pred[:, j] += self.estimator_weights_[i] * ( + predictions == class_label ) - + decision = pred - + if self.n_classes_ == 2: return self.classes_.take((decision > 0).astype(int), axis=0) else: return self.classes_.take(np.argmax(decision, axis=1), axis=0) - + def _staged_predict_proba_single(self, X, n_estimators): """Predict probabilities using first n_estimators for validation. - + Similar to _staged_predict_single but returns probabilities instead of predictions. 
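The SAMME branch above accumulates weighted votes per class and then takes the arg-max; a toy sketch of that voting scheme with invented predictions and weights:

```python
# Weighted voting over the first n estimators (illustrative sketch only).
import numpy as np

classes = np.array([0, 1])
est_preds = [np.array([0, 1, 1]), np.array([1, 1, 0])]   # per-estimator label predictions
est_weights = [0.8, 0.5]                                 # stand-ins for estimator_weights_

votes = np.zeros((3, len(classes)))
for w, preds in zip(est_weights, est_preds):
    for j, c in enumerate(classes):
        votes[:, j] += w * (preds == c)                  # add weight to the voted class

y_pred = classes.take(np.argmax(votes, axis=1))
```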
- + Parameters ---------- X : array-like Validation data (features or kernel matrix) n_estimators : int Number of estimators to use - + Returns ------- proba : ndarray of shape (n_samples, n_classes) @@ -1609,20 +1674,25 @@ def _staged_predict_proba_single(self, X, n_estimators): if n_estimators == 0: # Return uniform probabilities return np.ones((X.shape[0], self.n_classes_)) / self.n_classes_ - + # For exact kernels, if X is original features, compute kernel matrix - if (not self._use_kernel_approx() and self.kernel != "linear" and - hasattr(self, 'X_fit_') and self.X_fit_ is not None and - X.shape[1] == self.X_fit_.shape[1] and X.shape[1] != self.X_fit_.shape[0]): + if ( + not self._use_kernel_approx() + and self.kernel != "linear" + and hasattr(self, "X_fit_") + and self.X_fit_ is not None + and X.shape[1] == self.X_fit_.shape[1] + and X.shape[1] != self.X_fit_.shape[0] + ): # X appears to be original features, compute kernel matrix X = self._get_kernel_matrix(X, self.X_fit_) - + if self.algorithm == "SAMME.R": # Use decision function and convert to probabilities # This matches how predict_proba works in the parent class classes = self.classes_ n_classes = len(classes) - + pred = sum( self._samme_proba(estimator, n_classes, X) for estimator in self.estimators_[:n_estimators] @@ -1634,7 +1704,7 @@ def _staged_predict_proba_single(self, X, n_estimators): else: # No valid weights, return uniform return np.ones((X.shape[0], n_classes)) / n_classes - + # Convert SAMME.R output to probabilities # _samme_proba returns log-probability-like values (n_samples, n_classes) if n_classes == 2: @@ -1650,16 +1720,16 @@ def _staged_predict_proba_single(self, X, n_estimators): # Multi-class: use softmax exp_pred = np.exp(pred - np.max(pred, axis=1, keepdims=True)) proba = exp_pred / np.sum(exp_pred, axis=1, keepdims=True) - + return proba else: # SAMME algorithm: use weighted voting classes = self.classes_ n_classes = len(classes) proba = np.zeros((X.shape[0], n_classes)) - + for i, estimator in enumerate(self.estimators_[:n_estimators]): - if hasattr(estimator, 'predict_proba'): + if hasattr(estimator, "predict_proba"): estimator_proba = estimator.predict_proba(X) weight = self.estimator_weights_[i] proba += weight * estimator_proba @@ -1669,12 +1739,12 @@ def _staged_predict_proba_single(self, X, n_estimators): weight = self.estimator_weights_[i] for j, class_label in enumerate(classes): proba[:, j] += weight * (predictions == class_label) - + # Normalize proba_sum = np.sum(proba, axis=1, keepdims=True) proba_sum[proba_sum == 0] = 1.0 # Avoid division by zero proba /= proba_sum - + return proba @staticmethod @@ -1698,10 +1768,12 @@ def _samme_proba(estimator, n_classes, X): log_proba - (1.0 / n_classes) * log_proba.sum(axis=1)[:, np.newaxis] ) - def _compute_adaptive_learning_rate(self, iboost, estimator_error, base_learning_rate): + def _compute_adaptive_learning_rate( + self, iboost, estimator_error, base_learning_rate + ): """ Compute adaptive learning rate based on iteration and estimator error. 
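A compact sketch of turning aggregated decision scores into probabilities, a sigmoid for the binary case and a stabilized softmax otherwise; this mirrors the conversion idea above but is not the exact SAMME.R scaling used by scikit-learn:

```python
# Scores-to-probabilities conversion (illustrative sketch with toy scores).
import numpy as np

scores = np.array([[2.0, -2.0], [0.5, 1.5], [-1.0, 1.0]])   # (n_samples, n_classes)

# Binary case: positive-class probability from the score margin
margin = scores[:, 1] - scores[:, 0]
proba_pos = 1.0 / (1.0 + np.exp(-margin))

# Multi-class case: numerically stable softmax over class scores
exp_s = np.exp(scores - scores.max(axis=1, keepdims=True))
proba = exp_s / exp_s.sum(axis=1, keepdims=True)
```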
- + Parameters ---------- iboost : int @@ -1710,7 +1782,7 @@ def _compute_adaptive_learning_rate(self, iboost, estimator_error, base_learning Classification error of the current estimator (0-0.5) base_learning_rate : float Base learning rate from user parameter - + Returns ------- adaptive_lr : float @@ -1719,20 +1791,22 @@ def _compute_adaptive_learning_rate(self, iboost, estimator_error, base_learning # Exponential decay: reduce learning rate as we progress # Factor starts at 1.0 and decays to ~0.7 over all iterations iteration_decay = 1.0 - (iboost / max(self.n_estimators, 1)) * 0.3 - + # Error-based adjustment: lower rate for high error estimators # High error (0.5) -> factor ~0.57, Low error (0.0) -> factor 1.0 error_factor = 1.0 / (1.0 + estimator_error * 1.5) - + # Combine factors adaptive_lr = base_learning_rate * iteration_decay * error_factor - + # Clamp to reasonable range: at least 0.01, at most base_learning_rate adaptive_lr = np.clip(adaptive_lr, 0.01, base_learning_rate) - + return adaptive_lr - def _boost(self, iboost, X, y, sample_weight, random_state, return_oob_indices=False): + def _boost( + self, iboost, X, y, sample_weight, random_state, return_oob_indices=False + ): """ Implement a single boost using precomputed kernel matrix or raw features. @@ -1746,31 +1820,36 @@ def _boost(self, iboost, X, y, sample_weight, random_state, return_oob_indices=F """ estimator = self._make_estimator(random_state=random_state) oob_indices = None - + # Apply subsampling if enabled # Note: For exact kernels (precomputed kernel matrices), subsampling is skipped # because it would require tracking subsample indices per estimator for correct prediction - is_exact_kernel = (X.shape[0] == X.shape[1] and X.shape[0] == y.shape[0] and - not self._use_kernel_approx() and self.kernel != "linear") - + is_exact_kernel = ( + X.shape[0] == X.shape[1] + and X.shape[0] == y.shape[0] + and not self._use_kernel_approx() + and self.kernel != "linear" + ) + if self.subsample < 1.0 and not is_exact_kernel: n_samples = X.shape[0] n_subsample = max(1, int(self.subsample * n_samples)) - + # Use stratified sampling to maintain class distribution from sklearn.model_selection import StratifiedShuffleSplit + splitter = StratifiedShuffleSplit( n_splits=1, train_size=n_subsample, - random_state=random_state.randint(0, 2**31 - 1) + random_state=random_state.randint(0, 2**31 - 1), ) subsample_idx, _ = next(splitter.split(X, y)) - + # Track OOB indices if requested if return_oob_indices: all_indices = np.arange(n_samples) oob_indices = np.setdiff1d(all_indices, subsample_idx) - + # Subsample data and weights (for feature matrices, subsample rows only) X_subsample = X[subsample_idx] y_subsample = y[subsample_idx] @@ -1780,9 +1859,11 @@ def _boost(self, iboost, X, y, sample_weight, random_state, return_oob_indices=F sample_weight_subsample /= sample_weight_subsample.sum() else: sample_weight_subsample = None - + # Fit estimator on subsampled data - estimator.fit(X_subsample, y_subsample, sample_weight=sample_weight_subsample) + estimator.fit( + X_subsample, y_subsample, sample_weight=sample_weight_subsample + ) else: # No subsampling - use all data estimator.fit(X, y, sample_weight=sample_weight) @@ -1811,19 +1892,19 @@ def _boost(self, iboost, X, y, sample_weight, random_state, return_oob_indices=F adaptive_lr = self._compute_adaptive_learning_rate( iboost, estimator_error, self.learning_rate ) - + # Compute F1 score for this estimator to inform weight calculation # This aligns estimator weighting with F1 optimization 
target - f1 = f1_score(y, y_pred, sample_weight=sample_weight, average='weighted') - + f1 = f1_score(y, y_pred, sample_weight=sample_weight, average="weighted") + # F1 bonus: reward estimators with good F1 performance # Scale: 0.5 F1 -> 1.0x multiplier, 1.0 F1 -> 1.2x multiplier # This ensures estimators contributing to F1 get higher weights f1_bonus = 1.0 + (f1 - 0.5) * 0.6 - + # Compute base weight from error rate base_weight = np.log((1 - estimator_error) / max(estimator_error, 1e-10)) - + # Apply F1 bonus to estimator weight estimator_weight = self.shrinkage * adaptive_lr * base_weight * f1_bonus @@ -1832,18 +1913,22 @@ def _boost(self, iboost, X, y, sample_weight, random_state, return_oob_indices=F # This gives higher weight boosts to minority class samples when misclassified unique_classes, class_counts = np.unique(y, return_counts=True) class_freq = class_counts / len(y) - class_weights = {cls: 1.0 / freq for cls, freq in zip(unique_classes, class_freq)} - + class_weights = { + cls: 1.0 / freq for cls, freq in zip(unique_classes, class_freq) + } + # Apply class-aware weight updates (minority class gets higher boost) for cls in unique_classes: cls_mask = y == cls cls_weight = class_weights[cls] # Inverse frequency weighting sample_weight[cls_mask] = np.exp( np.log(sample_weight[cls_mask] + 1e-10) - + estimator_weight * incorrect[cls_mask] * cls_weight + + estimator_weight + * incorrect[cls_mask] + * cls_weight * (sample_weight[cls_mask] > 0) ) - + # Normalize to prevent numerical issues sample_weight /= np.sum(sample_weight) @@ -1875,19 +1960,19 @@ def _boost(self, iboost, X, y, sample_weight, random_state, return_oob_indices=F adaptive_lr = self._compute_adaptive_learning_rate( iboost, estimator_error, self.learning_rate ) - + # Compute F1 score for this estimator to inform weight calculation # This aligns estimator weighting with F1 optimization target - f1 = f1_score(y, y_pred, sample_weight=sample_weight, average='weighted') - + f1 = f1_score(y, y_pred, sample_weight=sample_weight, average="weighted") + # F1 bonus: reward estimators with good F1 performance # Scale: 0.5 F1 -> 1.0x multiplier, 1.0 F1 -> 1.2x multiplier # This ensures estimators contributing to F1 get higher weights f1_bonus = 1.0 + (f1 - 0.5) * 0.6 - + # Compute base weight from error rate base_weight = np.log((1.0 - estimator_error) / max(estimator_error, 1e-10)) - + # Apply F1 bonus to estimator weight estimator_weight = self.shrinkage * adaptive_lr * base_weight * f1_bonus @@ -1895,8 +1980,10 @@ def _boost(self, iboost, X, y, sample_weight, random_state, return_oob_indices=F # This gives higher weight boosts to minority class samples when misclassified unique_classes, class_counts = np.unique(y, return_counts=True) class_freq = class_counts / len(y) - class_weights = {cls: 1.0 / freq for cls, freq in zip(unique_classes, class_freq)} - + class_weights = { + cls: 1.0 / freq for cls, freq in zip(unique_classes, class_freq) + } + # Apply class-aware weight updates (minority class gets higher boost) for cls in unique_classes: cls_mask = y == cls @@ -2005,7 +2092,7 @@ def predict(self, X): if self.boosting_type == "gradient": check_is_fitted(self) X_transformed = self.scaler_.transform(X) - + # Transform data based on kernel type if self.kernel == "linear": test_data = X_transformed @@ -2015,9 +2102,9 @@ def predict(self, X): # For exact kernels, pass transformed features - kernel will be computed # inside _gradient_predict for each estimator test_data = X_transformed - + return self._gradient_predict(test_data) - + 
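Putting the weighting heuristics above together in a standalone snippet: the iteration- and error-based adaptive learning rate, and the F1 bonus applied on top of the usual log((1 - err) / err) weight. The concrete numbers are invented toy values:

```python
# Estimator-weight heuristics (illustrative sketch mirroring the formulas above).
import numpy as np

learning_rate, shrinkage = 1.0, 0.9
iboost, n_estimators = 10, 100
estimator_error, f1 = 0.2, 0.85

iteration_decay = 1.0 - (iboost / max(n_estimators, 1)) * 0.3   # decays toward ~0.7
error_factor = 1.0 / (1.0 + estimator_error * 1.5)              # lower rate for noisier learners
adaptive_lr = np.clip(learning_rate * iteration_decay * error_factor, 0.01, learning_rate)

f1_bonus = 1.0 + (f1 - 0.5) * 0.6                               # grows linearly with F1 above 0.5
base_weight = np.log((1 - estimator_error) / max(estimator_error, 1e-10))
estimator_weight = shrinkage * adaptive_lr * base_weight * f1_bonus
```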
pred = self.decision_function(X) if self.n_classes_ == 2: @@ -2047,7 +2134,7 @@ def predict_proba(self, X): if self.boosting_type == "gradient": check_is_fitted(self) X_transformed = self.scaler_.transform(X) - + # Transform data based on kernel type if self.kernel == "linear": test_data = X_transformed @@ -2057,8 +2144,8 @@ def predict_proba(self, X): # For exact kernels, pass transformed features - kernel will be computed # inside _gradient_predict_proba for each estimator test_data = X_transformed - + return self._gradient_predict_proba(test_data) - + # For AdaBoost, use parent implementation - return super().predict_proba(X) \ No newline at end of file + return super().predict_proba(X) diff --git a/src/linearboost/sefr.py b/src/linearboost/sefr.py index 69a0c42..1a328c1 100644 --- a/src/linearboost/sefr.py +++ b/src/linearboost/sefr.py @@ -283,7 +283,7 @@ def fit(self, X, y, sample_weight=None) -> Self: # Validate sample weights pos_labels = y_ == 1 neg_labels = y_ == 0 - + pos_sample_weight, neg_sample_weight = None, None if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
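Finally, a hypothetical end-to-end usage of the gradient-boosting mode introduced by this patch, assuming the parameters behave as documented above and that the classifier is importable from the top-level package:

```python
# Hypothetical usage sketch; import path and behavior assumed from this patch.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from linearboost import LinearBoostClassifier

X, y = make_classification(n_samples=1000, weights=[0.85, 0.15], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

clf = LinearBoostClassifier(
    n_estimators=300,
    boosting_type="gradient",   # fit estimators to pseudo-residuals
    subsample=0.8,              # stochastic boosting; enables OOB-based early stopping
    shrinkage=0.9,
    early_stopping=True,
    n_iter_no_change=5,
)
clf.fit(X_tr, y_tr)
proba = clf.predict_proba(X_te)
```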