From 8230bd46990752b7ce3f4684428e0f2689e2b4ea Mon Sep 17 00:00:00 2001 From: Thierry RAMORASOAVINA Date: Thu, 22 Jan 2026 16:54:02 +0100 Subject: [PATCH 1/2] Add the `n_feature_parts` parameter to the supervised estimators - KhiopsClassifier, KhiopsRegressor and KhiopsEncoder --- CHANGELOG.md | 5 +++++ khiops/sklearn/estimators.py | 28 +++++++++++++++++++++++++++- tests/test_sklearn.py | 15 +++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc440647..58ed6a29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,11 @@ - Example: 10.2.1.4 is the 5th version that supports khiops 10.2.1. - Internals: Changes in *Internals* sections are unlikely to be of interest for data scientists. +## Unreleased + +### Added +- (`sklearn`) `n_feature_parts` parameter to the supervised estimators + ## 11.0.0.2 - 2026-01-26 ## Fixed diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 3ad6776b..f1605dfc 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -1195,6 +1195,7 @@ def __init__( specific_pairs=None, all_possible_pairs=True, construction_rules=None, + n_feature_parts=0, verbose=False, output_dir=None, auto_sort=True, @@ -1211,6 +1212,7 @@ def __init__( self.specific_pairs = specific_pairs self.all_possible_pairs = all_possible_pairs self.construction_rules = construction_rules + self.n_feature_parts = n_feature_parts self._original_target_dtype = None self._predicted_target_meta_data_tag = None self._khiops_baseline_model_prefix = None @@ -1300,6 +1302,12 @@ def _fit_check_params(self, ds, **kwargs): for rule in self.construction_rules: if not isinstance(rule, str): raise TypeError(type_error_message(rule, rule, str)) + if not isinstance(self.n_feature_parts, int): + raise TypeError( + type_error_message("n_feature_parts", self.n_feature_parts, int) + ) + if self.n_feature_parts < 0: + raise ValueError("'n_feature_parts' must be positive") def _fit_train_model(self, ds, computation_dir, **kwargs): # Train the model with Khiops @@ -1384,6 +1392,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): kwargs["max_trees"] = kwargs.pop("n_trees") kwargs["max_text_features"] = kwargs.pop("n_text_features") kwargs["text_features"] = kwargs.pop("type_text_features") + kwargs["max_parts"] = kwargs.pop("n_feature_parts") # Add the additional_data_tables parameter kwargs["additional_data_tables"] = additional_data_tables @@ -1513,6 +1522,7 @@ def __init__( specific_pairs=None, all_possible_pairs=True, construction_rules=None, + n_feature_parts=0, verbose=False, output_dir=None, auto_sort=True, @@ -1525,6 +1535,7 @@ def __init__( specific_pairs=specific_pairs, all_possible_pairs=all_possible_pairs, construction_rules=construction_rules, + n_feature_parts=n_feature_parts, verbose=verbose, output_dir=output_dir, auto_sort=auto_sort, @@ -1685,7 +1696,10 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): construction_rules : list of str, optional Allowed rules for the automatic feature construction. If not set, Khiops uses the multi-table construction rules listed in - `kh.DEFAULT_CONSTRUCTION_RULES ` + `kh.DEFAULT_CONSTRUCTION_RULES `. + n_feature_parts : int, default 0 + Maximum number of variable parts produced by preprocessing methods. If equal + to 0 it is automatically calculated. group_target_value : bool, default ``False`` Allows grouping of the target values in classification. It can substantially increase the training time. @@ -1744,6 +1758,7 @@ def __init__( specific_pairs=None, all_possible_pairs=True, construction_rules=None, + n_feature_parts=0, group_target_value=False, verbose=False, output_dir=None, @@ -1757,6 +1772,7 @@ def __init__( n_selected_features=n_selected_features, n_evaluated_features=n_evaluated_features, construction_rules=construction_rules, + n_feature_parts=n_feature_parts, verbose=verbose, output_dir=output_dir, auto_sort=auto_sort, @@ -2086,6 +2102,9 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): Allowed rules for the automatic feature construction. If not set, Khiops uses the multi-table construction rules listed in `kh.DEFAULT_CONSTRUCTION_RULES `. + n_feature_parts : int, default 0 + Maximum number of variable parts produced by preprocessing methods. If equal + to 0 it is automatically calculated. verbose : bool, default ``False`` If ``True`` it prints debug information and it does not erase temporary files when fitting, predicting or transforming. @@ -2129,6 +2148,7 @@ def __init__( n_selected_features=0, n_evaluated_features=0, construction_rules=None, + n_feature_parts=0, verbose=False, output_dir=None, auto_sort=True, @@ -2141,6 +2161,7 @@ def __init__( n_selected_features=n_selected_features, n_evaluated_features=n_evaluated_features, construction_rules=construction_rules, + n_feature_parts=n_feature_parts, verbose=verbose, output_dir=output_dir, auto_sort=auto_sort, @@ -2296,6 +2317,9 @@ class KhiopsEncoder(TransformerMixin, KhiopsSupervisedEstimator): Allowed rules for the automatic feature construction. If not set, Khiops uses the multi-table construction rules listed in `kh.DEFAULT_CONSTRUCTION_RULES `. + n_feature_parts : int, default 0 + Maximum number of variable parts produced by preprocessing methods. If equal + to 0 it is automatically calculated. informative_features_only : bool, default ``True`` If ``True`` keeps only informative features. group_target_value : bool, default ``False`` @@ -2374,6 +2398,7 @@ def __init__( specific_pairs=None, all_possible_pairs=True, construction_rules=None, + n_feature_parts=0, informative_features_only=True, group_target_value=False, keep_initial_variables=False, @@ -2390,6 +2415,7 @@ def __init__( n_text_features=n_text_features, type_text_features=type_text_features, construction_rules=construction_rules, + n_feature_parts=n_feature_parts, verbose=verbose, output_dir=output_dir, auto_sort=auto_sort, diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 1c515652..c8312fca 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -762,6 +762,7 @@ def setUpClass(cls): "specific_pairs": [("age", "race")], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "max_parts": 3, "group_target_value": False, "additional_data_tables": {}, } @@ -790,6 +791,7 @@ def setUpClass(cls): "max_selected_variables": 1, "max_evaluated_variables": 3, "construction_rules": ["TableMode", "TableSelection"], + "max_parts": 5, "additional_data_tables": {}, } }, @@ -818,6 +820,7 @@ def setUpClass(cls): "specific_pairs": [("age", "race")], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "max_parts": 7, "informative_variables_only": True, "group_target_value": False, "keep_initial_categorical_variables": False, @@ -860,6 +863,7 @@ def setUpClass(cls): "specific_pairs": [], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "max_parts": 4, "group_target_value": False, "additional_data_tables": {"SpliceJunctionDNA"}, } @@ -889,6 +893,7 @@ def setUpClass(cls): "max_selected_variables": 1, "max_evaluated_variables": 3, "construction_rules": ["TableMode", "TableSelection"], + "max_parts": 6, "additional_data_tables": {"SpliceJunctionDNA"}, } }, @@ -918,6 +923,7 @@ def setUpClass(cls): "specific_pairs": [], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "max_parts": 8, "informative_variables_only": True, "group_target_value": False, "keep_initial_categorical_variables": False, @@ -1435,6 +1441,7 @@ def test_parameter_transfer_classifier_fit_from_monotable_dataframe(self): "specific_pairs": [("age", "race")], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "n_feature_parts": 3, "group_target_value": False, }, ) @@ -1458,6 +1465,7 @@ def test_parameter_transfer_classifier_fit_from_monotable_dataframe_with_df_y( "specific_pairs": [("age", "race")], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "n_feature_parts": 3, "group_target_value": False, }, ) @@ -1480,6 +1488,7 @@ def test_parameter_transfer_classifier_fit_from_multitable_dataframe(self): "specific_pairs": [], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "n_feature_parts": 4, "group_target_value": False, }, ) @@ -1517,6 +1526,7 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe(self): "specific_pairs": [("age", "race")], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "n_feature_parts": 7, "informative_features_only": True, "group_target_value": False, "keep_initial_variables": False, @@ -1543,6 +1553,7 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe_with_df_y( "specific_pairs": [("age", "race")], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "n_feature_parts": 7, "informative_features_only": True, "group_target_value": False, "keep_initial_variables": False, @@ -1568,6 +1579,7 @@ def test_parameter_transfer_encoder_fit_from_multitable_dataframe(self): "specific_pairs": [], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], + "n_feature_parts": 8, "informative_features_only": True, "group_target_value": False, "keep_initial_variables": False, @@ -1608,6 +1620,7 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe(self): "n_text_features": 300000, "type_text_features": "ngrams", "construction_rules": ["TableMode", "TableSelection"], + "n_feature_parts": 5, }, ) @@ -1626,6 +1639,7 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe_with_df_y( "n_text_features": 300000, "type_text_features": "ngrams", "construction_rules": ["TableMode", "TableSelection"], + "n_feature_parts": 5, }, ) @@ -1644,6 +1658,7 @@ def test_parameter_transfer_regressor_fit_from_multitable_dataframe(self): "n_selected_features": 1, "n_evaluated_features": 3, "construction_rules": ["TableMode", "TableSelection"], + "n_feature_parts": 6, }, ) From 127535658c1ce4294c15b1b2e81517d25051d97d Mon Sep 17 00:00:00 2001 From: Thierry RAMORASOAVINA Date: Thu, 12 Feb 2026 18:21:12 +0100 Subject: [PATCH 2/2] Sklearn : rephrase error messages to be more precise --- khiops/sklearn/estimators.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index f1605dfc..1af5d06f 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -180,7 +180,7 @@ def _check_pair_parameters(estimator): if not isinstance(estimator.n_pairs, int): raise TypeError(type_error_message("n_pairs", estimator.n_pairs, int)) if estimator.n_pairs < 0: - raise ValueError("'n_pairs' must be positive") + raise ValueError("'n_pairs' must be non-negative") if estimator.specific_pairs is not None: if not is_list_like(estimator.specific_pairs): raise TypeError( @@ -955,7 +955,7 @@ def _simplify( type_error_message("'max_part_numbers' values", value, int) ) elif value < 0: - raise ValueError("'max_part_numbers' values must be positive") + raise ValueError("'max_part_numbers' values must be non-negative") # Create temporary directory and tables computation_dir = self._create_computation_dir("simplify") output_dir = self._get_output_dir(computation_dir) @@ -1272,17 +1272,17 @@ def _fit_check_params(self, ds, **kwargs): if not isinstance(self.n_features, int): raise TypeError(type_error_message("n_features", self.n_features, int)) if self.n_features < 0: - raise ValueError("'n_features' must be positive") + raise ValueError("'n_features' must be non-negative") if not isinstance(self.n_trees, int): raise TypeError(type_error_message("n_trees", self.n_trees, int)) if self.n_trees < 0: - raise ValueError("'n_trees' must be positive") + raise ValueError("'n_trees' must be non-negative") if not isinstance(self.n_text_features, int): raise TypeError( type_error_message("n_text_features", self.n_text_features, int) ) if self.n_text_features < 0: - raise ValueError("'n_text_features' must be positive") + raise ValueError("'n_text_features' must be non-negative") if not isinstance(self.type_text_features, str): raise TypeError( type_error_message("type_text_features", self.type_text_features, str) @@ -1307,7 +1307,7 @@ def _fit_check_params(self, ds, **kwargs): type_error_message("n_feature_parts", self.n_feature_parts, int) ) if self.n_feature_parts < 0: - raise ValueError("'n_feature_parts' must be positive") + raise ValueError("'n_feature_parts' must be non-negative") def _fit_train_model(self, ds, computation_dir, **kwargs): # Train the model with Khiops @@ -1635,9 +1635,9 @@ def _fit_check_params(self, ds, **kwargs): # Check estimator parameters if self.n_evaluated_features < 0: - raise ValueError("'n_evaluated_features' must be positive") + raise ValueError("'n_evaluated_features' must be non-negative") if self.n_selected_features < 0: - raise ValueError("'n_selected_features' must be positive") + raise ValueError("'n_selected_features' must be non-negative") # Note: scikit-learn **requires** inherit first the mixins and then other classes