diff --git a/doc/requirements.txt b/doc/requirements.txt index 68b7011a..fb67912d 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -4,6 +4,6 @@ ipykernel>=6.9.1 nbconvert==6.4.4 nbformat==5.3.0 numpydoc>=1.5.0 -pandas>=0.25.3,<=2.3.3 +pandas>=2.3.3,<=4.0.0 scikit-learn>=1.7.2,<1.9.0 sphinx-copybutton>=0.5.0 diff --git a/khiops/sklearn/dataset.py b/khiops/sklearn/dataset.py index de1b46e1..eeff05e0 100644 --- a/khiops/sklearn/dataset.py +++ b/khiops/sklearn/dataset.py @@ -13,7 +13,7 @@ import numpy as np import pandas as pd -import sklearn +from pandas.core.dtypes.common import is_numeric_dtype, is_string_dtype from scipy import sparse as sp from sklearn.utils import check_array from sklearn.utils.validation import column_or_1d @@ -33,6 +33,12 @@ # pylint --disable=all --enable=invalid-names dataset.py # pylint: disable=invalid-name +# Set a special pandas option to force the new string data type (`StringDType`) +# even for version 2.0, which is still required for Python 3.10. +# This new string data type no longer maps to the corresponding numpy one +# and will break the code unless special care is taken +pd.options.future.infer_string = True + def check_dataset_spec(ds_spec): """Checks that a dataset spec is valid @@ -393,16 +399,19 @@ def write_internal_data_table(dataframe, file_path_or_stream): def _column_or_1d_with_dtype(y, dtype=None): - # 'dtype' has been introduced on `column_or_1d' since Scikit-learn 1.2; - if sklearn.__version__ < "1.2": - if pd.api.types.is_string_dtype(dtype) and y.isin(["True", "False"]).all(): - warnings.warn( - "'y' stores strings restricted to 'True'/'False' values: " - "The predict method may return a bool vector." - ) - return column_or_1d(y, warn=True) - else: - return column_or_1d(y, warn=True, dtype=dtype) + """Checks that the data is of the provided `dtype`. 
+ If a problem is detected, a warning is printed or an error raised, + otherwise the pandas object is transformed into a numpy.ndarray + """ + + # Since pandas 3.0 (and even in 2.0 if the option is activated) + # a new StringDType is used to handle strings. + # It no longer matches the one recognized by numpy. + # We need to force the translation to the numpy dtype + # whenever a pandas string is detected (`is_string_dtype` returns `True`). + if is_string_dtype(dtype): + dtype = np.dtype(str) + return column_or_1d(y, warn=True, dtype=dtype) class Dataset: @@ -965,21 +974,23 @@ def __init__(self, name, dataframe, key=None): # Initialize feature columns and verify their types self.column_ids = self.data_source.columns.values - if not np.issubdtype(self.column_ids.dtype, np.integer): - if np.issubdtype(self.column_ids.dtype, object): - for i, column_id in enumerate(self.column_ids): - if not isinstance(column_id, str): - raise TypeError( - f"Dataframe column ids must be either all integers or " - f"all strings. Column id at index {i} ('{column_id}') is" - f" of type '{type(column_id).__name__}'" - ) - else: - raise TypeError( - f"Dataframe column ids must be either all integers or " - f"all strings. The column index has dtype " - f"'{self.column_ids.dtype}'" - ) + # Ensure the feature columns are either all string + # or all numeric but not a mix of both. + # Warning: the new pandas string data type (`StringDType`) + # - by default in pandas 3.0 or forced in pandas 2.0 - + # cannot be evaluated by `np.issubdtype`; any attempt will raise an error. + if not is_numeric_dtype(self.column_ids) and not is_string_dtype( + self.column_ids + ): + previous_type = None + for i, column_id in enumerate(self.column_ids): + if previous_type is not None and type(column_id) != previous_type: + raise TypeError( + f"Dataframe column ids must be either all integers or " + f"all strings. 
Column id at index {i} ('{column_id}') is" + f" of type '{type(column_id).__name__}'" + ) + previous_type = type(column_id) # Initialize Khiops types self.khiops_types = {} @@ -988,7 +999,8 @@ def __init__(self, name, dataframe, key=None): column_numpy_type = column.dtype column_max_size = None if isinstance(column_numpy_type, pd.StringDtype): - column_max_size = column.str.len().max() + # Warning pandas.Series.str.len() returns a float64 + column_max_size = int(column.str.len().max()) self.khiops_types[column_id] = get_khiops_type( column_numpy_type, column_max_size ) @@ -1161,7 +1173,7 @@ def __init__(self, name, matrix, key=None): raise TypeError( type_error_message("matrix", matrix, "scipy.sparse.spmatrix") ) - if not np.issubdtype(matrix.dtype, np.number): + if not is_numeric_dtype(matrix.dtype): raise TypeError( type_error_message("'matrix' dtype", matrix.dtype, "numeric") ) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 157f20c3..9567ac1c 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -2021,7 +2021,7 @@ def predict_proba(self, X): y_probas, (pd.DataFrame, np.ndarray) ), "y_probas is not a Pandas DataFrame nor Numpy array" y_probas = y_probas.reindex( - self._sorted_prob_variable_names(), axis=1, copy=False + self._sorted_prob_variable_names(), axis=1 ).to_numpy(copy=False) assert isinstance(y_probas, (str, np.ndarray)), "Expected str or np.ndarray" @@ -2265,7 +2265,7 @@ def predict(self, X): # Transform to np.ndarray if isinstance(y_pred, pd.DataFrame): - y_pred = y_pred.astype("float64", copy=False).to_numpy(copy=False).ravel() + y_pred = y_pred.astype("float64").to_numpy(copy=False).ravel() assert isinstance(y_pred, (str, np.ndarray)), "Expected str or np.array" return y_pred diff --git a/packaging/conda/meta.yaml b/packaging/conda/meta.yaml index b81c7656..afe252e2 100644 --- a/packaging/conda/meta.yaml +++ b/packaging/conda/meta.yaml @@ -24,7 +24,7 @@ requirements: run: - python - 
khiops-core =11.0.0 - - pandas >=0.25.3,<=2.3.3 + - pandas >=2.3.3,<=4.0.0 - scikit-learn>=1.7.2,<1.9.0 run_constrained: # do not necessary use the latest version diff --git a/pyproject.toml b/pyproject.toml index fd34aa0d..89cc730f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,7 +105,7 @@ classifiers = [ requires-python = ">=3.8" dependencies = [ # do not use the latest versions, to avoid undesired breaking changes - "pandas>=0.25.3,<=2.3.3", + "pandas>=2.3.3,<=4.0.0", "scikit-learn>=1.7.2,<1.9.0", ] diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index b2f2b35c..2addd20d 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -74,18 +74,24 @@ def create_monotable_dataframe(self): 1077, 1077, ], + # Since pandas 3.0 the default precision for parsing a datetime + # is now microseconds (us) instead of nanoseconds (ns) + # unless enough precision is given. + # Unfortunately only the changelog states this, not the docstring. + # To avoid any comparison error in tests + # we need to add the required precision to the datetime "Date": pd.to_datetime( [ - "2019-03-22", - "2019-03-23", - "2019-03-24", - "2019-03-25", - "2019-03-26", - "2019-03-27", - "2019-03-28", - "2019-03-29", - "2019-03-30", - "2019-03-31", + "2019-03-22 00:00:00.123456789", + "2019-03-23 00:00:00.123456789", + "2019-03-24 00:00:00.123456789", + "2019-03-25 00:00:00.123456789", + "2019-03-26 00:00:00.123456789", + "2019-03-27 00:00:00.123456789", + "2019-03-28 00:00:00.123456789", + "2019-03-29 00:00:00.123456789", + "2019-03-30 00:00:00.123456789", + "2019-03-31 00:00:00.123456789", ], ), "New": [ @@ -499,6 +505,12 @@ def test_out_file_from_dataframe_monotable(self): out_table = pd.read_csv(out_table_path, sep="\t", dtype={"Title": "string"}) # Cast "Date" columns to datetime as we don't automatically recognize dates + # Since pandas 3.0 the default precision for parsing a datetime + # is now microseconds (us) instead of nanoseconds (ns) + # 
unless enough precision is given. + # Unfortunately only the changelog states this, not the docstring. + # To avoid any comparison error in tests + # we need to add the required precision to the datetime out_table["Date"] = out_table["Date"].astype("datetime64[ns]") ref_table = spec["main_table"][0] ref_table["class"] = y diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py index 41b39efa..c1e99ed3 100644 --- a/tests/test_dataset_errors.py +++ b/tests/test_dataset_errors.py @@ -595,6 +595,6 @@ def test_pandas_table_column_ids_must_all_be_int_or_str(self): output_error_msg = str(context.exception) expected_msg = ( "Dataframe column ids must be either all integers or all " - "strings. Column id at index 0 ('1') is of type 'int'" + "strings. Column id at index 1 ('Age') is of type 'str'" ) self.assertEqual(output_error_msg, expected_msg) diff --git a/tests/test_helper.py b/tests/test_helper.py index f328bac1..12828819 100644 --- a/tests/test_helper.py +++ b/tests/test_helper.py @@ -319,8 +319,11 @@ def prepare_data(data, target_variable, primary_table=None, y_as_dataframe=False data, test_size=0.3, random_state=1, shuffle=False ) - y_test = data_test[target_variable] - y_train = data_train[target_variable] + # Since pandas 3.0, numbers in an array that contain a carriage return + # are lazily and wrongly inferred as `object` dtype instead of `int64`; + # forcing pandas to call `infer_objects` fixes the error + y_test = data_test.infer_objects()[target_variable] + y_train = data_train.infer_objects()[target_variable] # Create training labels as single-column dataframe if y_as_dataframe: