Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ ipykernel>=6.9.1
nbconvert==6.4.4
nbformat==5.3.0
numpydoc>=1.5.0
pandas>=0.25.3,<=2.3.3
pandas>=2.3.3,<=4.0.0
scikit-learn>=1.7.2,<1.9.0
sphinx-copybutton>=0.5.0
68 changes: 40 additions & 28 deletions khiops/sklearn/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import numpy as np
import pandas as pd
import sklearn
from pandas.core.dtypes.common import is_numeric_dtype, is_string_dtype
from scipy import sparse as sp
from sklearn.utils import check_array
from sklearn.utils.validation import column_or_1d
Expand All @@ -33,6 +33,12 @@
# pylint --disable=all --enable=invalid-names dataset.py
# pylint: disable=invalid-name

# Set a special pandas option to force the new string data type (`StringDtype`)
# even on pandas 2.x, which is still required to support Python 3.10.
# This new string data type no longer maps to the corresponding numpy one
# and will break the code unless special care is taken.
pd.options.future.infer_string = True


def check_dataset_spec(ds_spec):
"""Checks that a dataset spec is valid
Expand Down Expand Up @@ -393,16 +399,19 @@ def write_internal_data_table(dataframe, file_path_or_stream):


def _column_or_1d_with_dtype(y, dtype=None):
    """Check that `y` is column-like and convert it to a 1D numpy array.

    If a problem is detected a warning is printed or an error raised,
    otherwise the pandas object is transformed into a `numpy.ndarray`.

    Parameters
    ----------
    y : array-like
        Column data to validate and ravel.
    dtype : dtype-like, optional
        Target dtype for the output array.

    Returns
    -------
    numpy.ndarray
        The input flattened to a 1D array of the (possibly translated) dtype.
    """
    # Since pandas 3.0 (and even in 2.0 if the option is activated)
    # a new StringDtype is used to handle strings.
    # It does not match any longer the one recognized by numpy.
    # We need to force the translation to the numpy dtype
    # whenever a pandas string is detected (`is_string_dtype` returns `True`).
    if is_string_dtype(dtype):
        dtype = np.dtype(str)
    return column_or_1d(y, warn=True, dtype=dtype)


class Dataset:
Expand Down Expand Up @@ -965,21 +974,23 @@ def __init__(self, name, dataframe, key=None):

# Initialize feature columns and verify their types
self.column_ids = self.data_source.columns.values
if not np.issubdtype(self.column_ids.dtype, np.integer):
if np.issubdtype(self.column_ids.dtype, object):
for i, column_id in enumerate(self.column_ids):
if not isinstance(column_id, str):
raise TypeError(
f"Dataframe column ids must be either all integers or "
f"all strings. Column id at index {i} ('{column_id}') is"
f" of type '{type(column_id).__name__}'"
)
else:
raise TypeError(
f"Dataframe column ids must be either all integers or "
f"all strings. The column index has dtype "
f"'{self.column_ids.dtype}'"
)
# Ensure the feature columns are either all string
# or all numeric but not a mix of both.
# Warning : the new pandas string data type (`StringDType`)
# - by default in pandas 3.0 or forced in pandas 2.0 -
# cannot be evaluated by `np.issubdtype`, any attempt will raise an error.
if not is_numeric_dtype(self.column_ids) and not is_string_dtype(
self.column_ids
):
previous_type = None
for i, column_id in enumerate(self.column_ids):
if previous_type is not None and type(column_id) != previous_type:
raise TypeError(
f"Dataframe column ids must be either all integers or "
f"all strings. Column id at index {i} ('{column_id}') is"
f" of type '{type(column_id).__name__}'"
)
previous_type = type(column_id)

# Initialize Khiops types
self.khiops_types = {}
Expand All @@ -988,7 +999,8 @@ def __init__(self, name, dataframe, key=None):
column_numpy_type = column.dtype
column_max_size = None
if isinstance(column_numpy_type, pd.StringDtype):
column_max_size = column.str.len().max()
# Warning: pandas.Series.str.len() returns a float64, so cast it to int
column_max_size = int(column.str.len().max())
self.khiops_types[column_id] = get_khiops_type(
column_numpy_type, column_max_size
)
Expand Down Expand Up @@ -1161,7 +1173,7 @@ def __init__(self, name, matrix, key=None):
raise TypeError(
type_error_message("matrix", matrix, "scipy.sparse.spmatrix")
)
if not np.issubdtype(matrix.dtype, np.number):
if not is_numeric_dtype(matrix.dtype):
raise TypeError(
type_error_message("'matrix' dtype", matrix.dtype, "numeric")
)
Expand Down
4 changes: 2 additions & 2 deletions khiops/sklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2021,7 +2021,7 @@ def predict_proba(self, X):
y_probas, (pd.DataFrame, np.ndarray)
), "y_probas is not a Pandas DataFrame nor Numpy array"
y_probas = y_probas.reindex(
self._sorted_prob_variable_names(), axis=1, copy=False
self._sorted_prob_variable_names(), axis=1
).to_numpy(copy=False)

assert isinstance(y_probas, (str, np.ndarray)), "Expected str or np.ndarray"
Expand Down Expand Up @@ -2265,7 +2265,7 @@ def predict(self, X):

# Transform to np.ndarray
if isinstance(y_pred, pd.DataFrame):
y_pred = y_pred.astype("float64", copy=False).to_numpy(copy=False).ravel()
y_pred = y_pred.astype("float64").to_numpy(copy=False).ravel()

assert isinstance(y_pred, (str, np.ndarray)), "Expected str or np.array"
return y_pred
Expand Down
2 changes: 1 addition & 1 deletion packaging/conda/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ requirements:
run:
- python
- khiops-core =11.0.0
- pandas >=0.25.3,<=2.3.3
- pandas >=2.3.3,<=4.0.0
- scikit-learn>=1.7.2,<1.9.0
run_constrained:
# do not necessarily use the latest version
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ classifiers = [
requires-python = ">=3.8"
dependencies = [
# do not use the latest versions, to avoid undesired breaking changes
"pandas>=0.25.3,<=2.3.3",
"pandas>=2.3.3,<=4.0.0",
"scikit-learn>=1.7.2,<1.9.0",
]

Expand Down
32 changes: 22 additions & 10 deletions tests/test_dataset_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,18 +74,24 @@ def create_monotable_dataframe(self):
1077,
1077,
],
# Since pandas 3.0 the default precision for parsing a datetime
# is now microseconds (us) instead of nanoseconds (ns)
# unless enough precision is given.
# Unfortunately only the changelog states this, not the docstring.
# To avoid any comparison error in tests
# we need to add the required precision to the datetime
"Date": pd.to_datetime(
[
"2019-03-22",
"2019-03-23",
"2019-03-24",
"2019-03-25",
"2019-03-26",
"2019-03-27",
"2019-03-28",
"2019-03-29",
"2019-03-30",
"2019-03-31",
"2019-03-22 00:00:00.123456789",
"2019-03-23 00:00:00.123456789",
"2019-03-24 00:00:00.123456789",
"2019-03-25 00:00:00.123456789",
"2019-03-26 00:00:00.123456789",
"2019-03-27 00:00:00.123456789",
"2019-03-28 00:00:00.123456789",
"2019-03-29 00:00:00.123456789",
"2019-03-30 00:00:00.123456789",
"2019-03-31 00:00:00.123456789",
],
),
"New": [
Expand Down Expand Up @@ -499,6 +505,12 @@ def test_out_file_from_dataframe_monotable(self):
out_table = pd.read_csv(out_table_path, sep="\t", dtype={"Title": "string"})

# Cast "Date" columns to datetime as we don't automatically recognize dates
# Since pandas 3.0 the default precision for parsing a datetime
# is now microseconds (us) instead of nanoseconds (ns)
# unless enough precision is given.
# Unfortunately only the changelog states this, not the docstring.
# To avoid any comparison error in tests
# we need to add the required precision to the datetime
out_table["Date"] = out_table["Date"].astype("datetime64[ns]")
ref_table = spec["main_table"][0]
ref_table["class"] = y
Expand Down
2 changes: 1 addition & 1 deletion tests/test_dataset_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,6 @@ def test_pandas_table_column_ids_must_all_be_int_or_str(self):
output_error_msg = str(context.exception)
expected_msg = (
"Dataframe column ids must be either all integers or all "
"strings. Column id at index 0 ('1') is of type 'int'"
"strings. Column id at index 1 ('Age') is of type 'str'"
)
self.assertEqual(output_error_msg, expected_msg)
7 changes: 5 additions & 2 deletions tests/test_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,11 @@ def prepare_data(data, target_variable, primary_table=None, y_as_dataframe=False
data, test_size=0.3, random_state=1, shuffle=False
)

y_test = data_test[target_variable]
y_train = data_train[target_variable]
# Since pandas 3.0, numbers in an array that contains a carriage return
# are lazily — and wrongly — inferred as `object` dtype instead of `int64`;
# forcing pandas to `infer_objects` fixes the error.
y_test = data_test.infer_objects()[target_variable]
y_train = data_train.infer_objects()[target_variable]

# Create training labels as single-column dataframe
if y_as_dataframe:
Expand Down
Loading