-
Notifications
You must be signed in to change notification settings - Fork 342
Expand file tree
/
Copy pathbase_encoder.py
More file actions
313 lines (251 loc) · 10.5 KB
/
base_encoder.py
File metadata and controls
313 lines (251 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
import warnings
from typing import List, Union
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
from feature_engine._check_init_parameters.check_variables import \
_check_variables_input_value
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_missing_values_docstring, _variables_categorical_docstring)
from feature_engine._docstrings.init_parameters.encoders import \
_ignore_format_docstring
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import (_check_optional_contains_na,
_check_X_matches_training_df,
check_X)
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables)
@Substitution(
    ignore_format=_ignore_format_docstring,
    variables=_variables_categorical_docstring,
)
class CategoricalInitMixin:
    """Initialization logic shared by categorical transformers.

    Validates the constructor arguments and stores them as attributes.

    Parameters
    ----------
    {variables}.

    {ignore_format}
    """

    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        ignore_format: bool = False,
    ) -> None:
        # Only genuine booleans are accepted; truthy/falsy stand-ins such as
        # 0, 1 or "True" are rejected.
        flag_is_bool = isinstance(ignore_format, bool)
        if not flag_is_bool:
            message = (
                "ignore_format takes only booleans True and False. "
                f"Got {ignore_format} instead."
            )
            raise ValueError(message)

        # The variables argument is validated by the shared helper, which
        # returns the value to be stored.
        checked_variables = _check_variables_input_value(variables)
        self.variables = checked_variables
        self.ignore_format = ignore_format
@Substitution(
    missing_values=_missing_values_docstring,
    ignore_format=_ignore_format_docstring,
    variables=_variables_categorical_docstring,
)
class CategoricalInitMixinNA:
    """Initialization logic shared by categorical transformers that also expose
    the `missing_values` option.

    Validates the constructor arguments and stores them as attributes.

    Parameters
    ----------
    {variables}.

    {missing_values}

    {ignore_format}
    """

    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        missing_values: str = "raise",
        ignore_format: bool = False,
    ) -> None:
        # missing_values is validated first, so it is the error reported when
        # several arguments are invalid at once.
        allowed_missing_values = ["raise", "ignore"]
        if missing_values not in allowed_missing_values:
            message = (
                "missing_values takes only values 'raise' or 'ignore'. "
                f"Got {missing_values} instead."
            )
            raise ValueError(message)

        # Only genuine booleans are accepted for ignore_format.
        flag_is_bool = isinstance(ignore_format, bool)
        if not flag_is_bool:
            message = (
                "ignore_format takes only booleans True and False. "
                f"Got {ignore_format} instead."
            )
            raise ValueError(message)

        # The variables argument is validated by the shared helper, which
        # returns the value to be stored.
        checked_variables = _check_variables_input_value(variables)
        self.variables = checked_variables
        self.ignore_format = ignore_format
        self.missing_values = missing_values
class CategoricalMethodsMixin(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin):
    """Shared methods across categorical transformers.

    - BaseEstimator brings methods get_params() and set_params().
    - TransformerMixin brings method fit_transform()
    - GetFeatureNamesOutMixin brings method get_feature_names_out().

    The methods below read attributes that this class does not set itself:
    `variables`, `ignore_format`, `missing_values`, `unseen`, `_unseen`,
    `variables_` and `encoder_dict_`. They are expected to be provided by the
    init mixins and by the concrete encoder (presumably in its `fit()`).
    """

    def _check_na(self, X: pd.DataFrame, variables):
        """Run the NA check on `variables`, only if `missing_values` is 'raise'.

        The check itself is delegated to `_check_optional_contains_na`.
        """
        if self.missing_values == "raise":
            _check_optional_contains_na(X, variables)

    def _check_or_select_variables(self, X: pd.DataFrame):
        """
        Finds the variables to encode, or alternatively checks the variables
        entered by the user.

        When `ignore_format` is True, all variables are considered; otherwise
        only categorical variables are. This method does not check for NA.

        Parameters
        ----------
        X: Pandas DataFrame

        Raises
        ------
        Any exception comes from the `feature_engine.variable_handling` helper
        that is called (for example, when a user provided variable is not
        categorical, or when no suitable variable is found in the dataframe).

        Returns
        -------
        variables_:
            The value returned by the helper: the variables to encode.
        """
        # pick the pair of helpers that matches the requested format handling
        if self.ignore_format is True:
            find_variables, check_variables = find_all_variables, check_all_variables
        else:
            find_variables = find_categorical_variables
            check_variables = check_categorical_variables

        # no user selection: discover the variables automatically
        if self.variables is None:
            return find_variables(X)

        # user selection: validate it against the dataframe
        return check_variables(X, self.variables)

    def _get_feature_names_in(self, X: pd.DataFrame):
        """
        Stores attributes `feature_names_in_` and `n_features_in_`, which are
        standard for all transformers in the library. Returns nothing.
        """
        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

    def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Checks that the transformer was fitted, validates the input through
        `check_X`, checks it against the number of features seen in fit, and
        reorders the columns to match the train set. This method does not check
        for NA.

        Parameters
        ----------
        X: Pandas DataFrame

        Raises
        ------
        NotFittedError
            If the transformer has not been fitted (from `check_is_fitted`).
        Other exceptions
            Raised by `check_X` or `_check_X_matches_training_df` when the input
            is not acceptable or does not match the train set, or by pandas when
            a column seen in fit is missing from X.

        Returns
        -------
        X: Pandas DataFrame
            The validated dataframe, with columns selected and ordered as in
            `feature_names_in_`.
        """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # Check input data contains same number of columns as df used to fit
        _check_X_matches_training_df(X, self.n_features_in_)

        # reorder df to match train set
        X = X[self.feature_names_in_]

        return X

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Replace categories with the learned parameters.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features].
            The dataset to transform.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_samples, n_features].
            The dataframe containing the categories replaced by numbers.
        """
        X = self._check_transform_input_and_state(X)

        # check if dataset contains na (only when missing_values == "raise")
        self._check_na(X, self.variables_)

        X = self._encode(X)

        return X

    def _encode(self, X: pd.DataFrame) -> pd.DataFrame:
        """Map every variable in `encoder_dict_` to its learned values.

        X is modified in place and also returned. Categories without an entry
        in the mapping become NaN; what happens next depends on `unseen`:

        - 'encode': NaN in `variables_` are filled with `_unseen`.
        - 'warn': one UserWarning per variable names the unseen categories
          before the mapping is applied.
        - anything else: handled by `_check_nan_values_after_transformation`.
        """
        # replace categories by the learned parameters
        for feature in self.encoder_dict_.keys():
            mapping = self.encoder_dict_[feature]

            # Detect unseen categories BEFORE mapping so we can name them
            if self.unseen == "warn":
                unseen_cats = set(X[feature].dropna().unique()) - set(mapping.keys())
                if unseen_cats:
                    warnings.warn(
                        f"Variable {feature!r} contains unseen categories: "
                        f"{unseen_cats}. These will be encoded as NaN.",
                        UserWarning,
                    )

            X[feature] = X[feature].map(mapping)

            # if original variables are cast as categorical, they will remain
            # categorical after the encoding, and this is probably not desired
            if X[feature].dtype.name == "category":
                if all(isinstance(x, int) for x in X[feature]):
                    X[feature] = X[feature].astype("int")
                else:
                    X[feature] = X[feature].astype("float")

        if self.unseen == "encode":
            X[self.variables_] = X[self.variables_].fillna(self._unseen)
        else:
            # check if nan values were introduced by the transformation
            self._check_nan_values_after_transformation(X)

        return X

    def _check_nan_values_after_transformation(self, X):
        """Warn or raise if the encoded variables contain NaN.

        Nothing happens when `variables_` hold no NaN. Otherwise, with `unseen`
        set to 'ignore' a warning is issued, and with 'raise' a ValueError is
        raised; both name the affected columns of `encoder_dict_`.

        Raises
        ------
        ValueError
            If NaN are present and `unseen` is 'raise'.
        """
        # check if NaN values were introduced by the encoding
        if not X[self.variables_].isnull().any().any():
            return

        # obtain the name(s) of the columns that have null values
        encoded = X[list(self.encoder_dict_.keys())]
        nan_columns = encoded.columns[encoded.isnull().any()].tolist()

        # Column names may be integers, and str.join() only accepts strings,
        # so every name is converted before joining.
        nan_columns_str = ", ".join(str(column) for column in nan_columns)

        message = (
            "During the encoding, NaN values were introduced in the feature(s) "
            f"{nan_columns_str}."
        )

        if self.unseen == "ignore":
            warnings.warn(message)
        elif self.unseen == "raise":
            raise ValueError(message)
        # 'warn': per-variable warnings were already issued in _encode before
        # the mapping, so nothing more to do here.

    def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Convert the encoded variable back to the original values.

        The mapping of each variable is inverted (encoded value -> category).
        If several categories share the same encoded value, only the last one
        in the mapping is recovered. Values without an entry in the inverted
        mapping become NaN.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features].
            The transformed dataframe.

        Returns
        -------
        X_tr: pandas dataframe of shape = [n_samples, n_features].
            The un-transformed dataframe, with the categorical variables containing the
            original values.
        """
        X = self._check_transform_input_and_state(X)

        # replace encoded categories by the original values
        for feature in self.encoder_dict_.keys():
            inv_map = {v: k for k, v in self.encoder_dict_[feature].items()}
            X[feature] = X[feature].map(inv_map)

        return X

    def _more_tags(self):
        """Return the library's base tags, adapted to categorical transformers."""
        tags_dict = _return_tags()
        tags_dict["variables"] = "categorical"
        # the below test will fail because sklearn requires to check for inf, but
        # you can't check inf of categorical data, numpy returns an error.
        # so we need to leave without this test
        tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"
        return tags_dict

    def __sklearn_tags__(self):
        """Return the tags of the parent classes, unchanged."""
        return super().__sklearn_tags__()