from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
- from sklearn.preprocessing import OneHotEncoder, StandardScaler
+ from sklearn.preprocessing import OrdinalEncoder

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
from autoPyTorch.data.utils import (


def _create_column_transformer(
    preprocessors: Dict[str, List[BaseEstimator]],
-     numerical_columns: List[str],
    categorical_columns: List[str],
) -> ColumnTransformer:
    """
@@ -43,49 +42,36 @@ def _create_column_transformer(
    Args:
        preprocessors (Dict[str, List[BaseEstimator]]):
            Dictionary containing list of numerical and categorical preprocessors.
-         numerical_columns (List[str]):
-             List of names of numerical columns
        categorical_columns (List[str]):
            List of names of categorical columns

    Returns:
        ColumnTransformer
    """

-     numerical_pipeline = 'drop'
-     categorical_pipeline = 'drop'
-     if len(numerical_columns) > 0:
-         numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-     if len(categorical_columns) > 0:
-         categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+     categorical_pipeline = make_pipeline(*preprocessors['categorical'])

    return ColumnTransformer([
-         ('categorical_pipeline', categorical_pipeline, categorical_columns),
-         ('numerical_pipeline', numerical_pipeline, numerical_columns)],
-         remainder='drop'
+         ('categorical_pipeline', categorical_pipeline, categorical_columns)],
+         remainder='passthrough'
    )


def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
    """
    This function creates a Dictionary containing a list
    of numerical and categorical preprocessors
-
    Returns:
        Dict[str, List[BaseEstimator]]
    """
    preprocessors: Dict[str, List[BaseEstimator]] = dict()

    # Categorical Preprocessors
-     onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
+     ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
+                                      unknown_value=-1)
    categorical_imputer = SimpleImputer(strategy='constant', copy=False)

-     # Numerical Preprocessors
-     numerical_imputer = SimpleImputer(strategy='median', copy=False)
-     standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
-
-     preprocessors['categorical'] = [categorical_imputer, onehot_encoder]
-     preprocessors['numerical'] = [numerical_imputer, standard_scaler]
+     preprocessors['categorical'] = [categorical_imputer, ordinal_encoder]

    return preprocessors
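
For orientation, here is a minimal standalone sketch (not part of the diff; the toy column names and values are invented) of what the rewritten helpers now assemble: the imputer and ordinal encoder are applied only to the categorical columns, while `remainder='passthrough'` forwards every other column untouched. The transformed (categorical) columns come first in the output, which is why `_fit` re-sorts the feature types below.

```python
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

# Toy frame: one categorical column (with a missing entry) and one numerical column.
X = pd.DataFrame({
    "colour": ["red", "blue", np.nan],
    "age": [20.0, 35.0, 41.0],
})

categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="constant", copy=False),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

# Only 'colour' is imputed and encoded; 'age' passes through unchanged,
# but lands to the right of the encoded categorical column.
ct = ColumnTransformer(
    [("categorical_pipeline", categorical_pipeline, ["colour"])],
    remainder="passthrough",
)
print(ct.fit_transform(X))
```
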
@@ -180,31 +166,47 @@ def _fit(
        if hasattr(X, "iloc") and not issparse(X):
            X = cast(pd.DataFrame, X)

-             self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
+             all_nan_columns = X.columns[X.isna().all()]
+             for col in all_nan_columns:
+                 X[col] = pd.to_numeric(X[col])
+
+             # Handle objects if possible
+             exist_object_columns = has_object_columns(X.dtypes.values)
+             if exist_object_columns:
+                 X = self.infer_objects(X)

-             categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
+             self.dtypes = [dt.name for dt in X.dtypes]  # Also note this change in self.dtypes
+             self.all_nan_columns = set(all_nan_columns)

-             self.enc_columns = categorical_columns
+             self.enc_columns, self.feat_type = self._get_columns_info(X)

-             preprocessors = get_tabular_preprocessors()
-             self.column_transformer = _create_column_transformer(
-                 preprocessors=preprocessors,
-                 numerical_columns=numerical_columns,
-                 categorical_columns=categorical_columns,
-             )
+             if len(self.enc_columns) > 0:

-             # Mypy redefinition
-             assert self.column_transformer is not None
-             self.column_transformer.fit(X)
+                 preprocessors = get_tabular_preprocessors()
+                 self.column_transformer = _create_column_transformer(
+                     preprocessors=preprocessors,
+                     categorical_columns=self.enc_columns,
+                 )

-             # The column transformer reorders the feature types
-             # therefore, we need to change the order of columns as well
-             # This means categorical columns are shifted to the left
+                 # Mypy redefinition
+                 assert self.column_transformer is not None
+                 self.column_transformer.fit(X)

-             self.feat_type = sorted(
-                 feat_type,
-                 key=functools.cmp_to_key(self._comparator)
-             )
+                 # The column transformer moves categorical columns before all numerical columns,
+                 # therefore we need to sort the feature types so that they comply with this change
+
+                 self.feat_type = sorted(
+                     self.feat_type,
+                     key=functools.cmp_to_key(self._comparator)
+                 )
+
+                 encoded_categories = self.column_transformer.\
+                     named_transformers_['categorical_pipeline'].\
+                     named_steps['ordinalencoder'].categories_
+                 self.categories = [
+                     list(range(len(cat)))
+                     for cat in encoded_categories
+                 ]

        # differently to categorical_columns and numerical_columns,
        # this saves the index of the column.
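
A small illustrative snippet of the bookkeeping above (standalone; `_comparator` here is a hypothetical stand-in for the validator's own comparator, and the category labels are invented): the comparator-based sort moves `'categorical'` entries in front of `'numerical'` ones to mirror the column order produced by the `ColumnTransformer`, and `self.categories` keeps only the integer codes of the fitted `OrdinalEncoder.categories_`.

```python
import functools

def _comparator(a: str, b: str) -> int:
    # Hypothetical stand-in for self._comparator: 'categorical' sorts before 'numerical'.
    order = {'categorical': 0, 'numerical': 1}
    return order[a] - order[b]

feat_type = ['numerical', 'categorical', 'numerical', 'categorical']
print(sorted(feat_type, key=functools.cmp_to_key(_comparator)))
# -> ['categorical', 'categorical', 'numerical', 'numerical']

# OrdinalEncoder.categories_ holds the original labels per encoded column;
# the validator only records their integer codes.
encoded_categories = [['blue', 'missing_value', 'red']]  # invented example of categories_
categories = [list(range(len(cat))) for cat in encoded_categories]
print(categories)  # [[0, 1, 2]]
```
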
@@ -289,6 +291,23 @@ def transform(
            X = cast(Type[pd.DataFrame], X)

+         if self.all_nan_columns is None:
+             raise ValueError('_fit must be called before calling transform')
+
+         for col in list(self.all_nan_columns):
+             X[col] = np.nan
+             X[col] = pd.to_numeric(X[col])
+
+         if len(self.categorical_columns) > 0:
+             # when some categorical columns are not all nan in the training set
+             # but they are all nan in the testing or validation set
+             # we change those columns to `object` dtype
+             # to ensure that these columns are changed to the appropriate dtype
+             # in self.infer_objects
+             all_nan_cat_cols = set(X[self.enc_columns].columns[X[self.enc_columns].isna().all()])
+             dtype_dict = {col: 'object' for col in self.enc_columns if col in all_nan_cat_cols}
+             X = X.astype(dtype_dict)
+

        # Check the data here so we catch problems on new test data
        self._check_data(X)
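
To make the two transform-time fixes above concrete, a small standalone illustration (the frame and column names are made up): columns that were entirely NaN during training are forced to a numeric dtype, and categorical columns that happen to be all NaN in the test split are downgraded to `object` so that the training dtypes can be re-applied afterwards.

```python
import numpy as np
import pandas as pd

X_test = pd.DataFrame({
    "f1": [np.nan, np.nan],                                  # all-NaN column seen at fit time
    "cat1": pd.Series([np.nan, np.nan], dtype="category"),   # categorical, all NaN in this split
})

# 1) keep all-NaN training columns numeric
X_test["f1"] = np.nan
X_test["f1"] = pd.to_numeric(X_test["f1"])

# 2) cast an all-NaN categorical column back to object before dtype inference
X_test = X_test.astype({"cat1": "object"})
print(X_test.dtypes)  # f1 -> float64, cat1 -> object
```
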
@@ -297,11 +316,6 @@ def transform(
        # We need to convert the column in test data to
        # object otherwise the test column is interpreted as float
        if self.column_transformer is not None:
-             if len(self.categorical_columns) > 0:
-                 categorical_columns = self.column_transformer.transformers_[0][-1]
-                 for column in categorical_columns:
-                     if X[column].isna().all():
-                         X[column] = X[column].astype('object')
            X = self.column_transformer.transform(X)

        # Sparse related transformations
@@ -416,7 +430,6 @@ def _check_data(
            self.column_order = column_order

        dtypes = [dtype.name for dtype in X.dtypes]
-
        diff_cols = X.columns[[s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]]
        if len(self.dtypes) == 0:
            self.dtypes = dtypes
@@ -428,7 +441,7 @@ def _check_data(
    def _get_columns_info(
        self,
        X: pd.DataFrame,
-     ) -> Tuple[List[str], List[str], List[str]]:
+     ) -> Tuple[List[str], List[str]]:
        """
        Return the columns to be encoded from a pandas dataframe

@@ -447,15 +460,12 @@ def _get_columns_info(
        """

        # Register if a column needs encoding
-         numerical_columns = []
        categorical_columns = []
        # Also, register the feature types for the estimator
        feat_type = []

        # Make sure each column is a valid type
        for i, column in enumerate(X.columns):
-             if self.all_nan_columns is not None and column in self.all_nan_columns:
-                 continue
            column_dtype = self.dtypes[i]
            err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
                      "but input column {} has an invalid type `{}`.".format(column, column_dtype)
@@ -466,7 +476,6 @@ def _get_columns_info(
            # TypeError: data type not understood in certain pandas types
            elif is_numeric_dtype(column_dtype):
                feat_type.append('numerical')
-                 numerical_columns.append(column)
            elif column_dtype == 'object':
                # TODO verify how would this happen when we always convert the object dtypes to category
                raise TypeError(
@@ -492,7 +501,7 @@ def _get_columns_info(
                    "before feeding it to AutoPyTorch.".format(err_msg)
                )

-         return categorical_columns, numerical_columns, feat_type
+         return categorical_columns, feat_type

    def list_to_pandas(
        self,
@@ -562,22 +571,26 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
            pd.DataFrame
        """
        if hasattr(self, 'object_dtype_mapping'):
-             # Mypy does not process the has attr. This dict is defined below
-             for key, dtype in self.object_dtype_mapping.items():  # type: ignore[has-type]
-                 # honor the training data types
-                 try:
-                     X[key] = X[key].astype(dtype.name)
-                 except Exception as e:
-                     # Try inference if possible
-                     self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}')
-                     pass
+             # honor the training data types
+             try:
+                 # Mypy does not process the has attr.
+                 X = X.astype(self.object_dtype_mapping)  # type: ignore[has-type]
+             except Exception as e:
+                 # Try inference if possible
+                 self.logger.warning(f'Casting the columns to training dtypes '  # type: ignore[has-type]
+                                     f'{self.object_dtype_mapping} caused the exception {e}')
+                 pass
        else:
-             # Calling for the first time to infer the categories
-             X = X.infer_objects()
-             for column, data_type in zip(X.columns, X.dtypes):
-                 if not is_numeric_dtype(data_type):
-                     X[column] = X[column].astype('category')
-
+             if len(self.dtypes) != 0:
+                 # when train data has no object dtype, but test does
+                 # we prioritise the datatype given in training data
+                 dtype_dict = {col: dtype for col, dtype in zip(X.columns, self.dtypes)}
+                 X = X.astype(dtype_dict)
+             else:
+                 # Calling for the first time to infer the categories
+                 X = X.infer_objects()
+                 dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)}
+                 X = X.astype(dtype_dict)

        # only numerical attributes and categories
        self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
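
As a rough standalone illustration of the dtype bookkeeping above (the mapping and data are hypothetical): the per-column dtypes recorded at fit time are re-applied with a single `astype()` call, and a failure simply falls back to the inferred dtypes with a warning.

```python
import pandas as pd

# Invented stand-in for the object_dtype_mapping recorded during _fit.
object_dtype_mapping = {"colour": "category", "age": "float64"}

# New data arrives with plain object columns.
X_new = pd.DataFrame({"colour": ["red", "blue"], "age": ["20", "35"]})
try:
    X_new = X_new.astype(object_dtype_mapping)
except Exception as e:
    print(f"Casting the columns to training dtypes {object_dtype_mapping} caused the exception {e}")
print(X_new.dtypes)  # colour -> category, age -> float64
```
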