Skip to content

Commit 9c28f3a

Browse files
committed
[FIX] Enable preprocessing in reg_cocktails (#369)
* enable preprocessing and remove is_small_preprocess
* address comments from shuhei and fix precommit checks
* fix tests
* fix precommit checks
* add suggestions from shuhei for astype use
* address speed issue when using object_dtype_mapping
* make code more readable
* improve documentation for base network embedding
1 parent afc25f1 commit 9c28f3a

34 files changed

+182
-793
lines changed

autoPyTorch/api/tabular_classification.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
1818
from autoPyTorch.datasets.resampling_strategy import (
1919
HoldoutValTypes,
20-
CrossValTypes,
2120
ResamplingStrategies,
2221
)
2322
from autoPyTorch.datasets.tabular_dataset import TabularDataset
@@ -419,8 +418,13 @@ def search(
419418
y_test=y_test,
420419
resampling_strategy=self.resampling_strategy,
421420
resampling_strategy_args=self.resampling_strategy_args,
421+
<<<<<<< HEAD
422422
dataset_name=dataset_name,
423423
dataset_compression=self._dataset_compression)
424+
=======
425+
dataset_name=dataset_name
426+
)
427+
>>>>>>> [FIX] Enable preprocessing in reg_cocktails (#369)
424428

425429
return self._search(
426430
dataset=self.dataset,

autoPyTorch/api/tabular_regression.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
1818
from autoPyTorch.datasets.resampling_strategy import (
1919
HoldoutValTypes,
20-
CrossValTypes,
2120
ResamplingStrategies,
2221
)
2322
from autoPyTorch.datasets.tabular_dataset import TabularDataset
@@ -420,8 +419,13 @@ def search(
420419
y_test=y_test,
421420
resampling_strategy=self.resampling_strategy,
422421
resampling_strategy_args=self.resampling_strategy_args,
422+
<<<<<<< HEAD
423423
dataset_name=dataset_name,
424424
dataset_compression=self._dataset_compression)
425+
=======
426+
dataset_name=dataset_name
427+
)
428+
>>>>>>> [FIX] Enable preprocessing in reg_cocktails (#369)
425429

426430
return self._search(
427431
dataset=self.dataset,

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 80 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from sklearn.exceptions import NotFittedError
2020
from sklearn.impute import SimpleImputer
2121
from sklearn.pipeline import make_pipeline
22-
from sklearn.preprocessing import OneHotEncoder, StandardScaler
22+
from sklearn.preprocessing import OrdinalEncoder
2323

2424
from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
2525
from autoPyTorch.data.utils import (
@@ -32,7 +32,6 @@
3232

3333
def _create_column_transformer(
3434
preprocessors: Dict[str, List[BaseEstimator]],
35-
numerical_columns: List[str],
3635
categorical_columns: List[str],
3736
) -> ColumnTransformer:
3837
"""
@@ -43,49 +42,36 @@ def _create_column_transformer(
4342
Args:
4443
preprocessors (Dict[str, List[BaseEstimator]]):
4544
Dictionary containing list of numerical and categorical preprocessors.
46-
numerical_columns (List[str]):
47-
List of names of numerical columns
4845
categorical_columns (List[str]):
4946
List of names of categorical columns
5047
5148
Returns:
5249
ColumnTransformer
5350
"""
5451

55-
numerical_pipeline = 'drop'
56-
categorical_pipeline = 'drop'
57-
if len(numerical_columns) > 0:
58-
numerical_pipeline = make_pipeline(*preprocessors['numerical'])
59-
if len(categorical_columns) > 0:
60-
categorical_pipeline = make_pipeline(*preprocessors['categorical'])
52+
categorical_pipeline = make_pipeline(*preprocessors['categorical'])
6153

6254
return ColumnTransformer([
63-
('categorical_pipeline', categorical_pipeline, categorical_columns),
64-
('numerical_pipeline', numerical_pipeline, numerical_columns)],
65-
remainder='drop'
55+
('categorical_pipeline', categorical_pipeline, categorical_columns)],
56+
remainder='passthrough'
6657
)
6758

6859

6960
def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
7061
"""
7162
This function creates a Dictionary containing a list
7263
of numerical and categorical preprocessors
73-
7464
Returns:
7565
Dict[str, List[BaseEstimator]]
7666
"""
7767
preprocessors: Dict[str, List[BaseEstimator]] = dict()
7868

7969
# Categorical Preprocessors
80-
onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
70+
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
71+
unknown_value=-1)
8172
categorical_imputer = SimpleImputer(strategy='constant', copy=False)
8273

83-
# Numerical Preprocessors
84-
numerical_imputer = SimpleImputer(strategy='median', copy=False)
85-
standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
86-
87-
preprocessors['categorical'] = [categorical_imputer, onehot_encoder]
88-
preprocessors['numerical'] = [numerical_imputer, standard_scaler]
74+
preprocessors['categorical'] = [categorical_imputer, ordinal_encoder]
8975

9076
return preprocessors
9177

@@ -180,31 +166,47 @@ def _fit(
180166
if hasattr(X, "iloc") and not issparse(X):
181167
X = cast(pd.DataFrame, X)
182168

183-
self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
169+
all_nan_columns = X.columns[X.isna().all()]
170+
for col in all_nan_columns:
171+
X[col] = pd.to_numeric(X[col])
172+
173+
# Handle objects if possible
174+
exist_object_columns = has_object_columns(X.dtypes.values)
175+
if exist_object_columns:
176+
X = self.infer_objects(X)
184177

185-
categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
178+
self.dtypes = [dt.name for dt in X.dtypes] # Also note this change in self.dtypes
179+
self.all_nan_columns = set(all_nan_columns)
186180

187-
self.enc_columns = categorical_columns
181+
self.enc_columns, self.feat_type = self._get_columns_info(X)
188182

189-
preprocessors = get_tabular_preprocessors()
190-
self.column_transformer = _create_column_transformer(
191-
preprocessors=preprocessors,
192-
numerical_columns=numerical_columns,
193-
categorical_columns=categorical_columns,
194-
)
183+
if len(self.enc_columns) > 0:
195184

196-
# Mypy redefinition
197-
assert self.column_transformer is not None
198-
self.column_transformer.fit(X)
185+
preprocessors = get_tabular_preprocessors()
186+
self.column_transformer = _create_column_transformer(
187+
preprocessors=preprocessors,
188+
categorical_columns=self.enc_columns,
189+
)
199190

200-
# The column transformer reorders the feature types
201-
# therefore, we need to change the order of columns as well
202-
# This means categorical columns are shifted to the left
191+
# Mypy redefinition
192+
assert self.column_transformer is not None
193+
self.column_transformer.fit(X)
203194

204-
self.feat_type = sorted(
205-
feat_type,
206-
key=functools.cmp_to_key(self._comparator)
207-
)
195+
# The column transformer moves categorical columns before all numerical columns
196+
# therefore, we need to sort categorical columns so that they comply with this change
197+
198+
self.feat_type = sorted(
199+
self.feat_type,
200+
key=functools.cmp_to_key(self._comparator)
201+
)
202+
203+
encoded_categories = self.column_transformer.\
204+
named_transformers_['categorical_pipeline'].\
205+
named_steps['ordinalencoder'].categories_
206+
self.categories = [
207+
list(range(len(cat)))
208+
for cat in encoded_categories
209+
]
208210

209211
# unlike categorical_columns and numerical_columns,
210212
# this saves the index of the column.
@@ -289,6 +291,23 @@ def transform(
289291
X = cast(Type[pd.DataFrame], X)
290292
>>>>>>> [FIX] Tests after rebase of `reg_cocktails` (#359)
291293

294+
if self.all_nan_columns is None:
295+
raise ValueError('_fit must be called before calling transform')
296+
297+
for col in list(self.all_nan_columns):
298+
X[col] = np.nan
299+
X[col] = pd.to_numeric(X[col])
300+
301+
if len(self.categorical_columns) > 0:
302+
# when some categorical columns are not all nan in the training set
303+
# but they are all nan in the testing or validation set
304+
# we change those columns to `object` dtype
305+
# to ensure that these columns are changed to appropriate dtype
306+
# in self.infer_objects
307+
all_nan_cat_cols = set(X[self.enc_columns].columns[X[self.enc_columns].isna().all()])
308+
dtype_dict = {col: 'object' for col in self.enc_columns if col in all_nan_cat_cols}
309+
X = X.astype(dtype_dict)
310+
292311
# Check the data here so we catch problems on new test data
293312
self._check_data(X)
294313

@@ -297,11 +316,6 @@ def transform(
297316
# We need to convert the column in test data to
298317
# object otherwise the test column is interpreted as float
299318
if self.column_transformer is not None:
300-
if len(self.categorical_columns) > 0:
301-
categorical_columns = self.column_transformer.transformers_[0][-1]
302-
for column in categorical_columns:
303-
if X[column].isna().all():
304-
X[column] = X[column].astype('object')
305319
X = self.column_transformer.transform(X)
306320

307321
# Sparse related transformations
@@ -416,7 +430,6 @@ def _check_data(
416430
self.column_order = column_order
417431

418432
dtypes = [dtype.name for dtype in X.dtypes]
419-
420433
diff_cols = X.columns[[s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]]
421434
if len(self.dtypes) == 0:
422435
self.dtypes = dtypes
@@ -428,7 +441,7 @@ def _check_data(
428441
def _get_columns_info(
429442
self,
430443
X: pd.DataFrame,
431-
) -> Tuple[List[str], List[str], List[str]]:
444+
) -> Tuple[List[str], List[str]]:
432445
"""
433446
Return the columns to be encoded from a pandas dataframe
434447
@@ -447,15 +460,12 @@ def _get_columns_info(
447460
"""
448461

449462
# Register if a column needs encoding
450-
numerical_columns = []
451463
categorical_columns = []
452464
# Also, register the feature types for the estimator
453465
feat_type = []
454466

455467
# Make sure each column is a valid type
456468
for i, column in enumerate(X.columns):
457-
if self.all_nan_columns is not None and column in self.all_nan_columns:
458-
continue
459469
column_dtype = self.dtypes[i]
460470
err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
461471
"but input column {} has an invalid type `{}`.".format(column, column_dtype)
@@ -466,7 +476,6 @@ def _get_columns_info(
466476
# TypeError: data type not understood in certain pandas types
467477
elif is_numeric_dtype(column_dtype):
468478
feat_type.append('numerical')
469-
numerical_columns.append(column)
470479
elif column_dtype == 'object':
471480
# TODO verify how this would happen when we always convert the object dtypes to category
472481
raise TypeError(
@@ -492,7 +501,7 @@ def _get_columns_info(
492501
"before feeding it to AutoPyTorch.".format(err_msg)
493502
)
494503

495-
return categorical_columns, numerical_columns, feat_type
504+
return categorical_columns, feat_type
496505

497506
def list_to_pandas(
498507
self,
@@ -562,22 +571,26 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
562571
pd.DataFrame
563572
"""
564573
if hasattr(self, 'object_dtype_mapping'):
565-
# Mypy does not process the has attr. This dict is defined below
566-
for key, dtype in self.object_dtype_mapping.items(): # type: ignore[has-type]
567-
# honor the training data types
568-
try:
569-
X[key] = X[key].astype(dtype.name)
570-
except Exception as e:
571-
# Try inference if possible
572-
self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}')
573-
pass
574+
# honor the training data types
575+
try:
576+
# Mypy does not process the has attr.
577+
X = X.astype(self.object_dtype_mapping) # type: ignore[has-type]
578+
except Exception as e:
579+
# Try inference if possible
580+
self.logger.warning(f'Casting the columns to training dtypes ' # type: ignore[has-type]
581+
f'{self.object_dtype_mapping} caused the exception {e}')
582+
pass
574583
else:
575-
# Calling for the first time to infer the categories
576-
X = X.infer_objects()
577-
for column, data_type in zip(X.columns, X.dtypes):
578-
if not is_numeric_dtype(data_type):
579-
X[column] = X[column].astype('category')
580-
584+
if len(self.dtypes) != 0:
585+
# when train data has no object dtype, but test does
586+
# we prioritise the datatype given in training data
587+
dtype_dict = {col: dtype for col, dtype in zip(X.columns, self.dtypes)}
588+
X = X.astype(dtype_dict)
589+
else:
590+
# Calling for the first time to infer the categories
591+
X = X.infer_objects()
592+
dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)}
593+
X = X.astype(dtype_dict)
581594
# only numerical attributes and categories
582595
self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
583596

autoPyTorch/datasets/base_dataset.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,6 @@ def __init__(
155155
self.holdout_validators: Dict[str, HoldOutFunc] = {}
156156
self.no_resampling_validators: Dict[str, NoResamplingFunc] = {}
157157
self.random_state = np.random.RandomState(seed=seed)
158-
self.no_resampling_validators: Dict[str, NoResamplingFunc] = {}
159158
self.shuffle = shuffle
160159
self.resampling_strategy = resampling_strategy
161160
self.resampling_strategy_args = resampling_strategy_args
@@ -165,10 +164,6 @@ def __init__(
165164
if len(self.train_tensors) == 2 and self.train_tensors[1] is not None:
166165
self.output_shape, self.output_type = _get_output_properties(self.train_tensors)
167166

168-
# TODO: Look for a criteria to define small enough to preprocess
169-
# False for the regularization cocktails initially
170-
self.is_small_preprocess = False
171-
172167
# Make sure cross validation splits are created once
173168
self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes)
174169
self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes)

autoPyTorch/datasets/resampling_strategy.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,6 @@ def __call__(self, random_state: np.random.RandomState, val_share: float,
3939
...
4040

4141

42-
class NoResamplingFunc(Protocol):
43-
def __call__(self,
44-
random_state: np.random.RandomState,
45-
indices: np.ndarray) -> np.ndarray:
46-
...
47-
48-
4942
class CrossValTypes(IntEnum):
5043
"""The type of cross validation
5144

0 commit comments

Comments
 (0)