Currently, it's hard to tune a "parametric" transformer inside a DataFrameMapper because get_params() doesn't expose the parameters of the underlying transformers. This means you can't adjust those parameters with GridSearchCV or RandomizedSearchCV.
Example:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
pipeline = Pipeline([
    ('vectorizer', DataFrameMapper([
        ('document_contents', CountVectorizer()),
    ], df_out=False)),
    ('classifier', MultinomialNB()),
])
pipeline.get_params()
These are the params I get:
{'memory': None,
'steps': [('vectorizer', DataFrameMapper(default=False, df_out=False,
features=[('document_contents', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None))],
input_df=False, sparse=False)),
('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
'vectorizer': DataFrameMapper(default=False, df_out=False,
features=[('document_contents', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None))],
input_df=False, sparse=False),
'classifier': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
'vectorizer__default': False,
'vectorizer__df_out': False,
'vectorizer__features': [('document_contents',
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None))],
'vectorizer__input_df': False,
'vectorizer__sparse': False,
'classifier__alpha': 1.0,
'classifier__class_prior': None,
'classifier__fit_prior': True}
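One consequence: with only vectorizer__features exposed, the only way to tune the inner CountVectorizer today is to swap out the entire features list in the search grid, which is clunky and doesn't scale past a couple of candidates. A rough sketch of that workaround, using the pipeline from above (df and labels are placeholders for your own data):

from sklearn.model_selection import GridSearchCV

# Clunky workaround under the current behavior: vary the whole `features`
# list, rebuilding the (column, transformer) tuples for every candidate.
param_grid = {
    'vectorizer__features': [
        [('document_contents', CountVectorizer(analyzer='word'))],
        [('document_contents', CountVectorizer(analyzer='char'))],
    ],
}
search = GridSearchCV(pipeline, param_grid)
# search.fit(df, labels)  # df is a DataFrame with a 'document_contents' column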
Naively, I would instead expect something like this:
{'memory': None,
'steps': [('vectorizer', DataFrameMapper(default=False, df_out=False,
features=[('document_contents', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None))],
input_df=False, sparse=False)),
('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
'vectorizer': DataFrameMapper(default=False, df_out=False,
features=[('document_contents', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None))],
input_df=False, sparse=False),
'classifier': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
'vectorizer__document_contents__analyzer': 'word',
'vectorizer__document_contents__binary': False,
'vectorizer__document_contents__decode_error': 'strict',
'vectorizer__document_contents__dtype': numpy.int64,
'vectorizer__document_contents__encoding': 'utf-8',
'vectorizer__document_contents__input': 'content',
'vectorizer__document_contents__lowercase': True,
'vectorizer__document_contents__max_df': 1.0,
'vectorizer__document_contents__max_features': None,
'vectorizer__document_contents__min_df': 1,
'vectorizer__document_contents__ngram_range': (1, 1),
'vectorizer__document_contents__preprocessor': None,
'vectorizer__document_contents__stop_words': None,
'vectorizer__document_contents__strip_accents': None,
'vectorizer__document_contents__token_pattern': '(?u)\\b\\w\\w+\\b',
'vectorizer__document_contents__tokenizer': None,
'vectorizer__document_contents__vocabulary': None,
'vectorizer__default': False,
'vectorizer__df_out': False,
'vectorizer__input_df': False,
'vectorizer__sparse': False,
'classifier__alpha': 1.0,
'classifier__class_prior': None,
'classifier__fit_prior': True}
This would be very handy for, say, using GridSearchCV to compare word and character analyzers, as in the sketch below.
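A sketch of what that search could look like, assuming the proposed vectorizer__document_contents__* params were exposed:

from sklearn.model_selection import GridSearchCV

# Hypothetical usage, assuming the nested params above were exposed:
param_grid = {
    'vectorizer__document_contents__analyzer': ['word', 'char'],
    'vectorizer__document_contents__ngram_range': [(1, 1), (2, 4)],
}
search = GridSearchCV(pipeline, param_grid)
# search.fit(df, labels)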
This seems like it shouldn't be too hard to implement. If there's interest, I can start digging around the codebase and spend some time on it. A rough sketch of one possible approach follows.
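To make the idea concrete, here's a hypothetical subclass (not the library's actual code) that overrides get_params to flatten each feature transformer's params under its column name. Exposing the transformer itself under the bare column name means BaseEstimator.set_params can then delegate e.g. 'document_contents__analyzer' to the transformer for free. It assumes bare column names don't collide with the mapper's own param names.

from sklearn_pandas import DataFrameMapper

class ParametricDataFrameMapper(DataFrameMapper):
    # Hypothetical subclass, only to illustrate the idea.
    def get_params(self, deep=True):
        params = super().get_params(deep=deep)
        if deep:
            for feature in self.features:
                # Feature tuples are (columns, transformer) and may carry
                # extra elements in newer versions, so index explicitly.
                columns, transformer = feature[0], feature[1]
                if not hasattr(transformer, 'get_params'):
                    continue  # e.g. transformer is None
                # Use the column spec as the sub-estimator name.
                name = columns if isinstance(columns, str) else '_'.join(columns)
                params[name] = transformer
                for key, value in transformer.get_params(deep=True).items():
                    params['%s__%s' % (name, key)] = value
        return params

# set_params comes for free: BaseEstimator.set_params splits
# 'document_contents__analyzer' at the first '__', finds the transformer
# under 'document_contents' in get_params(deep=True), and calls
# set_params(analyzer=...) on it.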