
Expose parameters from transformers as parameters of the mapper #159

Open
@gwerbin

Description

Currently, it can be hard to use a "parametric" transformer in a DataFrameMapper because the parameters of the underlying transformers aren't exposed through get_params(). This means you can't tune one of those transformers with GridSearchCV or RandomizedSearchCV.

Example:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper

# Map the text column through a CountVectorizer inside the
# DataFrameMapper, then classify with MultinomialNB.
pipeline = Pipeline([
    ('vectorizer',
        DataFrameMapper([
            ('document_contents', CountVectorizer())
        ], df_out=False)),
    ('classifier', MultinomialNB())
])

pipeline.get_params()

These are the params I get:

{'memory': None,
 'steps': [('vectorizer', DataFrameMapper(default=False, df_out=False,
           features=[('document_contents', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None))],
           input_df=False, sparse=False)),
  ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'vectorizer': DataFrameMapper(default=False, df_out=False,
         features=[('document_contents', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None))],
         input_df=False, sparse=False),
 'classifier': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 'vectorizer__default': False,
 'vectorizer__df_out': False,
 'vectorizer__features': [('document_contents',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None))],
 'vectorizer__input_df': False,
 'vectorizer__sparse': False,
 'classifier__alpha': 1.0,
 'classifier__class_prior': None,
 'classifier__fit_prior': True}
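
Concretely, trying to set one of these nested parameters fails today, since set_params() only accepts keys that get_params() reports. The parameter name below is the one I'd expect under the proposed scheme:

# Should raise ValueError today: DataFrameMapper doesn't expose the nested key.
pipeline.set_params(vectorizer__document_contents__analyzer='char')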

Naively, I would expect something like this:

{'memory': None,
 'steps': [('vectorizer', DataFrameMapper(default=False, df_out=False,
           features=[('document_contents', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None))],
           input_df=False, sparse=False)),
  ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'vectorizer': DataFrameMapper(default=False, df_out=False,
         features=[('document_contents', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None))],
         input_df=False, sparse=False),
 'classifier': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 'vectorizer__document_contents__analyzer': 'word',
 'vectorizer__document_contents__binary': False,
 'vectorizer__document_contents__decode_error': 'strict',
 'vectorizer__document_contents__dtype': numpy.int64,
 'vectorizer__document_contents__encoding': 'utf-8',
 'vectorizer__document_contents__input': 'content',
 'vectorizer__document_contents__lowercase': True,
 'vectorizer__document_contents__max_df': 1.0,
 'vectorizer__document_contents__max_features': None,
 'vectorizer__document_contents__min_df': 1,
 'vectorizer__document_contents__ngram_range': (1, 1),
 'vectorizer__document_contents__preprocessor': None,
 'vectorizer__document_contents__stop_words': None,
 'vectorizer__document_contents__strip_accents': None,
 'vectorizer__document_contents__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vectorizer__document_contents__tokenizer': None,
 'vectorizer__document_contents__vocabulary': None,
 'vectorizer__default': False,
 'vectorizer__df_out': False,
 'vectorizer__input_df': False,
 'vectorizer__sparse': False,
 'classifier__alpha': 1.0,
 'classifier__class_prior': None,
 'classifier__fit_prior': True}

which would be very handy for, say, using GridSearchCV to compare word and character analyzers.
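
With keys like these exposed, a grid search over analyzers could look like the sketch below. The vectorizer__document_contents__analyzer key is hypothetical (it follows the naming scheme above), and train_df / train_labels are placeholders for real data:

from sklearn.model_selection import GridSearchCV

param_grid = {
    # Hypothetical nested key; only exists if this proposal is implemented:
    'vectorizer__document_contents__analyzer': ['word', 'char'],
    # This key already works today, since the classifier is a pipeline step:
    'classifier__alpha': [0.1, 1.0],
}

search = GridSearchCV(pipeline, param_grid, cv=5)
search.fit(train_df, train_labels)
print(search.best_params_)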

This seems like it shouldn't be too hard to implement. If there's interest, I can start digging around the codebase and spend some time on it.
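
For what it's worth, here is a rough sketch of one possible approach. It assumes DataFrameMapper keeps its (column, transformer) pairs in a features attribute (as the repr above suggests) and inherits get_params() from BaseEstimator; ParamExposingMapper is a hypothetical name, not the library's actual code:

from sklearn_pandas import DataFrameMapper

class ParamExposingMapper(DataFrameMapper):
    def get_params(self, deep=True):
        # Start from the mapper's own params (default, df_out, features, ...).
        params = super().get_params(deep=deep)
        if deep:
            for column, transformer in self.features:
                if transformer is None:
                    # Passthrough features have no parameters to expose.
                    continue
                for key, value in transformer.get_params(deep=True).items():
                    # Real code would also need to handle list-valued column
                    # selectors and lists of transformers.
                    params['%s__%s' % (column, key)] = value
        return params

A matching set_params() override would be needed as well, so that GridSearchCV can actually write these values back into the right transformer.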
