10

I'm trying to use TransformedTargetRegressor in a model pipeline and run a GridSearchCV on top of it.

Here is a minimal working example:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor


X,y = make_regression()  # minimal reproduction: toy data from make_regression's defaults

model_pipe = Pipeline([
    ('model', TransformedTargetRegressor(RandomForestRegressor()))  # the forest is handed to the wrapper as its first argument
])

params={'model__n_estimators': [1, 10, 50]}  # targets n_estimators on the 'model' step itself, i.e. on TransformedTargetRegressor


model = GridSearchCV(model_pipe, param_grid= params)

model.fit(X,y)  # raises the ValueError shown in the traceback below

This model results in the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-48-828bdf0e7ede> in <module>
     17 model = GridSearchCV(model_pipe, param_grid= params)
     18 
---> 19 model.fit(X,y)

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    686                 return results
    687 
--> 688             self._run_search(evaluate_candidates)
    689 
    690         # For multi-metric evaluation, store the best_index_, best_params_ and

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1147     def _run_search(self, evaluate_candidates):
   1148         """Search all candidates in param_grid"""
-> 1149         evaluate_candidates(ParameterGrid(self.param_grid))
   1150 
   1151 

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
    665                                for parameters, (train, test)
    666                                in product(candidate_params,
--> 667                                           cv.split(X, y, groups)))
    668 
    669                 if len(out) < 1:

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
   1001             # remaining jobs.
   1002             self._iterating = False
-> 1003             if self.dispatch_one_batch(iterator):
   1004                 self._iterating = self._original_iterator is not None
   1005 

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    832                 return False
    833             else:
--> 834                 self._dispatch(tasks)
    835                 return True
    836 

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in _dispatch(self, batch)
    751         with self._lock:
    752             job_idx = len(self._jobs)
--> 753             job = self._backend.apply_async(batch, callback=cb)
    754             # A job can complete so quickly than its callback is
    755             # called before we get here, causing self._jobs to

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    199     def apply_async(self, func, callback=None):
    200         """Schedule a func to be run"""
--> 201         result = ImmediateResult(func)
    202         if callback:
    203             callback(result)

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    580         # Don't delay the application, to avoid keeping the input
    581         # arguments in memory
--> 582         self.results = batch()
    583 
    584     def get(self):

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    501     train_scores = {}
    502     if parameters is not None:
--> 503         estimator.set_params(**parameters)
    504 
    505     start_time = time.time()

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
    162         self
    163         """
--> 164         self._set_params('steps', **kwargs)
    165         return self
    166 

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
     48                 self._replace_estimator(attr, name, params.pop(name))
     49         # 3. Step parameters and other initialisation arguments
---> 50         super().set_params(**params)
     51         return self
     52 

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/base.py in set_params(self, **params)
    231 
    232         for key, sub_params in nested_params.items():
--> 233             valid_params[key].set_params(**sub_params)
    234 
    235         return self

~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/base.py in set_params(self, **params)
    222                                  'Check the list of available parameters '
    223                                  'with `estimator.get_params().keys()`.' %
--> 224                                  (key, self))
    225 
    226             if delim:

ValueError: Invalid parameter n_estimators for estimator TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
                           regressor=RandomForestRegressor(bootstrap=True,
                                                           criterion='mse',
                                                           max_depth=None,
                                                           max_features='auto',
                                                           max_leaf_nodes=None,
                                                           min_impurity_decrease=0.0,
                                                           min_impurity_split=None,
                                                           min_samples_leaf=1,
                                                           min_samples_split=2,
                                                           min_weight_fraction_leaf=0.0,
                                                           n_estimators='warn',
                                                           n_jobs=None,
                                                           oob_score=False,
                                                           random_state=None,
                                                           verbose=0,
                                                           warm_start=False),
                           transformer=None). Check the list of available parameters with `estimator.get_params().keys()`.

This model runs when I remove TransformedTargetRegressor from the pipeline and just pass the random forest. Why is this? How can I use TransformedTargetRegressor in a pipeline as I have shown above?

Venkatachalam
  • 16,288
  • 9
  • 49
  • 77
Demetri Pananos
  • 6,770
  • 9
  • 42
  • 73

3 Answers

12

The RandomForestRegressor is stored as the `regressor` parameter of the TransformedTargetRegressor.

Hence, the right way to define the params for GridSearchCV is

# Route the grid key through the wrapper's `regressor` parameter to reach the forest.
params = {
    'model__regressor__n_estimators': [1, 10, 50],
}
Venkatachalam
  • 16,288
  • 9
  • 49
  • 77
  • 5
    This is the right answer. Wrapping the GridSearchCV in TransformedTargetRegressor would have the GridSearchCV optimize outside of the transformed space, which would never be what we want. – wingedsubmariner Feb 01 '20 at 17:14
  • In my case the right order is 'regressor__model__...' – volperossa Dec 26 '21 at 09:53
  • @wingedsubmariner Not sure why TransformedTargetRegressor isn't appropriate here. I might be missing it, but the purpose of transformedTargetReg is to apply a transform before training [see example here](https://scikit-learn.org/stable/modules/generated/sklearn.compose.TransformedTargetRegressor.html). If it doesn't work can you offer a solution that allows you to do a non-linear transform within a pipeline? – mmann1123 Jan 04 '22 at 02:46
2

Seems like people are having issues with zeros in y. Consider the following using log1p and expm1. See another worked example here

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = make_regression()
# make_regression() yields targets on both sides of zero, and log1p is only
# defined for y > -1. Fold the targets onto the non-negative range so the
# log1p/expm1 round trip is valid (zeros are fine: log1p(0) == 0).
y = np.abs(y)

model_pipe = Pipeline([
    ('model', TransformedTargetRegressor(regressor=RandomForestRegressor(),
                                         func=np.log1p,
                                         inverse_func=np.expm1)),
])

# n_estimators belongs to the wrapped forest, which is reached through the
# wrapper's `regressor` parameter.
params = {'model__regressor__n_estimators': [1, 10, 50]}

model = GridSearchCV(model_pipe, param_grid=params)

model.fit(X, y)
    
mmann1123
  • 5,031
  • 7
  • 41
  • 49
-1

I've found the answer. The TransformedTargetRegressor needs to be applied to the grid search estimator, like so:

import numpy as np  # provides np.log / np.exp used for the target transform below
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor


X,y = make_regression()

model_pipe = Pipeline([
    ('model', RandomForestRegressor())
])

# The forest is the pipeline step itself here, so a single 'model__' prefix is enough.
params={'model__n_estimators': [1, 10, 50]}


# The target transform wraps the whole grid search: y is transformed once and
# the search then runs on the transformed target.
# NOTE(review): np.log is only defined for strictly positive targets, while
# make_regression() also produces zero/negative ones; that is the source of
# the "Input contains NaN" ValueError reported in the comments on this answer.
model = TransformedTargetRegressor(GridSearchCV(model_pipe, param_grid= params), func=np.log, inverse_func=np.exp)

model.fit(X,y)
Demetri Pananos
  • 6,770
  • 9
  • 42
  • 73
  • The above code results in **error**, *ValueError: Input contains NaN, infinity or a value too large for dtype('float64').* You may need to correct the input. – nikn8 Sep 21 '20 at 10:16
  • 0s will be problematic, see example below using log1p and expm1 – mmann1123 Jan 13 '21 at 16:42
  • 1
    This procedure will first transform the target and then use the transformed target to run the grid search, including cross-validation. This means that the already-transformed data will be split up again for the `k` cross-validation splits, which results in targets that are distorted to a certain extent. This is very problematic with time series, for example, when stationarity is not given. As I understand it, the target should be transformed after splitting. So `GridSearchCV(TransformedTargetRegressor(pipeline))` would be the correct approach, as opposed to `TransformTarget(GridSearch(pipeline))`. – Pascal Nov 11 '21 at 11:45