All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough'

Question

When I run this code, it produces output without any error:

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score, cross_val_predict,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2, f_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBRegressor
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from scipy.stats import spearmanr
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,classification_report

import pickle
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score
from sklearn.datasets import make_classification

# Build a synthetic binary-classification dataset to experiment on.
X, y = make_classification(
    n_samples=5000,
    n_classes=2,
    n_features=20,
    n_redundant=0,
    random_state=0,
)

# First 4500 rows are used for training, the remainder for testing
# (slice with .iloc instead if X / y are pandas objects).
X_train, y_train = X[:4500], y[:4500]
X_test, y_test = X[4500:], y[4500:]

# Metrics handed to GridSearchCV via `scoring`; the keys are the names that
# can be passed as `refit_score` to run_SVC.
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score),
}

def run_SVC(X_train, y_train, X_test, y_test, output_file, data_name,
            refit_score='precision_score'):
    """Tune an SVC with a cross-validated grid search and print the results.

    A one-step pipeline wrapping ``SVC`` is searched over ``C``, ``gamma``
    and ``kernel`` with 2-fold shuffled stratified cross-validation. Every
    metric in the module-level ``scorers`` dict is computed; the candidate
    that is best according to ``refit_score`` is refitted and used to predict
    ``X_test``. The pipeline's parameter names, the best parameters and a
    classification report for the test split are printed.

    Args:
        X_train: training features passed to ``GridSearchCV.fit``.
        y_train: training labels passed to ``GridSearchCV.fit``.
        X_test: held-out features to predict after the search.
        y_test: held-out labels compared against the predictions.
        output_file: currently unused; kept so existing callers keep working.
        data_name: currently unused; kept so existing callers keep working.
        refit_score: key of ``scorers`` used to pick and refit the best model.

    Returns:
        None. All results are written to stdout.
    """
    clf = SVC()
    skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)
    # NOTE: only the final pipeline step may be a non-transformer such as SVC,
    # so if the selector below is enabled its step has to be listed *before*
    # 'svc' in `steps`, not after it.
    #fs = SelectKBest(score_func = mutual_info_classif)
    pipeline = Pipeline(steps=[('svc', clf)]) #,('sel',fs)
    print(pipeline.get_params().keys())

    search = GridSearchCV(
        pipeline,
        param_grid={
            'svc__C': [0.01, 0.1, 10, 1000],  # regularization strength
            'svc__gamma': [0.0001, 0.01, 1, 10],
            'svc__kernel': ['linear', 'rbf'],
        },
        return_train_score=True,
        verbose=3,
        refit=refit_score,
        scoring=scorers,
        cv=skf,
        n_jobs=-1,
    )

    search.fit(X_train, y_train)

    # Predict with the estimator refitted on the best `refit_score` candidate.
    y_pred = search.predict(X_test)

    print('Best params for {}'.format(refit_score))
    print(search.best_params_)
    print(classification_report(y_test, y_pred))  # labels=['neg','pos']
    return None

if __name__ == "__main__":
    # run_SVC prints its own report and returns None, so wrapping the call in
    # print() would only emit a stray "None". The guard keeps the parallel
    # grid search from running when this module is imported.
    run_SVC(X_train, y_train, X_test, y_test, 'test.txt', 'dataset')

When I uncomment the only two pieces that are commented out (the line `#fs = SelectKBest(score_func = mutual_info_classif)` and the `,('sel',fs)` step on the line after it), I get the error:

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SVC()' (type <class 'sklearn.svm._classes.SVC'>) doesn't

I can see that other people have addressed this on SO before, e.g. here, so I tried to follow that person's answer, but my SelectKBest is already before my pipeline - when I move the line with 'fs' to be higher in my code (which I thought was what the answer was saying), I get the same error.

Could someone show me where I'm going wrong here and what I'm meant to change to remove this error?

For a quick answer: you can't have an `estimator` like `SVC()` as the second-to-last step in a `Pipeline` (nor as any other intermediate, i.e. non-final, step); only a `transformer` can be an intermediate step (https://stackoverflow.com/questions/54899647/what-is-the-difference-between-transformer-and-estimator-in-sklearn#:~:text=Transformer%20is%20a%20type%20of%20Estimator%20that%20implements,Note%3A%20Estimator%27s%20aren%27t%20used%20to%20predict%20values%20directly.). That's why the code works when `SVC()` is the last step in the pipeline and fails in the other case. — amiola, Jan 21 '22 at 17:06

score 1 · Accepted Answer · answered Jan 21 '22 at 17:06

1

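The order of the steps in a Pipeline matters, and only the last step can be a non-transformer like your svc. In your code, put the selector first and the classifier last: `Pipeline(steps=[('sel', fs), ('svc', clf)])`.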

answered Jan 21 '22 at 17:06

Ben Reiniger

10,517
3
16
29

All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough'

1 Answer