0

I'm trying to use CalibratedClassifierCV in scikit-learn after finding the best parameters for a Pipeline, using the following code.

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Text-classification pipeline: raw documents -> token counts -> tf-idf
# weighting -> LinearSVC classifier.
pipeline = Pipeline([
    # The token pattern matches any run of one or more word characters.
    ('vect', CountVectorizer(token_pattern=r'(?u)\b\w+\b')),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

# Candidate values for the grid search. Keys follow the
# '<step name>__<parameter name>' form, so each entry targets one step of
# the pipeline defined above.
parameters = {
    'vect__max_features': (1000, ),
    'vect__max_df': (0.75, 1.0),
    'vect__min_df': (1, 5),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__binary': (True, False),
    'tfidf__use_idf': (True, False),
    'clf__class_weight': (None, 'balanced'),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
# Three newsgroup categories, with headers, footers and quoted replies
# stripped from each post.
data = fetch_20newsgroups(
    categories=['alt.atheism', 'talk.religion.misc', 'sci.med'],
    remove=['headers', 'footers', 'quotes']
)

# Search over the grid using the raw documents (data.data) and their
# labels (data.target).
grid_search.fit(data.data, data.target)

# Parameters of the best pipeline found by the search.
best_parameters = grid_search.best_estimator_.get_params()

# Copy those parameters onto the original pipeline object
# (dict(...items()) just makes a shallow copy of the mapping).
pipeline.set_params(**dict(best_parameters.items()))
# Wrap the tuned pipeline in a sigmoid calibrator and fit it on the same raw
# documents. This final fit is the call that raises the ValueError reported
# below; per the quoted traceback, it comes from check_X_y inside
# CalibratedClassifierCV.fit.
# NOTE(review): presumably X is validated as an array before the pipeline's
# vectorizer ever sees the raw strings -- confirm against the installed
# scikit-learn version.
model = CalibratedClassifierCV(base_estimator=pipeline, method='sigmoid')
model = model.fit(data.data, data.target)

This fails on the last fit with: ValueError: Found input variables with inconsistent numbers of samples: [1, 1451].

Looking at the documentation, I don't see why this shouldn't work. I've tried reshaping the array, but that fails because the Pipeline expects a string sample as input.

I'm using scikit-learn 0.18, but I had the same problem with 0.17.1.

Full trace below.

.../lib/python3.5/site-packages/sklearn/calibration.py in fit(self, X, y, sample_weight)
    123         """
    124         X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'],
--> 125                          force_all_finite=False)
    126         X, y = indexable(X, y)
    127         lb = LabelBinarizer().fit(y)

.../lib/python3.5/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    529         y = y.astype(np.float64)
    530
--> 531     check_consistent_length(X, y)
    532
    533     return X, y

.../lib/python3.5/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
    179     if len(uniques) > 1:
    180         raise ValueError("Found input variables with inconsistent numbers of"
--> 181                          " samples: %r" % [int(l) for l in lengths])
    182
    183
hpaulj
  • 221,503
  • 14
  • 230
  • 353
vierja
  • 436
  • 6
  • 10
  • Same `fit` error in recent http://stackoverflow.com/questions/39950021/found-arrays-with-inconsistent-numbers-of-samples – hpaulj Oct 10 '16 at 07:10
  • See http://stackoverflow.com/questions/30813044/sklearn-found-arrays-with-inconsistent-numbers-of-samples-when-calling-linearre – hpaulj Oct 10 '16 at 07:18

0 Answers0