I am trying to vectorize the text column and then standardize the numeric column. Following are the python scripts:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
numeric_features = ["sklearn_cosine_similarity"] # numeric feature
categorical_features = ["clean_text"] # text feature
categorical_transformer = CountVectorizer()
preprocess = make_column_transformer((CountVectorizer(), categorical_features),(StandardScaler(), numeric_features))
pipeline = Pipeline(
[
("vect", preprocess),
("knnr", KNeighborsRegressor())
]
)
parameters = {
"vect__countvectorizer__max_features" : [None, 50,100],
"vect__countvectorizer__ngram_range" : [(1, 1)], # unigrams
"knnr__n_neighbors" : [3,4,5],
"knnr__weights" :['uniform', 'distance'],
"knnr__leaf_size" :[20],
'knnr__metric' : ['euclidean']
}
grid_search = GridSearchCV(pipeline, parameters, verbose=4,scoring='neg_root_mean_squared_error',cv=3)
grid_search.fit(X_train[['clean_text','sklearn_cosine_similarity']].values, y_train.values)
type(X_train[['clean_text','sklearn_cosine_similarity']]) is pandas.core.frame.DataFrame
type(y_train) is pandas.core.frame.DataFrame
Getting following errors while running the fit,
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=uniform, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=uniform, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=uniform, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=uniform, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=uniform, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=uniform, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=uniform, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=uniform, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=uniform, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=distance, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=distance, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=distance, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=distance, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=distance, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=distance, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=distance, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=distance, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=3, knnr__weights=distance, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=uniform, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=uniform, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=uniform, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=uniform, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=uniform, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=uniform, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=uniform, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=uniform, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=uniform, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=distance, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=distance, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=distance, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=distance, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=distance, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=distance, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=distance, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=distance, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=4, knnr__weights=distance, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=uniform, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=uniform, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=uniform, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=uniform, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=uniform, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=uniform, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=uniform, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=uniform, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=uniform, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=distance, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=distance, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=distance, vect__countvectorizer__max_features=None, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=distance, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=distance, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=distance, vect__countvectorizer__max_features=50, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 1/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=distance, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 2/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=distance, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
[CV 3/3] END knnr__leaf_size=20, knnr__metric=euclidean, knnr__n_neighbors=5, knnr__weights=distance, vect__countvectorizer__max_features=100, vect__countvectorizer__ngram_range=(1, 1); total time= 0.0s
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\anaconda3\envs\deepai\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
373 try:
--> 374 all_columns = X.columns
375 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_9996/3908756850.py in <module>
1 from time import time
2 t0 = time()
----> 3 grid_search.fit(X_train[['clean_text','sklearn_cosine_similarity']].values, y_train.values)
4 print("done in %0.3fs" % (time() - t0))
~\anaconda3\envs\deepai\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\envs\deepai\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
878 refit_start_time = time.time()
879 if y is not None:
--> 880 self.best_estimator_.fit(X, y, **fit_params)
881 else:
882 self.best_estimator_.fit(X, **fit_params)
~\anaconda3\envs\deepai\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~\anaconda3\envs\deepai\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~\anaconda3\envs\deepai\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
~\anaconda3\envs\deepai\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~\anaconda3\envs\deepai\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
504 self._validate_transformers()
505 self._validate_column_callables(X)
--> 506 self._validate_remainder(X)
507
508 result = self._fit_transform(X, y, _fit_transform_one)
~\anaconda3\envs\deepai\lib\site-packages\sklearn\compose\_column_transformer.py in _validate_remainder(self, X)
330 cols = []
331 for columns in self._columns:
--> 332 cols.extend(_get_column_indices(X, columns))
333
334 remaining_idx = sorted(set(range(self._n_features)) - set(cols))
~\anaconda3\envs\deepai\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
374 all_columns = X.columns
375 except AttributeError:
--> 376 raise ValueError("Specifying the columns using strings is only "
377 "supported for pandas DataFrames")
378 if isinstance(key, str):
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
if more details required then please let me know.
Please do not delete this question as I have not found any similar question. if duplicate question is there then please refer the link.