import pandas as pd
new_data = pd.DataFrame({'at': [15967, 290.865, 307.329, 902.444, 700.898, 800, 850, 900, 1000, 5000, 10000, 5000, 30000, 90000, 200000, 10000, 5000, 30000, 90000, 200000],
'cogs': [26094.000, 246.466, 325.912, 124.903, 1044.110, 800, 850, 900, 1000, 5000, 10000, 5000, 30000, 90000, 200000, 10000, 5000, 30000, 90000, 200000],
'division': ['Retail Trade', 'Services', 'Manufacturing', 'Services', 'Manufacturing', 'Retail Trade', 'Services', 'Manufacturing', 'Services', 'Manufacturing', 'Retail Trade', 'Services', 'Manufacturing', 'Services', 'Manufacturing', 'Retail Trade', 'Services', 'Manufacturing', 'Services', 'Manufacturing'],
'bankrupt': [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
})
I have a dataset with 22 columns (created a sample dataset above).The target is "bankrupt", remaining columns are feature. I will like to create a pipeline to onehotencoder the categorical variable "division". For the remaining feature columns, i will like to conduct standardscaler and minmax via gridsearch to find out the optimal result.
#library
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline #i used imblearn pipeline as I will like to do SMOTE later on
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
categorical_features = new_data['division']
categorical_features.head()
numerical_features = new_data.drop(columns=['division','bankrupt'])
numerical_features.head()
cat_preprocessor = Pipeline(steps=[
('oh', OneHotEncoder(handle_unknown='ignore'))
])
num_preprocessor = Pipeline(steps=[
('ss', StandardScaler())
])
preprocessor = ColumnTransformer(transformers=[
('cat', cat_preprocessor, categorical_features),
('num', num_preprocessor, numerical_features)
])
model = Pipeline(steps=[
('prep', preprocessor)
])
param_grid = {
'prep__num__ss': [StandardScaler(), MinMaxScaler()]
}
gs = GridSearchCV(
estimator=model,
param_grid=param_grid,
scoring='roc_auc',
n_jobs=-1,
cv=2
)
#Split the dataset into training set and test set
X = new_data.drop(columns=['bankrupt'])
Y = new_data['bankrupt']
X_train, X_test, y_train, y_test = train_test_split(X,
Y, test_size=0.2,
random_state=2021, stratify=Y)
gs.fit(X_train)
Error message
KeyError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3079 try:
-> 3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Retail Trade'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\anaconda3\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
395 for col in columns:
--> 396 col_idx = all_columns.get_loc(col)
397 if not isinstance(col_idx, numbers.Integral):
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3081 except KeyError as err:
-> 3082 raise KeyError(key) from err
3083
KeyError: 'Retail Trade'
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-10-82a9838329a1> in <module>
----> 1 gs.fit(X_train)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
880 self.best_estimator_.fit(X, y, **fit_params)
881 else:
--> 882 self.best_estimator_.fit(X, **fit_params)
883 refit_end_time = time.time()
884 self.refit_time_ = refit_end_time - refit_start_time
~\anaconda3\lib\site-packages\imblearn\pipeline.py in fit(self, X, y, **fit_params)
264 if self._final_estimator != "passthrough":
265 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 266 self._final_estimator.fit(Xt, yt, **fit_params_last_step)
267 return self
268
~\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit(self, X, y)
469 # we use fit_transform to make sure to set sparse_output_ (for which we
470 # need the transformed data) to have consistent output type in predict
--> 471 self.fit_transform(X, y=y)
472 return self
473
~\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
504 self._validate_transformers()
505 self._validate_column_callables(X)
--> 506 self._validate_remainder(X)
507
508 result = self._fit_transform(X, y, _fit_transform_one)
~\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _validate_remainder(self, X)
330 cols = []
331 for columns in self._columns:
--> 332 cols.extend(_get_column_indices(X, columns))
333
334 remaining_idx = sorted(set(range(self._n_features)) - set(cols))
~\anaconda3\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
403 raise ValueError(
404 "A given column is not a column of the dataframe"
--> 405 ) from e
406
407 return column_indices
ValueError: A given column is not a column of the dataframe
I checked all columns are present in the dataframe. Appreciate all help.