I have an imbalanced dataset, and I applied RandomOverSampler to get a balanced dataset:
from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)
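As a quick sanity check on the resampling (a minimal sketch; X and y are my original features and labels), I print the class distribution before and after:

from collections import Counter

print(Counter(y))       # original, imbalanced distribution
print(Counter(y_over))  # minority class is now oversampled to match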
Afterwards, I followed the RandomForest implementation for feature selection from this Kaggle post:
https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial (go to the bottom of the page and you will see a similar implementation).
I have a real dataset similar to Titanic :) and I'm trying to get feature importances out of it!
The problem I'm having is that even though the classifier accuracy is very high (~0.99), the feature importances I'm getting are only on the order of ~0.1. What would be causing that, or is it OK?
Here is the code I'm using, similar to the one in the link above (again, bottom of the page):
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

SEED = 42
N = 15

classifiers = [RandomForestClassifier(random_state=SEED,
                                      criterion='gini',
                                      n_estimators=20,
                                      bootstrap=True,
                                      max_depth=5,
                                      n_jobs=-1)]
               #DecisionTreeClassifier(),
               #LogisticRegression(),
               #KNeighborsClassifier(),
               #GradientBoostingClassifier(),
               #SVC(probability=True), GaussianNB()]

log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

skf = StratifiedKFold(n_splits=N, random_state=None, shuffle=True)

# data is the original DataFrame whose columns name the features
importances = pd.DataFrame(np.zeros((X.shape[1], N)),
                           columns=['Fold_{}'.format(i) for i in range(1, N + 1)],
                           index=data.columns)
acc_dict = {}

for fold, (train_index, test_index) in enumerate(skf.split(X_over, y_over)):
    X_train, X_test = X_over[train_index], X_over[test_index]
    y_train, y_test = y_over[train_index], y_over[test_index]
    for clf in classifiers:
        #pipe1 = make_pipeline(sampling, clf)
        print(clf)
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        test_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, test_predictions)
        if 'Random' in name:
            # enumerate() starts at 0, so index with fold directly;
            # fold - 1 would put the first fold into the last column
            importances.iloc[:, fold] = clf.feature_importances_
        if name in acc_dict:
            acc_dict[name] += acc
        else:
            acc_dict[name] = acc
        # doing grid search for the best input parameters for RF
        #CV_rfc = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
        #CV_rfc.fit(X_train, y_train)

for clf in acc_dict:
    acc_dict[clf] = acc_dict[clf] / N  # average over all N folds, not a hard-coded 10
    log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
    log = pd.concat([log, log_entry], ignore_index=True)
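After the loop, I average the per-fold importances and rank the features, following the same Kaggle post (a minimal sketch; `importances` is the DataFrame built above, and scikit-learn normalizes feature_importances_ to sum to 1 across all features):

importances['Mean_Importance'] = importances.mean(axis=1)
importances.sort_values(by='Mean_Importance', inplace=True, ascending=False)
print(importances['Mean_Importance'].head(10))  # top-ranked features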
I'm getting almost the same feature importance value for every feature; the best is ~0.1.

EDIT

Following the confusion-matrix check suggested by @AlexSerraMarrugat:
Test: 0.9926166568222091
Train: 0.9999704661911724
EDIT2

Tried RandomOverSampler on the training split afterwards:
from collections import Counter
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix

oversample = RandomOverSampler(sampling_strategy='minority')
x_over, y_over = oversample.fit_resample(X_train, Y_train)

# summarize the class distribution after resampling
print(Counter(y_over))
print(len(x_over))

# creating the confusion matrix; x_test and y_test come from the earlier train/test split
clf = RandomForestClassifier(random_state=0)  # change the hyperparameters here
clf.fit(x_over, y_over)
predict_y = clf.predict(x_test)
plot_confusion_matrix(clf, x_test, y_test, cmap=plt.cm.Blues)
print("Test: ", clf.score(x_test, y_test))
print("Train: ", clf.score(x_over, y_over))
Test: 0.9926757235676315
Train: 1.0
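Alongside the confusion matrix, per-class precision and recall are worth checking too, since overall accuracy can hide minority-class mistakes (a minimal sketch reusing clf and the predictions from above):

from sklearn.metrics import classification_report

print(classification_report(y_test, predict_y))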
EDIT3

Confusion matrix for the train data:
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(clf, X_train, Y_train, cmap=plt.cm.Blues)
print("Train: ", clf.score(X_train, Y_train))