I'm trying to classify a text to a 6 different classes. Since I'm having an imbalanced dataset, I'm also using SMOTETomek method that should synthetically balance the dataset with additional artificial samples.
I've noticed a huge score difference when applying it via pipeline vs 'Step by step" where the only difference is (I believe) the place I'm using train_test_split
Here are my features and labels:
for curr_features, label in self.training_data:
features.append(curr_features)
labels.append(label)
algorithms = [
linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None),
naive_bayes.MultinomialNB(),
naive_bayes.BernoulliNB(),
tree.DecisionTreeClassifier(max_depth=1000),
tree.ExtraTreeClassifier(),
ensemble.ExtraTreesClassifier(),
svm.LinearSVC(),
neighbors.NearestCentroid(),
ensemble.RandomForestClassifier(),
linear_model.RidgeClassifier(),
]
Using Pipeline:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
# Provide Report for all algorithms
score_dict = {}
for algorithm in algorithms:
model = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('smote', SMOTETomek()),
('classifier', algorithm)
])
model.fit(X_train, y_train)
# Score
score = model.score(X_test, y_test)
score_dict[model] = int(score * 100)
sorted_score_dict = {k: v for k, v in sorted(score_dict.items(), key=lambda item: item[1])}
for classifier, score in sorted_score_dict.items():
print(f'{classifier.__class__.__name__}: score is {score}%')
Using Step by Step:
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
cv = vectorizer.fit_transform(features)
text_tf = transformer.fit_transform(cv).toarray()
smt = SMOTETomek()
X_smt, y_smt = smt.fit_resample(text_tf, labels)
X_train, X_test, y_train, y_test = train_test_split(X_smt, y_smt, test_size=0.2, random_state=0)
self.test_classifiers(X_train, X_test, y_train, y_test, algorithms)
def test_classifiers(self, X_train, X_test, y_train, y_test, classifiers_list):
score_dict = {}
for model in classifiers_list:
model.fit(X_train, y_train)
# Score
score = model.score(X_test, y_test)
score_dict[model] = int(score * 100)
print()
print("SCORE:")
sorted_score_dict = {k: v for k, v in sorted(score_dict.items(), key=lambda item: item[1])}
for model, score in sorted_score_dict.items():
print(f'{model.__class__.__name__}: score is {score}%')
I'm getting (for the best classifier model) around 65% using pipeline vs 90% using step by step. Not sure what am I missing.