There are many possible ways to do it.
For example, a pipeline is the most common and popular approach to use with a dataset.
However, you want to change a hyper-parameter of the train_test_split function, and a pipeline is usually applied after the train-test split. There is still a way to do it.
Suggestion :
- Instead of changing the 'test_size' hyper-parameter of the train_test_split function, use grid-search cross-validation (RandomizedSearchCV is also a good option).
- Changing the 'random_state' hyper-parameter sometimes gives good results.
Code:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
def classify(model, X, y):
    """Evaluate a hyper-parameter search object over several test-set sizes.

    For each test_size in np.arange(0.1, 0.5, 0.1) the data is split with a
    fixed random_state, ``model`` is fitted on the training part, and the
    accuracy of its best estimator on the held-out part is recorded.

    Args:
        model: A search object exposing ``fit`` and ``best_estimator_``
            (e.g. GridSearchCV or RandomizedSearchCV with refit enabled).
        X: Feature matrix accepted by ``train_test_split``.
        y: Target values aligned with ``X``.

    Returns:
        A list with one accuracy score per test_size, in the order tried.
    """
    best_score = []
    for test_size in np.arange(0.1, 0.5, 0.1):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=21167
        )
        model.fit(X_train, y_train)
        # best_estimator_ has already been refitted on the full training
        # split by the search, so no second fit is needed here.
        best_model = model.best_estimator_
        y_pred = best_model.predict(X_test)
        # Compare true labels with predictions; estimator.score() expects
        # (X, y), not (y_true, y_pred).
        score = accuracy_score(y_test, y_pred)
        best_score.append(score)
    return best_score
After executing the code above, execute the code below:
# Split the dataset into features and target. `dataset` must already be
# loaded as a pandas DataFrame before this part runs.
X = dataset.drop(columns=["target_column_name"])
y = dataset["target_column_name"]

# The pipeline lists the operations in the order they are applied:
# standard scaling, then PCA, then the model. Add your own steps in the
# same sequence. The 'classifier' step is a placeholder that the grid below
# swaps out for each candidate model.
# NOTE(review): PCA(n_components=20) presumably needs at least 20 features
# (and enough training samples) — adjust to your data.
pipe = Pipeline(
    [
        ("standardscaling", StandardScaler()),
        ("pca", PCA(n_components=20)),
        ("classifier", LogisticRegression()),
    ]
)

# Example hyper-parameter grid: one dict per candidate classifier.
# NOTE(review): `base_estimator` (BaggingClassifier) and 'SAMME.R'
# (AdaBoostClassifier) are deprecated or renamed in newer scikit-learn
# releases — confirm against the installed version.
grid_param = [
    {
        "classifier": [LogisticRegression()],
        # Only 'l2' is searched because not every solver listed below
        # supports the L1 penalty (newton-cg and sag do not).
        "classifier__penalty": ["l2"],
        "classifier__C": np.logspace(0, 4, 10),
        "classifier__solver": ["newton-cg", "saga", "sag", "liblinear"],
    },
    {
        "classifier": [DecisionTreeClassifier()],
        "classifier__criterion": ["gini", "entropy"],
        "classifier__max_depth": [3, 4, 5],
        "classifier__min_samples_leaf": [3, 5, 7],
        "classifier__random_state": [0, 5, 10],
    },
    {
        "classifier": [RandomForestClassifier()],
        "classifier__criterion": ["gini", "entropy"],
        "classifier__n_estimators": [10, 100, 200],
        "classifier__max_depth": [5, 8, 15, None],
        "classifier__min_samples_leaf": [1, 2, 5, 10],
        "classifier__max_leaf_nodes": [2, 5, 10],
    },
    {
        "classifier": [AdaBoostClassifier()],
        "classifier__algorithm": ["SAMME", "SAMME.R"],
        "classifier__random_state": [0, 5, 10, 15, 20],
    },
    {
        "classifier": [
            BaggingClassifier(
                base_estimator=DecisionTreeClassifier(
                    max_depth=5, min_samples_leaf=5, random_state=0
                )
            )
        ],
        "classifier__n_estimators": [20, 30, 40],
        "classifier__bootstrap_features": [True, False],
        "classifier__oob_score": [True, False],
        "classifier__random_state": [0, 5, 10],
    },
    {
        "classifier": [GradientBoostingClassifier()],
        "classifier__min_samples_leaf": [3, 5],
        # 'auto' was dropped: for classifiers it is the same as 'sqrt', and
        # recent scikit-learn versions no longer accept it.
        "classifier__max_features": ["sqrt", "log2"],
        "classifier__random_state": [10, 15, 20],
    },
]

# Build the grid search; the actual fitting happens inside classify().
gridsearch = GridSearchCV(pipe, grid_param, cv=4, verbose=0, n_jobs=-1)
best_score_list = classify(gridsearch, X, y)

# Position of the best score; it identifies which test_size value tried by
# classify() performed best (the same approach works if you vary
# random_state instead of test_size).
index = best_score_list.index(max(best_score_list))
best_score_list[index]
- To learn more, ask questions.
- To move one step ahead, think different.
- Don't copy paste it. Understand it. Do it in your own way.
- For any queries, contact 'kumartyagisumit@gmail.com'.