
I keep getting an error about a missing attribute, even though the same code worked 20 minutes ago. I am not sure what could be going wrong; when I set the code up in a separate notebook it ran, and GridSearchCV went through smoothly. Do I need to update Scikit-Learn? I have posted the entire code because I believe it is essential in case small details are missing. Any help is appreciated.

import pandas as pd
train_data = pd.read_csv("~/Desktop/Personal/Data/train.csv")
test_features = pd.read_csv("~/Desktop/Personal/Data/test.csv")
test_survived = pd.read_csv("~/Desktop/Personal/Data/gender_submission.csv")

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

def data_process(data):
    data = data.drop("Cabin", 1)
    data = data.drop("Embarked", 1)
    data = data.drop("Ticket",1)
    data = data.drop("Name", 1)
    data = data.drop("PassengerId", 1)
    data["Sex"] = LabelEncoder().fit_transform(data["Sex"])

    numerical_attr = ["Age", "Pclass", "SibSp", "Parch", "Fare"]

    for attr in numerical_attr:
        data[attr].fillna(round(data[attr].mean(), 0), inplace=True)
    return data

train_data = data_process(train_data)

test_features = data_process(test_features).to_numpy()

test_survived = test_survived.drop("PassengerId", 1).to_numpy()

full_train_features = train_data.drop("Survived", 1).to_numpy()

full_train_survived = train_data.drop(["Age", "Pclass", "SibSp", "Parch", "Fare", "Sex"], 1).to_numpy().ravel()

train_set,test_set = train_test_split(train_data, test_size = 0.3, random_state = 1)

part_train_set_features = train_set.drop("Survived", 1).to_numpy()
part_train_set_survived = train_set.drop(["Age", "Pclass", "SibSp", "Parch", "Fare", "Sex"], 1).to_numpy().ravel()

val_set_features = test_set.drop("Survived", 1).to_numpy()
val_set_survived = test_set.drop(["Age", "Pclass", "SibSp", "Parch", "Fare", "Sex"], 1).to_numpy().ravel()

log_reg = LogisticRegression(solver = 'liblinear')

log_reg.fit(part_train_set_features, part_train_set_survived)

predict_log_reg_base = log_reg.predict(val_set_features)
accuracy_log_reg_base = accuracy_score(predict_log_reg_base, val_set_survived)
print(accuracy_log_reg_base)


fixed_range1 = range(1,21)
c_values = [i/10 for i in fixed_range1]

fixed_range2 = range(10,21)
max_iter_values = [i*10 for i in fixed_range2]

parameters_log_reg = {'C' : c_values, 'penalty' : ['l1', 'l2'], 'max_iter' : max_iter_values}

log_reg_best = GridSearchCV(LogisticRegression(solver = 'liblinear'), parameters_log_reg, return_train_score = True)

final_log_reg = log_reg_best.best_estimator_
  • Does this answer your question? [How to get Best Estimator on GridSearchCV (Random Forest Classifier Scikit)](https://stackoverflow.com/questions/30102973/how-to-get-best-estimator-on-gridsearchcv-random-forest-classifier-scikit) – Ben Reiniger Jun 03 '20 at 21:09

1 Answer


You need to fit it first; best_estimator_ is only set after fit() has been called (and only when refit=True, which is the default):

# define
log_reg_best = GridSearchCV(LogisticRegression(solver = 'liblinear'), parameters_log_reg, return_train_score = True)
# fit
log_reg_best.fit(part_train_set_features, part_train_set_survived)
# get best estimator
final_log_reg = log_reg_best.best_estimator_
desertnaut
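
Once the grid search has been fitted, best_params_, best_score_, and best_estimator_ are all available, and the best model can be evaluated on the held-out validation split. A minimal sketch reusing the variable names from the question (it assumes the question's preprocessing and train/test split have already been run):

# fit the grid search on the training split (this populates best_estimator_)
log_reg_best.fit(part_train_set_features, part_train_set_survived)

# inspect the winning hyperparameters and their mean cross-validated score
print(log_reg_best.best_params_)
print(log_reg_best.best_score_)

# evaluate the best estimator on the validation split
final_log_reg = log_reg_best.best_estimator_
predict_log_reg_best = final_log_reg.predict(val_set_features)
print(accuracy_score(val_set_survived, predict_log_reg_best))

Note that with refit=True the best estimator is already refit on the full training data passed to fit, so no extra training step is needed before predicting.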