import pandas as pd
from sklearn.model_selection import train_test_split
# Load the CSV file (semicolon-separated).
df = pd.read_csv('coords.csv', sep=';')

# NOTE(review): read_csv already consumes the first line of the file as the
# header, so the `1:` row slice below also drops the first data row --
# confirm that this is intended.
x = df.iloc[1:, 1:]  # feature values: every column except the first
y = df.iloc[1:, 0]  # target values: the first column

# The labels are deliberately left as plain strings. Encoding them to bytes
# (y.apply(lambda y: y.encode())) makes scikit-learn's target check fail with
# "You appear to be using a legacy multi-label data representation".
# NOTE(review): the earlier "'str' object has no attribute 'decode'" error is
# raised inside scikit-learn's own optimizer-result check, not by the labels;
# it presumably needs a newer scikit-learn release -- confirm.
print(x)
print(y)

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1234
)
print(x_train)
print(y_train)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Candidate models keyed by a short name. Every estimator gets its own
# pipeline with a fresh StandardScaler in front of it.
pipelines = {
    name: make_pipeline(StandardScaler(), estimator_cls())
    for name, estimator_cls in (
        ('lr', LogisticRegression),
        ('rc', RidgeClassifier),
        ('rf', RandomForestClassifier),
        ('gb', GradientBoostingClassifier),
    )
}
# Fit every candidate pipeline on the training split and keep the result
# under the same short name used in `pipelines`.
fit_models = {}
for algo, pipeline in pipelines.items():
    model = pipeline.fit(x_train, y_train)
    fit_models[algo] = model
# Show the fitted pipelines, then each model's predictions on the held-out
# test split, in a fixed order.
print(fit_models)
for algo in ('lr', 'rc', 'rf', 'gb'):
    print(fit_models[algo].predict(x_test))
I am having a problem when trying to load strings from a CSV file. When I fit the models, I get this error:
Traceback (most recent call last):
File "3_Train_Custom_Model_Using_Scikit_Learn.py", line 99, in <module>
model = pipeline.fit(x_train, y_train)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\utils\optimize.py", line 243, in _check_optimize_result
).format(solver, result.status, result.message.decode("latin1"))
AttributeError: 'str' object has no attribute 'decode'
When I add `y = y.apply(lambda y: y.encode())`, because I thought I needed to convert the strings to bytes, I get this error instead:
Traceback (most recent call last):
File "3_Train_Custom_Model_Using_Scikit_Learn.py", line 99, in <module>
model = pipeline.fit(x_train, y_train)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 335, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1345, in fit
check_classification_targets(y)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 169, in check_classification_targets
y_type = type_of_target(y)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 263, in type_of_target
raise ValueError('You appear to be using a legacy multi-label data'
ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.
How do I load the CSV so that the data framed in red in the following Excel screenshot (the targets) is stored in the variable `y`, and the data framed in blue (the features x1, y1, z1, v1, x2, y2, z2, v2, ..., x501, y501, z501, v501) is stored in the variable `x`?