This is not a fix-my-code question (the code runs, just not on the data I am using). It is a technical enquiry that relates directly to the sklearn fit() call.
Module call below.
from sklearn import svm
svm.SVC().fit(X, y)
The code should fit the data (links provided below), resulting in a plot like the image below:
I am looking for an expert opinion on a performance or bug issue with the code below. Please note the code itself runs; the problem is the data I am running through it. When I pass in log-transformed, t-SNE-reduced data, the model just will not fit: I have run it for hours, and it should complete in seconds.
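To see whether the solver is making any progress at all, the fit can be capped and made verbose (a minimal sketch; verbose and max_iter are standard SVC parameters, the 100000 cap is an arbitrary value I picked, and X, y are the arrays loaded in the script below):
from sklearn import svm
# cap the optimizer and print libsvm's progress so a non-converging fit
# stops quickly instead of running for hours (the default max_iter=-1
# means no limit; 100000 is an arbitrary cap for diagnosis)
clf = svm.SVC(kernel='linear', probability=True, verbose=True, max_iter=100000)
clf.fit(X, y)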
Things I have tried:
- Waiting hours for it to complete
- Spinning up a box and paying Google for a beastly machine
I am beginning to think that this may have something to do with all the floats being so small. However, the float32 cast should make this OK. I would really appreciate any advice or ideas.
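To make the small-floats hypothesis concrete, the value ranges can be inspected, and the features standardized first, which is the usual remedy when an SVM is slow to converge (a minimal sketch; StandardScaler is the standard sklearn scaler, and X is assumed to be the array loaded in the script below):
import numpy as np
from sklearn.preprocessing import StandardScaler
print(X.dtype, X.shape)
print(np.abs(X).min(), np.abs(X).max())  # how small are the values really?
X_scaled = StandardScaler().fit_transform(X)  # zero mean, unit variance per feature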
The data I am using is available here (X)
The corresponding y values can be grabbed here
print ("start")
import matplotlib.pyplot as plt
from sklearn import svm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn import cross_validation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import manifold, datasets, decomposition, discriminant_analysis
def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
print('read packages')
######################################## Log data ########################################
# df = pd.read_csv('logValuesOfWineData.csv').dropna().astype(np.float32)
# y = df['qualityBand'].values.astype(int)
# y = y.values
# subdf = df[[ 'Logfixed_acidity', 'Logvolatile_acidity','Logcitric_acid', 'Logresidual_sugar', 'Logchlorides',
# 'Logfree_sulfur_dioxide', 'Logtotal_sulfur_dioxide', 'Logdensity',
# 'Logsulphates', 'Logalcohol']]
# y = df['qualityBand'].map({1: 1, 2:2, 3:3})
# # removed free sulfur and residual sugar; volatile acidity looks too normal
# # subdf = df[[ 'Logdensity','Logalcohol']]
# X = subdf.values
######################################## normal data ########################################
# names = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
# 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
# 'pH', 'sulphates', 'alcohol']
df = pd.read_csv('winequalityN.csv').dropna().astype(np.float32)
subdf = df[['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
'pH', 'sulphates', 'alcohol']]
X = subdf.values
print('read data')
################################## PCA ######################################################
# X = subdf.values
# print ('about to PCA')
# X_pca = decomposition.PCA(n_components=2).fit_transform(X)
# X = X_pca
kk = pd.read_csv('test.csv').dropna().astype(np.float32)
X = kk.values
# kk = pd.read_csv('dfX_pca_Normal.csv').dropna().astype(np.float32)
# X = kk.values
print('loaded 2-D data')
################################## X_tsne ######################################################
# X_tsne = manifold.TSNE(n_components=2, init='pca').fit_transform(X)
# X = X_tsne
# kk = pd.read_csv('dfX_tsne_log.csv').dropna().astype(np.float32)
# X = kk.values
# kk = pd.read_csv('dfX_tsne_Normal.csv').dropna().astype(np.float32)
# X = kk.values
##############################################################################################
y = df['qualityBand'].map({1: 0, 2:0, 3:1})
y = y.values
print('started')
# hold out a test set for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)
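# (stratify=y is a standard train_test_split option that would preserve
# the class balance in both splits; not used here)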
# # we create 40 separable points
# np.random.seed(0)
# X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
# Y = [0] * 20 + [1] * 20
# fit the model
# Model 1
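# note: probability=True makes fit() considerably more expensive, since
# sklearn runs an internal 5-fold cross-validation (Platt scaling) on top
# of the base SVM training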
clf = svm.SVC(kernel='linear', probability=True)
print('about to fit')
clf.fit(X_train, y_train)  # fit on the training split so X_test stays unseen
print('model fit')
# get the separating hyperplane
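# for a linear kernel the boundary is w[0]*x0 + w[1]*x1 + intercept = 0,
# i.e. x1 = -(w[0]/w[1]) * x0 - intercept/w[1]; a below is that slope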
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]
# plot the parallels to the separating hyperplane that pass through the
# support vectors
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])
# plot the line, the points, and the nearest vectors to the plane
plt.plot(xx, yy, 'k-')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=80, facecolors='none', edgecolors='k')
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
class_names = ['0','1']
plt.ylabel('PCA 2')
plt.xlabel('PCA 1')
plt.title('Transformed Support Vector Machine {1: 0, 2:0, 3:1}')
plt.axis('tight')
plt.show()
probas = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
import scikitplot as skplt
skplt.metrics.plot_roc_curve(y_test, probas)
# plt.show()
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
title='SVM Line: Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
title='SVM Line: Normalized confusion matrix')
plt.show()
print("Accuracy", metrics.accuracy_score(y_test, y_pred))
print('finito')