I have 9164 points, where 4303 are labeled as the class I want to predict and 4861 are labeled as not that class. There are no duplicate points.
Following How to split into train, test and evaluation sets in sklearn?, and since every point in my dataset is a 3-item tuple (id, vector, label), I do:
import numpy as np
import pandas as pd
from sklearn.svm import SVC

df = pd.DataFrame(dataset)
# shuffle, then split 60/20/20 into train/validate/test
train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
train_labels = construct_labels(train)
train_data = construct_data(train)
test_labels = construct_labels(test)
test_data = construct_data(test)
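(For what it's worth, scikit-learn can do an equivalent 60/20/20 split directly; this is just a sketch of the alternative, where stratify keeps the class ratio the same in every split:)

from sklearn.model_selection import train_test_split

# carve off 20% for the test set, stratified on the label column (column 2)
train_val, test = train_test_split(df, test_size=0.2, stratify=df[2], random_state=0)
# split the remaining 80% into 60/20 of the original data
train, validate = train_test_split(train_val, test_size=0.25, stratify=train_val[2], random_state=0)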
def predict_labels(test_data, classifier):
    labels = []
    for test_d in test_data:
        labels.append(classifier.predict([test_d]))
    return np.array(labels)
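(Side note: classifier.predict accepts the whole 2-D array at once, so the loop above can be collapsed into one call; a minimal sketch:)

predicted_labels = classifier.predict(test_data)  # shape (n_samples,), one label per row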
def construct_labels(df):
    labels = []
    for index, row in df.iterrows():
        if row[2] == 'Trump':
            labels.append('Trump')
        else:
            labels.append('Not Trump')
    return np.array(labels)
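(The same labels can also be built without the explicit loop; a sketch with numpy, assuming column 2 holds the raw label as above:)

labels = np.where(df[2] == 'Trump', 'Trump', 'Not Trump')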
def construct_data(df):
    # seed the matrix with the first row's vector, then stack every other row
    first_row = df.iloc[0]
    data = np.array([first_row[1]])
    for index, row in df.iterrows():
        if first_row[0] != row[0]:  # ids are unique, so this only skips the seed row
            data = np.concatenate((data, np.array([row[1]])), axis=0)
    return data
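(Repeatedly concatenating copies the array on every iteration; the whole matrix can be stacked in one go instead. A sketch, assuming column 1 holds the 256-dimensional vectors:)

data = np.vstack(df[1].tolist())  # shape (n_samples, 256)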
and then:
>>> classifier = SVC(verbose=True)
>>> classifier.fit(train_data, train_labels)
[LibSVM].......*..*
optimization finished, #iter = 9565
obj = -2718.376533, rho = 0.132062
nSV = 5497, nBSV = 2550
Total nSV = 5497
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=True)
>>> predicted_labels = predict_labels(test_data, classifier)
>>> correct = 0
>>> for p, t in zip(predicted_labels, test_labels):
...     if p == t:
...         correct = correct + 1
and I get only 943 correct labels out of 1833 (= len(test_labels)) -> (943*100/1833 ≈ 51.4%)
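(For reference, scikit-learn can compute the same number directly; a sketch:)

from sklearn.metrics import accuracy_score
print(accuracy_score(test_labels, classifier.predict(test_data)))  # ~0.514 here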
I suspect I am missing something big here. Maybe I should set some parameter on the classifier to do more refined work, or something?
Note: First time using SVMs here, so anything you might take for granted, I might not even have imagined...
Attempt:
I went ahead and decreased the number of negative examples to 4303 (the same as the number of positive examples). This slightly improved accuracy.
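(An alternative to throwing data away would be to keep all 9164 points and let the SVM reweight the classes itself; a sketch, assuming the mild imbalance is the issue at all:)

classifier = SVC(class_weight='balanced')  # class weights inversely proportional to class frequencies
classifier.fit(train_data, train_labels)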
Edit after the answer:
>>> print(clf.best_estimator_)
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
>>> classifier = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
... decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
... max_iter=-1, probability=False, random_state=None, shrinking=True,
... tol=0.001, verbose=False)
>>> classifier.fit(train_data, train_labels)
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
Also I tried clf.fit(train_data, train_labels), which performed the same.
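(For context, clf above is presumably a grid-search object, since best_estimator_ is a GridSearchCV attribute; I am not showing the exact call, but it was roughly of this shape, with the parameter grid here being my guess:)

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 10, 100, 1000], 'gamma': [1e-3, 1e-4]}  # hypothetical grid
clf = GridSearchCV(SVC(class_weight='balanced'), param_grid, cv=5)
clf.fit(train_data, train_labels)
print(clf.best_estimator_)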
Edit with data (the data are not random):
>>> train_data[0]
array([ 20.21062112, 27.924016 , 137.13815308, 130.97432804,
... # there are 256 coordinates in total
67.76352596, 56.67798138, 104.89566517, 10.02616417])
>>> train_labels[0]
'Not Trump'
>>> train_labels[1]
'Trump'
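(One more observation: the coordinates above span quite different ranges, and RBF SVMs are known to be sensitive to feature scale, so standardizing might matter; this sketch is an assumption on my part, not something from the answer:)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(train_data)  # fit the scaler on training data only
train_scaled = scaler.transform(train_data)
test_scaled = scaler.transform(test_data)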