Can you please tell me how to calculate distance between every point in my testData properly.
For now I am getting only one single value, whereas I should get distance from each point in data set and be able to assign it a class. I have to use numpy for this.
======================================================================== Now the problem is that I am getting this error and don't know how to fix it.
KeyError: 0
I am trying to obtain accuracy of classified labels. Any ideas, please?
import matplotlib.pyplot as plt
import random
import numpy as np
import operator
from sklearn.cross_validation import train_test_split
# In[1]
def readFile():
f = open('iris.data', 'r')
d = np.dtype([ ('features',np.float,(4,)),('class',np.str_,20)])
data = np.genfromtxt(f, dtype = d ,delimiter=",")
dataPoints = data['features']
labels = data['class']
return dataPoints, labels
# In[2]
def normalizeData(dataPoints):
#normalize the data so the values will be between 0 and 1
dataPointsNorm = (dataPoints - dataPoints.min())/(dataPoints.max() - dataPoints.min())
return dataPointsNorm
def crossVal(dataPointsNorm):
# spliting for train and test set for crossvalidation
trainData, testData = train_test_split(dataPointsNorm, test_size=0.20, random_state=25)
return trainData, testData
def calculateDistance(trainData, testData):
#Euclidean distance calculation on numpy arrays
distance = np.sqrt(np.sum((trainData - testData)**2, axis=-1))
# Argsort sorts indices from closest to furthest neighbor, in ascending order
sortDistance = distance.argsort()
return distance, sortDistance
# In[4]
def classifyKnn(testData, trainData, labels, k):
# Calculating nearest neighbours and based on majority vote assigning the class
classCount = {}
for i in range(k):
distance, sortedDistIndices = calculateDistance(trainData, testData[i])
voteLabel = labels[sortedDistIndices][i]
#print voteLabel
classCount[voteLabel] = classCount.get(voteLabel,0)+1
print 'Class Count: ', classCount
# Sorting dictionary to return voted class
sortedClassCount = sorted(classCount.iteritems(), key = operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0], classCount
def testAccuracy(testData, classCount):
correct = 0
for x in range(len(testData)):
print 'HERE !!!!!!!!!!!!!!'
if testData[x][-1] is classCount[x]:
correct += 1
return (correct/float(len(testData))) * 100.0
def main():
dataPoints, labels = readFile()
dataPointsNorm = normalizeData(dataPoints)
trainData, testData = crossVal(dataPointsNorm)
result, classCount = classifyKnn(testData, trainData, labels, 5)
print result
accuracy = testAccuracy(testData, classCount)
print accuracy
main()
I have it normalized, split into train and test calc distance (wrong).
Thanks for any tips.