For the distance, you need to change your formula to:
def getDistance(x, y, x_i, y_i):
    """Return the Euclidean distance between (x, y) and (x_i, y_i).

    Args:
        x: x-coordinate of the data point.
        y: y-coordinate of the data point.
        x_i: x-coordinate of a point on the curve.
        y_i: y-coordinate of a point on the curve.

    Returns:
        The straight-line distance between the two points as a float.
    """
    # Imported locally so this helper does not depend on a module-level import.
    import math

    # In Python ``^`` is bitwise XOR, not exponentiation, so squaring with it
    # is wrong (and fails outright on floats). math.hypot computes
    # sqrt(dx**2 + dy**2) directly.
    return math.hypot(x_i - x, y_i - y)
with (x,y) being your datapoint and (x_i, y_i) being a point from the curve.
Consider using NumPy for vectorization. Explicitly looping through your data points will most likely be less efficient; depending on your use case, however, it might be quick enough. (If you need to run it on a regular basis, I think vectorization will easily outperform the explicit loop.) The explicit version could look something like this:
import numpy as np # Universal abbreviation for the module
# Three sample points to classify: a 3x2 array with one (x, y) pair per row,
# filled with random values.
datapoints = np.random.rand(3, 2)
# Three independent random contours; each is a 1000x2 array laid out like
# datapoints (one (x, y) pair per row).
contour1, contour2, contour3 = (np.random.rand(1000, 2) for _ in range(3))
def squareDistanceUnvectorized(datapoint, contour):
    """Return the mean Euclidean distance from one data point to a contour.

    Despite the name, the result is not squared: each point-to-point distance
    is passed through a square root before the distances are averaged.

    Args:
        datapoint: Indexable whose entries 0 and 1 are the x and y values.
        contour: 2-D NumPy array with one contour point per row; columns 0
            and 1 hold the x and y values.

    Returns:
        The sum of the distances between ``datapoint`` and every row of
        ``contour``, divided by the number of rows.

    Raises:
        ZeroDivisionError: If ``contour`` has no rows.
    """
    print("Using datapoint with values x:{}, y:{}".format(datapoint[0], datapoint[1]))
    # Number of rows, i.e. number of points that make up the contour.
    lengthOfContour = np.size(contour, 0)
    retVal = 0.
    # Deliberately an explicit Python loop: this is the unvectorized reference.
    for contourPoint in contour:
        squaredXDiff = np.square(contourPoint[0] - datapoint[0])
        squaredYDiff = np.square(contourPoint[1] - datapoint[1])
        retVal += np.sqrt(squaredXDiff + squaredYDiff)
    # Divide once, after the loop: we want the average over all contour points.
    return retVal / lengthOfContour
if __name__ == "__main__":
    # For every data point, compute its mean distance to each of the three
    # contours and report which contour is nearest.
    for currentDPID, datapoint in enumerate(datapoints):
        dist1 = squareDistanceUnvectorized(datapoint, contour1)
        dist2 = squareDistanceUnvectorized(datapoint, contour2)
        dist3 = squareDistanceUnvectorized(datapoint, contour3)
        # The closest contour is the one with the SMALLEST mean distance, so
        # the comparisons must use ``<`` (``>`` would pick the farthest one).
        if dist1 < dist2 and dist1 < dist3:
            contID = 1
        elif dist2 < dist1 and dist2 < dist3:
            contID = 2
        elif dist3 < dist1 and dist3 < dist2:
            contID = 3
        else:
            # No strict winner: at least two contours tie for the minimum.
            contID = 0
        if contID == 0:
            print("Datapoint {} is inbetween two contours".format(currentDPID))
        else:
            print("Datapoint {} is closest to contour {}".format(currentDPID, contID))
Okay, now moving on to vector-land.
I have taken the liberty to adjust this part to what I think is your dataset. Try it and let me know if it works.
import numpy as np
import pandas as pd
# First contour: 1000 two-dimensional points with random values between 0 and 1.
# The coordinates are turned into strings afterwards so that the DataFrames
# below hold string columns that must be converted back to floats.
random2Ddata1 = np.random.rand(1000, 2)
listOfX1 = [str(x) for x in random2Ddata1[:, 0]]
listOfY1 = [str(y) for y in random2Ddata1[:, 1]]
# Second contour: built the same way, but shifted by 255 units along the
# first dimension.
random2Ddata2 = np.random.rand(1000, 2) + [255, 0]
listOfX2 = [str(x) for x in random2Ddata2[:, 0]]
listOfY2 = [str(y) for y in random2Ddata2[:, 1]]
# The two 'contours' are now basically two blobs of points whose centers are
# approximately 255 units apart.

# A set of 4 data points, stored as a pandas DataFrame with string columns.
datapoints = {'X': ['0.5', '0', '255.5', '0'], 'Y': ['0.5', '0', '0.5', '-254.5']}
datapoints = pd.DataFrame(datapoints, columns=['X', 'Y'])
# The two contours get the same treatment.
contour1 = {'Xf': listOfX1, 'Yf': listOfY1}
contour1 = pd.DataFrame(contour1, columns=['Xf', 'Yf'])
contour2 = {'Xf': listOfX2, 'Yf': listOfY2}
contour2 = pd.DataFrame(contour2, columns=['Xf', 'Yf'])
# The 4 data points are:
# - (0.5, 0.5): where we expect the mean of the first contour to be, since
#   contour 1 consists of 1000 points with x and y values between 0 and 1.
# - (0, 0): the origin. Its distances should be similar to the ones of the
#   first data point.
# - (255.5, 0.5): the first data point shifted 255 units in the positive
#   first dimension, i.e. the expected center of contour 2.
# - (0, -254.5): a point roughly 255 units away from the first contour in the
#   negative second dimension.

# Transformation into NumPy arrays:
# - .values turns each DataFrame into a NumPy array.
# - .T transposes it, so that all x values are var[0, :] instead of var[:, 0]
#   (a personal preference, maybe).
# - .astype(float) converts the string entries into floats. The builtin
#   ``float`` is used because the ``np.float`` alias was removed from NumPy.
dpArray = datapoints.values.T.astype(float)
c1Array = contour1.values.T.astype(float)
c2Array = contour2.values.T.astype(float)
# Walk over every contour (with many contours, collect them in a list first)
# and, for each one, over every data point. Numbering is 1-based for printing.
for contourNumber, contourArray in enumerate((c1Array, c2Array), start=1):
    # Row 0 holds all x values of the contour, row 1 all y values.
    contourXs = contourArray[0, :]
    contourYs = contourArray[1, :]
    for pointIndex in range(len(dpArray[0, :])):
        # Vectorization: a single float minus a whole row works because the
        # scalar is broadcast against every entry of the row.
        dx = np.square(dpArray[0, pointIndex] - contourXs)
        dy = np.square(dpArray[1, pointIndex] - contourYs)
        # Mean Euclidean distance between this data point and this contour.
        # Here it is only printed; it could just as well be stored in an array.
        distance = np.sqrt(dx + dy).mean()
        print("Mean distance between contour {} and datapoint {}: {}".format(contourNumber, pointIndex + 1, distance))
# But you want to be able to call this... so here we go, generating a function out of it!
def getDistanceFromDatapointsToListOfContoursFindBetterName(datapoints, listOfContourDataFrames):
    """Return the mean distance between every data point and every contour.

    Each mean distance is also printed, with 1-based contour and data point
    numbers.

    Args:
        datapoints: DataFrame with one point per row; the first column holds
            the x values and the second the y values. Entries must be
            convertible to float (strings such as '0.5' are fine).
        listOfContourDataFrames: Iterable of DataFrames laid out like
            ``datapoints``, one DataFrame per contour.

    Returns:
        NumPy array of shape (number of data points, number of contours);
        entry [i, j] is the mean Euclidean distance between data point i and
        the points of contour j.
    """
    # Transpose so that row 0 holds all x values and row 1 all y values.
    # The builtin ``float`` replaces the ``np.float`` alias, which was removed
    # from NumPy and raises AttributeError on current releases.
    dpArray = datapoints.values.T.astype(float)
    listOfContours = [item.values.T.astype(float) for item in listOfContourDataFrames]
    noOfDatapoints = np.size(dpArray, 1)
    retVal = np.zeros((noOfDatapoints, len(listOfContours)))
    for contourid, contour in enumerate(listOfContours):
        for _index in range(noOfDatapoints):
            # The scalar coordinate is broadcast against the whole contour row.
            dx = np.square(dpArray[0, _index] - contour[0, :])
            dy = np.square(dpArray[1, _index] - contour[1, :])
            distance = np.mean(np.sqrt(dx + dy))
            print("Mean distance between contour {} and datapoint {}: {}".format(contourid+1, _index+1, distance))
            retVal[_index, contourid] = distance
    return retVal
# Sanity check: call the function once with the same data points and the same
# two contours, so that the mean distances it prints can be compared with the
# ones printed by the explicit loop. The returned array is discarded here.
getDistanceFromDatapointsToListOfContoursFindBetterName(datapoints, [contour1, contour2])