def buildTree(data, *, vector_length=300, n_trees=10, n_neighbors=10,
              index_path='test.ann'):
    """Index the vectors in ``data`` with Annoy and record each row's neighbours.

    Every row's ``"Vektoren"`` entry is added to an angular-distance Annoy
    index under the row's *position*. The index is built, saved to
    ``index_path``, loaded back from that file and then queried once per row.
    Three columns are added to (or overwritten in) ``data``:

    * ``"Index"``    -- the Annoy item ids (row positions) of the neighbours,
    * ``"Nachbarn"`` -- the ``"SAG-Nummer"`` values of those neighbours,
    * ``"Distance"`` -- the distances Annoy reports for those neighbours.

    Args:
        data: pandas DataFrame with a ``"Vektoren"`` column holding one
            vector per row and a ``"SAG-Nummer"`` column. Modified in place.
        vector_length: Length of each item vector that will be indexed.
        n_trees: Number of trees passed to ``AnnoyIndex.build``.
        n_neighbors: Number of neighbours requested per row.
        index_path: File the built index is written to and reloaded from.

    Returns:
        The same ``data`` object, with the three result columns filled in.
    """
    vec_col = data.columns.get_loc("Vektoren")
    sag_col = data.columns.get_loc("SAG-Nummer")
    n_rows = len(data.index)

    # Item ids are row positions, so they can be fed straight back to iloc/iat.
    t = AnnoyIndex(vector_length, 'angular')
    for pos in range(n_rows):
        t.add_item(pos, data.iloc[pos, vec_col])
    t.build(n_trees)
    t.save(index_path)

    # Reload from disk (memory-mapped); kept so the queries run against the
    # persisted index exactly as before.
    u = AnnoyIndex(vector_length, 'angular')
    u.load(index_path)  # super fast, will just mmap the file

    # Object-typed placeholder columns so each cell can hold a list.
    for column in ('Nachbarn', 'Index', 'Distance'):
        data[column] = ''
    nachbarn_col = data.columns.get_loc('Nachbarn')
    index_col = data.columns.get_loc('Index')
    distance_col = data.columns.get_loc('Distance')

    for pos in range(n_rows):
        # One query yields both the neighbour ids and their distances.
        neighbour_ids, distances = u.get_nns_by_item(
            pos, n_neighbors, include_distances=True
        )
        # Write by position: the Annoy ids are row positions, which match the
        # index labels only when ``data`` has a default 0..n-1 index.
        data.iat[pos, index_col] = neighbour_ids
        data.iat[pos, nachbarn_col] = [
            data.iloc[neighbour, sag_col] for neighbour in neighbour_ids
        ]
        data.iat[pos, distance_col] = distances
    return data
The returned data includes, for each element of the dataset, a list of its nearest neighbours and their distances to that element.
I don't understand how to plot this data in a 2D scatter plot.