I'm doing some behavior analysis where I track behaviors over time and then create n-grams of those behaviors.
# Three example 5-grams of observed behaviors.
sample_n_gram_list = [
    ['scratch', 'scratch', 'scratch', 'scratch', 'scratch'],
    ['scratch', 'scratch', 'scratch', 'scratch', 'smell/sniff'],
    ['scratch', 'scratch', 'scratch', 'sit', 'stand'],
]
I want to be able to cluster these n-grams, but I need to create a pre-computed distance matrix using a custom metric. My metric appears to work fine, but when I try to create the distance matrix using the sklearn function, I get an error:
ValueError: could not convert string to float: 'scratch'
I've looked at the documentation (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html), but it's not particularly clear on this topic.
Is anyone familiar with how to use this properly?
The full code is below:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.mlab as mlab
import math
import hashlib
import networkx as nx
import itertools
import hdbscan
from sklearn.metrics.pairwise import pairwise_distances
def get_levenshtein_distance(path1, path2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``path1`` into ``path2``.
    Elements are compared with ``==``, so the sequences may be strings,
    lists of strings, tuples, etc.

    https://en.wikipedia.org/wiki/Levenshtein_distance

    :param path1: first sequence (must support ``len`` and iteration)
    :param path2: second sequence (must support ``len`` and iteration)
    :return: the edit distance as a non-negative ``int``
    """
    # Each row of the DP table depends only on the row above it, so we keep
    # just two rows instead of the full (len(path1)+1) x (len(path2)+1) matrix.
    # Row 0: turning an empty prefix of path1 into path2[:col] costs `col`
    # insertions.
    previous_row = list(range(len(path2) + 1))

    for row, item1 in enumerate(path1, start=1):
        # Column 0: turning path1[:row] into an empty prefix costs `row`
        # deletions.
        current_row = [row]
        for col, item2 in enumerate(path2, start=1):
            substitution_cost = 0 if item1 == item2 else 1
            current_row.append(
                min(
                    previous_row[col] + 1,  # deletion
                    previous_row[col - 1] + substitution_cost,  # match/substitute
                    current_row[col - 1] + 1,  # insertion
                )
            )
        previous_row = current_row

    return previous_row[-1]
sample_n_gram_list = [
    ['scratch', 'scratch', 'scratch', 'scratch', 'scratch'],
    ['scratch', 'scratch', 'scratch', 'scratch', 'smell/sniff'],
    ['scratch', 'scratch', 'scratch', 'sit', 'stand'],
]

# Sanity checks for the custom metric on known pairs.
print("should be 0")
print(get_levenshtein_distance(sample_n_gram_list[1], sample_n_gram_list[1]))
print("should be 1")
print(get_levenshtein_distance(sample_n_gram_list[1], sample_n_gram_list[0]))
print("should be 2")
print(get_levenshtein_distance(sample_n_gram_list[0], sample_n_gram_list[2]))
clust_number = 2


def _n_gram_index_distance(row_a, row_b):
    """Levenshtein distance between the n-grams referenced by two index rows.

    Each argument is a one-element numeric row holding a position in
    ``sample_n_gram_list``; the value arrives as a float, hence ``int(...)``.
    """
    return get_levenshtein_distance(
        sample_n_gram_list[int(row_a[0])],
        sample_n_gram_list[int(row_b[0])],
    )


# pairwise_distances coerces its input to a float array, so passing the lists
# of strings directly raises "could not convert string to float". Pass a
# column of integer indices instead and let the metric look up the n-grams.
n_gram_indices = np.arange(len(sample_n_gram_list)).reshape(-1, 1)
distance_matrix = pairwise_distances(n_gram_indices, metric=_n_gram_index_distance)

# NOTE(review): with only three samples, HDBSCAN's default parameters may
# label every point as noise (-1) — confirm on the real data set.
clusterer = hdbscan.HDBSCAN(metric='precomputed')
clusterer.fit(distance_matrix)
clusterer.labels_