@jeff's solution is out of date.
As of scikit-learn 1.1.2, you don't need to convert the input to a scipy sparse matrix
before calling `cosine_similarity`.
All you need is `cosine_similarity` itself.
from typing import Tuple
import numpy as np
import perfplot
import scipy
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn_internal
from scipy import spatial
from scipy import sparse
import sklearn.preprocessing as pp
# dtype name used by this script: the benchmark input is cast to it in
# setup_n, and both results are cast to it before comparison in equality_check.
target_dtype = "float16"
class prettyfloat(float):
    """A ``float`` subclass whose ``repr`` always shows two decimal places."""

    def __repr__(self):
        # Fixed-point formatting with exactly two digits after the point.
        text = "%.2f" % self
        return text
def cosine_similarity_sklearn(x):
    """Benchmark kernel: call scikit-learn's ``cosine_similarity`` on ``x`` alone.

    The input is forwarded unchanged (no sparse conversion), and whatever
    scikit-learn returns is handed back to the caller.
    """
    similarity = cosine_similarity_sklearn_internal(x)
    return similarity
def cosine_similarity_sklearn_sparse(x):
    """Benchmark kernel: convert ``x`` to CSR, then call scikit-learn.

    ``x`` is first wrapped in a ``scipy.sparse.csr_matrix`` and that sparse
    matrix is passed to scikit-learn's ``cosine_similarity``.
    """
    return cosine_similarity_sklearn_internal(sparse.csr_matrix(x))
def cosine_similarity_einsum(x, y=None):
    """Return pairwise cosine similarities between the rows of ``x`` and ``y``.

    Args:
        x: 2-D array of shape ``(n, d)``; each row is one vector.
        y: Optional 2-D array of shape ``(m, d)``. When omitted (``None``),
            the rows of ``x`` are compared with themselves.

    Returns:
        Array of shape ``(n, m)`` (or ``(n, n)`` when ``y`` is omitted) whose
        ``[i, j]`` entry is the cosine similarity of ``x[i]`` and ``y[j]``.

    Note:
        Rows with zero norm are divided by zero and therefore produce
        ``nan``/``inf`` entries; no special handling is applied.
    """
    # Scale every row to unit length; the dot product of unit vectors is
    # their cosine similarity.
    normed_x = x / np.linalg.norm(x, axis=1)[:, None]
    # Compare against None explicitly: the truth value of a multi-element
    # ndarray is ambiguous and ``if y`` would raise ValueError.
    if y is None:
        normed_y = normed_x
    else:
        normed_y = y / np.linalg.norm(y, axis=1)[:, None]
    # "ik,jk->ij" is normed_x @ normed_y.T without materialising the transpose.
    return np.einsum("ik,jk->ij", normed_x, normed_y)
def cosine_similarity_scipy(x, y=None):
    """Return cosine similarity computed with ``scipy.spatial.distance``.

    Args:
        x: Either a single 1-D vector or a 2-D array whose rows are vectors.
        y: Optional second vector / array of row vectors. When omitted
            (``None``), ``x`` is compared with itself.

    Returns:
        A scalar when both inputs are 1-D vectors; otherwise a 2-D array whose
        ``[i, j]`` entry is the cosine similarity of row ``i`` of ``x`` and
        row ``j`` of ``y``.
    """
    x_arr = np.asarray(x)
    other = x_arr if y is None else np.asarray(y)
    if x_arr.ndim == 1 and other.ndim == 1:
        # ``distance.cosine`` is defined for a single pair of 1-D vectors only.
        return 1 - spatial.distance.cosine(x_arr, other)
    # For matrices, ``cdist`` computes every row-to-row cosine distance;
    # similarity is one minus that distance.
    distances = spatial.distance.cdist(
        np.atleast_2d(x_arr), np.atleast_2d(other), metric="cosine"
    )
    return 1 - distances
def setup_n(n) -> np.ndarray:
    """Build the random benchmark input for size parameter ``n``.

    Args:
        n: Exponent of the row count; the array has ``2 ** n`` rows.

    Returns:
        A single array of shape ``(2 ** n, 512)`` drawn from
        ``np.random.randn`` and cast to ``target_dtype``.
    """
    # The previous annotation promised a tuple of two arrays, but only one
    # array has ever been returned; the annotation now matches the behaviour.
    n_rows = int(2 ** n)
    return np.random.randn(n_rows, 512).astype(target_dtype)
def equality_check(a, b):
    """Return True when two kernel outputs agree within ``atol=1e-3``.

    Args:
        a: A NumPy array, or an object exposing ``todense()`` (such as a
            scipy sparse matrix).
        b: Same kinds of object as ``a``.

    Returns:
        True if every element of ``a`` and ``b`` is close after both are cast
        to ``target_dtype``; False otherwise.
    """
    # isinstance (rather than an exact type comparison) also accepts ndarray
    # subclasses, which have no ``todense`` method to fall back on.
    if not isinstance(a, np.ndarray):
        a = np.asarray(a.todense())
    if not isinstance(b, np.ndarray):
        b = np.asarray(b.todense())
    # Compare in the benchmark dtype so precision differences between
    # kernels below that dtype's resolution are not counted as mismatches.
    return np.isclose(a.astype(target_dtype), b.astype(target_dtype), atol=1e-3).all()
# Run the benchmark: each kernel is timed on the array produced by setup_n
# for n = 1..9, and kernel outputs are compared with equality_check.
fig = perfplot.show(
    setup=setup_n,
    kernels=[
        cosine_similarity_sklearn,
        cosine_similarity_sklearn_sparse,
        cosine_similarity_einsum,
        # cosine_similarity_scipy,
    ],
    labels=["sk-def", "sk+sparse", "einsum"],
    n_range=list(range(1, 10)),
    xlabel="2^n",
    logx=False,
    logy=False,
    equality_check=equality_check,
)
Using perfplot, the benchmark shows that plain
`from sklearn.metrics.pairwise import cosine_similarity` is the best
in scikit-learn==1.1.2 and 1.1.3.
The results can differ between float64 and float16.
For float64,

For float16,
