
With TSNE from sklearn and the mahalanobis metric, I am getting the following error:

from sklearn.manifold import TSNE

tsne = TSNE(verbose=1, perplexity=40, n_iter=250, learning_rate=50,
            random_state=0, metric='mahalanobis')
pt = data.sample(frac=0.1).values
tsne_results = tsne.fit_transform(pt)

ValueError: Must provide either V or VI for Mahalanobis distance

How can I provide the required metric parameters for the Mahalanobis distance?


1 Answer


Indeed, there is no option to define metric_params as in the other cases: other pairwise-distance-based classes (NearestNeighbors, for example) provide a metric_params parameter to pass additional arguments to the distance function. Their documentation describes it as:

metric_params : dict, optional (default = None)

    Additional keyword arguments for the metric function.

This answer here shows how to use this param.
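
For illustration only (not from the linked answer), this is roughly how metric_params is used in a class that already supports it, here NearestNeighbors with the Mahalanobis metric; the toy data is made up:

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).randn(100, 5)     # made-up toy data: 100 samples, 5 features
V = np.cov(X, rowvar=False)                    # feature covariance matrix for Mahalanobis

nn = NearestNeighbors(n_neighbors=5, metric='mahalanobis',
                      metric_params={'V': V})  # extra metric arguments are passed as a dict
nn.fit(X)
distances, indices = nn.kneighbors(X)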

But TSNE has no way to pass these additional parameters. So, for now, you need to extend the class, overriding __init__() to accept the parameters and _fit() to actually use them.

We can do this:

from time import time
import numpy as np
import scipy.sparse as sp
from sklearn.manifold import TSNE
from sklearn.externals.six import string_types
from sklearn.utils import check_array, check_random_state
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.manifold.t_sne import _joint_probabilities, _joint_probabilities_nn
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

class MyTSNE(TSNE):
    def __init__(self, n_components=2, perplexity=30.0,
                 early_exaggeration=12.0, learning_rate=200.0, n_iter=1000,
                 n_iter_without_progress=300, min_grad_norm=1e-7,
                 metric="euclidean", metric_params=None, #<=ADDED
                 init="random", verbose=0,
                 random_state=None, method='barnes_hut', angle=0.5):
        self.n_components = n_components
        self.perplexity = perplexity
        self.early_exaggeration = early_exaggeration
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.n_iter_without_progress = n_iter_without_progress
        self.min_grad_norm = min_grad_norm
        self.metric = metric
        self.metric_params = metric_params  #<=ADDED
        self.init = init
        self.verbose = verbose
        self.random_state = random_state
        self.method = method
        self.angle = angle

    def _fit(self, X, skip_num_points=0):
        if self.method not in ['barnes_hut', 'exact']:
            raise ValueError("'method' must be 'barnes_hut' or 'exact'")
        if self.angle < 0.0 or self.angle > 1.0:
            raise ValueError("'angle' must be between 0.0 - 1.0")
        if self.metric == "precomputed":
            if isinstance(self.init, string_types) and self.init == 'pca':
                raise ValueError("The parameter init=\"pca\" cannot be "
                                 "used with metric=\"precomputed\".")
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square distance matrix")
            if np.any(X < 0):
                raise ValueError("All distances should be positive, the "
                                 "precomputed distances given as X is not "
                                 "correct")
        if self.method == 'barnes_hut' and sp.issparse(X):
            raise TypeError('A sparse matrix was passed, but dense '
                            'data is required for method="barnes_hut". Use '
                            'X.toarray() to convert to a dense numpy array if '
                            'the array is small enough for it to fit in '
                            'memory. Otherwise consider dimensionality '
                            'reduction techniques (e.g. TruncatedSVD)')
        else:
            X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                            dtype=[np.float32, np.float64])
        if self.method == 'barnes_hut' and self.n_components > 3:
            raise ValueError("'n_components' should be inferior to 4 for the "
                             "barnes_hut algorithm as it relies on "
                             "quad-tree or oct-tree.")
        random_state = check_random_state(self.random_state)

        if self.early_exaggeration < 1.0:
            raise ValueError("early_exaggeration must be at least 1, but is {}"
                             .format(self.early_exaggeration))

        if self.n_iter < 250:
            raise ValueError("n_iter should be at least 250")

        n_samples = X.shape[0]

        neighbors_nn = None
        if self.method == "exact":
            if self.metric == "precomputed":
                distances = X
            else:
                if self.verbose:
                    print("[t-SNE] Computing pairwise distances...")

                if self.metric == "euclidean":
                    distances = pairwise_distances(X, metric=self.metric,
                                                   squared=True,
                                                   **self.metric_params) #<=ADDED
                else:
                    distances = pairwise_distances(X, metric=self.metric,
                                                   **self.metric_params) #<=ADDED

                if np.any(distances < 0):
                    raise ValueError("All distances should be positive, the "
                                     "metric given is not correct")

            P = _joint_probabilities(distances, self.perplexity, self.verbose)
            assert np.all(np.isfinite(P)), "All probabilities should be finite"
            assert np.all(P >= 0), "All probabilities should be non-negative"
            assert np.all(P <= 1), ("All probabilities should be less "
                                    "than or equal to one")

        else:
            k = min(n_samples - 1, int(3. * self.perplexity + 1))

            if self.verbose:
                print("[t-SNE] Computing {} nearest neighbors...".format(k))

            knn = NearestNeighbors(algorithm='auto', n_neighbors=k,
                                   metric=self.metric,
                                   metric_params=self.metric_params) #<=ADDED
            t0 = time()
            knn.fit(X)
            duration = time() - t0
            if self.verbose:
                print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
                    n_samples, duration))

            t0 = time()
            distances_nn, neighbors_nn = knn.kneighbors(
                None, n_neighbors=k)
            duration = time() - t0
            if self.verbose:
                print("[t-SNE] Computed neighbors for {} samples in {:.3f}s..."
                      .format(n_samples, duration))

            del knn

            if self.metric == "euclidean":
                distances_nn **= 2

            P = _joint_probabilities_nn(distances_nn, neighbors_nn,
                                        self.perplexity, self.verbose)

        if isinstance(self.init, np.ndarray):
            X_embedded = self.init
        elif self.init == 'pca':
            pca = PCA(n_components=self.n_components, svd_solver='randomized',
                      random_state=random_state)
            X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
        elif self.init == 'random':
            X_embedded = 1e-4 * random_state.randn(
                n_samples, self.n_components).astype(np.float32)
        else:
            raise ValueError("'init' must be 'pca', 'random', or "
                             "a numpy array")

        degrees_of_freedom = max(self.n_components - 1.0, 1)

        return self._tsne(P, degrees_of_freedom, n_samples,
                          X_embedded=X_embedded,
                          neighbors=neighbors_nn,
                          skip_num_points=skip_num_points)

I have marked the changes with #<=ADDED. Now use this class instead of TSNE, like this:

pt = data.sample(frac=0.1).values
tsne = MyTSNE(verbose=1, perplexity=40, n_iter=250, learning_rate=50, random_state=0,
              metric='mahalanobis',
              metric_params={'V': np.cov(pt, rowvar=False)})  # V = feature covariance of the sampled data
tsne_results = tsne.fit_transform(pt)
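
The error message says either V or VI is acceptable, and depending on your scikit-learn/scipy versions and the chosen method, the distance backend may want the inverse covariance VI instead of V. A hedged variant of the same call (same assumed data variable):

import numpy as np

pt = data.sample(frac=0.1).values
VI = np.linalg.inv(np.cov(pt, rowvar=False))   # inverse of the feature covariance

tsne = MyTSNE(verbose=1, perplexity=40, n_iter=250, learning_rate=50, random_state=0,
              metric='mahalanobis', metric_params={'VI': VI})
tsne_results = tsne.fit_transform(pt)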

Note: the other classes I mentioned above validate the contents of metric_params, but I have not done that here, so make sure you pass the correct parameters, otherwise you will get errors.
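
If you want to fail fast, a minimal sketch of the kind of check you could add yourself (hypothetical, not part of the code above), for example at the start of MyTSNE._fit():

# Hypothetical guard, not in the original answer: reject a malformed
# metric_params before the heavy computation starts.
if self.metric_params is not None and not isinstance(self.metric_params, dict):
    raise TypeError("metric_params must be a dict of keyword arguments for "
                    "the metric function, e.g. {'V': covariance_matrix}")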

You should also report this on the scikit-learn issues page on GitHub.

  • Bravo!!! Just one minor comment: change the line to `metric=self.metric, metric_params=self.metric_params) #<=ADDED` – Arman Aug 10 '18 at 09:18
  • @Arman Aah, yes. I copy-pasted that from above usage of `pairwise_distances` where the params are keyword arguments. I forgot the `NearestNeighbors` require dict, not `kwargs`. Thanks. Changed – Vivek Kumar Aug 10 '18 at 09:22
  • @Arman Yes, I saw and commented some info about the root cause of issue. – Vivek Kumar Aug 10 '18 at 10:36