Merge pull request #924 from jlmelville/tswspectral

lmcinnes · web-flow · commit 27a89123bf10 · 2023-03-29T13:57:35.000Z
new spectral initialization option: a truncated SVD-warmed lobpcg
diff --git a/umap/spectral.py b/umap/spectral.py
@@ -1,9 +1,12 @@
+import warnings
+
 from warnings import warn
 
 import numpy as np
 
 import scipy.sparse
 import scipy.sparse.csgraph
+import sklearn.decomposition
 
 from sklearn.manifold import SpectralEmbedding
 from sklearn.metrics import pairwise_distances
@@ -130,7 +133,7 @@ def component_layout(
                     component_centroids, metric=metric, **metric_kwds
                 )
 
-    affinity_matrix = np.exp(-(distance_matrix ** 2))
+    affinity_matrix = np.exp(-(distance_matrix**2))
 
     component_embedding = SpectralEmbedding(
         n_components=dim, affinity="precomputed", random_state=random_state
@@ -352,3 +355,106 @@ def spectral_layout(data, graph, dim, random_state, metric="euclidean", metric_k
             "Falling back to random initialisation!"
         )
         return random_state.uniform(low=-10.0, high=10.0, size=(graph.shape[0], dim))
+
+
+def tswspectral_layout(
+    data, graph, dim, random_state, metric="euclidean", metric_kwds={}
+):
+    """Given a graph compute the spectral embedding of the graph. This is
+    simply the eigenvectors of the laplacian of the graph. Here we use the
+    normalized laplacian and a truncated SVD-based guess of the
+    eigenvectors to "warm" up the lobpcg eigensolver. This function should
+    give results of similar accuracy to the spectral_layout function, but
+    may converge more quickly for graph Laplacians that cause
+    spectral_layout to take an excessive amount of time to complete.
+
+    Parameters
+    ----------
+    data: array of shape (n_samples, n_features)
+        The source data
+
+    graph: sparse matrix
+        The (weighted) adjacency matrix of the graph as a sparse matrix.
+
+    dim: int
+        The dimension of the space into which to embed.
+
+    random_state: numpy RandomState or equivalent
+        A state capable of being used as a numpy random state.
+
+    metric: string or callable (optional, default 'euclidean')
+        The metric used to measure distances among the source data points.
+        Used only if multiple connected components are found in the
+        graph.
+
+    metric_kwds: dict (optional, default {})
+        Keyword arguments to be passed to the metric function.
+        If metric is 'precomputed', 'linkage' keyword can be used to specify
+        'average', 'complete', or 'single' linkage. Default is 'average'.
+        Used only if multiple connected components are found in the
+        graph.
+
+    Returns
+    -------
+    embedding: array of shape (n_vertices, dim)
+        The spectral embedding of the graph.
+    """
+    n_samples = graph.shape[0]
+    n_components, labels = scipy.sparse.csgraph.connected_components(graph)
+
+    if n_components > 1:
+        return multi_component_layout(
+            data,
+            graph,
+            n_components,
+            labels,
+            dim,
+            random_state,
+            metric=metric,
+            metric_kwds=metric_kwds,
+        )
+
+    diag_data = np.asarray(graph.sum(axis=0))
+    D = scipy.sparse.spdiags(1.0 / np.sqrt(diag_data), 0, n_samples, n_samples)
+    # L is a shifted version of what we will pass to the eigensolver (I - L)
+    # The eigenvectors of I - L coincide with the first few singular vectors
+    # of L so we can carry out truncated SVD on L to get a guess to pass to lobpcg
+    L = D * graph * D
+
+    k = dim + 1
+    tsvd = sklearn.decomposition.TruncatedSVD(
+        n_components=k, random_state=random_state, algorithm="arpack", tol=1e-2
+    )
+    guess = tsvd.fit_transform(L)
+
+    # for a normalized Laplacian, the first eigenvector is always sqrt(D) so replace
+    # the tsvd guess with the exact value. Scaling it to length one seems to help.
+    guess[:, 0] = np.sqrt(diag_data[0] / np.linalg.norm(diag_data[0]))
+
+    I = scipy.sparse.identity(n_samples, dtype=np.float64)
+
+    # lobpcg emits a UserWarning if convergence was not reached within `maxiter`
+    # so we will just have to catch that instead of an Error
+    # This will also trigger when lobpcg decides the problem size is too small
+    # for it to deal with but there is little chance that this would happen
+    # in most real use cases
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        try:
+            eigenvalues, eigenvectors = scipy.sparse.linalg.lobpcg(
+                I - L,
+                guess,
+                largest=False,
+                tol=1e-4,
+                maxiter=graph.shape[0] * 5,
+            )
+        except UserWarning:
+            warn(
+                "WARNING: spectral initialisation failed! The eigenvector solver\n"
+                "failed. This is likely due to too small an eigengap. Consider\n"
+                "adding some noise or jitter to your data.\n\n"
+                "Falling back to random initialisation!"
+            )
+            return random_state.uniform(low=-10.0, high=10.0, size=(n_samples, dim))
+    order = np.argsort(eigenvalues)[1:k]
+    return eigenvectors[:, order]
diff --git a/umap/tests/test_spectral.py b/umap/tests/test_spectral.py
@@ -0,0 +1,22 @@
+from umap.spectral import spectral_layout, tswspectral_layout
+
+import numpy as np
+
+
+def test_tsw_spectral_init(iris):
+    # create an arbitrary (dense) random affinity matrix
+    seed = 42
+    rng = np.random.default_rng(seed=seed)
+    # matrix must be of sufficient size or lobpcg will refuse to work on it
+    n = 20
+    graph = rng.standard_normal(n * n).reshape((n, n)) ** 2
+    graph = graph.T * graph
+
+    spec = spectral_layout(None, graph, 2, random_state=seed)
+    tsw_spec = tswspectral_layout(None, graph, 2, random_state=seed)
+
+    # make sure the two methods produce matrices that are close in values
+    rmsd = np.sqrt(np.mean(np.sum((np.abs(spec) - np.abs(tsw_spec)) ** 2, axis=1)))
+    assert (
+        rmsd < 1e-6
+    ), "tsvd-warmed spectral init insufficiently close to standard spectral init"
diff --git a/umap/umap_.py b/umap/umap_.py
@@ -38,7 +38,7 @@
     csr_unique,
     fast_knn_indices,
 )
-from umap.spectral import spectral_layout
+from umap.spectral import spectral_layout, tswspectral_layout
 from umap.layouts import (
     optimize_layout_euclidean,
     optimize_layout_generic,
@@ -1115,6 +1115,18 @@ def simplicial_set_embedding(
         embedding = noisy_scale_coords(
             embedding, random_state, max_coord=10, noise=0.0001
         )
+    elif isinstance(init, str) and init == "tswspectral":
+        embedding = tswspectral_layout(
+            data,
+            graph,
+            n_components,
+            random_state,
+            metric=metric,
+            metric_kwds=metric_kwds,
+        )
+        embedding = noisy_scale_coords(
+            embedding, random_state, max_coord=10, noise=0.0001
+        )
     else:
         init_data = np.array(init)
         if len(init_data.shape) == 2:
@@ -1459,7 +1471,13 @@ class UMAP(BaseEstimator):
 
             * 'spectral': use a spectral embedding of the fuzzy 1-skeleton
             * 'random': assign initial embedding positions at random.
-            * 'pca': use the first n_components from PCA applied to the input data.
+            * 'pca': use the first n_components from PCA applied to the
+            input data.
+            * 'tswspectral': use a spectral embedding of the fuzzy
+            1-skeleton, using a truncated singular value decomposition to
+            "warm" up the eigensolver. This is intended as an alternative
+            to the 'spectral' method, if that takes an excessively long
+            time to complete initialization (or fails to complete).
             * A numpy array of initial embedding positions.
 
     min_dist: float (optional, default 0.1)
@@ -1738,8 +1756,12 @@ def _validate_parameters(self):
             "pca",
             "spectral",
             "random",
+            "tswspectral",
         ):
-            raise ValueError('string init values must be "pca", "spectral" or "random"')
+            raise ValueError(
+                'string init values must be one of: "pca", "tswspectral",'
+                ' "spectral" or "random"'
+            )
         if (
             isinstance(self.init, np.ndarray)
             and self.init.shape[1] != self.n_components
@@ -1769,18 +1791,26 @@ def _validate_parameters(self):
         if self.n_components < 1:
             raise ValueError("n_components must be greater than 0")
         self.n_epochs_list = None
-        if isinstance(self.n_epochs, list) or isinstance(self.n_epochs, tuple) or \
-                isinstance(self.n_epochs, np.ndarray):
-            if not issubclass(np.array(self.n_epochs).dtype.type, np.integer) or \
-                    not np.all(np.array(self.n_epochs) >= 0):
-                raise ValueError("n_epochs must be a nonnegative integer "
-                                 "or a list of nonnegative integers")
+        if (
+            isinstance(self.n_epochs, list)
+            or isinstance(self.n_epochs, tuple)
+            or isinstance(self.n_epochs, np.ndarray)
+        ):
+            if not issubclass(
+                np.array(self.n_epochs).dtype.type, np.integer
+            ) or not np.all(np.array(self.n_epochs) >= 0):
+                raise ValueError(
+                    "n_epochs must be a nonnegative integer "
+                    "or a list of nonnegative integers"
+                )
             self.n_epochs_list = list(self.n_epochs)
         elif self.n_epochs is not None and (
-                self.n_epochs < 0 or not isinstance(self.n_epochs, int)
+            self.n_epochs < 0 or not isinstance(self.n_epochs, int)
         ):
-            raise ValueError("n_epochs must be a nonnegative integer "
-                             "or a list of nonnegative integers")
+            raise ValueError(
+                "n_epochs must be a nonnegative integer "
+                "or a list of nonnegative integers"
+            )
         if self.metric_kwds is None:
             self._metric_kwds = {}
         else:
@@ -2742,7 +2772,9 @@ def fit(self, X, y=None, force_all_finite=True):
             print(ts(), "Construct embedding")
 
         if self.transform_mode == "embedding":
-            epochs = self.n_epochs_list if self.n_epochs_list is not None else self.n_epochs
+            epochs = (
+                self.n_epochs_list if self.n_epochs_list is not None else self.n_epochs
+            )
             self.embedding_, aux_data = self._fit_embed_data(
                 self._raw_data[index],
                 epochs,
@@ -2752,11 +2784,15 @@ def fit(self, X, y=None, force_all_finite=True):
 
             if self.n_epochs_list is not None:
                 if "embedding_list" not in aux_data:
-                    raise KeyError("No list of embedding were found in 'aux_data'. "
-                                   "It is likely the layout optimization function "
-                                   "doesn't support the list of int for 'n_epochs'.")
+                    raise KeyError(
+                        "No list of embedding were found in 'aux_data'. "
+                        "It is likely the layout optimization function "
+                        "doesn't support the list of int for 'n_epochs'."
+                    )
                 else:
-                    self.embedding_list_ = [e[inverse] for e in aux_data["embedding_list"]]
+                    self.embedding_list_ = [
+                        e[inverse] for e in aux_data["embedding_list"]
+                    ]
 
             # Assign any points that are fully disconnected from our manifold(s) to have embedding
             # coordinates of np.nan.  These will be filtered by our plotting functions automatically.