Skip to content

Commit 27a8912

Browse files
authored
Merge pull request #924 from jlmelville/tswspectral
new spectral initialization option: a truncated SVD-warmed lobpcg
2 parents e95f43c + fe32113 commit 27a8912

File tree

3 files changed

+182
-18
lines changed

3 files changed

+182
-18
lines changed

umap/spectral.py

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
import warnings
2+
13
from warnings import warn
24

35
import numpy as np
46

57
import scipy.sparse
68
import scipy.sparse.csgraph
9+
import sklearn.decomposition
710

811
from sklearn.manifold import SpectralEmbedding
912
from sklearn.metrics import pairwise_distances
@@ -130,7 +133,7 @@ def component_layout(
130133
component_centroids, metric=metric, **metric_kwds
131134
)
132135

133-
affinity_matrix = np.exp(-(distance_matrix ** 2))
136+
affinity_matrix = np.exp(-(distance_matrix**2))
134137

135138
component_embedding = SpectralEmbedding(
136139
n_components=dim, affinity="precomputed", random_state=random_state
@@ -352,3 +355,106 @@ def spectral_layout(data, graph, dim, random_state, metric="euclidean", metric_k
352355
"Falling back to random initialisation!"
353356
)
354357
return random_state.uniform(low=-10.0, high=10.0, size=(graph.shape[0], dim))
358+
359+
360+
def tswspectral_layout(
361+
data, graph, dim, random_state, metric="euclidean", metric_kwds={}
362+
):
363+
"""Given a graph compute the spectral embedding of the graph. This is
364+
simply the eigenvectors of the laplacian of the graph. Here we use the
365+
normalized laplacian and a truncated SVD-based guess of the
366+
eigenvectors to "warm" up the lobpcg eigensolver. This function should
367+
give results of similar accuracy to the spectral_layout function, but
368+
may converge more quickly for graph Laplacians that cause
369+
spectral_layout to take an excessive amount of time to complete.
370+
371+
Parameters
372+
----------
373+
data: array of shape (n_samples, n_features)
374+
The source data
375+
376+
graph: sparse matrix
377+
The (weighted) adjacency matrix of the graph as a sparse matrix.
378+
379+
dim: int
380+
The dimension of the space into which to embed.
381+
382+
random_state: numpy RandomState or equivalent
383+
A state capable being used as a numpy random state.
384+
385+
metric: string or callable (optional, default 'euclidean')
386+
The metric used to measure distances among the source data points.
387+
Used only if the multiple connected components are found in the
388+
graph.
389+
390+
metric_kwds: dict (optional, default {})
391+
Keyword arguments to be passed to the metric function.
392+
If metric is 'precomputed', 'linkage' keyword can be used to specify
393+
'average', 'complete', or 'single' linkage. Default is 'average'.
394+
Used only if the multiple connected components are found in the
395+
graph.
396+
397+
Returns
398+
-------
399+
embedding: array of shape (n_vertices, dim)
400+
The spectral embedding of the graph.
401+
"""
402+
n_samples = graph.shape[0]
403+
n_components, labels = scipy.sparse.csgraph.connected_components(graph)
404+
405+
if n_components > 1:
406+
return multi_component_layout(
407+
data,
408+
graph,
409+
n_components,
410+
labels,
411+
dim,
412+
random_state,
413+
metric=metric,
414+
metric_kwds=metric_kwds,
415+
)
416+
417+
diag_data = np.asarray(graph.sum(axis=0))
418+
D = scipy.sparse.spdiags(1.0 / np.sqrt(diag_data), 0, n_samples, n_samples)
419+
# L is a shifted version of what we will pass to the eigensolver (I - L)
420+
# The eigenvectors of I - L coincide with the first few singular vectors
421+
# of L so we can carry out truncated SVD on L to get a guess to pass to lobpcg
422+
L = D * graph * D
423+
424+
k = dim + 1
425+
tsvd = sklearn.decomposition.TruncatedSVD(
426+
n_components=k, random_state=random_state, algorithm="arpack", tol=1e-2
427+
)
428+
guess = tsvd.fit_transform(L)
429+
430+
# for a normalized Laplacian, the first eigenvector is always sqrt(D) so replace
431+
# the tsvd guess with the exact value. Scaling it to length one seems to help.
432+
guess[:, 0] = np.sqrt(diag_data[0] / np.linalg.norm(diag_data[0]))
433+
434+
I = scipy.sparse.identity(n_samples, dtype=np.float64)
435+
436+
# lobpcg emits a UserWarning if convergence was not reached within `maxiter`
437+
# so we will just have to catch that instead of an Error
438+
# This will also trigger when lobpcg decides the problem size is too small
439+
# for it to deal with but there is little chance that this would happen
440+
# in most real use cases
441+
with warnings.catch_warnings():
442+
warnings.simplefilter("error")
443+
try:
444+
eigenvalues, eigenvectors = scipy.sparse.linalg.lobpcg(
445+
I - L,
446+
guess,
447+
largest=False,
448+
tol=1e-4,
449+
maxiter=graph.shape[0] * 5,
450+
)
451+
except UserWarning:
452+
warn(
453+
"WARNING: spectral initialisation failed! The eigenvector solver\n"
454+
"failed. This is likely due to too small an eigengap. Consider\n"
455+
"adding some noise or jitter to your data.\n\n"
456+
"Falling back to random initialisation!"
457+
)
458+
return random_state.uniform(low=-10.0, high=10.0, size=(n_samples, dim))
459+
order = np.argsort(eigenvalues)[1:k]
460+
return eigenvectors[:, order]

umap/tests/test_spectral.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from umap.spectral import spectral_layout, tswspectral_layout
2+
3+
import numpy as np
4+
5+
6+
def test_tsw_spectral_init(iris):
7+
# create an arbitrary (dense) random affinity matrix
8+
seed = 42
9+
rng = np.random.default_rng(seed=seed)
10+
# matrix must be of sufficient size of lobpcg will refuse to work on it
11+
n = 20
12+
graph = rng.standard_normal(n * n).reshape((n, n)) ** 2
13+
graph = graph.T * graph
14+
15+
spec = spectral_layout(None, graph, 2, random_state=seed)
16+
tsw_spec = tswspectral_layout(None, graph, 2, random_state=seed)
17+
18+
# make sure the two methods produce matrices that are close in values
19+
rmsd = np.sqrt(np.mean(np.sum((np.abs(spec) - np.abs(tsw_spec)) ** 2, axis=1)))
20+
assert (
21+
rmsd < 1e-6
22+
), "tsvd-warmed spectral init insufficiently close to standard spectral init"

umap/umap_.py

Lines changed: 53 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
csr_unique,
3939
fast_knn_indices,
4040
)
41-
from umap.spectral import spectral_layout
41+
from umap.spectral import spectral_layout, tswspectral_layout
4242
from umap.layouts import (
4343
optimize_layout_euclidean,
4444
optimize_layout_generic,
@@ -1115,6 +1115,18 @@ def simplicial_set_embedding(
11151115
embedding = noisy_scale_coords(
11161116
embedding, random_state, max_coord=10, noise=0.0001
11171117
)
1118+
elif isinstance(init, str) and init == "tswspectral":
1119+
embedding = tswspectral_layout(
1120+
data,
1121+
graph,
1122+
n_components,
1123+
random_state,
1124+
metric=metric,
1125+
metric_kwds=metric_kwds,
1126+
)
1127+
embedding = noisy_scale_coords(
1128+
embedding, random_state, max_coord=10, noise=0.0001
1129+
)
11181130
else:
11191131
init_data = np.array(init)
11201132
if len(init_data.shape) == 2:
@@ -1459,7 +1471,13 @@ class UMAP(BaseEstimator):
14591471
14601472
* 'spectral': use a spectral embedding of the fuzzy 1-skeleton
14611473
* 'random': assign initial embedding positions at random.
1462-
* 'pca': use the first n_components from PCA applied to the input data.
1474+
* 'pca': use the first n_components from PCA applied to the
1475+
input data.
1476+
* 'tswspectral': use a spectral embedding of the fuzzy
1477+
1-skeleton, using a truncated singular value decomposition to
1478+
"warm" up the eigensolver. This is intended as an alternative
1479+
to the 'spectral' method, if that takes an excessively long
1480+
time to complete initialization (or fails to complete).
14631481
* A numpy array of initial embedding positions.
14641482
14651483
min_dist: float (optional, default 0.1)
@@ -1738,8 +1756,12 @@ def _validate_parameters(self):
17381756
"pca",
17391757
"spectral",
17401758
"random",
1759+
"tswspectral",
17411760
):
1742-
raise ValueError('string init values must be "pca", "spectral" or "random"')
1761+
raise ValueError(
1762+
'string init values must be one of: "pca", "tswspectral",'
1763+
' "spectral" or "random"'
1764+
)
17431765
if (
17441766
isinstance(self.init, np.ndarray)
17451767
and self.init.shape[1] != self.n_components
@@ -1769,18 +1791,26 @@ def _validate_parameters(self):
17691791
if self.n_components < 1:
17701792
raise ValueError("n_components must be greater than 0")
17711793
self.n_epochs_list = None
1772-
if isinstance(self.n_epochs, list) or isinstance(self.n_epochs, tuple) or \
1773-
isinstance(self.n_epochs, np.ndarray):
1774-
if not issubclass(np.array(self.n_epochs).dtype.type, np.integer) or \
1775-
not np.all(np.array(self.n_epochs) >= 0):
1776-
raise ValueError("n_epochs must be a nonnegative integer "
1777-
"or a list of nonnegative integers")
1794+
if (
1795+
isinstance(self.n_epochs, list)
1796+
or isinstance(self.n_epochs, tuple)
1797+
or isinstance(self.n_epochs, np.ndarray)
1798+
):
1799+
if not issubclass(
1800+
np.array(self.n_epochs).dtype.type, np.integer
1801+
) or not np.all(np.array(self.n_epochs) >= 0):
1802+
raise ValueError(
1803+
"n_epochs must be a nonnegative integer "
1804+
"or a list of nonnegative integers"
1805+
)
17781806
self.n_epochs_list = list(self.n_epochs)
17791807
elif self.n_epochs is not None and (
1780-
self.n_epochs < 0 or not isinstance(self.n_epochs, int)
1808+
self.n_epochs < 0 or not isinstance(self.n_epochs, int)
17811809
):
1782-
raise ValueError("n_epochs must be a nonnegative integer "
1783-
"or a list of nonnegative integers")
1810+
raise ValueError(
1811+
"n_epochs must be a nonnegative integer "
1812+
"or a list of nonnegative integers"
1813+
)
17841814
if self.metric_kwds is None:
17851815
self._metric_kwds = {}
17861816
else:
@@ -2742,7 +2772,9 @@ def fit(self, X, y=None, force_all_finite=True):
27422772
print(ts(), "Construct embedding")
27432773

27442774
if self.transform_mode == "embedding":
2745-
epochs = self.n_epochs_list if self.n_epochs_list is not None else self.n_epochs
2775+
epochs = (
2776+
self.n_epochs_list if self.n_epochs_list is not None else self.n_epochs
2777+
)
27462778
self.embedding_, aux_data = self._fit_embed_data(
27472779
self._raw_data[index],
27482780
epochs,
@@ -2752,11 +2784,15 @@ def fit(self, X, y=None, force_all_finite=True):
27522784

27532785
if self.n_epochs_list is not None:
27542786
if "embedding_list" not in aux_data:
2755-
raise KeyError("No list of embedding were found in 'aux_data'. "
2756-
"It is likely the layout optimization function "
2757-
"doesn't support the list of int for 'n_epochs'.")
2787+
raise KeyError(
2788+
"No list of embedding were found in 'aux_data'. "
2789+
"It is likely the layout optimization function "
2790+
"doesn't support the list of int for 'n_epochs'."
2791+
)
27582792
else:
2759-
self.embedding_list_ = [e[inverse] for e in aux_data["embedding_list"]]
2793+
self.embedding_list_ = [
2794+
e[inverse] for e in aux_data["embedding_list"]
2795+
]
27602796

27612797
# Assign any points that are fully disconnected from our manifold(s) to have embedding
27622798
# coordinates of np.nan. These will be filtered by our plotting functions automatically.

0 commit comments

Comments
 (0)