Skip to content

Commit 6cb51b2

Browse files
committed
Initialize ARPACK eigsh
`v0 = random_state.rand(M.shape[0])` leads to an initial residual vector in ARPACK which is all positive. However, this is not the absolute or squared residual, but a true difference. Thus, it is better to initialize with `v0=random_state.uniform(-1, 1, M.shape[0])` to have an equally distributed sign. This is the way that ARPACK initializes the residuals. The effect of the previous initialization is that eigsh frequently does not converge to the correct eigenvalues, e.g. negative eigenvalues for s.p.d. matrix, which leads to an incorrect null-space. - initialized all occurences of sklearn.utils.arpack.eigsh the same way it would be initialzed by ARPACK - regression test to test behavior of new initialization
1 parent 8b990c3 commit 6cb51b2

File tree

6 files changed

+55
-9
lines changed

6 files changed

+55
-9
lines changed

doc/whats_new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ Bug fixes
4343
- Fixed bug in :func:`manifold.spectral_embedding` where diagonal of unnormalized
4444
Laplacian matrix was incorrectly set to 1. By `Peter Fischer`_.
4545

46+
- Fixed incorrect initialization of :func:`utils.arpack.eigsh` on all
47+
occurrences. Affects :class:`cluster.SpectralBiclustering`,
48+
:class:`decomposition.KernelPCA`, :class:`manifold.LocallyLinearEmbedding`,
49+
and :class:`manifold.SpectralEmbedding`. By `Peter Fischer`_.
4650

4751
API changes summary
4852
-------------------

sklearn/cluster/bicluster.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from . import KMeans, MiniBatchKMeans
1515
from ..base import BaseEstimator, BiclusterMixin
1616
from ..externals import six
17+
from ..utils import check_random_state
1718
from ..utils.arpack import eigsh, svds
1819

1920
from ..utils.extmath import (make_nonnegative, norm, randomized_svd,
@@ -140,12 +141,18 @@ def _svd(self, array, n_components, n_discard):
140141
# some eigenvalues of A * A.T are negative, causing
141142
# sqrt() to be np.nan. This causes some vectors in vt
142143
# to be np.nan.
143-
_, v = eigsh(safe_sparse_dot(array.T, array),
144-
ncv=self.n_svd_vecs)
144+
A = safe_sparse_dot(array.T, array)
145+
random_state = check_random_state(self.random_state)
146+
# initialize with [-1,1] as in ARPACK
147+
v0 = random_state.uniform(-1, 1, A.shape[0])
148+
_, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
145149
vt = v.T
146150
if np.any(np.isnan(u)):
147-
_, u = eigsh(safe_sparse_dot(array, array.T),
148-
ncv=self.n_svd_vecs)
151+
A = safe_sparse_dot(array, array.T)
152+
random_state = check_random_state(self.random_state)
153+
# initialize with [-1,1] as in ARPACK
154+
v0 = random_state.uniform(-1, 1, A.shape[0])
155+
_, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
149156

150157
assert_all_finite(u)
151158
assert_all_finite(vt)

sklearn/decomposition/kernel_pca.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77
from scipy import linalg
88

9+
from ..utils import check_random_state
910
from ..utils.arpack import eigsh
1011
from ..utils.validation import check_is_fitted
1112
from ..exceptions import NotFittedError
@@ -103,7 +104,8 @@ class KernelPCA(BaseEstimator, TransformerMixin):
103104
def __init__(self, n_components=None, kernel="linear",
104105
gamma=None, degree=3, coef0=1, kernel_params=None,
105106
alpha=1.0, fit_inverse_transform=False, eigen_solver='auto',
106-
tol=0, max_iter=None, remove_zero_eig=False):
107+
tol=0, max_iter=None, remove_zero_eig=False,
108+
random_state=None):
107109
if fit_inverse_transform and kernel == 'precomputed':
108110
raise ValueError(
109111
"Cannot fit_inverse_transform with a precomputed kernel.")
@@ -120,6 +122,7 @@ def __init__(self, n_components=None, kernel="linear",
120122
self.tol = tol
121123
self.max_iter = max_iter
122124
self._centerer = KernelCenterer()
125+
self.random_state = random_state
123126

124127
@property
125128
def _pairwise(self):
@@ -158,10 +161,14 @@ def _fit_transform(self, K):
158161
self.lambdas_, self.alphas_ = linalg.eigh(
159162
K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))
160163
elif eigen_solver == 'arpack':
164+
random_state = check_random_state(self.random_state)
165+
# initialize with [-1,1] as in ARPACK
166+
v0 = random_state.uniform(-1, 1, K.shape[0])
161167
self.lambdas_, self.alphas_ = eigsh(K, n_components,
162168
which="LA",
163169
tol=self.tol,
164-
maxiter=self.max_iter)
170+
maxiter=self.max_iter,
171+
v0=v0)
165172

166173
# sort eigenvectors in descending order
167174
indices = self.lambdas_.argsort()[::-1]

sklearn/manifold/locally_linear.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,8 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
148148

149149
if eigen_solver == 'arpack':
150150
random_state = check_random_state(random_state)
151-
v0 = random_state.rand(M.shape[0])
151+
# initialize with [-1,1] as in ARPACK
152+
v0 = random_state.uniform(-1, 1, M.shape[0])
152153
try:
153154
eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
154155
tol=tol, maxiter=max_iter,

sklearn/manifold/spectral_embedding_.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,9 +259,10 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
259259
# We are computing the opposite of the laplacian inplace so as
260260
# to spare a memory allocation of a possibly very large array
261261
laplacian *= -1
262+
v0 = random_state.uniform(-1, 1, laplacian.shape[0])
262263
lambdas, diffusion_map = eigsh(laplacian, k=n_components,
263264
sigma=1.0, which='LM',
264-
tol=eigen_tol)
265+
tol=eigen_tol, v0=v0)
265266
embedding = diffusion_map.T[n_components::-1] * dd
266267
except RuntimeError:
267268
# When submatrices are exactly singular, an LU decomposition

sklearn/utils/tests/test_utils.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
import numpy as np
44
import scipy.sparse as sp
55
from scipy.linalg import pinv2
6+
from scipy.linalg import eigh
67
from itertools import chain
78

89
from sklearn.utils.testing import (assert_equal, assert_raises, assert_true,
910
assert_almost_equal, assert_array_equal,
10-
SkipTest, assert_raises_regex)
11+
SkipTest, assert_raises_regex,
12+
assert_greater_equal)
1113

1214
from sklearn.utils import check_random_state
1315
from sklearn.utils import deprecated
@@ -18,7 +20,9 @@
1820
from sklearn.utils import shuffle
1921
from sklearn.utils import gen_even_slices
2022
from sklearn.utils.extmath import pinvh
23+
from sklearn.utils.arpack import eigsh
2124
from sklearn.utils.mocking import MockDataFrame
25+
from sklearn.utils.graph import graph_laplacian
2226

2327

2428
def test_make_rng():
@@ -126,6 +130,28 @@ def test_pinvh_simple_complex():
126130
assert_almost_equal(np.dot(a, a_pinv), np.eye(3))
127131

128132

133+
def test_arpack_eigsh_initialization():
134+
'''
135+
Non-regression test that shows null-space computation is better with
136+
initialization of eigsh from [-1,1] instead of [0,1]
137+
'''
138+
random_state = check_random_state(42)
139+
140+
A = random_state.rand(50, 50)
141+
A = np.dot(A.T, A) # create s.p.d. matrix
142+
A = graph_laplacian(A)
143+
k = 5
144+
145+
# Test if eigsh is working correctly
146+
# New initialization [-1,1] (as in original ARPACK)
147+
# Was [0,1] before, with which this test could fail
148+
v0 = random_state.uniform(-1,1, A.shape[0])
149+
w, _ = eigsh(A, k=k, sigma=0.0, v0=v0)
150+
151+
# Eigenvalues of s.p.d. matrix should be nonnegative, w[0] is smallest
152+
assert_greater_equal(w[0], 0)
153+
154+
129155
def test_column_or_1d():
130156
EXAMPLES = [
131157
("binary", ["spam", "egg", "spam"]),

0 commit comments

Comments
 (0)