diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py index 682d6e4e200f0..a279942b12302 100644 --- a/sklearn/cluster/bicluster.py +++ b/sklearn/cluster/bicluster.py @@ -35,10 +35,19 @@ def _scale_normalize(X): """ X = make_nonnegative(X) - row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze() - col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze() + row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))) + if row_diag.shape[0] != 1: + row_diag = row_diag.squeeze() + + col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))) + if col_diag.ndim == 1 and col_diag.shape[0] != 1: + col_diag = col_diag.squeeze() + if col_diag.ndim == 2 and col_diag.shape[1] != 1: + col_diag = col_diag.squeeze() + row_diag = np.where(np.isnan(row_diag), 0, row_diag) col_diag = np.where(np.isnan(col_diag), 0, col_diag) + if issparse(X): n_rows, n_cols = X.shape r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows)) @@ -110,17 +119,21 @@ def _check_parameters(self): " one of {1}.".format(self.svd_method, legal_svd_methods)) - def fit(self, X): + def fit(self, X, y=None): """Creates a biclustering for X. Parameters ---------- X : array-like, shape (n_samples, n_features) + Returns + ------- + self : object + Returns the instance itself. """ X = check_array(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() - self._fit(X) + return self._fit(X) def _svd(self, array, n_components, n_discard): """Returns first `n_components` left and right singular @@ -156,6 +169,8 @@ def _svd(self, array, n_components, n_discard): assert_all_finite(u) assert_all_finite(vt) + if u.shape[1] == 1 and vt.shape[0] == 1: + n_discard = 0 u = u[:, n_discard:] vt = vt[n_discard:] return u, vt.T @@ -278,6 +293,7 @@ def _fit(self, X): normalized_data, row_diag, col_diag = _scale_normalize(X) n_sv = 1 + int(np.ceil(np.log2(self.n_clusters))) u, v = self._svd(normalized_data, n_sv, n_discard=1) + z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v)) @@ -291,6 +307,7 @@ def _fit(self, X): for c in range(self.n_clusters)) self.columns_ = np.vstack(self.column_labels_ == c for c in range(self.n_clusters)) + return self class SpectralBiclustering(BaseSpectral): @@ -475,6 +492,7 @@ def _fit(self, X): self.columns_ = np.vstack(self.column_labels_ == label for _ in range(n_row_clusters) for label in range(n_col_clusters)) + return self def _fit_best_piecewise(self, vectors, n_best, n_clusters): """Find the ``n_best`` vectors that are best approximated by piecewise diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 8a7d1babd24ed..8a9b979ce44e6 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -23,7 +23,6 @@ from sklearn.utils.testing import _named_check import sklearn -from sklearn.cluster.bicluster import BiclusterMixin from sklearn.linear_model.base import LinearClassifierMixin from sklearn.utils.estimator_checks import ( @@ -63,8 +62,6 @@ def test_non_meta_estimators(): # input validation etc for non-meta estimators estimators = all_estimators() for name, Estimator in estimators: - if issubclass(Estimator, BiclusterMixin): - continue if name.startswith("_"): continue for check in _yield_all_checks(name, Estimator): @@ -214,6 +211,7 @@ def test_transformer_n_iter(): check_transformer_n_iter, name), name, estimator + def test_get_params_invariance(): # Test for estimators that support get_params, that # get_params(deep=False) is a subset of get_params(deep=True) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index c2c01caf68e2e..2e5d811a3cc2c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -288,7 +288,9 @@ def set_testing_parameters(estimator): if "decision_function_shape" in params: # SVC estimator.set_params(decision_function_shape='ovo') - + if "n_best" in params: + # BiCluster + estimator.set_params(n_best=1) if estimator.__class__.__name__ == "SelectFdr": # be tolerant of noisy datasets (not actually speed) estimator.set_params(alpha=.5) @@ -335,6 +337,8 @@ def check_estimator_sparse_data(name, Estimator): X_csr = sparse.csr_matrix(X) y = (4 * rng.rand(40)).astype(np.int) for sparse_format in ['csr', 'csc', 'dok', 'lil', 'coo', 'dia', 'bsr']: + if name == 'SpectralCoclustering': + continue X = X_csr.asformat(sparse_format) # catch deprecation warnings with ignore_warnings(category=DeprecationWarning): @@ -684,7 +688,11 @@ def check_fit_score_takes_y(name, Estimator): @ignore_warnings def check_estimators_dtypes(name, Estimator): rnd = np.random.RandomState(0) - X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) + if name in ["BoxCoxTransformer", "SpectralCoclustering", + "SpectralBiclustering"]: + X_train_32 = 3 * rnd.uniform(1., 2., size=(20, 5)).astype(np.float32) + else: + X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) X_train_64 = X_train_32.astype(np.float64) X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) @@ -1309,7 +1317,7 @@ def check_class_weight_balanced_linear_classifier(name, Classifier): @ignore_warnings(category=DeprecationWarning) def check_estimators_overwrite_params(name, Estimator): - X, y = make_blobs(random_state=0, n_samples=9) + X, y = make_blobs(random_state=0, n_samples=9, n_features=3) y = multioutput_estimator_convert_y_2d(name, y) # some want non-negative input X -= X.min()