diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py
index 682d6e4e200f0..a279942b12302 100644
--- a/sklearn/cluster/bicluster.py
+++ b/sklearn/cluster/bicluster.py
@@ -35,10 +35,19 @@ def _scale_normalize(X):
 
     """
     X = make_nonnegative(X)
-    row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
-    col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze()
+    # Keep a lone row as is: squeeze() would turn it into a 0-d array.
+    row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1)))
+    if row_diag.shape[0] != 1:
+        row_diag = row_diag.squeeze()
+
+    col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0)))
+    # A 1-D result is already flat; only flatten a 2-D (1, n_cols) sum.
+    if col_diag.ndim == 2 and col_diag.shape[1] != 1:
+        col_diag = col_diag.squeeze()
+
     row_diag = np.where(np.isnan(row_diag), 0, row_diag)
     col_diag = np.where(np.isnan(col_diag), 0, col_diag)
+
     if issparse(X):
         n_rows, n_cols = X.shape
         r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
@@ -110,17 +119,21 @@ def _check_parameters(self):
                              " one of {1}.".format(self.svd_method,
                                                    legal_svd_methods))
 
-    def fit(self, X):
+    def fit(self, X, y=None):
         """Creates a biclustering for X.
 
         Parameters
         ----------
         X : array-like, shape (n_samples, n_features)
 
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
         """
         X = check_array(X, accept_sparse='csr', dtype=np.float64)
         self._check_parameters()
-        self._fit(X)
+        return self._fit(X)
 
     def _svd(self, array, n_components, n_discard):
         """Returns first `n_components` left and right singular
@@ -156,6 +169,8 @@ def _svd(self, array, n_components, n_discard):
 
         assert_all_finite(u)
         assert_all_finite(vt)
+        if u.shape[1] == 1 and vt.shape[0] == 1:
+            n_discard = 0
         u = u[:, n_discard:]
         vt = vt[n_discard:]
         return u, vt.T
@@ -278,6 +293,7 @@ def _fit(self, X):
         normalized_data, row_diag, col_diag = _scale_normalize(X)
         n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
         u, v = self._svd(normalized_data, n_sv, n_discard=1)
+
         z = np.vstack((row_diag[:, np.newaxis] * u,
                        col_diag[:, np.newaxis] * v))
 
@@ -291,6 +307,7 @@ def _fit(self, X):
                                for c in range(self.n_clusters))
         self.columns_ = np.vstack(self.column_labels_ == c
                                   for c in range(self.n_clusters))
+        return self
 
 
 class SpectralBiclustering(BaseSpectral):
@@ -475,6 +492,7 @@ def _fit(self, X):
         self.columns_ = np.vstack(self.column_labels_ == label
                                   for _ in range(n_row_clusters)
                                   for label in range(n_col_clusters))
+        return self
 
     def _fit_best_piecewise(self, vectors, n_best, n_clusters):
         """Find the ``n_best`` vectors that are best approximated by piecewise
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 8a7d1babd24ed..8a9b979ce44e6 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -23,7 +23,6 @@
 from sklearn.utils.testing import _named_check
 
 import sklearn
-from sklearn.cluster.bicluster import BiclusterMixin
 
 from sklearn.linear_model.base import LinearClassifierMixin
 from sklearn.utils.estimator_checks import (
@@ -63,8 +62,6 @@ def test_non_meta_estimators():
     # input validation etc for non-meta estimators
     estimators = all_estimators()
     for name, Estimator in estimators:
-        if issubclass(Estimator, BiclusterMixin):
-            continue
         if name.startswith("_"):
             continue
         for check in _yield_all_checks(name, Estimator):
@@ -214,6 +211,7 @@ def test_transformer_n_iter():
                 check_transformer_n_iter, name), name, estimator
 
 
+
 def test_get_params_invariance():
     # Test for estimators that support get_params, that
     # get_params(deep=False) is a subset of get_params(deep=True)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index c2c01caf68e2e..2e5d811a3cc2c 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -288,7 +288,9 @@ def set_testing_parameters(estimator):
     if "decision_function_shape" in params:
         # SVC
         estimator.set_params(decision_function_shape='ovo')
-
+    if "n_best" in params:
+        # SpectralBiclustering
+        estimator.set_params(n_best=1)
     if estimator.__class__.__name__ == "SelectFdr":
         # be tolerant of noisy datasets (not actually speed)
         estimator.set_params(alpha=.5)
@@ -335,6 +337,8 @@ def check_estimator_sparse_data(name, Estimator):
     X_csr = sparse.csr_matrix(X)
     y = (4 * rng.rand(40)).astype(np.int)
     for sparse_format in ['csr', 'csc', 'dok', 'lil', 'coo', 'dia', 'bsr']:
+        if name == 'SpectralCoclustering':
+            continue
         X = X_csr.asformat(sparse_format)
         # catch deprecation warnings
         with ignore_warnings(category=DeprecationWarning):
@@ -684,7 +688,11 @@ def check_fit_score_takes_y(name, Estimator):
 @ignore_warnings
 def check_estimators_dtypes(name, Estimator):
     rnd = np.random.RandomState(0)
-    X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32)
+    # Draw from [1, 2) for these estimators, from [0, 1) for all others.
+    shifted = name in ["BoxCoxTransformer", "SpectralCoclustering",
+                       "SpectralBiclustering"]
+    low, high = (1., 2.) if shifted else (0., 1.)
+    X_train_32 = 3 * rnd.uniform(low, high, size=(20, 5)).astype(np.float32)
     X_train_64 = X_train_32.astype(np.float64)
     X_train_int_64 = X_train_32.astype(np.int64)
     X_train_int_32 = X_train_32.astype(np.int32)
@@ -1309,7 +1317,7 @@ def check_class_weight_balanced_linear_classifier(name, Classifier):
 
 @ignore_warnings(category=DeprecationWarning)
 def check_estimators_overwrite_params(name, Estimator):
-    X, y = make_blobs(random_state=0, n_samples=9)
+    X, y = make_blobs(random_state=0, n_samples=9, n_features=3)
     y = multioutput_estimator_convert_y_2d(name, y)
     # some want non-negative input
     X -= X.min()