[MRG] add random_state in tests estimators by kkatrio · Pull Request #8563 · scikit-learn/scikit-learn

[MRG] add random_state in tests estimators #8563


Open

kkatrio wants to merge 5 commits into main
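For context on what this PR does: tests that build data or estimators with unseeded randomness can pass or fail depending on the run. A minimal sketch of the failure mode and the fix — not part of the diff, using only public scikit-learn APIs:

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Unseeded: each call draws fresh data, so two runs rarely agree.
X1, _ = make_blobs(n_samples=100, centers=3)
X2, _ = make_blobs(n_samples=100, centers=3)
# np.array_equal(X1, X2) is almost certainly False here.

# Seeded: both the data and the fitted model are reproducible.
X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
km_a = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X)
km_b = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X)
assert np.array_equal(km_a.labels_, km_b.labels_)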
Changes from all commits
18 changes: 9 additions & 9 deletions sklearn/cluster/tests/test_bicluster.py
@@ -235,30 +235,30 @@ def test_perfect_checkerboard():
 def test_errors():
     data = np.arange(25).reshape((5, 5))
 
-    model = SpectralBiclustering(n_clusters=(3, 3, 3))
+    model = SpectralBiclustering(n_clusters=(3, 3, 3), random_state=42)
     assert_raises(ValueError, model.fit, data)
 
-    model = SpectralBiclustering(n_clusters='abc')
+    model = SpectralBiclustering(n_clusters='abc', random_state=42)
     assert_raises(ValueError, model.fit, data)
 
-    model = SpectralBiclustering(n_clusters=(3, 'abc'))
+    model = SpectralBiclustering(n_clusters=(3, 'abc'), random_state=42)
     assert_raises(ValueError, model.fit, data)
 
-    model = SpectralBiclustering(method='unknown')
+    model = SpectralBiclustering(method='unknown', random_state=42)
     assert_raises(ValueError, model.fit, data)
 
-    model = SpectralBiclustering(svd_method='unknown')
+    model = SpectralBiclustering(svd_method='unknown', random_state=42)
     assert_raises(ValueError, model.fit, data)
 
-    model = SpectralBiclustering(n_components=0)
+    model = SpectralBiclustering(n_components=0, random_state=42)
     assert_raises(ValueError, model.fit, data)
 
-    model = SpectralBiclustering(n_best=0)
+    model = SpectralBiclustering(n_best=0, random_state=42)
     assert_raises(ValueError, model.fit, data)
 
-    model = SpectralBiclustering(n_components=3, n_best=4)
+    model = SpectralBiclustering(n_components=3, n_best=4, random_state=42)
     assert_raises(ValueError, model.fit, data)
 
-    model = SpectralBiclustering()
+    model = SpectralBiclustering(random_state=42)
     data = np.arange(27).reshape((3, 3, 3))
     assert_raises(ValueError, model.fit, data)
14 changes: 7 additions & 7 deletions sklearn/cluster/tests/test_birch.py
@@ -23,7 +23,7 @@
 
 def test_n_samples_leaves_roots():
     # Sanity check for the number of samples in leaves and roots
-    X, y = make_blobs(n_samples=10)
+    X, y = make_blobs(n_samples=10, random_state=42)
     brc = Birch()
     brc.fit(X)
     n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
@@ -35,7 +35,7 @@ def test_n_samples_leaves_roots():
 
 def test_partial_fit():
     # Test that fit is equivalent to calling partial_fit multiple times
-    X, y = make_blobs(n_samples=100)
+    X, y = make_blobs(n_samples=100, random_state=42)
     brc = Birch(n_clusters=3)
     brc.fit(X)
     brc_partial = Birch(n_clusters=None)
@@ -71,7 +71,7 @@ def test_birch_predict():
 
 def test_n_clusters():
     # Test that n_clusters param works properly
-    X, y = make_blobs(n_samples=100, centers=10)
+    X, y = make_blobs(n_samples=100, centers=10, random_state=42)
     brc1 = Birch(n_clusters=10)
     brc1.fit(X)
     assert_greater(len(brc1.subcluster_centers_), 10)
@@ -86,7 +86,7 @@ def test_n_clusters():
     assert_array_equal(brc1.labels_, brc2.labels_)
 
     # Test that the wrong global clustering step raises an Error.
-    clf = ElasticNet()
+    clf = ElasticNet(random_state=42)
     brc3 = Birch(n_clusters=clf)
     assert_raises(ValueError, brc3.fit, X)
 
@@ -97,7 +97,7 @@ def test_n_clusters():
 
 def test_sparse_X():
     # Test that sparse and dense data give same results
-    X, y = make_blobs(n_samples=100, centers=10)
+    X, y = make_blobs(n_samples=100, centers=10, random_state=42)
     brc = Birch(n_clusters=10)
     brc.fit(X)
 
@@ -120,7 +120,7 @@ def check_branching_factor(node, branching_factor):
 
 def test_branching_factor():
     # Test that nodes have at max branching_factor number of subclusters
-    X, y = make_blobs()
+    X, y = make_blobs(random_state=42)
     branching_factor = 9
 
     # Purposefully set a low threshold to maximize the subclusters.
@@ -150,7 +150,7 @@ def check_threshold(birch_instance, threshold):
 
 def test_threshold():
     # Test that the leaf subclusters have a threshold lesser than radius
-    X, y = make_blobs(n_samples=80, centers=4)
+    X, y = make_blobs(n_samples=80, centers=4, random_state=42)
     brc = Birch(threshold=0.5, n_clusters=None)
     brc.fit(X)
     check_threshold(brc, 0.5)
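A note on the Birch changes above: Birch itself exposes no random_state parameter, so the only run-to-run variation in these tests comes from make_blobs, and pinning the data seed is enough to make the subcluster assertions stable. A small sketch under that assumption:

from sklearn.datasets import make_blobs
from sklearn.cluster import Birch

X, _ = make_blobs(n_samples=100, centers=10, random_state=42)
# With the input data fixed, Birch is deterministic: two fits build the
# same CF-tree and hence the same number of subclusters.
brc_a = Birch(n_clusters=10).fit(X)
brc_b = Birch(n_clusters=10).fit(X)
assert len(brc_a.subcluster_centers_) == len(brc_b.subcluster_centers_)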
63 changes: 41 additions & 22 deletions sklearn/cluster/tests/test_k_means.py
@@ -116,7 +116,8 @@ def test_minibatch_update_consistency():
     # step 1: compute the dense minibatch update
     old_inertia, incremental_diff = _mini_batch_step(
         X_mb, x_mb_squared_norms, new_centers, counts,
-        buffer, 1, None, random_reassign=False)
+        buffer, 1, None, random_reassign=False,
+        random_state=42)
     assert_greater(old_inertia, 0.0)
 
     # compute the new inertia on the same batch to check that it decreased
@@ -133,7 +134,8 @@ def test_minibatch_update_consistency():
     # step 2: compute the sparse minibatch update
     old_inertia_csr, incremental_diff_csr = _mini_batch_step(
         X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr,
-        buffer_csr, 1, None, random_reassign=False)
+        buffer_csr, 1, None, random_reassign=False,
+        random_state=42)
     assert_greater(old_inertia_csr, 0.0)
 
     # compute the new inertia on the same batch to check that it decreased
@@ -216,7 +218,7 @@ def test_k_means_plus_plus_init_2_jobs():
 def test_k_means_precompute_distances_flag():
     # check that a warning is raised if the precompute_distances flag is not
     # supported
-    km = KMeans(precompute_distances="wrong")
+    km = KMeans(precompute_distances="wrong", random_state=42)
     assert_raises(ValueError, km.fit, X)
 
 
@@ -263,8 +265,10 @@ def test_k_means_n_init():
 
     # two regression tests on bad n_init argument
     # previous bug: n_init <= 0 threw non-informative TypeError (#3858)
-    assert_raises_regex(ValueError, "n_init", KMeans(n_init=0).fit, X)
-    assert_raises_regex(ValueError, "n_init", KMeans(n_init=-1).fit, X)
+    assert_raises_regex(ValueError, "n_init", KMeans(n_init=0,
+                                                     random_state=42).fit, X)
+    assert_raises_regex(ValueError, "n_init", KMeans(n_init=-1,
+                                                     random_state=42).fit, X)
 
 
 def test_k_means_explicit_init_shape():
@@ -331,7 +335,8 @@ def test_mb_k_means_plus_plus_init_sparse_matrix():
 
 
 def test_minibatch_init_with_large_k():
-    mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20)
+    mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20,
+                                 random_state=42)
     # Check that a warning is raised, as the number clusters is larger
     # than the init_size
     assert_warns(RuntimeWarning, mb_k_means.fit, X)
@@ -516,12 +521,14 @@ def test_minibatch_set_init_size():
 
 
 def test_k_means_invalid_init():
-    km = KMeans(init="invalid", n_init=1, n_clusters=n_clusters)
+    km = KMeans(init="invalid", n_init=1, n_clusters=n_clusters,
+                random_state=42)
     assert_raises(ValueError, km.fit, X)
 
 
 def test_mini_match_k_means_invalid_init():
-    km = MiniBatchKMeans(init="invalid", n_init=1, n_clusters=n_clusters)
+    km = MiniBatchKMeans(init="invalid", n_init=1, n_clusters=n_clusters,
+                         random_state=42)
     assert_raises(ValueError, km.fit, X)
 
 
@@ -605,7 +612,8 @@ def test_predict_minibatch_dense_input():
 
 def test_predict_minibatch_kmeanspp_init_sparse_input():
     mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
-                                 n_init=10).fit(X_csr)
+                                 n_init=10,
+                                 random_state=42).fit(X_csr)
 
     # sanity check: re-predict labeling for training set samples
     assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_)
@@ -621,6 +629,7 @@ def test_predict_minibatch_kmeanspp_init_sparse_input():
 
 def test_predict_minibatch_random_init_sparse_input():
     mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='random',
+                                 random_state=42,
                                  n_init=10).fit(X_csr)
 
     # sanity check: re-predict labeling for training set samples
@@ -643,15 +652,20 @@ def test_int_input():
     init_int = X_int[:2]
 
     fitted_models = [
-        KMeans(n_clusters=2).fit(X_int),
-        KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
+        KMeans(n_clusters=2, random_state=42).fit(X_int),
+        KMeans(n_clusters=2, init=init_int, n_init=1,
+               random_state=42).fit(X_int),
         # mini batch kmeans is very unstable on such a small dataset hence
         # we use many inits
-        MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int),
-        MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int_csr),
+        MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2,
+                        random_state=42).fit(X_int),
+        MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2,
+                        random_state=42).fit(X_int_csr),
         MiniBatchKMeans(n_clusters=2, batch_size=2,
+                        random_state=42,
                         init=init_int, n_init=1).fit(X_int),
         MiniBatchKMeans(n_clusters=2, batch_size=2,
+                        random_state=42,
                         init=init_int, n_init=1).fit(X_int_csr),
     ]
 
@@ -665,7 +679,7 @@ def test_int_input():
 
 
 def test_transform():
-    km = KMeans(n_clusters=n_clusters)
+    km = KMeans(n_clusters=n_clusters, random_state=42)
     km.fit(X)
     X_new = km.transform(km.cluster_centers_)
 
@@ -730,7 +744,8 @@ def test_k_means_function():
     sys.stdout = StringIO()
     try:
         cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
-                                                   verbose=True)
+                                                   verbose=True,
+                                                   random_state=42)
     finally:
         sys.stdout = old_stdout
     centers = cluster_centers
@@ -765,7 +780,7 @@ def test_x_squared_norms_init_centroids():
 
 def test_max_iter_error():
 
-    km = KMeans(max_iter=-1)
+    km = KMeans(max_iter=-1, random_state=42)
     assert_raise_message(ValueError, 'Number of iterations should be',
                          km.fit, X)
 
@@ -821,7 +836,8 @@ def test_k_means_init_centers():
         X_test = dtype(X_small)
         init_centers_test = dtype(init_centers)
         assert_array_equal(init_centers, init_centers_test)
-        km = KMeans(init=init_centers_test, n_clusters=3, n_init=1)
+        km = KMeans(init=init_centers_test, n_clusters=3, n_init=1,
+                    random_state=42)
         km.fit(X_test)
         assert_equal(False, np.may_share_memory(km.cluster_centers_, init_centers))
 
@@ -833,14 +849,15 @@ def test_sparse_k_means_init_centers():
     X = iris.data
 
     # Get a local optimum
-    centers = KMeans(n_clusters=3).fit(X).cluster_centers_
+    centers = KMeans(n_clusters=3, random_state=42).fit(X).cluster_centers_
 
     # Fit starting from a local optimum shouldn't change the solution
     np.testing.assert_allclose(
         centers,
         KMeans(n_clusters=3,
                init=centers,
-               n_init=1).fit(X).cluster_centers_
+               n_init=1,
+               random_state=42).fit(X).cluster_centers_
     )
 
     # The same should be true when X is sparse
@@ -849,7 +866,8 @@ def test_sparse_k_means_init_centers():
         centers,
         KMeans(n_clusters=3,
                init=centers,
-               n_init=1).fit(X_sparse).cluster_centers_
+               n_init=1,
+               random_state=42).fit(X_sparse).cluster_centers_
     )
 
 
@@ -860,10 +878,11 @@ def test_sparse_validate_centers():
     X = iris.data
 
     # Get a local optimum
-    centers = KMeans(n_clusters=4).fit(X).cluster_centers_
+    centers = KMeans(n_clusters=4, random_state=42).fit(X).cluster_centers_
 
     # Test that a ValueError is raised for validate_center_shape
-    classifier = KMeans(n_clusters=3, init=centers, n_init=1)
+    classifier = KMeans(n_clusters=3, init=centers, n_init=1,
+                        random_state=42)
 
     msg = "The shape of the initial centers \(\(4L?, 4L?\)\) " \
           "does not match the number of clusters 3"
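The mechanism behind all of these random_state=42 arguments is sklearn.utils.check_random_state, which estimators call at fit time to turn an int seed into a dedicated numpy RandomState. A short sketch of its behavior:

import numpy as np
from sklearn.utils import check_random_state

# An int seed yields a fresh generator with a fixed stream:
# same seed, same draws.
rs_a = check_random_state(42)
rs_b = check_random_state(42)
assert np.array_equal(rs_a.rand(5), rs_b.rand(5))

# None falls back to the global numpy generator, which is why unseeded
# estimators are not reproducible between test runs.
rs_global = check_random_state(None)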
2 changes: 1 addition & 1 deletion sklearn/cluster/tests/test_mean_shift.py
@@ -27,7 +27,7 @@
 
 def test_estimate_bandwidth():
     # Test estimate_bandwidth
-    bandwidth = estimate_bandwidth(X, n_samples=200)
+    bandwidth = estimate_bandwidth(X, n_samples=200, random_state=42)
     assert_true(0.9 <= bandwidth <= 1.5)
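Why estimate_bandwidth takes a seed here: when n_samples is smaller than the dataset, the function subsamples rows at random before computing neighbor distances, so the estimate depends on which rows are drawn. A minimal sketch (X regenerated locally, since the test module's X is not shown in the diff):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import estimate_bandwidth

X, _ = make_blobs(n_samples=500, centers=3, random_state=0)
bw_a = estimate_bandwidth(X, n_samples=200, random_state=42)
bw_b = estimate_bandwidth(X, n_samples=200, random_state=42)
assert bw_a == bw_b  # same seed, same subsample, same bandwidth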
3 changes: 2 additions & 1 deletion sklearn/cluster/tests/test_spectral.py
@@ -171,7 +171,8 @@ def histogram(x, y, **kwargs):
     assert_equal((X.shape[0],), labels.shape)
 
     # raise error on unknown affinity
-    sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
+    sp = SpectralClustering(n_clusters=2, affinity='<unknown>',
+                            random_state=42)
     assert_raises(ValueError, sp.fit, X)
6 changes: 3 additions & 3 deletions sklearn/covariance/tests/test_robust_covariance.py
@@ -50,7 +50,7 @@ def test_fast_mcd_on_invalid_input():
 
 def test_mcd_class_on_invalid_input():
     X = np.arange(100)
-    mcd = MinCovDet()
+    mcd = MinCovDet(random_state=42)
     assert_raise_message(ValueError, 'Got X with X.ndim=1',
                          mcd.fit, X)
 
@@ -88,14 +88,14 @@ def test_mcd_issue1127():
     # (i.e. n_support = n_samples)
     rnd = np.random.RandomState(0)
     X = rnd.normal(size=(3, 1))
-    mcd = MinCovDet()
+    mcd = MinCovDet(random_state=42)
     mcd.fit(X)
 
 
 def test_outlier_detection():
     rnd = np.random.RandomState(0)
     X = rnd.randn(100, 10)
-    clf = EllipticEnvelope(contamination=0.1)
+    clf = EllipticEnvelope(contamination=0.1, random_state=42)
     assert_raises(NotFittedError, clf.predict, X)
     assert_raises(NotFittedError, clf.decision_function, X)
     clf.fit(X)
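On the covariance changes: MinCovDet (and EllipticEnvelope, which builds on it) runs FastMCD, which starts from randomly drawn subsets of the samples, so the selected support — and hence the estimates — can differ between unseeded runs. A hedged sketch of the reproducibility the seed buys, on synthetic data:

import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
# Same seed, same random subsets, same robust estimates.
mcd_a = MinCovDet(random_state=42).fit(X)
mcd_b = MinCovDet(random_state=42).fit(X)
np.testing.assert_allclose(mcd_a.covariance_, mcd_b.covariance_)
np.testing.assert_allclose(mcd_a.location_, mcd_b.location_)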
3 changes: 2 additions & 1 deletion sklearn/decomposition/tests/test_dict_learning.py
@@ -91,7 +91,8 @@ def test_dict_learning_nonzero_coefs():
 
 def test_dict_learning_unknown_fit_algorithm():
     n_components = 5
-    dico = DictionaryLearning(n_components, fit_algorithm='<unknown>')
+    dico = DictionaryLearning(n_components, fit_algorithm='<unknown>',
+                              random_state=42)
     assert_raises(ValueError, dico.fit, X)
6 changes: 4 additions & 2 deletions sklearn/decomposition/tests/test_fastica.py
@@ -87,7 +87,8 @@ def g_test(x):
             assert_raises(ValueError, fastica, m.T, fun=np.tanh,
                           algorithm=algo)
         else:
-            X = PCA(n_components=2, whiten=True).fit_transform(m.T)
+            X = PCA(n_components=2, whiten=True,
+                    random_state=42).fit_transform(m.T)
             k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False)
             assert_raises(ValueError, fastica, X, fun=np.tanh,
                           algorithm=algo)
@@ -129,7 +130,8 @@ def g_test(x):
         ica = FastICA(fun=fn, algorithm=algo, random_state=0)
         assert_raises(ValueError, ica.fit, m.T)
 
-    assert_raises(TypeError, FastICA(fun=moves.xrange(10)).fit, m.T)
+    assert_raises(TypeError, FastICA(fun=moves.xrange(10),
+                                     random_state=42).fit, m.T)
 
 
 def test_fastica_nowhiten():
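On the FastICA changes: when w_init is not supplied, the algorithm starts from a random unmixing matrix, so the order and sign of the recovered components vary between unseeded runs; fixing random_state pins them. A small sketch with synthetic non-Gaussian sources (all names local to this example):

import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
S = rng.laplace(size=(200, 2))   # non-Gaussian sources
A = rng.randn(2, 2)              # mixing matrix
X = np.dot(S, A.T)               # observed mixtures

# Same seed, same initial unmixing matrix, identical components.
ica_a = FastICA(n_components=2, random_state=42).fit(X)
ica_b = FastICA(n_components=2, random_state=42).fit(X)
np.testing.assert_allclose(ica_a.components_, ica_b.components_)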