8000 [MRG] Fix TSNE init as NumPy array by b-carter · Pull Request #7208 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG] Fix TSNE init as NumPy array #7208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Aug 24, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions sklearn/manifold/t_sne.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from . import _utils
from . import _barnes_hut_tsne
from ..utils.fixes import astype
from ..externals.six import string_types


MACHINE_EPSILON = np.finfo(np.double).eps
Expand Down Expand Up @@ -567,8 +568,9 @@ class TSNE(BaseEstimator):
the distance between them. The default is "euclidean" which is
interpreted as squared euclidean distance.

init : string, optional (default: "random")
Initialization of embedding. Possible options are 'random' and 'pca'.
init : string or numpy array, optional (default: "random")
Initialization of embedding. Possible options are 'random', 'pca',
and a numpy array of shape (n_samples, n_components).
PCA initialization cannot be used with precomputed distances and is
usually more globally stable than random initialization.

Expand Down Expand Up @@ -643,8 +645,10 @@ def __init__(self, n_components=2, perplexity=30.0,
n_iter_without_progress=30, min_grad_norm=1e-7,
metric="euclidean", init="random", verbose=0,
random_state=None, method='barnes_hut', angle=0.5):
if init not in ["pca", "random"] or isinstance(init, np.ndarray):
msg = "'init' must be 'pca', 'random' or a NumPy array"
if not ((isinstance(init, string_types) and
init in ["pca", "random"]) or
isinstance(init, np.ndarray)):
msg = "'init' must be 'pca', 'random', or a numpy array"
raise ValueError(msg)
self.n_components = n_components
self.perplexity = perplexity
Expand Down Expand Up @@ -707,7 +711,7 @@ def _fit(self, X, skip_num_points=0):
raise ValueError("n_iter should be at least 200")

if self.metric == "precomputed":
if self.init == 'pca':
if isinstance(self.init, string_types) and self.init == 'pca':
raise ValueError("The parameter init=\"pca\" cannot be used "
"with metric=\"precomputed\".")
if X.shape[0] != X.shape[1]:
Expand Down Expand Up @@ -763,12 +767,12 @@ def _fit(self, X, skip_num_points=0):
assert np.all(P <= 1), ("All probabilities should be less "
"or then equal to one")

if self.init == 'pca':
if isinstance(self.init, np.ndarray):
X_embedded = self.init
elif self.init == 'pca':
pca = PCA(n_components=self.n_components, svd_solver='randomized',
random_state=random_state)
X_embedded = pca.fit_transform(X)
elif isinstance(self.init, np.ndarray):
X_embedded = self.init
elif self.init == 'random':
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will throw warnings if self.init is a numpy array.

Copy link
Contributor Author
@b-carter b-carter Aug 19, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed this as well. I also wrote a test that generated this warning. Would it make sense to push this test? I'm not sure how to assert that no FutureWarning is generated (I found 'assert_no_warnings' but I see that it's depreciating).

The two lines currently in the test are:
tsne = TSNE(init=np.zeros((2, 2)))
tsne.fit(np.array([[0, 0], [0, 0]]))

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like there's one more place that would throw a warning for the same reason in the event that metric='precomputed'. I'll fix this one as well using a type check first. Same question as above in terms of whether it's a good idea to write a test to check that a FutureError isn't thrown?

X_embedded = None
else:
Expand Down
19 changes: 17 additions & 2 deletions sklearn/manifold/tests/test_t_sne.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from sklearn.utils.testing import assert_less_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_less
from sklearn.utils.testing import assert_raises_regexp
Expand Down Expand Up @@ -305,11 +306,25 @@ def test_non_square_precomputed_distances():


def test_init_not_available():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe just edit the comment instead of removing it

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

# 'init' must be 'pca' or 'random'.
m = "'init' must be 'pca', 'random' or a NumPy array"
# 'init' must be 'pca', 'random', or numpy array.
m = "'init' must be 'pca', 'random', or a numpy array"
assert_raises_regexp(ValueError, m, TSNE, init="not available")


def test_init_ndarray():
# Initialize TSNE with ndarray and test fit
tsne = TSNE(init=np.zeros((100, 2)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you also do a fit?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me know if the test I put in the comment below looks good. If so, I'll push it in so you can merge.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please just add a tsne.fit here and assess at least the size of the fitted attributes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, do you think it's also worthwhile to check the case of init with ndarray and metric='precomputed'? (one change in this PR involves fixing a FutureWarning in this case)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just added another test, should be good now.

X_embedded = tsne.fit_transform(np.ones((100, 5)))
assert_array_equal(np.zeros((100, 2)), X_embedded)


def test_init_ndarray_precomputed():
# Initialize TSNE with ndarray and metric 'precomputed'
# Make sure no FutureWarning is thrown from _fit
tsne = TSNE(init=np.zeros((100, 2)), metric="precomputed")
tsne.fit(np.zeros((100, 100)))


def test_distance_not_available():
# 'metric' must be valid.
tsne = TSNE(metric="not available")
Expand Down
0