From 9d499fedc027a161d382aac2614784f3a84453f3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 29 Oct 2019 22:50:47 +0100 Subject: [PATCH 01/37] DOC issue compiling documentation --- .readthedocs.yml | 2 +- build_tools/travis/install.sh | 8 ++++---- requirements.txt | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 58c4f6258..ccd1cd045 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -4,7 +4,7 @@ formats: - none requirements_file: requirements.txt python: - version: 3.6 + version: 3.7 pip_install: true extra_requirements: - tests diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 342a56883..57447d29c 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -28,10 +28,10 @@ if [[ "$DISTRIB" == "conda" ]]; then MINICONDA_PATH=/home/travis/miniconda chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH export PATH=$MINICONDA_PATH/bin:$PATH - conda config --set always_yes yes --set changeps1 no - conda install conda=4.6 - conda update -q conda - conda info -a + # conda config --set always_yes yes --set changeps1 no + # conda install conda=4.6 + # conda update -q conda + # conda info -a # Configure the conda environment and put it in the path using the # provided versions diff --git a/requirements.txt b/requirements.txt index dd0a12569..54464e995 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy>=1.11 scipy>=0.17 scikit-learn>=0.21 +joblib>=0.11 From b2c1d499b4422141aec20f8c82c378bf76d8f4d2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 29 Oct 2019 23:01:19 +0100 Subject: [PATCH 02/37] iter --- build_tools/circle/build_doc.sh | 6 +++--- build_tools/travis/install.sh | 9 ++------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 3f30c27d4..23df4e0f9 100755 --- a/build_tools/circle/build_doc.sh +++ 
b/build_tools/circle/build_doc.sh @@ -88,12 +88,12 @@ conda update --yes --quiet conda # Configure the conda environment and put it in the path using the # provided versions -conda create -n $CONDA_ENV_NAME --yes --quiet python=3.6 +conda create -n $CONDA_ENV_NAME --yes --quiet python=3.7 source activate $CONDA_ENV_NAME -conda install --yes pip numpy scipy pillow matplotlib sphinx \ +conda install --yes pip numpy scipy joblib pillow matplotlib sphinx \ sphinx_rtd_theme numpydoc pandas keras -pip install --pre scikit-learn +pip install scikit-learn pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git # Build and install imbalanced-learn in dev mode diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 57447d29c..52be6219e 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -28,10 +28,6 @@ if [[ "$DISTRIB" == "conda" ]]; then MINICONDA_PATH=/home/travis/miniconda chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH export PATH=$MINICONDA_PATH/bin:$PATH - # conda config --set always_yes yes --set changeps1 no - # conda install conda=4.6 - # conda update -q conda - # conda info -a # Configure the conda environment and put it in the path using the # provided versions @@ -47,8 +43,7 @@ if [[ "$DISTRIB" == "conda" ]]; then fi if [[ "$SKLEARN_VERSION" == "master" ]]; then - conda install --yes cython - pip install -U git+https://github.com/scikit-learn/scikit-learn.git + pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn else conda install --yes scikit-learn=$SKLEARN_VERSION fi @@ -66,7 +61,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then source testvenv/bin/activate pip3 install scikit-learn - pip3 install pandas keras tensorflow + pip3 install pandas pip3 install pytest pytest-cov codecov sphinx numpydoc fi From 0dddad4959986a37e179f8b19f1fc1cf91d90a1c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 29 Oct 2019 23:03:08 +0100 Subject: [PATCH 03/37] 
iter --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 60fa30fd3..f674869cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,11 +32,11 @@ matrix: # Ubuntu 14.04 environment - env: DISTRIB="ubuntu" # Latest release - - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="0.21.2" + - env: DISTRIB="conda" PYTHON_VERSION="3.7" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="*" OPTIONAL_DEPS="true" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="0.21.2" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="*" OPTIONAL_DEPS="false" - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" From c13add11421b1c364967825d749718e3b9137690 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 29 Oct 2019 23:26:48 +0100 Subject: [PATCH 04/37] iter --- build_tools/travis/install.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 52be6219e..a746ab5ac 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -35,12 +35,13 @@ if [[ "$DISTRIB" == "conda" ]]; then source activate testenv conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION - if [[ "$OPTIONAL_DEPS" == "true" ]]; then - conda install --yes pandas keras tensorflow + if [[ "$OPTIONAL_DEPS" == "keras" ]]; then + conda install --yes pandas keras tensorflow=1 KERAS_BACKEND=tensorflow python -c "import keras.backend" sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; - fi + elif [[ "$OPTIONAL_DEPS" == "tensorflow" ]]; then + conda install --yes pandas tensorflow if [[ "$SKLEARN_VERSION" == "master" ]]; then pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn From d3e377035d40413b20f982153523c0c30b1dc3b7 
Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 29 Oct 2019 23:29:46 +0100 Subject: [PATCH 05/37] iter --- build_tools/travis/install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index a746ab5ac..368f98453 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -42,6 +42,7 @@ if [[ "$DISTRIB" == "conda" ]]; then sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; elif [[ "$OPTIONAL_DEPS" == "tensorflow" ]]; then conda install --yes pandas tensorflow + fi if [[ "$SKLEARN_VERSION" == "master" ]]; then pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn From f326765fc16e20d3acc368e1cc2533e3dd2c2988 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 29 Oct 2019 23:37:47 +0100 Subject: [PATCH 06/37] iter --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f674869cd..d4ab11e1f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,7 +34,10 @@ matrix: # Latest release - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="*" - OPTIONAL_DEPS="true" + OPTIONAL_DEPS="keras" + - env: DISTRIB="conda" PYTHON_VERSION="3.7" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="*" + OPTIONAL_DEPS="tensorflow" - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="*" OPTIONAL_DEPS="false" From bb9e69b35a269e1d2eacb80d3d8f4dfdd5b281ab Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 29 Oct 2019 23:41:46 +0100 Subject: [PATCH 07/37] iter --- build_tools/circle/build_doc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 23df4e0f9..0c5446ea1 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -92,7 +92,7 @@ conda 
create -n $CONDA_ENV_NAME --yes --quiet python=3.7 source activate $CONDA_ENV_NAME conda install --yes pip numpy scipy joblib pillow matplotlib sphinx \ - sphinx_rtd_theme numpydoc pandas keras + sphinx_rtd_theme numpydoc pandas tensorflow pip install scikit-learn pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git From 3dcbb4aafabbf39dff0b85184f067e21e605f30e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 30 Oct 2019 00:01:02 +0100 Subject: [PATCH 08/37] iter --- imblearn/ensemble/tests/test_bagging.py | 83 +++++++++++++------------ 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/imblearn/ensemble/tests/test_bagging.py b/imblearn/ensemble/tests/test_bagging.py index ee7276694..307dcdd31 100644 --- a/imblearn/ensemble/tests/test_bagging.py +++ b/imblearn/ensemble/tests/test_bagging.py @@ -4,6 +4,7 @@ # License: MIT import numpy as np +import pytest from sklearn.datasets import load_iris, make_hastie_10_2 from sklearn.model_selection import (GridSearchCV, ParameterGrid, @@ -15,8 +16,7 @@ from sklearn.svm import SVC from sklearn.feature_selection import SelectKBest from sklearn.utils.testing import (assert_array_equal, - assert_array_almost_equal, assert_raises, - assert_warns, assert_warns_message, + assert_array_almost_equal, assert_allclose) from imblearn.datasets import make_imbalance @@ -194,13 +194,13 @@ def test_oob_score_classification(): assert abs(test_score - clf.oob_score_) < 0.1 # Test with few estimators - assert_warns(UserWarning, - BalancedBaggingClassifier( - base_estimator=base_estimator, - n_estimators=1, - bootstrap=True, - oob_score=True, - random_state=0).fit, X_train, y_train) + with pytest.warns(UserWarning): + BalancedBaggingClassifier( + base_estimator=base_estimator, + n_estimators=1, + bootstrap=True, + oob_score=True, + random_state=0).fit(X_train, y_train) def test_single_estimator(): @@ -238,36 +238,34 @@ def test_error(): base = DecisionTreeClassifier() # Test n_estimators - 
assert_raises(ValueError, - BalancedBaggingClassifier(base, n_estimators=1.5).fit, X, y) - assert_raises(ValueError, - BalancedBaggingClassifier(base, n_estimators=-1).fit, X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, n_estimators=1.5).fit(X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, n_estimators=-1).fit(X, y) # Test max_samples - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_samples=-1).fit, X, y) - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_samples=0.0).fit, X, y) - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_samples=2.0).fit, X, y) - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_samples=1000).fit, X, y) - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_samples="foobar").fit, X, - y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_samples=-1).fit(X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_samples=0.0).fit(X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_samples=2.0).fit(X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_samples=1000).fit(X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_samples="foobar").fit(X, y) # Test max_features - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_features=-1).fit, X, y) - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_features=0.0).fit, X, y) - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_features=2.0).fit, X, y) - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_features=5).fit, X, y) - assert_raises(ValueError, - BalancedBaggingClassifier(base, max_features="foobar").fit, - X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_features=-1).fit(X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_features=0.0).fit(X, y) + 
with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_features=2.0).fit(X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_features=5).fit(X, y) + with pytest.raises(ValueError): + BalancedBaggingClassifier(base, max_features="foobar").fit(X, y) # Test support of decision_function assert not (hasattr( @@ -364,7 +362,8 @@ def test_warm_start_smaller_n_estimators(): clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True) clf.fit(X, y) clf.set_params(n_estimators=4) - assert_raises(ValueError, clf.fit, X, y) + with pytest.raises(ValueError): + clf.fit(X, y) def test_warm_start_equal_n_estimators(): @@ -380,9 +379,9 @@ def test_warm_start_equal_n_estimators(): # modify X to nonsense values, this should not change anything X_train += 1. - assert_warns_message(UserWarning, - "Warm-start fitting without increasing n_estimators" - " does not", clf.fit, X_train, y_train) + warn_msg = "Warm-start fitting without increasing n_estimators does not" + with pytest.warns(UserWarning, match=warn_msg): + clf.fit(X_train, y_train) assert_array_equal(y_pred, clf.predict(X_test)) @@ -412,7 +411,8 @@ def test_warm_start_with_oob_score_fails(): X, y = make_hastie_10_2(n_samples=20, random_state=1) clf = BalancedBaggingClassifier( n_estimators=5, warm_start=True, oob_score=True) - assert_raises(ValueError, clf.fit, X, y) + with pytest.raises(ValueError): + clf.fit(X, y) def test_oob_score_removed_on_warm_start(): @@ -424,7 +424,8 @@ def test_oob_score_removed_on_warm_start(): clf.set_params(warm_start=True, oob_score=False, n_estimators=100) clf.fit(X, y) - assert_raises(AttributeError, getattr, clf, "oob_score_") + with pytest.raises(AttributeError): + getattr(clf, "oob_score_") def test_oob_score_consistency(): From 702f09284f6ea5648ef0e9c319ce3dd786db5da8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 30 Oct 2019 08:14:52 +0100 Subject: [PATCH 09/37] iter --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/.travis.yml b/.travis.yml index d4ab11e1f..3a11f42da 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,13 +33,13 @@ matrix: - env: DISTRIB="ubuntu" # Latest release - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="*" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" OPTIONAL_DEPS="keras" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="*" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" OPTIONAL_DEPS="tensorflow" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="*" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" OPTIONAL_DEPS="false" - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" From d58cf2fb62805e9877d3d8f26b0238910f54a09e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 30 Oct 2019 08:16:24 +0100 Subject: [PATCH 10/37] iter --- appveyor.yml | 4 ++-- build_tools/circle/build_doc.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 4822a5d81..5c3316abd 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -30,8 +30,8 @@ install: - conda update conda -y -q - conda create -n testenv --yes python=%PYTHON_VERSION% pip - activate testenv - - conda install scipy numpy -y -q - - conda install scikit-learn -y -q + - conda install scipy numpy joblib -y -q + - pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn - conda install %OPTIONAL_DEP% -y -q - conda install pytest pytest-cov -y -q - pip install codecov diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 0c5446ea1..d364c74d0 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -93,7 +93,7 @@ source activate $CONDA_ENV_NAME conda install --yes pip numpy scipy joblib pillow matplotlib sphinx \ 
sphinx_rtd_theme numpydoc pandas tensorflow -pip install scikit-learn +pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git # Build and install imbalanced-learn in dev mode From 7e6df7b8927d7e46361e07b56ab681af53b692bd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 30 Oct 2019 08:25:07 +0100 Subject: [PATCH 11/37] iter --- .travis.yml | 3 --- build_tools/travis/install.sh | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3a11f42da..e062ec44e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -41,9 +41,6 @@ matrix: - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" OPTIONAL_DEPS="false" - - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" - OPTIONAL_DEPS="false" allow_failures: - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 368f98453..f55145a17 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -62,7 +62,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then virtualenv --system-site-packages --python=python3 testvenv source testvenv/bin/activate - pip3 install scikit-learn + pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn pip3 install pandas pip3 install pytest pytest-cov codecov sphinx numpydoc From 3397810f360832e00388b32de7f9e79b882071f9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 13:17:19 +0100 Subject: [PATCH 12/37] change import which are breaking with sklearn master --- imblearn/ensemble/_forest.py | 4 ++-- imblearn/ensemble/_weight_boosting.py | 2 +- imblearn/keras/_generator.py | 2 +- imblearn/metrics/_classification.py | 4 ++-- imblearn/metrics/tests/test_classification.py 
| 2 +- imblearn/tensorflow/_generator.py | 2 +- imblearn/tests/test_base.py | 2 +- imblearn/utils/estimator_checks.py | 4 ++-- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 9c6fecbbb..93ca07285 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -17,8 +17,8 @@ from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble.base import _set_random_states -from sklearn.ensemble.forest import _parallel_build_trees +from sklearn.ensemble._base import _set_random_states +from sklearn.ensemble._forest import _parallel_build_trees from sklearn.exceptions import DataConversionWarning from sklearn.utils import check_array from sklearn.utils import check_random_state diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index 7de421086..1c54dacc5 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -5,7 +5,7 @@ from sklearn.base import clone from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble.base import _set_random_states +from sklearn.ensemble._base import _set_random_states from sklearn.tree import DecisionTreeClassifier from sklearn.utils import safe_indexing diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index 75f371e5e..040e9944f 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -44,7 +44,7 @@ def import_from_tensforflow(): from sklearn.base import clone from sklearn.utils import safe_indexing from sklearn.utils import check_random_state -from sklearn.utils.testing import set_random_state +from sklearn.utils._testing import set_random_state from ..under_sampling import RandomUnderSampler from ..utils import Substitution diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index 
c728b5a9a..b4e3dc843 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -20,8 +20,8 @@ import numpy as np import scipy as sp -from sklearn.metrics.classification import (_check_targets, _prf_divide, - precision_recall_fscore_support) +from sklearn.metrics._classification import (_check_targets, _prf_divide, + precision_recall_fscore_support) from sklearn.preprocessing import LabelEncoder from sklearn.utils.multiclass import unique_labels diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 49e62da85..a52be9419 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -17,7 +17,7 @@ from sklearn.utils.fixes import np_version from sklearn.utils.validation import check_random_state from sklearn.utils.testing import assert_allclose, assert_array_equal -from sklearn.utils.testing import assert_no_warnings +from sklearn.utils._testing import assert_no_warnings from sklearn.metrics import accuracy_score, average_precision_score from sklearn.metrics import brier_score_loss, cohen_kappa_score from sklearn.metrics import jaccard_score, precision_score diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index f2e80794b..a218d63e5 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -5,7 +5,7 @@ from sklearn.base import clone from sklearn.utils import safe_indexing from sklearn.utils import check_random_state -from sklearn.utils.testing import set_random_state +from sklearn.utils._testing import set_random_state from ..under_sampling import RandomUnderSampler from ..utils import Substitution diff --git a/imblearn/tests/test_base.py b/imblearn/tests/test_base.py index ca8203093..25a976d34 100644 --- a/imblearn/tests/test_base.py +++ b/imblearn/tests/test_base.py @@ -9,7 +9,7 @@ from sklearn.datasets import load_iris from sklearn.utils.testing import 
assert_array_equal -from sklearn.utils.testing import assert_allclose_dense_sparse +from sklearn.utils._testing import assert_allclose_dense_sparse from imblearn.datasets import make_imbalance from imblearn import FunctionSampler diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index b1ae04762..598dafa87 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -21,8 +21,8 @@ from sklearn.utils.estimator_checks import check_estimator \ as sklearn_check_estimator, check_parameters_default_constructible from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import assert_raises_regex -from sklearn.utils.testing import set_random_state +from sklearn.utils._testing import assert_raises_regex +from sklearn.utils._testing import set_random_state from sklearn.utils.multiclass import type_of_target from imblearn.base import BaseSampler From a03be265b8861b7f1a1c2b14150cc6a1a5d7d0f6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 14:51:02 +0100 Subject: [PATCH 13/37] fix in forest --- imblearn/ensemble/_forest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 93ca07285..d678a39e9 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -42,7 +42,8 @@ def _local_parallel_build_trees(sampler, tree, forest, X, y, sample_weight, sample_weight = safe_indexing(sample_weight, sampler.sample_indices_) tree = _parallel_build_trees(tree, forest, X_resampled, y_resampled, sample_weight, tree_idx, n_trees, - verbose=verbose, class_weight=class_weight) + verbose=verbose, class_weight=class_weight, + n_samples_bootstrap=X_resampled.shape[0]) return sampler, tree From f2b8a270b508f9990c3b59567d312d4f3e6097cc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 15:00:00 +0100 Subject: [PATCH 14/37] MAINT remove EasyEnsemble and BalancedCascade --- 
README.rst | 7 +- doc/api.rst | 2 - imblearn/ensemble/__init__.py | 11 +- imblearn/ensemble/_bagging.py | 2 +- imblearn/ensemble/_balance_cascade.py | 208 ------------------ imblearn/ensemble/_easy_ensemble.py | 116 ---------- .../ensemble/tests/test_balance_cascade.py | 156 ------------- imblearn/ensemble/tests/test_easy_ensemble.py | 91 -------- imblearn/tests/test_common.py | 4 - setup.cfg | 16 +- 10 files changed, 22 insertions(+), 591 deletions(-) delete mode 100644 imblearn/ensemble/_balance_cascade.py delete mode 100644 imblearn/ensemble/tests/test_balance_cascade.py diff --git a/README.rst b/README.rst index 62f10740c..dbe4842b0 100644 --- a/README.rst +++ b/README.rst @@ -162,10 +162,9 @@ Below is a list of the methods currently implemented in this module. 2. SMOTE + ENN [11]_ * Ensemble classifier using samplers internally - 1. EasyEnsemble [13]_ - 2. BalanceCascade [13]_ - 3. Balanced Random Forest [16]_ - 4. Balanced Bagging + 1. Easy Ensemble classifier [13]_ + 2. Balanced Random Forest [16]_ + 3. Balanced Bagging The different algorithms are presented in the sphinx-gallery_. diff --git a/doc/api.rst b/doc/api.rst index e98dfe47b..de1ba4eb4 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -111,10 +111,8 @@ Prototype selection :toctree: generated/ :template: class.rst - ensemble.BalanceCascade ensemble.BalancedBaggingClassifier ensemble.BalancedRandomForestClassifier - ensemble.EasyEnsemble ensemble.EasyEnsembleClassifier ensemble.RUSBoostClassifier diff --git a/imblearn/ensemble/__init__.py b/imblearn/ensemble/__init__.py index 6c7c50b57..3dac8cee8 100644 --- a/imblearn/ensemble/__init__.py +++ b/imblearn/ensemble/__init__.py @@ -3,13 +3,14 @@ under-sampled subsets combined inside an ensemble. 
""" -from ._easy_ensemble import EasyEnsemble from ._easy_ensemble import EasyEnsembleClassifier -from ._balance_cascade import BalanceCascade from ._bagging import BalancedBaggingClassifier from ._forest import BalancedRandomForestClassifier from ._weight_boosting import RUSBoostClassifier -__all__ = ['EasyEnsemble', 'EasyEnsembleClassifier', - 'BalancedBaggingClassifier', 'BalanceCascade', - 'BalancedRandomForestClassifier', 'RUSBoostClassifier'] +__all__ = [ + 'BalancedBaggingClassifier', + 'BalancedRandomForestClassifier', + 'EasyEnsembleClassifier', + 'RUSBoostClassifier', +] diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index 89e764330..2deaf5289 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -127,7 +127,7 @@ class BalancedBaggingClassifier(BaggingClassifier): See also -------- - BalanceCascade, EasyEnsemble + BalancedRandomForestClassifier, EasyEnsembleClassifier References ---------- diff --git a/imblearn/ensemble/_balance_cascade.py b/imblearn/ensemble/_balance_cascade.py deleted file mode 100644 index ecc596628..000000000 --- a/imblearn/ensemble/_balance_cascade.py +++ /dev/null @@ -1,208 +0,0 @@ -"""Class to perform under-sampling using balace cascade.""" - -# Authors: Guillaume Lemaitre -# Christos Aridas -# License: MIT - -from collections import Counter - -import numpy as np - -from sklearn.base import ClassifierMixin, clone -from sklearn.neighbors import KNeighborsClassifier -from sklearn.utils import check_random_state, safe_indexing -from sklearn.model_selection import cross_val_predict -from sklearn.utils.deprecation import deprecated - -from .base import BaseEnsembleSampler -from ..under_sampling.base import BaseUnderSampler -from ..utils import check_sampling_strategy -from ..utils import Substitution -from ..utils._docstring import _random_state_docstring - - -@Substitution( - sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) 
-@deprecated('BalanceCascade is deprecated in 0.4 and will be removed in 0.6.') -class BalanceCascade(BaseEnsembleSampler): - """Create an ensemble of balanced sets by iteratively under-sampling the - imbalanced dataset using an estimator. - - This method iteratively select subset and make an ensemble of the - different sets. The selection is performed using a specific classifier. - - Parameters - ---------- - {sampling_strategy} - - return_indices : bool, optional (default=True) - Whether or not to return the indices of the samples randomly - selected from the majority class. - - {random_state} - - n_max_subset : int or None, optional (default=None) - Maximum number of subsets to generate. By default, all data from - the training will be selected that could lead to a large number of - subsets. We can probably deduce this number empirically. - - estimator : object, optional (default=KNeighborsClassifier()) - An estimator inherited from :class:`sklearn.base.ClassifierMixin` and - having an attribute :func:`predict_proba`. - - bootstrap : bool, optional (default=True) - Whether to bootstrap the data before each iteration. - - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - - Notes - ----- - The method is described in [1]_. - - Supports multi-class resampling. A one-vs.-rest scheme is used as - originally proposed in [1]_. - - See also - -------- - BalancedBaggingClassifier, EasyEnsemble - - References - ---------- - .. [1] X. Y. Liu, J. Wu and Z. H. Zhou, "Exploratory Undersampling for - Class-Imbalance Learning," in IEEE Transactions on Systems, Man, and - Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550, - April 2009. 
- - Examples - -------- - - >>> from collections import Counter - >>> from sklearn.datasets import make_classification - >>> from imblearn.ensemble import \ -BalanceCascade # doctest: +NORMALIZE_WHITESPACE - >>> X, y = make_classification(n_classes=2, class_sep=2, - ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, - ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape %s' % Counter(y)) - Original dataset shape Counter({{1: 900, 0: 100}}) - >>> bc = BalanceCascade(random_state=42) - >>> X_res, y_res = bc.fit_resample(X, y) - >>> print('Resampled dataset shape %s' % Counter(y_res[0])) \ - # doctest: +ELLIPSIS - Resampled dataset shape Counter({{...}}) - - """ - - def __init__(self, - sampling_strategy='auto', - return_indices=False, - random_state=None, - n_max_subset=None, - estimator=None, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) - self.random_state = random_state - self.return_indices = return_indices - self.estimator = estimator - self.n_max_subset = n_max_subset - - def _validate_estimator(self): - """Private function to create the classifier""" - - if (self.estimator is not None and - isinstance(self.estimator, ClassifierMixin) and - hasattr(self.estimator, 'predict')): - self.estimator_ = clone(self.estimator) - elif self.estimator is None: - self.estimator_ = KNeighborsClassifier() - else: - raise ValueError('Invalid parameter `estimator`. 
Got {}.'.format( - type(self.estimator))) - - def _fit_resample(self, X, y): - self._validate_estimator() - - self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, 'under-sampling') - - random_state = check_random_state(self.random_state) - - # array to know which samples are available to be taken - samples_mask = np.ones(y.shape, dtype=bool) - - # where the different set will be stored - idx_under = [] - - n_subsets = 0 - b_subset_search = True - while b_subset_search: - target_stats = Counter( - safe_indexing(y, np.flatnonzero(samples_mask))) - # store the index of the data to under-sample - index_under_sample = np.empty((0, ), dtype=np.int) - # value which will be picked at each round - index_constant = np.empty((0, ), dtype=np.int) - for target_class in target_stats.keys(): - if target_class in self.sampling_strategy_.keys(): - n_samples = self.sampling_strategy_[target_class] - # extract the data of interest for this round from the - # current class - index_class = np.flatnonzero(y == target_class) - index_class_interest = index_class[samples_mask[ - y == target_class]] - y_class = safe_indexing(y, index_class_interest) - # select randomly the desired features - index_target_class = random_state.choice( - range(y_class.size), size=n_samples, replace=False) - index_under_sample = np.concatenate( - (index_under_sample, - index_class_interest[index_target_class]), - axis=0) - else: - index_constant = np.concatenate( - (index_constant, np.flatnonzero(y == target_class)), - axis=0) - - # store the set created - n_subsets += 1 - subset_indices = np.concatenate( - (index_under_sample, index_constant), axis=0) - idx_under.append(subset_indices) - - # fit and predict using cross validation - X_subset = safe_indexing(X, subset_indices) - y_subset = safe_indexing(y, subset_indices) - pred = cross_val_predict(self.estimator_, X_subset, y_subset, cv=3) - # extract the prediction about the targeted classes only - pred_target = 
pred[:index_under_sample.size] - index_classified = index_under_sample[pred_target == safe_indexing( - y_subset, range(index_under_sample.size))] - samples_mask[index_classified] = False - - # check the stopping criterion - if self.n_max_subset is not None: - if n_subsets == self.n_max_subset: - b_subset_search = False - # check that there is enough samples for another round - target_stats = Counter( - safe_indexing(y, np.flatnonzero(samples_mask))) - for target_class in self.sampling_strategy_.keys(): - if (target_stats[target_class] < - self.sampling_strategy_[target_class]): - b_subset_search = False - - X_resampled, y_resampled = [], [] - for indices in idx_under: - X_resampled.append(safe_indexing(X, indices)) - y_resampled.append(safe_indexing(y, indices)) - - if self.return_indices: - return (np.array(X_resampled), np.array(y_resampled), - np.array(idx_under)) - else: - return np.array(X_resampled), np.array(y_resampled) diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index 47ffa6338..368d521fb 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -9,12 +9,9 @@ import numpy as np from sklearn.base import clone -from sklearn.utils import check_random_state from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble.bagging import BaggingClassifier -from sklearn.utils.deprecation import deprecated -from .base import BaseEnsembleSampler from ..under_sampling import RandomUnderSampler from ..under_sampling.base import BaseUnderSampler from ..utils import Substitution, check_target_type @@ -24,119 +21,6 @@ MAX_INT = np.iinfo(np.int32).max -@Substitution( - sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) -@deprecated('EasyEnsemble is deprecated in 0.4 and will be removed in 0.6. 
' - 'Use EasyEnsembleClassifier instead.') -class EasyEnsemble(BaseEnsembleSampler): - """Create an ensemble sets by iteratively applying random under-sampling. - - This method iteratively select a random subset and make an ensemble of the - different sets. - - .. deprecated:: 0.4 - ``EasyEnsemble`` is deprecated in 0.4 and will be removed in 0.6. Use - ``EasyEnsembleClassifier`` instead. - - Parameters - ---------- - {sampling_strategy} - - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected from the majority class. - - {random_state} - - replacement : bool, optional (default=False) - Whether or not to sample randomly with replacement or not. - - n_subsets : int, optional (default=10) - Number of subsets to generate. - - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - - Notes - ----- - The method is described in [1]_. - - Supports multi-class resampling by sampling each class independently. - - See also - -------- - BalanceCascade, BalancedBaggingClassifier - - References - ---------- - .. [1] X. Y. Liu, J. Wu and Z. H. Zhou, "Exploratory Undersampling for - Class-Imbalance Learning," in IEEE Transactions on Systems, Man, and - Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550, - April 2009. - - Examples - -------- - - >>> from collections import Counter - >>> from sklearn.datasets import make_classification - >>> from imblearn.ensemble import \ -EasyEnsemble # doctest: +NORMALIZE_WHITESPACE - >>> X, y = make_classification(n_classes=2, class_sep=2, - ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, - ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape %s' % Counter(y)) - Original dataset shape Counter({{1: 900, 0: 100}}) - >>> ee = EasyEnsemble(random_state=42) # doctest: +SKIP - >>> X_res, y_res = ee.fit_resample(X, y) # doctest: +SKIP - >>> print('Resampled dataset shape %s' % Counter(y_res[0])) - ... # doctest: +SKIP - Resampled dataset shape Counter({{0: 100, 1: 100}}) - - """ - - def __init__(self, - sampling_strategy='auto', - return_indices=False, - random_state=None, - replacement=False, - n_subsets=10, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) - self.random_state = random_state - self.return_indices = return_indices - self.replacement = replacement - self.n_subsets = n_subsets - - def _fit_resample(self, X, y): - random_state = check_random_state(self.random_state) - - X_resampled = [] - y_resampled = [] - if self.return_indices: - idx_under = [] - - for _ in range(self.n_subsets): - rus = RandomUnderSampler( - sampling_strategy=self.sampling_strategy_, - random_state=random_state.randint(MAX_INT), - replacement=self.replacement) - sel_x, sel_y = rus.fit_resample(X, y) - X_resampled.append(sel_x) - y_resampled.append(sel_y) - if self.return_indices: - idx_under.append(rus.sample_indices_) - - if self.return_indices: - return (np.array(X_resampled), np.array(y_resampled), - np.array(idx_under)) - else: - return np.array(X_resampled), np.array(y_resampled) - - @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, random_state=_random_state_docstring) diff --git a/imblearn/ensemble/tests/test_balance_cascade.py b/imblearn/ensemble/tests/test_balance_cascade.py deleted file mode 100644 index cdd3e23c8..000000000 --- a/imblearn/ensemble/tests/test_balance_cascade.py +++ /dev/null @@ -1,156 +0,0 @@ -"""Test the module balance cascade.""" -# Authors: Guillaume Lemaitre -# Christos Aridas -# License: MIT - -import numpy as np - -import 
pytest -from pytest import raises - -from sklearn.utils.testing import assert_array_equal -from sklearn.ensemble import RandomForestClassifier -from sklearn.svm import LinearSVC - -from imblearn.ensemble import BalanceCascade - -RND_SEED = 0 -X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 -], [0.53366841, -0.30312976], [1.52091956, - -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, - 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], - [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ - -0.41635887, -0.38299653 - ], [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) -Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) - - -@pytest.mark.filterwarnings('ignore:Class BalanceCascade is deprecated') -def test_fit_resample_auto(): - sampling_strategy = 'auto' - bc = BalanceCascade( - sampling_strategy=sampling_strategy, - random_state=RND_SEED, - return_indices=True) - X_resampled, y_resampled, idx_under = bc.fit_resample(X, Y) - X_gt = np.array( - [[[1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, - -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [ - -0.18410027, -0.45194484 - ], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [ - 0.11622591, -0.0317206 - ], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [ - 1.52091956, -0.49283504 - ], [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234]], - [[0.28893132, - -0.38761769], [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [ - 0.77481731, 0.60935141 - ], [-0.18410027, -0.45194484], [1.15514042, 0.0129463], - [0.11622591, -0.0317206], [1.25192108, -0.22367336], [ - 0.53366841, -0.30312976 - ], 
[1.52091956, -0.49283504], [0.88407872, 0.35454207], [ - 1.31301027, -0.92648734 - ], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) - y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) - idx_gt = np.array( - [[10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19], - [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19]]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - -@pytest.mark.filterwarnings('ignore:Class BalanceCascade is deprecated') -def test_fit_resample_half(): - sampling_strategy = {0: 8, 1: 10} - bc = BalanceCascade( - sampling_strategy=sampling_strategy, random_state=RND_SEED) - X_resampled, y_resampled = bc.fit_resample(X, Y) - X_gt = np.array([[[-0.41635887, -0.38299653], [0.53366841, -0.30312976], [ - 1.25192108, -0.22367336 - ], [1.70580611, -0.11219234], [1.52091956, -0.49283504], [ - 0.11622591, -0.0317206 - ], [1.31301027, -0.92648734], [0.88407872, 0.35454207], [ - 0.3084254, 0.33299982 - ], [0.08711622, 0.93259929], [-0.28162401, -2.10400981], [ - -0.14374509, 0.27370049 - ], [0.9281014, 0.53085498], [-0.18410027, -0.45194484], - [0.77481731, 0.60935141], [1.15514042, 0.0129463], - [-1.11515198, -0.93689695], [0.70472253, -0.73309052]]]) - y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - - -@pytest.mark.filterwarnings('ignore:Class BalanceCascade is deprecated') -def test_fit_resample_auto_early_stop(): - sampling_strategy = 'auto' - estimator = LinearSVC(random_state=RND_SEED) - bc = BalanceCascade( - sampling_strategy=sampling_strategy, - random_state=RND_SEED, - return_indices=False, - estimator=estimator, - n_max_subset=1) - X_resampled, y_resampled = bc.fit_resample(X, Y) - X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [ - 0.70472253, -0.73309052 - ], 
[-0.14374509, 0.27370049], [0.83680821, 1.72827342], [ - -0.18410027, -0.45194484 - ], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [ - 0.11622591, -0.0317206 - ], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [ - 1.52091956, -0.49283504 - ], [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) - y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - - -@pytest.mark.filterwarnings('ignore:Class BalanceCascade is deprecated') -def test_give_classifier_obj(): - sampling_strategy = 'auto' - estimator = RandomForestClassifier(n_estimators=10, random_state=RND_SEED) - bc = BalanceCascade( - sampling_strategy=sampling_strategy, - random_state=RND_SEED, - return_indices=False, - estimator=estimator) - X_resampled, y_resampled = bc.fit_resample(X, Y) - X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [ - 0.70472253, -0.73309052 - ], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [ - -0.18410027, -0.45194484 - ], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [ - 0.11622591, -0.0317206 - ], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [ - 1.52091956, -0.49283504 - ], [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) - y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - - -@pytest.mark.filterwarnings('ignore:Class BalanceCascade is deprecated') -def test_give_classifier_wrong_obj(): - sampling_strategy = 'auto' - classifier = 2 - bc = BalanceCascade( - sampling_strategy=sampling_strategy, - random_state=RND_SEED, - return_indices=True, - estimator=classifier) - with raises(ValueError, match="Invalid parameter `estimator`"): - bc.fit_resample(X, Y) diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py 
b/imblearn/ensemble/tests/test_easy_ensemble.py index 3cad8285c..f9a8b00e7 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -14,7 +14,6 @@ from sklearn.feature_selection import SelectKBest from sklearn.utils.testing import assert_allclose -from imblearn.ensemble import EasyEnsemble from imblearn.ensemble import EasyEnsembleClassifier from imblearn.datasets import make_imbalance from imblearn.under_sampling import RandomUnderSampler @@ -32,96 +31,6 @@ Y = np.array([1, 2, 2, 2, 1, 0, 1, 1, 1, 0]) -@pytest.mark.filterwarnings('ignore:Class EasyEnsemble is deprecated') -def test_ee_init(): - # Define a sampling_strategy - sampling_strategy = 1. - ee = EasyEnsemble( - sampling_strategy=sampling_strategy, random_state=RND_SEED) - - assert ee.sampling_strategy == sampling_strategy - assert ee.replacement is False - assert ee.n_subsets == 10 - assert ee.random_state == RND_SEED - - -@pytest.mark.filterwarnings('ignore:Class EasyEnsemble is deprecated') -def test_fit_resample_auto(): - # Define the sampling_strategy parameter - sampling_strategy = 'auto' - - # Create the sampling object - ee = EasyEnsemble( - sampling_strategy=sampling_strategy, - random_state=RND_SEED, - return_indices=True, - n_subsets=3) - - # Get the different subset - X_resampled, y_resampled, idx_under = ee.fit_resample(X, Y) - - X_gt = np.array([[[0.85117925, 1.0185556], [-0.58539673, 0.62515052], - [1.35269503, 0.44812421], [0.5220963, 0.11349303], - [1.10915364, 0.05718352], [0.22039505, 0.26469445]], - [[0.85117925, 1.0185556], [-0.58539673, 0.62515052], - [-1.23195149, 0.15427291], [-2.10724436, 0.70263997], - [0.22039505, 0.26469445], [1.10915364, 0.05718352]], - [[0.85117925, 1.0185556], [-0.58539673, 0.62515052], - [-1.23195149, 0.15427291], [0.5220963, 0.11349303], - [1.10915364, 0.05718352], [0.59091459, 0.40692742]]]) - y_gt = np.array([[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], - [0, 0, 1, 1, 2, 2]]) - idx_gt = np.array([[5, 9, 
4, 0, 2, 3], [5, 9, 8, 6, 3, 2], - [5, 9, 8, 0, 2, 1]]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - -@pytest.mark.filterwarnings('ignore:Class EasyEnsemble is deprecated') -def test_fit_resample_half(): - # Define the sampling_strategy parameter - sampling_strategy = {0: 2, 1: 3, 2: 3} - - # Create the sampling object - ee = EasyEnsemble( - sampling_strategy=sampling_strategy, - random_state=RND_SEED, - n_subsets=3) - - # Get the different subset - X_resampled, y_resampled = ee.fit_resample(X, Y) - - X_gt = np.array([[[-0.58539673, 0.62515052], [0.85117925, 1.0185556], - [1.35269503, 0.44812421], [-1.23195149, 0.15427291], - [0.5220963, 0.11349303], [1.10915364, 0.05718352], - [0.59091459, 0.40692742], [0.22039505, 0.26469445]], - [[0.85117925, 1.0185556], [-0.58539673, 0.62515052], - [1.35269503, 0.44812421], [-2.10724436, 0.70263997], - [-1.23195149, 0.15427291], [0.59091459, 0.40692742], - [0.22039505, 0.26469445], [1.10915364, 0.05718352]], - [[0.85117925, 1.0185556], [-0.58539673, 0.62515052], - [-1.23195149, 0.15427291], [0.5220963, 0.11349303], - [1.35269503, 0.44812421], [1.10915364, 0.05718352], - [0.59091459, 0.40692742], [0.22039505, 0.26469445]]]) - y_gt = np.array([[0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 1, 1, 1, 2, 2, 2], - [0, 0, 1, 1, 1, 2, 2, 2]]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - - -@pytest.mark.filterwarnings('ignore:Class EasyEnsemble is deprecated') -def test_random_state_none(): - # Define the sampling_strategy parameter - sampling_strategy = 'auto' - - # Create the sampling object - ee = EasyEnsemble(sampling_strategy=sampling_strategy, random_state=None) - - # Get the different subset - X_resampled, y_resampled = ee.fit_resample(X, Y) - - @pytest.mark.parametrize("n_estimators", [10, 20]) @pytest.mark.parametrize("base_estimator", [ AdaBoostClassifier(n_estimators=5), diff --git a/imblearn/tests/test_common.py 
b/imblearn/tests/test_common.py index 0c1edc662..5c44735ef 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -22,8 +22,6 @@ def test_all_estimator_no_base_class(name, Estimator): @pytest.mark.filterwarnings("ignore:'ratio' is deprecated from 0.4") @pytest.mark.filterwarnings("ignore:'sampling_strategy' as a dict for") -@pytest.mark.filterwarnings("ignore:Class EasyEnsemble is deprecated") -@pytest.mark.filterwarnings('ignore:Class BalanceCascade is deprecated') @pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') @pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') @pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') @@ -56,8 +54,6 @@ def _generate_checks_per_estimator(check_generator, estimators): @pytest.mark.filterwarnings("ignore:'ratio' is deprecated from 0.4") @pytest.mark.filterwarnings("ignore:'sampling_strategy' as a dict for") -@pytest.mark.filterwarnings("ignore:Class EasyEnsemble is deprecated") -@pytest.mark.filterwarnings('ignore:Class BalanceCascade is deprecated') @pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') @pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') @pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') diff --git a/setup.cfg b/setup.cfg index 0384cabf7..2f5667152 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,8 +20,16 @@ values = test = pytest [tool:pytest] -addopts = - --doctest-modules -filterwarnings = - ignore:the matrix subclass:PendingDeprecationWarning +doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS +addopts = + --ignore build_tools + --ignore benchmarks + --ignore doc + --ignore examples + --ignore maint_tools + --doctest-modules + --disable-pytest-warnings + -rs +filterwarnings = + ignore:the matrix subclass:PendingDeprecationWarning \ No newline at end of file From 32c71152ab7d81ed8146524d17d2db5e486b6c7f Mon Sep 17 00:00:00 2001 From: Guillaume 
Lemaitre Date: Thu, 31 Oct 2019 15:12:23 +0100 Subject: [PATCH 15/37] MAINT remove check_ratio --- doc/api.rst | 1 - imblearn/utils/__init__.py | 7 ++-- imblearn/utils/_validation.py | 55 ------------------------- imblearn/utils/tests/test_validation.py | 1 - 4 files changed, 4 insertions(+), 60 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index de1ba4eb4..b83396643 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -249,5 +249,4 @@ Imbalance-learn provides some fast-prototyping tools. utils.estimator_checks.check_estimator utils.check_neighbors_object - utils.check_ratio utils.check_sampling_strategy diff --git a/imblearn/utils/__init__.py b/imblearn/utils/__init__.py index ce53cca31..45ecdbb59 100644 --- a/imblearn/utils/__init__.py +++ b/imblearn/utils/__init__.py @@ -6,10 +6,11 @@ from ._validation import check_neighbors_object from ._validation import check_target_type -from ._validation import check_ratio from ._validation import check_sampling_strategy __all__ = [ - 'Substitution', 'check_neighbors_object', 'check_target_type', - 'check_sampling_strategy', 'check_ratio' + 'check_neighbors_object', + 'check_sampling_strategy', + 'check_target_type', + 'Substitution', ] diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 1bdb3c9c5..d7326dde6 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -478,58 +478,3 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): 'all': _sampling_strategy_all, 'auto': _sampling_strategy_auto } - - -@deprecated("imblearn.utils.check_ratio was deprecated in favor of " - "imblearn.utils.check_sampling_strategy in 0.4. It will be " - "removed in 0.6.") -def check_ratio(ratio, y, sampling_type, **kwargs): - """Sampling target validation for samplers. - - Checks ratio for consistent type and return a dictionary - containing each targeted class with its corresponding number of - sample. - - .. 
deprecated:: 0.4 - This function is deprecated in favor of - :func:`imblearn.utils.check_sampling_strategy`. It will be removed in - 0.6. - - Parameters - ---------- - ratio : str, dict or callable, - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - y : ndarray, shape (n_samples,) - The target array. - - sampling_type : str, - The type of sampling. Can be either ``'over-sampling'`` or - ``'under-sampling'``. - - kwargs : dict, optional - Dictionary of additional keyword arguments to pass to ``ratio``. - - Returns - ------- - ratio_converted : dict, - The converted and validated ratio. Returns a dictionary with - the key being the class target and the value being the desired - number of samples. 
- - """ - return check_sampling_strategy(ratio, y, sampling_type, **kwargs) diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 3880f8c39..7b05b9804 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -15,7 +15,6 @@ from imblearn.utils.testing import warns from imblearn.utils import check_neighbors_object -from imblearn.utils import check_ratio from imblearn.utils import check_sampling_strategy from imblearn.utils import check_target_type From da1265db2df6208bdbafd6ff4b048bb8cb72124c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 15:43:07 +0100 Subject: [PATCH 16/37] MAINT remove ratio after deprecation --- imblearn/base.py | 3 - imblearn/combine/_smote_enn.py | 12 +-- imblearn/combine/_smote_tomek.py | 12 +-- imblearn/datasets/_imbalance.py | 17 +--- imblearn/ensemble/_forest.py | 3 + imblearn/ensemble/base.py | 1 - imblearn/over_sampling/_smote.py | 23 ++---- imblearn/tests/test_common.py | 4 - .../_cluster_centroids.py | 11 +-- .../_condensed_nearest_neighbour.py | 11 +-- .../_edited_nearest_neighbours.py | 39 ++------- .../_instance_hardness_threshold.py | 10 +-- .../_prototype_selection/_nearmiss.py | 11 +-- .../_neighbourhood_cleaning_rule.py | 14 +--- .../_one_sided_selection.py | 11 +-- .../_random_under_sampler.py | 11 +-- .../_prototype_selection/_tomek_links.py | 11 +-- imblearn/utils/estimator_checks.py | 34 -------- imblearn/utils/tests/test_validation.py | 80 ------------------- 19 files changed, 41 insertions(+), 277 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index ab554aa83..0074c6de1 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -46,7 +46,6 @@ def fit(self, X, y): Return the instance itself. 
""" - self._deprecate_ratio() X, y, _ = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type) @@ -73,8 +72,6 @@ def fit_resample(self, X, y): The corresponding label of `X_resampled`. """ - self._deprecate_ratio() - check_classification_targets(y) X, y, binarize_y = self._check_X_y(X, y) diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py index 9651b2d4a..401edd7ee 100644 --- a/imblearn/combine/_smote_enn.py +++ b/imblearn/combine/_smote_enn.py @@ -48,11 +48,6 @@ class SMOTEENN(BaseSampler): The number of threads to open if possible. Will not apply to smote and enn given by the user. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Notes ----- The method is presented in [1]_. @@ -95,15 +90,13 @@ def __init__(self, random_state=None, smote=None, enn=None, - n_jobs=1, - ratio=None): + n_jobs=1): super().__init__() self.sampling_strategy = sampling_strategy self.random_state = random_state self.smote = smote self.enn = enn self.n_jobs = n_jobs - self.ratio = ratio def _validate_estimator(self): "Private function to validate SMOTE and ENN objects" @@ -118,8 +111,7 @@ def _validate_estimator(self): self.smote_ = SMOTE( sampling_strategy=self.sampling_strategy, random_state=self.random_state, - n_jobs=self.n_jobs, - ratio=self.ratio) + n_jobs=self.n_jobs) if self.enn is not None: if isinstance(self.enn, EditedNearestNeighbours): diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py index ffbbb8f59..c9226f0a4 100644 --- a/imblearn/combine/_smote_tomek.py +++ b/imblearn/combine/_smote_tomek.py @@ -48,11 +48,6 @@ class SMOTETomek(BaseSampler): The number of threads to open if possible. Will not apply to smote and tomek given by the user. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. 
- Notes ----- The methos is presented in [1]_. @@ -96,15 +91,13 @@ def __init__(self, random_state=None, smote=None, tomek=None, - n_jobs=1, - ratio=None): + n_jobs=1): super().__init__() self.sampling_strategy = sampling_strategy self.random_state = random_state self.smote = smote self.tomek = tomek self.n_jobs = n_jobs - self.ratio = ratio def _validate_estimator(self): "Private function to validate SMOTE and ENN objects" @@ -120,8 +113,7 @@ def _validate_estimator(self): self.smote_ = SMOTE( sampling_strategy=self.sampling_strategy, random_state=self.random_state, - n_jobs=self.n_jobs, - ratio=self.ratio) + n_jobs=self.n_jobs) if self.tomek is not None: if isinstance(self.tomek, TomekLinks): diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py index e7299a874..f786f1a0e 100644 --- a/imblearn/datasets/_imbalance.py +++ b/imblearn/datasets/_imbalance.py @@ -17,11 +17,11 @@ def make_imbalance(X, y, sampling_strategy=None, - ratio=None, random_state=None, verbose=False, **kwargs): - """Turns a dataset into an imbalanced dataset at specific ratio. + """Turns a dataset into an imbalanced dataset with a specific sampling + strategy. A simple toy dataset to visualize clustering and classification algorithms. @@ -47,11 +47,6 @@ def make_imbalance(X, correspond to the targeted classes. The values correspond to the desired number of samples for each class. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. 
- random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -100,14 +95,6 @@ def make_imbalance(X, X, y = check_X_y(X, y) target_stats = Counter(y) # restrict ratio to be a dict or a callable - # FIXME remove ratio at 0.6 - if ratio is not None: - warnings.warn("'ratio' has been deprecated in 0.4 and will be " - "removed in 0.6. Use 'sampling_strategy' instead.") - sampling_strategy = ratio - elif sampling_strategy is None: - raise TypeError("make_imbalance() missing 1 required positional " - "argument: 'sampling_strategy'") if isinstance(sampling_strategy, dict) or callable(sampling_strategy): sampling_strategy_ = check_sampling_strategy( sampling_strategy, y, 'under-sampling', **kwargs) diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index d678a39e9..126f1df73 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -453,3 +453,6 @@ def fit(self, X, y, sample_weight=None): self.classes_ = self.classes_[0] return self + + def _more_tags(self): + return {'multioutput': False} diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py index e069d0f52..f5de813b1 100644 --- a/imblearn/ensemble/base.py +++ b/imblearn/ensemble/base.py @@ -42,7 +42,6 @@ def fit_resample(self, X, y): The corresponding label of `X_resampled` """ - self._deprecate_ratio() # Ensemble are a bit specific since they are returning an array of # resampled arrays. 
X, y, binarize_y = self._check_X_y(X, y) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index d9f44ed68..a769ce3ce 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -43,10 +43,8 @@ def __init__(self, sampling_strategy='auto', random_state=None, k_neighbors=5, - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.k_neighbors = k_neighbors self.n_jobs = n_jobs @@ -317,7 +315,7 @@ def __init__(self, kind='borderline-1'): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, - k_neighbors=k_neighbors, n_jobs=n_jobs, ratio=None) + k_neighbors=k_neighbors, n_jobs=n_jobs) self.m_neighbors = m_neighbors self.kind = kind @@ -499,7 +497,7 @@ def __init__(self, out_step=0.5): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, - k_neighbors=k_neighbors, n_jobs=n_jobs, ratio=None) + k_neighbors=k_neighbors, n_jobs=n_jobs) self.m_neighbors = m_neighbors self.svm_estimator = svm_estimator self.out_step = out_step @@ -668,11 +666,6 @@ class SMOTE(SVMSMOTE, BorderlineSMOTE): n_jobs : int, optional (default=1) The number of threads to open if possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Notes ----- See the original papers: [1]_ for more details. 
@@ -722,12 +715,11 @@ def __init__(self, out_step='deprecated', kind='deprecated', svm_estimator='deprecated', - n_jobs=1, - ratio=None): + n_jobs=1): # FIXME: in 0.6 call super() BaseSMOTE.__init__(self, sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, - n_jobs=n_jobs, ratio=ratio) + n_jobs=n_jobs) self.kind = kind self.m_neighbors = m_neighbors self.out_step = out_step @@ -955,8 +947,7 @@ def __init__(self, categorical_features, sampling_strategy='auto', random_state=None, k_neighbors=5, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy, random_state=random_state, - k_neighbors=k_neighbors, - ratio=None) + k_neighbors=k_neighbors) self.categorical_features = categorical_features @staticmethod diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 5c44735ef..ec3ca881b 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -20,8 +20,6 @@ def test_all_estimator_no_base_class(name, Estimator): assert not name.lower().startswith('base'), msg -@pytest.mark.filterwarnings("ignore:'ratio' is deprecated from 0.4") -@pytest.mark.filterwarnings("ignore:'sampling_strategy' as a dict for") @pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') @pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') @pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') @@ -52,8 +50,6 @@ def _generate_checks_per_estimator(check_generator, estimators): yield name, Estimator, check -@pytest.mark.filterwarnings("ignore:'ratio' is deprecated from 0.4") -@pytest.mark.filterwarnings("ignore:'sampling_strategy' as a dict for") @pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') @pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') @pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') diff --git 
a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index fe21eaf98..f98a9a43d 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -61,11 +61,6 @@ class ClusterCentroids(BaseUnderSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Notes ----- Supports multi-class resampling by sampling each class independently. @@ -95,10 +90,8 @@ def __init__(self, random_state=None, estimator=None, voting='auto', - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator self.voting = voting diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index d5e4b4642..7126744f9 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -57,11 +57,6 @@ class CondensedNearestNeighbour(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. 
- Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -111,10 +106,8 @@ def __init__(self, random_state=None, n_neighbors=None, n_seeds_S=1, - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 032567af1..9a89dfd3b 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -65,11 +65,6 @@ class EditedNearestNeighbours(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -120,10 +115,8 @@ def __init__(self, random_state=None, n_neighbors=3, kind_sel='all', - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -235,11 +228,6 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of thread to open when it is possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. 
- Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -291,10 +279,8 @@ def __init__(self, n_neighbors=3, max_iter=100, kind_sel='all', - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -321,8 +307,7 @@ def _validate_estimator(self): return_indices=False, n_neighbors=self.nn_, kind_sel=self.kind_sel, - n_jobs=self.n_jobs, - ratio=self.ratio) + n_jobs=self.n_jobs) def _fit_resample(self, X, y): if self.return_indices: @@ -430,11 +415,6 @@ class without early stopping. n_jobs : int, optional (default=1) The number of thread to open when it is possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -487,10 +467,8 @@ def __init__(self, n_neighbors=3, kind_sel='all', allow_minority=False, - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -516,8 +494,7 @@ def _validate_estimator(self): return_indices=False, n_neighbors=self.nn_, kind_sel=self.kind_sel, - n_jobs=self.n_jobs, - ratio=self.ratio) + n_jobs=self.n_jobs) def _fit_resample(self, X, y): if self.return_indices: diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 5991a2785..1970e50cf 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -59,11 +59,6 @@ 
class InstanceHardnessThreshold(BaseUnderSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -109,10 +104,9 @@ def __init__(self, return_indices=False, random_state=None, cv=5, - n_jobs=1, - ratio=None): + n_jobs=1): super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator self.return_indices = return_indices diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 86e018769..f3d3c0279 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -64,11 +64,6 @@ class NearMiss(BaseUnderSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. 
- Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -115,10 +110,8 @@ def __init__(self, version=1, n_neighbors=3, n_neighbors_ver3=3, - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.return_indices = return_indices self.version = version diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index b2a07835a..c11cd2f25 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -65,11 +65,6 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. 
- Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -116,10 +111,8 @@ def __init__(self, n_neighbors=3, kind_sel='all', threshold_cleaning=0.5, - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -155,8 +148,7 @@ def _fit_resample(self, X, y): sampling_strategy=self.sampling_strategy, n_neighbors=self.n_neighbors, kind_sel='mode', - n_jobs=self.n_jobs, - ratio=self.ratio) + n_jobs=self.n_jobs) enn.fit_resample(X, y) index_not_a1 = enn.sample_indices_ index_a1 = np.ones(y.shape, dtype=bool) diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index cea9ecd0d..9d94c72e9 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -54,11 +54,6 @@ class OneSidedSelection(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. 
- Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -105,10 +100,8 @@ def __init__(self, random_state=None, n_neighbors=None, n_seeds_S=1, - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index afa9efb0f..c0c3c9b7c 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -45,11 +45,6 @@ class RandomUnderSampler(BaseUnderSampler): replacement : boolean, optional (default=False) Whether the sample is with or without replacement. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -87,10 +82,8 @@ def __init__(self, sampling_strategy='auto', return_indices=False, random_state=None, - replacement=False, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + replacement=False): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.return_indices = return_indices self.replacement = replacement diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 280377f55..3ecd5284d 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -44,11 +44,6 @@ class TomekLinks(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. 
- ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -92,10 +87,8 @@ def __init__(self, sampling_strategy='auto', return_indices=False, random_state=None, - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.return_indices = return_indices self.n_jobs = n_jobs diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 598dafa87..6f2bed47d 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -32,7 +32,6 @@ from imblearn.over_sampling import SMOTE from imblearn.under_sampling import NearMiss, ClusterCentroids -DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE', 'KMeansSMOTE'] # FIXME: remove in 0.6 DONT_HAVE_RANDOM_STATE = ('NearMiss', 'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours', 'AllKNN', @@ -44,7 +43,6 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_one_label yield check_samplers_fit yield check_samplers_fit_resample - yield check_samplers_ratio_fit_resample yield check_samplers_sampling_strategy_fit_resample yield check_samplers_sparse yield check_samplers_pandas @@ -173,38 +171,6 @@ def check_samplers_fit_resample(name, Sampler): for value in Counter(y_ensemble).values()) -# FIXME remove in 0.6 -> ratio will be deprecated -def check_samplers_ratio_fit_resample(name, Sampler): - if name not in DONT_SUPPORT_RATIO: - # in this test we will force all samplers to not change the class 1 - X, y = make_classification(n_samples=1000, n_classes=3, - n_informative=4, weights=[0.2, 0.3, 0.5], - random_state=0) - sampler = Sampler() - expected_stat = Counter(y)[1] - if isinstance(sampler, BaseOverSampler): - ratio = {2: 498, 0: 498} - 
sampler.set_params(ratio=ratio) - X_res, y_res = sampler.fit_resample(X, y) - assert Counter(y_res)[1] == expected_stat - elif isinstance(sampler, BaseUnderSampler): - ratio = {2: 201, 0: 201} - sampler.set_params(ratio=ratio) - X_res, y_res = sampler.fit_resample(X, y) - assert Counter(y_res)[1] == expected_stat - elif isinstance(sampler, BaseCleaningSampler): - ratio = {2: 201, 0: 201} - sampler.set_params(ratio=ratio) - X_res, y_res = sampler.fit_resample(X, y) - assert Counter(y_res)[1] == expected_stat - if isinstance(sampler, BaseEnsembleSampler): - ratio = {2: 201, 0: 201} - sampler.set_params(ratio=ratio) - X_res, y_res = sampler.fit_resample(X, y) - y_ensemble = y_res[0] - assert Counter(y_ensemble)[1] == expected_stat - - def check_samplers_sampling_strategy_fit_resample(name, Sampler): # in this test we will force all samplers to not change the class 1 X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 7b05b9804..9af14163b 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -253,86 +253,6 @@ def test_check_sampling_strategy(sampling_strategy, sampling_type, assert sampling_strategy_ == expected_sampling_strategy -@pytest.mark.parametrize("ratio, sampling_type, expected_ratio, target", [ - ('auto', 'under-sampling', { - 1: 25, - 2: 25 - }, multiclass_target), ('auto', 'clean-sampling', { - 1: 25, - 2: 25 - }, multiclass_target), ('auto', 'over-sampling', { - 1: 50, - 3: 75 - }, multiclass_target), ('all', 'over-sampling', { - 1: 50, - 2: 0, - 3: 75 - }, multiclass_target), ('all', 'under-sampling', { - 1: 25, - 2: 25, - 3: 25 - }, multiclass_target), ('all', 'clean-sampling', { - 1: 25, - 2: 25, - 3: 25 - }, multiclass_target), ('majority', 'under-sampling', { - 2: 25 - }, multiclass_target), ('majority', 'clean-sampling', { - 2: 25 - }, multiclass_target), ('minority', 
'over-sampling', { - 3: 75 - }, multiclass_target), ('not minority', 'over-sampling', { - 1: 50, - 2: 0 - }, multiclass_target), ('not minority', 'under-sampling', { - 1: 25, - 2: 25 - }, multiclass_target), ('not minority', 'clean-sampling', { - 1: 25, - 2: 25 - }, multiclass_target), ('not majority', 'over-sampling', { - 1: 50, - 3: 75 - }, multiclass_target), ('not majority', 'under-sampling', { - 1: 25, - 3: 25 - }, multiclass_target), ('not majority', 'clean-sampling', { - 1: 25, - 3: 25 - }, multiclass_target), ({ - 1: 70, - 2: 100, - 3: 70 - }, 'over-sampling', { - 1: 20, - 2: 0, - 3: 45 - }, multiclass_target), ({ - 1: 30, - 2: 45, - 3: 25 - }, 'under-sampling', { - 1: 30, - 2: 45, - 3: 25 - }, multiclass_target), ([1], 'clean-sampling', { - 1: 25 - }, multiclass_target), (_sampling_strategy_func, 'over-sampling', { - 1: 50, - 2: 0, - 3: 75 - }, multiclass_target), (0.5, 'over-sampling', { - 1: 25 - }, binary_target), (0.5, 'under-sampling', { - 0: 50 - }, binary_target) -]) -def test_check_ratio(ratio, sampling_type, expected_ratio, target): - with pytest.warns(DeprecationWarning, match="check_ratio is deprecated"): - ratio_ = check_ratio(ratio, target, sampling_type) - assert ratio_ == expected_ratio - - def test_sampling_strategy_dict_over_sampling(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy = {1: 70, 2: 140, 3: 70} From c3dccbb73010bbe31f49eace7b9a811a08ff99de Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 16:03:39 +0100 Subject: [PATCH 17/37] MAINT remove check_ratio --- imblearn/base.py | 19 +------ imblearn/datasets/tests/test_imbalance.py | 19 ------- imblearn/ensemble/_bagging.py | 12 +--- imblearn/over_sampling/_adasyn.py | 11 +--- .../over_sampling/_random_over_sampler.py | 11 +--- .../tests/test_cluster_centroids.py | 37 +++---------- .../_instance_hardness_threshold.py | 3 +- .../tests/test_instance_hardness_threshold.py | 55 +++---------------- 8 files changed, 25 insertions(+), 142 deletions(-) 
diff --git a/imblearn/base.py b/imblearn/base.py index 0074c6de1..dbf4ae5fd 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -123,10 +123,8 @@ class BaseSampler(SamplerMixin): instead. """ - def __init__(self, sampling_strategy='auto', ratio=None): + def __init__(self, sampling_strategy='auto'): self.sampling_strategy = sampling_strategy - # FIXME: remove in 0.6 - self.ratio = ratio @staticmethod def _check_X_y(X, y): @@ -134,21 +132,6 @@ def _check_X_y(X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) return X, y, binarize_y - @property - def ratio_(self): - # FIXME: remove in 0.6 - warnings.warn("'ratio' and 'ratio_' are deprecated. Use " - "'sampling_strategy' and 'sampling_strategy_' instead.", - DeprecationWarning) - return self.sampling_strategy_ - - def _deprecate_ratio(self): - # both ratio and sampling_strategy should not be set - if self.ratio is not None: - deprecate_parameter(self, '0.4', 'ratio', 'sampling_strategy') - self.sampling_strategy = self.ratio - - def _identity(X, y): return X, y diff --git a/imblearn/datasets/tests/test_imbalance.py b/imblearn/datasets/tests/test_imbalance.py index 60a063a7c..5fc72f552 100644 --- a/imblearn/datasets/tests/test_imbalance.py +++ b/imblearn/datasets/tests/test_imbalance.py @@ -18,13 +18,6 @@ def iris(): return load_iris(return_X_y=True) -def test_make_imbalanced_backcompat(iris): - # check an error is raised with we don't pass sampling_strategy and ratio - err_msg = "missing 1 required positional argument" - with pytest.raises(TypeError, match=err_msg): - make_imbalance(*iris) - - @pytest.mark.parametrize( "sampling_strategy, err_msg", [({0: -100, 1: 50, 2: 50}, "in a class cannot be negative"), @@ -55,15 +48,3 @@ def test_make_imbalance_dict(iris, sampling_strategy, expected_counts): X, y = iris _, y_ = make_imbalance(X, y, sampling_strategy=sampling_strategy) assert Counter(y_) == expected_counts - - -@pytest.mark.filterwarnings("ignore:'ratio' has been deprecated in 0.4") 
-@pytest.mark.parametrize( - "sampling_strategy, expected_counts", - [({0: 10, 1: 20, 2: 30}, {0: 10, 1: 20, 2: 30}), - ({0: 10, 1: 20}, {0: 10, 1: 20, 2: 50})] -) -def test_make_imbalance_dict_ratio(iris, sampling_strategy, expected_counts): - X, y = iris - _, y_ = make_imbalance(X, y, ratio=sampling_strategy) - assert Counter(y_) == expected_counts diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index 2deaf5289..557eefef7 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -81,11 +81,6 @@ class BalancedBaggingClassifier(BaggingClassifier): verbose : int, optional (default=0) Controls the verbosity of the building process. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Attributes ---------- base_estimator_ : estimator @@ -183,8 +178,7 @@ def __init__(self, replacement=False, n_jobs=1, random_state=None, - verbose=0, - ratio=None): + verbose=0): super().__init__( base_estimator, @@ -199,7 +193,6 @@ def __init__(self, random_state=random_state, verbose=verbose) self.sampling_strategy = sampling_strategy - self.ratio = ratio self.replacement = replacement def _validate_estimator(self, default=DecisionTreeClassifier()): @@ -220,8 +213,7 @@ def _validate_estimator(self, default=DecisionTreeClassifier()): self.base_estimator_ = Pipeline([('sampler', RandomUnderSampler( sampling_strategy=self.sampling_strategy, - replacement=self.replacement, - ratio=self.ratio)), ('classifier', base_estimator)]) + replacement=self.replacement)), ('classifier', base_estimator)]) def fit(self, X, y): """Build a Bagging ensemble of estimators from the training diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index c3657c6c5..a4b3ecfc7 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -39,11 +39,6 @@ class ADASYN(BaseOverSampler): n_jobs : int, optional (default=1) Number 
of threads to run the algorithm when it is possible. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. - Notes ----- The implementation is based on [1]_. @@ -85,10 +80,8 @@ def __init__(self, sampling_strategy='auto', random_state=None, n_neighbors=5, - n_jobs=1, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + n_jobs=1): + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.n_neighbors = n_neighbors self.n_jobs = n_jobs diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 6d067defb..9f4a8067e 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -41,11 +41,6 @@ class RandomOverSampler(BaseOverSampler): ``return_indices`` is deprecated. Use the attribute ``sample_indices_`` instead. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. 
- Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) @@ -81,10 +76,8 @@ class RandomOverSampler(BaseOverSampler): def __init__(self, sampling_strategy='auto', return_indices=False, - random_state=None, - ratio=None): - super().__init__( - sampling_strategy=sampling_strategy, ratio=ratio) + random_state=None): + super().__init__(sampling_strategy=sampling_strategy) self.return_indices = return_indices self.random_state = random_state diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 87d959731..9e24e5a61 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -36,12 +36,8 @@ def test_fit_resample_auto(): cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_resample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.06738818, -0.529627], - [0.17901516, 0.69860992], [0.094035, -2.55298982]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) + assert X_resampled.shape == (6, 2) + assert y_resampled.shape == (6,) def test_fit_resample_half(): @@ -49,15 +45,8 @@ def test_fit_resample_half(): cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_resample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.13347175, 0.12167502], [ - 0.47104475, 0.44386323 - ], [0.09125309, -0.85409574], [0.19220316, 0.32337101], - [0.094035, -2.55298982], [0.20792588, 1.49407907], - [0.04352327, -0.20515826], [0.12372842, 0.6536186]]) - y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) - print(X_resampled) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - 
assert_array_equal(y_resampled, y_gt) + assert X_resampled.shape == (9, 2) + assert y_resampled.shape == (9,) def test_multiclass_fit_resample(): @@ -65,7 +54,7 @@ def test_multiclass_fit_resample(): y[5] = 2 y[6] = 2 cc = ClusterCentroids(random_state=RND_SEED) - X_resampled, y_resampled = cc.fit_resample(X, y) + _, y_resampled = cc.fit_resample(X, y) count_y_res = Counter(y_resampled) assert count_y_res[0] == 2 assert count_y_res[1] == 2 @@ -81,12 +70,8 @@ def test_fit_resample_object(): estimator=cluster) X_resampled, y_resampled = cc.fit_resample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.06738818, -0.529627], - [0.17901516, 0.69860992], [0.094035, -2.55298982]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) + assert X_resampled.shape == (6, 2) + assert y_resampled.shape == (6,) def test_fit_hard_voting(): @@ -100,12 +85,8 @@ def test_fit_hard_voting(): voting=voting) X_resampled, y_resampled = cc.fit_resample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.09125309, -0.85409574], - [0.12372842, 0.6536186], [0.094035, -2.55298982]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) + assert X_resampled.shape == (6, 2) + assert y_resampled.shape == (6,) for x in X_resampled: assert np.any(np.all(x == X, axis=1)) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 1970e50cf..c0b7b17a2 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -105,8 +105,7 @@ def __init__(self, random_state=None, cv=5, n_jobs=1): - super().__init__( - 
sampling_strategy=sampling_strategy) + super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator self.return_indices = return_indices diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 68fd15b3c..a713cb69f 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -37,17 +37,8 @@ def test_iht_init(): def test_iht_fit_resample(): iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_resample(X, Y) - - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ - -0.65571327, 0.42412021 - ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ - -0.00717161, 0.00318087 - ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], [-0.28305528, 0.30284991]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) + assert X_resampled.shape == (12, 2) + assert y_resampled.shape == (12,) @pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") @@ -55,19 +46,8 @@ def test_iht_fit_resample_with_indices(): iht = InstanceHardnessThreshold( ESTIMATOR, return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = iht.fit_resample(X, Y) - - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ - -0.65571327, 0.42412021 - ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ - -0.00717161, 0.00318087 - ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], [-0.28305528, 
0.30284991]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) - idx_gt = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 7, 10]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) + assert X_resampled.shape == (12, 2) + assert y_resampled.shape == (12,) def test_iht_fit_resample_half(): @@ -75,35 +55,16 @@ def test_iht_fit_resample_half(): iht = InstanceHardnessThreshold( ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_resample(X, Y) - - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ - -0.65571327, 0.42412021 - ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ - -0.00717161, 0.00318087 - ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], - [-0.03852113, 0.40910479], [-0.43877303, 1.07366684], - [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], - [-0.30126957, -0.66268378], [-0.28305528, 0.30284991]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) + assert X_resampled.shape == (14, 2) + assert y_resampled.shape == (14,) def test_iht_fit_resample_class_obj(): est = GradientBoostingClassifier(random_state=RND_SEED) iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_resample(X, Y) - - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ - -0.65571327, 0.42412021 - ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ - -0.00717161, 0.00318087 - ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], [-0.28305528, 0.30284991]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) + assert X_resampled.shape == (12, 2) + assert y_resampled.shape == (12,) def 
test_iht_fit_resample_wrong_class_obj(): From fdaf152945dcb6a77eec14c70a2d86d5262483b0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 16:22:29 +0100 Subject: [PATCH 18/37] DEPR remove return_indices --- .../over_sampling/_random_over_sampler.py | 22 +------- .../tests/test_random_over_sampler.py | 19 ------- imblearn/tests/test_common.py | 1 - .../_condensed_nearest_neighbour.py | 17 ------- .../_edited_nearest_neighbours.py | 51 ------------------- .../_instance_hardness_threshold.py | 17 ------- .../_prototype_selection/_nearmiss.py | 17 ------- .../_neighbourhood_cleaning_rule.py | 18 ------- .../_one_sided_selection.py | 17 +------ .../_random_under_sampler.py | 20 +------- .../_prototype_selection/_tomek_links.py | 19 ------- .../_prototype_selection/tests/test_allknn.py | 35 ------------- .../tests/test_condensed_nearest_neighbour.py | 18 ------- .../tests/test_edited_nearest_neighbours.py | 16 ------ .../tests/test_instance_hardness_threshold.py | 9 ---- .../tests/test_nearmiss.py | 41 --------------- .../tests/test_neighbourhood_cleaning_rule.py | 37 -------------- .../tests/test_one_sided_selection.py | 19 ------- .../tests/test_random_under_sampler.py | 16 ------ ...test_repeated_edited_nearest_neighbours.py | 34 ------------- .../tests/test_tomek_links.py | 23 --------- 21 files changed, 3 insertions(+), 463 deletions(-) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 9f4a8067e..0a06fd310 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -33,21 +33,12 @@ class RandomOverSampler(BaseOverSampler): {random_state} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly selected - in the corresponding classes. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. 
- Attributes ---------- sample_indices_ : ndarray, shape (n_new_samples) Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. Notes ----- @@ -75,10 +66,8 @@ class RandomOverSampler(BaseOverSampler): """ def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None): super().__init__(sampling_strategy=sampling_strategy) - self.return_indices = return_indices self.random_state = random_state @staticmethod @@ -88,10 +77,6 @@ def _check_X_y(X, y): return X, y, binarize_y def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') - random_state = check_random_state(self.random_state) target_stats = Counter(y) @@ -106,14 +91,9 @@ def _fit_resample(self, X, y): target_class_indices[indices]) self.sample_indices_ = np.array(sample_indices) - if self.return_indices: - return (safe_indexing(X, sample_indices), - safe_indexing(y, sample_indices), sample_indices) return (safe_indexing(X, sample_indices), safe_indexing(y, sample_indices)) def _more_tags(self): - # TODO: remove the str tag once the following PR is merged: - # https://github.com/scikit-learn/scikit-learn/pull/14043 - return {'X_types': ['2darray', 'str', 'string'], + return {'X_types': ['2darray', 'string'], 'sample_indices': True} diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 65bbc58ae..5f5ed1b36 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -5,7 +5,6 @@ from collections import Counter -import pytest import numpy as np from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal @@ -60,24 +59,6 @@ def test_ros_fit_resample_half(): assert_array_equal(y_resampled, y_gt) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated 
from 0.4") -def test_random_over_sampling_return_indices(): - ros = RandomOverSampler(return_indices=True, random_state=RND_SEED) - X_resampled, y_resampled, sample_indices = ros.fit_resample(X, Y) - X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ - 0.20792588, 1.49407907 - ], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [ - 0.15490546, 0.3130677 - ], [0.09125309, -0.85409574], [0.12372842, 0.6536186], - [0.13347175, 0.12167502], [0.094035, -2.55298982], - [0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.92923648, 0.76103773], [0.47104475, 0.44386323]]) - y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) - assert_allclose(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(np.sort(np.unique(sample_indices)), np.arange(len(X))) - - def test_multiclass_fit_resample(): y = Y.copy() y[5] = 2 diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index ec3ca881b..a4e87ecc0 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -55,7 +55,6 @@ def _generate_checks_per_estimator(check_generator, estimators): @pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') @pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') @pytest.mark.filterwarnings("ignore:'y' should be of types") -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") @pytest.mark.parametrize( 'name, Estimator, check', _generate_checks_per_estimator(_yield_all_checks, diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 7126744f9..98c049ce1 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -34,14 +34,6 @@ class CondensedNearestNeighbour(BaseCleaningSampler): ---------- 
{sampling_strategy} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. - {random_state} n_neighbors : int or object, optional (default=\ @@ -63,7 +55,6 @@ class CondensedNearestNeighbour(BaseCleaningSampler): Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. Notes ----- @@ -102,14 +93,12 @@ class CondensedNearestNeighbour(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None, n_neighbors=None, n_seeds_S=1, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.return_indices = return_indices self.n_neighbors = n_neighbors self.n_seeds_S = n_seeds_S self.n_jobs = n_jobs @@ -130,9 +119,6 @@ def _validate_estimator(self): ' Got {} instead.'.format(type(self.n_neighbors))) def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') self._validate_estimator() random_state = check_random_state(self.random_state) @@ -207,9 +193,6 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - if self.return_indices: - return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), - idx_under) return safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 9a89dfd3b..bff35b214 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -35,14 +35,6 @@ class EditedNearestNeighbours(BaseCleaningSampler): ---------- {sampling_strategy} - 
return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. - {random_state} .. deprecated:: 0.4 @@ -71,7 +63,6 @@ class EditedNearestNeighbours(BaseCleaningSampler): Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. Notes ----- @@ -111,14 +102,12 @@ class EditedNearestNeighbours(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None, n_neighbors=3, kind_sel='all', n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.return_indices = return_indices self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.n_jobs = n_jobs @@ -138,9 +127,6 @@ def _validate_estimator(self): raise NotImplementedError def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') self._validate_estimator() idx_under = np.empty((0, ), dtype=int) @@ -172,9 +158,6 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - if self.return_indices: - return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), - idx_under) return safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): @@ -194,14 +177,6 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): ---------- {sampling_strategy} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. - {random_state} .. deprecated:: 0.4 @@ -234,7 +209,6 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): Indices of the samples selected. .. 
versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. Notes ----- @@ -274,7 +248,6 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None, n_neighbors=3, max_iter=100, @@ -282,7 +255,6 @@ def __init__(self, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.return_indices = return_indices self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.n_jobs = n_jobs @@ -304,15 +276,11 @@ def _validate_estimator(self): self.enn_ = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, - return_indices=False, n_neighbors=self.nn_, kind_sel=self.kind_sel, n_jobs=self.n_jobs) def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') self._validate_estimator() X_, y_ = X, y @@ -359,8 +327,6 @@ def _fit_resample(self, X, y): X_resampled, y_resampled = X_, y_ - if self.return_indices: - return X_resampled, y_resampled, self.sample_indices_ return X_resampled, y_resampled def _more_tags(self): @@ -379,14 +345,6 @@ class AllKNN(BaseCleaningSampler): ---------- {sampling_strategy} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. - {random_state} .. deprecated:: 0.4 @@ -421,7 +379,6 @@ class without early stopping. Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. Notes ----- @@ -462,7 +419,6 @@ class without early stopping. 
def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None, n_neighbors=3, kind_sel='all', @@ -470,7 +426,6 @@ def __init__(self, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.return_indices = return_indices self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.allow_minority = allow_minority @@ -491,15 +446,11 @@ def _validate_estimator(self): self.enn_ = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, - return_indices=False, n_neighbors=self.nn_, kind_sel=self.kind_sel, n_jobs=self.n_jobs) def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') self._validate_estimator() X_, y_ = X, y @@ -542,8 +493,6 @@ def _fit_resample(self, X, y): X_resampled, y_resampled = X_, y_ - if self.return_indices: - return X_resampled, y_resampled, self.sample_indices_ return X_resampled, y_resampled def _more_tags(self): diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index c0b7b17a2..19bd112c1 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -43,14 +43,6 @@ class InstanceHardnessThreshold(BaseUnderSampler): {sampling_strategy} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. - {random_state} cv : int, optional (default=5) @@ -65,7 +57,6 @@ class InstanceHardnessThreshold(BaseUnderSampler): Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. 
Notes ----- @@ -101,14 +92,12 @@ class InstanceHardnessThreshold(BaseUnderSampler): def __init__(self, estimator=None, sampling_strategy='auto', - return_indices=False, random_state=None, cv=5, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator - self.return_indices = return_indices self.cv = cv self.n_jobs = n_jobs @@ -128,9 +117,6 @@ def _validate_estimator(self): type(self.estimator))) def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') self._validate_estimator() target_stats = Counter(y) @@ -170,9 +156,6 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - if self.return_indices: - return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), - idx_under) return safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index f3d3c0279..0d4e37e76 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -30,14 +30,6 @@ class NearMiss(BaseUnderSampler): ---------- {sampling_strategy} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected from the majority class. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. - {random_state} .. deprecated:: 0.4 @@ -70,7 +62,6 @@ class NearMiss(BaseUnderSampler): Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. 
Notes ----- @@ -105,7 +96,6 @@ class NearMiss(BaseUnderSampler): def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None, version=1, n_neighbors=3, @@ -113,7 +103,6 @@ def __init__(self, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.return_indices = return_indices self.version = version self.n_neighbors = n_neighbors self.n_neighbors_ver3 = n_neighbors_ver3 @@ -210,9 +199,6 @@ def _validate_estimator(self): ' {}'.format(self.version)) def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') self._validate_estimator() idx_under = np.empty((0, ), dtype=int) @@ -280,9 +266,6 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - if self.return_indices: - return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), - idx_under) return safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index c11cd2f25..492b770b4 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -34,14 +34,6 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): ---------- {sampling_strategy} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. - {random_state} .. deprecated:: 0.4 @@ -71,7 +63,6 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. 
Notes ----- @@ -106,7 +97,6 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None, n_neighbors=3, kind_sel='all', @@ -114,7 +104,6 @@ def __init__(self, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.return_indices = return_indices self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.threshold_cleaning = threshold_cleaning @@ -140,9 +129,6 @@ def _validate_estimator(self): " Got {} instead.".format(self.threshold_cleaning)) def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') self._validate_estimator() enn = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, @@ -188,10 +174,6 @@ def _fit_resample(self, X, y): selected_samples[union_a1_a2] = False self.sample_indices_ = np.flatnonzero(selected_samples) - if self.return_indices: - return (safe_indexing(X, self.sample_indices_), - safe_indexing(y, self.sample_indices_), - self.sample_indices_) return (safe_indexing(X, self.sample_indices_), safe_indexing(y, self.sample_indices_)) diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 9d94c72e9..83b4d8735 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -31,14 +31,6 @@ class OneSidedSelection(BaseCleaningSampler): ---------- {sampling_strategy} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. 
- {random_state} n_neighbors : int or object, optional (default=\ @@ -60,7 +52,6 @@ class OneSidedSelection(BaseCleaningSampler): Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. Notes ----- @@ -96,14 +87,12 @@ class OneSidedSelection(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None, n_neighbors=None, n_seeds_S=1, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.return_indices = return_indices self.n_neighbors = n_neighbors self.n_seeds_S = n_seeds_S self.n_jobs = n_jobs @@ -124,9 +113,6 @@ def _validate_estimator(self): ' Got {} instead.'.format(type(self.n_neighbors))) def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') self._validate_estimator() random_state = check_random_state(self.random_state) @@ -177,8 +163,7 @@ def _fit_resample(self, X, y): X_cleaned, y_cleaned = tl.fit_resample(X_resampled, y_resampled) self.sample_indices_ = safe_indexing(idx_under, tl.sample_indices_) - if self.return_indices: - return (X_cleaned, y_cleaned, self.sample_indices_) + return X_cleaned, y_cleaned def _more_tags(self): diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index c0c3c9b7c..9743087e0 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -33,13 +33,6 @@ class RandomUnderSampler(BaseUnderSampler): ---------- {sampling_strategy} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly selected. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. Use the attribute - ``sample_indices_`` instead. 
- {random_state} replacement : boolean, optional (default=False) @@ -51,7 +44,6 @@ class RandomUnderSampler(BaseUnderSampler): Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. Notes ----- @@ -80,12 +72,10 @@ class RandomUnderSampler(BaseUnderSampler): def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None, replacement=False): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.return_indices = return_indices self.replacement = replacement @staticmethod @@ -98,9 +88,6 @@ def _check_X_y(X, y): return X, y, binarize_y def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') random_state = check_random_state(self.random_state) idx_under = np.empty((0, ), dtype=int) @@ -122,13 +109,8 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - if self.return_indices: - return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), - idx_under) return safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): - # TODO: remove the str tag once the following PR is merged: - # https://github.com/scikit-learn/scikit-learn/pull/14043 - return {'X_types': ['2darray', 'str', 'string'], + return {'X_types': ['2darray', 'string'], 'sample_indices': True} diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 3ecd5284d..9fcd56b2f 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -27,15 +27,6 @@ class TomekLinks(BaseCleaningSampler): ---------- {sampling_strategy} - return_indices : bool, optional (default=False) - Whether or not to return the indices of the samples randomly - selected. - - .. deprecated:: 0.4 - ``return_indices`` is deprecated. 
Use the attribute - ``sample_indices_`` instead. - - {random_state} .. deprecated:: 0.4 @@ -50,7 +41,6 @@ class TomekLinks(BaseCleaningSampler): Indices of the samples selected. .. versionadded:: 0.4 - ``sample_indices_`` used instead of ``return_indices=True``. Notes ----- @@ -85,12 +75,10 @@ class TomekLinks(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - return_indices=False, random_state=None, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.return_indices = return_indices self.n_jobs = n_jobs @staticmethod @@ -136,9 +124,6 @@ def is_tomek(y, nn_index, class_type): return links def _fit_resample(self, X, y): - if self.return_indices: - deprecate_parameter(self, '0.4', 'return_indices', - 'sample_indices_') # check for deprecated random_state if self.random_state is not None: deprecate_parameter(self, '0.4', 'random_state') @@ -151,10 +136,6 @@ def _fit_resample(self, X, y): links = self.is_tomek(y, nns, self.sampling_strategy_) self.sample_indices_ = np.flatnonzero(np.logical_not(links)) - if self.return_indices: - return (safe_indexing(X, self.sample_indices_), - safe_indexing(y, self.sample_indices_), - self.sample_indices_) return (safe_indexing(X, self.sample_indices_), safe_indexing(y, self.sample_indices_)) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index d2faf8285..94e2c9f95 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -94,41 +94,6 @@ def test_all_knn_allow_minority(): assert len(y_res_1) < len(y_res_2) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_allknn_fit_resample_with_indices(): - allknn = AllKNN(return_indices=True) - X_resampled, y_resampled, idx_under = allknn.fit_resample(X, Y) - - X_gt = np.array([[-0.53171468, 
-0.53735182], [-0.88864036, -0.33782387], [ - -0.46226554, -0.50481004 - ], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [ - 1.12202806, 0.33811558 - ], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [ - 0.50307437, 0.498805 - ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [ - 0.98382284, 0.37184502 - ], [0.69804044, 0.44810796], [0.04296502, -0.37981873], [ - 0.28294738, -1.00125525 - ], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [ - 1.59068979, -0.96622933 - ], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [ - 1.16606871, -0.25641059 - ], [1.0304995, -0.16955962], [0.48921682, -1.38504507], - [-0.03918551, -0.68540745], [0.24991051, -1.00864997], - [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) - y_gt = np.array([ - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2 - ]) - idx_gt = np.array([ - 6, 13, 32, 39, 4, 5, 14, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21, - 25, 26, 28, 31, 33, 34, 35, 36 - ]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_allclose(y_resampled, y_gt, rtol=R_TOL) - assert_allclose(idx_under, idx_gt, rtol=R_TOL) - - def test_allknn_fit_resample_mode(): allknn = AllKNN(kind_sel='mode') X_resampled, y_resampled = allknn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py index a498d252c..a9c2a6c46 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py @@ -50,24 +50,6 @@ def test_cnn_fit_resample(): assert_array_equal(y_resampled, y_gt) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_cnn_fit_resample_with_indices(): - cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED) - X_resampled, y_resampled, 
idx_under = cnn.fit_resample(X, Y) - - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ - 0.05230552, 0.09043907 - ], [-1.25020462, -0.40402054], [0.70524765, - 0.39816382], [0.35831463, 1.33483198], - [-0.284881, -0.62730973], [0.03394306, 0.03986753], - [-0.01252787, 0.34102657], [0.15198585, 0.12512646]]) - y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2]) - idx_gt = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - def test_cnn_fit_resample_with_object(): knn = KNeighborsClassifier(n_neighbors=1) cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index 2879b9ff9..f27a2dfa5 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -50,22 +50,6 @@ def test_enn_fit_resample(): assert_array_equal(y_resampled, y_gt) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_enn_fit_resample_with_indices(): - enn = EditedNearestNeighbours(return_indices=True) - X_resampled, y_resampled, idx_under = enn.fit_resample(X, Y) - - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ - 2.59928271, 0.93323465 - ], [1.92365863, 0.82718767], [0.25738379, 0.95564169], - [0.78318102, 2.59153329], [0.52726792, -0.38735648]]) - y_gt = np.array([0, 0, 1, 1, 2, 2, 2]) - idx_gt = np.array([4, 11, 0, 3, 1, 8, 15]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - def test_enn_fit_resample_mode(): enn = EditedNearestNeighbours(kind_sel='mode') X_resampled, y_resampled = enn.fit_resample(X, 
Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index a713cb69f..599bba8ed 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -41,15 +41,6 @@ def test_iht_fit_resample(): assert y_resampled.shape == (12,) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_iht_fit_resample_with_indices(): - iht = InstanceHardnessThreshold( - ESTIMATOR, return_indices=True, random_state=RND_SEED) - X_resampled, y_resampled, idx_under = iht.fit_resample(X, Y) - assert X_resampled.shape == (12, 2) - assert y_resampled.shape == (12,) - - def test_iht_fit_resample_half(): sampling_strategy = {0: 6, 1: 8} iht = InstanceHardnessThreshold( diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py index decdbf3fd..e16a104a3 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py @@ -70,47 +70,6 @@ def test_nm_fit_resample_auto(): assert_array_equal(y_resampled, y_gt[version_idx]) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_nm_fit_resample_auto_indices(): - sampling_strategy = 'auto' - X_gt = [ - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], [0.50701028, -0.17636928], - [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], 
[0.50701028, -0.17636928], - [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], - [0.03142011, 0.12323596], [1.15157493, -1.2981518], - [-0.54619583, 1.73009918], [0.99272351, -0.11631728]]) - ] - y_gt = [ - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) - ] - idx_gt = [ - np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]), - np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]), - np.array([3, 10, 11, 0, 5, 8, 14, 4, 12]) - ] - for version_idx, version in enumerate(VERSION_NEARMISS): - nm = NearMiss( - sampling_strategy=sampling_strategy, - version=version, - return_indices=True) - X_resampled, y_resampled, idx_under = nm.fit_resample(X, Y) - assert_array_equal(X_resampled, X_gt[version_idx]) - assert_array_equal(y_resampled, y_gt[version_idx]) - assert_array_equal(idx_under, idx_gt[version_idx]) - - def test_nm_fit_resample_float_sampling_strategy(): sampling_strategy = {0: 3, 1: 4, 2: 4} X_gt = [ diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py index c71131ac9..5d60556ce 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -66,43 +66,6 @@ def test_ncr_fit_resample_mode(): assert_array_equal(y_resampled, y_gt) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_ncr_fit_resample_with_indices(): - ncr = NeighbourhoodCleaningRule(return_indices=True) - X_resampled, y_resampled, idx_under = ncr.fit_resample(X, Y) - - X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ - -0.20413357, 0.64628718 - ], [0.35967591, 2.61186964], 
[0.90701028, - -0.57636928], [-1.20809175, -1.49917302], - [-0.60497017, -0.66630228], [1.39272351, -0.51631728], - [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) - y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) - idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_ncr_fit_resample_nn_obj(): - nn = NearestNeighbors(n_neighbors=4) - ncr = NeighbourhoodCleaningRule(return_indices=True, n_neighbors=nn) - X_resampled, y_resampled, idx_under = ncr.fit_resample(X, Y) - - X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ - -0.20413357, 0.64628718 - ], [0.35967591, 2.61186964], [0.90701028, - -0.57636928], [-1.20809175, -1.49917302], - [-0.60497017, -0.66630228], [1.39272351, -0.51631728], - [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) - y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) - idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - def test_deprecation_random_state(): ncr = NeighbourhoodCleaningRule(random_state=0) with warns( diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py index 224b6804a..d6a5bf853 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py @@ -48,25 +48,6 @@ def test_oss_fit_resample(): assert_array_equal(y_resampled, y_gt) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_oss_fit_resample_with_indices(): - oss = OneSidedSelection(return_indices=True, random_state=RND_SEED) - X_resampled, 
y_resampled, idx_under = oss.fit_resample(X, Y) - - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ - -0.65571327, 0.42412021 - ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ - -0.00717161, 0.00318087 - ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.30126957, -0.66268378], [0.20246714, -0.34727125]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) - idx_gt = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 8, 11]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - def test_oss_with_object(): knn = KNeighborsClassifier(n_neighbors=1) oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index 27fe86e7e..680ec0b3d 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -33,22 +33,6 @@ def test_rus_fit_resample(): assert_array_equal(y_resampled, y_gt) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_rus_fit_resample_with_indices(): - rus = RandomUnderSampler( - return_indices=True, random_state=RND_SEED, replacement=True) - X_resampled, y_resampled, idx_under = rus.fit_resample(X, Y) - - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.09125309, -0.85409574], - [0.12372842, 0.6536186], [0.04352327, -0.20515826]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) - idx_gt = np.array([1, 3, 8, 6, 7, 0]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - def test_rus_fit_resample_half(): sampling_strategy = {0: 3, 1: 6} rus = 
RandomUnderSampler( diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index bbe56ad72..0371942a4 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -86,40 +86,6 @@ def test_renn_fit_resample(): assert_array_equal(y_resampled, y_gt) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_renn_fit_resample_with_indices(): - renn = RepeatedEditedNearestNeighbours(return_indices=True) - X_resampled, y_resampled, idx_under = renn.fit_resample(X, Y) - - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ - -0.46226554, -0.50481004 - ], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [ - 1.12202806, 0.33811558 - ], [0.73489726, 0.43915195], [0.50307437, 0.498805], [ - 0.84929742, 0.41042894 - ], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [ - 0.69804044, 0.44810796 - ], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [ - 0.34218094, -0.58781961 - ], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [ - 0.73418199, -0.02222847 - ], [0.79270821, -0.41386668], [1.16606871, -0.25641059], - [1.0304995, -0.16955962], [0.48921682, -1.38504507], - [-0.03918551, -0.68540745], [0.24991051, -1.00864997], - [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) - y_gt = np.array([ - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2 - ]) - idx_gt = np.array([ - 6, 13, 32, 39, 4, 5, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21, 25, - 26, 28, 31, 33, 34, 35, 36 - ]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - def test_renn_fit_resample_mode_object(): renn = 
RepeatedEditedNearestNeighbours(kind_sel='mode') X_resampled, y_resampled = renn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py index e3c0ef18b..7118862b8 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py @@ -50,29 +50,6 @@ def test_tl_fit_resample(): assert_array_equal(y_resampled, y_gt) -@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4") -def test_tl_fit_resample_with_indices(): - tl = TomekLinks(return_indices=True) - X_resampled, y_resampled, idx_under = tl.fit_resample(X, Y) - - X_gt = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], [ - 1.34192108, -0.13367336 - ], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [ - -0.37162401, -2.19400981 - ], [0.74680821, 1.63827342], [0.2184254, 0.24299982], [ - 0.61472253, -0.82309052 - ], [0.19893132, -0.47761769], [0.97407872, 0.44454207], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], - [-0.00288378, 0.84259929], [1.79580611, -0.02219234]]) - y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]) - idx_gt = np.array( - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 19]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - def test_deprecation_random_state(): tl = TomekLinks(random_state=0) with warns( From 0e4a665371402e8f91be7c7ed48ddf7ea9820fcc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 17:01:14 +0100 Subject: [PATCH 19/37] FIX SMOTE --- imblearn/ensemble/_forest.py | 2 +- imblearn/over_sampling/_smote.py | 138 +------------ .../over_sampling/tests/test_kmeans_smote.py | 10 +- imblearn/over_sampling/tests/test_smote.py | 185 ------------------ 4 
files changed, 13 insertions(+), 322 deletions(-) diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 126f1df73..2efaa4e9d 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -37,7 +37,7 @@ def _local_parallel_build_trees(sampler, tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose=0, class_weight=None): # resample before to fit the tree - X_resampled, y_resampled = sampler.fit_sample(X, y) + X_resampled, y_resampled = sampler.fit_resample(X, y) if sample_weight is not None: sample_weight = safe_indexing(sample_weight, sampler.sample_indices_) tree = _parallel_build_trees(tree, forest, X_resampled, y_resampled, diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index a769ce3ce..64ec1afe8 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -7,8 +7,6 @@ # License: MIT import math -import types -import warnings from collections import Counter import numpy as np @@ -33,9 +31,6 @@ from ..utils import Substitution from ..utils._docstring import _random_state_docstring -# FIXME: remove in 0.6 -SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm') - class BaseSMOTE(BaseOverSampler): """Base class for the different SMOTE algorithms.""" @@ -329,11 +324,7 @@ def _validate_estimator(self): '"borderline-1" and "borderline-2".' 
'Got {} instead.'.format(self.kind)) - # FIXME: rename _sample -> _fit_resample in 0.6 def _fit_resample(self, X, y): - return self._sample(X, y) - - def _sample(self, X, y): self._validate_estimator() X_resampled = X.copy() @@ -517,11 +508,7 @@ def _validate_estimator(self): raise_isinstance_error('svm_estimator', [SVC], self.svm_estimator) - # FIXME: rename _sample -> _fit_resample in 0.6 def _fit_resample(self, X, y): - return self._sample(X, y) - - def _sample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) X_resampled = X.copy() @@ -603,11 +590,10 @@ def _sample(self, X, y): return X_resampled, y_resampled -# FIXME: In 0.6, SMOTE should inherit only from BaseSMOTE. @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, random_state=_random_state_docstring) -class SMOTE(SVMSMOTE, BorderlineSMOTE): +class SMOTE(BaseSMOTE): """Class to perform over-sampling using SMOTE. This object is an implementation of SMOTE - Synthetic Minority @@ -627,42 +613,6 @@ class SMOTE(SVMSMOTE, BorderlineSMOTE): :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. - m_neighbors : int or object, optional (default=10) - If int, number of nearest neighbours to use to determine if a minority - sample is in danger. Used with ``kind={{'borderline1', 'borderline2', - 'svm'}}``. If object, an estimator that inherits - from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used - to find the k_neighbors. - - .. deprecated:: 0.4 - ``m_neighbors`` is deprecated in 0.4 and will be removed in 0.6. Use - :class:`BorderlineSMOTE` or :class:`SVMSMOTE` instead to use the - intended algorithm. - - out_step : float, optional (default=0.5) - Step size when extrapolating. Used with ``kind='svm'``. - - .. deprecated:: 0.4 - ``out_step`` is deprecated in 0.4 and will be removed in 0.6. Use - :class:`SVMSMOTE` instead to use the intended algorithm. 
- - kind : str, optional (default='regular') - The type of SMOTE algorithm to use one of the following options: - ``'regular'``, ``'borderline1'``, ``'borderline2'``, ``'svm'``. - - .. deprecated:: 0.4 - ``kind`` is deprecated in 0.4 and will be removed in 0.6. Use - :class:`BorderlineSMOTE` or :class:`SVMSMOTE` instead to use the - intended algorithm. - - svm_estimator : object, optional (default=SVC()) - If ``kind='svm'``, a parametrized :class:`sklearn.svm.SVC` - classifier can be passed. - - .. deprecated:: 0.4 - ``out_step`` is deprecated in 0.4 and will be removed in 0.6. Use - :class:`SVMSMOTE` instead to use the intended algorithm. - n_jobs : int, optional (default=1) The number of threads to open if possible. @@ -711,86 +661,16 @@ def __init__(self, sampling_strategy='auto', random_state=None, k_neighbors=5, - m_neighbors='deprecated', - out_step='deprecated', - kind='deprecated', - svm_estimator='deprecated', n_jobs=1): - # FIXME: in 0.6 call super() - BaseSMOTE.__init__(self, sampling_strategy=sampling_strategy, - random_state=random_state, k_neighbors=k_neighbors, - n_jobs=n_jobs) - self.kind = kind - self.m_neighbors = m_neighbors - self.out_step = out_step - self.svm_estimator = svm_estimator - self.n_jobs = n_jobs - - def _validate_estimator(self): - # FIXME: in 0.6 call super() - BaseSMOTE._validate_estimator(self) - # FIXME: remove in 0.6 after deprecation cycle - if self.kind != 'deprecated' and not (self.kind == 'borderline-1' or - self.kind == 'borderline-2'): - if self.kind not in SMOTE_KIND: - raise ValueError('Unknown kind for SMOTE algorithm.' - ' Choices are {}. Got {} instead.'.format( - SMOTE_KIND, self.kind)) - else: - warnings.warn('"kind" is deprecated in 0.4 and will be ' - 'removed in 0.6. 
Use SMOTE, BorderlineSMOTE or ' - 'SVMSMOTE instead.', DeprecationWarning) - - if self.kind == 'borderline1' or self.kind == 'borderline2': - self._sample = types.MethodType(BorderlineSMOTE._sample, self) - self.kind = ('borderline-1' if self.kind == 'borderline1' - else 'borderline-2') - - elif self.kind == 'svm': - self._sample = types.MethodType(SVMSMOTE._sample, self) - - if self.out_step == 'deprecated': - self.out_step = 0.5 - else: - warnings.warn('"out_step" is deprecated in 0.4 and will ' - 'be removed in 0.6. Use SVMSMOTE class ' - 'instead.', DeprecationWarning) - - if self.svm_estimator == 'deprecated': - warnings.warn('"svm_estimator" is deprecated in 0.4 and ' - 'will be removed in 0.6. Use SVMSMOTE class ' - 'instead.', DeprecationWarning) - if (self.svm_estimator is None or - self.svm_estimator == 'deprecated'): - self.svm_estimator_ = SVC(gamma='scale', - random_state=self.random_state) - elif isinstance(self.svm_estimator, SVC): - self.svm_estimator_ = clone(self.svm_estimator) - else: - raise_isinstance_error('svm_estimator', [SVC], - self.svm_estimator) - - if self.kind != 'regular': - if self.m_neighbors == 'deprecated': - self.m_neighbors = 10 - else: - warnings.warn('"m_neighbors" is deprecated in 0.4 and ' - 'will be removed in 0.6. 
Use SVMSMOTE class ' - 'or BorderlineSMOTE instead.', - DeprecationWarning) - - self.nn_m_ = check_neighbors_object( - 'm_neighbors', self.m_neighbors, additional_neighbor=1) - self.nn_m_.set_params(**{'n_jobs': self.n_jobs}) + super().__init__( + sampling_strategy=sampling_strategy, + random_state=random_state, + k_neighbors=k_neighbors, + n_jobs=n_jobs + ) - # FIXME: to be removed in 0.6 def _fit_resample(self, X, y): self._validate_estimator() - return self._sample(X, y) - - def _sample(self, X, y): - # FIXME: uncomment in version 0.6 - # self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() @@ -1222,11 +1102,7 @@ def _find_cluster_sparsity(self, X): else self.density_exponent) return (mean_distance ** exponent) / X.shape[0] - # FIXME: rename _sample -> _fit_resample in 0.6 def _fit_resample(self, X, y): - return self._sample(X, y) - - def _sample(self, X, y): self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 9bb9e9a62..83c80dd8e 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -36,8 +36,8 @@ def test_kmeans_smote(data): k_neighbors=5) smote = SMOTE(random_state=42) - X_res_1, y_res_1 = kmeans_smote.fit_sample(X, y) - X_res_2, y_res_2 = smote.fit_sample(X, y) + X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y) + X_res_2, y_res_2 = smote.fit_resample(X, y) assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) @@ -59,7 +59,7 @@ def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator): kmeans_smote = KMeansSMOTE(random_state=42, kmeans_estimator=kmeans_estimator, k_neighbors=k_neighbors) - X_resampled, y_resampled = kmeans_smote.fit_sample(X, y) + X_resampled, y_resampled = kmeans_smote.fit_resample(X, y) assert X_resampled.shape == (24, 2) assert y_resampled.shape == (24,) @@ -75,7 +75,7 @@ def 
test_sample_kmeans_not_enough_clusters(): kmeans_estimator=30, k_neighbors=2) with pytest.raises(RuntimeError): - smote.fit_sample(X, y) + smote.fit_resample(X, y) @pytest.mark.parametrize("density_exponent", ["auto", 2]) @@ -86,7 +86,7 @@ def test_sample_kmeans_density_estimation(data, density_exponent, smote = KMeansSMOTE(random_state=42, density_exponent=density_exponent, cluster_balance_threshold=cluster_balance_threshold) - smote.fit_sample(X, y) + smote.fit_resample(X, y) @pytest.mark.parametrize( diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 21539943c..276fffcf9 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -79,127 +79,6 @@ def test_sample_regular_half(): assert_array_equal(y_resampled, y_gt) -@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') -def test_sample_borderline1(): - kind = 'borderline1' - smote = SMOTE(random_state=RND_SEED, kind=kind) - X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 - ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ - -0.28162401, -2.10400981 - ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ - 0.70472253, -0.73309052 - ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ - -0.41635887, -0.38299653 - ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.3765279, -0.2009615], [0.55276636, -0.10550373], - [0.45413452, -0.08883319], [1.21118683, -0.22817957]]) - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) - - 
-@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') -def test_sample_borderline2(): - kind = 'borderline2' - smote = SMOTE(random_state=RND_SEED, kind=kind) - X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 - ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ - -0.28162401, -2.10400981 - ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ - 0.70472253, -0.73309052 - ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], [0.08711622, 0.93259929], - [1.70580611, -0.11219234], [0.47436888, -0.2645749], - [1.07844561, -0.19435291], [0.33339622, 0.49870937]]) - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) - - -@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') -@pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') -@pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') -@pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') -def test_sample_svm(): - kind = 'svm' - smote = SMOTE(random_state=RND_SEED, kind=kind) - X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - 
[0.9281014, 0.53085498], - [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234], - [0.47436887, -0.2645749], - [1.07844562, -0.19435291], - [1.44228238, -1.31256615], - [1.25636713, -1.04463226]]) - y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, - 1, 0, 1, 0, 0, 0, 0, 0]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) - - -@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') -@pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') -def test_fit_resample_nn_obj(): - kind = 'borderline1' - nn_m = NearestNeighbors(n_neighbors=11) - nn_k = NearestNeighbors(n_neighbors=6) - smote = SMOTE( - random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) - X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 - ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ - -0.28162401, -2.10400981 - ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ - 0.70472253, -0.73309052 - ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ - -0.41635887, -0.38299653 - ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.3765279, -0.2009615], [0.55276636, -0.10550373], - [0.45413452, -0.08883319], [1.21118683, -0.22817957]]) - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) - - def test_sample_regular_with_nn(): nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k) @@ -226,70 +105,6 @@ def test_sample_regular_with_nn(): assert_array_equal(y_resampled, y_gt) 
-@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') -@pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') -@pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') -@pytest.mark.parametrize( - "smote_params, err_msg", - [({"kind": "rnd"}, "Unknown kind for SMOTE"), - ({"kind": "borderline1", - "k_neighbors": NearestNeighbors(n_neighbors=6), - "m_neighbors": 'rnd'}, "has to be one of"), - ({"k_neighbors": 'rnd', - "m_neighbors": NearestNeighbors(n_neighbors=10)}, "has to be one of"), - ({"kind": "regular", - "k_neighbors": 'rnd'}, "has to be one of"), - ({"kind": "svm", - "k_neighbors": NearestNeighbors(n_neighbors=6), - "svm_estimator": 'rnd'}, "has to be one of")] -) -def test_smote_error_passing_estimator(smote_params, err_msg): - smote = SMOTE(**smote_params) - with pytest.raises(ValueError, match=err_msg): - smote.fit_resample(X, Y) - - -@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') -@pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') -@pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') -@pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') -def test_sample_with_nn_svm(): - kind = 'svm' - nn_k = NearestNeighbors(n_neighbors=6) - svm = SVC(gamma='scale', random_state=RND_SEED) - smote = SMOTE( - random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm) - X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - 
[-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234], - [0.47436887, -0.2645749], - [1.07844562, -0.19435291], - [1.44228238, -1.31256615], - [1.25636713, -1.04463226]]) - y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, - 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) - - @pytest.mark.parametrize( "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=['borderline', 'svm'] ) From 23076ea00555dd0845cea95e132fcaf9e4b3e0eb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 17:06:58 +0100 Subject: [PATCH 20/37] DEPR remove random_state --- imblearn/tests/test_common.py | 8 ---- .../_edited_nearest_neighbours.py | 47 ++----------------- .../_prototype_selection/_nearmiss.py | 16 +------ .../_neighbourhood_cleaning_rule.py | 17 +------ .../_prototype_selection/_tomek_links.py | 14 +----- imblearn/utils/estimator_checks.py | 25 ++-------- 6 files changed, 11 insertions(+), 116 deletions(-) diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index a4e87ecc0..9c0d22883 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -20,10 +20,6 @@ def test_all_estimator_no_base_class(name, Estimator): assert not name.lower().startswith('base'), msg -@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') -@pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') -@pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') -@pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') @pytest.mark.filterwarnings("ignore:'y' should be of types") @pytest.mark.filterwarnings("ignore:The number of the samples to") @pytest.mark.parametrize( @@ -50,10 +46,6 @@ def _generate_checks_per_estimator(check_generator, estimators): yield name, Estimator, check -@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 
and will be') -@pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') -@pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') -@pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') @pytest.mark.filterwarnings("ignore:'y' should be of types") @pytest.mark.parametrize( 'name, Estimator, check', diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index bff35b214..e77907e9c 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -16,15 +16,12 @@ from ..base import BaseCleaningSampler from ...utils import check_neighbors_object from ...utils import Substitution -from ...utils.deprecation import deprecate_parameter -from ...utils._docstring import _random_state_docstring SEL_KIND = ('all', 'mode') @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) class EditedNearestNeighbours(BaseCleaningSampler): """Class to perform under-sampling based on the edited nearest neighbour method. @@ -35,11 +32,6 @@ class EditedNearestNeighbours(BaseCleaningSampler): ---------- {sampling_strategy} - {random_state} - - .. deprecated:: 0.4 - ``random_state`` is deprecated in 0.4 and will be removed in 0.6. - n_neighbors : int or object, optional (default=3) If ``int``, size of the neighbourhood to consider to compute the nearest neighbors. 
If object, an estimator that inherits from @@ -102,23 +94,16 @@ class EditedNearestNeighbours(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - random_state=None, n_neighbors=3, kind_sel='all', n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) - self.random_state = random_state self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.n_jobs = n_jobs def _validate_estimator(self): """Validate the estimator created in the ENN.""" - - # check for deprecated random_state - if self.random_state is not None: - deprecate_parameter(self, '0.4', 'random_state') - self.nn_ = check_neighbors_object( 'n_neighbors', self.n_neighbors, additional_neighbor=1) self.nn_.set_params(**{'n_jobs': self.n_jobs}) @@ -165,8 +150,7 @@ def _more_tags(self): @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) class RepeatedEditedNearestNeighbours(BaseCleaningSampler): """Class to perform under-sampling based on the repeated edited nearest neighbour method. @@ -177,11 +161,6 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): ---------- {sampling_strategy} - {random_state} - - .. deprecated:: 0.4 - ``random_state`` is deprecated in 0.4 and will be removed in 0.6. - n_neighbors : int or object, optional (default=3) If ``int``, size of the neighbourhood to consider to compute the nearest neighbors. 
If object, an estimator that inherits from @@ -248,13 +227,11 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - random_state=None, n_neighbors=3, max_iter=100, kind_sel='all', n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) - self.random_state = random_state self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.n_jobs = n_jobs @@ -262,11 +239,6 @@ def __init__(self, def _validate_estimator(self): """Private function to create the NN estimator""" - - # check for deprecated random_state - if self.random_state is not None: - deprecate_parameter(self, '0.4', 'random_state') - if self.max_iter < 2: raise ValueError('max_iter must be greater than 1.' ' Got {} instead.'.format(type(self.max_iter))) @@ -334,8 +306,7 @@ def _more_tags(self): @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) class AllKNN(BaseCleaningSampler): """Class to perform under-sampling based on the AllKNN method. @@ -345,11 +316,6 @@ class AllKNN(BaseCleaningSampler): ---------- {sampling_strategy} - {random_state} - - .. deprecated:: 0.4 - ``random_state`` is deprecated in 0.4 and will be removed in 0.6. - n_neighbors : int or object, optional (default=3) If ``int``, size of the neighbourhood to consider to compute the nearest neighbors. If object, an estimator that inherits from @@ -419,13 +385,11 @@ class without early stopping. 
def __init__(self, sampling_strategy='auto', - random_state=None, n_neighbors=3, kind_sel='all', allow_minority=False, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) - self.random_state = random_state self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.allow_minority = allow_minority @@ -433,11 +397,6 @@ def __init__(self, def _validate_estimator(self): """Create objects required by AllKNN""" - - # check for deprecated random_state - if self.random_state is not None: - deprecate_parameter(self, '0.4', 'random_state') - if self.kind_sel not in SEL_KIND: raise NotImplementedError diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 0d4e37e76..ca3897ef8 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -14,13 +14,10 @@ from ..base import BaseUnderSampler from ...utils import check_neighbors_object from ...utils import Substitution -from ...utils.deprecation import deprecate_parameter -from ...utils._docstring import _random_state_docstring @Substitution( - sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + sampling_strategy=BaseUnderSampler._sampling_strategy_docstring) class NearMiss(BaseUnderSampler): """Class to perform under-sampling based on NearMiss methods. @@ -30,11 +27,6 @@ class NearMiss(BaseUnderSampler): ---------- {sampling_strategy} - {random_state} - - .. deprecated:: 0.4 - ``random_state`` is deprecated in 0.4 and will be removed in 0.6. - version : int, optional (default=1) Version of the NearMiss to use. Possible values are 1, 2 or 3. 
@@ -96,13 +88,11 @@ class NearMiss(BaseUnderSampler): def __init__(self, sampling_strategy='auto', - random_state=None, version=1, n_neighbors=3, n_neighbors_ver3=3, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) - self.random_state = random_state self.version = version self.n_neighbors = n_neighbors self.n_neighbors_ver3 = n_neighbors_ver3 @@ -182,10 +172,6 @@ def _selection_dist_based(self, def _validate_estimator(self): """Private function to create the NN estimator""" - # check for deprecated random_state - if self.random_state is not None: - deprecate_parameter(self, '0.4', 'random_state') - self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors) self.nn_.set_params(**{'n_jobs': self.n_jobs}) diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 492b770b4..ed9f0b3b8 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -15,15 +15,12 @@ from ._edited_nearest_neighbours import EditedNearestNeighbours from ...utils import check_neighbors_object from ...utils import Substitution -from ...utils.deprecation import deprecate_parameter -from ...utils._docstring import _random_state_docstring SEL_KIND = ('all', 'mode') @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) class NeighbourhoodCleaningRule(BaseCleaningSampler): """Class performing under-sampling based on the neighbourhood cleaning rule. @@ -34,11 +31,6 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): ---------- {sampling_strategy} - {random_state} - - .. deprecated:: 0.4 - ``random_state`` is deprecated in 0.4 and will be removed in 0.6. 
- n_neighbors : int or object, optional (default=3) If ``int``, size of the neighbourhood to consider to compute the nearest neighbors. If object, an estimator that inherits from @@ -97,13 +89,11 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - random_state=None, n_neighbors=3, kind_sel='all', threshold_cleaning=0.5, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) - self.random_state = random_state self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.threshold_cleaning = threshold_cleaning @@ -111,11 +101,6 @@ def __init__(self, def _validate_estimator(self): """Create the objects required by NCR.""" - - # check for deprecated random_state - if self.random_state is not None: - deprecate_parameter(self, '0.4', 'random_state') - self.nn_ = check_neighbors_object( 'n_neighbors', self.n_neighbors, additional_neighbor=1) self.nn_.set_params(**{'n_jobs': self.n_jobs}) diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 9fcd56b2f..bd058f3db 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -16,8 +16,7 @@ @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) class TomekLinks(BaseCleaningSampler): """Class to perform under-sampling by removing Tomek's links. @@ -27,11 +26,6 @@ class TomekLinks(BaseCleaningSampler): ---------- {sampling_strategy} - {random_state} - - .. deprecated:: 0.4 - ``random_state`` is deprecated in 0.4 and will be removed in 0.6. - n_jobs : int, optional (default=1) The number of threads to open if possible. 
@@ -75,10 +69,8 @@ class TomekLinks(BaseCleaningSampler): def __init__(self, sampling_strategy='auto', - random_state=None, n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) - self.random_state = random_state self.n_jobs = n_jobs @staticmethod @@ -124,10 +116,6 @@ def is_tomek(y, nn_index, class_type): return links def _fit_resample(self, X, y): - # check for deprecated random_state - if self.random_state is not None: - deprecate_parameter(self, '0.4', 'random_state') - # Find the nearest neighbour of every point nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) nn.fit(X) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 6f2bed47d..9d19be3e3 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -32,11 +32,6 @@ from imblearn.over_sampling import SMOTE from imblearn.under_sampling import NearMiss, ClusterCentroids -# FIXME: remove in 0.6 -DONT_HAVE_RANDOM_STATE = ('NearMiss', 'EditedNearestNeighbours', - 'RepeatedEditedNearestNeighbours', 'AllKNN', - 'NeighbourhoodCleaningRule', 'TomekLinks') - def _yield_sampler_checks(name, Estimator): yield check_target_type @@ -96,9 +91,7 @@ def check_target_type(name, Estimator): X = np.random.random((20, 2)) y = np.linspace(0, 1, 20) estimator = Estimator() - # FIXME: in 0.6 set the random_state for all - if name not in DONT_HAVE_RANDOM_STATE: - set_random_state(estimator) + set_random_state(estimator) with pytest.raises(ValueError, match="Unknown label type: 'continuous'"): estimator.fit_resample(X, y) # if the target is multilabel then we should raise an error @@ -225,9 +218,7 @@ def check_samplers_sparse(name, Sampler): samplers = [Sampler()] for sampler in samplers: - # FIXME: in 0.6 set the random_state for all - if name not in DONT_HAVE_RANDOM_STATE: - set_random_state(sampler) + set_random_state(sampler) X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y) X_res, y_res = sampler.fit_resample(X, y) if not 
isinstance(sampler, BaseEnsembleSampler): @@ -262,9 +253,7 @@ def check_samplers_pandas(name, Sampler): samplers = [Sampler()] for sampler in samplers: - # FIXME: in 0.6 set the random_state for all - if name not in DONT_HAVE_RANDOM_STATE: - set_random_state(sampler) + set_random_state(sampler) X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y) X_res, y_res = sampler.fit_resample(X, y) assert_allclose(X_res_pd, X_res) @@ -277,9 +266,7 @@ def check_samplers_multiclass_ova(name, Sampler): weights=[0.2, 0.3, 0.5], random_state=0) y_ova = label_binarize(y, np.unique(y)) sampler = Sampler() - # FIXME: in 0.6 set the random_state for all - if name not in DONT_HAVE_RANDOM_STATE: - set_random_state(sampler) + set_random_state(sampler) X_res, y_res = sampler.fit_resample(X, y) X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova) assert_allclose(X_res, X_res_ova) @@ -299,9 +286,7 @@ def check_samplers_preserve_dtype(name, Sampler): X = X.astype(np.float32) y = y.astype(np.int32) sampler = Sampler() - # FIXME: in 0.6 set the random_state for all - if name not in DONT_HAVE_RANDOM_STATE: - set_random_state(sampler) + set_random_state(sampler) X_res, y_res = sampler.fit_resample(X, y) assert X.dtype == X_res.dtype, "X dtype is not preserved" assert y.dtype == y_res.dtype, "y dtype is not preserved" From 449c3b1e2bdb023fe89ec885211e90c4ed2f327e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 17:17:30 +0100 Subject: [PATCH 21/37] iter --- .../_prototype_selection/tests/test_allknn.py | 7 ------- .../tests/test_edited_nearest_neighbours.py | 7 ------- .../_prototype_selection/tests/test_nearmiss.py | 7 ------- .../tests/test_neighbourhood_cleaning_rule.py | 7 ------- .../test_repeated_edited_nearest_neighbours.py | 7 ------- .../tests/test_tomek_links.py | 7 ------- imblearn/utils/estimator_checks.py | 15 ++------------- 7 files changed, 2 insertions(+), 55 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py 
b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index 94e2c9f95..fef20b712 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -164,10 +164,3 @@ def test_alknn_not_good_object(): allknn = AllKNN(n_neighbors=nn, kind_sel='mode') with pytest.raises(ValueError): allknn.fit_resample(X, Y) - - -def test_deprecation_random_state(): - allknn = AllKNN(random_state=0) - with warns( - DeprecationWarning, match="'random_state' is deprecated from 0.4"): - allknn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index f27a2dfa5..87caf08bb 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -90,10 +90,3 @@ def test_enn_not_good_object(): enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') with pytest.raises(ValueError, match="has to be one of"): enn.fit_resample(X, Y) - - -def test_deprecation_random_state(): - enn = EditedNearestNeighbours(random_state=0) - with warns( - DeprecationWarning, match="'random_state' is deprecated from 0.4"): - enn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py index e16a104a3..f91f2f522 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py @@ -138,10 +138,3 @@ def test_nm_fit_resample_nn_obj(): X_resampled, y_resampled = nm.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) - - -def test_deprecation_random_state(): - nm = 
NearMiss(random_state=0) - with warns( - DeprecationWarning, match="'random_state' is deprecated from 0.4"): - nm.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py index 5d60556ce..a78d92192 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -64,10 +64,3 @@ def test_ncr_fit_resample_mode(): y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) - - -def test_deprecation_random_state(): - ncr = NeighbourhoodCleaningRule(random_state=0) - with warns( - DeprecationWarning, match="'random_state' is deprecated from 0.4"): - ncr.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index 0371942a4..814dca775 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -160,10 +160,3 @@ def test_renn_not_good_object(): renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') with pytest.raises(ValueError): renn.fit_resample(X, Y) - - -def test_deprecation_random_state(): - renn = RepeatedEditedNearestNeighbours(random_state=0) - with warns( - DeprecationWarning, match="'random_state' is deprecated from 0.4"): - renn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py index 7118862b8..78e287c74 100644 --- 
a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py @@ -48,10 +48,3 @@ def test_tl_fit_resample(): y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) - - -def test_deprecation_random_state(): - tl = TomekLinks(random_state=0) - with warns( - DeprecationWarning, match="'random_state' is deprecated from 0.4"): - tl.fit_resample(X, Y) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 9d19be3e3..897e7f5e1 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -199,12 +199,7 @@ def check_samplers_sparse(name, Sampler): X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0) X_sparse = sparse.csr_matrix(X) - if isinstance(Sampler(), SMOTE): - samplers = [ - Sampler(random_state=0, kind=kind) - for kind in ('regular', 'borderline1', 'borderline2', 'svm') - ] - elif isinstance(Sampler(), NearMiss): + if isinstance(Sampler(), NearMiss): samplers = [Sampler(version=version) for version in (1, 2, 3)] elif isinstance(Sampler(), ClusterCentroids): # set KMeans to full since it support sparse and dense @@ -240,13 +235,7 @@ def check_samplers_pandas(name, Sampler): weights=[0.2, 0.3, 0.5], random_state=0) X_pd = pd.DataFrame(X) sampler = Sampler() - if isinstance(Sampler(), SMOTE): - samplers = [ - Sampler(random_state=0, kind=kind) - for kind in ('regular', 'borderline1', 'borderline2', 'svm') - ] - - elif isinstance(Sampler(), NearMiss): + if isinstance(Sampler(), NearMiss): samplers = [Sampler(version=version) for version in (1, 2, 3)] else: From 78a43b77f27d4f281758c7ea322409cc60741ae1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 17:21:07 +0100 Subject: [PATCH 22/37] black --- imblearn/__init__.py | 17 +- imblearn/_version.py 
| 2 +- imblearn/base.py | 20 +- imblearn/combine/__init__.py | 2 +- imblearn/combine/_smote_enn.py | 43 +- imblearn/combine/_smote_tomek.py | 40 +- imblearn/combine/tests/test_smote_enn.py | 125 +++-- imblearn/combine/tests/test_smote_tomek.py | 163 ++++--- imblearn/datasets/__init__.py | 2 +- imblearn/datasets/_imbalance.py | 32 +- imblearn/datasets/_zenodo.py | 80 +++- imblearn/datasets/tests/test_imbalance.py | 14 +- imblearn/datasets/tests/test_zenodo.py | 84 ++-- imblearn/ensemble/__init__.py | 8 +- imblearn/ensemble/_bagging.py | 63 ++- imblearn/ensemble/_easy_ensemble.py | 48 +- imblearn/ensemble/_forest.py | 164 ++++--- imblearn/ensemble/_weight_boosting.py | 110 +++-- imblearn/ensemble/base.py | 8 +- imblearn/ensemble/tests/test_bagging.py | 255 +++++----- imblearn/ensemble/tests/test_easy_ensemble.py | 168 ++++--- imblearn/ensemble/tests/test_forest.py | 69 ++- .../ensemble/tests/test_weight_boosting.py | 55 ++- imblearn/exceptions.py | 7 +- imblearn/keras/__init__.py | 3 +- imblearn/keras/_generator.py | 78 ++- imblearn/keras/tests/test_generator.py | 92 ++-- imblearn/metrics/__init__.py | 9 +- imblearn/metrics/_classification.py | 308 +++++++----- imblearn/metrics/tests/test_classification.py | 246 ++++++---- imblearn/metrics/tests/test_score_objects.py | 54 ++- imblearn/over_sampling/__init__.py | 11 +- imblearn/over_sampling/_adasyn.py | 95 ++-- .../over_sampling/_random_over_sampler.py | 25 +- imblearn/over_sampling/_smote.py | 446 +++++++++++------- imblearn/over_sampling/base.py | 5 +- imblearn/over_sampling/tests/test_adasyn.py | 196 ++++++-- .../tests/test_borderline_smote.py | 47 +- .../over_sampling/tests/test_kmeans_smote.py | 89 ++-- .../tests/test_random_over_sampler.py | 78 ++- imblearn/over_sampling/tests/test_smote.py | 228 ++++++--- imblearn/over_sampling/tests/test_smote_nc.py | 54 ++- .../over_sampling/tests/test_svm_smote.py | 44 +- imblearn/pipeline.py | 105 +++-- imblearn/tensorflow/__init__.py | 2 +- 
imblearn/tensorflow/_generator.py | 45 +- imblearn/tensorflow/tests/test_generator.py | 35 +- imblearn/tests/test_base.py | 32 +- imblearn/tests/test_common.py | 23 +- imblearn/tests/test_exceptions.py | 2 +- imblearn/tests/test_pipeline.py | 426 ++++++++++------- imblearn/under_sampling/__init__.py | 15 +- .../_prototype_generation/__init__.py | 2 +- .../_cluster_centroids.py | 52 +- .../tests/test_cluster_centroids.py | 48 +- .../_prototype_selection/__init__.py | 14 +- .../_condensed_nearest_neighbour.py | 63 ++- .../_edited_nearest_neighbours.py | 137 +++--- .../_instance_hardness_threshold.py | 61 ++- .../_prototype_selection/_nearmiss.py | 113 +++-- .../_neighbourhood_cleaning_rule.py | 56 ++- .../_one_sided_selection.py | 48 +- .../_random_under_sampler.py | 34 +- .../_prototype_selection/_tomek_links.py | 15 +- .../_prototype_selection/tests/test_allknn.py | 399 ++++++++++++---- .../tests/test_condensed_nearest_neighbour.py | 79 +++- .../tests/test_edited_nearest_neighbours.py | 114 +++-- .../tests/test_instance_hardness_threshold.py | 37 +- .../tests/test_nearmiss.py | 228 ++++++--- .../tests/test_neighbourhood_cleaning_rule.py | 78 ++- .../tests/test_one_sided_selection.py | 76 ++- .../tests/test_random_under_sampler.py | 58 ++- ...test_repeated_edited_nearest_neighbours.py | 411 +++++++++++----- .../tests/test_tomek_links.py | 68 ++- imblearn/under_sampling/base.py | 12 +- imblearn/utils/__init__.py | 8 +- imblearn/utils/_docstring.py | 5 +- imblearn/utils/_validation.py | 336 ++++++++----- imblearn/utils/deprecation.py | 36 +- imblearn/utils/estimator_checks.py | 122 +++-- imblearn/utils/testing.py | 38 +- imblearn/utils/tests/test_deprecation.py | 8 +- imblearn/utils/tests/test_docstring.py | 13 +- imblearn/utils/tests/test_estimator_checks.py | 25 +- imblearn/utils/tests/test_testing.py | 18 +- imblearn/utils/tests/test_validation.py | 325 +++++++------ 86 files changed, 4817 insertions(+), 2692 deletions(-) diff --git a/imblearn/__init__.py 
b/imblearn/__init__.py index 8245d2f1e..634a34dd7 100644 --- a/imblearn/__init__.py +++ b/imblearn/__init__.py @@ -46,6 +46,17 @@ from ._version import __version__ from .utils._show_versions import show_versions -__all__ = ['combine', 'ensemble', 'exceptions', 'keras', 'metrics', - 'over_sampling', 'tensorflow', 'under_sampling', - 'utils', 'pipeline', 'FunctionSampler', '__version__'] +__all__ = [ + "combine", + "ensemble", + "exceptions", + "keras", + "metrics", + "over_sampling", + "tensorflow", + "under_sampling", + "utils", + "pipeline", + "FunctionSampler", + "__version__", +] diff --git a/imblearn/_version.py b/imblearn/_version.py index b45d2f5c1..2f1dcf358 100644 --- a/imblearn/_version.py +++ b/imblearn/_version.py @@ -22,4 +22,4 @@ # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.6.0.dev0' +__version__ = "0.6.0.dev0" diff --git a/imblearn/base.py b/imblearn/base.py index dbf4ae5fd..091e5e8e0 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -25,7 +25,7 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta): instead. """ - _estimator_type = 'sampler' + _estimator_type = "sampler" def fit(self, X, y): """Check inputs and statistics of the sampler. @@ -48,7 +48,8 @@ def fit(self, X, y): """ X, y, _ = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, self._sampling_type) + self.sampling_strategy, y, self._sampling_type + ) return self def fit_resample(self, X, y): @@ -76,7 +77,8 @@ def fit_resample(self, X, y): X, y, binarize_y = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, self._sampling_type) + self.sampling_strategy, y, self._sampling_type + ) output = self._fit_resample(X, y) @@ -123,15 +125,16 @@ class BaseSampler(SamplerMixin): instead. 
""" - def __init__(self, sampling_strategy='auto'): + def __init__(self, sampling_strategy="auto"): self.sampling_strategy = sampling_strategy @staticmethod def _check_X_y(X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = check_X_y(X, y, accept_sparse=["csr", "csc"]) return X, y, binarize_y + def _identity(X, y): return X, y @@ -199,7 +202,7 @@ class FunctionSampler(BaseSampler): """ - _sampling_type = 'bypass' + _sampling_type = "bypass" def __init__(self, func=None, accept_sparse=True, kw_args=None): super().__init__() @@ -208,8 +211,9 @@ def __init__(self, func=None, accept_sparse=True, kw_args=None): self.kw_args = kw_args def _fit_resample(self, X, y): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'] - if self.accept_sparse else False) + X, y = check_X_y( + X, y, accept_sparse=["csr", "csc"] if self.accept_sparse else False + ) func = _identity if self.func is None else self.func output = func(X, y, **(self.kw_args if self.kw_args else {})) return output diff --git a/imblearn/combine/__init__.py b/imblearn/combine/__init__.py index f203bd6f7..a0833f996 100644 --- a/imblearn/combine/__init__.py +++ b/imblearn/combine/__init__.py @@ -5,4 +5,4 @@ from ._smote_enn import SMOTEENN from ._smote_tomek import SMOTETomek -__all__ = ['SMOTEENN', 'SMOTETomek'] +__all__ = ["SMOTEENN", "SMOTETomek"] diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py index 401edd7ee..10d423ddd 100644 --- a/imblearn/combine/_smote_enn.py +++ b/imblearn/combine/_smote_enn.py @@ -18,7 +18,8 @@ @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class SMOTEENN(BaseSampler): """Class to perform over-sampling using SMOTE and cleaning using ENN. 
@@ -83,14 +84,17 @@ class SMOTEENN(BaseSampler): Resampled dataset shape Counter({{0: 900, 1: 881}}) """ - _sampling_type = 'over-sampling' - - def __init__(self, - sampling_strategy='auto', - random_state=None, - smote=None, - enn=None, - n_jobs=1): + + _sampling_type = "over-sampling" + + def __init__( + self, + sampling_strategy="auto", + random_state=None, + smote=None, + enn=None, + n_jobs=1, + ): super().__init__() self.sampling_strategy = sampling_strategy self.random_state = random_state @@ -104,31 +108,36 @@ def _validate_estimator(self): if isinstance(self.smote, SMOTE): self.smote_ = clone(self.smote) else: - raise ValueError('smote needs to be a SMOTE object.' - 'Got {} instead.'.format(type(self.smote))) + raise ValueError( + "smote needs to be a SMOTE object." + "Got {} instead.".format(type(self.smote)) + ) # Otherwise create a default SMOTE else: self.smote_ = SMOTE( sampling_strategy=self.sampling_strategy, random_state=self.random_state, - n_jobs=self.n_jobs) + n_jobs=self.n_jobs, + ) if self.enn is not None: if isinstance(self.enn, EditedNearestNeighbours): self.enn_ = clone(self.enn) else: - raise ValueError('enn needs to be an EditedNearestNeighbours.' - ' Got {} instead.'.format(type(self.enn))) + raise ValueError( + "enn needs to be an EditedNearestNeighbours." 
+ " Got {} instead.".format(type(self.enn)) + ) # Otherwise create a default EditedNearestNeighbours else: self.enn_ = EditedNearestNeighbours( - sampling_strategy='all', - n_jobs=self.n_jobs) + sampling_strategy="all", n_jobs=self.n_jobs + ) def _fit_resample(self, X, y): self._validate_estimator() y = check_target_type(y) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = check_X_y(X, y, accept_sparse=["csr", "csc"]) self.sampling_strategy_ = self.sampling_strategy X_res, y_res = self.smote_.fit_resample(X, y) diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py index c9226f0a4..af99d002d 100644 --- a/imblearn/combine/_smote_tomek.py +++ b/imblearn/combine/_smote_tomek.py @@ -19,7 +19,8 @@ @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class SMOTETomek(BaseSampler): """Class to perform over-sampling using SMOTE and cleaning using Tomek links. @@ -84,14 +85,16 @@ class SMOTETomek(BaseSampler): """ - _sampling_type = 'over-sampling' + _sampling_type = "over-sampling" - def __init__(self, - sampling_strategy='auto', - random_state=None, - smote=None, - tomek=None, - n_jobs=1): + def __init__( + self, + sampling_strategy="auto", + random_state=None, + smote=None, + tomek=None, + n_jobs=1, + ): super().__init__() self.sampling_strategy = sampling_strategy self.random_state = random_state @@ -106,31 +109,36 @@ def _validate_estimator(self): if isinstance(self.smote, SMOTE): self.smote_ = clone(self.smote) else: - raise ValueError('smote needs to be a SMOTE object.' - 'Got {} instead.'.format(type(self.smote))) + raise ValueError( + "smote needs to be a SMOTE object." 
+ "Got {} instead.".format(type(self.smote)) + ) # Otherwise create a default SMOTE else: self.smote_ = SMOTE( sampling_strategy=self.sampling_strategy, random_state=self.random_state, - n_jobs=self.n_jobs) + n_jobs=self.n_jobs, + ) if self.tomek is not None: if isinstance(self.tomek, TomekLinks): self.tomek_ = clone(self.tomek) else: - raise ValueError('tomek needs to be a TomekLinks object.' - 'Got {} instead.'.format(type(self.tomek))) + raise ValueError( + "tomek needs to be a TomekLinks object." + "Got {} instead.".format(type(self.tomek)) + ) # Otherwise create a default TomekLinks else: self.tomek_ = TomekLinks( - sampling_strategy='all', - n_jobs=self.n_jobs) + sampling_strategy="all", n_jobs=self.n_jobs + ) def _fit_resample(self, X, y): self._validate_estimator() y = check_target_type(y) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = check_X_y(X, y, accept_sparse=["csr", "csc"]) self.sampling_strategy_ = self.sampling_strategy X_res, y_res = self.smote_.fit_resample(X, y) diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py index ce1f98db2..8d83a7459 100644 --- a/imblearn/combine/tests/test_smote_enn.py +++ b/imblearn/combine/tests/test_smote_enn.py @@ -13,19 +13,30 @@ from imblearn.over_sampling import SMOTE RND_SEED = 0 -X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 -], [0.53366841, -0.30312976], [1.52091956, - -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, - 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], - [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ - -0.41635887, -0.38299653 - ], [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) +X = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + 
[0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + ] +) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 @@ -34,10 +45,17 @@ def test_sample_regular(): smote = SMOTEENN(random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ - 0.61319159, -0.11571667 - ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) + X_gt = np.array( + [ + [1.52091956, -0.49283504], + [0.84976473, -0.15570176], + [0.61319159, -0.11571667], + [0.66052536, -0.28246518], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.08711622, 0.93259929], + ] + ) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -45,15 +63,23 @@ def test_sample_regular(): def test_sample_regular_pass_smote_enn(): smote = SMOTEENN( - smote=SMOTE(sampling_strategy='auto', random_state=RND_SEED), - enn=EditedNearestNeighbours(sampling_strategy='all'), - random_state=RND_SEED) + smote=SMOTE(sampling_strategy="auto", random_state=RND_SEED), + enn=EditedNearestNeighbours(sampling_strategy="all"), + random_state=RND_SEED, + ) X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ - 0.61319159, -0.11571667 - ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) + X_gt = np.array( + [ + [1.52091956, 
-0.49283504], + [0.84976473, -0.15570176], + [0.61319159, -0.11571667], + [0.66052536, -0.28246518], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.08711622, 0.93259929], + ] + ) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -62,11 +88,18 @@ def test_sample_regular_pass_smote_enn(): def test_sample_regular_half(): sampling_strategy = {0: 10, 1: 12} smote = SMOTEENN( - sampling_strategy=sampling_strategy, random_state=RND_SEED) + sampling_strategy=sampling_strategy, random_state=RND_SEED + ) X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) + X_gt = np.array( + [ + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.08711622, 0.93259929], + ] + ) y_gt = np.array([0, 1, 1, 1]) assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -74,13 +107,20 @@ def test_sample_regular_half(): def test_validate_estimator_init(): smote = SMOTE(random_state=RND_SEED) - enn = EditedNearestNeighbours(sampling_strategy='all') + enn = EditedNearestNeighbours(sampling_strategy="all") smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ - 0.61319159, -0.11571667 - ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) + X_gt = np.array( + [ + [1.52091956, -0.49283504], + [0.84976473, -0.15570176], + [0.61319159, -0.11571667], + [0.66052536, -0.28246518], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.08711622, 0.93259929], + ] + ) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -89,10 +129,17 @@ def 
test_validate_estimator_init(): def test_validate_estimator_default(): smt = SMOTEENN(random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ - 0.61319159, -0.11571667 - ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) + X_gt = np.array( + [ + [1.52091956, -0.49283504], + [0.84976473, -0.15570176], + [0.61319159, -0.11571667], + [0.66052536, -0.28246518], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.08711622, 0.93259929], + ] + ) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -116,8 +163,10 @@ def test_parallelisation(): @pytest.mark.parametrize( "smote_params, err_msg", - [({'smote': 'rnd'}, "smote needs to be a SMOTE"), - ({'enn': 'rnd'}, "enn needs to be an ")] + [ + ({"smote": "rnd"}, "smote needs to be a SMOTE"), + ({"enn": "rnd"}, "enn needs to be an "), + ], ) def test_error_wrong_object(smote_params, err_msg): smt = SMOTEENN(**smote_params) diff --git a/imblearn/combine/tests/test_smote_tomek.py b/imblearn/combine/tests/test_smote_tomek.py index 3a58754bc..4d3123855 100644 --- a/imblearn/combine/tests/test_smote_tomek.py +++ b/imblearn/combine/tests/test_smote_tomek.py @@ -13,19 +13,30 @@ from imblearn.under_sampling import TomekLinks RND_SEED = 0 -X = np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141], [ - 1.34192108, -0.13367336 -], [0.62366841, -0.21312976], [1.61091956, - -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, - 1.63827342], [0.2184254, 0.24299982], [0.61472253, -0.82309052], - [0.19893132, -0.47761769], [1.06514042, -0.0770537], [ - 0.97407872, 0.44454207 - ], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [ - -0.27410027, -0.54194484 - ], [0.8381014, 0.44085498], [-0.23374509, 0.18370049], [ - -0.32635887, -0.29299653 - ], [-0.00288378, 0.84259929], [1.79580611, 
-0.02219234]]) +X = np.array( + [ + [0.20622591, 0.0582794], + [0.68481731, 0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.2184254, 0.24299982], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.06514042, -0.0770537], + [0.97407872, 0.44454207], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.27410027, -0.54194484], + [0.8381014, 0.44085498], + [-0.23374509, 0.18370049], + [-0.32635887, -0.29299653], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + ] +) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 @@ -33,16 +44,26 @@ def test_sample_regular(): smote = SMOTETomek(random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ - 0.62366841, -0.21312976 - ], [1.61091956, -0.40283504], [-0.37162401, - -2.19400981], [0.74680821, 1.63827342], - [0.61472253, -0.82309052], [0.19893132, -0.47761769], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [ - -0.23374509, 0.18370049 - ], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [ - 0.38307743, -0.05670439 - ], [0.70319159, -0.02571667], [0.75052536, -0.19246518]]) + X_gt = np.array( + [ + [0.68481731, 0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + [0.38307743, -0.05670439], + [0.70319159, -0.02571667], + [0.75052536, -0.19246518], + ] + ) y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -51,16 +72,26 @@ def 
test_sample_regular(): def test_sample_regular_half(): sampling_strategy = {0: 9, 1: 12} smote = SMOTETomek( - sampling_strategy=sampling_strategy, random_state=RND_SEED) + sampling_strategy=sampling_strategy, random_state=RND_SEED + ) X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.68481731, 0.51935141], [0.62366841, -0.21312976], [ - 1.61091956, -0.40283504 - ], [-0.37162401, -2.19400981], [0.74680821, - 1.63827342], [0.61472253, -0.82309052], - [0.19893132, -0.47761769], [1.40301027, -0.83648734], - [-1.20515198, -1.02689695], [-0.23374509, 0.18370049], [ - -0.00288378, 0.84259929 - ], [1.79580611, -0.02219234], [0.45784496, -0.1053161]]) + X_gt = np.array( + [ + [0.68481731, 0.51935141], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + [0.45784496, -0.1053161], + ] + ) y_gt = np.array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -68,19 +99,29 @@ def test_sample_regular_half(): def test_validate_estimator_init(): smote = SMOTE(random_state=RND_SEED) - tomek = TomekLinks(sampling_strategy='all') + tomek = TomekLinks(sampling_strategy="all") smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) - X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ - 0.62366841, -0.21312976 - ], [1.61091956, -0.40283504], [-0.37162401, - -2.19400981], [0.74680821, 1.63827342], - [0.61472253, -0.82309052], [0.19893132, -0.47761769], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [ - -0.23374509, 0.18370049 - ], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [ - 0.38307743, -0.05670439 - ], [0.70319159, 
-0.02571667], [0.75052536, -0.19246518]]) + X_gt = np.array( + [ + [0.68481731, 0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + [0.38307743, -0.05670439], + [0.70319159, -0.02571667], + [0.75052536, -0.19246518], + ] + ) y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -89,16 +130,26 @@ def test_validate_estimator_init(): def test_validate_estimator_default(): smt = SMOTETomek(random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) - X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ - 0.62366841, -0.21312976 - ], [1.61091956, -0.40283504], [-0.37162401, - -2.19400981], [0.74680821, 1.63827342], - [0.61472253, -0.82309052], [0.19893132, -0.47761769], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [ - -0.23374509, 0.18370049 - ], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [ - 0.38307743, -0.05670439 - ], [0.70319159, -0.02571667], [0.75052536, -0.19246518]]) + X_gt = np.array( + [ + [0.68481731, 0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + [0.38307743, -0.05670439], + [0.70319159, -0.02571667], + [0.75052536, -0.19246518], + ] + ) y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ 
-122,8 +173,10 @@ def test_parallelisation(): @pytest.mark.parametrize( "smote_params, err_msg", - [({'smote': 'rnd'}, "smote needs to be a SMOTE"), - ({'tomek': 'rnd'}, "tomek needs to be a TomekLinks")] + [ + ({"smote": "rnd"}, "smote needs to be a SMOTE"), + ({"tomek": "rnd"}, "tomek needs to be a TomekLinks"), + ], ) def test_error_wrong_object(smote_params, err_msg): smt = SMOTETomek(**smote_params) diff --git a/imblearn/datasets/__init__.py b/imblearn/datasets/__init__.py index 88ca39e82..db7ec5405 100644 --- a/imblearn/datasets/__init__.py +++ b/imblearn/datasets/__init__.py @@ -7,4 +7,4 @@ from ._zenodo import fetch_datasets -__all__ = ['make_imbalance', 'fetch_datasets'] +__all__ = ["make_imbalance", "fetch_datasets"] diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py index f786f1a0e..d483aec69 100644 --- a/imblearn/datasets/_imbalance.py +++ b/imblearn/datasets/_imbalance.py @@ -14,12 +14,9 @@ from ..utils import check_sampling_strategy -def make_imbalance(X, - y, - sampling_strategy=None, - random_state=None, - verbose=False, - **kwargs): +def make_imbalance( + X, y, sampling_strategy=None, random_state=None, verbose=False, **kwargs +): """Turns a dataset into an imbalanced dataset with a specific sampling strategy. @@ -97,21 +94,28 @@ def make_imbalance(X, # restrict ratio to be a dict or a callable if isinstance(sampling_strategy, dict) or callable(sampling_strategy): sampling_strategy_ = check_sampling_strategy( - sampling_strategy, y, 'under-sampling', **kwargs) + sampling_strategy, y, "under-sampling", **kwargs + ) else: - raise ValueError("'sampling_strategy' has to be a dictionary or a " - "function returning a dictionary. Got {} instead." - .format(type(sampling_strategy))) + raise ValueError( + "'sampling_strategy' has to be a dictionary or a " + "function returning a dictionary. 
Got {} instead.".format( + type(sampling_strategy) + ) + ) if verbose: - print('The original target distribution in the dataset is: %s', - target_stats) + print( + "The original target distribution in the dataset is: %s", + target_stats, + ) rus = RandomUnderSampler( sampling_strategy=sampling_strategy_, replacement=False, - random_state=random_state) + random_state=random_state, + ) X_resampled, y_resampled = rus.fit_resample(X, y) if verbose: - print('Make the dataset imbalanced: %s', Counter(y_resampled)) + print("Make the dataset imbalanced: %s", Counter(y_resampled)) return X_resampled, y_resampled diff --git a/imblearn/datasets/_zenodo.py b/imblearn/datasets/_zenodo.py index a51054d13..bc5bcedb1 100644 --- a/imblearn/datasets/_zenodo.py +++ b/imblearn/datasets/_zenodo.py @@ -57,18 +57,41 @@ from sklearn.datasets.base import Bunch from sklearn.utils import check_random_state -URL = ('https://zenodo.org/record/61452/files/' - 'benchmark-imbalanced-learn.tar.gz') -PRE_FILENAME = 'x' -POST_FILENAME = 'data.npz' +URL = ( + "https://zenodo.org/record/61452/files/" + "benchmark-imbalanced-learn.tar.gz" +) +PRE_FILENAME = "x" +POST_FILENAME = "data.npz" MAP_NAME_ID_KEYS = [ - 'ecoli', 'optical_digits', 'satimage', 'pen_digits', 'abalone', - 'sick_euthyroid', 'spectrometer', 'car_eval_34', 'isolet', 'us_crime', - 'yeast_ml8', 'scene', 'libras_move', 'thyroid_sick', 'coil_2000', - 'arrhythmia', 'solar_flare_m0', 'oil', 'car_eval_4', 'wine_quality', - 'letter_img', 'yeast_me2', 'webpage', 'ozone_level', 'mammography', - 'protein_homo', 'abalone_19' + "ecoli", + "optical_digits", + "satimage", + "pen_digits", + "abalone", + "sick_euthyroid", + "spectrometer", + "car_eval_34", + "isolet", + "us_crime", + "yeast_ml8", + "scene", + "libras_move", + "thyroid_sick", + "coil_2000", + "arrhythmia", + "solar_flare_m0", + "oil", + "car_eval_4", + "wine_quality", + "letter_img", + "yeast_me2", + "webpage", + "ozone_level", + "mammography", + "protein_homo", + "abalone_19", ] 
MAP_NAME_ID = OrderedDict() @@ -78,12 +101,14 @@ MAP_ID_NAME[v + 1] = k -def fetch_datasets(data_home=None, - filter_data=None, - download_if_missing=True, - random_state=None, - shuffle=False, - verbose=False): +def fetch_datasets( + data_home=None, + filter_data=None, + download_if_missing=True, + random_state=None, + shuffle=False, + verbose=False, +): """Load the benchmark datasets from Zenodo, downloading it if necessary. Parameters @@ -209,23 +234,28 @@ def fetch_datasets(data_home=None, for it in filter_data: if isinstance(it, str): if it not in list_data: - raise ValueError('{} is not a dataset available. ' - 'The available datasets are {}'.format( - it, list_data)) + raise ValueError( + "{} is not a dataset available. " + "The available datasets are {}".format(it, list_data) + ) else: filter_data_.append(it) elif isinstance(it, int): if it < 1 or it > 27: - raise ValueError('The dataset with the ID={} is not an ' - 'available dataset. The IDs are ' - '{}'.format(it, range(1, 28))) + raise ValueError( + "The dataset with the ID={} is not an " + "available dataset. The IDs are " + "{}".format(it, range(1, 28)) + ) else: # The index start at one, then we need to remove one # to not have issue with the indexing. filter_data_.append(MAP_ID_NAME[it]) else: - raise ValueError('The value in the tuple should be str or int.' - ' Got {} instead.'.format(type(it))) + raise ValueError( + "The value in the tuple should be str or int." 
+ " Got {} instead.".format(type(it)) + ) # go through the list and check if the data are available for it in filter_data_: @@ -244,7 +274,7 @@ def fetch_datasets(data_home=None, raise IOError("Data not found and `download_if_missing` is False") data = np.load(filename) - X, y = data['data'], data['label'] + X, y = data["data"], data["label"] if shuffle: ind = np.arange(X.shape[0]) diff --git a/imblearn/datasets/tests/test_imbalance.py b/imblearn/datasets/tests/test_imbalance.py index 5fc72f552..bfe3f9e4b 100644 --- a/imblearn/datasets/tests/test_imbalance.py +++ b/imblearn/datasets/tests/test_imbalance.py @@ -20,9 +20,11 @@ def iris(): @pytest.mark.parametrize( "sampling_strategy, err_msg", - [({0: -100, 1: 50, 2: 50}, "in a class cannot be negative"), - ({0: 10, 1: 70}, "should be less or equal to the original"), - ('random-string', "has to be a dictionary or a function")] + [ + ({0: -100, 1: 50, 2: 50}, "in a class cannot be negative"), + ({0: 10, 1: 70}, "should be less or equal to the original"), + ("random-string", "has to be a dictionary or a function"), + ], ) def test_make_imbalance_error(iris, sampling_strategy, err_msg): # we are reusing part of utils.check_sampling_strategy, however this is not @@ -41,8 +43,10 @@ def test_make_imbalance_error_single_class(iris): @pytest.mark.parametrize( "sampling_strategy, expected_counts", - [({0: 10, 1: 20, 2: 30}, {0: 10, 1: 20, 2: 30}), - ({0: 10, 1: 20}, {0: 10, 1: 20, 2: 50})] + [ + ({0: 10, 1: 20, 2: 30}, {0: 10, 1: 20, 2: 30}), + ({0: 10, 1: 20}, {0: 10, 1: 20, 2: 50}), + ], ) def test_make_imbalance_dict(iris, sampling_strategy, expected_counts): X, y = iris diff --git a/imblearn/datasets/tests/test_zenodo.py b/imblearn/datasets/tests/test_zenodo.py index 28d0a06cc..4ac184902 100644 --- a/imblearn/datasets/tests/test_zenodo.py +++ b/imblearn/datasets/tests/test_zenodo.py @@ -12,33 +12,33 @@ from sklearn.utils.testing import SkipTest DATASET_SHAPE = { - 'ecoli': (336, 7), - 'optical_digits': (5620, 64), - 
'satimage': (6435, 36), - 'pen_digits': (10992, 16), - 'abalone': (4177, 10), - 'sick_euthyroid': (3163, 42), - 'spectrometer': (531, 93), - 'car_eval_34': (1728, 21), - 'isolet': (7797, 617), - 'us_crime': (1994, 100), - 'yeast_ml8': (2417, 103), - 'scene': (2407, 294), - 'libras_move': (360, 90), - 'thyroid_sick': (3772, 52), - 'coil_2000': (9822, 85), - 'arrhythmia': (452, 278), - 'solar_flare_m0': (1389, 32), - 'oil': (937, 49), - 'car_eval_4': (1728, 21), - 'wine_quality': (4898, 11), - 'letter_img': (20000, 16), - 'yeast_me2': (1484, 8), - 'webpage': (34780, 300), - 'ozone_level': (2536, 72), - 'mammography': (11183, 6), - 'protein_homo': (145751, 74), - 'abalone_19': (4177, 10) + "ecoli": (336, 7), + "optical_digits": (5620, 64), + "satimage": (6435, 36), + "pen_digits": (10992, 16), + "abalone": (4177, 10), + "sick_euthyroid": (3163, 42), + "spectrometer": (531, 93), + "car_eval_34": (1728, 21), + "isolet": (7797, 617), + "us_crime": (1994, 100), + "yeast_ml8": (2417, 103), + "scene": (2407, 294), + "libras_move": (360, 90), + "thyroid_sick": (3772, 52), + "coil_2000": (9822, 85), + "arrhythmia": (452, 278), + "solar_flare_m0": (1389, 32), + "oil": (937, 49), + "car_eval_4": (1728, 21), + "wine_quality": (4898, 11), + "letter_img": (20000, 16), + "yeast_me2": (1484, 8), + "webpage": (34780, 300), + "ozone_level": (2536, 72), + "mammography": (11183, 6), + "protein_homo": (145751, 74), + "abalone_19": (4177, 10), } @@ -61,37 +61,41 @@ def test_fetch(): assert X1.shape == X2.shape y1, y2 = datasets1[k].target, datasets2[k].target - assert (X1.shape[0], ) == y1.shape - assert (X1.shape[0], ) == y2.shape + assert (X1.shape[0],) == y1.shape + assert (X1.shape[0],) == y2.shape def test_fetch_filter(): try: datasets1 = fetch( - filter_data=tuple([1]), shuffle=True, random_state=42) + filter_data=tuple([1]), shuffle=True, random_state=42 + ) except IOError: raise SkipTest("Zenodo dataset can not be loaded.") datasets2 = fetch( - filter_data=tuple(['ecoli']), 
shuffle=True, random_state=37) + filter_data=tuple(["ecoli"]), shuffle=True, random_state=37 + ) - X1, X2 = datasets1['ecoli'].data, datasets2['ecoli'].data - assert DATASET_SHAPE['ecoli'] == X1.shape + X1, X2 = datasets1["ecoli"].data, datasets2["ecoli"].data + assert DATASET_SHAPE["ecoli"] == X1.shape assert X1.shape == X2.shape assert X1.sum() == pytest.approx(X2.sum()) - y1, y2 = datasets1['ecoli'].target, datasets2['ecoli'].target - assert (X1.shape[0], ) == y1.shape - assert (X1.shape[0], ) == y2.shape + y1, y2 = datasets1["ecoli"].target, datasets2["ecoli"].target + assert (X1.shape[0],) == y1.shape + assert (X1.shape[0],) == y2.shape @pytest.mark.parametrize( "filter_data, err_msg", - [(('rnf',), "is not a dataset available"), - ((-1,), "dataset with the ID="), - ((100,), "dataset with the ID="), - ((1.00,), "value in the tuple")] + [ + (("rnf",), "is not a dataset available"), + ((-1,), "dataset with the ID="), + ((100,), "dataset with the ID="), + ((1.00,), "value in the tuple"), + ], ) def test_fetch_error(filter_data, err_msg): with pytest.raises(ValueError, match=err_msg): diff --git a/imblearn/ensemble/__init__.py b/imblearn/ensemble/__init__.py index 3dac8cee8..fdda4e155 100644 --- a/imblearn/ensemble/__init__.py +++ b/imblearn/ensemble/__init__.py @@ -9,8 +9,8 @@ from ._weight_boosting import RUSBoostClassifier __all__ = [ - 'BalancedBaggingClassifier', - 'BalancedRandomForestClassifier', - 'EasyEnsembleClassifier', - 'RUSBoostClassifier', + "BalancedBaggingClassifier", + "BalancedRandomForestClassifier", + "EasyEnsembleClassifier", + "RUSBoostClassifier", ] diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index 557eefef7..e77ab892d 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -21,7 +21,8 @@ @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class 
BalancedBaggingClassifier(BaggingClassifier): """A Bagging classifier with additional balancing. @@ -165,20 +166,22 @@ class BalancedBaggingClassifier(BaggingClassifier): """ - def __init__(self, - base_estimator=None, - n_estimators=10, - max_samples=1.0, - max_features=1.0, - bootstrap=True, - bootstrap_features=False, - oob_score=False, - warm_start=False, - sampling_strategy='auto', - replacement=False, - n_jobs=1, - random_state=None, - verbose=0): + def __init__( + self, + base_estimator=None, + n_estimators=10, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + sampling_strategy="auto", + replacement=False, + n_jobs=1, + random_state=None, + verbose=0, + ): super().__init__( base_estimator, @@ -191,7 +194,8 @@ def __init__(self, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, - verbose=verbose) + verbose=verbose, + ) self.sampling_strategy = sampling_strategy self.replacement = replacement @@ -199,21 +203,34 @@ def _validate_estimator(self, default=DecisionTreeClassifier()): """Check the estimator and the n_estimator attribute, set the `base_estimator_` attribute.""" if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): - raise ValueError("n_estimators must be an integer, " - "got {}.".format(type(self.n_estimators))) + raise ValueError( + "n_estimators must be an integer, " + "got {}.".format(type(self.n_estimators)) + ) if self.n_estimators <= 0: - raise ValueError("n_estimators must be greater than zero, " - "got {}.".format(self.n_estimators)) + raise ValueError( + "n_estimators must be greater than zero, " + "got {}.".format(self.n_estimators) + ) if self.base_estimator is not None: base_estimator = clone(self.base_estimator) else: base_estimator = clone(default) - self.base_estimator_ = Pipeline([('sampler', RandomUnderSampler( - sampling_strategy=self.sampling_strategy, - replacement=self.replacement)), ('classifier', base_estimator)]) + 
self.base_estimator_ = Pipeline( + [ + ( + "sampler", + RandomUnderSampler( + sampling_strategy=self.sampling_strategy, + replacement=self.replacement, + ), + ), + ("classifier", base_estimator), + ] + ) def fit(self, X, y): """Build a Bagging ensemble of estimators from the training diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index 368d521fb..eb29d778d 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -23,7 +23,8 @@ @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class EasyEnsembleClassifier(BaggingClassifier): """Bag of balanced boosted learners also known as EasyEnsemble. @@ -117,9 +118,18 @@ class EasyEnsembleClassifier(BaggingClassifier): [ 2 225]] """ - def __init__(self, n_estimators=10, base_estimator=None, warm_start=False, - sampling_strategy='auto', replacement=False, n_jobs=1, - random_state=None, verbose=0): + + def __init__( + self, + n_estimators=10, + base_estimator=None, + warm_start=False, + sampling_strategy="auto", + replacement=False, + n_jobs=1, + random_state=None, + verbose=0, + ): super().__init__( base_estimator, n_estimators=n_estimators, @@ -131,7 +141,8 @@ def __init__(self, n_estimators=10, base_estimator=None, warm_start=False, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, - verbose=verbose) + verbose=verbose, + ) self.sampling_strategy = sampling_strategy self.replacement = replacement @@ -139,12 +150,16 @@ def _validate_estimator(self, default=AdaBoostClassifier()): """Check the estimator and the n_estimator attribute, set the `base_estimator_` attribute.""" if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): - raise ValueError("n_estimators must be an integer, " - "got {}.".format(type(self.n_estimators))) + raise ValueError( + "n_estimators must be an integer, " + "got 
{}.".format(type(self.n_estimators)) + ) if self.n_estimators <= 0: - raise ValueError("n_estimators must be greater than zero, " - "got {}.".format(self.n_estimators)) + raise ValueError( + "n_estimators must be greater than zero, " + "got {}.".format(self.n_estimators) + ) if self.base_estimator is not None: base_estimator = clone(self.base_estimator) @@ -152,10 +167,17 @@ def _validate_estimator(self, default=AdaBoostClassifier()): base_estimator = clone(default) self.base_estimator_ = Pipeline( - [('sampler', RandomUnderSampler( - sampling_strategy=self.sampling_strategy, - replacement=self.replacement)), - ('classifier', base_estimator)]) + [ + ( + "sampler", + RandomUnderSampler( + sampling_strategy=self.sampling_strategy, + replacement=self.replacement, + ), + ), + ("classifier", base_estimator), + ] + ) def fit(self, X, y): """Build a Bagging ensemble of AdaBoost classifier using balanced diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 2efaa4e9d..12f57e09a 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -33,23 +33,41 @@ MAX_INT = np.iinfo(np.int32).max -def _local_parallel_build_trees(sampler, tree, forest, X, y, sample_weight, - tree_idx, n_trees, verbose=0, - class_weight=None): +def _local_parallel_build_trees( + sampler, + tree, + forest, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, +): # resample before to fit the tree X_resampled, y_resampled = sampler.fit_resample(X, y) if sample_weight is not None: sample_weight = safe_indexing(sample_weight, sampler.sample_indices_) - tree = _parallel_build_trees(tree, forest, X_resampled, y_resampled, - sample_weight, tree_idx, n_trees, - verbose=verbose, class_weight=class_weight, - n_samples_bootstrap=X_resampled.shape[0]) + tree = _parallel_build_trees( + tree, + forest, + X_resampled, + y_resampled, + sample_weight, + tree_idx, + n_trees, + verbose=verbose, + class_weight=class_weight, + 
n_samples_bootstrap=X_resampled.shape[0], + ) return sampler, tree @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class BalancedRandomForestClassifier(RandomForestClassifier): """A balanced random forest classifier. @@ -242,25 +260,28 @@ class labels (multi-output problem). [1] """ - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=2, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=True, - oob_score=False, - sampling_strategy='auto', - replacement=False, - n_jobs=1, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None): + + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=2, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + sampling_strategy="auto", + replacement=False, + n_jobs=1, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ): super().__init__( criterion=criterion, max_depth=max_depth, @@ -277,7 +298,8 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease) + min_impurity_decrease=min_impurity_decrease, + ) self.sampling_strategy = sampling_strategy self.replacement = replacement @@ -286,12 +308,16 @@ def _validate_estimator(self, default=DecisionTreeClassifier()): """Check the estimator and the n_estimator attribute, set the `base_estimator_` attribute.""" if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): - raise ValueError("n_estimators must be an integer, " - "got {}.".format(type(self.n_estimators))) + raise ValueError( + "n_estimators must be an 
integer, " + "got {}.".format(type(self.n_estimators)) + ) if self.n_estimators <= 0: - raise ValueError("n_estimators must be greater than zero, " - "got {}.".format(self.n_estimators)) + raise ValueError( + "n_estimators must be greater than zero, " + "got {}.".format(self.n_estimators) + ) if self.base_estimator is not None: self.base_estimator_ = clone(self.base_estimator) @@ -300,7 +326,8 @@ def _validate_estimator(self, default=DecisionTreeClassifier()): self.base_sampler_ = RandomUnderSampler( sampling_strategy=self.sampling_strategy, - replacement=self.replacement) + replacement=self.replacement, + ) def _make_sampler_estimator(self, random_state=None): """Make and configure a copy of the `base_estimator_` attribute. @@ -308,8 +335,9 @@ def _make_sampler_estimator(self, random_state=None): sub-estimators. """ estimator = clone(self.base_estimator_) - estimator.set_params(**{p: getattr(self, p) - for p in self.estimator_params}) + estimator.set_params( + **{p: getattr(self, p) for p in self.estimator_params} + ) sampler = clone(self.base_sampler_) if random_state is not None: @@ -347,7 +375,7 @@ def fit(self, X, y, sample_weight=None): # Validate or convert input data X = check_array(X, accept_sparse="csc", dtype=DTYPE) - y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) + y = check_array(y, accept_sparse="csc", ensure_2d=False, dtype=None) if sample_weight is not None: sample_weight = check_array(sample_weight, ensure_2d=False) if issparse(X): @@ -360,10 +388,13 @@ def fit(self, X, y, sample_weight=None): y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: - warn("A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples,), for example using ravel().", - DataConversionWarning, stacklevel=2) + warn( + "A column-vector y was passed when a 1d array was" + " expected. 
Please change the shape of y to " + "(n_samples,), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs @@ -387,8 +418,9 @@ def fit(self, X, y, sample_weight=None): self._validate_estimator() if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError( + "Out of bag estimation only available" " if bootstrap=True" + ) random_state = check_random_state(self.random_state) @@ -401,13 +433,17 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) if n_more_estimators < 0: - raise ValueError('n_estimators=%d must be larger or equal to ' - 'len(estimators_)=%d when warm_start==True' - % (self.n_estimators, len(self.estimators_))) + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, len(self.estimators_)) + ) elif n_more_estimators == 0: - warn("Warm-start fitting without increasing n_estimators does not " - "fit new trees.") + warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." + ) else: if self.warm_start and len(self.estimators_) > 0: # We draw from the random state to get the random state we @@ -418,7 +454,8 @@ def fit(self, X, y, sample_weight=None): samplers = [] for _ in range(n_more_estimators): tree, sampler = self._make_sampler_estimator( - random_state=random_state) + random_state=random_state + ) trees.append(tree) samplers.append(sampler) @@ -429,11 +466,22 @@ def fit(self, X, y, sample_weight=None): # at a higher level, since correctness does not rely on using # threads. 
samplers_trees = Parallel( - n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads")( - delayed(_local_parallel_build_trees)( - s, t, self, X, y, sample_weight, i, len(trees), - verbose=self.verbose, class_weight=self.class_weight) - for i, (s, t) in enumerate(zip(samplers, trees))) + n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads" + )( + delayed(_local_parallel_build_trees)( + s, + t, + self, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + ) + for i, (s, t) in enumerate(zip(samplers, trees)) + ) samplers, trees = zip(*samplers_trees) # Collect newly grown trees @@ -441,8 +489,12 @@ def fit(self, X, y, sample_weight=None): self.samplers_.extend(samplers) # Create pipeline with the fitted samplers and trees - self.pipelines_.extend([make_pipeline(deepcopy(s), deepcopy(t)) - for s, t in zip(samplers, trees)]) + self.pipelines_.extend( + [ + make_pipeline(deepcopy(s), deepcopy(t)) + for s, t in zip(samplers, trees) + ] + ) if self.oob_score: self._set_oob_score(X, y) @@ -455,4 +507,4 @@ def fit(self, X, y, sample_weight=None): return self def _more_tags(self): - return {'multioutput': False} + return {"multioutput": False} diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index 1c54dacc5..1a84c4c86 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -18,7 +18,8 @@ @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class RUSBoostClassifier(AdaBoostClassifier): """Random under-sampling integrating in the learning of an AdaBoost classifier. 
@@ -113,15 +114,23 @@ class RUSBoostClassifier(AdaBoostClassifier): array([...]) """ - def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1., - algorithm='SAMME.R', sampling_strategy='auto', - replacement=False, random_state=None): + def __init__( + self, + base_estimator=None, + n_estimators=50, + learning_rate=1.0, + algorithm="SAMME.R", + sampling_strategy="auto", + replacement=False, + random_state=None, + ): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algorithm, - random_state=random_state) + random_state=random_state, + ) self.sampling_strategy = sampling_strategy self.replacement = replacement @@ -160,7 +169,8 @@ def _validate_estimator(self): self.base_sampler_ = RandomUnderSampler( sampling_strategy=self.sampling_strategy, - replacement=self.replacement) + replacement=self.replacement, + ) def _make_sampler_estimator(self, append=True, random_state=None): """Make and configure a copy of the `base_estimator_` attribute. @@ -168,8 +178,9 @@ def _make_sampler_estimator(self, append=True, random_state=None): sub-estimators. 
""" estimator = clone(self.base_estimator_) - estimator.set_params(**{p: getattr(self, p) - for p in self.estimator_params}) + estimator.set_params( + **{p: getattr(self, p) for p in self.estimator_params} + ) sampler = clone(self.base_sampler_) if random_state is not None: @@ -179,40 +190,45 @@ def _make_sampler_estimator(self, append=True, random_state=None): if append: self.estimators_.append(estimator) self.samplers_.append(sampler) - self.pipelines_.append(make_pipeline(deepcopy(sampler), - deepcopy(estimator))) + self.pipelines_.append( + make_pipeline(deepcopy(sampler), deepcopy(estimator)) + ) return estimator, sampler def _boost_real(self, iboost, X, y, sample_weight, random_state): """Implement a single boost using the SAMME.R real algorithm.""" estimator, sampler = self._make_sampler_estimator( - random_state=random_state) + random_state=random_state + ) X_res, y_res = sampler.fit_resample(X, y) - sample_weight_res = safe_indexing(sample_weight, - sampler.sample_indices_) + sample_weight_res = safe_indexing( + sample_weight, sampler.sample_indices_ + ) estimator.fit(X_res, y_res, sample_weight=sample_weight_res) y_predict_proba = estimator.predict_proba(X) if iboost == 0: - self.classes_ = getattr(estimator, 'classes_', None) + self.classes_ = getattr(estimator, "classes_", None) self.n_classes_ = len(self.classes_) - y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1), - axis=0) + y_predict = self.classes_.take( + np.argmax(y_predict_proba, axis=1), axis=0 + ) # Instances incorrectly classified incorrect = y_predict != y # Error fraction estimator_error = np.mean( - np.average(incorrect, weights=sample_weight, axis=0)) + np.average(incorrect, weights=sample_weight, axis=0) + ) # Stop if classification is perfect if estimator_error <= 0: - return sample_weight, 1., 0. 
+ return sample_weight, 1.0, 0.0 # Construct y coding as described in Zhu et al [2]: # @@ -223,7 +239,7 @@ def _boost_real(self, iboost, X, y, sample_weight, random_state): # class label. n_classes = self.n_classes_ classes = self.classes_ - y_codes = np.array([-1. / (n_classes - 1), 1.]) + y_codes = np.array([-1.0 / (n_classes - 1), 1.0]) y_coding = y_codes.take(classes == y[:, np.newaxis]) # Displace zero probabilities so the log is defined. @@ -233,33 +249,39 @@ def _boost_real(self, iboost, X, y, sample_weight, random_state): np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba) # Boost weight using multi-class AdaBoost SAMME.R alg - estimator_weight = (-1. * self.learning_rate - * ((n_classes - 1.) / n_classes) - * (y_coding * np.log(y_predict_proba)).sum(axis=1)) + estimator_weight = ( + -1.0 + * self.learning_rate + * ((n_classes - 1.0) / n_classes) + * (y_coding * np.log(y_predict_proba)).sum(axis=1) + ) # Only boost the weights if it will fit again if not iboost == self.n_estimators - 1: # Only boost positive weights - sample_weight *= np.exp(estimator_weight * - ((sample_weight > 0) | - (estimator_weight < 0))) + sample_weight *= np.exp( + estimator_weight + * ((sample_weight > 0) | (estimator_weight < 0)) + ) - return sample_weight, 1., estimator_error + return sample_weight, 1.0, estimator_error def _boost_discrete(self, iboost, X, y, sample_weight, random_state): """Implement a single boost using the SAMME discrete algorithm.""" estimator, sampler = self._make_sampler_estimator( - random_state=random_state) + random_state=random_state + ) X_res, y_res = sampler.fit_resample(X, y) - sample_weight_res = safe_indexing(sample_weight, - sampler.sample_indices_) + sample_weight_res = safe_indexing( + sample_weight, sampler.sample_indices_ + ) estimator.fit(X_res, y_res, sample_weight=sample_weight_res) y_predict = estimator.predict(X) if iboost == 0: - self.classes_ = getattr(estimator, 'classes_', None) + self.classes_ = getattr(estimator, "classes_", 
None) self.n_classes_ = len(self.classes_) # Instances incorrectly classified @@ -267,35 +289,41 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state): # Error fraction estimator_error = np.mean( - np.average(incorrect, weights=sample_weight, axis=0)) + np.average(incorrect, weights=sample_weight, axis=0) + ) # Stop if classification is perfect if estimator_error <= 0: - return sample_weight, 1., 0. + return sample_weight, 1.0, 0.0 n_classes = self.n_classes_ # Stop if the error is at least as bad as random guessing - if estimator_error >= 1. - (1. / n_classes): + if estimator_error >= 1.0 - (1.0 / n_classes): self.estimators_.pop(-1) self.samplers_.pop(-1) self.pipelines_.pop(-1) if len(self.estimators_) == 0: - raise ValueError('BaseClassifier in AdaBoostClassifier ' - 'ensemble is worse than random, ensemble ' - 'can not be fit.') + raise ValueError( + "BaseClassifier in AdaBoostClassifier " + "ensemble is worse than random, ensemble " + "can not be fit." + ) return None, None, None # Boost weight using multi-class AdaBoost SAMME alg estimator_weight = self.learning_rate * ( - np.log((1. - estimator_error) / estimator_error) + - np.log(n_classes - 1.)) + np.log((1.0 - estimator_error) / estimator_error) + + np.log(n_classes - 1.0) + ) # Only boost the weights if I will fit again if not iboost == self.n_estimators - 1: # Only boost positive weights - sample_weight *= np.exp(estimator_weight * incorrect * - ((sample_weight > 0) | - (estimator_weight < 0))) + sample_weight *= np.exp( + estimator_weight + * incorrect + * ((sample_weight > 0) | (estimator_weight < 0)) + ) return sample_weight, estimator_weight, estimator_error diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py index f5de813b1..935bb3dfe 100644 --- a/imblearn/ensemble/base.py +++ b/imblearn/ensemble/base.py @@ -19,7 +19,7 @@ class BaseEnsembleSampler(BaseSampler): instead. 
""" - _sampling_type = 'ensemble' + _sampling_type = "ensemble" def fit_resample(self, X, y): """Resample the dataset. @@ -47,7 +47,8 @@ def fit_resample(self, X, y): X, y, binarize_y = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, self._sampling_type) + self.sampling_strategy, y, self._sampling_type + ) output = self._fit_resample(X, y) @@ -55,7 +56,8 @@ def fit_resample(self, X, y): y_resampled = output[1] classes = np.unique(y) y_resampled_encoded = np.array( - [label_binarize(batch_y, classes) for batch_y in y_resampled]) + [label_binarize(batch_y, classes) for batch_y in y_resampled] + ) if len(output) == 2: return output[0], y_resampled_encoded return output[0], y_resampled_encoded, output[2] diff --git a/imblearn/ensemble/tests/test_bagging.py b/imblearn/ensemble/tests/test_bagging.py index 307dcdd31..734070fc6 100644 --- a/imblearn/ensemble/tests/test_bagging.py +++ b/imblearn/ensemble/tests/test_bagging.py @@ -7,17 +7,22 @@ import pytest from sklearn.datasets import load_iris, make_hastie_10_2 -from sklearn.model_selection import (GridSearchCV, ParameterGrid, - train_test_split) +from sklearn.model_selection import ( + GridSearchCV, + ParameterGrid, + train_test_split, +) from sklearn.dummy import DummyClassifier from sklearn.linear_model import Perceptron, LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.feature_selection import SelectKBest -from sklearn.utils.testing import (assert_array_equal, - assert_array_almost_equal, - assert_allclose) +from sklearn.utils.testing import ( + assert_array_equal, + assert_array_almost_equal, + assert_allclose, +) from imblearn.datasets import make_imbalance from imblearn.ensemble import BalancedBaggingClassifier @@ -32,30 +37,31 @@ def test_balanced_bagging_classifier(): X, y = make_imbalance( iris.data, iris.target, - sampling_strategy={0: 20, - 1: 25, - 
2: 50}, - random_state=0) + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - grid = ParameterGrid({ - "max_samples": [0.5, 1.0], - "max_features": [1, 2, 4], - "bootstrap": [True, False], - "bootstrap_features": [True, False] - }) + grid = ParameterGrid( + { + "max_samples": [0.5, 1.0], + "max_features": [1, 2, 4], + "bootstrap": [True, False], + "bootstrap_features": [True, False], + } + ) for base_estimator in [ - None, - DummyClassifier(), - Perceptron(max_iter=1000, tol=1e-3), - DecisionTreeClassifier(), - KNeighborsClassifier(), - SVC(gamma='scale') + None, + DummyClassifier(), + Perceptron(max_iter=1000, tol=1e-3), + DecisionTreeClassifier(), + KNeighborsClassifier(), + SVC(gamma="scale"), ]: for params in grid: BalancedBaggingClassifier( - base_estimator=base_estimator, random_state=0, **params).fit( - X_train, y_train).predict(X_test) + base_estimator=base_estimator, random_state=0, **params + ).fit(X_train, y_train).predict(X_test) def test_bootstrap_samples(): @@ -63,10 +69,9 @@ def test_bootstrap_samples(): X, y = make_imbalance( iris.data, iris.target, - sampling_strategy={0: 20, - 1: 25, - 2: 50}, - random_state=0) + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) base_estimator = DecisionTreeClassifier().fit(X_train, y_train) @@ -79,20 +84,24 @@ def test_bootstrap_samples(): bootstrap=False, n_estimators=10, sampling_strategy={}, - random_state=0).fit(X_train, y_train) + random_state=0, + ).fit(X_train, y_train) - assert (ensemble.score(X_train, y_train) == base_estimator.score( - X_train, y_train)) + assert ensemble.score(X_train, y_train) == base_estimator.score( + X_train, y_train + ) # with bootstrap, trees are no longer perfect on the training set ensemble = BalancedBaggingClassifier( base_estimator=DecisionTreeClassifier(), max_samples=1.0, bootstrap=True, - 
random_state=0).fit(X_train, y_train) + random_state=0, + ).fit(X_train, y_train) - assert (ensemble.score(X_train, y_train) < base_estimator.score( - X_train, y_train)) + assert ensemble.score(X_train, y_train) < base_estimator.score( + X_train, y_train + ) def test_bootstrap_features(): @@ -100,17 +109,17 @@ def test_bootstrap_features(): X, y = make_imbalance( iris.data, iris.target, - sampling_strategy={0: 20, - 1: 25, - 2: 50}, - random_state=0) + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) ensemble = BalancedBaggingClassifier( base_estimator=DecisionTreeClassifier(), max_features=1.0, bootstrap_features=False, - random_state=0).fit(X_train, y_train) + random_state=0, + ).fit(X_train, y_train) for features in ensemble.estimators_features_: assert np.unique(features).shape[0] == X.shape[1] @@ -119,7 +128,8 @@ def test_bootstrap_features(): base_estimator=DecisionTreeClassifier(), max_features=1.0, bootstrap_features=True, - random_state=0).fit(X_train, y_train) + random_state=0, + ).fit(X_train, y_train) unique_features = [ np.unique(features).shape[0] @@ -133,40 +143,46 @@ def test_probability(): X, y = make_imbalance( iris.data, iris.target, - sampling_strategy={0: 20, - 1: 25, - 2: 50}, - random_state=0) + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) with np.errstate(divide="ignore", invalid="ignore"): # Normal case ensemble = BalancedBaggingClassifier( - base_estimator=DecisionTreeClassifier(), random_state=0).fit( - X_train, y_train) + base_estimator=DecisionTreeClassifier(), random_state=0 + ).fit(X_train, y_train) assert_array_almost_equal( np.sum(ensemble.predict_proba(X_test), axis=1), - np.ones(len(X_test))) + np.ones(len(X_test)), + ) assert_array_almost_equal( ensemble.predict_proba(X_test), - np.exp(ensemble.predict_log_proba(X_test))) + 
np.exp(ensemble.predict_log_proba(X_test)), + ) # Degenerate case, where some classes are missing ensemble = BalancedBaggingClassifier( - base_estimator=LogisticRegression(solver='lbfgs', - multi_class='auto'), - random_state=0, max_samples=5) + base_estimator=LogisticRegression( + solver="lbfgs", multi_class="auto" + ), + random_state=0, + max_samples=5, + ) ensemble.fit(X_train, y_train) assert_array_almost_equal( np.sum(ensemble.predict_proba(X_test), axis=1), - np.ones(len(X_test))) + np.ones(len(X_test)), + ) assert_array_almost_equal( ensemble.predict_proba(X_test), - np.exp(ensemble.predict_log_proba(X_test))) + np.exp(ensemble.predict_log_proba(X_test)), + ) def test_oob_score_classification(): @@ -175,19 +191,19 @@ def test_oob_score_classification(): X, y = make_imbalance( iris.data, iris.target, - sampling_strategy={0: 20, - 1: 25, - 2: 50}, - random_state=0) + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - for base_estimator in [DecisionTreeClassifier(), SVC(gamma='scale')]: + for base_estimator in [DecisionTreeClassifier(), SVC(gamma="scale")]: clf = BalancedBaggingClassifier( base_estimator=base_estimator, n_estimators=100, bootstrap=True, oob_score=True, - random_state=0).fit(X_train, y_train) + random_state=0, + ).fit(X_train, y_train) test_score = clf.score(X_test, y_test) @@ -200,7 +216,8 @@ def test_oob_score_classification(): n_estimators=1, bootstrap=True, oob_score=True, - random_state=0).fit(X_train, y_train) + random_state=0, + ).fit(X_train, y_train) def test_single_estimator(): @@ -208,10 +225,9 @@ def test_single_estimator(): X, y = make_imbalance( iris.data, iris.target, - sampling_strategy={0: 20, - 1: 25, - 2: 50}, - random_state=0) + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = BalancedBaggingClassifier( @@ -219,12 +235,15 @@ def 
test_single_estimator(): n_estimators=1, bootstrap=False, bootstrap_features=False, - random_state=0).fit(X_train, y_train) + random_state=0, + ).fit(X_train, y_train) clf2 = make_pipeline( RandomUnderSampler( - random_state=clf1.estimators_[0].steps[0][1].random_state), - KNeighborsClassifier()).fit(X_train, y_train) + random_state=clf1.estimators_[0].steps[0][1].random_state + ), + KNeighborsClassifier(), + ).fit(X_train, y_train) assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) @@ -232,9 +251,8 @@ def test_single_estimator(): def test_error(): # Test that it gives proper exception on deficient input. X, y = make_imbalance( - iris.data, iris.target, sampling_strategy={0: 20, - 1: 25, - 2: 50}) + iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50} + ) base = DecisionTreeClassifier() # Test n_estimators @@ -268,8 +286,9 @@ def test_error(): BalancedBaggingClassifier(base, max_features="foobar").fit(X, y) # Test support of decision_function - assert not (hasattr( - BalancedBaggingClassifier(base).fit(X, y), 'decision_function')) + assert not ( + hasattr(BalancedBaggingClassifier(base).fit(X, y), "decision_function") + ) def test_gridsearch(): @@ -279,11 +298,14 @@ def test_gridsearch(): y[y == 2] = 1 # Grid search with scoring based on decision_function - parameters = {'n_estimators': (1, 2), 'base_estimator__C': (1, 2)} + parameters = {"n_estimators": (1, 2), "base_estimator__C": (1, 2)} GridSearchCV( - BalancedBaggingClassifier(SVC(gamma='scale')), parameters, cv=3, - scoring="roc_auc").fit(X, y) + BalancedBaggingClassifier(SVC(gamma="scale")), + parameters, + cv=3, + scoring="roc_auc", + ).fit(X, y) def test_base_estimator(): @@ -291,28 +313,30 @@ def test_base_estimator(): X, y = make_imbalance( iris.data, iris.target, - sampling_strategy={0: 20, - 1: 25, - 2: 50}, - random_state=0) + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - ensemble = 
BalancedBaggingClassifier( - None, n_jobs=3, random_state=0).fit(X_train, y_train) + ensemble = BalancedBaggingClassifier(None, n_jobs=3, random_state=0).fit( + X_train, y_train + ) - assert isinstance(ensemble.base_estimator_.steps[-1][1], - DecisionTreeClassifier) + assert isinstance( + ensemble.base_estimator_.steps[-1][1], DecisionTreeClassifier + ) ensemble = BalancedBaggingClassifier( - DecisionTreeClassifier(), n_jobs=3, random_state=0).fit( - X_train, y_train) + DecisionTreeClassifier(), n_jobs=3, random_state=0 + ).fit(X_train, y_train) - assert isinstance(ensemble.base_estimator_.steps[-1][1], - DecisionTreeClassifier) + assert isinstance( + ensemble.base_estimator_.steps[-1][1], DecisionTreeClassifier + ) ensemble = BalancedBaggingClassifier( - Perceptron(max_iter=1000, tol=1e-3), n_jobs=3, random_state=0).fit( - X_train, y_train) + Perceptron(max_iter=1000, tol=1e-3), n_jobs=3, random_state=0 + ).fit(X_train, y_train) assert isinstance(ensemble.base_estimator_.steps[-1][1], Perceptron) @@ -321,13 +345,13 @@ def test_bagging_with_pipeline(): X, y = make_imbalance( iris.data, iris.target, - sampling_strategy={0: 20, - 1: 25, - 2: 50}, - random_state=0) + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) estimator = BalancedBaggingClassifier( make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), - max_features=2) + max_features=2, + ) estimator.fit(X, y).predict(X) @@ -342,18 +366,21 @@ def test_warm_start(random_state=42): clf_ws = BalancedBaggingClassifier( n_estimators=n_estimators, random_state=random_state, - warm_start=True) + warm_start=True, + ) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert len(clf_ws) == n_estimators clf_no_ws = BalancedBaggingClassifier( - n_estimators=10, random_state=random_state, warm_start=False) + n_estimators=10, random_state=random_state, warm_start=False + ) clf_no_ws.fit(X, y) - assert ({pipe.steps[-1][1].random_state for pipe in clf_ws} == { - pipe.steps[-1][1].random_state 
for pipe in clf_no_ws}) + assert {pipe.steps[-1][1].random_state for pipe in clf_ws} == { + pipe.steps[-1][1].random_state for pipe in clf_no_ws + } def test_warm_start_smaller_n_estimators(): @@ -372,12 +399,13 @@ def test_warm_start_equal_n_estimators(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf = BalancedBaggingClassifier( - n_estimators=5, warm_start=True, random_state=83) + n_estimators=5, warm_start=True, random_state=83 + ) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # modify X to nonsense values, this should not change anything - X_train += 1. + X_train += 1.0 warn_msg = "Warm-start fitting without increasing n_estimators does not" with pytest.warns(UserWarning, match=warn_msg): @@ -392,14 +420,16 @@ def test_warm_start_equivalence(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf_ws = BalancedBaggingClassifier( - n_estimators=5, warm_start=True, random_state=3141) + n_estimators=5, warm_start=True, random_state=3141 + ) clf_ws.fit(X_train, y_train) clf_ws.set_params(n_estimators=10) clf_ws.fit(X_train, y_train) y1 = clf_ws.predict(X_test) clf = BalancedBaggingClassifier( - n_estimators=10, warm_start=False, random_state=3141) + n_estimators=10, warm_start=False, random_state=3141 + ) clf.fit(X_train, y_train) y2 = clf.predict(X_test) @@ -410,7 +440,8 @@ def test_warm_start_with_oob_score_fails(): # Check using oob_score and warm_start simultaneously fails X, y = make_hastie_10_2(n_samples=20, random_state=1) clf = BalancedBaggingClassifier( - n_estimators=5, warm_start=True, oob_score=True) + n_estimators=5, warm_start=True, oob_score=True + ) with pytest.raises(ValueError): clf.fit(X, y) @@ -437,7 +468,8 @@ def test_oob_score_consistency(): max_samples=0.5, max_features=0.5, oob_score=True, - random_state=1) + random_state=1, + ) assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_ @@ -449,11 +481,13 @@ def test_estimators_samples(): # remap the y outside of the 
BalancedBaggingclassifier # _, y = np.unique(y, return_inverse=True) - bagging = BalancedBaggingClassifier(LogisticRegression(solver='lbfgs', - multi_class='auto'), - max_samples=0.5, - max_features=0.5, random_state=1, - bootstrap=False) + bagging = BalancedBaggingClassifier( + LogisticRegression(solver="lbfgs", multi_class="auto"), + max_samples=0.5, + max_features=0.5, + random_state=1, + bootstrap=False, + ) bagging.fit(X, y) # Get relevant attributes @@ -464,7 +498,7 @@ def test_estimators_samples(): # Test for correct formatting assert len(estimators_samples) == len(estimators) assert len(estimators_samples[0]) == len(X) // 2 - assert estimators_samples[0].dtype.kind == 'i' + assert estimators_samples[0].dtype.kind == "i" # Re-fit single estimator to test for consistent sampling estimator_index = 0 @@ -491,6 +525,7 @@ def test_max_samples_consistency(): KNeighborsClassifier(), max_samples=max_samples, max_features=0.5, - random_state=1) + random_state=1, + ) bagging.fit(X, y) assert bagging._max_samples == max_samples diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index f9a8b00e7..db4762e88 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -23,34 +23,50 @@ # Generate a global dataset to use RND_SEED = 0 -X = np.array([[0.5220963, 0.11349303], [0.59091459, 0.40692742], [ - 1.10915364, 0.05718352 -], [0.22039505, 0.26469445], [1.35269503, 0.44812421], [0.85117925, 1.0185556], - [-2.10724436, 0.70263997], [-0.23627356, 0.30254174], - [-1.23195149, 0.15427291], [-0.58539673, 0.62515052]]) +X = np.array( + [ + [0.5220963, 0.11349303], + [0.59091459, 0.40692742], + [1.10915364, 0.05718352], + [0.22039505, 0.26469445], + [1.35269503, 0.44812421], + [0.85117925, 1.0185556], + [-2.10724436, 0.70263997], + [-0.23627356, 0.30254174], + [-1.23195149, 0.15427291], + [-0.58539673, 0.62515052], + ] +) Y = np.array([1, 2, 2, 2, 1, 0, 1, 1, 1, 0]) 
@pytest.mark.parametrize("n_estimators", [10, 20]) -@pytest.mark.parametrize("base_estimator", [ - AdaBoostClassifier(n_estimators=5), - AdaBoostClassifier(n_estimators=10)]) +@pytest.mark.parametrize( + "base_estimator", + [AdaBoostClassifier(n_estimators=5), AdaBoostClassifier(n_estimators=10)], +) def test_easy_ensemble_classifier(n_estimators, base_estimator): # Check classification for various parameter settings. - X, y = make_imbalance(iris.data, iris.target, - sampling_strategy={0: 20, 1: 25, 2: 50}, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - eec = EasyEnsembleClassifier(n_estimators=n_estimators, - base_estimator=base_estimator, - n_jobs=-1, - random_state=RND_SEED) + eec = EasyEnsembleClassifier( + n_estimators=n_estimators, + base_estimator=base_estimator, + n_jobs=-1, + random_state=RND_SEED, + ) eec.fit(X_train, y_train).score(X_test, y_test) assert len(eec.estimators_) == n_estimators for est in eec.estimators_: - assert (len(est.named_steps['classifier']) == - base_estimator.n_estimators) + assert ( + len(est.named_steps["classifier"]) == base_estimator.n_estimators + ) # test the different prediction function eec.predict(X_test) eec.predict_proba(X_test) @@ -60,32 +76,42 @@ def test_easy_ensemble_classifier(n_estimators, base_estimator): def test_base_estimator(): # Check base_estimator and its default values. 
- X, y = make_imbalance(iris.data, iris.target, - sampling_strategy={0: 20, 1: 25, 2: 50}, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - ensemble = EasyEnsembleClassifier( - 2, None, n_jobs=-1, random_state=0).fit(X_train, y_train) + ensemble = EasyEnsembleClassifier(2, None, n_jobs=-1, random_state=0).fit( + X_train, y_train + ) - assert isinstance(ensemble.base_estimator_.steps[-1][1], - AdaBoostClassifier) + assert isinstance( + ensemble.base_estimator_.steps[-1][1], AdaBoostClassifier + ) ensemble = EasyEnsembleClassifier( - 2, AdaBoostClassifier(), n_jobs=-1, random_state=0).fit( - X_train, y_train) + 2, AdaBoostClassifier(), n_jobs=-1, random_state=0 + ).fit(X_train, y_train) - assert isinstance(ensemble.base_estimator_.steps[-1][1], - AdaBoostClassifier) + assert isinstance( + ensemble.base_estimator_.steps[-1][1], AdaBoostClassifier + ) def test_bagging_with_pipeline(): - X, y = make_imbalance(iris.data, iris.target, - sampling_strategy={0: 20, 1: 25, 2: 50}, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) estimator = EasyEnsembleClassifier( n_estimators=2, - base_estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier())) + base_estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()), + ) estimator.fit(X, y).predict(X) @@ -100,18 +126,21 @@ def test_warm_start(random_state=42): clf_ws = EasyEnsembleClassifier( n_estimators=n_estimators, random_state=random_state, - warm_start=True) + warm_start=True, + ) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert len(clf_ws) == n_estimators clf_no_ws = EasyEnsembleClassifier( - n_estimators=10, random_state=random_state, warm_start=False) + n_estimators=10, random_state=random_state, warm_start=False + ) clf_no_ws.fit(X, y) - 
assert ({pipe.steps[-1][1].random_state for pipe in clf_ws} == { - pipe.steps[-1][1].random_state for pipe in clf_no_ws}) + assert {pipe.steps[-1][1].random_state for pipe in clf_ws} == { + pipe.steps[-1][1].random_state for pipe in clf_no_ws + } def test_warm_start_smaller_n_estimators(): @@ -130,12 +159,13 @@ def test_warm_start_equal_n_estimators(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf = EasyEnsembleClassifier( - n_estimators=5, warm_start=True, random_state=83) + n_estimators=5, warm_start=True, random_state=83 + ) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # modify X to nonsense values, this should not change anything - X_train += 1. + X_train += 1.0 warn_msg = "Warm-start fitting without increasing n_estimators" with pytest.warns(UserWarning, match=warn_msg): @@ -150,14 +180,16 @@ def test_warm_start_equivalence(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf_ws = EasyEnsembleClassifier( - n_estimators=5, warm_start=True, random_state=3141) + n_estimators=5, warm_start=True, random_state=3141 + ) clf_ws.fit(X_train, y_train) clf_ws.set_params(n_estimators=10) clf_ws.fit(X_train, y_train) y1 = clf_ws.predict(X_test) clf = EasyEnsembleClassifier( - n_estimators=10, warm_start=False, random_state=3141) + n_estimators=10, warm_start=False, random_state=3141 + ) clf.fit(X_train, y_train) y2 = clf.predict(X_test) @@ -166,40 +198,58 @@ def test_warm_start_equivalence(): @pytest.mark.parametrize( "n_estimators, msg_error", - [(1., "n_estimators must be an integer"), - (-10, "n_estimators must be greater than zero")]) + [ + (1.0, "n_estimators must be an integer"), + (-10, "n_estimators must be greater than zero"), + ], +) def test_easy_ensemble_classifier_error(n_estimators, msg_error): - X, y = make_imbalance(iris.data, iris.target, - sampling_strategy={0: 20, 1: 25, 2: 50}, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 
2: 50}, + random_state=0, + ) with pytest.raises(ValueError, match=msg_error): eec = EasyEnsembleClassifier(n_estimators=n_estimators) eec.fit(X, y) def test_easy_ensemble_classifier_single_estimator(): - X, y = make_imbalance(iris.data, iris.target, - sampling_strategy={0: 20, 1: 25, 2: 50}, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = EasyEnsembleClassifier(n_estimators=1, random_state=0).fit( - X_train, y_train) - clf2 = make_pipeline(RandomUnderSampler(random_state=0), - AdaBoostClassifier(random_state=0)).fit( - X_train, y_train) + X_train, y_train + ) + clf2 = make_pipeline( + RandomUnderSampler(random_state=0), AdaBoostClassifier(random_state=0) + ).fit(X_train, y_train) assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) def test_easy_ensemble_classifier_grid_search(): - X, y = make_imbalance(iris.data, iris.target, - sampling_strategy={0: 20, 1: 25, 2: 50}, - random_state=0) - - parameters = {'n_estimators': [1, 2], - 'base_estimator__n_estimators': [3, 4]} + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, + ) + + parameters = { + "n_estimators": [1, 2], + "base_estimator__n_estimators": [3, 4], + } grid_search = GridSearchCV( EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()), - parameters, cv=5, iid=False) + parameters, + cv=5, + iid=False, + ) grid_search.fit(X, y) diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py index e43b26339..85ad37743 100644 --- a/imblearn/ensemble/tests/test_forest.py +++ b/imblearn/ensemble/tests/test_forest.py @@ -12,21 +12,34 @@ @pytest.fixture def imbalanced_dataset(): - return make_classification(n_samples=10000, n_features=2, n_informative=2, - n_redundant=0, n_repeated=0, n_classes=3, - n_clusters_per_class=1, - weights=[0.01, 
0.05, 0.94], class_sep=0.8, - random_state=0) + return make_classification( + n_samples=10000, + n_features=2, + n_informative=2, + n_redundant=0, + n_repeated=0, + n_classes=3, + n_clusters_per_class=1, + weights=[0.01, 0.05, 0.94], + class_sep=0.8, + random_state=0, + ) @pytest.mark.parametrize( "forest_params, err_msg", - [({"n_estimators": 'whatever'}, "n_estimators must be an integer"), - ({"n_estimators": -100}, "n_estimators must be greater than zero"), - ({"bootstrap": False, 'oob_score': True}, "Out of bag estimation only")] + [ + ({"n_estimators": "whatever"}, "n_estimators must be an integer"), + ({"n_estimators": -100}, "n_estimators must be greater than zero"), + ( + {"bootstrap": False, "oob_score": True}, + "Out of bag estimation only", + ), + ], ) -def test_balanced_random_forest_error(imbalanced_dataset, forest_params, - err_msg): +def test_balanced_random_forest_error( + imbalanced_dataset, forest_params, err_msg +): brf = BalancedRandomForestClassifier(**forest_params) with pytest.raises(ValueError, match=err_msg): brf.fit(*imbalanced_dataset) @@ -49,8 +62,9 @@ def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset): def test_balanced_random_forest(imbalanced_dataset): n_estimators = 10 - brf = BalancedRandomForestClassifier(n_estimators=n_estimators, - random_state=0) + brf = BalancedRandomForestClassifier( + n_estimators=n_estimators, random_state=0 + ) brf.fit(*imbalanced_dataset) assert len(brf.samplers_) == n_estimators @@ -62,14 +76,18 @@ def test_balanced_random_forest(imbalanced_dataset): def test_balanced_random_forest_attributes(imbalanced_dataset): X, y = imbalanced_dataset n_estimators = 10 - brf = BalancedRandomForestClassifier(n_estimators=n_estimators, - random_state=0) + brf = BalancedRandomForestClassifier( + n_estimators=n_estimators, random_state=0 + ) brf.fit(X, y) for idx in range(n_estimators): X_res, y_res = brf.samplers_[idx].fit_resample(X, y) - X_res_2, y_res_2 = brf.pipelines_[idx].named_steps[ - 
'randomundersampler'].fit_resample(X, y) + X_res_2, y_res_2 = ( + brf.pipelines_[idx] + .named_steps["randomundersampler"] + .fit_resample(X, y) + ) assert_allclose(X_res, X_res_2) assert_array_equal(y_res, y_res_2) @@ -95,21 +113,24 @@ def test_balanced_random_forest_oob(imbalanced_dataset): est = BalancedRandomForestClassifier(oob_score=True, random_state=0) n_samples = X.shape[0] - est.fit(X[:n_samples // 2, :], y[:n_samples // 2]) - test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:]) + est.fit(X[: n_samples // 2, :], y[: n_samples // 2]) + test_score = est.score(X[n_samples // 2 :, :], y[n_samples // 2 :]) assert abs(test_score - est.oob_score_) < 0.1 # Check warning if not enough estimators - est = BalancedRandomForestClassifier(oob_score=True, random_state=0, - n_estimators=1, bootstrap=True) - with pytest.warns(UserWarning) and np.errstate(divide="ignore", - invalid="ignore"): + est = BalancedRandomForestClassifier( + oob_score=True, random_state=0, n_estimators=1, bootstrap=True + ) + with pytest.warns(UserWarning) and np.errstate( + divide="ignore", invalid="ignore" + ): est.fit(X, y) def test_balanced_random_forest_grid_search(imbalanced_dataset): brf = BalancedRandomForestClassifier() - grid = GridSearchCV(brf, {'n_estimators': (1, 2), 'max_depth': (1, 2)}, - cv=3, iid=False) + grid = GridSearchCV( + brf, {"n_estimators": (1, 2), "max_depth": (1, 2)}, cv=3, iid=False + ) grid.fit(*imbalanced_dataset) diff --git a/imblearn/ensemble/tests/test_weight_boosting.py b/imblearn/ensemble/tests/test_weight_boosting.py index d0878a298..eef975b28 100644 --- a/imblearn/ensemble/tests/test_weight_boosting.py +++ b/imblearn/ensemble/tests/test_weight_boosting.py @@ -11,17 +11,26 @@ @pytest.fixture def imbalanced_dataset(): - return make_classification(n_samples=10000, n_features=3, n_informative=2, - n_redundant=0, n_repeated=0, n_classes=3, - n_clusters_per_class=1, - weights=[0.01, 0.05, 0.94], class_sep=0.8, - random_state=0) + return 
make_classification( + n_samples=10000, + n_features=3, + n_informative=2, + n_redundant=0, + n_repeated=0, + n_classes=3, + n_clusters_per_class=1, + weights=[0.01, 0.05, 0.94], + class_sep=0.8, + random_state=0, + ) @pytest.mark.parametrize( "boosting_params, err_msg", - [({"n_estimators": 'whatever'}, "n_estimators must be an integer"), - ({"n_estimators": -100}, "n_estimators must be greater than zero")] + [ + ({"n_estimators": "whatever"}, "n_estimators must be an integer"), + ({"n_estimators": -100}, "n_estimators must be greater than zero"), + ], ) def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg): rusboost = RUSBoostClassifier(**boosting_params) @@ -29,18 +38,18 @@ def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg): rusboost.fit(*imbalanced_dataset) -@pytest.mark.parametrize('algorithm', ['SAMME', 'SAMME.R']) +@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_rusboost(imbalanced_dataset, algorithm): X, y = imbalanced_dataset - X_train, X_test, y_train, y_test = train_test_split(X, y, - stratify=y, - random_state=1) + X_train, X_test, y_train, y_test = train_test_split( + X, y, stratify=y, random_state=1 + ) classes = np.unique(y) n_estimators = 500 - rusboost = RUSBoostClassifier(n_estimators=n_estimators, - algorithm=algorithm, - random_state=0) + rusboost = RUSBoostClassifier( + n_estimators=n_estimators, algorithm=algorithm, random_state=0 + ) rusboost.fit(X_train, y_train) assert_array_equal(classes, rusboost.classes_) @@ -51,11 +60,13 @@ def test_rusboost(imbalanced_dataset, algorithm): assert len(rusboost.pipelines_) == len(rusboost.samplers_) # each sampler in the ensemble should have different random state - assert (len({sampler.random_state for sampler in rusboost.samplers_}) == - len(rusboost.samplers_)) + assert len( + {sampler.random_state for sampler in rusboost.samplers_} + ) == len(rusboost.samplers_) # each estimator in the ensemble should have different random state - assert 
(len({est.random_state for est in rusboost.estimators_}) == - len(rusboost.estimators_)) + assert len({est.random_state for est in rusboost.estimators_}) == len( + rusboost.estimators_ + ) # check the consistency of the feature importances assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1] @@ -67,18 +78,18 @@ def test_rusboost(imbalanced_dataset, algorithm): score = rusboost.score(X_test, y_test) assert score > 0.7, "Failed with algorithm {} and score {}".format( - algorithm, score) + algorithm, score + ) y_pred = rusboost.predict(X_test) assert y_pred.shape == y_test.shape -@pytest.mark.parametrize('algorithm', ['SAMME', 'SAMME.R']) +@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_rusboost_sample_weight(imbalanced_dataset, algorithm): X, y = imbalanced_dataset sample_weight = np.ones_like(y) - rusboost = RUSBoostClassifier(algorithm=algorithm, - random_state=0) + rusboost = RUSBoostClassifier(algorithm=algorithm, random_state=0) # Predictions should be the same when sample_weight are all ones y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X) diff --git a/imblearn/exceptions.py b/imblearn/exceptions.py index b4d5fe479..4676e97e8 100644 --- a/imblearn/exceptions.py +++ b/imblearn/exceptions.py @@ -8,5 +8,8 @@ def raise_isinstance_error(variable_name, possible_type, variable): - raise ValueError("{} has to be one of {}. Got {} instead.".format( - variable_name, possible_type, type(variable))) + raise ValueError( + "{} has to be one of {}. 
Got {} instead.".format( + variable_name, possible_type, type(variable) + ) + ) diff --git a/imblearn/keras/__init__.py b/imblearn/keras/__init__.py index 407e0c7dd..ae6a7df4c 100644 --- a/imblearn/keras/__init__.py +++ b/imblearn/keras/__init__.py @@ -4,5 +4,4 @@ from ._generator import BalancedBatchGenerator from ._generator import balanced_batch_generator -__all__ = ['BalancedBatchGenerator', - 'balanced_batch_generator'] +__all__ = ["BalancedBatchGenerator", "balanced_batch_generator"] diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index 040e9944f..64d2d6325 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -16,6 +16,7 @@ def import_keras(): def import_from_keras(): try: import keras + return (keras.utils.Sequence,), True except ImportError: return tuple(), False @@ -23,6 +24,7 @@ def import_from_keras(): def import_from_tensforflow(): try: from tensorflow import keras + return (keras.utils.Sequence,), True except ImportError: return tuple(), False @@ -31,7 +33,7 @@ def import_from_tensforflow(): ParentClassTensorflow, has_keras_tf = import_from_tensforflow() has_keras = has_keras_k or has_keras_tf if has_keras: - ParentClass = (ParentClassKeras + ParentClassTensorflow) + ParentClass = ParentClassKeras + ParentClassTensorflow else: ParentClass = (object,) return ParentClass, has_keras @@ -51,9 +53,14 @@ def import_from_tensforflow(): from ..utils._docstring import _random_state_docstring from ..tensorflow import balanced_batch_generator as tf_bbg -DONT_HAVE_RANDOM_STATE = ('NearMiss', 'EditedNearestNeighbours', - 'RepeatedEditedNearestNeighbours', 'AllKNN', - 'NeighbourhoodCleaningRule', 'TomekLinks') +DONT_HAVE_RANDOM_STATE = ( + "NearMiss", + "EditedNearestNeighbours", + "RepeatedEditedNearestNeighbours", + "AllKNN", + "NeighbourhoodCleaningRule", + "TomekLinks", +) class BalancedBatchGenerator(*ParentClass): @@ -133,8 +140,16 @@ class BalancedBatchGenerator(*ParentClass): # flag for keras sequence 
duck-typing use_sequence_api = True - def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32, - keep_sparse=False, random_state=None): + def __init__( + self, + X, + y, + sample_weight=None, + sampler=None, + batch_size=32, + keep_sparse=False, + random_state=None, + ): if not HAS_KERAS: raise ImportError("'No module named 'keras'") self.X = X @@ -157,9 +172,10 @@ def _sample(self): set_random_state(self.sampler_, random_state) self.sampler_.fit_resample(self.X, self.y) - if not hasattr(self.sampler_, 'sample_indices_'): - raise ValueError("'sampler' needs to have an attribute " - "'sample_indices_'.") + if not hasattr(self.sampler_, "sample_indices_"): + raise ValueError( + "'sampler' needs to have an attribute " "'sample_indices_'." + ) self.indices_ = self.sampler_.sample_indices_ # shuffle the indices since the sampler are packing them by class random_state.shuffle(self.indices_) @@ -169,18 +185,26 @@ def __len__(self): def __getitem__(self, index): X_resampled = safe_indexing( - self.X, self.indices_[index * self.batch_size: - (index + 1) * self.batch_size]) + self.X, + self.indices_[ + index * self.batch_size : (index + 1) * self.batch_size + ], + ) y_resampled = safe_indexing( - self.y, self.indices_[index * self.batch_size: - (index + 1) * self.batch_size]) + self.y, + self.indices_[ + index * self.batch_size : (index + 1) * self.batch_size + ], + ) if issparse(X_resampled) and not self.keep_sparse: X_resampled = X_resampled.toarray() if self.sample_weight is not None: sample_weight_resampled = safe_indexing( self.sample_weight, - self.indices_[index * self.batch_size: - (index + 1) * self.batch_size]) + self.indices_[ + index * self.batch_size : (index + 1) * self.batch_size + ], + ) if self.sample_weight is None: return X_resampled, y_resampled @@ -189,9 +213,15 @@ def __getitem__(self, index): @Substitution(random_state=_random_state_docstring) -def balanced_batch_generator(X, y, sample_weight=None, sampler=None, - batch_size=32, 
keep_sparse=False, - random_state=None): +def balanced_batch_generator( + X, + y, + sample_weight=None, + sampler=None, + batch_size=32, + keep_sparse=False, + random_state=None, +): """Create a balanced batch generator to train keras model. Returns a generator --- as well as the number of step per epoch --- which @@ -261,6 +291,12 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, """ - return tf_bbg(X=X, y=y, sample_weight=sample_weight, - sampler=sampler, batch_size=batch_size, - keep_sparse=keep_sparse, random_state=random_state) + return tf_bbg( + X=X, + y=y, + sample_weight=sample_weight, + sampler=sampler, + batch_size=batch_size, + keep_sparse=keep_sparse, + random_state=random_state, + ) diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py index d374db3ec..63dff5ec5 100644 --- a/imblearn/keras/tests/test_generator.py +++ b/imblearn/keras/tests/test_generator.py @@ -5,7 +5,7 @@ from sklearn.datasets import load_iris -keras = pytest.importorskip('keras') +keras = pytest.importorskip("keras") from keras.models import Sequential from keras.layers import Dense from keras.utils import to_categorical @@ -29,46 +29,54 @@ def data(): def _build_keras_model(n_classes, n_features): model = Sequential() - model.add(Dense(n_classes, input_dim=n_features, activation='softmax')) - model.compile(optimizer='sgd', loss='categorical_crossentropy', - metrics=['accuracy']) + model.add(Dense(n_classes, input_dim=n_features, activation="softmax")) + model.compile( + optimizer="sgd", loss="categorical_crossentropy", metrics=["accuracy"] + ) return model def test_balanced_batch_generator_class_no_return_indices(data): - with pytest.raises(ValueError, match='needs to have an attribute'): + with pytest.raises(ValueError, match="needs to have an attribute"): BalancedBatchGenerator( *data, sampler=ClusterCentroids(), batch_size=10 ) -@pytest.mark.filterwarnings('ignore:`wait_time` is not used') # keras 2.2.4 
+@pytest.mark.filterwarnings("ignore:`wait_time` is not used") # keras 2.2.4 @pytest.mark.parametrize( "sampler, sample_weight", - [(None, None), - (RandomOverSampler(), None), - (NearMiss(), None), - (None, np.random.uniform(size=120))] + [ + (None, None), + (RandomOverSampler(), None), + (NearMiss(), None), + (None, np.random.uniform(size=120)), + ], ) def test_balanced_batch_generator_class(data, sampler, sample_weight): X, y = data model = _build_keras_model(y.shape[1], X.shape[1]) - training_generator = BalancedBatchGenerator(X, y, - sample_weight=sample_weight, - sampler=sampler, - batch_size=10, - random_state=42) - model.fit_generator(generator=training_generator, - epochs=10) + training_generator = BalancedBatchGenerator( + X, + y, + sample_weight=sample_weight, + sampler=sampler, + batch_size=10, + random_state=42, + ) + model.fit_generator(generator=training_generator, epochs=10) @pytest.mark.parametrize("keep_sparse", [True, False]) def test_balanced_batch_generator_class_sparse(data, keep_sparse): X, y = data - training_generator = BalancedBatchGenerator(sparse.csr_matrix(X), y, - batch_size=10, - keep_sparse=keep_sparse, - random_state=42) + training_generator = BalancedBatchGenerator( + sparse.csr_matrix(X), + y, + batch_size=10, + keep_sparse=keep_sparse, + random_state=42, + ) for idx in range(len(training_generator)): X_batch, _ = training_generator.__getitem__(idx) if keep_sparse: @@ -78,36 +86,50 @@ def test_balanced_batch_generator_class_sparse(data, keep_sparse): def test_balanced_batch_generator_function_no_return_indices(data): - with pytest.raises(ValueError, match='needs to have an attribute'): + with pytest.raises(ValueError, match="needs to have an attribute"): balanced_batch_generator( - *data, sampler=ClusterCentroids(), batch_size=10, random_state=42) + *data, sampler=ClusterCentroids(), batch_size=10, random_state=42 + ) -@pytest.mark.filterwarnings('ignore:`wait_time` is not used') # keras 2.2.4 
+@pytest.mark.filterwarnings("ignore:`wait_time` is not used") # keras 2.2.4 @pytest.mark.parametrize( "sampler, sample_weight", - [(None, None), - (RandomOverSampler(), None), - (NearMiss(), None), - (None, np.random.uniform(size=120))] + [ + (None, None), + (RandomOverSampler(), None), + (NearMiss(), None), + (None, np.random.uniform(size=120)), + ], ) def test_balanced_batch_generator_function(data, sampler, sample_weight): X, y = data model = _build_keras_model(y.shape[1], X.shape[1]) training_generator, steps_per_epoch = balanced_batch_generator( - X, y, sample_weight=sample_weight, sampler=sampler, batch_size=10, - random_state=42) - model.fit_generator(generator=training_generator, - steps_per_epoch=steps_per_epoch, - epochs=10) + X, + y, + sample_weight=sample_weight, + sampler=sampler, + batch_size=10, + random_state=42, + ) + model.fit_generator( + generator=training_generator, + steps_per_epoch=steps_per_epoch, + epochs=10, + ) @pytest.mark.parametrize("keep_sparse", [True, False]) def test_balanced_batch_generator_function_sparse(data, keep_sparse): X, y = data training_generator, steps_per_epoch = balanced_batch_generator( - sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10, - random_state=42) + sparse.csr_matrix(X), + y, + keep_sparse=keep_sparse, + batch_size=10, + random_state=42, + ) for _ in range(steps_per_epoch): X_batch, _ = next(training_generator) if keep_sparse: diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index e44452f3b..3097a292c 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -11,7 +11,10 @@ from ._classification import classification_report_imbalanced __all__ = [ - 'sensitivity_specificity_support', 'sensitivity_score', - 'specificity_score', 'geometric_mean_score', - 'make_index_balanced_accuracy', 'classification_report_imbalanced' + "sensitivity_specificity_support", + "sensitivity_score", + "specificity_score", + "geometric_mean_score", + 
"make_index_balanced_accuracy", + "classification_report_imbalanced", ] diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index b4e3dc843..59e0f0522 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -20,8 +20,11 @@ import numpy as np import scipy as sp -from sklearn.metrics._classification import (_check_targets, _prf_divide, - precision_recall_fscore_support) +from sklearn.metrics._classification import ( + _check_targets, + _prf_divide, + precision_recall_fscore_support, +) from sklearn.preprocessing import LabelEncoder from sklearn.utils.multiclass import unique_labels @@ -31,13 +34,15 @@ from sklearn.externals.funcsigs import signature -def sensitivity_specificity_support(y_true, - y_pred, - labels=None, - pos_label=1, - average=None, - warn_for=('sensitivity', 'specificity'), - sample_weight=None): +def sensitivity_specificity_support( + y_true, + y_pred, + labels=None, + pos_label=1, + average=None, + warn_for=("sensitivity", "specificity"), + sample_weight=None, +): """Compute sensitivity, specificity, and support for each class The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number @@ -139,32 +144,38 @@ def sensitivity_specificity_support(y_true, (0.33333333333333331, 0.66666666666666663, None) """ - average_options = (None, 'micro', 'macro', 'weighted', 'samples') - if average not in average_options and average != 'binary': - raise ValueError('average has to be one of ' + str(average_options)) + average_options = (None, "micro", "macro", "weighted", "samples") + if average not in average_options and average != "binary": + raise ValueError("average has to be one of " + str(average_options)) y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) - if average == 'binary': - if y_type == 'binary': + if average == "binary": + if y_type == "binary": if pos_label not in present_labels: if len(present_labels) < 2: # 
Only negative labels - return (0., 0., 0) + return (0.0, 0.0, 0) else: - raise ValueError("pos_label=%r is not a valid label: %r" % - (pos_label, present_labels)) + raise ValueError( + "pos_label=%r is not a valid label: %r" + % (pos_label, present_labels) + ) labels = [pos_label] else: - raise ValueError("Target is %s but average='binary'. Please " - "choose another average setting." % y_type) + raise ValueError( + "Target is %s but average='binary'. Please " + "choose another average setting." % y_type + ) elif pos_label not in (None, 1): warnings.warn( "Note that pos_label (set to %r) is ignored when " "average != 'binary' (got %r). You may use " - "labels=[pos_label] to specify a single positive class." % - (pos_label, average), UserWarning) + "labels=[pos_label] to specify a single positive class." + % (pos_label, average), + UserWarning, + ) if labels is None: labels = present_labels @@ -172,17 +183,19 @@ def sensitivity_specificity_support(y_true, else: n_labels = len(labels) labels = np.hstack( - [labels, - np.setdiff1d(present_labels, labels, assume_unique=True)]) + [labels, np.setdiff1d(present_labels, labels, assume_unique=True)] + ) # Calculate tp_sum, pred_sum, true_sum ### - if y_type.startswith('multilabel'): - raise ValueError('imblearn does not support multilabel') - elif average == 'samples': - raise ValueError("Sample-based precision, recall, fscore is " - "not meaningful outside multilabel " - "classification. See the accuracy_score instead.") + if y_type.startswith("multilabel"): + raise ValueError("imblearn does not support multilabel") + elif average == "samples": + raise ValueError( + "Sample-based precision, recall, fscore is " + "not meaningful outside multilabel " + "classification. See the accuracy_score instead." 
+ ) else: le = LabelEncoder() le.fit(labels) @@ -200,16 +213,19 @@ def sensitivity_specificity_support(y_true, if len(tp_bins): tp_sum = np.bincount( - tp_bins, weights=tp_bins_weights, minlength=len(labels)) + tp_bins, weights=tp_bins_weights, minlength=len(labels) + ) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): pred_sum = np.bincount( - y_pred, weights=sample_weight, minlength=len(labels)) + y_pred, weights=sample_weight, minlength=len(labels) + ) if len(y_true): true_sum = np.bincount( - y_true, weights=sample_weight, minlength=len(labels)) + y_true, weights=sample_weight, minlength=len(labels) + ) # Compute the true negative tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) @@ -221,7 +237,7 @@ def sensitivity_specificity_support(y_true, pred_sum = pred_sum[indices] tn_sum = tn_sum[indices] - if average == 'micro': + if average == "micro": tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) true_sum = np.array([true_sum.sum()]) @@ -229,30 +245,36 @@ def sensitivity_specificity_support(y_true, # Finally, we have all our sufficient statistics. Divide! # - with np.errstate(divide='ignore', invalid='ignore'): + with np.errstate(divide="ignore", invalid="ignore"): # Divide, and on zero-division, set scores to 0 and warn: # Oddly, we may get an "invalid" rather than a "divide" error # here. 
- specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum, - 'specificity', 'predicted', average, - warn_for) - sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true', - average, warn_for) + specificity = _prf_divide( + tn_sum, + tn_sum + pred_sum - tp_sum, + "specificity", + "predicted", + average, + warn_for, + ) + sensitivity = _prf_divide( + tp_sum, true_sum, "sensitivity", "true", average, warn_for + ) # Average the results - if average == 'weighted': + if average == "weighted": weights = true_sum if weights.sum() == 0: return 0, 0, None - elif average == 'samples': + elif average == "samples": weights = sample_weight else: weights = None if average is not None: - assert average != 'binary' or len(specificity) == 1 + assert average != "binary" or len(specificity) == 1 specificity = np.average(specificity, weights=weights) sensitivity = np.average(sensitivity, weights=weights) true_sum = None # return no support @@ -260,12 +282,14 @@ def sensitivity_specificity_support(y_true, return sensitivity, specificity, true_sum -def sensitivity_score(y_true, - y_pred, - labels=None, - pos_label=1, - average='binary', - sample_weight=None): +def sensitivity_score( + y_true, + y_pred, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, +): """Compute the sensitivity The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number @@ -354,18 +378,21 @@ def sensitivity_score(y_true, labels=labels, pos_label=pos_label, average=average, - warn_for=('sensitivity', ), - sample_weight=sample_weight) + warn_for=("sensitivity",), + sample_weight=sample_weight, + ) return s -def specificity_score(y_true, - y_pred, - labels=None, - pos_label=1, - average='binary', - sample_weight=None): +def specificity_score( + y_true, + y_pred, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, +): """Compute the specificity The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number @@ -454,19 +481,22 @@ def 
specificity_score(y_true, labels=labels, pos_label=pos_label, average=average, - warn_for=('specificity', ), - sample_weight=sample_weight) + warn_for=("specificity",), + sample_weight=sample_weight, + ) return s -def geometric_mean_score(y_true, - y_pred, - labels=None, - pos_label=1, - average='multiclass', - sample_weight=None, - correction=0.0): +def geometric_mean_score( + y_true, + y_pred, + labels=None, + pos_label=1, + average="multiclass", + sample_weight=None, + correction=0.0, +): """Compute the geometric mean. The geometric mean (G-mean) is the root of the product of class-wise @@ -575,15 +605,16 @@ class is unrecognized by the classifier, G-mean resolves to zero. To array([ 0.8660254, 0. , 0. ]) """ - if average is None or average != 'multiclass': + if average is None or average != "multiclass": sen, spe, _ = sensitivity_specificity_support( y_true, y_pred, labels=labels, pos_label=pos_label, average=average, - warn_for=('specificity', 'specificity'), - sample_weight=sample_weight) + warn_for=("specificity", "specificity"), + sample_weight=sample_weight, + ) return np.sqrt(sen * spe) else: @@ -594,10 +625,12 @@ class is unrecognized by the classifier, G-mean resolves to zero. To n_labels = None else: n_labels = len(labels) - labels = np.hstack([ - labels, - np.setdiff1d(present_labels, labels, assume_unique=True) - ]) + labels = np.hstack( + [ + labels, + np.setdiff1d(present_labels, labels, assume_unique=True), + ] + ) le = LabelEncoder() le.fit(labels) @@ -616,25 +649,28 @@ class is unrecognized by the classifier, G-mean resolves to zero. 
To if len(tp_bins): tp_sum = np.bincount( - tp_bins, weights=tp_bins_weights, minlength=len(labels)) + tp_bins, weights=tp_bins_weights, minlength=len(labels) + ) else: # Pathological case true_sum = tp_sum = np.zeros(len(labels)) if len(y_true): true_sum = np.bincount( - y_true, weights=sample_weight, minlength=len(labels)) + y_true, weights=sample_weight, minlength=len(labels) + ) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] - with np.errstate(divide='ignore', invalid='ignore'): - recall = _prf_divide(tp_sum, true_sum, "recall", "true", None, - "recall") + with np.errstate(divide="ignore", invalid="ignore"): + recall = _prf_divide( + tp_sum, true_sum, "recall", "true", None, "recall" + ) recall[recall == 0] = correction - with np.errstate(divide='ignore', invalid='ignore'): + with np.errstate(divide="ignore", invalid="ignore"): gmean = sp.stats.gmean(recall) # old version of scipy return MaskedConstant instead of 0.0 if isinstance(gmean, np.ma.core.MaskedConstant): @@ -700,13 +736,17 @@ def compute_score(*args, **kwargs): tags_scoring_func = getcallargs(scoring_func, *args, **kwargs) # check that the scoring function does not need a score # and only a prediction - if ('y_score' in tags_scoring_func or - 'y_prob' in tags_scoring_func or - 'y2' in tags_scoring_func): - raise AttributeError('The function {} has an unsupported' - ' attribute. Metric with`y_pred` are the' - ' only supported metrics is the only' - ' supported.') + if ( + "y_score" in tags_scoring_func + or "y_prob" in tags_scoring_func + or "y2" in tags_scoring_func + ): + raise AttributeError( + "The function {} has an unsupported" + " attribute. Metric with`y_pred` are the" + " only supported metrics is the only" + " supported." 
+ ) # Compute the score from the scoring function _score = scoring_func(*args, **kwargs) # Square if desired @@ -720,39 +760,43 @@ def compute_score(*args, **kwargs): # Make the intersection between the parameters sel_params = params_sens_spec.intersection(set(tags_scoring_func)) # Create a sub dictionary - tags_scoring_func = {k: tags_scoring_func[k] - for k in sel_params} + tags_scoring_func = {k: tags_scoring_func[k] for k in sel_params} # Check if the metric is the geometric mean - if scoring_func.__name__ == 'geometric_mean_score': - if 'average' in tags_scoring_func: - if tags_scoring_func['average'] == 'multiclass': - tags_scoring_func['average'] = 'macro' + if scoring_func.__name__ == "geometric_mean_score": + if "average" in tags_scoring_func: + if tags_scoring_func["average"] == "multiclass": + tags_scoring_func["average"] = "macro" # We do not support multilabel so the only average supported # is binary - elif (scoring_func.__name__ == 'accuracy_score' or - scoring_func.__name__ == 'jaccard_score'): - tags_scoring_func['average'] = 'binary' + elif ( + scoring_func.__name__ == "accuracy_score" + or scoring_func.__name__ == "jaccard_score" + ): + tags_scoring_func["average"] = "binary" # Create the list of parameters through signature binding tags_sens_spec = sens_spec_sig.bind(**tags_scoring_func) # Call the sens/spec function sen, spe, _ = sensitivity_specificity_support( - *tags_sens_spec.args, **tags_sens_spec.kwargs) + *tags_sens_spec.args, **tags_sens_spec.kwargs + ) # Compute the dominance dom = sen - spe - return (1. 
+ alpha * dom) * _score + return (1.0 + alpha * dom) * _score return compute_score return decorate -def classification_report_imbalanced(y_true, - y_pred, - labels=None, - target_names=None, - sample_weight=None, - digits=2, - alpha=0.1): +def classification_report_imbalanced( + y_true, + y_pred, + labels=None, + target_names=None, + sample_weight=None, + digits=2, + alpha=0.1, +): """Build a classification report based on metrics used with imbalanced dataset @@ -826,22 +870,22 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.64\ else: labels = np.asarray(labels) - last_line_heading = 'avg / total' + last_line_heading = "avg / total" if target_names is None: - target_names = ['%s' % l for l in labels] + target_names = ["%s" % l for l in labels] name_width = max(len(cn) for cn in target_names) width = max(name_width, len(last_line_heading), digits) headers = ["pre", "rec", "spe", "f1", "geo", "iba", "sup"] - fmt = '%% %ds' % width # first column: class name - fmt += ' ' - fmt += ' '.join(['% 9s' for _ in headers]) - fmt += '\n' + fmt = "%% %ds" % width # first column: class name + fmt += " " + fmt += " ".join(["% 9s" for _ in headers]) + fmt += "\n" headers = [""] + headers report = fmt % tuple(headers) - report += '\n' + report += "\n" # Compute the different metrics # Precision/recall/f1 @@ -850,49 +894,63 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.64\ y_pred, labels=labels, average=None, - sample_weight=sample_weight) + sample_weight=sample_weight, + ) # Specificity specificity = specificity_score( y_true, y_pred, labels=labels, average=None, - sample_weight=sample_weight) + sample_weight=sample_weight, + ) # Geometric mean geo_mean = geometric_mean_score( y_true, y_pred, labels=labels, average=None, - sample_weight=sample_weight) + sample_weight=sample_weight, + ) # Index balanced accuracy - iba_gmean = make_index_balanced_accuracy( - alpha=alpha, squared=True)(geometric_mean_score) + iba_gmean = make_index_balanced_accuracy(alpha=alpha, squared=True)( + geometric_mean_score + 
) iba = iba_gmean( y_true, y_pred, labels=labels, average=None, - sample_weight=sample_weight) + sample_weight=sample_weight, + ) for i, label in enumerate(labels): values = [target_names[i]] - for v in (precision[i], recall[i], specificity[i], f1[i], geo_mean[i], - iba[i]): + for v in ( + precision[i], + recall[i], + specificity[i], + f1[i], + geo_mean[i], + iba[i], + ): values += ["{0:0.{1}f}".format(v, digits)] values += ["{}".format(support[i])] report += fmt % tuple(values) - report += '\n' + report += "\n" # compute averages values = [last_line_heading] - for v in (np.average(precision, weights=support), np.average( - recall, weights=support), np.average(specificity, weights=support), - np.average(f1, weights=support), np.average( - geo_mean, weights=support), np.average(iba, - weights=support)): + for v in ( + np.average(precision, weights=support), + np.average(recall, weights=support), + np.average(specificity, weights=support), + np.average(f1, weights=support), + np.average(geo_mean, weights=support), + np.average(iba, weights=support), + ): values += ["{0:0.{1}f}".format(v, digits)] - values += ['{}'.format(np.sum(support))] + values += ["{}".format(np.sum(support))] report += fmt % tuple(values) return report diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index a52be9419..ea3278451 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -69,7 +69,7 @@ def make_prediction(dataset=None, binary=False): X = np.c_[X, rng.randn(n_samples, 200 * n_features)] # run classifier, get class probabilities and label predictions - clf = svm.SVC(kernel='linear', probability=True, random_state=0) + clf = svm.SVC(kernel="linear", probability=True, random_state=0) probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) if binary: @@ -92,7 +92,8 @@ def test_sensitivity_specificity_score_binary(): # detailed measures for each class sen, spe, sup = 
sensitivity_specificity_support( - y_true, y_pred, average=None) + y_true, y_pred, average=None + ) assert_allclose(sen, [0.88, 0.68], rtol=R_TOL) assert_allclose(spe, [0.68, 0.88], rtol=R_TOL) assert_array_equal(sup, [25, 25]) @@ -100,7 +101,7 @@ def test_sensitivity_specificity_score_binary(): # individual scoring function that can be used for grid search: in the # binary class case the score is the value of the measure for the positive # class (e.g. label == 1). This is deprecated for average != 'binary'. - for kwargs in ({}, {'average': 'binary'}): + for kwargs in ({}, {"average": "binary"}): sen = assert_no_warnings(sensitivity_score, y_true, y_pred, **kwargs) assert sen == pytest.approx(0.68, rel=R_TOL) @@ -111,11 +112,11 @@ def test_sensitivity_specificity_score_binary(): @pytest.mark.filterwarnings("ignore:Specificity is ill-defined") @pytest.mark.parametrize( "y_pred, expected_sensitivity, expected_specificity", - [(([1, 1], [1, 1]), 1.0, 0.0), - (([-1, -1], [-1, -1]), 0.0, 0.0)] + [(([1, 1], [1, 1]), 1.0, 0.0), (([-1, -1], [-1, -1]), 0.0, 0.0)], ) def test_sensitivity_specificity_f_binary_single_class( - y_pred, expected_sensitivity, expected_specificity): + y_pred, expected_sensitivity, expected_specificity +): # Such a case may occur with non-stratified cross-validation assert sensitivity_score(*y_pred) == expected_sensitivity assert specificity_score(*y_pred) == expected_specificity @@ -123,16 +124,19 @@ def test_sensitivity_specificity_f_binary_single_class( @pytest.mark.parametrize( "average, expected_specificty", - [(None, [1., 0.67, 1., 1., 1.]), - ('macro', np.mean([1., 0.67, 1., 1., 1.])), - ('micro', 15 / 16)] + [ + (None, [1.0, 0.67, 1.0, 1.0, 1.0]), + ("macro", np.mean([1.0, 0.67, 1.0, 1.0, 1.0])), + ("micro", 15 / 16), + ], ) def test_sensitivity_specificity_extra_labels(average, expected_specificty): y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] actual = specificity_score( - y_true, y_pred, labels=[0, 1, 2, 3, 4], average=average) + y_true, 
y_pred, labels=[0, 1, 2, 3, 4], average=average + ) assert_allclose(expected_specificty, actual, rtol=R_TOL) @@ -143,18 +147,21 @@ def test_sensitivity_specificity_ignored_labels(): specificity_13 = partial(specificity_score, y_true, y_pred, labels=[1, 3]) specificity_all = partial(specificity_score, y_true, y_pred, labels=None) - assert_allclose([1., 0.33], specificity_13(average=None), rtol=R_TOL) + assert_allclose([1.0, 0.33], specificity_13(average=None), rtol=R_TOL) assert_allclose( - np.mean([1., 0.33]), specificity_13(average='macro'), rtol=R_TOL) + np.mean([1.0, 0.33]), specificity_13(average="macro"), rtol=R_TOL + ) assert_allclose( - np.average([1., .33], weights=[2., 1.]), - specificity_13(average='weighted'), - rtol=R_TOL) + np.average([1.0, 0.33], weights=[2.0, 1.0]), + specificity_13(average="weighted"), + rtol=R_TOL, + ) assert_allclose( - 3. / (3. + 2.), specificity_13(average='micro'), rtol=R_TOL) + 3.0 / (3.0 + 2.0), specificity_13(average="micro"), rtol=R_TOL + ) # ensure the above were meaningful tests: - for each in ['macro', 'weighted', 'micro']: + for each in ["macro", "weighted", "micro"]: assert specificity_13(average=each) != specificity_all(average=each) @@ -174,18 +181,20 @@ def test_sensitivity_specificity_support_errors(): # Bad pos_label with pytest.raises(ValueError): sensitivity_specificity_support( - y_true, y_pred, pos_label=2, average='binary') + y_true, y_pred, pos_label=2, average="binary" + ) # Bad average option with pytest.raises(ValueError): - sensitivity_specificity_support([0, 1, 2], [1, 2, 0], average='mega') + sensitivity_specificity_support([0, 1, 2], [1, 2, 0], average="mega") def test_sensitivity_specificity_unused_pos_label(): # but average != 'binary'; even if data is binary with warns(UserWarning, r"use labels=\[pos_label\] to specify a single"): sensitivity_specificity_support( - [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') + [1, 2, 1], [1, 2, 2], pos_label=2, average="macro" + ) def 
test_geometric_mean_support_binary(): @@ -200,14 +209,21 @@ def test_geometric_mean_support_binary(): @pytest.mark.filterwarnings("ignore:Recall is ill-defined") @pytest.mark.parametrize( "y_true, y_pred, correction, expected_gmean", - [([0, 0, 1, 1], [0, 0, 1, 1], 0.0, 1.0), - ([0, 0, 0, 0], [1, 1, 1, 1], 0.0, 0.0), - ([0, 0, 0, 0], [0, 0, 0, 0], 0.001, 1.0), - ([0, 0, 0, 0], [1, 1, 1, 1], 0.001, 0.001), - ([0, 0, 1, 1], [0, 1, 1, 0], 0.001, 0.5), - ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], 0.001, (0.001 ** 2) ** (1 / 3)), - ([0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], 0.001, 1), - ([0, 1, 1, 1, 1, 0], [0, 0, 1, 1, 1, 1], 0.001, (0.5 * 0.75) ** 0.5)] + [ + ([0, 0, 1, 1], [0, 0, 1, 1], 0.0, 1.0), + ([0, 0, 0, 0], [1, 1, 1, 1], 0.0, 0.0), + ([0, 0, 0, 0], [0, 0, 0, 0], 0.001, 1.0), + ([0, 0, 0, 0], [1, 1, 1, 1], 0.001, 0.001), + ([0, 0, 1, 1], [0, 1, 1, 0], 0.001, 0.5), + ( + [0, 1, 2, 0, 1, 2], + [0, 2, 1, 0, 0, 1], + 0.001, + (0.001 ** 2) ** (1 / 3), + ), + ([0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], 0.001, 1), + ([0, 1, 1, 1, 1, 0], [0, 0, 1, 1, 1, 1], 0.001, (0.5 * 0.75) ** 0.5), + ], ) def test_geometric_mean_multiclass(y_true, y_pred, correction, expected_gmean): gmean = geometric_mean_score(y_true, y_pred, correction=correction) @@ -217,10 +233,12 @@ def test_geometric_mean_multiclass(y_true, y_pred, correction, expected_gmean): @pytest.mark.filterwarnings("ignore:Recall is ill-defined") @pytest.mark.parametrize( "y_true, y_pred, average, expected_gmean", - [([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], 'macro', 0.471), - ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], 'micro', 0.471), - ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], 'weighted', 0.471), - ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], None, [0.8660254, 0.0, 0.0])] + [ + ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "macro", 0.471), + ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "micro", 0.471), + ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "weighted", 0.471), + ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], None, [0.8660254, 0.0, 0.0]), + ], 
) def test_geometric_mean_average(y_true, y_pred, average, expected_gmean): gmean = geometric_mean_score(y_true, y_pred, average=average) @@ -229,24 +247,45 @@ def test_geometric_mean_average(y_true, y_pred, average, expected_gmean): @pytest.mark.parametrize( "y_true, y_pred, sample_weight, average, expected_gmean", - [([0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], None, 'multiclass', 0.707), - ([0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], [1, 2, 1, 1, 2, 1], - 'multiclass', 0.707), - ([0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], [1, 2, 1, 1, 2, 1], - 'weighted', 0.333)] + [ + ([0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], None, "multiclass", 0.707), + ( + [0, 1, 2, 0, 1, 2], + [0, 1, 1, 0, 0, 1], + [1, 2, 1, 1, 2, 1], + "multiclass", + 0.707, + ), + ( + [0, 1, 2, 0, 1, 2], + [0, 1, 1, 0, 0, 1], + [1, 2, 1, 1, 2, 1], + "weighted", + 0.333, + ), + ], ) -def test_geometric_mean_sample_weight(y_true, y_pred, sample_weight, average, - expected_gmean): - gmean = geometric_mean_score(y_true, y_pred, labels=[0, 1], - sample_weight=sample_weight, - average=average) +def test_geometric_mean_sample_weight( + y_true, y_pred, sample_weight, average, expected_gmean +): + gmean = geometric_mean_score( + y_true, + y_pred, + labels=[0, 1], + sample_weight=sample_weight, + average=average, + ) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) @pytest.mark.parametrize( "average, expected_gmean", - [('multiclass', 0.41), (None, [0.85, 0.29, 0.7]), - ('macro', 0.68), ('weighted', 0.65)] + [ + ("multiclass", 0.41), + (None, [0.85, 0.29, 0.7]), + ("macro", 0.68), + ("weighted", 0.65), + ], ) def test_geometric_mean_score_prediction(average, expected_gmean): y_true, y_pred, _ = make_prediction(binary=False) @@ -258,15 +297,16 @@ def test_geometric_mean_score_prediction(average, expected_gmean): def test_iba_geo_mean_binary(): y_true, y_pred, _ = make_prediction(binary=True) - iba_gmean = make_index_balanced_accuracy( - alpha=0.5, squared=True)(geometric_mean_score) + iba_gmean = 
make_index_balanced_accuracy(alpha=0.5, squared=True)( + geometric_mean_score + ) iba = iba_gmean(y_true, y_pred) assert_allclose(iba, 0.5948, rtol=R_TOL) def _format_report(report): - return ' '.join(report.split()) + return " ".join(report.split()) def test_classification_report_imbalanced_multiclass(): @@ -274,22 +314,27 @@ def test_classification_report_imbalanced_multiclass(): y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names - expected_report = ('pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 ' - '0.81 0.85 0.72 24 versicolor 0.33 0.10 0.86 0.15 ' - '0.29 0.08 31 virginica 0.42 0.90 0.55 0.57 0.70 ' - '0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75') + expected_report = ( + "pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 " + "0.81 0.85 0.72 24 versicolor 0.33 0.10 0.86 0.15 " + "0.29 0.08 31 virginica 0.42 0.90 0.55 0.57 0.70 " + "0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75" + ) report = classification_report_imbalanced( y_true, y_pred, labels=np.arange(len(iris.target_names)), - target_names=iris.target_names) + target_names=iris.target_names, + ) assert _format_report(report) == expected_report # print classification report with label detection - expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 ' - '0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 ' - '2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total ' - '0.51 0.53 0.80 0.47 0.58 0.40 75') + expected_report = ( + "pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " + "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 " + "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total " + "0.51 0.53 0.80 0.47 0.58 0.40 75" + ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report @@ -300,24 +345,29 @@ def test_classification_report_imbalanced_multiclass_with_digits(): y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names - 
expected_report = ('pre rec spe f1 geo iba sup setosa 0.82609 0.79167 ' - '0.92157 0.80851 0.85415 0.72010 24 versicolor ' - '0.33333 0.09677 0.86364 0.15000 0.28910 0.07717 ' - '31 virginica 0.41860 0.90000 0.54545 0.57143 0.70065 ' - '0.50831 20 avg / total 0.51375 0.53333 0.79733 ' - '0.47310 0.57966 0.39788 75') + expected_report = ( + "pre rec spe f1 geo iba sup setosa 0.82609 0.79167 " + "0.92157 0.80851 0.85415 0.72010 24 versicolor " + "0.33333 0.09677 0.86364 0.15000 0.28910 0.07717 " + "31 virginica 0.41860 0.90000 0.54545 0.57143 0.70065 " + "0.50831 20 avg / total 0.51375 0.53333 0.79733 " + "0.47310 0.57966 0.39788 75" + ) report = classification_report_imbalanced( y_true, y_pred, labels=np.arange(len(iris.target_names)), target_names=iris.target_names, - digits=5) + digits=5, + ) assert _format_report(report) == expected_report # print classification report with label detection - expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 ' - '0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 ' - '2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 ' - '0.53 0.80 0.47 0.58 0.40 75') + expected_report = ( + "pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " + "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 " + "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 " + "0.53 0.80 0.47 0.58 0.40 75" + ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report @@ -328,19 +378,24 @@ def test_classification_report_imbalanced_multiclass_with_string_label(): y_true = np.array(["blue", "green", "red"])[y_true] y_pred = np.array(["blue", "green", "red"])[y_pred] - expected_report = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 ' - '0.85 0.72 24 green 0.33 0.10 0.86 0.15 0.29 0.08 31 ' - 'red 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total ' - '0.51 0.53 0.80 0.47 0.58 0.40 75') + expected_report = ( + "pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 " + "0.85 0.72 24 green 0.33 0.10 0.86 0.15 
0.29 0.08 31 " + "red 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total " + "0.51 0.53 0.80 0.47 0.58 0.40 75" + ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report - expected_report = ('pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 0.85 ' - '0.72 24 b 0.33 0.10 0.86 0.15 0.29 0.08 31 c 0.42 ' - '0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 0.53 ' - '0.80 0.47 0.58 0.40 75') + expected_report = ( + "pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 0.85 " + "0.72 24 b 0.33 0.10 0.86 0.15 0.29 0.08 31 c 0.42 " + "0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 0.53 " + "0.80 0.47 0.58 0.40 75" + ) report = classification_report_imbalanced( - y_true, y_pred, target_names=["a", "b", "c"]) + y_true, y_pred, target_names=["a", "b", "c"] + ) assert _format_report(report) == expected_report @@ -351,10 +406,12 @@ def test_classification_report_imbalanced_multiclass_with_unicode_label(): y_true = labels[y_true] y_pred = labels[y_pred] - expected_report = ('pre rec spe f1 geo iba sup blue¢ 0.83 0.79 0.92 0.81 ' - '0.85 0.72 24 green¢ 0.33 0.10 0.86 0.15 0.29 0.08 31 ' - 'red¢ 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total ' - '0.51 0.53 0.80 0.47 0.58 0.40 75') + expected_report = ( + "pre rec spe f1 geo iba sup blue¢ 0.83 0.79 0.92 0.81 " + "0.85 0.72 24 green¢ 0.33 0.10 0.86 0.15 0.29 0.08 31 " + "red¢ 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total " + "0.51 0.53 0.80 0.47 0.58 0.40 75" + ) if np_version[:3] < (1, 7, 0): with pytest.raises(RuntimeError, match="NumPy < 1.7.0"): classification_report_imbalanced(y_true, y_pred) @@ -370,10 +427,12 @@ def test_classification_report_imbalanced_multiclass_with_long_string_label(): y_true = labels[y_true] y_pred = labels[y_pred] - expected_report = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 ' - '0.85 0.72 24 greengreengreengreengreen 0.33 0.10 ' - '0.86 0.15 0.29 0.08 31 red 0.42 0.90 0.55 0.57 0.70 ' - '0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75') + 
expected_report = ( + "pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 " + "0.85 0.72 24 greengreengreengreengreen 0.33 0.10 " + "0.86 0.15 0.29 0.08 31 red 0.42 0.90 0.55 0.57 0.70 " + "0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75" + ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report @@ -381,8 +440,12 @@ def test_classification_report_imbalanced_multiclass_with_long_string_label(): @pytest.mark.parametrize( "score, expected_score", - [(accuracy_score, 0.54756), (jaccard_score, 0.33176), - (precision_score, 0.65025), (recall_score, 0.41616)] + [ + (accuracy_score, 0.54756), + (jaccard_score, 0.33176), + (precision_score, 0.65025), + (recall_score, 0.41616), + ], ) def test_iba_sklearn_metrics(score, expected_score): y_true, y_pred, _ = make_prediction(binary=True) @@ -394,13 +457,16 @@ def test_iba_sklearn_metrics(score, expected_score): @pytest.mark.parametrize( "score_loss", - [average_precision_score, brier_score_loss, - cohen_kappa_score, roc_auc_score] + [ + average_precision_score, + brier_score_loss, + cohen_kappa_score, + roc_auc_score, + ], ) def test_iba_error_y_score_prob_error(score_loss): y_true, y_pred, _ = make_prediction(binary=True) - aps = make_index_balanced_accuracy( - alpha=0.5, squared=True)(score_loss) + aps = make_index_balanced_accuracy(alpha=0.5, squared=True)(score_loss) with pytest.raises(AttributeError): aps(y_true, y_pred) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index 8cd42b375..4b7636e03 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -11,9 +11,12 @@ from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV -from imblearn.metrics import (sensitivity_score, specificity_score, - geometric_mean_score, - make_index_balanced_accuracy) +from imblearn.metrics import ( + sensitivity_score, + 
specificity_score, + geometric_mean_score, + make_index_balanced_accuracy, +) R_TOL = 1e-2 @@ -27,18 +30,25 @@ def data(): @pytest.mark.filterwarnings("ignore:Liblinear failed to converge") @pytest.mark.parametrize( "score, expected_score", - [(sensitivity_score, 0.92), - (specificity_score, 0.92), - (geometric_mean_score, 0.92), - (make_index_balanced_accuracy()(geometric_mean_score), 0.85)] + [ + (sensitivity_score, 0.92), + (specificity_score, 0.92), + (geometric_mean_score, 0.92), + (make_index_balanced_accuracy()(geometric_mean_score), 0.85), + ], ) -@pytest.mark.parametrize("average", ['macro', 'weighted', 'micro']) +@pytest.mark.parametrize("average", ["macro", "weighted", "micro"]) def test_scorer_common_average(data, score, expected_score, average): X_train, X_test, y_train, _ = data scorer = make_scorer(score, pos_label=None, average=average) - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer, cv=3, iid=False) + grid = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [1, 10]}, + scoring=scorer, + cv=3, + iid=False, + ) grid.fit(X_train, y_train).predict(X_test) assert grid.best_score_ == pytest.approx(expected_score, rel=R_TOL) @@ -47,18 +57,28 @@ def test_scorer_common_average(data, score, expected_score, average): @pytest.mark.filterwarnings("ignore:Liblinear failed to converge") @pytest.mark.parametrize( "score, average, expected_score", - [(sensitivity_score, 'binary', 0.92), - (specificity_score, 'binary', 0.95), - (geometric_mean_score, 'multiclass', 0.92), - (make_index_balanced_accuracy()(geometric_mean_score), - 'multiclass', 0.84)] + [ + (sensitivity_score, "binary", 0.92), + (specificity_score, "binary", 0.95), + (geometric_mean_score, "multiclass", 0.92), + ( + make_index_balanced_accuracy()(geometric_mean_score), + "multiclass", + 0.84, + ), + ], ) def test_scorer_default_average(data, score, average, expected_score): X_train, X_test, y_train, _ = data scorer = make_scorer(score, 
pos_label=1, average=average) - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer, cv=3, iid=False) + grid = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [1, 10]}, + scoring=scorer, + cv=3, + iid=False, + ) grid.fit(X_train, y_train).predict(X_test) assert grid.best_score_ == pytest.approx(expected_score, rel=R_TOL) diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py index 63abf3dc0..bd20b76ea 100644 --- a/imblearn/over_sampling/__init__.py +++ b/imblearn/over_sampling/__init__.py @@ -11,5 +11,12 @@ from ._smote import SVMSMOTE from ._smote import SMOTENC -__all__ = ['ADASYN', 'RandomOverSampler', 'KMeansSMOTE', - 'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE', 'SMOTENC'] +__all__ = [ + "ADASYN", + "RandomOverSampler", + "KMeansSMOTE", + "SMOTE", + "BorderlineSMOTE", + "SVMSMOTE", + "SMOTENC", +] diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index a4b3ecfc7..09e2c9c15 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -17,7 +17,8 @@ @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class ADASYN(BaseOverSampler): """Perform over-sampling using Adaptive Synthetic (ADASYN) sampling approach for imbalanced datasets. 
@@ -76,11 +77,13 @@ class ADASYN(BaseOverSampler): """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - n_neighbors=5, - n_jobs=1): + def __init__( + self, + sampling_strategy="auto", + random_state=None, + n_neighbors=5, + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.n_neighbors = n_neighbors @@ -89,8 +92,9 @@ def __init__(self, def _validate_estimator(self): """Create the necessary objects for ADASYN""" self.nn_ = check_neighbors_object( - 'n_neighbors', self.n_neighbors, additional_neighbor=1) - self.nn_.set_params(**{'n_jobs': self.n_jobs}) + "n_neighbors", self.n_neighbors, additional_neighbor=1 + ) + self.nn_.set_params(**{"n_jobs": self.n_jobs}) def _fit_resample(self, X, y): self._validate_estimator() @@ -110,19 +114,24 @@ def _fit_resample(self, X, y): # The ratio is computed using a one-vs-rest manner. Using majority # in multi-class would lead to slightly different results at the # cost of introducing a new parameter. - ratio_nn = (np.sum(y[nn_index[:, 1:]] != class_sample, axis=1) / - (self.nn_.n_neighbors - 1)) + ratio_nn = np.sum(y[nn_index[:, 1:]] != class_sample, axis=1) / ( + self.nn_.n_neighbors - 1 + ) if not np.sum(ratio_nn): - raise RuntimeError('Not any neigbours belong to the majority' - ' class. This case will induce a NaN case' - ' with a division by zero. ADASYN is not' - ' suited for this specific dataset.' - ' Use SMOTE instead.') + raise RuntimeError( + "Not any neigbours belong to the majority" + " class. This case will induce a NaN case" + " with a division by zero. ADASYN is not" + " suited for this specific dataset." + " Use SMOTE instead." + ) ratio_nn /= np.sum(ratio_nn) n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) if not np.sum(n_samples_generate): - raise ValueError("No samples will be generated with the" - " provided ratio settings.") + raise ValueError( + "No samples will be generated with the" + " provided ratio settings." 
+ ) # the nearest neighbors need to be fitted only on the current class # to find the class NN to generate new samples @@ -132,44 +141,56 @@ def _fit_resample(self, X, y): if sparse.issparse(X): row_indices, col_indices, samples = [], [], [] n_samples_generated = 0 - for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, - n_samples_generate): + for x_i, x_i_nn, num_sample_i in zip( + X_class, nn_index, n_samples_generate + ): if num_sample_i == 0: continue nn_zs = random_state.randint( - 1, high=self.nn_.n_neighbors, size=num_sample_i) + 1, high=self.nn_.n_neighbors, size=num_sample_i + ) steps = random_state.uniform(size=len(nn_zs)) if x_i.nnz: for step, nn_z in zip(steps, nn_zs): - sample = (x_i + step * - (X_class[x_i_nn[nn_z], :] - x_i)) - row_indices += ( - [n_samples_generated] * len(sample.indices)) + sample = x_i + step * ( + X_class[x_i_nn[nn_z], :] - x_i + ) + row_indices += [n_samples_generated] * len( + sample.indices + ) col_indices += sample.indices.tolist() samples += sample.data.tolist() n_samples_generated += 1 - X_new = (sparse.csr_matrix( + X_new = sparse.csr_matrix( (samples, (row_indices, col_indices)), - [np.sum(n_samples_generate), X.shape[1]], dtype=X.dtype)) - y_new = np.array([class_sample] * np.sum(n_samples_generate), - dtype=y.dtype) + [np.sum(n_samples_generate), X.shape[1]], + dtype=X.dtype, + ) + y_new = np.array( + [class_sample] * np.sum(n_samples_generate), dtype=y.dtype + ) else: x_class_gen = [] - for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, - n_samples_generate): + for x_i, x_i_nn, num_sample_i in zip( + X_class, nn_index, n_samples_generate + ): if num_sample_i == 0: continue nn_zs = random_state.randint( - 1, high=self.nn_.n_neighbors, size=num_sample_i) + 1, high=self.nn_.n_neighbors, size=num_sample_i + ) steps = random_state.uniform(size=len(nn_zs)) - x_class_gen.append([ - x_i + step * (X_class[x_i_nn[nn_z], :] - x_i) - for step, nn_z in zip(steps, nn_zs) - ]) + x_class_gen.append( + [ + x_i + step * 
(X_class[x_i_nn[nn_z], :] - x_i) + for step, nn_z in zip(steps, nn_zs) + ] + ) X_new = np.concatenate(x_class_gen).astype(X.dtype) - y_new = np.array([class_sample] * np.sum(n_samples_generate), - dtype=y.dtype) + y_new = np.array( + [class_sample] * np.sum(n_samples_generate), dtype=y.dtype + ) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 0a06fd310..e5bb757c3 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -18,7 +18,8 @@ @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class RandomOverSampler(BaseOverSampler): """Class to perform random over-sampling. @@ -65,15 +66,14 @@ class RandomOverSampler(BaseOverSampler): """ - def __init__(self, sampling_strategy='auto', - random_state=None): + def __init__(self, sampling_strategy="auto", random_state=None): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state @staticmethod def _check_X_y(X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) + X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) return X, y, binarize_y def _fit_resample(self, X, y): @@ -85,15 +85,18 @@ def _fit_resample(self, X, y): for class_sample, num_samples in self.sampling_strategy_.items(): target_class_indices = np.flatnonzero(y == class_sample) indices = random_state.randint( - low=0, high=target_stats[class_sample], size=num_samples) + low=0, high=target_stats[class_sample], size=num_samples + ) - sample_indices = np.append(sample_indices, - target_class_indices[indices]) + sample_indices = np.append( + sample_indices, target_class_indices[indices] + ) self.sample_indices_ = 
np.array(sample_indices) - return (safe_indexing(X, sample_indices), - safe_indexing(y, sample_indices)) + return ( + safe_indexing(X, sample_indices), + safe_indexing(y, sample_indices), + ) def _more_tags(self): - return {'X_types': ['2darray', 'string'], - 'sample_indices': True} + return {"X_types": ["2darray", "string"], "sample_indices": True} diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 64ec1afe8..9e8c15022 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -34,11 +34,14 @@ class BaseSMOTE(BaseOverSampler): """Base class for the different SMOTE algorithms.""" - def __init__(self, - sampling_strategy='auto', - random_state=None, - k_neighbors=5, - n_jobs=1): + + def __init__( + self, + sampling_strategy="auto", + random_state=None, + k_neighbors=5, + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.k_neighbors = k_neighbors @@ -49,17 +52,13 @@ def _validate_estimator(self): algorithms. """ self.nn_k_ = check_neighbors_object( - 'k_neighbors', self.k_neighbors, additional_neighbor=1) - self.nn_k_.set_params(**{'n_jobs': self.n_jobs}) - - def _make_samples(self, - X, - y_dtype, - y_type, - nn_data, - nn_num, - n_samples, - step_size=1.): + "k_neighbors", self.k_neighbors, additional_neighbor=1 + ) + self.nn_k_.set_params(**{"n_jobs": self.n_jobs}) + + def _make_samples( + self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0 + ): """A support function that returns artificial samples constructed along the line connecting nearest neighbours. 
@@ -99,7 +98,8 @@ def _make_samples(self, """ random_state = check_random_state(self.random_state) samples_indices = random_state.randint( - low=0, high=len(nn_num.flatten()), size=n_samples) + low=0, high=len(nn_num.flatten()), size=n_samples + ) steps = step_size * random_state.uniform(size=n_samples) rows = np.floor_divide(samples_indices, nn_num.shape[1]) cols = np.mod(samples_indices, nn_num.shape[1]) @@ -110,20 +110,26 @@ def _make_samples(self, row_indices, col_indices, samples = [], [], [] for i, (row, col, step) in enumerate(zip(rows, cols, steps)): if X[row].nnz: - sample = self._generate_sample(X, nn_data, nn_num, - row, col, step) + sample = self._generate_sample( + X, nn_data, nn_num, row, col, step + ) row_indices += [i] * len(sample.indices) col_indices += sample.indices.tolist() samples += sample.data.tolist() - return (sparse.csr_matrix((samples, (row_indices, col_indices)), - [len(samples_indices), X.shape[1]], - dtype=X.dtype), - y_new) + return ( + sparse.csr_matrix( + (samples, (row_indices, col_indices)), + [len(samples_indices), X.shape[1]], + dtype=X.dtype, + ), + y_new, + ) else: X_new = np.zeros((n_samples, X.shape[1]), dtype=X.dtype) for i, (row, col, step) in enumerate(zip(rows, cols, steps)): - X_new[i] = self._generate_sample(X, nn_data, nn_num, - row, col, step) + X_new[i] = self._generate_sample( + X, nn_data, nn_num, row, col, step + ) return X_new, y_new def _generate_sample(self, X, nn_data, nn_num, row, col, step): @@ -169,8 +175,9 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step): """ return X[row] - step * (X[row] - nn_data[nn_num[row, col]]) - def _in_danger_noise(self, nn_estimator, samples, target_class, y, - kind='danger'): + def _in_danger_noise( + self, nn_estimator, samples, target_class, y, kind="danger" + ): """Estimate if a set of sample are in danger or noise. Used by BorderlineSMOTE and SVMSMOTE. 
@@ -207,11 +214,13 @@ def _in_danger_noise(self, nn_estimator, samples, target_class, y, nn_label = (y[x] != target_class).astype(int) n_maj = np.sum(nn_label, axis=1) - if kind == 'danger': + if kind == "danger": # Samples are in danger for m/2 <= m' < m - return np.bitwise_and(n_maj >= (nn_estimator.n_neighbors - 1) / 2, - n_maj < nn_estimator.n_neighbors - 1) - elif kind == 'noise': + return np.bitwise_and( + n_maj >= (nn_estimator.n_neighbors - 1) / 2, + n_maj < nn_estimator.n_neighbors - 1, + ) + elif kind == "noise": # Samples are noise for m = m' return n_maj == nn_estimator.n_neighbors - 1 else: @@ -220,7 +229,8 @@ def _in_danger_noise(self, nn_estimator, samples, target_class, y, @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class BorderlineSMOTE(BaseSMOTE): """Over-sampling using Borderline SMOTE. @@ -301,28 +311,36 @@ class BorderlineSMOTE(BaseSMOTE): """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - k_neighbors=5, - n_jobs=1, - m_neighbors=10, - kind='borderline-1'): + def __init__( + self, + sampling_strategy="auto", + random_state=None, + k_neighbors=5, + n_jobs=1, + m_neighbors=10, + kind="borderline-1", + ): super().__init__( - sampling_strategy=sampling_strategy, random_state=random_state, - k_neighbors=k_neighbors, n_jobs=n_jobs) + sampling_strategy=sampling_strategy, + random_state=random_state, + k_neighbors=k_neighbors, + n_jobs=n_jobs, + ) self.m_neighbors = m_neighbors self.kind = kind def _validate_estimator(self): super()._validate_estimator() self.nn_m_ = check_neighbors_object( - 'm_neighbors', self.m_neighbors, additional_neighbor=1) - self.nn_m_.set_params(**{'n_jobs': self.n_jobs}) - if self.kind not in ('borderline-1', 'borderline-2'): - raise ValueError('The possible "kind" of algorithm are ' - '"borderline-1" and "borderline-2".' 
- 'Got {} instead.'.format(self.kind)) + "m_neighbors", self.m_neighbors, additional_neighbor=1 + ) + self.nn_m_.set_params(**{"n_jobs": self.n_jobs}) + if self.kind not in ("borderline-1", "borderline-2"): + raise ValueError( + 'The possible "kind" of algorithm are ' + '"borderline-1" and "borderline-2".' + "Got {} instead.".format(self.kind) + ) def _fit_resample(self, X, y): self._validate_estimator() @@ -338,27 +356,34 @@ def _fit_resample(self, X, y): self.nn_m_.fit(X) danger_index = self._in_danger_noise( - self.nn_m_, X_class, class_sample, y, kind='danger') + self.nn_m_, X_class, class_sample, y, kind="danger" + ) if not any(danger_index): continue self.nn_k_.fit(X_class) - nns = self.nn_k_.kneighbors(safe_indexing(X_class, danger_index), - return_distance=False)[:, 1:] + nns = self.nn_k_.kneighbors( + safe_indexing(X_class, danger_index), return_distance=False + )[:, 1:] # divergence between borderline-1 and borderline-2 - if self.kind == 'borderline-1': + if self.kind == "borderline-1": # Create synthetic samples for borderline points. X_new, y_new = self._make_samples( - safe_indexing(X_class, danger_index), y.dtype, - class_sample, X_class, nns, n_samples) + safe_indexing(X_class, danger_index), + y.dtype, + class_sample, + X_class, + nns, + n_samples, + ) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) else: X_resampled = np.vstack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) - elif self.kind == 'borderline-2': + elif self.kind == "borderline-2": random_state = check_random_state(self.random_state) fractions = random_state.beta(10, 10) @@ -370,7 +395,8 @@ def _fit_resample(self, X, y): X_class, nns, int(fractions * (n_samples + 1)), - step_size=1.) 
+ step_size=1.0, + ) # we use a one-vs-rest policy to handle the multiclass in which # new samples will be created considering not only the majority @@ -382,11 +408,13 @@ def _fit_resample(self, X, y): safe_indexing(X, np.flatnonzero(y != class_sample)), nns, int((1 - fractions) * n_samples), - step_size=0.5) + step_size=0.5, + ) if sparse.issparse(X_resampled): X_resampled = sparse.vstack( - [X_resampled, X_new_1, X_new_2]) + [X_resampled, X_new_1, X_new_2] + ) else: X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) y_resampled = np.hstack((y_resampled, y_new_1, y_new_2)) @@ -396,7 +424,8 @@ def _fit_resample(self, X, y): @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class SVMSMOTE(BaseSMOTE): """Over-sampling using SVM-SMOTE. @@ -478,17 +507,22 @@ class SVMSMOTE(BaseSMOTE): """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - k_neighbors=5, - n_jobs=1, - m_neighbors=10, - svm_estimator=None, - out_step=0.5): + def __init__( + self, + sampling_strategy="auto", + random_state=None, + k_neighbors=5, + n_jobs=1, + m_neighbors=10, + svm_estimator=None, + out_step=0.5, + ): super().__init__( - sampling_strategy=sampling_strategy, random_state=random_state, - k_neighbors=k_neighbors, n_jobs=n_jobs) + sampling_strategy=sampling_strategy, + random_state=random_state, + k_neighbors=k_neighbors, + n_jobs=n_jobs, + ) self.m_neighbors = m_neighbors self.svm_estimator = svm_estimator self.out_step = out_step @@ -496,17 +530,18 @@ def __init__(self, def _validate_estimator(self): super()._validate_estimator() self.nn_m_ = check_neighbors_object( - 'm_neighbors', self.m_neighbors, additional_neighbor=1) - self.nn_m_.set_params(**{'n_jobs': self.n_jobs}) + "m_neighbors", self.m_neighbors, additional_neighbor=1 + ) + self.nn_m_.set_params(**{"n_jobs": self.n_jobs}) if self.svm_estimator is None: - self.svm_estimator_ = SVC(gamma='scale', 
- random_state=self.random_state) + self.svm_estimator_ = SVC( + gamma="scale", random_state=self.random_state + ) elif isinstance(self.svm_estimator, SVC): self.svm_estimator_ = clone(self.svm_estimator) else: - raise_isinstance_error('svm_estimator', [SVC], - self.svm_estimator) + raise_isinstance_error("svm_estimator", [SVC], self.svm_estimator) def _fit_resample(self, X, y): self._validate_estimator() @@ -521,17 +556,21 @@ def _fit_resample(self, X, y): X_class = safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) - support_index = self.svm_estimator_.support_[y[ - self.svm_estimator_.support_] == class_sample] + support_index = self.svm_estimator_.support_[ + y[self.svm_estimator_.support_] == class_sample + ] support_vector = safe_indexing(X, support_index) self.nn_m_.fit(X) noise_bool = self._in_danger_noise( - self.nn_m_, support_vector, class_sample, y, kind='noise') + self.nn_m_, support_vector, class_sample, y, kind="noise" + ) support_vector = safe_indexing( - support_vector, np.flatnonzero(np.logical_not(noise_bool))) + support_vector, np.flatnonzero(np.logical_not(noise_bool)) + ) danger_bool = self._in_danger_noise( - self.nn_m_, support_vector, class_sample, y, kind='danger') + self.nn_m_, support_vector, class_sample, y, kind="danger" + ) safety_bool = np.logical_not(danger_bool) self.nn_k_.fit(X_class) @@ -540,7 +579,8 @@ def _fit_resample(self, X, y): if np.count_nonzero(danger_bool) > 0: nns = self.nn_k_.kneighbors( safe_indexing(support_vector, np.flatnonzero(danger_bool)), - return_distance=False)[:, 1:] + return_distance=False, + )[:, 1:] X_new_1, y_new_1 = self._make_samples( safe_indexing(support_vector, np.flatnonzero(danger_bool)), @@ -549,12 +589,14 @@ def _fit_resample(self, X, y): X_class, nns, n_generated_samples, - step_size=1.) 
+ step_size=1.0, + ) if np.count_nonzero(safety_bool) > 0: nns = self.nn_k_.kneighbors( safe_indexing(support_vector, np.flatnonzero(safety_bool)), - return_distance=False)[:, 1:] + return_distance=False, + )[:, 1:] X_new_2, y_new_2 = self._make_samples( safe_indexing(support_vector, np.flatnonzero(safety_bool)), @@ -563,17 +605,22 @@ def _fit_resample(self, X, y): X_class, nns, n_samples - n_generated_samples, - step_size=-self.out_step) + step_size=-self.out_step, + ) - if (np.count_nonzero(danger_bool) > 0 and - np.count_nonzero(safety_bool) > 0): + if ( + np.count_nonzero(danger_bool) > 0 + and np.count_nonzero(safety_bool) > 0 + ): if sparse.issparse(X_resampled): X_resampled = sparse.vstack( - [X_resampled, X_new_1, X_new_2]) + [X_resampled, X_new_1, X_new_2] + ) else: X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) y_resampled = np.concatenate( - (y_resampled, y_new_1, y_new_2), axis=0) + (y_resampled, y_new_1, y_new_2), axis=0 + ) elif np.count_nonzero(danger_bool) == 0: if sparse.issparse(X_resampled): X_resampled = sparse.vstack([X_resampled, X_new_2]) @@ -592,7 +639,8 @@ def _fit_resample(self, X, y): @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class SMOTE(BaseSMOTE): """Class to perform over-sampling using SMOTE. 
@@ -657,16 +705,19 @@ class SMOTE(BaseSMOTE): Resampled dataset shape Counter({{0: 900, 1: 900}}) """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - k_neighbors=5, - n_jobs=1): + + def __init__( + self, + sampling_strategy="auto", + random_state=None, + k_neighbors=5, + n_jobs=1, + ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, - n_jobs=n_jobs + n_jobs=n_jobs, ) def _fit_resample(self, X, y): @@ -683,12 +734,13 @@ def _fit_resample(self, X, y): self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:] - X_new, y_new = self._make_samples(X_class, y.dtype, class_sample, - X_class, nns, n_samples, 1.0) + X_new, y_new = self._make_samples( + X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0 + ) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) - sparse_func = 'tocsc' if X.format == 'csc' else 'tocsr' + sparse_func = "tocsc" if X.format == "csc" else "tocsr" X_resampled = getattr(X_resampled, sparse_func)() else: X_resampled = np.vstack((X_resampled, X_new)) @@ -823,11 +875,19 @@ class SMOTENC(SMOTE): """ - def __init__(self, categorical_features, sampling_strategy='auto', - random_state=None, k_neighbors=5, n_jobs=1): - super().__init__(sampling_strategy=sampling_strategy, - random_state=random_state, - k_neighbors=k_neighbors) + def __init__( + self, + categorical_features, + sampling_strategy="auto", + random_state=None, + k_neighbors=5, + n_jobs=1, + ): + super().__init__( + sampling_strategy=sampling_strategy, + random_state=random_state, + k_neighbors=k_neighbors, + ) self.categorical_features = categorical_features @staticmethod @@ -836,23 +896,29 @@ def _check_X_y(X, y): features. 
""" y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) + X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) return X, y, binarize_y def _validate_estimator(self): super()._validate_estimator() categorical_features = np.asarray(self.categorical_features) - if categorical_features.dtype.name == 'bool': + if categorical_features.dtype.name == "bool": self.categorical_features_ = np.flatnonzero(categorical_features) else: - if any([cat not in np.arange(self.n_features_) - for cat in categorical_features]): + if any( + [ + cat not in np.arange(self.n_features_) + for cat in categorical_features + ] + ): raise ValueError( - 'Some of the categorical indices are out of range. Indices' - ' should be between 0 and {}'.format(self.n_features_)) + "Some of the categorical indices are out of range. Indices" + " should be between 0 and {}".format(self.n_features_) + ) self.categorical_features_ = categorical_features - self.continuous_features_ = np.setdiff1d(np.arange(self.n_features_), - self.categorical_features_) + self.continuous_features_ = np.setdiff1d( + np.arange(self.n_features_), self.categorical_features_ + ) def _fit_resample(self, X, y): self.n_features_ = X.shape[1] @@ -863,12 +929,13 @@ def _fit_resample(self, X, y): class_minority = min(target_stats, key=target_stats.get) X_continuous = X[:, self.continuous_features_] - X_continuous = check_array(X_continuous, accept_sparse=['csr', 'csc']) - X_minority = safe_indexing(X_continuous, - np.flatnonzero(y == class_minority)) + X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"]) + X_minority = safe_indexing( + X_continuous, np.flatnonzero(y == class_minority) + ) if sparse.issparse(X): - if X.format == 'csr': + if X.format == "csr": _, var = csr_mean_variance_axis0(X_minority) else: _, var = csc_mean_variance_axis0(X_minority) @@ -877,42 +944,50 @@ def _fit_resample(self, X, y): self.median_std_ = 
np.median(np.sqrt(var)) X_categorical = X[:, self.categorical_features_] - if X_continuous.dtype.name != 'object': + if X_continuous.dtype.name != "object": dtype_ohe = X_continuous.dtype else: dtype_ohe = np.float64 - self.ohe_ = OneHotEncoder(sparse=True, handle_unknown='ignore', - dtype=dtype_ohe) + self.ohe_ = OneHotEncoder( + sparse=True, handle_unknown="ignore", dtype=dtype_ohe + ) # the input of the OneHotEncoder needs to be dense X_ohe = self.ohe_.fit_transform( - X_categorical.toarray() if sparse.issparse(X_categorical) - else X_categorical) + X_categorical.toarray() + if sparse.issparse(X_categorical) + else X_categorical + ) # we can replace the 1 entries of the categorical features with the # median of the standard deviation. It will ensure that whenever # distance is computed between 2 samples, the difference will be equal # to the median of the standard deviation as in the original paper. - X_ohe.data = (np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * - self.median_std_ / 2) - X_encoded = sparse.hstack((X_continuous, X_ohe), format='csr') + X_ohe.data = ( + np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2 + ) + X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr") - X_resampled, y_resampled = super()._fit_resample( - X_encoded, y) + X_resampled, y_resampled = super()._fit_resample(X_encoded, y) # reverse the encoding of the categorical features - X_res_cat = X_resampled[:, self.continuous_features_.size:] + X_res_cat = X_resampled[:, self.continuous_features_.size :] X_res_cat.data = np.ones_like(X_res_cat.data) X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat) if sparse.issparse(X): X_resampled = sparse.hstack( - (X_resampled[:, :self.continuous_features_.size], - X_res_cat_dec), format='csr' + ( + X_resampled[:, : self.continuous_features_.size], + X_res_cat_dec, + ), + format="csr", ) else: X_resampled = np.hstack( - (X_resampled[:, :self.continuous_features_.size].toarray(), - X_res_cat_dec) + ( + X_resampled[:, : 
self.continuous_features_.size].toarray(), + X_res_cat_dec, + ) ) indices_reordered = np.argsort( @@ -939,25 +1014,31 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step): of the majority class. """ rng = check_random_state(self.random_state) - sample = super()._generate_sample(X, nn_data, nn_num, - row, col, step) + sample = super()._generate_sample(X, nn_data, nn_num, row, col, step) # To avoid conversion and since there is only few samples used, we # convert those samples to dense array. - sample = (sample.toarray().squeeze() - if sparse.issparse(sample) else sample) + sample = ( + sample.toarray().squeeze() if sparse.issparse(sample) else sample + ) all_neighbors = nn_data[nn_num[row]] - all_neighbors = (all_neighbors.toarray() - if sparse.issparse(all_neighbors) else all_neighbors) + all_neighbors = ( + all_neighbors.toarray() + if sparse.issparse(all_neighbors) + else all_neighbors + ) - categories_size = ([self.continuous_features_.size] + - [cat.size for cat in self.ohe_.categories_]) + categories_size = [self.continuous_features_.size] + [ + cat.size for cat in self.ohe_.categories_ + ] - for start_idx, end_idx in zip(np.cumsum(categories_size)[:-1], - np.cumsum(categories_size)[1:]): + for start_idx, end_idx in zip( + np.cumsum(categories_size)[:-1], np.cumsum(categories_size)[1:] + ): col_max = all_neighbors[:, start_idx:end_idx].sum(axis=0) # tie breaking argmax - col_sel = rng.choice(np.flatnonzero( - np.isclose(col_max, col_max.max()))) + col_sel = rng.choice( + np.flatnonzero(np.isclose(col_max, col_max.max())) + ) sample[start_idx:end_idx] = 0 sample[start_idx + col_sel] = 1 @@ -966,7 +1047,8 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step): @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class KMeansSMOTE(BaseSMOTE): """Apply a KMeans clustering before to over-sample using SMOTE. 
@@ -1046,17 +1128,23 @@ class KMeansSMOTE(BaseSMOTE): More 0 samples: True """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - k_neighbors=2, - n_jobs=1, - kmeans_estimator=None, - cluster_balance_threshold="auto", - density_exponent="auto"): + + def __init__( + self, + sampling_strategy="auto", + random_state=None, + k_neighbors=2, + n_jobs=1, + kmeans_estimator=None, + cluster_balance_threshold="auto", + density_exponent="auto", + ): super().__init__( - sampling_strategy=sampling_strategy, random_state=random_state, - k_neighbors=k_neighbors, n_jobs=n_jobs) + sampling_strategy=sampling_strategy, + random_state=random_state, + k_neighbors=k_neighbors, + n_jobs=n_jobs, + ) self.kmeans_estimator = kmeans_estimator self.cluster_balance_threshold = cluster_balance_threshold self.density_exponent = density_exponent @@ -1065,18 +1153,20 @@ def _validate_estimator(self): super()._validate_estimator() if self.kmeans_estimator is None: self.kmeans_estimator_ = MiniBatchKMeans( - random_state=self.random_state) + random_state=self.random_state + ) elif isinstance(self.kmeans_estimator, int): self.kmeans_estimator_ = MiniBatchKMeans( n_clusters=self.kmeans_estimator, - random_state=self.random_state) + random_state=self.random_state, + ) else: self.kmeans_estimator_ = clone(self.kmeans_estimator) # validate the parameters - for param_name in ('cluster_balance_threshold', 'density_exponent'): + for param_name in ("cluster_balance_threshold", "density_exponent"): param = getattr(self, param_name) - if isinstance(param, str) and param != 'auto': + if isinstance(param, str) and param != "auto": raise ValueError( "'{}' should be 'auto' when a string is passed. 
" "Got {} instead.".format(param_name, repr(param)) @@ -1084,22 +1174,26 @@ def _validate_estimator(self): self.cluster_balance_threshold_ = ( self.cluster_balance_threshold - if self.kmeans_estimator_.n_clusters != 1 else -np.inf + if self.kmeans_estimator_.n_clusters != 1 + else -np.inf ) def _find_cluster_sparsity(self, X): """Compute the cluster sparsity.""" - euclidean_distances = pairwise_distances(X, metric="euclidean", - n_jobs=self.n_jobs) + euclidean_distances = pairwise_distances( + X, metric="euclidean", n_jobs=self.n_jobs + ) # negate diagonal elements for ind in range(X.shape[0]): euclidean_distances[ind, ind] = 0 non_diag_elements = (X.shape[0] ** 2) - X.shape[0] mean_distance = euclidean_distances.sum() / non_diag_elements - exponent = (math.log(X.shape[0], 1.6) ** 1.8 * 0.16 - if self.density_exponent == 'auto' - else self.density_exponent) + exponent = ( + math.log(X.shape[0], 1.6) ** 1.8 * 0.16 + if self.density_exponent == "auto" + else self.density_exponent + ) return (mean_distance ** exponent) / X.shape[0] def _fit_resample(self, X, y): @@ -1159,7 +1253,8 @@ def _fit_resample(self, X, y): "No clusters found with sufficient samples of " "class {}. 
Try lowering the cluster_balance_threshold " "or increasing the number of " - "clusters.".format(class_sample)) + "clusters.".format(class_sample) + ) for valid_cluster_idx, valid_cluster in enumerate(valid_clusters): X_cluster = safe_indexing(X, valid_cluster) @@ -1170,20 +1265,23 @@ def _fit_resample(self, X, y): ) self.nn_k_.fit(X_cluster_class) - nns = self.nn_k_.kneighbors(X_cluster_class, - return_distance=False)[:, 1:] + nns = self.nn_k_.kneighbors( + X_cluster_class, return_distance=False + )[:, 1:] - cluster_n_samples = int(math.ceil( - n_samples * cluster_weights[valid_cluster_idx]) + cluster_n_samples = int( + math.ceil(n_samples * cluster_weights[valid_cluster_idx]) ) - X_new, y_new = self._make_samples(X_cluster_class, - y.dtype, - class_sample, - X_cluster_class, - nns, - cluster_n_samples, - 1.0) + X_new, y_new = self._make_samples( + X_cluster_class, + y.dtype, + class_sample, + X_cluster_class, + nns, + cluster_n_samples, + 1.0, + ) stack = [np.vstack, sparse.vstack][int(sparse.issparse(X_new))] X_resampled = stack((X_resampled, X_new)) diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index 0d98e9a41..aed0041db 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -15,10 +15,9 @@ class BaseOverSampler(BaseSampler): instead. """ - _sampling_type = 'over-sampling' + _sampling_type = "over-sampling" - _sampling_strategy_docstring = \ - """sampling_strategy : float, str, dict or callable, (default='auto') + _sampling_strategy_docstring = """sampling_strategy : float, str, dict or callable, (default='auto') Sampling information to resample the data set. 
- When ``float``, it corresponds to the desired ratio of the number of diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index 6e0d238e0..fc72e2a47 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -12,25 +12,36 @@ from imblearn.over_sampling import ADASYN RND_SEED = 0 -X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 -], [0.53366841, -0.30312976], [1.52091956, - -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, - 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], - [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ - -0.41635887, -0.38299653 - ], [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) +X = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + ] +) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 def test_ada_init(): - sampling_strategy = 'auto' + sampling_strategy = "auto" ada = ADASYN(sampling_strategy=sampling_strategy, random_state=RND_SEED) assert ada.random_state == RND_SEED @@ -38,24 +49,62 @@ def test_ada_init(): def test_ada_fit_resample(): ada = ADASYN(random_state=RND_SEED) X_resampled, y_resampled = ada.fit_resample(X, Y) - 
X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 - ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ - -0.28162401, -2.10400981 - ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ - 0.70472253, -0.73309052 - ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ - -0.41635887, -0.38299653 - ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.94899098, -0.30508981], [0.28204936, -0.13953426], - [1.58028868, -0.04089947], [0.66117333, -0.28009063]]) - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) + X_gt = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + [0.94899098, -0.30508981], + [0.28204936, -0.13953426], + [1.58028868, -0.04089947], + [0.66117333, -0.28009063], + ] + ) + y_gt = np.array( + [ + 0, + 1, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + ] + ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -64,32 +113,75 @@ def test_ada_fit_resample_nn_obj(): nn = NearestNeighbors(n_neighbors=6) ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) X_resampled, y_resampled = ada.fit_resample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], 
[0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 - ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ - -0.28162401, -2.10400981 - ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ - 0.70472253, -0.73309052 - ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ - -0.41635887, -0.38299653 - ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.94899098, -0.30508981], [0.28204936, -0.13953426], - [1.58028868, -0.04089947], [0.66117333, -0.28009063]]) - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) + X_gt = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + [0.94899098, -0.30508981], + [0.28204936, -0.13953426], + [1.58028868, -0.04089947], + [0.66117333, -0.28009063], + ] + ) + y_gt = np.array( + [ + 0, + 1, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + ] + ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @pytest.mark.parametrize( "adasyn_params, err_msg", - [({"sampling_strategy": {0: 9, 1: 12}}, "No samples will be generated."), - ({"n_neighbors": 'rnd'}, "has to be one of")] + [ + ( + {"sampling_strategy": {0: 9, 1: 12}}, + "No samples will be generated.", + ), + ({"n_neighbors": 
"rnd"}, "has to be one of"), + ], ) def test_adasyn_error(adasyn_params, err_msg): adasyn = ADASYN(**adasyn_params) diff --git a/imblearn/over_sampling/tests/test_borderline_smote.py b/imblearn/over_sampling/tests/test_borderline_smote.py index 94ecaba78..06421f169 100644 --- a/imblearn/over_sampling/tests/test_borderline_smote.py +++ b/imblearn/over_sampling/tests/test_borderline_smote.py @@ -10,32 +10,49 @@ @pytest.fixture def data(): - X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) + X = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + ] + ) y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) return X, y def test_borderline_smote_wrong_kind(data): - bsmote = BorderlineSMOTE(kind='rand') + bsmote = BorderlineSMOTE(kind="rand") with pytest.raises(ValueError, match='The possible "kind" of algorithm'): bsmote.fit_resample(*data) -@pytest.mark.parametrize('kind', ['borderline-1', 'borderline-2']) 
+@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"]) def test_borderline_smote(kind, data): bsmote = BorderlineSMOTE(kind=kind, random_state=42) - bsmote_nn = BorderlineSMOTE(kind=kind, random_state=42, - k_neighbors=NearestNeighbors(n_neighbors=6), - m_neighbors=NearestNeighbors(n_neighbors=11)) + bsmote_nn = BorderlineSMOTE( + kind=kind, + random_state=42, + k_neighbors=NearestNeighbors(n_neighbors=6), + m_neighbors=NearestNeighbors(n_neighbors=11), + ) X_res_1, y_res_1 = bsmote.fit_resample(*data) X_res_2, y_res_2 = bsmote_nn.fit_resample(*data) diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 83c80dd8e..86899dc38 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -14,26 +14,42 @@ @pytest.fixture def data(): - X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) + X = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 
0.93259929], + [1.70580611, -0.11219234], + ] + ) y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) return X, y def test_kmeans_smote(data): X, y = data - kmeans_smote = KMeansSMOTE(kmeans_estimator=1, - random_state=42, - cluster_balance_threshold=0.0, - k_neighbors=5) + kmeans_smote = KMeansSMOTE( + kmeans_estimator=1, + random_state=42, + cluster_balance_threshold=0.0, + k_neighbors=5, + ) smote = SMOTE(random_state=42) X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y) @@ -44,21 +60,25 @@ def test_kmeans_smote(data): assert kmeans_smote.nn_k_.n_neighbors == 6 assert kmeans_smote.kmeans_estimator_.n_clusters == 1 - assert 'batch_size' in kmeans_smote.kmeans_estimator_.get_params() + assert "batch_size" in kmeans_smote.kmeans_estimator_.get_params() @pytest.mark.parametrize("k_neighbors", [2, NearestNeighbors(n_neighbors=3)]) @pytest.mark.parametrize( "kmeans_estimator", - [3, - KMeans(n_clusters=3, random_state=42), - MiniBatchKMeans(n_clusters=3, random_state=42)] + [ + 3, + KMeans(n_clusters=3, random_state=42), + MiniBatchKMeans(n_clusters=3, random_state=42), + ], ) def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator): X, y = data - kmeans_smote = KMeansSMOTE(random_state=42, - kmeans_estimator=kmeans_estimator, - k_neighbors=k_neighbors) + kmeans_smote = KMeansSMOTE( + random_state=42, + kmeans_estimator=kmeans_estimator, + k_neighbors=k_neighbors, + ) X_resampled, y_resampled = kmeans_smote.fit_resample(X, y) assert X_resampled.shape == (24, 2) assert y_resampled.shape == (24,) @@ -66,39 +86,42 @@ def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator): assert kmeans_smote.nn_k_.n_neighbors == 3 assert kmeans_smote.kmeans_estimator_.n_clusters == 3 + def test_sample_kmeans_not_enough_clusters(): rng = np.random.RandomState(42) X = rng.randn(30, 2) y = np.array([1] * 20 + [0] * 10) - smote = KMeansSMOTE(random_state=42, - kmeans_estimator=30, - k_neighbors=2) + smote = KMeansSMOTE(random_state=42, 
kmeans_estimator=30, k_neighbors=2) with pytest.raises(RuntimeError): smote.fit_resample(X, y) @pytest.mark.parametrize("density_exponent", ["auto", 2]) @pytest.mark.parametrize("cluster_balance_threshold", ["auto", 0.8]) -def test_sample_kmeans_density_estimation(data, density_exponent, - cluster_balance_threshold): +def test_sample_kmeans_density_estimation( + data, density_exponent, cluster_balance_threshold +): X, y = data - smote = KMeansSMOTE(random_state=42, - density_exponent=density_exponent, - cluster_balance_threshold=cluster_balance_threshold) + smote = KMeansSMOTE( + random_state=42, + density_exponent=density_exponent, + cluster_balance_threshold=cluster_balance_threshold, + ) smote.fit_resample(X, y) @pytest.mark.parametrize( "density_exponent, cluster_balance_threshold", - [('xxx', 'auto'), ('auto', 'xxx')] + [("xxx", "auto"), ("auto", "xxx")], ) -def test_kmeans_smote_param_error(data, density_exponent, - cluster_balance_threshold): +def test_kmeans_smote_param_error( + data, density_exponent, cluster_balance_threshold +): X, y = data kmeans_smote = KMeansSMOTE( density_exponent=density_exponent, - cluster_balance_threshold=cluster_balance_threshold + cluster_balance_threshold=cluster_balance_threshold, ) with pytest.raises(ValueError, match="should be 'auto' when a string"): kmeans_smote.fit_resample(X, y) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 5f5ed1b36..d49a807a8 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -12,32 +12,52 @@ from imblearn.over_sampling import RandomOverSampler RND_SEED = 0 -X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ - 0.20792588, 1.49407907 -], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], - [0.09125309, -0.85409574], [0.12372842, 0.6536186], - [0.13347175, 0.12167502], [0.094035, 
-2.55298982]]) +X = np.array( + [ + [0.04352327, -0.20515826], + [0.92923648, 0.76103773], + [0.20792588, 1.49407907], + [0.47104475, 0.44386323], + [0.22950086, 0.33367433], + [0.15490546, 0.3130677], + [0.09125309, -0.85409574], + [0.12372842, 0.6536186], + [0.13347175, 0.12167502], + [0.094035, -2.55298982], + ] +) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) def test_ros_init(): - sampling_strategy = 'auto' + sampling_strategy = "auto" ros = RandomOverSampler( - sampling_strategy=sampling_strategy, random_state=RND_SEED) + sampling_strategy=sampling_strategy, random_state=RND_SEED + ) assert ros.random_state == RND_SEED def test_ros_fit_resample(): ros = RandomOverSampler(random_state=RND_SEED) X_resampled, y_resampled = ros.fit_resample(X, Y) - X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ - 0.20792588, 1.49407907 - ], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [ - 0.15490546, 0.3130677 - ], [0.09125309, -0.85409574], [0.12372842, 0.6536186], - [0.13347175, 0.12167502], [0.094035, -2.55298982], - [0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.92923648, 0.76103773], [0.47104475, 0.44386323]]) + X_gt = np.array( + [ + [0.04352327, -0.20515826], + [0.92923648, 0.76103773], + [0.20792588, 1.49407907], + [0.47104475, 0.44386323], + [0.22950086, 0.33367433], + [0.15490546, 0.3130677], + [0.09125309, -0.85409574], + [0.12372842, 0.6536186], + [0.13347175, 0.12167502], + [0.094035, -2.55298982], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + ] + ) y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -46,14 +66,23 @@ def test_ros_fit_resample(): def test_ros_fit_resample_half(): sampling_strategy = {0: 3, 1: 7} ros = RandomOverSampler( - sampling_strategy=sampling_strategy, random_state=RND_SEED) + sampling_strategy=sampling_strategy, random_state=RND_SEED + ) X_resampled, 
y_resampled = ros.fit_resample(X, Y) - X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ - 0.20792588, 1.49407907 - ], [0.47104475, 0.44386323], [0.22950086, - 0.33367433], [0.15490546, 0.3130677], - [0.09125309, -0.85409574], [0.12372842, 0.6536186], - [0.13347175, 0.12167502], [0.094035, -2.55298982]]) + X_gt = np.array( + [ + [0.04352327, -0.20515826], + [0.92923648, 0.76103773], + [0.20792588, 1.49407907], + [0.47104475, 0.44386323], + [0.22950086, 0.33367433], + [0.15490546, 0.3130677], + [0.09125309, -0.85409574], + [0.12372842, 0.6536186], + [0.13347175, 0.12167502], + [0.094035, -2.55298982], + ] + ) y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -72,8 +101,9 @@ def test_multiclass_fit_resample(): def test_random_over_sampling_heterogeneous_data(): - X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], - dtype=np.object) + X_hetero = np.array( + [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=np.object + ) y = np.array([0, 0, 1]) ros = RandomOverSampler(random_state=RND_SEED) X_res, y_res = ros.fit_resample(X_hetero, y) diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 276fffcf9..6d90fd62b 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -16,16 +16,30 @@ RND_SEED = 0 -X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, 
-0.11219234]]) +X = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + ] +) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 @@ -33,48 +47,98 @@ def test_sample_regular(): smote = SMOTE(random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 - ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ - -0.28162401, -2.10400981 - ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ - 0.70472253, -0.73309052 - ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ - -0.41635887, -0.38299653 - ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.29307743, -0.14670439], [0.84976473, -0.15570176], - [0.61319159, -0.11571668], [0.66052536, -0.28246517]]) - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) + X_gt = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 
0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + [0.29307743, -0.14670439], + [0.84976473, -0.15570176], + [0.61319159, -0.11571668], + [0.66052536, -0.28246517], + ] + ) + y_gt = np.array( + [ + 0, + 1, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + ] + ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_half(): sampling_strategy = {0: 9, 1: 12} - smote = SMOTE( - sampling_strategy=sampling_strategy, random_state=RND_SEED) + smote = SMOTE(sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 - ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ - -0.28162401, -2.10400981 - ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ - 0.70472253, -0.73309052 - ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], [0.08711622, 0.93259929], - [1.70580611, -0.11219234], [0.36784496, -0.1953161]]) + X_gt = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], 
+ [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + [0.36784496, -0.1953161], + ] + ) y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0]) + [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0] + ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -83,30 +147,68 @@ def test_sample_regular_with_nn(): nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k) X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ - 1.25192108, -0.22367336 - ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ - -0.28162401, -2.10400981 - ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ - 0.70472253, -0.73309052 - ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ - 0.88407872, 0.35454207 - ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ - -0.18410027, -0.45194484 - ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ - -0.41635887, -0.38299653 - ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.29307743, -0.14670439], [0.84976473, -0.15570176], - [0.61319159, -0.11571668], [0.66052536, -0.28246517]]) - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) + X_gt = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + [0.29307743, -0.14670439], + 
[0.84976473, -0.15570176], + [0.61319159, -0.11571668], + [0.66052536, -0.28246517], + ] + ) + y_gt = np.array( + [ + 0, + 1, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + ] + ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @pytest.mark.parametrize( - "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=['borderline', 'svm'] + "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"] ) def test_smote_m_neighbors(smote): # check that m_neighbors is properly set. Regression test for: diff --git a/imblearn/over_sampling/tests/test_smote_nc.py b/imblearn/over_sampling/tests/test_smote_nc.py index b9533bc76..1046a717b 100644 --- a/imblearn/over_sampling/tests/test_smote_nc.py +++ b/imblearn/over_sampling/tests/test_smote_nc.py @@ -23,7 +23,7 @@ def data_heterogneous_ordered(): # create 2 random continuous feature X[:, :2] = rng.randn(30, 2) # create a categorical feature using some string - X[:, 2] = rng.choice(['a', 'b', 'c'], size=30).astype(object) + X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) @@ -37,7 +37,7 @@ def data_heterogneous_unordered(): # create 2 random continuous feature X[:, [1, 2]] = rng.randn(30, 2) # create a categorical feature using some string - X[:, 0] = rng.choice(['a', 'b', 'c'], size=30).astype(object) + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) @@ -51,7 +51,7 @@ def data_heterogneous_masked(): # create 2 random continuous feature X[:, [1, 2]] = rng.randn(30, 2) # create a categorical feature using some string - X[:, 0] = rng.choice(['a', 'b', 'c'], size=30).astype(object) + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature 
using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) @@ -65,7 +65,7 @@ def data_heterogneous_unordered_multiclass(): # create 2 random continuous feature X[:, [1, 2]] = rng.randn(50, 2) # create a categorical feature using some string - X[:, 0] = rng.choice(['a', 'b', 'c'], size=50).astype(object) + X[:, 0] = rng.choice(["a", "b", "c"], size=50).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=50) y = np.array([0] * 10 + [1] * 15 + [2] * 25) @@ -83,7 +83,7 @@ def data_sparse(format): # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) - X = sparse.csr_matrix(X) if format == 'csr' else sparse.csc_matrix(X) + X = sparse.csr_matrix(X) if format == "csr" else sparse.csc_matrix(X) return X, y, [0, 3] @@ -97,9 +97,13 @@ def test_smotenc_error(): @pytest.mark.parametrize( "data", - [data_heterogneous_ordered(), data_heterogneous_unordered(), - data_heterogneous_masked(), - data_sparse('csr'), data_sparse('csc')] + [ + data_heterogneous_ordered(), + data_heterogneous_unordered(), + data_heterogneous_masked(), + data_sparse("csr"), + data_sparse("csc"), + ], ) def test_smotenc(data): X, y, categorical_features = data @@ -125,8 +129,7 @@ def test_smotenc(data): def test_smotenc_check_target_type(): X, _, categorical_features = data_heterogneous_unordered() y = np.linspace(0, 1, 30) - smote = SMOTENC(categorical_features=categorical_features, - random_state=0) + smote = SMOTENC(categorical_features=categorical_features, random_state=0) with pytest.raises(ValueError, match="Unknown label type: 'continuous'"): smote.fit_resample(X, y) rng = np.random.RandomState(42) @@ -139,26 +142,24 @@ def test_smotenc_check_target_type(): def test_smotenc_samplers_one_label(): X, _, categorical_features = data_heterogneous_unordered() y = np.zeros(30) - smote = SMOTENC(categorical_features=categorical_features, - random_state=0) - with 
pytest.raises(ValueError, match='needs to have more than 1 class'): + smote = SMOTENC(categorical_features=categorical_features, random_state=0) + with pytest.raises(ValueError, match="needs to have more than 1 class"): smote.fit(X, y) def test_smotenc_fit(): X, y, categorical_features = data_heterogneous_unordered() - smote = SMOTENC(categorical_features=categorical_features, - random_state=0) + smote = SMOTENC(categorical_features=categorical_features, random_state=0) smote.fit_resample(X, y) - assert hasattr(smote, 'sampling_strategy_'), \ - "No fitted attribute sampling_strategy_" + assert hasattr( + smote, "sampling_strategy_" + ), "No fitted attribute sampling_strategy_" def test_smotenc_fit_resample(): X, y, categorical_features = data_heterogneous_unordered() target_stats = Counter(y) - smote = SMOTENC(categorical_features=categorical_features, - random_state=0) + smote = SMOTENC(categorical_features=categorical_features, random_state=0) _, y_res = smote.fit_resample(X, y) _ = Counter(y_res) n_samples = max(target_stats.values()) @@ -168,8 +169,7 @@ def test_smotenc_fit_resample(): def test_smotenc_fit_resample_sampling_strategy(): X, y, categorical_features = data_heterogneous_unordered_multiclass() expected_stat = Counter(y)[1] - smote = SMOTENC(categorical_features=categorical_features, - random_state=0) + smote = SMOTENC(categorical_features=categorical_features, random_state=0) sampling_strategy = {2: 25, 0: 25} smote.set_params(sampling_strategy=sampling_strategy) X_res, y_res = smote.fit_resample(X, y) @@ -181,8 +181,7 @@ def test_smotenc_pandas(): # Check that the samplers handle pandas dataframe and pandas series X, y, categorical_features = data_heterogneous_unordered_multiclass() X_pd = pd.DataFrame(X) - smote = SMOTENC(categorical_features=categorical_features, - random_state=0) + smote = SMOTENC(categorical_features=categorical_features, random_state=0) X_res_pd, y_res_pd = smote.fit_resample(X_pd, y) X_res, y_res = smote.fit_resample(X, y) 
assert X_res_pd.tolist() == X_res.tolist() @@ -190,8 +189,13 @@ def test_smotenc_pandas(): def test_smotenc_preserve_dtype(): - X, y = make_classification(n_samples=50, n_classes=3, n_informative=4, - weights=[0.2, 0.3, 0.5], random_state=0) + X, y = make_classification( + n_samples=50, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) # Cast X and y to not default dtype X = X.astype(np.float32) y = y.astype(np.int32) diff --git a/imblearn/over_sampling/tests/test_svm_smote.py b/imblearn/over_sampling/tests/test_svm_smote.py index 7875cd8d7..eccffc8a8 100644 --- a/imblearn/over_sampling/tests/test_svm_smote.py +++ b/imblearn/over_sampling/tests/test_svm_smote.py @@ -12,26 +12,42 @@ @pytest.fixture def data(): - X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) + X = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + ] + ) y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) 
return X, y def test_svm_smote(data): svm_smote = SVMSMOTE(random_state=42) - svm_smote_nn = SVMSMOTE(random_state=42, - k_neighbors=NearestNeighbors(n_neighbors=6), - m_neighbors=NearestNeighbors(n_neighbors=11), - svm_estimator=SVC(gamma='scale', random_state=42)) + svm_smote_nn = SVMSMOTE( + random_state=42, + k_neighbors=NearestNeighbors(n_neighbors=6), + m_neighbors=NearestNeighbors(n_neighbors=11), + svm_estimator=SVC(gamma="scale", random_state=42), + ) X_res_1, y_res_1 = svm_smote.fit_resample(*data) X_res_2, y_res_2 = svm_smote_nn.fit_resample(*data) diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index 7fdf56fcf..27c7f94a3 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -21,7 +21,7 @@ from sklearn.utils.metaestimators import if_delegate_has_method from sklearn.utils.validation import check_memory -__all__ = ['Pipeline', 'make_pipeline'] +__all__ = ["Pipeline", "make_pipeline"] class Pipeline(pipeline.Pipeline): @@ -125,37 +125,46 @@ def _validate_steps(self): estimator = estimators[-1] for t in transformers: - if t is None or t == 'passthrough': + if t is None or t == "passthrough": continue - if (not (hasattr(t, "fit") or - hasattr(t, "fit_transform") or - hasattr(t, "fit_resample")) or - not (hasattr(t, "transform") or - hasattr(t, "fit_resample"))): + if not ( + hasattr(t, "fit") + or hasattr(t, "fit_transform") + or hasattr(t, "fit_resample") + ) or not (hasattr(t, "transform") or hasattr(t, "fit_resample")): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or " "fit_resample (but not both) or be a string 'passthrough' " - "'%s' (type %s) doesn't)" % (t, type(t))) + "'%s' (type %s) doesn't)" % (t, type(t)) + ) - if (hasattr(t, "fit_resample") and (hasattr(t, "fit_transform") or - hasattr(t, "transform"))): + if hasattr(t, "fit_resample") and ( + hasattr(t, "fit_transform") or hasattr(t, "transform") + ): raise TypeError( "All intermediate steps of the chain should " 
"be estimators that implement fit and transform or sample." - " '%s' implements both)" % (t)) + " '%s' implements both)" % (t) + ) if isinstance(t, pipeline.Pipeline): raise TypeError( "All intermediate steps of the chain should not be" - " Pipelines") + " Pipelines" + ) # We allow last estimator to be None as an identity transformation - if (estimator is not None and estimator != 'passthrough' - and not hasattr(estimator, "fit")): - raise TypeError("Last step of Pipeline should implement fit or be " - "the string 'passthrough'. '%s' (type %s) doesn't" - % (estimator, type(estimator))) + if ( + estimator is not None + and estimator != "passthrough" + and not hasattr(estimator, "fit") + ): + raise TypeError( + "Last step of Pipeline should implement fit or be " + "the string 'passthrough'. '%s' (type %s) doesn't" + % (estimator, type(estimator)) + ) # Estimator interface @@ -168,13 +177,14 @@ def _fit(self, X, y=None, **fit_params): fit_transform_one_cached = memory.cache(_fit_transform_one) fit_resample_one_cached = memory.cache(_fit_resample_one) - fit_params_steps = {name: {} for name, step in self.steps - if step is not None} + fit_params_steps = { + name: {} for name, step in self.steps if step is not None + } for pname, pval in fit_params.items(): - step, param = pname.split('__', 1) + step, param = pname.split("__", 1) fit_params_steps[step][param] = pval for step_idx, name, transformer in self._iter(with_final=False): - if hasattr(memory, 'location'): + if hasattr(memory, "location"): # joblib >= 0.12 if memory.location is None: # we do not clone when caching is disabled to @@ -182,7 +192,7 @@ def _fit(self, X, y=None, **fit_params): cloned_transformer = transformer else: cloned_transformer = clone(transformer) - elif hasattr(memory, 'cachedir'): + elif hasattr(memory, "cachedir"): # joblib < 0.11 if memory.cachedir is None: # we do not clone when caching is disabled to @@ -191,19 +201,21 @@ def _fit(self, X, y=None, **fit_params): else: cloned_transformer = 
clone(transformer) # Fit or load from cache the current transfomer - if (hasattr(cloned_transformer, "transform") or - hasattr(cloned_transformer, "fit_transform")): + if hasattr(cloned_transformer, "transform") or hasattr( + cloned_transformer, "fit_transform" + ): X, fitted_transformer = fit_transform_one_cached( - cloned_transformer, None, X, y, - **fit_params_steps[name]) + cloned_transformer, None, X, y, **fit_params_steps[name] + ) elif hasattr(cloned_transformer, "fit_resample"): X, y, fitted_transformer = fit_resample_one_cached( - cloned_transformer, X, y, **fit_params_steps[name]) + cloned_transformer, X, y, **fit_params_steps[name] + ) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) - if self._final_estimator == 'passthrough': + if self._final_estimator == "passthrough": return X, y, {} return X, y, fit_params_steps[self.steps[-1][0]] @@ -236,7 +248,7 @@ def fit(self, X, y=None, **fit_params): """ Xt, yt, fit_params = self._fit(X, y, **fit_params) - if self._final_estimator != 'passthrough': + if self._final_estimator != "passthrough": self._final_estimator.fit(Xt, yt, **fit_params) return self @@ -270,9 +282,9 @@ def fit_transform(self, X, y=None, **fit_params): """ last_step = self._final_estimator Xt, yt, fit_params = self._fit(X, y, **fit_params) - if last_step == 'passthrough': + if last_step == "passthrough": return Xt - elif hasattr(last_step, 'fit_transform'): + elif hasattr(last_step, "fit_transform"): return last_step.fit_transform(Xt, yt, **fit_params) else: return last_step.fit(Xt, yt, **fit_params).transform(Xt) @@ -310,12 +322,12 @@ def fit_resample(self, X, y=None, **fit_params): """ last_step = self._final_estimator Xt, yt, fit_params = self._fit(X, y, **fit_params) - if last_step == 'passthrough': + if last_step == "passthrough": return Xt - elif hasattr(last_step, 'fit_resample'): + elif 
hasattr(last_step, "fit_resample"): return last_step.fit_resample(Xt, yt, **fit_params) - @if_delegate_has_method(delegate='_final_estimator') + @if_delegate_has_method(delegate="_final_estimator") def predict(self, X, **predict_params): """Apply transformers/samplers to the data, and predict with the final estimator @@ -347,7 +359,7 @@ def predict(self, X, **predict_params): Xt = transform.transform(Xt) return self.steps[-1][-1].predict(Xt, **predict_params) - @if_delegate_has_method(delegate='_final_estimator') + @if_delegate_has_method(delegate="_final_estimator") def fit_predict(self, X, y=None, **fit_params): """Applies fit_predict of last step in pipeline after transforms. @@ -377,7 +389,7 @@ def fit_predict(self, X, y=None, **fit_params): Xt, yt, fit_params = self._fit(X, y, **fit_params) return self.steps[-1][-1].fit_predict(Xt, yt, **fit_params) - @if_delegate_has_method(delegate='_final_estimator') + @if_delegate_has_method(delegate="_final_estimator") def predict_proba(self, X): """Apply transformers/samplers, and predict_proba of the final estimator @@ -401,7 +413,7 @@ def predict_proba(self, X): Xt = transform.transform(Xt) return self.steps[-1][-1].predict_proba(Xt) - @if_delegate_has_method(delegate='_final_estimator') + @if_delegate_has_method(delegate="_final_estimator") def score_samples(self, X): """Apply transforms, and score_samples of the final estimator. 
Parameters @@ -421,7 +433,7 @@ def score_samples(self, X): Xt = transformer.transform(Xt) return self.steps[-1][-1].score_samples(Xt) - @if_delegate_has_method(delegate='_final_estimator') + @if_delegate_has_method(delegate="_final_estimator") def decision_function(self, X): """Apply transformers/samplers, and decision_function of the final estimator @@ -445,7 +457,7 @@ def decision_function(self, X): Xt = transform.transform(Xt) return self.steps[-1][-1].decision_function(Xt) - @if_delegate_has_method(delegate='_final_estimator') + @if_delegate_has_method(delegate="_final_estimator") def predict_log_proba(self, X): """Apply transformers/samplers, and predict_log_proba of the final estimator @@ -487,7 +499,7 @@ def transform(self): Xt : array-like, shape = [n_samples, n_transformed_features] """ # _final_estimator is None or has transform, otherwise attribute error - if self._final_estimator != 'passthrough': + if self._final_estimator != "passthrough": self._final_estimator.transform return self._transform @@ -533,7 +545,7 @@ def _inverse_transform(self, X): Xt = transform.inverse_transform(Xt) return Xt - @if_delegate_has_method(delegate='_final_estimator') + @if_delegate_has_method(delegate="_final_estimator") def score(self, X, y=None, sample_weight=None): """Apply transformers/samplers, and score with the final estimator @@ -563,12 +575,12 @@ def score(self, X, y=None, sample_weight=None): Xt = transform.transform(Xt) score_params = {} if sample_weight is not None: - score_params['sample_weight'] = sample_weight + score_params["sample_weight"] = sample_weight return self.steps[-1][-1].score(Xt, y, **score_params) def _fit_transform_one(transformer, weight, X, y, **fit_params): - if hasattr(transformer, 'fit_transform'): + if hasattr(transformer, "fit_transform"): res = transformer.fit_transform(X, y, **fit_params) else: res = transformer.fit(X, y, **fit_params).transform(X) @@ -627,8 +639,9 @@ def make_pipeline(*steps, **kwargs): GaussianNB(priors=None, 
var_smoothing=1e-09))], verbose=False) """ - memory = kwargs.pop('memory', None) + memory = kwargs.pop("memory", None) if kwargs: - raise TypeError('Unknown keyword arguments: "{}"' - .format(list(kwargs.keys())[0])) + raise TypeError( + 'Unknown keyword arguments: "{}"'.format(list(kwargs.keys())[0]) + ) return Pipeline(pipeline._name_estimators(steps), memory=memory) diff --git a/imblearn/tensorflow/__init__.py b/imblearn/tensorflow/__init__.py index 3224a7db1..2a40c7adc 100644 --- a/imblearn/tensorflow/__init__.py +++ b/imblearn/tensorflow/__init__.py @@ -3,4 +3,4 @@ from ._generator import balanced_batch_generator -__all__ = ['balanced_batch_generator'] +__all__ = ["balanced_batch_generator"] diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index a218d63e5..75521f1e6 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -11,15 +11,26 @@ from ..utils import Substitution from ..utils._docstring import _random_state_docstring -DONT_HAVE_RANDOM_STATE = ('NearMiss', 'EditedNearestNeighbours', - 'RepeatedEditedNearestNeighbours', 'AllKNN', - 'NeighbourhoodCleaningRule', 'TomekLinks') +DONT_HAVE_RANDOM_STATE = ( + "NearMiss", + "EditedNearestNeighbours", + "RepeatedEditedNearestNeighbours", + "AllKNN", + "NeighbourhoodCleaningRule", + "TomekLinks", +) @Substitution(random_state=_random_state_docstring) -def balanced_batch_generator(X, y, sample_weight=None, sampler=None, - batch_size=32, keep_sparse=False, - random_state=None): +def balanced_batch_generator( + X, + y, + sample_weight=None, + sampler=None, + batch_size=32, + keep_sparse=False, + random_state=None, +): """Create a balanced batch generator to train keras model. 
Returns a generator --- as well as the number of step per epoch --- which @@ -127,9 +138,10 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, if sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE: set_random_state(sampler_, random_state) sampler_.fit_resample(X, y) - if not hasattr(sampler_, 'sample_indices_'): - raise ValueError("'sampler' needs to have an attribute " - "'sample_indices_'.") + if not hasattr(sampler_, "sample_indices_"): + raise ValueError( + "'sampler' needs to have an attribute " "'sample_indices_'." + ) indices = sampler_.sample_indices_ # shuffle the indices since the sampler are packing them by class random_state.shuffle(indices) @@ -137,16 +149,19 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, def generator(X, y, sample_weight, indices, batch_size): while True: for index in range(0, len(indices), batch_size): - X_res = safe_indexing(X, indices[index:index + batch_size]) - y_res = safe_indexing(y, indices[index:index + batch_size]) + X_res = safe_indexing(X, indices[index : index + batch_size]) + y_res = safe_indexing(y, indices[index : index + batch_size]) if issparse(X_res) and not keep_sparse: X_res = X_res.toarray() if sample_weight is None: yield X_res, y_res else: - sw_res = safe_indexing(sample_weight, - indices[index:index + batch_size]) + sw_res = safe_indexing( + sample_weight, indices[index : index + batch_size] + ) yield X_res, y_res, sw_res - return (generator(X, y, sample_weight, indices, batch_size), - int(indices.size // batch_size)) + return ( + generator(X, y, sample_weight, indices, batch_size), + int(indices.size // batch_size), + ) diff --git a/imblearn/tensorflow/tests/test_generator.py b/imblearn/tensorflow/tests/test_generator.py index 93de15cfc..27d63df3c 100644 --- a/imblearn/tensorflow/tests/test_generator.py +++ b/imblearn/tensorflow/tests/test_generator.py @@ -10,7 +10,7 @@ from imblearn.tensorflow import balanced_batch_generator -tf = 
pytest.importorskip('tensorflow') +tf = pytest.importorskip("tensorflow") @pytest.fixture @@ -26,8 +26,13 @@ def test_balanced_batch_generator(data, sampler): X, y = data batch_size = 10 training_generator, steps_per_epoch = balanced_batch_generator( - X, y, sample_weight=None, sampler=sampler, - batch_size=batch_size, random_state=42) + X, + y, + sample_weight=None, + sampler=sampler, + batch_size=batch_size, + random_state=42, + ) learning_rate = 0.01 epochs = 10 @@ -52,7 +57,8 @@ def accuracy(y_true, y_pred): # build the loss, predict, and train operator cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=out_act, labels=targets) + logits=out_act, labels=targets + ) loss = tf.reduce_sum(cross_entropy) optimizer = tf.train.GradientDescentOptimizer(learning_rate) train_op = optimizer.minimize(loss) @@ -67,13 +73,18 @@ def accuracy(y_true, y_pred): for e in range(epochs): for i in range(steps_per_epoch): X_batch, y_batch = next(training_generator) - sess.run([train_op, loss], - feed_dict={data: X_batch, targets: y_batch}) + sess.run( + [train_op, loss], + feed_dict={data: X_batch, targets: y_batch}, + ) # For each epoch, run accuracy on train and test predicts_train = sess.run(predict, feed_dict={data: X}) - print("epoch: {} train accuracy: {:.3f}" - .format(e, accuracy(y, predicts_train))) + print( + "epoch: {} train accuracy: {:.3f}".format( + e, accuracy(y, predicts_train) + ) + ) @pytest.mark.parametrize("keep_sparse", [True, False]) @@ -81,8 +92,12 @@ def test_balanced_batch_generator_function_sparse(data, keep_sparse): X, y = data training_generator, steps_per_epoch = balanced_batch_generator( - sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10, - random_state=42) + sparse.csr_matrix(X), + y, + keep_sparse=keep_sparse, + batch_size=10, + random_state=42, + ) for idx in range(steps_per_epoch): X_batch, y_batch = next(training_generator) if keep_sparse: diff --git a/imblearn/tests/test_base.py b/imblearn/tests/test_base.py 
index 25a976d34..721ebbbf5 100644 --- a/imblearn/tests/test_base.py +++ b/imblearn/tests/test_base.py @@ -17,22 +17,23 @@ iris = load_iris() X, y = make_imbalance( - iris.data, iris.target, sampling_strategy={0: 10, - 1: 25}, random_state=0) + iris.data, iris.target, sampling_strategy={0: 10, 1: 25}, random_state=0 +) def test_function_sampler_reject_sparse(): X_sparse = sparse.csr_matrix(X) sampler = FunctionSampler(accept_sparse=False) with pytest.raises( - TypeError, - match="A sparse matrix was passed, " - "but dense data is required"): + TypeError, + match="A sparse matrix was passed, " "but dense data is required", + ): sampler.fit_resample(X_sparse, y) -@pytest.mark.parametrize("X, y", [(X, y), (sparse.csr_matrix(X), y), - (sparse.csc_matrix(X), y)]) +@pytest.mark.parametrize( + "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] +) def test_function_sampler_identity(X, y): sampler = FunctionSampler() X_res, y_res = sampler.fit_resample(X, y) @@ -40,8 +41,9 @@ def test_function_sampler_identity(X, y): assert_array_equal(y_res, y) -@pytest.mark.parametrize("X, y", [(X, y), (sparse.csr_matrix(X), y), - (sparse.csc_matrix(X), y)]) +@pytest.mark.parametrize( + "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] +) def test_function_sampler_func(X, y): def func(X, y): return X[:10], y[:10] @@ -52,17 +54,19 @@ def func(X, y): assert_array_equal(y_res, y[:10]) -@pytest.mark.parametrize("X, y", [(X, y), (sparse.csr_matrix(X), y), - (sparse.csc_matrix(X), y)]) +@pytest.mark.parametrize( + "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] +) def test_function_sampler_func_kwargs(X, y): def func(X, y, sampling_strategy, random_state): rus = RandomUnderSampler( - sampling_strategy=sampling_strategy, random_state=random_state) + sampling_strategy=sampling_strategy, random_state=random_state + ) return rus.fit_resample(X, y) sampler = FunctionSampler( - func=func, kw_args={'sampling_strategy': 'auto', - 
'random_state': 0}) + func=func, kw_args={"sampling_strategy": "auto", "random_state": 0} + ) X_res, y_res = sampler.fit_resample(X, y) X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y) assert_allclose_dense_sparse(X_res, X_res_2) diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 9c0d22883..7f74d0233 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -9,22 +9,20 @@ from imblearn.utils.testing import all_estimators -@pytest.mark.parametrize( - 'name, Estimator', - all_estimators() -) +@pytest.mark.parametrize("name, Estimator", all_estimators()) def test_all_estimator_no_base_class(name, Estimator): # test that all_estimators doesn't find abstract classes. - msg = ("Base estimators such as {0} should not be included" - " in all_estimators").format(name) - assert not name.lower().startswith('base'), msg + msg = ( + "Base estimators such as {0} should not be included" + " in all_estimators" + ).format(name) + assert not name.lower().startswith("base"), msg @pytest.mark.filterwarnings("ignore:'y' should be of types") @pytest.mark.filterwarnings("ignore:The number of the samples to") @pytest.mark.parametrize( - 'name, Estimator', - all_estimators(include_meta_estimators=True) + "name, Estimator", all_estimators(include_meta_estimators=True) ) def test_all_estimators(name, Estimator): # don't run twice the sampler tests. 
Meta-estimator do not have a @@ -48,9 +46,10 @@ def _generate_checks_per_estimator(check_generator, estimators): @pytest.mark.filterwarnings("ignore:'y' should be of types") @pytest.mark.parametrize( - 'name, Estimator, check', - _generate_checks_per_estimator(_yield_all_checks, - _tested_non_meta_estimators()) + "name, Estimator, check", + _generate_checks_per_estimator( + _yield_all_checks, _tested_non_meta_estimators() + ), ) def test_samplers(name, Estimator, check): # input validation etc for non-meta estimators diff --git a/imblearn/tests/test_exceptions.py b/imblearn/tests/test_exceptions.py index 91ca4576a..d7dab593d 100644 --- a/imblearn/tests/test_exceptions.py +++ b/imblearn/tests/test_exceptions.py @@ -11,4 +11,4 @@ def test_raise_isinstance_error(): var = 10.0 with raises(ValueError, match="has to be one of"): - raise_isinstance_error('var', [int], var) + raise_isinstance_error("var", [int], var) diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index 1bdc78d73..bdb1b92c9 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -31,8 +31,10 @@ from sklearn.preprocessing import StandardScaler from imblearn.pipeline import Pipeline, make_pipeline -from imblearn.under_sampling import (RandomUnderSampler, - EditedNearestNeighbours as ENN) +from imblearn.under_sampling import ( + RandomUnderSampler, + EditedNearestNeighbours as ENN, +) JUNK_FOOD_DOCS = ( @@ -41,7 +43,8 @@ "the the pizza beer beer copyright", "the burger beer beer copyright", "the coke burger coke copyright", - "the coke burger burger", ) + "the coke burger burger", +) R_TOL = 1e-4 @@ -60,10 +63,10 @@ def fit(self, X, y): return self def get_params(self, deep=False): - return {'a': self.a, 'b': self.b} + return {"a": self.a, "b": self.b} def set_params(self, **params): - self.a = params['a'] + self.a = params["a"] return self @@ -144,6 +147,7 @@ def fit(self, X, y): class DummyEstimatorParams(BaseEstimator): """Mock classifier that 
takes params on predict""" + def fit(self, X, y): return self @@ -183,10 +187,10 @@ def transform(self, X, y=None): def test_pipeline_init_tuple(): # Pipeline accepts steps as tuple X = np.array([[1, 2]]) - pipe = Pipeline((('transf', Transf()), ('clf', FitParamT()))) + pipe = Pipeline((("transf", Transf()), ("clf", FitParamT()))) pipe.fit(X, y=None) pipe.score(X) - pipe.set_params(transf='passthrough') + pipe.set_params(transf="passthrough") pipe.fit(X, y=None) pipe.score(X) @@ -197,15 +201,18 @@ def test_pipeline_init(): Pipeline() # Check that we can't instantiate pipelines with objects without fit # method - error_regex = ("Last step of Pipeline should implement fit or be the " - "string 'passthrough'") + error_regex = ( + "Last step of Pipeline should implement fit or be the " + "string 'passthrough'" + ) with raises(TypeError, match=error_regex): - Pipeline([('clf', NoFit())]) + Pipeline([("clf", NoFit())]) # Smoke test with only an estimator clf = NoTrans() - pipe = Pipeline([('svc', clf)]) + pipe = Pipeline([("svc", clf)]) expected = dict( - svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False)) + svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False) + ) assert pipe.get_params(deep=True) == expected # Check that params are set @@ -216,15 +223,15 @@ def test_pipeline_init(): repr(pipe) # Test with two objects - clf = SVC(gamma='scale') + clf = SVC(gamma="scale") filter1 = SelectKBest(f_classif) - pipe = Pipeline([('anova', filter1), ('svc', clf)]) + pipe = Pipeline([("anova", filter1), ("svc", clf)]) # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform - error_regex = 'implement fit and transform or fit_resample' + error_regex = "implement fit and transform or fit_resample" with raises(TypeError, match=error_regex): - Pipeline([('t', NoTrans()), ('svc', clf)]) + Pipeline([("t", NoTrans()), ("svc", clf)]) # Check that params are set pipe.set_params(svc__C=0.1) @@ -238,7 
+245,7 @@ def test_pipeline_init(): # Test clone pipe2 = clone(pipe) - assert not pipe.named_steps['svc'] is pipe2.named_steps['svc'] + assert not pipe.named_steps["svc"] is pipe2.named_steps["svc"] # Check that apart from estimators, the parameters are the same params = pipe.get_params(deep=True) @@ -251,10 +258,10 @@ def test_pipeline_init(): params2.pop(x) # Remove estimators that where copied - params.pop('svc') - params.pop('anova') - params2.pop('svc') - params2.pop('anova') + params.pop("svc") + params.pop("anova") + params2.pop("svc") + params2.pop("anova") assert params == params2 @@ -264,9 +271,9 @@ def test_pipeline_methods_anova(): X = iris.data y = iris.target # Test with Anova + LogisticRegression - clf = LogisticRegression(solver='lbfgs', multi_class='auto') + clf = LogisticRegression(solver="lbfgs", multi_class="auto") filter1 = SelectKBest(f_classif, k=2) - pipe = Pipeline([('anova', filter1), ('logistic', clf)]) + pipe = Pipeline([("anova", filter1), ("logistic", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) @@ -276,13 +283,13 @@ def test_pipeline_methods_anova(): def test_pipeline_fit_params(): # Test that the pipeline can take fit parameters - pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())]) + pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())]) pipe.fit(X=None, y=None, clf__should_succeed=True) # classifier should return True assert pipe.predict(None) # and transformer params should not be changed - assert pipe.named_steps['transf'].a is None - assert pipe.named_steps['transf'].b is None + assert pipe.named_steps["transf"].a is None + assert pipe.named_steps["transf"].b is None # invalid parameters should raise an error message with raises(TypeError, match="unexpected keyword argument"): pipe.fit(None, None, clf__bad=True) @@ -291,7 +298,7 @@ def test_pipeline_fit_params(): def test_pipeline_sample_weight_supported(): # Pipeline should pass sample_weight X = np.array([[1, 2]]) - pipe = Pipeline([('transf', 
Transf()), ('clf', FitParamT())]) + pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, y=None) == 3 @@ -302,7 +309,7 @@ def test_pipeline_sample_weight_supported(): def test_pipeline_sample_weight_unsupported(): # When sample_weight is None it shouldn't be passed X = np.array([[1, 2]]) - pipe = Pipeline([('transf', Transf()), ('clf', Mult())]) + pipe = Pipeline([("transf", Transf()), ("clf", Mult())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, sample_weight=None) == 3 @@ -312,13 +319,13 @@ def test_pipeline_sample_weight_unsupported(): def test_pipeline_raise_set_params_error(): # Test pipeline raises set params error message for nested models. - pipe = Pipeline([('cls', LinearRegression())]) + pipe = Pipeline([("cls", LinearRegression())]) with raises(ValueError, match="Invalid parameter"): - pipe.set_params(fake='nope') + pipe.set_params(fake="nope") # nested model check with raises(ValueError, match="Invalid parameter"): - pipe.set_params(fake__estimator='nope') + pipe.set_params(fake__estimator="nope") def test_pipeline_methods_pca_svm(): @@ -327,9 +334,9 @@ def test_pipeline_methods_pca_svm(): X = iris.data y = iris.target # Test with PCA + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) - pca = PCA(svd_solver='full', n_components='mle', whiten=True) - pipe = Pipeline([('pca', pca), ('svc', clf)]) + clf = SVC(gamma="scale", probability=True, random_state=0) + pca = PCA(svd_solver="full", n_components="mle", whiten=True) + pipe = Pipeline([("pca", pca), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) @@ -345,17 +352,21 @@ def test_pipeline_methods_preprocessing_svm(): n_samples = X.shape[0] n_classes = len(np.unique(y)) scaler = StandardScaler() - pca = PCA(n_components=2, svd_solver='randomized', whiten=True) - clf = SVC(gamma='scale', probability=True, random_state=0, - decision_function_shape='ovr') + pca = 
PCA(n_components=2, svd_solver="randomized", whiten=True) + clf = SVC( + gamma="scale", + probability=True, + random_state=0, + decision_function_shape="ovr", + ) for preprocessing in [scaler, pca]: - pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) + pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)]) pipe.fit(X, y) # check shapes of various prediction functions predict = pipe.predict(X) - assert predict.shape == (n_samples, ) + assert predict.shape == (n_samples,) proba = pipe.predict_proba(X) assert proba.shape == (n_samples, n_classes) @@ -386,8 +397,9 @@ def test_fit_predict_on_pipeline(): separate_pred = km.fit_predict(scaled) # use a pipeline to do the transform and clustering in one step - pipe = Pipeline([('scaler', scaler_for_pipeline), ('Kmeans', - km_for_pipeline)]) + pipe = Pipeline( + [("scaler", scaler_for_pipeline), ("Kmeans", km_for_pipeline)] + ) pipeline_pred = pipe.fit_predict(iris.data) assert_array_almost_equal(pipeline_pred, separate_pred) @@ -397,22 +409,23 @@ def test_fit_predict_on_pipeline_without_fit_predict(): # tests that a pipeline does not have fit_predict method when final # step of pipeline does not have fit_predict defined scaler = StandardScaler() - pca = PCA(svd_solver='full') - pipe = Pipeline([('scaler', scaler), ('pca', pca)]) + pca = PCA(svd_solver="full") + pipe = Pipeline([("scaler", scaler), ("pca", pca)]) error_regex = "'PCA' object has no attribute 'fit_predict'" with raises(AttributeError, match=error_regex): - getattr(pipe, 'fit_predict') + getattr(pipe, "fit_predict") def test_fit_predict_with_intermediate_fit_params(): # tests that Pipeline passes fit_params to intermediate steps # when fit_predict is invoked - pipe = Pipeline([('transf', TransfFitParams()), ('clf', FitParamT())]) + pipe = Pipeline([("transf", TransfFitParams()), ("clf", FitParamT())]) pipe.fit_predict( - X=None, y=None, transf__should_get_this=True, clf__should_succeed=True) - assert 
pipe.named_steps['transf'].fit_params['should_get_this'] - assert pipe.named_steps['clf'].successful - assert 'should_succeed' not in pipe.named_steps['transf'].fit_params + X=None, y=None, transf__should_get_this=True, clf__should_succeed=True + ) + assert pipe.named_steps["transf"].fit_params["should_get_this"] + assert pipe.named_steps["clf"].successful + assert "should_succeed" not in pipe.named_steps["transf"].fit_params def test_pipeline_transform(): @@ -420,8 +433,8 @@ def test_pipeline_transform(): # Also test pipeline.transform and pipeline.inverse_transform iris = load_iris() X = iris.data - pca = PCA(n_components=2, svd_solver='full') - pipeline = Pipeline([('pca', pca)]) + pca = PCA(n_components=2, svd_solver="full") + pipeline = Pipeline([("pca", pca)]) # test transform and fit_transform: X_trans = pipeline.fit(X).transform(X) @@ -441,7 +454,7 @@ def test_pipeline_fit_transform(): X = iris.data y = iris.target transf = Transf() - pipeline = Pipeline([('mock', transf)]) + pipeline = Pipeline([("mock", transf)]) # test fit_transform: X_trans = pipeline.fit_transform(X, y) @@ -452,51 +465,48 @@ def test_pipeline_fit_transform(): def test_set_pipeline_steps(): transf1 = Transf() transf2 = Transf() - pipeline = Pipeline([('mock', transf1)]) - assert pipeline.named_steps['mock'] is transf1 + pipeline = Pipeline([("mock", transf1)]) + assert pipeline.named_steps["mock"] is transf1 # Directly setting attr - pipeline.steps = [('mock2', transf2)] - assert 'mock' not in pipeline.named_steps - assert pipeline.named_steps['mock2'] is transf2 - assert [('mock2', transf2)] == pipeline.steps + pipeline.steps = [("mock2", transf2)] + assert "mock" not in pipeline.named_steps + assert pipeline.named_steps["mock2"] is transf2 + assert [("mock2", transf2)] == pipeline.steps # Using set_params - pipeline.set_params(steps=[('mock', transf1)]) - assert [('mock', transf1)] == pipeline.steps + pipeline.set_params(steps=[("mock", transf1)]) + assert [("mock", transf1)] == 
pipeline.steps # Using set_params to replace single step pipeline.set_params(mock=transf2) - assert [('mock', transf2)] == pipeline.steps + assert [("mock", transf2)] == pipeline.steps # With invalid data - pipeline.set_params(steps=[('junk', ())]) + pipeline.set_params(steps=[("junk", ())]) with raises(TypeError): pipeline.fit([[1]], [1]) with raises(TypeError): pipeline.fit_transform([[1]], [1]) -@pytest.mark.parametrize('passthrough', [None, 'passthrough']) +@pytest.mark.parametrize("passthrough", [None, "passthrough"]) def test_pipeline_correctly_adjusts_steps(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) - pipeline = Pipeline([ - ('m2', mult2), - ('bad', passthrough), - ('m3', mult3), - ('m5', mult5) - ]) + pipeline = Pipeline( + [("m2", mult2), ("bad", passthrough), ("m3", mult3), ("m5", mult5)] + ) pipeline.fit(X, y) - expected_names = ['m2', 'bad', 'm3', 'm5'] + expected_names = ["m2", "bad", "m3", "m5"] actual_names = [name for name, _ in pipeline.steps] assert expected_names == actual_names -@pytest.mark.parametrize('passthrough', [None, 'passthrough']) +@pytest.mark.parametrize("passthrough", [None, "passthrough"]) def test_set_pipeline_step_passthrough(passthrough): # Test setting Pipeline steps to None X = np.array([[1]]) @@ -506,7 +516,7 @@ def test_set_pipeline_step_passthrough(passthrough): mult5 = Mult(mult=5) def make(): - return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)]) + return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)]) pipeline = make() @@ -521,14 +531,14 @@ def make(): assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) expected_params = { - 'steps': pipeline.steps, - 'm2': mult2, - 'm3': passthrough, - 'last': mult5, - 'memory': None, - 'm2__mult': 2, - 'last__mult': 5, - 'verbose': False + "steps": pipeline.steps, + "m2": mult2, + "m3": passthrough, + "last": mult5, + "memory": None, 
+ "m2__mult": 2, + "last__mult": 5, + "verbose": False, } assert pipeline.get_params(deep=True) == expected_params @@ -540,8 +550,11 @@ def make(): # for other methods, ensure no AttributeErrors on None: other_methods = [ - 'predict_proba', 'predict_log_proba', 'decision_function', 'transform', - 'score' + "predict_proba", + "predict_log_proba", + "decision_function", + "transform", + "score", ] for method in other_methods: getattr(pipeline, method)(X) @@ -562,11 +575,11 @@ def make(): assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) with raises(AttributeError, match="has no attribute 'predict'"): - getattr(pipeline, 'predict') + getattr(pipeline, "predict") # Check 'passthrough' step at construction time exp = 2 * 5 - pipeline = Pipeline([('m2', mult2), ('m3', passthrough), ('last', mult5)]) + pipeline = Pipeline([("m2", mult2), ("m3", passthrough), ("last", mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) @@ -579,25 +592,25 @@ def test_pipeline_ducktyping(): pipeline.inverse_transform pipeline = make_pipeline(Transf()) - assert not hasattr(pipeline, 'predict') + assert not hasattr(pipeline, "predict") pipeline.transform pipeline.inverse_transform - pipeline = make_pipeline('passthrough') - assert pipeline.steps[0] == ('passthrough', 'passthrough') - assert not hasattr(pipeline, 'predict') + pipeline = make_pipeline("passthrough") + assert pipeline.steps[0] == ("passthrough", "passthrough") + assert not hasattr(pipeline, "predict") pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(Transf(), NoInvTransf()) - assert not hasattr(pipeline, 'predict') + assert not hasattr(pipeline, "predict") pipeline.transform - assert not hasattr(pipeline, 'inverse_transform') + assert not hasattr(pipeline, "inverse_transform") pipeline = make_pipeline(NoInvTransf(), 
Transf()) - assert not hasattr(pipeline, 'predict') + assert not hasattr(pipeline, "predict") pipeline.transform - assert not hasattr(pipeline, 'inverse_transform') + assert not hasattr(pipeline, "inverse_transform") def test_make_pipeline(): @@ -625,9 +638,10 @@ def test_classes_property(): with raises(AttributeError): getattr(reg, "classes_") - clf = make_pipeline(SelectKBest(k=1), - LogisticRegression(solver='lbfgs', multi_class='auto', - random_state=0)) + clf = make_pipeline( + SelectKBest(k=1), + LogisticRegression(solver="lbfgs", multi_class="auto", random_state=0), + ) with raises(AttributeError): getattr(clf, "classes_") clf.fit(X, y) @@ -643,8 +657,8 @@ def test_pipeline_wrong_memory(): # Define memory as an integer memory = 1 cached_pipe = Pipeline( - [('transf', DummyTransf()), ('svc', SVC(gamma='scale'))], - memory=memory) + [("transf", DummyTransf()), ("svc", SVC(gamma="scale"))], memory=memory + ) error_regex = "string or have the same interface as" with raises(ValueError, match=error_regex): cached_pipe.fit(X, y) @@ -658,26 +672,30 @@ def test_pipeline_memory_transformer(): try: memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) + clf = SVC(gamma="scale", probability=True, random_state=0) transf = DummyTransf() - pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) + pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline( - [('transf', transf), ('svc', clf)], memory=memory) + [("transf", transf), ("svc", clf)], memory=memory + ) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline - expected_ts = cached_pipe.named_steps['transf'].timestamp_ + expected_ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) 
assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal( - pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X) + ) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) - assert_array_equal(pipe.named_steps['transf'].means_, - cached_pipe.named_steps['transf'].means_) - assert not hasattr(transf, 'means_') + assert_array_equal( + pipe.named_steps["transf"].means_, + cached_pipe.named_steps["transf"].means_, + ) + assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) @@ -685,29 +703,37 @@ def test_pipeline_memory_transformer(): assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal( - pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X) + ) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) - assert_array_equal(pipe.named_steps['transf'].means_, - cached_pipe.named_steps['transf'].means_) - assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts + assert_array_equal( + pipe.named_steps["transf"].means_, + cached_pipe.named_steps["transf"].means_, + ) + assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit - clf_2 = SVC(gamma='scale', probability=True, random_state=0) + clf_2 = SVC(gamma="scale", probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline( - [('transf_2', transf_2), ('svc', clf_2)], memory=memory) + [("transf_2", transf_2), ("svc", clf_2)], memory=memory + ) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal( - 
pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) + pipe.predict_proba(X), cached_pipe_2.predict_proba(X) + ) assert_array_equal( - pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) + pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X) + ) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) - assert_array_equal(pipe.named_steps['transf'].means_, - cached_pipe_2.named_steps['transf_2'].means_) - assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts + assert_array_equal( + pipe.named_steps["transf"].means_, + cached_pipe_2.named_steps["transf_2"].means_, + ) + assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts finally: shutil.rmtree(cachedir) @@ -723,31 +749,36 @@ def test_pipeline_memory_sampler(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) + random_state=0, + ) cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) + clf = SVC(gamma="scale", probability=True, random_state=0) transf = DummySampler() - pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) + pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline( - [('transf', transf), ('svc', clf)], memory=memory) + [("transf", transf), ("svc", clf)], memory=memory + ) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline - expected_ts = cached_pipe.named_steps['transf'].timestamp_ + expected_ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal( - pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X) + ) 
assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) - assert_array_equal(pipe.named_steps['transf'].means_, - cached_pipe.named_steps['transf'].means_) - assert not hasattr(transf, 'means_') + assert_array_equal( + pipe.named_steps["transf"].means_, + cached_pipe.named_steps["transf"].means_, + ) + assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) @@ -755,29 +786,37 @@ def test_pipeline_memory_sampler(): assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal( - pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X) + ) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) - assert_array_equal(pipe.named_steps['transf'].means_, - cached_pipe.named_steps['transf'].means_) - assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts + assert_array_equal( + pipe.named_steps["transf"].means_, + cached_pipe.named_steps["transf"].means_, + ) + assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit - clf_2 = SVC(gamma='scale', probability=True, random_state=0) + clf_2 = SVC(gamma="scale", probability=True, random_state=0) transf_2 = DummySampler() cached_pipe_2 = Pipeline( - [('transf_2', transf_2), ('svc', clf_2)], memory=memory) + [("transf_2", transf_2), ("svc", clf_2)], memory=memory + ) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal( - pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) + pipe.predict_proba(X), cached_pipe_2.predict_proba(X) + ) assert_array_equal( - pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) + pipe.predict_log_proba(X), 
cached_pipe_2.predict_log_proba(X) + ) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) - assert_array_equal(pipe.named_steps['transf'].means_, - cached_pipe_2.named_steps['transf_2'].means_) - assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts + assert_array_equal( + pipe.named_steps["transf"].means_, + cached_pipe_2.named_steps["transf_2"].means_, + ) + assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts finally: shutil.rmtree(cachedir) @@ -794,13 +833,14 @@ def test_pipeline_methods_pca_rus_svm(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) + random_state=0, + ) # Test with PCA + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) + clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) - pipe = Pipeline([('pca', pca), ('rus', rus), ('svc', clf)]) + pipe = Pipeline([("pca", pca), ("rus", rus), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) @@ -820,13 +860,14 @@ def test_pipeline_methods_rus_pca_svm(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) + random_state=0, + ) # Test with PCA + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) + clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) - pipe = Pipeline([('rus', rus), ('pca', pca), ('svc', clf)]) + pipe = Pipeline([("rus", rus), ("pca", pca), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) @@ -847,10 +888,11 @@ def test_pipeline_sample(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) + random_state=0, + ) rus = RandomUnderSampler(random_state=0) - pipeline = Pipeline([('rus', rus)]) + pipeline = Pipeline([("rus", rus)]) # test transform and fit_transform: X_trans, y_trans = pipeline.fit_resample(X, y) @@ -859,7 +901,7 @@ def test_pipeline_sample(): assert_allclose(y_trans, y_trans2, 
rtol=R_TOL) pca = PCA() - pipeline = Pipeline([('pca', PCA()), ('rus', rus)]) + pipeline = Pipeline([("pca", PCA()), ("rus", rus)]) X_trans, y_trans = pipeline.fit_resample(X, y) X_pca = pca.fit_transform(X) @@ -885,12 +927,13 @@ def test_pipeline_sample_transform(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) + random_state=0, + ) rus = RandomUnderSampler(random_state=0) pca = PCA() pca2 = PCA() - pipeline = Pipeline([('pca', pca), ('rus', rus), ('pca2', pca2)]) + pipeline = Pipeline([("pca", pca), ("rus", rus), ("pca2", pca2)]) pipeline.fit(X, y).transform(X) @@ -907,8 +950,9 @@ def test_pipeline_none_classifier(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) - clf = LogisticRegression(solver='lbfgs', random_state=0) + random_state=0, + ) + clf = LogisticRegression(solver="lbfgs", random_state=0) pipe = make_pipeline(None, clf) pipe.fit(X, y) pipe.predict(X) @@ -929,8 +973,9 @@ def test_pipeline_none_sampler_classifier(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) - clf = LogisticRegression(solver='lbfgs', random_state=0) + random_state=0, + ) + clf = LogisticRegression(solver="lbfgs", random_state=0) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(None, rus, clf) pipe.fit(X, y) @@ -952,8 +997,9 @@ def test_pipeline_sampler_none_classifier(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) - clf = LogisticRegression(solver='lbfgs', random_state=0) + random_state=0, + ) + clf = LogisticRegression(solver="lbfgs", random_state=0) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(rus, None, clf) pipe.fit(X, y) @@ -975,7 +1021,8 @@ def test_pipeline_none_sampler_sample(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) + random_state=0, + ) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(None, rus) @@ -995,7 +1042,8 @@ def test_pipeline_none_transformer(): n_features=20, n_clusters_per_class=1, 
n_samples=5000, - random_state=0) + random_state=0, + ) pca = PCA(whiten=True) pipe = make_pipeline(None, pca) @@ -1017,12 +1065,13 @@ def test_pipeline_methods_anova_rus(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) + random_state=0, + ) # Test with RandomUnderSampling + Anova + LogisticRegression - clf = LogisticRegression(solver='lbfgs') + clf = LogisticRegression(solver="lbfgs") rus = RandomUnderSampler(random_state=0) filter1 = SelectKBest(f_classif, k=2) - pipe = Pipeline([('rus', rus), ('anova', filter1), ('logistic', clf)]) + pipe = Pipeline([("rus", rus), ("anova", filter1), ("logistic", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) @@ -1042,11 +1091,12 @@ def test_pipeline_with_step_that_implements_both_sample_and_transform(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) + random_state=0, + ) - clf = LogisticRegression(solver='lbfgs') + clf = LogisticRegression(solver="lbfgs") with raises(TypeError): - Pipeline([('step', FitTransformSample()), ('logistic', clf)]) + Pipeline([("step", FitTransformSample()), ("logistic", clf)]) def test_pipeline_with_step_that_it_is_pipeline(): @@ -1061,14 +1111,15 @@ def test_pipeline_with_step_that_it_is_pipeline(): n_features=20, n_clusters_per_class=1, n_samples=5000, - random_state=0) + random_state=0, + ) # Test with RandomUnderSampling + Anova + LogisticRegression - clf = LogisticRegression(solver='lbfgs') + clf = LogisticRegression(solver="lbfgs") rus = RandomUnderSampler(random_state=0) filter1 = SelectKBest(f_classif, k=2) - pipe1 = Pipeline([('rus', rus), ('anova', filter1)]) + pipe1 = Pipeline([("rus", rus), ("anova", filter1)]) with raises(TypeError): - Pipeline([('pipe1', pipe1), ('logistic', clf)]) + Pipeline([("pipe1", pipe1), ("logistic", clf)]) def test_pipeline_fit_then_sample_with_sampler_last_estimator(): @@ -1082,13 +1133,15 @@ def test_pipeline_fit_then_sample_with_sampler_last_estimator(): n_features=20, n_clusters_per_class=1, 
n_samples=50000, - random_state=0) + random_state=0, + ) rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn) - X_fit_resample_resampled, y_fit_resample_resampled = \ - pipeline.fit_resample(X, y) + X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample( + X, y + ) pipeline = make_pipeline(rus, enn) pipeline.fit(X, y) X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) @@ -1107,13 +1160,15 @@ def test_pipeline_fit_then_sample_3_samplers_with_sampler_last_estimator(): n_features=20, n_clusters_per_class=1, n_samples=50000, - random_state=0) + random_state=0, + ) rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn, rus) - X_fit_resample_resampled, y_fit_resample_resampled = \ - pipeline.fit_resample(X, y) + X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample( + X, y + ) pipeline = make_pipeline(rus, enn, rus) pipeline.fit(X, y) X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) @@ -1125,10 +1180,11 @@ def test_make_pipeline_memory(): cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) - pipeline = make_pipeline(DummyTransf(), SVC(gamma='scale'), - memory=memory) + pipeline = make_pipeline( + DummyTransf(), SVC(gamma="scale"), memory=memory + ) assert pipeline.memory is memory - pipeline = make_pipeline(DummyTransf(), SVC(gamma='scale')) + pipeline = make_pipeline(DummyTransf(), SVC(gamma="scale")) assert pipeline.memory is None finally: shutil.rmtree(cachedir) @@ -1137,10 +1193,10 @@ def test_make_pipeline_memory(): def test_predict_with_predict_params(): # tests that Pipeline passes predict_params to the final estimator # when predict is invoked - pipe = Pipeline([('transf', Transf()), ('clf', DummyEstimatorParams())]) + pipe = Pipeline([("transf", Transf()), ("clf", DummyEstimatorParams())]) pipe.fit(None, None) pipe.predict(X=None, got_attribute=True) - assert 
pipe.named_steps['clf'].got_attribute + assert pipe.named_steps["clf"].got_attribute def test_resampler_last_stage_passthrough(): @@ -1155,7 +1211,8 @@ def test_resampler_last_stage_passthrough(): n_features=20, n_clusters_per_class=1, n_samples=50000, - random_state=0) + random_state=0, + ) rus = RandomUnderSampler(random_state=42) pipe = make_pipeline(rus, None) @@ -1173,14 +1230,15 @@ def test_pipeline_score_samples_pca_lof(): n_features=20, n_clusters_per_class=1, n_samples=500, - random_state=0) + random_state=0, + ) # Test that the score_samples method is implemented on a pipeline. # Test that the score_samples method on pipeline yields same results as # applying transform and score_samples steps separately. rus = RandomUnderSampler(random_state=42) - pca = PCA(svd_solver='full', n_components='mle', whiten=True) + pca = PCA(svd_solver="full", n_components="mle", whiten=True) lof = LocalOutlierFactor(novelty=True) - pipe = Pipeline([('rus', rus), ('pca', pca), ('lof', lof)]) + pipe = Pipeline([("rus", rus), ("pca", pca), ("lof", lof)]) pipe.fit(X, y) # Check the shapes assert pipe.score_samples(X).shape == (X.shape[0],) @@ -1197,7 +1255,9 @@ def test_score_samples_on_pipeline_without_score_samples(): # step of the pipeline does not have score_samples defined. 
pipe = make_pipeline(LogisticRegression()) pipe.fit(X, y) - with pytest.raises(AttributeError, - match="'LogisticRegression' object has no attribute " - "'score_samples'"): + with pytest.raises( + AttributeError, + match="'LogisticRegression' object has no attribute " + "'score_samples'", + ): pipe.score_samples(X) diff --git a/imblearn/under_sampling/__init__.py b/imblearn/under_sampling/__init__.py index 4324833e5..55872743b 100644 --- a/imblearn/under_sampling/__init__.py +++ b/imblearn/under_sampling/__init__.py @@ -17,8 +17,15 @@ from ._prototype_selection import InstanceHardnessThreshold __all__ = [ - 'ClusterCentroids', 'RandomUnderSampler', 'InstanceHardnessThreshold', - 'NearMiss', 'TomekLinks', 'EditedNearestNeighbours', - 'RepeatedEditedNearestNeighbours', 'AllKNN', 'OneSidedSelection', - 'CondensedNearestNeighbour', 'NeighbourhoodCleaningRule' + "ClusterCentroids", + "RandomUnderSampler", + "InstanceHardnessThreshold", + "NearMiss", + "TomekLinks", + "EditedNearestNeighbours", + "RepeatedEditedNearestNeighbours", + "AllKNN", + "OneSidedSelection", + "CondensedNearestNeighbour", + "NeighbourhoodCleaningRule", ] diff --git a/imblearn/under_sampling/_prototype_generation/__init__.py b/imblearn/under_sampling/_prototype_generation/__init__.py index 1bd6a8885..13deacc62 100644 --- a/imblearn/under_sampling/_prototype_generation/__init__.py +++ b/imblearn/under_sampling/_prototype_generation/__init__.py @@ -5,4 +5,4 @@ from ._cluster_centroids import ClusterCentroids -__all__ = ['ClusterCentroids'] +__all__ = ["ClusterCentroids"] diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index f98a9a43d..3de4060eb 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -18,12 +18,13 @@ from ...utils import Substitution from ...utils._docstring import 
_random_state_docstring -VOTING_KIND = ('auto', 'hard', 'soft') +VOTING_KIND = ("auto", "hard", "soft") @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class ClusterCentroids(BaseUnderSampler): """Perform under-sampling by generating centroids based on clustering methods. @@ -85,12 +86,14 @@ class ClusterCentroids(BaseUnderSampler): """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - estimator=None, - voting='auto', - n_jobs=1): + def __init__( + self, + sampling_strategy="auto", + random_state=None, + estimator=None, + voting="auto", + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator @@ -101,19 +104,23 @@ def _validate_estimator(self): """Private function to create the KMeans estimator""" if self.estimator is None: self.estimator_ = KMeans( - random_state=self.random_state, n_jobs=self.n_jobs) + random_state=self.random_state, n_jobs=self.n_jobs + ) elif isinstance(self.estimator, KMeans): self.estimator_ = clone(self.estimator) else: - raise ValueError('`estimator` has to be a KMeans clustering.' - ' Got {} instead.'.format(type(self.estimator))) + raise ValueError( + "`estimator` has to be a KMeans clustering." 
+ " Got {} instead.".format(type(self.estimator)) + ) def _generate_sample(self, X, y, centroids, target_class): - if self.voting_ == 'hard': + if self.voting_ == "hard": nearest_neighbors = NearestNeighbors(n_neighbors=1) nearest_neighbors.fit(X, y) indices = nearest_neighbors.kneighbors( - centroids, return_distance=False) + centroids, return_distance=False + ) X_new = safe_indexing(X, np.squeeze(indices)) else: if sparse.issparse(X): @@ -127,26 +134,29 @@ def _generate_sample(self, X, y, centroids, target_class): def _fit_resample(self, X, y): self._validate_estimator() - if self.voting == 'auto': + if self.voting == "auto": if sparse.issparse(X): - self.voting_ = 'hard' + self.voting_ = "hard" else: - self.voting_ = 'soft' + self.voting_ = "soft" else: if self.voting in VOTING_KIND: self.voting_ = self.voting else: - raise ValueError("'voting' needs to be one of {}. Got {}" - " instead.".format(VOTING_KIND, self.voting)) + raise ValueError( + "'voting' needs to be one of {}. Got {}" + " instead.".format(VOTING_KIND, self.voting) + ) X_resampled, y_resampled = [], [] for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] - self.estimator_.set_params(**{'n_clusters': n_samples}) + self.estimator_.set_params(**{"n_clusters": n_samples}) self.estimator_.fit(X[y == target_class]) X_new, y_new = self._generate_sample( - X, y, self.estimator_.cluster_centers_, target_class) + X, y, self.estimator_.cluster_centers_, target_class + ) X_resampled.append(X_new) y_resampled.append(y_new) else: @@ -163,4 +173,4 @@ def _fit_resample(self, X, y): return X_resampled, np.array(y_resampled, dtype=y.dtype) def _more_tags(self): - return {'sample_indices': False} + return {"sample_indices": False} diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 9e24e5a61..67363144c 100644 --- 
a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -12,18 +12,26 @@ from imblearn.under_sampling import ClusterCentroids RND_SEED = 0 -X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ - 0.20792588, 1.49407907 -], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], - [0.09125309, -0.85409574], [0.12372842, 0.6536186], - [0.13347175, 0.12167502], [0.094035, -2.55298982]]) +X = np.array( + [ + [0.04352327, -0.20515826], + [0.92923648, 0.76103773], + [0.20792588, 1.49407907], + [0.47104475, 0.44386323], + [0.22950086, 0.33367433], + [0.15490546, 0.3130677], + [0.09125309, -0.85409574], + [0.12372842, 0.6536186], + [0.13347175, 0.12167502], + [0.094035, -2.55298982], + ] +) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) R_TOL = 1e-4 @pytest.mark.parametrize( - "X, expected_voting", - [(X, 'soft'), (sparse.csr_matrix(X), 'hard')] + "X, expected_voting", [(X, "soft"), (sparse.csr_matrix(X), "hard")] ) def test_fit_resample_check_voting(X, expected_voting): cc = ClusterCentroids(random_state=RND_SEED) @@ -32,9 +40,10 @@ def test_fit_resample_check_voting(X, expected_voting): def test_fit_resample_auto(): - sampling_strategy = 'auto' + sampling_strategy = "auto" cc = ClusterCentroids( - sampling_strategy=sampling_strategy, random_state=RND_SEED) + sampling_strategy=sampling_strategy, random_state=RND_SEED + ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (6, 2) assert y_resampled.shape == (6,) @@ -43,7 +52,8 @@ def test_fit_resample_auto(): def test_fit_resample_half(): sampling_strategy = {0: 3, 1: 6} cc = ClusterCentroids( - sampling_strategy=sampling_strategy, random_state=RND_SEED) + sampling_strategy=sampling_strategy, random_state=RND_SEED + ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (9, 2) assert y_resampled.shape == (9,) @@ -62,12 +72,13 @@ def 
test_multiclass_fit_resample(): def test_fit_resample_object(): - sampling_strategy = 'auto' + sampling_strategy = "auto" cluster = KMeans(random_state=RND_SEED) cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED, - estimator=cluster) + estimator=cluster, + ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (6, 2) @@ -75,14 +86,15 @@ def test_fit_resample_object(): def test_fit_hard_voting(): - sampling_strategy = 'auto' - voting = 'hard' + sampling_strategy = "auto" + voting = "hard" cluster = KMeans(random_state=RND_SEED) cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED, estimator=cluster, - voting=voting) + voting=voting, + ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (6, 2) @@ -93,8 +105,10 @@ def test_fit_hard_voting(): @pytest.mark.parametrize( "cluster_centroids_params, err_msg", - [({"estimator": "rnd"}, "has to be a KMeans clustering"), - ({"voting": "unknown"}, "needs to be one of")] + [ + ({"estimator": "rnd"}, "has to be a KMeans clustering"), + ({"voting": "unknown"}, "needs to be one of"), + ], ) def test_fit_resample_error(cluster_centroids_params, err_msg): cc = ClusterCentroids(**cluster_centroids_params) diff --git a/imblearn/under_sampling/_prototype_selection/__init__.py b/imblearn/under_sampling/_prototype_selection/__init__.py index d721fbe49..9ed9455c4 100644 --- a/imblearn/under_sampling/_prototype_selection/__init__.py +++ b/imblearn/under_sampling/_prototype_selection/__init__.py @@ -15,8 +15,14 @@ from ._instance_hardness_threshold import InstanceHardnessThreshold __all__ = [ - 'RandomUnderSampler', 'InstanceHardnessThreshold', 'NearMiss', - 'TomekLinks', 'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours', - 'AllKNN', 'OneSidedSelection', 'CondensedNearestNeighbour', - 'NeighbourhoodCleaningRule' + "RandomUnderSampler", + "InstanceHardnessThreshold", + "NearMiss", + "TomekLinks", + 
"EditedNearestNeighbours", + "RepeatedEditedNearestNeighbours", + "AllKNN", + "OneSidedSelection", + "CondensedNearestNeighbour", + "NeighbourhoodCleaningRule", ] diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 98c049ce1..06d79070c 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -23,7 +23,8 @@ @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class CondensedNearestNeighbour(BaseCleaningSampler): """Class to perform under-sampling based on the condensed nearest neighbour method. @@ -91,12 +92,14 @@ class CondensedNearestNeighbour(BaseCleaningSampler): """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - n_neighbors=None, - n_seeds_S=1, - n_jobs=1): + def __init__( + self, + sampling_strategy="auto", + random_state=None, + n_neighbors=None, + n_seeds_S=1, + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.n_neighbors = n_neighbors @@ -107,16 +110,20 @@ def _validate_estimator(self): """Private function to create the NN estimator""" if self.n_neighbors is None: self.estimator_ = KNeighborsClassifier( - n_neighbors=1, n_jobs=self.n_jobs) + n_neighbors=1, n_jobs=self.n_jobs + ) elif isinstance(self.n_neighbors, int): self.estimator_ = KNeighborsClassifier( - n_neighbors=self.n_neighbors, n_jobs=self.n_jobs) + n_neighbors=self.n_neighbors, n_jobs=self.n_jobs + ) elif isinstance(self.n_neighbors, KNeighborsClassifier): self.estimator_ = clone(self.n_neighbors) else: - raise ValueError('`n_neighbors` has to be a int or an object' - ' inhereited from KNeighborsClassifier.' 
- ' Got {} instead.'.format(type(self.n_neighbors))) + raise ValueError( + "`n_neighbors` has to be a int or an object" + " inhereited from KNeighborsClassifier." + " Got {} instead.".format(type(self.n_neighbors)) + ) def _fit_resample(self, X, y): self._validate_estimator() @@ -124,21 +131,25 @@ def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): # Randomly get one sample from the majority class # Generate the index to select idx_maj = np.flatnonzero(y == target_class) - idx_maj_sample = idx_maj[random_state.randint( - low=0, - high=target_stats[target_class], - size=self.n_seeds_S)] + idx_maj_sample = idx_maj[ + random_state.randint( + low=0, + high=target_stats[target_class], + size=self.n_seeds_S, + ) + ] # Create the set C - One majority samples and all minority C_indices = np.append( - np.flatnonzero(y == class_minority), idx_maj_sample) + np.flatnonzero(y == class_minority), idx_maj_sample + ) C_x = safe_indexing(X, C_indices) C_y = safe_indexing(y, C_indices) @@ -167,8 +178,9 @@ def _fit_resample(self, X, y): # append it in C_x if y_sam != pred_y: # Keep the index for later - idx_maj_sample = np.append(idx_maj_sample, - idx_maj[idx_sam]) + idx_maj_sample = np.append( + idx_maj_sample, idx_maj[idx_sam] + ) # Update C C_indices = np.append(C_indices, idx_maj[idx_sam]) @@ -183,17 +195,20 @@ def _fit_resample(self, X, y): # well classified elements pred_S_y = self.estimator_.predict(S_x) good_classif_label = np.unique( - np.append(idx_maj_sample, - np.flatnonzero(pred_S_y == S_y))) + np.append( + idx_maj_sample, np.flatnonzero(pred_S_y == S_y) + ) + ) idx_under = np.concatenate((idx_under, idx_maj_sample), axis=0) else: idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == 
target_class)), axis=0) + (idx_under, np.flatnonzero(y == target_class)), axis=0 + ) self.sample_indices_ = idx_under return safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): - return {'sample_indices': True} + return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index e77907e9c..f70042b38 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -17,11 +17,12 @@ from ...utils import check_neighbors_object from ...utils import Substitution -SEL_KIND = ('all', 'mode') +SEL_KIND = ("all", "mode") @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring +) class EditedNearestNeighbours(BaseCleaningSampler): """Class to perform under-sampling based on the edited nearest neighbour method. 
@@ -92,11 +93,9 @@ class EditedNearestNeighbours(BaseCleaningSampler): """ - def __init__(self, - sampling_strategy='auto', - n_neighbors=3, - kind_sel='all', - n_jobs=1): + def __init__( + self, sampling_strategy="auto", n_neighbors=3, kind_sel="all", n_jobs=1 + ): super().__init__(sampling_strategy=sampling_strategy) self.n_neighbors = n_neighbors self.kind_sel = kind_sel @@ -105,8 +104,9 @@ def __init__(self, def _validate_estimator(self): """Validate the estimator created in the ENN.""" self.nn_ = check_neighbors_object( - 'n_neighbors', self.n_neighbors, additional_neighbor=1) - self.nn_.set_params(**{'n_jobs': self.n_jobs}) + "n_neighbors", self.n_neighbors, additional_neighbor=1 + ) + self.nn_.set_params(**{"n_jobs": self.n_jobs}) if self.kind_sel not in SEL_KIND: raise NotImplementedError @@ -114,7 +114,7 @@ def _validate_estimator(self): def _fit_resample(self, X, y): self._validate_estimator() - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0,), dtype=int) self.nn_.fit(X) @@ -124,12 +124,13 @@ def _fit_resample(self, X, y): X_class = safe_indexing(X, target_class_indices) y_class = safe_indexing(y, target_class_indices) nnhood_idx = self.nn_.kneighbors( - X_class, return_distance=False)[:, 1:] + X_class, return_distance=False + )[:, 1:] nnhood_label = y[nnhood_idx] - if self.kind_sel == 'mode': + if self.kind_sel == "mode": nnhood_label, _ = mode(nnhood_label, axis=1) nnhood_bool = np.ravel(nnhood_label) == y_class - elif self.kind_sel == 'all': + elif self.kind_sel == "all": nnhood_label = nnhood_label == target_class nnhood_bool = np.all(nnhood_label, axis=1) index_target_class = np.flatnonzero(nnhood_bool) @@ -137,20 +138,24 @@ def _fit_resample(self, X, y): index_target_class = slice(None) idx_under = np.concatenate( - (idx_under, - np.flatnonzero(y == target_class)[index_target_class]), - axis=0) + ( + idx_under, + np.flatnonzero(y == target_class)[index_target_class], + ), + axis=0, + ) self.sample_indices_ = idx_under return 
safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): - return {'sample_indices': True} + return {"sample_indices": True} @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring +) class RepeatedEditedNearestNeighbours(BaseCleaningSampler): """Class to perform under-sampling based on the repeated edited nearest neighbour method. @@ -225,12 +230,14 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): """ - def __init__(self, - sampling_strategy='auto', - n_neighbors=3, - max_iter=100, - kind_sel='all', - n_jobs=1): + def __init__( + self, + sampling_strategy="auto", + n_neighbors=3, + max_iter=100, + kind_sel="all", + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.n_neighbors = n_neighbors self.kind_sel = kind_sel @@ -240,17 +247,21 @@ def __init__(self, def _validate_estimator(self): """Private function to create the NN estimator""" if self.max_iter < 2: - raise ValueError('max_iter must be greater than 1.' - ' Got {} instead.'.format(type(self.max_iter))) + raise ValueError( + "max_iter must be greater than 1." + " Got {} instead.".format(type(self.max_iter)) + ) self.nn_ = check_neighbors_object( - 'n_neighbors', self.n_neighbors, additional_neighbor=1) + "n_neighbors", self.n_neighbors, additional_neighbor=1 + ) self.enn_ = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, n_neighbors=self.nn_, kind_sel=self.kind_sel, - n_jobs=self.n_jobs) + n_jobs=self.n_jobs, + ) def _fit_resample(self, X, y): self._validate_estimator() @@ -272,29 +283,35 @@ def _fit_resample(self, X, y): # 3. 
If one of the class is disappearing # Case 1 - b_conv = (prev_len == y_enn.shape[0]) + b_conv = prev_len == y_enn.shape[0] # Case 2 stats_enn = Counter(y_enn) - count_non_min = np.array([ - val for val, key in zip(stats_enn.values(), stats_enn.keys()) - if key != class_minority - ]) + count_non_min = np.array( + [ + val + for val, key in zip(stats_enn.values(), stats_enn.keys()) + if key != class_minority + ] + ) b_min_bec_maj = np.any( - count_non_min < target_stats[class_minority]) + count_non_min < target_stats[class_minority] + ) # Case 3 - b_remove_maj_class = (len(stats_enn) < len(target_stats)) + b_remove_maj_class = len(stats_enn) < len(target_stats) X_, y_, = X_enn, y_enn self.sample_indices_ = self.sample_indices_[ - self.enn_.sample_indices_] + self.enn_.sample_indices_ + ] if b_conv or b_min_bec_maj or b_remove_maj_class: if b_conv: X_, y_, = X_enn, y_enn self.sample_indices_ = self.sample_indices_[ - self.enn_.sample_indices_] + self.enn_.sample_indices_ + ] break X_resampled, y_resampled = X_, y_ @@ -302,11 +319,12 @@ def _fit_resample(self, X, y): return X_resampled, y_resampled def _more_tags(self): - return {'sample_indices': True} + return {"sample_indices": True} @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring +) class AllKNN(BaseCleaningSampler): """Class to perform under-sampling based on the AllKNN method. @@ -383,12 +401,14 @@ class without early stopping. 
""" - def __init__(self, - sampling_strategy='auto', - n_neighbors=3, - kind_sel='all', - allow_minority=False, - n_jobs=1): + def __init__( + self, + sampling_strategy="auto", + n_neighbors=3, + kind_sel="all", + allow_minority=False, + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.n_neighbors = n_neighbors self.kind_sel = kind_sel @@ -401,13 +421,15 @@ def _validate_estimator(self): raise NotImplementedError self.nn_ = check_neighbors_object( - 'n_neighbors', self.n_neighbors, additional_neighbor=1) + "n_neighbors", self.n_neighbors, additional_neighbor=1 + ) self.enn_ = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, n_neighbors=self.nn_, kind_sel=self.kind_sel, - n_jobs=self.n_jobs) + n_jobs=self.n_jobs, + ) def _fit_resample(self, X, y): self._validate_estimator() @@ -430,22 +452,27 @@ def _fit_resample(self, X, y): # Case 1else: stats_enn = Counter(y_enn) - count_non_min = np.array([ - val for val, key in zip(stats_enn.values(), stats_enn.keys()) - if key != class_minority - ]) + count_non_min = np.array( + [ + val + for val, key in zip(stats_enn.values(), stats_enn.keys()) + if key != class_minority + ] + ) b_min_bec_maj = np.any( - count_non_min < target_stats[class_minority]) + count_non_min < target_stats[class_minority] + ) if self.allow_minority: # overwrite b_min_bec_maj b_min_bec_maj = False # Case 2 - b_remove_maj_class = (len(stats_enn) < len(target_stats)) + b_remove_maj_class = len(stats_enn) < len(target_stats) X_, y_, = X_enn, y_enn self.sample_indices_ = self.sample_indices_[ - self.enn_.sample_indices_] + self.enn_.sample_indices_ + ] if b_min_bec_maj or b_remove_maj_class: break @@ -455,4 +482,4 @@ def _fit_resample(self, X, y): return X_resampled, y_resampled def _more_tags(self): - return {'sample_indices': True} + return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py 
b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 19bd112c1..7f0cd52b5 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -23,7 +23,8 @@ @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class InstanceHardnessThreshold(BaseUnderSampler): """Class to perform under-sampling based on the instance hardness threshold. @@ -89,12 +90,14 @@ class InstanceHardnessThreshold(BaseUnderSampler): """ - def __init__(self, - estimator=None, - sampling_strategy='auto', - random_state=None, - cv=5, - n_jobs=1): + def __init__( + self, + estimator=None, + sampling_strategy="auto", + random_state=None, + cv=5, + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator @@ -104,25 +107,32 @@ def __init__(self, def _validate_estimator(self): """Private function to create the classifier""" - if (self.estimator is not None and - isinstance(self.estimator, ClassifierMixin) and - hasattr(self.estimator, 'predict_proba')): + if ( + self.estimator is not None + and isinstance(self.estimator, ClassifierMixin) + and hasattr(self.estimator, "predict_proba") + ): self.estimator_ = clone(self.estimator) elif self.estimator is None: self.estimator_ = RandomForestClassifier( - n_estimators=100, random_state=self.random_state, - n_jobs=self.n_jobs) + n_estimators=100, + random_state=self.random_state, + n_jobs=self.n_jobs, + ) else: - raise ValueError('Invalid parameter `estimator`. Got {}.'.format( - type(self.estimator))) + raise ValueError( + "Invalid parameter `estimator`. 
Got {}.".format( + type(self.estimator) + ) + ) def _fit_resample(self, X, y): self._validate_estimator() target_stats = Counter(y) skf = StratifiedKFold( - n_splits=self.cv, shuffle=False, - random_state=self.random_state).split(X, y) + n_splits=self.cv, shuffle=False, random_state=self.random_state + ).split(X, y) probabilities = np.zeros(y.shape[0], dtype=float) for train_index, test_index in skf: @@ -136,27 +146,32 @@ def _fit_resample(self, X, y): probs = self.estimator_.predict_proba(X_test) probabilities[test_index] = probs[range(len(y_test)), y_test] - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] threshold = np.percentile( probabilities[y == target_class], - (1. - (n_samples / target_stats[target_class])) * 100.) + (1.0 - (n_samples / target_stats[target_class])) * 100.0, + ) index_target_class = np.flatnonzero( - probabilities[y == target_class] >= threshold) + probabilities[y == target_class] >= threshold + ) else: index_target_class = slice(None) idx_under = np.concatenate( - (idx_under, - np.flatnonzero(y == target_class)[index_target_class]), - axis=0) + ( + idx_under, + np.flatnonzero(y == target_class)[index_target_class], + ), + axis=0, + ) self.sample_indices_ = idx_under return safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): - return {'sample_indices': True} + return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index ca3897ef8..1d76fc0ee 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -16,8 +16,7 @@ from ...utils import Substitution -@Substitution( - sampling_strategy=BaseUnderSampler._sampling_strategy_docstring) 
+@Substitution(sampling_strategy=BaseUnderSampler._sampling_strategy_docstring) class NearMiss(BaseUnderSampler): """Class to perform under-sampling based on NearMiss methods. @@ -86,25 +85,23 @@ class NearMiss(BaseUnderSampler): """ - def __init__(self, - sampling_strategy='auto', - version=1, - n_neighbors=3, - n_neighbors_ver3=3, - n_jobs=1): + def __init__( + self, + sampling_strategy="auto", + version=1, + n_neighbors=3, + n_neighbors_ver3=3, + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.version = version self.n_neighbors = n_neighbors self.n_neighbors_ver3 = n_neighbors_ver3 self.n_jobs = n_jobs - def _selection_dist_based(self, - X, - y, - dist_vec, - num_samples, - key, - sel_strategy='nearest'): + def _selection_dist_based( + self, X, y, dist_vec, num_samples, key, sel_strategy="nearest" + ): """Select the appropriate samples depending of the strategy selected. Parameters @@ -135,20 +132,24 @@ def _selection_dist_based(self, """ # Compute the distance considering the farthest neighbour - dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors:], axis=1) + dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors :], axis=1) target_class_indices = np.flatnonzero(y == key) - if (dist_vec.shape[0] != safe_indexing(X, - target_class_indices).shape[0]): - raise RuntimeError('The samples to be selected do not correspond' - ' to the distance matrix given. Ensure that' - ' both `X[y == key]` and `dist_vec` are' - ' related.') + if ( + dist_vec.shape[0] + != safe_indexing(X, target_class_indices).shape[0] + ): + raise RuntimeError( + "The samples to be selected do not correspond" + " to the distance matrix given. Ensure that" + " both `X[y == key]` and `dist_vec` are" + " related." 
+ ) # Sort the list of distance and get the index - if sel_strategy == 'nearest': + if sel_strategy == "nearest": sort_way = False - elif sel_strategy == 'farthest': + elif sel_strategy == "farthest": sort_way = True else: raise NotImplementedError @@ -156,15 +157,18 @@ def _selection_dist_based(self, sorted_idx = sorted( range(len(dist_avg_vec)), key=dist_avg_vec.__getitem__, - reverse=sort_way) + reverse=sort_way, + ) # Throw a warning to tell the user that we did not have enough samples # to select and that we just select everything if len(sorted_idx) < num_samples: - warnings.warn('The number of the samples to be selected is larger' - ' than the number of samples available. The' - ' balancing ratio cannot be ensure and all samples' - ' will be returned.') + warnings.warn( + "The number of the samples to be selected is larger" + " than the number of samples available. The" + " balancing ratio cannot be ensure and all samples" + " will be returned." + ) # Select the desired number of samples return sorted_idx[:num_samples] @@ -172,22 +176,25 @@ def _selection_dist_based(self, def _validate_estimator(self): """Private function to create the NN estimator""" - self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors) - self.nn_.set_params(**{'n_jobs': self.n_jobs}) + self.nn_ = check_neighbors_object("n_neighbors", self.n_neighbors) + self.nn_.set_params(**{"n_jobs": self.n_jobs}) if self.version == 3: - self.nn_ver3_ = check_neighbors_object('n_neighbors_ver3', - self.n_neighbors_ver3) - self.nn_ver3_.set_params(**{'n_jobs': self.n_jobs}) + self.nn_ver3_ = check_neighbors_object( + "n_neighbors_ver3", self.n_neighbors_ver3 + ) + self.nn_ver3_.set_params(**{"n_jobs": self.n_jobs}) if self.version not in (1, 2, 3): - raise ValueError('Parameter `version` must be 1, 2 or 3, got' - ' {}'.format(self.version)) + raise ValueError( + "Parameter `version` must be 1, 2 or 3, got" + " {}".format(self.version) + ) def _fit_resample(self, X, y): 
self._validate_estimator() - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0,), dtype=int) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) @@ -204,41 +211,48 @@ def _fit_resample(self, X, y): if self.version == 1: dist_vec, idx_vec = self.nn_.kneighbors( - X_class, n_neighbors=self.nn_.n_neighbors) + X_class, n_neighbors=self.nn_.n_neighbors + ) index_target_class = self._selection_dist_based( X, y, dist_vec, n_samples, target_class, - sel_strategy='nearest') + sel_strategy="nearest", + ) elif self.version == 2: dist_vec, idx_vec = self.nn_.kneighbors( - X_class, n_neighbors=target_stats[class_minority]) + X_class, n_neighbors=target_stats[class_minority] + ) index_target_class = self._selection_dist_based( X, y, dist_vec, n_samples, target_class, - sel_strategy='nearest') + sel_strategy="nearest", + ) elif self.version == 3: self.nn_ver3_.fit(X_class) dist_vec, idx_vec = self.nn_ver3_.kneighbors( - safe_indexing(X, minority_class_indices)) + safe_indexing(X, minority_class_indices) + ) idx_vec_farthest = np.unique(idx_vec.reshape(-1)) X_class_selected = safe_indexing(X_class, idx_vec_farthest) y_class_selected = safe_indexing(y_class, idx_vec_farthest) dist_vec, idx_vec = self.nn_.kneighbors( - X_class_selected, n_neighbors=self.nn_.n_neighbors) + X_class_selected, n_neighbors=self.nn_.n_neighbors + ) index_target_class = self._selection_dist_based( X_class_selected, y_class_selected, dist_vec, n_samples, target_class, - sel_strategy='farthest') + sel_strategy="farthest", + ) # idx_tmp is relative to the feature selected in the # previous step and we need to find the indirection index_target_class = idx_vec_farthest[index_target_class] @@ -246,13 +260,16 @@ def _fit_resample(self, X, y): index_target_class = slice(None) idx_under = np.concatenate( - (idx_under, - np.flatnonzero(y == target_class)[index_target_class]), - axis=0) + ( + idx_under, + np.flatnonzero(y == target_class)[index_target_class], + ), + 
axis=0, + ) self.sample_indices_ = idx_under return safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): - return {'sample_indices': True} + return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index ed9f0b3b8..4c43faf0a 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -16,11 +16,12 @@ from ...utils import check_neighbors_object from ...utils import Substitution -SEL_KIND = ('all', 'mode') +SEL_KIND = ("all", "mode") @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring +) class NeighbourhoodCleaningRule(BaseCleaningSampler): """Class performing under-sampling based on the neighbourhood cleaning rule. 
@@ -87,12 +88,14 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): """ - def __init__(self, - sampling_strategy='auto', - n_neighbors=3, - kind_sel='all', - threshold_cleaning=0.5, - n_jobs=1): + def __init__( + self, + sampling_strategy="auto", + n_neighbors=3, + kind_sel="all", + threshold_cleaning=0.5, + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.n_neighbors = n_neighbors self.kind_sel = kind_sel @@ -102,8 +105,9 @@ def __init__(self, def _validate_estimator(self): """Create the objects required by NCR.""" self.nn_ = check_neighbors_object( - 'n_neighbors', self.n_neighbors, additional_neighbor=1) - self.nn_.set_params(**{'n_jobs': self.n_jobs}) + "n_neighbors", self.n_neighbors, additional_neighbor=1 + ) + self.nn_.set_params(**{"n_jobs": self.n_jobs}) if self.kind_sel not in SEL_KIND: raise NotImplementedError @@ -111,15 +115,17 @@ def _validate_estimator(self): if self.threshold_cleaning > 1 or self.threshold_cleaning < 0: raise ValueError( "'threshold_cleaning' is a value between 0 and 1." 
- " Got {} instead.".format(self.threshold_cleaning)) + " Got {} instead.".format(self.threshold_cleaning) + ) def _fit_resample(self, X, y): self._validate_estimator() enn = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, n_neighbors=self.n_neighbors, - kind_sel='mode', - n_jobs=self.n_jobs) + kind_sel="mode", + n_jobs=self.n_jobs, + ) enn.fit_resample(X, y) index_not_a1 = enn.sample_indices_ index_a1 = np.ones(y.shape, dtype=bool) @@ -131,9 +137,12 @@ def _fit_resample(self, X, y): class_minority = min(target_stats, key=target_stats.get) # compute which classes to consider for cleaning for the A2 group classes_under_sample = [ - c for c, n_samples in target_stats.items() - if (c in self.sampling_strategy_.keys() and ( - n_samples > X.shape[0] * self.threshold_cleaning)) + c + for c, n_samples in target_stats.items() + if ( + c in self.sampling_strategy_.keys() + and (n_samples > X.shape[0] * self.threshold_cleaning) + ) ] self.nn_.fit(X) class_minority_indices = np.flatnonzero(y == class_minority) @@ -141,10 +150,10 @@ def _fit_resample(self, X, y): y_class = safe_indexing(y, class_minority_indices) nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] - if self.kind_sel == 'mode': + if self.kind_sel == "mode": nnhood_label_majority, _ = mode(nnhood_label, axis=1) nnhood_bool = np.ravel(nnhood_label_majority) == y_class - elif self.kind_sel == 'all': + elif self.kind_sel == "all": nnhood_label_majority = nnhood_label == class_minority nnhood_bool = np.all(nnhood_label, axis=1) else: @@ -152,15 +161,18 @@ def _fit_resample(self, X, y): # compute a2 group index_a2 = np.ravel(nnhood_idx[~nnhood_bool]) index_a2 = np.unique( - [index for index in index_a2 if y[index] in classes_under_sample]) + [index for index in index_a2 if y[index] in classes_under_sample] + ) union_a1_a2 = np.union1d(index_a1, index_a2).astype(int) selected_samples = np.ones(y.shape, dtype=bool) selected_samples[union_a1_a2] = False 
self.sample_indices_ = np.flatnonzero(selected_samples) - return (safe_indexing(X, self.sample_indices_), - safe_indexing(y, self.sample_indices_)) + return ( + safe_indexing(X, self.sample_indices_), + safe_indexing(y, self.sample_indices_), + ) def _more_tags(self): - return {'sample_indices': True} + return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 83b4d8735..94a08f332 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -21,7 +21,8 @@ @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class OneSidedSelection(BaseCleaningSampler): """Class to perform under-sampling based on one-sided selection method. @@ -85,12 +86,14 @@ class OneSidedSelection(BaseCleaningSampler): """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - n_neighbors=None, - n_seeds_S=1, - n_jobs=1): + def __init__( + self, + sampling_strategy="auto", + random_state=None, + n_neighbors=None, + n_seeds_S=1, + n_jobs=1, + ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.n_neighbors = n_neighbors @@ -101,16 +104,20 @@ def _validate_estimator(self): """Private function to create the NN estimator""" if self.n_neighbors is None: self.estimator_ = KNeighborsClassifier( - n_neighbors=1, n_jobs=self.n_jobs) + n_neighbors=1, n_jobs=self.n_jobs + ) elif isinstance(self.n_neighbors, int): self.estimator_ = KNeighborsClassifier( - n_neighbors=self.n_neighbors, n_jobs=self.n_jobs) + n_neighbors=self.n_neighbors, n_jobs=self.n_jobs + ) elif isinstance(self.n_neighbors, KNeighborsClassifier): self.estimator_ = clone(self.n_neighbors) else: - raise ValueError('`n_neighbors` has to be a int 
or an object' - ' inherited from KNeighborsClassifier.' - ' Got {} instead.'.format(type(self.n_neighbors))) + raise ValueError( + "`n_neighbors` has to be a int or an object" + " inherited from KNeighborsClassifier." + " Got {} instead.".format(type(self.n_neighbors)) + ) def _fit_resample(self, X, y): self._validate_estimator() @@ -119,15 +126,15 @@ def _fit_resample(self, X, y): target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): # select a sample from the current class idx_maj = np.flatnonzero(y == target_class) sel_idx_maj = random_state.randint( - low=0, high=target_stats[target_class], - size=self.n_seeds_S) + low=0, high=target_stats[target_class], size=self.n_seeds_S + ) idx_maj_sample = idx_maj[sel_idx_maj] minority_class_indices = np.flatnonzero(y == class_minority) @@ -149,17 +156,18 @@ def _fit_resample(self, X, y): S_misclassified_indices = np.flatnonzero(pred_S_y != S_y) idx_tmp = idx_maj_extracted[S_misclassified_indices] idx_under = np.concatenate( - (idx_under, idx_maj_sample, idx_tmp), axis=0) + (idx_under, idx_maj_sample, idx_tmp), axis=0 + ) else: idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)), axis=0) + (idx_under, np.flatnonzero(y == target_class)), axis=0 + ) X_resampled = safe_indexing(X, idx_under) y_resampled = safe_indexing(y, idx_under) # apply Tomek cleaning - tl = TomekLinks( - sampling_strategy=list(self.sampling_strategy_.keys())) + tl = TomekLinks(sampling_strategy=list(self.sampling_strategy_.keys())) X_cleaned, y_cleaned = tl.fit_resample(X_resampled, y_resampled) self.sample_indices_ = safe_indexing(idx_under, tl.sample_indices_) @@ -167,4 +175,4 @@ def _fit_resample(self, X, y): return X_cleaned, y_cleaned def _more_tags(self): - return {'sample_indices': True} + return {"sample_indices": True} diff --git 
a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 9743087e0..5f3c8c7b1 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -20,7 +20,8 @@ @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) + random_state=_random_state_docstring, +) class RandomUnderSampler(BaseUnderSampler): """Class to perform random under-sampling. @@ -70,10 +71,9 @@ class RandomUnderSampler(BaseUnderSampler): """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - replacement=False): + def __init__( + self, sampling_strategy="auto", random_state=None, replacement=False + ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.replacement = replacement @@ -81,16 +81,17 @@ def __init__(self, @staticmethod def _check_X_y(X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X = check_array(X, accept_sparse=['csr', 'csc'], dtype=None) - y = check_array(y, accept_sparse=['csr', 'csc'], dtype=None, - ensure_2d=False) + X = check_array(X, accept_sparse=["csr", "csc"], dtype=None) + y = check_array( + y, accept_sparse=["csr", "csc"], dtype=None, ensure_2d=False + ) check_consistent_length(X, y) return X, y, binarize_y def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): @@ -98,19 +99,22 @@ def _fit_resample(self, X, y): index_target_class = random_state.choice( range(np.count_nonzero(y == target_class)), size=n_samples, - replace=self.replacement) + replace=self.replacement, + ) else: index_target_class = slice(None) idx_under = np.concatenate( - (idx_under, - 
np.flatnonzero(y == target_class)[index_target_class]), - axis=0) + ( + idx_under, + np.flatnonzero(y == target_class)[index_target_class], + ), + axis=0, + ) self.sample_indices_ = idx_under return safe_indexing(X, idx_under), safe_indexing(y, idx_under) def _more_tags(self): - return {'X_types': ['2darray', 'string'], - 'sample_indices': True} + return {"X_types": ["2darray", "string"], "sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index bd058f3db..04402398a 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -16,7 +16,8 @@ @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring) + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring +) class TomekLinks(BaseCleaningSampler): """Class to perform under-sampling by removing Tomek's links. @@ -67,9 +68,7 @@ class TomekLinks(BaseCleaningSampler): """ - def __init__(self, - sampling_strategy='auto', - n_jobs=1): + def __init__(self, sampling_strategy="auto", n_jobs=1): super().__init__(sampling_strategy=sampling_strategy) self.n_jobs = n_jobs @@ -124,8 +123,10 @@ def _fit_resample(self, X, y): links = self.is_tomek(y, nns, self.sampling_strategy_) self.sample_indices_ = np.flatnonzero(np.logical_not(links)) - return (safe_indexing(X, self.sample_indices_), - safe_indexing(y, self.sample_indices_)) + return ( + safe_indexing(X, self.sample_indices_), + safe_indexing(y, self.sample_indices_), + ) def _more_tags(self): - return {'sample_indices': True} + return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index fef20b712..e165871cf 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ 
b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -13,35 +13,94 @@ from imblearn.under_sampling import AllKNN from imblearn.utils.testing import warns -X = np.array([[-0.12840393, 0.66446571], [1.32319756, -0.13181616], [ - 0.04296502, -0.37981873 -], [0.83631853, 0.18569783], [1.02956816, 0.36061601], [ - 1.12202806, 0.33811558 -], [-0.53171468, -0.53735182], [1.3381556, 0.35956356], [ - -0.35946678, 0.72510189 -], [1.32326943, 0.28393874], [2.94290565, -0.13986434], [ - 0.28294738, -1.00125525 -], [0.34218094, -0.58781961], [-0.88864036, -0.33782387], [ - -1.10146139, 0.91782682 -], [-0.7969716, -0.50493969], [0.73489726, 0.43915195], [ - 0.2096964, -0.61814058 -], [-0.28479268, 0.70459548], [1.84864913, 0.14729596], [ - 1.59068979, -0.96622933 -], [0.73418199, -0.02222847], [0.50307437, 0.498805], [0.84929742, 0.41042894], - [0.62649535, 0.46600596], [0.79270821, -0.41386668], [ - 1.16606871, -0.25641059 - ], [1.57356906, 0.30390519], [1.0304995, -0.16955962], [ - 1.67314371, 0.19231498 - ], [0.98382284, 0.37184502], [0.48921682, -1.38504507], [ - -0.46226554, -0.50481004 - ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [ - 0.80541964, -0.34465185 - ], [0.1732627, -1.61323172], [0.69804044, 0.44810796], - [-0.5506368, -0.42072426], [-0.34474418, 0.21969797]]) -Y = np.array([ - 1, 2, 2, 2, 1, 1, 0, 2, 1, 1, 1, 2, 2, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, - 2, 2, 2, 2, 1, 1, 2, 0, 2, 2, 2, 2, 1, 2, 0 -]) +X = np.array( + [ + [-0.12840393, 0.66446571], + [1.32319756, -0.13181616], + [0.04296502, -0.37981873], + [0.83631853, 0.18569783], + [1.02956816, 0.36061601], + [1.12202806, 0.33811558], + [-0.53171468, -0.53735182], + [1.3381556, 0.35956356], + [-0.35946678, 0.72510189], + [1.32326943, 0.28393874], + [2.94290565, -0.13986434], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [-0.88864036, -0.33782387], + [-1.10146139, 0.91782682], + [-0.7969716, -0.50493969], + [0.73489726, 0.43915195], + [0.2096964, -0.61814058], + 
[-0.28479268, 0.70459548], + [1.84864913, 0.14729596], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.50307437, 0.498805], + [0.84929742, 0.41042894], + [0.62649535, 0.46600596], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.57356906, 0.30390519], + [1.0304995, -0.16955962], + [1.67314371, 0.19231498], + [0.98382284, 0.37184502], + [0.48921682, -1.38504507], + [-0.46226554, -0.50481004], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], + [0.69804044, 0.44810796], + [-0.5506368, -0.42072426], + [-0.34474418, 0.21969797], + ] +) +Y = np.array( + [ + 1, + 2, + 2, + 2, + 1, + 1, + 0, + 2, + 1, + 1, + 1, + 2, + 2, + 0, + 1, + 2, + 1, + 2, + 1, + 1, + 2, + 2, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 1, + 1, + 2, + 0, + 2, + 2, + 2, + 2, + 1, + 2, + 0, + ] +) R_TOL = 1e-4 @@ -49,27 +108,68 @@ def test_allknn_fit_resample(): allknn = AllKNN() X_resampled, y_resampled = allknn.fit_resample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ - -0.46226554, -0.50481004 - ], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [ - 1.12202806, 0.33811558 - ], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [ - 0.50307437, 0.498805 - ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [ - 0.98382284, 0.37184502 - ], [0.69804044, 0.44810796], [0.04296502, -0.37981873], [ - 0.28294738, -1.00125525 - ], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [ - 1.59068979, -0.96622933 - ], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [ - 1.16606871, -0.25641059 - ], [1.0304995, -0.16955962], [0.48921682, -1.38504507], - [-0.03918551, -0.68540745], [0.24991051, -1.00864997], - [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) - y_gt = np.array([ - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2 - ]) + X_gt = np.array( + [ + [-0.53171468, -0.53735182], + [-0.88864036, -0.33782387], + [-0.46226554, 
-0.50481004], + [-0.34474418, 0.21969797], + [1.02956816, 0.36061601], + [1.12202806, 0.33811558], + [-1.10146139, 0.91782682], + [0.73489726, 0.43915195], + [0.50307437, 0.498805], + [0.84929742, 0.41042894], + [0.62649535, 0.46600596], + [0.98382284, 0.37184502], + [0.69804044, 0.44810796], + [0.04296502, -0.37981873], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [0.2096964, -0.61814058], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.0304995, -0.16955962], + [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], + ] + ) + y_gt = np.array( + [ + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + ] + ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_allclose(y_resampled, y_gt, rtol=R_TOL) @@ -85,7 +185,8 @@ def test_all_knn_allow_minority(): n_clusters_per_class=1, weights=[0.2, 0.3, 0.5], class_sep=0.4, - random_state=0) + random_state=0, + ) allknn = AllKNN(allow_minority=True) X_res_1, y_res_1 = allknn.fit_resample(X, y) @@ -95,72 +196,164 @@ def test_all_knn_allow_minority(): def test_allknn_fit_resample_mode(): - allknn = AllKNN(kind_sel='mode') + allknn = AllKNN(kind_sel="mode") X_resampled, y_resampled = allknn.fit_resample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ - -0.46226554, -0.50481004 - ], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [ - 1.02956816, 0.36061601 - ], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [ - -1.10146139, 0.91782682 - ], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [ - 0.50307437, 0.498805 - ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [ - 0.98382284, 0.37184502 - ], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [ - 0.04296502, -0.37981873 - ], [0.28294738, -1.00125525], [0.34218094, 
-0.58781961], [ - 0.2096964, -0.61814058 - ], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [ - 0.79270821, -0.41386668 - ], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [ - 0.48921682, -1.38504507 - ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], - [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) - y_gt = np.array([ - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2 - ]) + X_gt = np.array( + [ + [-0.53171468, -0.53735182], + [-0.88864036, -0.33782387], + [-0.46226554, -0.50481004], + [-0.34474418, 0.21969797], + [-0.12840393, 0.66446571], + [1.02956816, 0.36061601], + [1.12202806, 0.33811558], + [-0.35946678, 0.72510189], + [-1.10146139, 0.91782682], + [0.73489726, 0.43915195], + [-0.28479268, 0.70459548], + [0.50307437, 0.498805], + [0.84929742, 0.41042894], + [0.62649535, 0.46600596], + [0.98382284, 0.37184502], + [0.69804044, 0.44810796], + [1.32319756, -0.13181616], + [0.04296502, -0.37981873], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [0.2096964, -0.61814058], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.0304995, -0.16955962], + [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], + ] + ) + y_gt = np.array( + [ + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + ] + ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_allknn_fit_resample_with_nn_object(): nn = NearestNeighbors(n_neighbors=4) - allknn = AllKNN(n_neighbors=nn, kind_sel='mode') + allknn = AllKNN(n_neighbors=nn, kind_sel="mode") X_resampled, y_resampled = allknn.fit_resample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ - -0.46226554, -0.50481004 - ], [-0.34474418, 
0.21969797], [-0.12840393, 0.66446571], [ - 1.02956816, 0.36061601 - ], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [ - -1.10146139, 0.91782682 - ], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [ - 0.50307437, 0.498805 - ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [ - 0.98382284, 0.37184502 - ], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [ - 0.04296502, -0.37981873 - ], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [ - 0.2096964, -0.61814058 - ], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [ - 0.79270821, -0.41386668 - ], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [ - 0.48921682, -1.38504507 - ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], - [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) - y_gt = np.array([ - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2 - ]) + X_gt = np.array( + [ + [-0.53171468, -0.53735182], + [-0.88864036, -0.33782387], + [-0.46226554, -0.50481004], + [-0.34474418, 0.21969797], + [-0.12840393, 0.66446571], + [1.02956816, 0.36061601], + [1.12202806, 0.33811558], + [-0.35946678, 0.72510189], + [-1.10146139, 0.91782682], + [0.73489726, 0.43915195], + [-0.28479268, 0.70459548], + [0.50307437, 0.498805], + [0.84929742, 0.41042894], + [0.62649535, 0.46600596], + [0.98382284, 0.37184502], + [0.69804044, 0.44810796], + [1.32319756, -0.13181616], + [0.04296502, -0.37981873], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [0.2096964, -0.61814058], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.0304995, -0.16955962], + [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], + ] + ) + y_gt = np.array( + [ + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 
2, + ] + ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_alknn_not_good_object(): - nn = 'rnd' - allknn = AllKNN(n_neighbors=nn, kind_sel='mode') + nn = "rnd" + allknn = AllKNN(n_neighbors=nn, kind_sel="mode") with pytest.raises(ValueError): allknn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py index a9c2a6c46..7f0f67c43 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py @@ -12,19 +12,30 @@ from imblearn.under_sampling import CondensedNearestNeighbour RND_SEED = 0 -X = np.array([[2.59928271, 0.93323465], [0.25738379, 0.95564169], [ - 1.42772181, 0.526027 -], [1.92365863, 0.82718767], [-0.10903849, - -0.12085181], [-0.284881, -0.62730973], - [0.57062627, 1.19528323], [0.03394306, - 0.03986753], [0.78318102, 2.59153329], - [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [ - 0.01936241, 0.17799828 - ], [-1.25020462, -0.40402054], [-0.09816301, -0.74662486], [ - -0.01252787, 0.34102657 - ], [0.52726792, -0.38735648], [0.2821046, -0.07862747], [ - 0.05230552, 0.09043907 - ], [0.15198585, 0.12512646], [0.70524765, 0.39816382]]) +X = np.array( + [ + [2.59928271, 0.93323465], + [0.25738379, 0.95564169], + [1.42772181, 0.526027], + [1.92365863, 0.82718767], + [-0.10903849, -0.12085181], + [-0.284881, -0.62730973], + [0.57062627, 1.19528323], + [0.03394306, 0.03986753], + [0.78318102, 2.59153329], + [0.35831463, 1.33483198], + [-0.14313184, -1.0412815], + [0.01936241, 0.17799828], + [-1.25020462, -0.40402054], + [-0.09816301, -0.74662486], + [-0.01252787, 0.34102657], + [0.52726792, -0.38735648], + [0.2821046, -0.07862747], + [0.05230552, 0.09043907], + [0.15198585, 0.12512646], + [0.70524765, 0.39816382], + ] +) Y = np.array([1, 2, 
1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 1]) @@ -39,12 +50,20 @@ def test_cnn_fit_resample(): cnn = CondensedNearestNeighbour(random_state=RND_SEED) X_resampled, y_resampled = cnn.fit_resample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ - 0.05230552, 0.09043907 - ], [-1.25020462, -0.40402054], [0.70524765, - 0.39816382], [0.35831463, 1.33483198], - [-0.284881, -0.62730973], [0.03394306, 0.03986753], - [-0.01252787, 0.34102657], [0.15198585, 0.12512646]]) + X_gt = np.array( + [ + [-0.10903849, -0.12085181], + [0.01936241, 0.17799828], + [0.05230552, 0.09043907], + [-1.25020462, -0.40402054], + [0.70524765, 0.39816382], + [0.35831463, 1.33483198], + [-0.284881, -0.62730973], + [0.03394306, 0.03986753], + [-0.01252787, 0.34102657], + [0.15198585, 0.12512646], + ] + ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -55,12 +74,20 @@ def test_cnn_fit_resample_with_object(): cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn) X_resampled, y_resampled = cnn.fit_resample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ - 0.05230552, 0.09043907 - ], [-1.25020462, -0.40402054], [0.70524765, - 0.39816382], [0.35831463, 1.33483198], - [-0.284881, -0.62730973], [0.03394306, 0.03986753], - [-0.01252787, 0.34102657], [0.15198585, 0.12512646]]) + X_gt = np.array( + [ + [-0.10903849, -0.12085181], + [0.01936241, 0.17799828], + [0.05230552, 0.09043907], + [-1.25020462, -0.40402054], + [0.70524765, 0.39816382], + [0.35831463, 1.33483198], + [-0.284881, -0.62730973], + [0.03394306, 0.03986753], + [-0.01252787, 0.34102657], + [0.15198585, 0.12512646], + ] + ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -72,7 +99,7 @@ def test_cnn_fit_resample_with_object(): def test_cnn_fit_resample_with_wrong_object(): - knn = 'rnd' + knn = "rnd" 
cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn) with pytest.raises(ValueError, match="has to be a int or an "): cnn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index 87caf08bb..1378c9f6d 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -13,19 +13,30 @@ from imblearn.under_sampling import EditedNearestNeighbours from imblearn.utils.testing import warns -X = np.array([[2.59928271, 0.93323465], [0.25738379, 0.95564169], [ - 1.42772181, 0.526027 -], [1.92365863, 0.82718767], [-0.10903849, - -0.12085181], [-0.284881, -0.62730973], - [0.57062627, 1.19528323], [0.03394306, - 0.03986753], [0.78318102, 2.59153329], - [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [ - 0.01936241, 0.17799828 - ], [-1.25020462, -0.40402054], [-0.09816301, -0.74662486], [ - -0.01252787, 0.34102657 - ], [0.52726792, -0.38735648], [0.2821046, -0.07862747], [ - 0.05230552, 0.09043907 - ], [0.15198585, 0.12512646], [0.70524765, 0.39816382]]) +X = np.array( + [ + [2.59928271, 0.93323465], + [0.25738379, 0.95564169], + [1.42772181, 0.526027], + [1.92365863, 0.82718767], + [-0.10903849, -0.12085181], + [-0.284881, -0.62730973], + [0.57062627, 1.19528323], + [0.03394306, 0.03986753], + [0.78318102, 2.59153329], + [0.35831463, 1.33483198], + [-0.14313184, -1.0412815], + [0.01936241, 0.17799828], + [-1.25020462, -0.40402054], + [-0.09816301, -0.74662486], + [-0.01252787, 0.34102657], + [0.52726792, -0.38735648], + [0.2821046, -0.07862747], + [0.05230552, 0.09043907], + [0.15198585, 0.12512646], + [0.70524765, 0.39816382], + ] +) Y = np.array([1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 1]) @@ -33,7 +44,7 @@ def test_enn_init(): enn = EditedNearestNeighbours() 
assert enn.n_neighbors == 3 - assert enn.kind_sel == 'all' + assert enn.kind_sel == "all" assert enn.n_jobs == 1 @@ -41,27 +52,44 @@ def test_enn_fit_resample(): enn = EditedNearestNeighbours() X_resampled, y_resampled = enn.fit_resample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ - 2.59928271, 0.93323465 - ], [1.92365863, 0.82718767], [0.25738379, 0.95564169], - [0.78318102, 2.59153329], [0.52726792, -0.38735648]]) + X_gt = np.array( + [ + [-0.10903849, -0.12085181], + [0.01936241, 0.17799828], + [2.59928271, 0.93323465], + [1.92365863, 0.82718767], + [0.25738379, 0.95564169], + [0.78318102, 2.59153329], + [0.52726792, -0.38735648], + ] + ) y_gt = np.array([0, 0, 1, 1, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_enn_fit_resample_mode(): - enn = EditedNearestNeighbours(kind_sel='mode') + enn = EditedNearestNeighbours(kind_sel="mode") X_resampled, y_resampled = enn.fit_resample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ - 2.59928271, 0.93323465 - ], [1.42772181, 0.526027], [1.92365863, 0.82718767], [ - 0.25738379, 0.95564169 - ], [-0.284881, -0.62730973], [0.57062627, 1.19528323], - [0.78318102, 2.59153329], [0.35831463, 1.33483198], - [-0.14313184, -1.0412815], [-0.09816301, -0.74662486], - [0.52726792, -0.38735648], [0.2821046, -0.07862747]]) + X_gt = np.array( + [ + [-0.10903849, -0.12085181], + [0.01936241, 0.17799828], + [2.59928271, 0.93323465], + [1.42772181, 0.526027], + [1.92365863, 0.82718767], + [0.25738379, 0.95564169], + [-0.284881, -0.62730973], + [0.57062627, 1.19528323], + [0.78318102, 2.59153329], + [0.35831463, 1.33483198], + [-0.14313184, -1.0412815], + [-0.09816301, -0.74662486], + [0.52726792, -0.38735648], + [0.2821046, -0.07862747], + ] + ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -69,24 +97,34 @@ def 
test_enn_fit_resample_mode(): def test_enn_fit_resample_with_nn_object(): nn = NearestNeighbors(n_neighbors=4) - enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') + enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode") X_resampled, y_resampled = enn.fit_resample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ - 2.59928271, 0.93323465 - ], [1.42772181, 0.526027], [1.92365863, 0.82718767], [ - 0.25738379, 0.95564169 - ], [-0.284881, -0.62730973], [0.57062627, 1.19528323], - [0.78318102, 2.59153329], [0.35831463, 1.33483198], - [-0.14313184, -1.0412815], [-0.09816301, -0.74662486], - [0.52726792, -0.38735648], [0.2821046, -0.07862747]]) + X_gt = np.array( + [ + [-0.10903849, -0.12085181], + [0.01936241, 0.17799828], + [2.59928271, 0.93323465], + [1.42772181, 0.526027], + [1.92365863, 0.82718767], + [0.25738379, 0.95564169], + [-0.284881, -0.62730973], + [0.57062627, 1.19528323], + [0.78318102, 2.59153329], + [0.35831463, 1.33483198], + [-0.14313184, -1.0412815], + [-0.09816301, -0.74662486], + [0.52726792, -0.38735648], + [0.2821046, -0.07862747], + ] + ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_enn_not_good_object(): - nn = 'rnd' - enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') + nn = "rnd" + enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode") with pytest.raises(ValueError, match="has to be one of"): enn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 599bba8ed..913d07f39 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -12,23 +12,34 @@ from imblearn.under_sampling import 
InstanceHardnessThreshold RND_SEED = 0 -X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], [ - -0.77740357, 0.74097941 -], [0.91542919, -0.65453327], [-0.03852113, 0.40910479], [ - -0.43877303, 1.07366684 -], [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], [ - -0.30126957, -0.66268378 -], [-0.65571327, 0.42412021], [-0.28305528, 0.30284991], - [0.20246714, -0.34727125], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) +X = np.array( + [ + [-0.3879569, 0.6894251], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [0.91542919, -0.65453327], + [-0.03852113, 0.40910479], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + [-0.30126957, -0.66268378], + [-0.65571327, 0.42412021], + [-0.28305528, 0.30284991], + [0.20246714, -0.34727125], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + ] +) Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]) ESTIMATOR = GradientBoostingClassifier(random_state=RND_SEED) def test_iht_init(): - sampling_strategy = 'auto' + sampling_strategy = "auto" iht = InstanceHardnessThreshold( - ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED) + ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED + ) assert iht.sampling_strategy == sampling_strategy assert iht.random_state == RND_SEED @@ -44,7 +55,8 @@ def test_iht_fit_resample(): def test_iht_fit_resample_half(): sampling_strategy = {0: 6, 1: 8} iht = InstanceHardnessThreshold( - ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED) + ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED + ) X_resampled, y_resampled = iht.fit_resample(X, Y) assert X_resampled.shape == (14, 2) assert y_resampled.shape == (14,) @@ -60,6 +72,7 @@ def test_iht_fit_resample_class_obj(): def test_iht_fit_resample_wrong_class_obj(): from sklearn.cluster import KMeans + est = KMeans() iht = 
InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) with pytest.raises(ValueError, match="Invalid parameter `estimator`"): diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py index f91f2f522..69d4e6e57 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py @@ -12,15 +12,25 @@ from imblearn.under_sampling import NearMiss from imblearn.utils.testing import warns -X = np.array([[1.17737838, -0.2002118], [0.4960075, 0.86130762], [ - -0.05903827, 0.10947647 -], [0.91464286, 1.61369212], [-0.54619583, 1.73009918], - [-0.60413357, 0.24628718], [0.45713638, 1.31069295], - [-0.04032409, 3.01186964], [0.03142011, 0.12323596], [ - 0.50701028, -0.17636928 - ], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [ - 0.99272351, -0.11631728 - ], [-1.95581933, 0.69609604], [1.15157493, -1.2981518]]) +X = np.array( + [ + [1.17737838, -0.2002118], + [0.4960075, 0.86130762], + [-0.05903827, 0.10947647], + [0.91464286, 1.61369212], + [-0.54619583, 1.73009918], + [-0.60413357, 0.24628718], + [0.45713638, 1.31069295], + [-0.04032409, 3.01186964], + [0.03142011, 0.12323596], + [0.50701028, -0.17636928], + [-0.80809175, -1.09917302], + [-0.20497017, -0.26630228], + [0.99272351, -0.11631728], + [-1.95581933, 0.69609604], + [1.15157493, -1.2981518], + ] +) Y = np.array([1, 2, 1, 0, 2, 1, 2, 2, 1, 2, 0, 0, 2, 1, 2]) VERSION_NEARMISS = (1, 2, 3) @@ -28,10 +38,18 @@ @pytest.mark.parametrize( "nearmiss_params, err_msg", - [({"version": 1000}, "must be 1, 2 or 3"), - ({"version": 1, "n_neighbors": 'rnd'}, "has to be one of"), - ({"version": 3, "n_neighbors": NearestNeighbors(n_neighbors=3), - "n_neighbors_ver3": "rnd"}, "has to be one of")] + [ + ({"version": 1000}, "must be 1, 2 or 3"), + ({"version": 1, "n_neighbors": "rnd"}, "has to be one of"), + ( + { + "version": 3, + 
"n_neighbors": NearestNeighbors(n_neighbors=3), + "n_neighbors_ver3": "rnd", + }, + "has to be one of", + ), + ], ) def test_nearmiss_error(nearmiss_params, err_msg): nm = NearMiss(**nearmiss_params) @@ -40,28 +58,52 @@ def test_nearmiss_error(nearmiss_params, err_msg): def test_nm_fit_resample_auto(): - sampling_strategy = 'auto' + sampling_strategy = "auto" X_gt = [ - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], [0.50701028, -0.17636928], - [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], [0.50701028, -0.17636928], - [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], - [0.03142011, 0.12323596], [1.15157493, -1.2981518], - [-0.54619583, 1.73009918], [0.99272351, -0.11631728]]) + np.array( + [ + [0.91464286, 1.61369212], + [-0.80809175, -1.09917302], + [-0.20497017, -0.26630228], + [-0.05903827, 0.10947647], + [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], + [0.50701028, -0.17636928], + [0.4960075, 0.86130762], + [0.45713638, 1.31069295], + ] + ), + np.array( + [ + [0.91464286, 1.61369212], + [-0.80809175, -1.09917302], + [-0.20497017, -0.26630228], + [-0.05903827, 0.10947647], + [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], + [0.50701028, -0.17636928], + [0.4960075, 0.86130762], + [0.45713638, 1.31069295], + ] + ), + np.array( + [ + [0.91464286, 1.61369212], + [-0.80809175, -1.09917302], + [-0.20497017, -0.26630228], + [1.17737838, -0.2002118], + [-0.60413357, 0.24628718], + [0.03142011, 0.12323596], + [1.15157493, -1.2981518], + [-0.54619583, 1.73009918], + [0.99272351, 
-0.11631728], + ] + ), ] y_gt = [ np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), ] for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss(sampling_strategy=sampling_strategy, version=version) @@ -73,29 +115,56 @@ def test_nm_fit_resample_auto(): def test_nm_fit_resample_float_sampling_strategy(): sampling_strategy = {0: 3, 1: 4, 2: 4} X_gt = [ - np.array([[-0.20497017, -0.26630228], [-0.80809175, -1.09917302], [ - 0.91464286, 1.61369212 - ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], [1.17737838, -0.2002118], - [0.50701028, -0.17636928], [0.4960075, 0.86130762], - [0.45713638, 1.31069295], [0.99272351, -0.11631728]]), - np.array([[-0.20497017, -0.26630228], [-0.80809175, -1.09917302], [ - 0.91464286, 1.61369212 - ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], [1.17737838, -0.2002118], - [0.50701028, -0.17636928], [0.4960075, 0.86130762], - [0.45713638, 1.31069295], [0.99272351, -0.11631728]]), - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], - [0.03142011, 0.12323596], [-0.05903827, 0.10947647], - [1.15157493, -1.2981518], [-0.54619583, 1.73009918], - [0.99272351, -0.11631728], [0.45713638, 1.31069295]]) + np.array( + [ + [-0.20497017, -0.26630228], + [-0.80809175, -1.09917302], + [0.91464286, 1.61369212], + [-0.05903827, 0.10947647], + [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], + [1.17737838, -0.2002118], + [0.50701028, -0.17636928], + [0.4960075, 0.86130762], + [0.45713638, 1.31069295], + [0.99272351, -0.11631728], + ] + ), + np.array( + [ + [-0.20497017, -0.26630228], + [-0.80809175, -1.09917302], + [0.91464286, 1.61369212], + [-0.05903827, 0.10947647], + [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], + [1.17737838, -0.2002118], + 
[0.50701028, -0.17636928], + [0.4960075, 0.86130762], + [0.45713638, 1.31069295], + [0.99272351, -0.11631728], + ] + ), + np.array( + [ + [0.91464286, 1.61369212], + [-0.80809175, -1.09917302], + [-0.20497017, -0.26630228], + [1.17737838, -0.2002118], + [-0.60413357, 0.24628718], + [0.03142011, 0.12323596], + [-0.05903827, 0.10947647], + [1.15157493, -1.2981518], + [-0.54619583, 1.73009918], + [0.99272351, -0.11631728], + [0.45713638, 1.31069295], + ] + ), ] y_gt = [ np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]) + np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), ] for version_idx, version in enumerate(VERSION_NEARMISS): @@ -106,35 +175,60 @@ def test_nm_fit_resample_float_sampling_strategy(): def test_nm_fit_resample_nn_obj(): - sampling_strategy = 'auto' + sampling_strategy = "auto" nn = NearestNeighbors(n_neighbors=3) X_gt = [ - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], [0.50701028, -0.17636928], - [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], [0.50701028, -0.17636928], - [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ - -0.20497017, -0.26630228 - ], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], - [0.03142011, 0.12323596], [1.15157493, -1.2981518], - [-0.54619583, 1.73009918], [0.99272351, -0.11631728]]) + np.array( + [ + [0.91464286, 1.61369212], + [-0.80809175, -1.09917302], + [-0.20497017, -0.26630228], + [-0.05903827, 0.10947647], + [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], + [0.50701028, -0.17636928], + [0.4960075, 0.86130762], + [0.45713638, 1.31069295], + ] + 
), + np.array( + [ + [0.91464286, 1.61369212], + [-0.80809175, -1.09917302], + [-0.20497017, -0.26630228], + [-0.05903827, 0.10947647], + [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], + [0.50701028, -0.17636928], + [0.4960075, 0.86130762], + [0.45713638, 1.31069295], + ] + ), + np.array( + [ + [0.91464286, 1.61369212], + [-0.80809175, -1.09917302], + [-0.20497017, -0.26630228], + [1.17737838, -0.2002118], + [-0.60413357, 0.24628718], + [0.03142011, 0.12323596], + [1.15157493, -1.2981518], + [-0.54619583, 1.73009918], + [0.99272351, -0.11631728], + ] + ), ] y_gt = [ np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), ] for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss( sampling_strategy=sampling_strategy, version=version, - n_neighbors=nn) + n_neighbors=nn, + ) X_resampled, y_resampled = nm.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py index a78d92192..5338e9d1b 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -12,23 +12,35 @@ from imblearn.under_sampling import NeighbourhoodCleaningRule from imblearn.utils.testing import warns -X = np.array([[1.57737838, 0.1997882], [0.8960075, 0.46130762], [ - 0.34096173, 0.50947647 -], [-0.91735824, 0.93110278], [-0.14619583, 1.33009918], - [-0.20413357, 0.64628718], [0.85713638, 0.91069295], - [0.35967591, 2.61186964], [0.43142011, 0.52323596], [ - 0.90701028, -0.57636928 - ], [-1.20809175, -1.49917302], [-0.60497017, -0.66630228], [ - 1.39272351, -0.51631728 - ], [-1.55581933, 1.09609604], 
[1.55157493, -1.6981518]]) +X = np.array( + [ + [1.57737838, 0.1997882], + [0.8960075, 0.46130762], + [0.34096173, 0.50947647], + [-0.91735824, 0.93110278], + [-0.14619583, 1.33009918], + [-0.20413357, 0.64628718], + [0.85713638, 0.91069295], + [0.35967591, 2.61186964], + [0.43142011, 0.52323596], + [0.90701028, -0.57636928], + [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], + [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], + [1.55157493, -1.6981518], + ] +) Y = np.array([1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 0, 0, 2, 1, 2]) @pytest.mark.parametrize( "ncr_params, err_msg", - [({"threshold_cleaning": -10}, "value between 0 and 1"), - ({"threshold_cleaning": 10}, "value between 0 and 1"), - ({"n_neighbors": 'rnd'}, "has to be one of")] + [ + ({"threshold_cleaning": -10}, "value between 0 and 1"), + ({"threshold_cleaning": 10}, "value between 0 and 1"), + ({"n_neighbors": "rnd"}, "has to be one of"), + ], ) def test_ncr_error(ncr_params, err_msg): ncr = NeighbourhoodCleaningRule(**ncr_params) @@ -40,27 +52,43 @@ def test_ncr_fit_resample(): ncr = NeighbourhoodCleaningRule() X_resampled, y_resampled = ncr.fit_resample(X, Y) - X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ - -0.20413357, 0.64628718 - ], [0.35967591, 2.61186964], [0.90701028, - -0.57636928], [-1.20809175, -1.49917302], - [-0.60497017, -0.66630228], [1.39272351, -0.51631728], - [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) + X_gt = np.array( + [ + [0.34096173, 0.50947647], + [-0.91735824, 0.93110278], + [-0.20413357, 0.64628718], + [0.35967591, 2.61186964], + [0.90701028, -0.57636928], + [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], + [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], + [1.55157493, -1.6981518], + ] + ) y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_ncr_fit_resample_mode(): - ncr = NeighbourhoodCleaningRule(kind_sel='mode') + ncr = 
NeighbourhoodCleaningRule(kind_sel="mode") X_resampled, y_resampled = ncr.fit_resample(X, Y) - X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ - -0.20413357, 0.64628718 - ], [0.35967591, 2.61186964], [0.90701028, - -0.57636928], [-1.20809175, -1.49917302], - [-0.60497017, -0.66630228], [1.39272351, -0.51631728], - [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) + X_gt = np.array( + [ + [0.34096173, 0.50947647], + [-0.91735824, 0.93110278], + [-0.20413357, 0.64628718], + [0.35967591, 2.61186964], + [0.90701028, -0.57636928], + [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], + [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], + [1.55157493, -1.6981518], + ] + ) y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py index d6a5bf853..612d23cc7 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py @@ -12,15 +12,25 @@ from imblearn.under_sampling import OneSidedSelection RND_SEED = 0 -X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], [ - -0.77740357, 0.74097941 -], [0.91542919, -0.65453327], [-0.03852113, 0.40910479], [ - -0.43877303, 1.07366684 -], [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], [ - -0.30126957, -0.66268378 -], [-0.65571327, 0.42412021], [-0.28305528, 0.30284991], - [0.20246714, -0.34727125], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) +X = np.array( + [ + [-0.3879569, 0.6894251], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [0.91542919, -0.65453327], + [-0.03852113, 0.40910479], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + 
[-0.30126957, -0.66268378], + [-0.65571327, 0.42412021], + [-0.28305528, 0.30284991], + [0.20246714, -0.34727125], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + ] +) Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]) @@ -36,13 +46,22 @@ def test_oss_fit_resample(): oss = OneSidedSelection(random_state=RND_SEED) X_resampled, y_resampled = oss.fit_resample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ - -0.65571327, 0.42412021 - ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ - -0.00717161, 0.00318087 - ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.30126957, -0.66268378], [0.20246714, -0.34727125]]) + X_gt = np.array( + [ + [-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.30126957, -0.66268378], + [0.20246714, -0.34727125], + ] + ) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -53,13 +72,22 @@ def test_oss_with_object(): oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) X_resampled, y_resampled = oss.fit_resample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ - -0.65571327, 0.42412021 - ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ - -0.00717161, 0.00318087 - ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.30126957, -0.66268378], [0.20246714, -0.34727125]]) + X_gt = np.array( + [ + [-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + 
[-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.30126957, -0.66268378], + [0.20246714, -0.34727125], + ] + ) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -71,7 +99,7 @@ def test_oss_with_object(): def test_oss_with_wrong_object(): - knn = 'rnd' + knn = "rnd" oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) with pytest.raises(ValueError, match="has to be a int"): oss.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index 680ec0b3d..78514a719 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -12,11 +12,20 @@ from imblearn.under_sampling import RandomUnderSampler RND_SEED = 0 -X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ - 0.20792588, 1.49407907 -], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], - [0.09125309, -0.85409574], [0.12372842, 0.6536186], - [0.13347175, 0.12167502], [0.094035, -2.55298982]]) +X = np.array( + [ + [0.04352327, -0.20515826], + [0.92923648, 0.76103773], + [0.20792588, 1.49407907], + [0.47104475, 0.44386323], + [0.22950086, 0.33367433], + [0.15490546, 0.3130677], + [0.09125309, -0.85409574], + [0.12372842, 0.6536186], + [0.13347175, 0.12167502], + [0.094035, -2.55298982], + ] +) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) @@ -24,9 +33,16 @@ def test_rus_fit_resample(): rus = RandomUnderSampler(random_state=RND_SEED, replacement=True) X_resampled, y_resampled = rus.fit_resample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.09125309, -0.85409574], - [0.12372842, 0.6536186], [0.04352327, 
-0.20515826]]) + X_gt = np.array( + [ + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502], + [0.09125309, -0.85409574], + [0.12372842, 0.6536186], + [0.04352327, -0.20515826], + ] + ) y_gt = np.array([0, 0, 0, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) @@ -38,14 +54,23 @@ def test_rus_fit_resample_half(): rus = RandomUnderSampler( sampling_strategy=sampling_strategy, random_state=RND_SEED, - replacement=True) + replacement=True, + ) X_resampled, y_resampled = rus.fit_resample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [ - 0.92923648, 0.76103773 - ], [0.15490546, 0.3130677], [0.15490546, 0.3130677], - [0.15490546, 0.3130677], [0.20792588, 1.49407907], - [0.15490546, 0.3130677], [0.12372842, 0.6536186]]) + X_gt = np.array( + [ + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.92923648, 0.76103773], + [0.15490546, 0.3130677], + [0.15490546, 0.3130677], + [0.15490546, 0.3130677], + [0.20792588, 1.49407907], + [0.15490546, 0.3130677], + [0.12372842, 0.6536186], + ] + ) y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -64,8 +89,9 @@ def test_multiclass_fit_resample(): def test_random_under_sampling_heterogeneous_data(): - X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], - dtype=np.object) + X_hetero = np.array( + [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=np.object + ) y = np.array([0, 0, 1]) rus = RandomUnderSampler(random_state=RND_SEED) X_res, y_res = rus.fit_resample(X_hetero, y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index 814dca775..b11163709 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ 
b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -12,42 +12,101 @@ from imblearn.under_sampling import RepeatedEditedNearestNeighbours from imblearn.utils.testing import warns -X = np.array([[-0.12840393, 0.66446571], [1.32319756, -0.13181616], [ - 0.04296502, -0.37981873 -], [0.83631853, 0.18569783], [1.02956816, 0.36061601], [ - 1.12202806, 0.33811558 -], [-0.53171468, -0.53735182], [1.3381556, 0.35956356], [ - -0.35946678, 0.72510189 -], [1.32326943, 0.28393874], [2.94290565, -0.13986434], [ - 0.28294738, -1.00125525 -], [0.34218094, -0.58781961], [-0.88864036, -0.33782387], [ - -1.10146139, 0.91782682 -], [-0.7969716, -0.50493969], [0.73489726, 0.43915195], [ - 0.2096964, -0.61814058 -], [-0.28479268, 0.70459548], [1.84864913, 0.14729596], [ - 1.59068979, -0.96622933 -], [0.73418199, -0.02222847], [0.50307437, 0.498805], [0.84929742, 0.41042894], - [0.62649535, 0.46600596], [0.79270821, -0.41386668], [ - 1.16606871, -0.25641059 - ], [1.57356906, 0.30390519], [1.0304995, -0.16955962], [ - 1.67314371, 0.19231498 - ], [0.98382284, 0.37184502], [0.48921682, -1.38504507], [ - -0.46226554, -0.50481004 - ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [ - 0.80541964, -0.34465185 - ], [0.1732627, -1.61323172], [0.69804044, 0.44810796], - [-0.5506368, -0.42072426], [-0.34474418, 0.21969797]]) -Y = np.array([ - 1, 2, 2, 2, 1, 1, 0, 2, 1, 1, 1, 2, 2, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, - 2, 2, 2, 2, 1, 1, 2, 0, 2, 2, 2, 2, 1, 2, 0 -]) +X = np.array( + [ + [-0.12840393, 0.66446571], + [1.32319756, -0.13181616], + [0.04296502, -0.37981873], + [0.83631853, 0.18569783], + [1.02956816, 0.36061601], + [1.12202806, 0.33811558], + [-0.53171468, -0.53735182], + [1.3381556, 0.35956356], + [-0.35946678, 0.72510189], + [1.32326943, 0.28393874], + [2.94290565, -0.13986434], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [-0.88864036, -0.33782387], + [-1.10146139, 0.91782682], + [-0.7969716, -0.50493969], + 
[0.73489726, 0.43915195], + [0.2096964, -0.61814058], + [-0.28479268, 0.70459548], + [1.84864913, 0.14729596], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.50307437, 0.498805], + [0.84929742, 0.41042894], + [0.62649535, 0.46600596], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.57356906, 0.30390519], + [1.0304995, -0.16955962], + [1.67314371, 0.19231498], + [0.98382284, 0.37184502], + [0.48921682, -1.38504507], + [-0.46226554, -0.50481004], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], + [0.69804044, 0.44810796], + [-0.5506368, -0.42072426], + [-0.34474418, 0.21969797], + ] +) +Y = np.array( + [ + 1, + 2, + 2, + 2, + 1, + 1, + 0, + 2, + 1, + 1, + 1, + 2, + 2, + 0, + 1, + 2, + 1, + 2, + 1, + 1, + 2, + 2, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 1, + 1, + 2, + 0, + 2, + 2, + 2, + 2, + 1, + 2, + 0, + ] +) def test_renn_init(): renn = RepeatedEditedNearestNeighbours() assert renn.n_neighbors == 3 - assert renn.kind_sel == 'all' + assert renn.kind_sel == "all" assert renn.n_jobs == 1 @@ -62,101 +121,241 @@ def test_renn_fit_resample(): renn = RepeatedEditedNearestNeighbours() X_resampled, y_resampled = renn.fit_resample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ - -0.46226554, -0.50481004 - ], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [ - 1.12202806, 0.33811558 - ], [0.73489726, 0.43915195], [0.50307437, 0.498805], [ - 0.84929742, 0.41042894 - ], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [ - 0.69804044, 0.44810796 - ], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [ - 0.34218094, -0.58781961 - ], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [ - 0.73418199, -0.02222847 - ], [0.79270821, -0.41386668], [1.16606871, -0.25641059], - [1.0304995, -0.16955962], [0.48921682, -1.38504507], - [-0.03918551, -0.68540745], [0.24991051, -1.00864997], - [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) - 
y_gt = np.array([ - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2 - ]) + X_gt = np.array( + [ + [-0.53171468, -0.53735182], + [-0.88864036, -0.33782387], + [-0.46226554, -0.50481004], + [-0.34474418, 0.21969797], + [1.02956816, 0.36061601], + [1.12202806, 0.33811558], + [0.73489726, 0.43915195], + [0.50307437, 0.498805], + [0.84929742, 0.41042894], + [0.62649535, 0.46600596], + [0.98382284, 0.37184502], + [0.69804044, 0.44810796], + [0.04296502, -0.37981873], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [0.2096964, -0.61814058], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.0304995, -0.16955962], + [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], + ] + ) + y_gt = np.array( + [ + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + ] + ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_renn_fit_resample_mode_object(): - renn = RepeatedEditedNearestNeighbours(kind_sel='mode') + renn = RepeatedEditedNearestNeighbours(kind_sel="mode") X_resampled, y_resampled = renn.fit_resample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ - -0.46226554, -0.50481004 - ], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [ - 1.02956816, 0.36061601 - ], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [ - 2.94290565, -0.13986434 - ], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [ - -0.28479268, 0.70459548 - ], [1.84864913, 0.14729596], [0.50307437, 0.498805], [ - 0.84929742, 0.41042894 - ], [0.62649535, 0.46600596], [1.67314371, 0.19231498], [ - 0.98382284, 0.37184502 - ], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [ - 0.04296502, -0.37981873 - ], [0.28294738, -1.00125525], [0.34218094, -0.58781961], 
[ - 0.2096964, -0.61814058 - ], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [ - 0.79270821, -0.41386668 - ], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [ - 0.48921682, -1.38504507 - ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], - [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) - y_gt = np.array([ - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 - ]) + X_gt = np.array( + [ + [-0.53171468, -0.53735182], + [-0.88864036, -0.33782387], + [-0.46226554, -0.50481004], + [-0.34474418, 0.21969797], + [-0.12840393, 0.66446571], + [1.02956816, 0.36061601], + [1.12202806, 0.33811558], + [-0.35946678, 0.72510189], + [2.94290565, -0.13986434], + [-1.10146139, 0.91782682], + [0.73489726, 0.43915195], + [-0.28479268, 0.70459548], + [1.84864913, 0.14729596], + [0.50307437, 0.498805], + [0.84929742, 0.41042894], + [0.62649535, 0.46600596], + [1.67314371, 0.19231498], + [0.98382284, 0.37184502], + [0.69804044, 0.44810796], + [1.32319756, -0.13181616], + [0.04296502, -0.37981873], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [0.2096964, -0.61814058], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.0304995, -0.16955962], + [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], + ] + ) + y_gt = np.array( + [ + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + ] + ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_renn_fit_resample_mode(): nn = NearestNeighbors(n_neighbors=4) - renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') + renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel="mode") X_resampled, y_resampled = renn.fit_resample(X, Y) 
- X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ - -0.46226554, -0.50481004 - ], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [ - 1.02956816, 0.36061601 - ], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [ - 2.94290565, -0.13986434 - ], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [ - -0.28479268, 0.70459548 - ], [1.84864913, 0.14729596], [0.50307437, 0.498805], [ - 0.84929742, 0.41042894 - ], [0.62649535, 0.46600596], [1.67314371, 0.19231498], [ - 0.98382284, 0.37184502 - ], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [ - 0.04296502, -0.37981873 - ], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [ - 0.2096964, -0.61814058 - ], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [ - 0.79270821, -0.41386668 - ], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [ - 0.48921682, -1.38504507 - ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], - [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) - y_gt = np.array([ - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 - ]) + X_gt = np.array( + [ + [-0.53171468, -0.53735182], + [-0.88864036, -0.33782387], + [-0.46226554, -0.50481004], + [-0.34474418, 0.21969797], + [-0.12840393, 0.66446571], + [1.02956816, 0.36061601], + [1.12202806, 0.33811558], + [-0.35946678, 0.72510189], + [2.94290565, -0.13986434], + [-1.10146139, 0.91782682], + [0.73489726, 0.43915195], + [-0.28479268, 0.70459548], + [1.84864913, 0.14729596], + [0.50307437, 0.498805], + [0.84929742, 0.41042894], + [0.62649535, 0.46600596], + [1.67314371, 0.19231498], + [0.98382284, 0.37184502], + [0.69804044, 0.44810796], + [1.32319756, -0.13181616], + [0.04296502, -0.37981873], + [0.28294738, -1.00125525], + [0.34218094, -0.58781961], + [0.2096964, -0.61814058], + [1.59068979, -0.96622933], + [0.73418199, -0.02222847], + [0.79270821, -0.41386668], + [1.16606871, -0.25641059], + [1.0304995, -0.16955962], + [0.48921682, 
-1.38504507], + [-0.03918551, -0.68540745], + [0.24991051, -1.00864997], + [0.80541964, -0.34465185], + [0.1732627, -1.61323172], + ] + ) + y_gt = np.array( + [ + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + ] + ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_renn_not_good_object(): - nn = 'rnd' - renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') + nn = "rnd" + renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel="mode") with pytest.raises(ValueError): renn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py index 78e287c74..ad6b81ad6 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py @@ -10,19 +10,30 @@ from imblearn.under_sampling import TomekLinks from imblearn.utils.testing import warns -X = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], [ - 1.34192108, -0.13367336 -], [0.62366841, -0.21312976], [1.61091956, - -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, - 1.63827342], [0.2184254, 0.24299982], [0.61472253, -0.82309052], - [0.19893132, -0.47761769], [1.06514042, -0.0770537], [ - 0.97407872, 0.44454207 - ], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [ - -0.27410027, -0.54194484 - ], [0.8381014, 0.44085498], [-0.23374509, 0.18370049], [ - -0.32635887, -0.29299653 - ], [-0.00288378, 0.84259929], [1.79580611, -0.02219234]]) +X = np.array( + [ + [0.31230513, 0.1216318], + [0.68481731, 0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.2184254, 0.24299982], + [0.61472253, -0.82309052], + 
[0.19893132, -0.47761769], + [1.06514042, -0.0770537], + [0.97407872, 0.44454207], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.27410027, -0.54194484], + [0.8381014, 0.44085498], + [-0.23374509, 0.18370049], + [-0.32635887, -0.29299653], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + ] +) Y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) @@ -35,16 +46,27 @@ def test_tl_fit_resample(): tl = TomekLinks() X_resampled, y_resampled = tl.fit_resample(X, Y) - X_gt = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], [ - 1.34192108, -0.13367336 - ], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [ - -0.37162401, -2.19400981 - ], [0.74680821, 1.63827342], [0.2184254, 0.24299982], [ - 0.61472253, -0.82309052 - ], [0.19893132, -0.47761769], [0.97407872, 0.44454207], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], - [-0.00288378, 0.84259929], [1.79580611, -0.02219234]]) + X_gt = np.array( + [ + [0.31230513, 0.1216318], + [0.68481731, 0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.2184254, 0.24299982], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [0.97407872, 0.44454207], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.32635887, -0.29299653], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + ] + ) y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/under_sampling/base.py b/imblearn/under_sampling/base.py index a17a4b68f..9e20eb457 100644 --- a/imblearn/under_sampling/base.py +++ b/imblearn/under_sampling/base.py @@ -13,10 +13,10 @@ class BaseUnderSampler(BaseSampler): Warning: This class should not be used directly. 
Use the derive classes instead. """ - _sampling_type = 'under-sampling' - _sampling_strategy_docstring = \ - """sampling_strategy : float, str, dict, callable, (default='auto') + _sampling_type = "under-sampling" + + _sampling_strategy_docstring = """sampling_strategy : float, str, dict, callable, (default='auto') Sampling information to sample the data set. - When ``float``, it corresponds to the desired ratio of the number of @@ -61,10 +61,10 @@ class BaseCleaningSampler(BaseSampler): Warning: This class should not be used directly. Use the derive classes instead. """ - _sampling_type = 'clean-sampling' - _sampling_strategy_docstring = \ - """sampling_strategy : str, list or callable + _sampling_type = "clean-sampling" + + _sampling_strategy_docstring = """sampling_strategy : str, list or callable Sampling information to sample the data set. - When ``str``, specify the class targeted by the resampling. Note the diff --git a/imblearn/utils/__init__.py b/imblearn/utils/__init__.py index 45ecdbb59..4e74d2ee3 100644 --- a/imblearn/utils/__init__.py +++ b/imblearn/utils/__init__.py @@ -9,8 +9,8 @@ from ._validation import check_sampling_strategy __all__ = [ - 'check_neighbors_object', - 'check_sampling_strategy', - 'check_target_type', - 'Substitution', + "check_neighbors_object", + "check_sampling_strategy", + "check_target_type", + "Substitution", ] diff --git a/imblearn/utils/_docstring.py b/imblearn/utils/_docstring.py index d17db83c3..d43ba8362 100644 --- a/imblearn/utils/_docstring.py +++ b/imblearn/utils/_docstring.py @@ -13,7 +13,7 @@ class Substitution: """ def __init__(self, *args, **kwargs): - if (args and kwargs): + if args and kwargs: raise AssertionError("Only positional or keyword args are allowed") self.params = args or kwargs @@ -23,8 +23,7 @@ def __call__(self, obj): return obj -_random_state_docstring = \ - """random_state : int, RandomState instance or None, optional (default=None) +_random_state_docstring = """random_state : int, RandomState 
instance or None, optional (default=None) Control the randomization of the algorithm. - If int, ``random_state`` is the seed used by the random number diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index d7326dde6..edf7df70c 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -17,9 +17,14 @@ from ..exceptions import raise_isinstance_error -SAMPLING_KIND = ('over-sampling', 'under-sampling', 'clean-sampling', - 'ensemble', 'bypass') -TARGET_KIND = ('binary', 'multiclass', 'multilabel-indicator') +SAMPLING_KIND = ( + "over-sampling", + "under-sampling", + "clean-sampling", + "ensemble", + "bypass", +) +TARGET_KIND = ("binary", "multiclass", "multilabel-indicator") def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): @@ -84,32 +89,33 @@ def check_target_type(y, indicate_one_vs_all=False): """ type_y = type_of_target(y) - if type_y == 'multilabel-indicator': + if type_y == "multilabel-indicator": if np.any(y.sum(axis=1) > 1): raise ValueError( "Imbalanced-learn currently supports binary, multiclass and " "binarized encoded multiclasss targets. Multilabel and " - "multioutput targets are not supported.") + "multioutput targets are not supported." 
+ ) y = y.argmax(axis=1) - return (y, type_y == 'multilabel-indicator') if indicate_one_vs_all else y + return (y, type_y == "multilabel-indicator") if indicate_one_vs_all else y def _sampling_strategy_all(y, sampling_type): """Returns sampling target by targeting all classes.""" target_stats = _count_class_sample(y) - if sampling_type == 'over-sampling': + if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) sampling_strategy = { key: n_sample_majority - value for (key, value) in target_stats.items() } - elif (sampling_type == 'under-sampling' or - sampling_type == 'clean-sampling'): + elif ( + sampling_type == "under-sampling" or sampling_type == "clean-sampling" + ): n_sample_minority = min(target_stats.values()) sampling_strategy = { - key: n_sample_minority - for key in target_stats.keys() + key: n_sample_minority for key in target_stats.keys() } else: raise NotImplementedError @@ -119,17 +125,21 @@ def _sampling_strategy_all(y, sampling_type): def _sampling_strategy_majority(y, sampling_type): """Returns sampling target by targeting the majority class only.""" - if sampling_type == 'over-sampling': - raise ValueError("'sampling_strategy'='majority' cannot be used with" - " over-sampler.") - elif (sampling_type == 'under-sampling' or - sampling_type == 'clean-sampling'): + if sampling_type == "over-sampling": + raise ValueError( + "'sampling_strategy'='majority' cannot be used with" + " over-sampler." 
+ ) + elif ( + sampling_type == "under-sampling" or sampling_type == "clean-sampling" + ): target_stats = _count_class_sample(y) class_majority = max(target_stats, key=target_stats.get) n_sample_minority = min(target_stats.values()) sampling_strategy = { key: n_sample_minority - for key in target_stats.keys() if key == class_majority + for key in target_stats.keys() + if key == class_majority } else: raise NotImplementedError @@ -141,20 +151,23 @@ def _sampling_strategy_not_majority(y, sampling_type): """Returns sampling target by targeting all classes but not the majority.""" target_stats = _count_class_sample(y) - if sampling_type == 'over-sampling': + if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) class_majority = max(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_majority - value - for (key, value) in target_stats.items() if key != class_majority + for (key, value) in target_stats.items() + if key != class_majority } - elif (sampling_type == 'under-sampling' or - sampling_type == 'clean-sampling'): + elif ( + sampling_type == "under-sampling" or sampling_type == "clean-sampling" + ): n_sample_minority = min(target_stats.values()) class_majority = max(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_minority - for key in target_stats.keys() if key != class_majority + for key in target_stats.keys() + if key != class_majority } else: raise NotImplementedError @@ -166,20 +179,23 @@ def _sampling_strategy_not_minority(y, sampling_type): """Returns sampling target by targeting all classes but not the minority.""" target_stats = _count_class_sample(y) - if sampling_type == 'over-sampling': + if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) class_minority = min(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_majority - value - for (key, value) in target_stats.items() if key != class_minority + for (key, value) in 
target_stats.items() + if key != class_minority } - elif (sampling_type == 'under-sampling' or - sampling_type == 'clean-sampling'): + elif ( + sampling_type == "under-sampling" or sampling_type == "clean-sampling" + ): n_sample_minority = min(target_stats.values()) class_minority = min(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_minority - for key in target_stats.keys() if key != class_minority + for key in target_stats.keys() + if key != class_minority } else: raise NotImplementedError @@ -190,17 +206,21 @@ def _sampling_strategy_not_minority(y, sampling_type): def _sampling_strategy_minority(y, sampling_type): """Returns sampling target by targeting the minority class only.""" target_stats = _count_class_sample(y) - if sampling_type == 'over-sampling': + if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) class_minority = min(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_majority - value - for (key, value) in target_stats.items() if key == class_minority + for (key, value) in target_stats.items() + if key == class_minority } - elif (sampling_type == 'under-sampling' or - sampling_type == 'clean-sampling'): - raise ValueError("'sampling_strategy'='minority' cannot be used with" - " under-sampler and clean-sampler.") + elif ( + sampling_type == "under-sampling" or sampling_type == "clean-sampling" + ): + raise ValueError( + "'sampling_strategy'='minority' cannot be used with" + " under-sampler and clean-sampler." 
+ ) else: raise NotImplementedError @@ -210,10 +230,11 @@ def _sampling_strategy_minority(y, sampling_type): def _sampling_strategy_auto(y, sampling_type): """Returns sampling target auto for over-sampling and not-minority for under-sampling.""" - if sampling_type == 'over-sampling': + if sampling_type == "over-sampling": return _sampling_strategy_not_majority(y, sampling_type) - elif (sampling_type == 'under-sampling' or - sampling_type == 'clean-sampling'): + elif ( + sampling_type == "under-sampling" or sampling_type == "clean-sampling" + ): return _sampling_strategy_not_minority(y, sampling_type) @@ -222,53 +243,74 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type): sampling.""" target_stats = _count_class_sample(y) # check that all keys in sampling_strategy are also in y - set_diff_sampling_strategy_target = ( - set(sampling_strategy.keys()) - set(target_stats.keys())) + set_diff_sampling_strategy_target = set(sampling_strategy.keys()) - set( + target_stats.keys() + ) if len(set_diff_sampling_strategy_target) > 0: - raise ValueError("The {} target class is/are not present in the" - " data.".format(set_diff_sampling_strategy_target)) + raise ValueError( + "The {} target class is/are not present in the" + " data.".format(set_diff_sampling_strategy_target) + ) # check that there is no negative number if any(n_samples < 0 for n_samples in sampling_strategy.values()): - raise ValueError("The number of samples in a class cannot be negative." - "'sampling_strategy' contains some negative value: {}" - .format(sampling_strategy)) + raise ValueError( + "The number of samples in a class cannot be negative." 
+ "'sampling_strategy' contains some negative value: {}".format( + sampling_strategy + ) + ) sampling_strategy_ = {} - if sampling_type == 'over-sampling': + if sampling_type == "over-sampling": n_samples_majority = max(target_stats.values()) class_majority = max(target_stats, key=target_stats.get) for class_sample, n_samples in sampling_strategy.items(): if n_samples < target_stats[class_sample]: - raise ValueError("With over-sampling methods, the number" - " of samples in a class should be greater" - " or equal to the original number of samples." - " Originally, there is {} samples and {}" - " samples are asked.".format( - target_stats[class_sample], n_samples)) + raise ValueError( + "With over-sampling methods, the number" + " of samples in a class should be greater" + " or equal to the original number of samples." + " Originally, there is {} samples and {}" + " samples are asked.".format( + target_stats[class_sample], n_samples + ) + ) if n_samples > n_samples_majority: - warnings.warn("After over-sampling, the number of samples ({})" - " in class {} will be larger than the number of" - " samples in the majority class (class #{} ->" - " {})".format(n_samples, class_sample, - class_majority, - n_samples_majority)) + warnings.warn( + "After over-sampling, the number of samples ({})" + " in class {} will be larger than the number of" + " samples in the majority class (class #{} ->" + " {})".format( + n_samples, + class_sample, + class_majority, + n_samples_majority, + ) + ) sampling_strategy_[class_sample] = ( - n_samples - target_stats[class_sample]) - elif sampling_type == 'under-sampling': + n_samples - target_stats[class_sample] + ) + elif sampling_type == "under-sampling": for class_sample, n_samples in sampling_strategy.items(): if n_samples > target_stats[class_sample]: - raise ValueError("With under-sampling methods, the number of" - " samples in a class should be less or equal" - " to the original number of samples." 
- " Originally, there is {} samples and {}" - " samples are asked.".format( - target_stats[class_sample], n_samples)) + raise ValueError( + "With under-sampling methods, the number of" + " samples in a class should be less or equal" + " to the original number of samples." + " Originally, there is {} samples and {}" + " samples are asked.".format( + target_stats[class_sample], n_samples + ) + ) sampling_strategy_[class_sample] = n_samples - elif sampling_type == 'clean-sampling': + elif sampling_type == "clean-sampling": # FIXME: Turn into an error in 0.6 - warnings.warn("'sampling_strategy' as a dict for cleaning methods is " - "deprecated and will raise an error in version 0.6. " - "Please give a list of the classes to be targeted by the" - " sampling.", DeprecationWarning) + warnings.warn( + "'sampling_strategy' as a dict for cleaning methods is " + "deprecated and will raise an error in version 0.6. " + "Please give a list of the classes to be targeted by the" + " sampling.", + DeprecationWarning, + ) # clean-sampling can be more permissive since those samplers do not # use samples for class_sample, n_samples in sampling_strategy.items(): @@ -282,17 +324,22 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type): def _sampling_strategy_list(sampling_strategy, y, sampling_type): """With cleaning methods, sampling_strategy can be a list to target the class of interest.""" - if sampling_type != 'clean-sampling': - raise ValueError("'sampling_strategy' cannot be a list for samplers " - "which are not cleaning methods.") + if sampling_type != "clean-sampling": + raise ValueError( + "'sampling_strategy' cannot be a list for samplers " + "which are not cleaning methods." 
+ ) target_stats = _count_class_sample(y) # check that all keys in sampling_strategy are also in y - set_diff_sampling_strategy_target = ( - set(sampling_strategy) - set(target_stats.keys())) + set_diff_sampling_strategy_target = set(sampling_strategy) - set( + target_stats.keys() + ) if len(set_diff_sampling_strategy_target) > 0: - raise ValueError("The {} target class is/are not present in the" - " data.".format(set_diff_sampling_strategy_target)) + raise ValueError( + "The {} target class is/are not present in the" + " data.".format(set_diff_sampling_strategy_target) + ) return { class_sample: min(target_stats.values()) @@ -304,38 +351,51 @@ def _sampling_strategy_float(sampling_strategy, y, sampling_type): """Take a proportion of the majority (over-sampling) or minority (under-sampling) class in binary classification.""" type_y = type_of_target(y) - if type_y != 'binary': + if type_y != "binary": raise ValueError( '"sampling_strategy" can be a float only when the type ' - 'of target is binary. For multi-class, use a dict.') + "of target is binary. For multi-class, use a dict." + ) target_stats = _count_class_sample(y) - if sampling_type == 'over-sampling': + if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) class_majority = max(target_stats, key=target_stats.get) sampling_strategy_ = { key: int(n_sample_majority * sampling_strategy - value) - for (key, value) in target_stats.items() if key != class_majority + for (key, value) in target_stats.items() + if key != class_majority } if any([n_samples <= 0 for n_samples in sampling_strategy_.values()]): - raise ValueError("The specified ratio required to remove samples " - "from the minority class while trying to " - "generate new samples. Please increase the " - "ratio.") - elif (sampling_type == 'under-sampling'): + raise ValueError( + "The specified ratio required to remove samples " + "from the minority class while trying to " + "generate new samples. 
Please increase the " + "ratio." + ) + elif sampling_type == "under-sampling": n_sample_minority = min(target_stats.values()) class_minority = min(target_stats, key=target_stats.get) sampling_strategy_ = { key: int(n_sample_minority / sampling_strategy) - for (key, value) in target_stats.items() if key != class_minority + for (key, value) in target_stats.items() + if key != class_minority } - if any([n_samples > target_stats[target] - for target, n_samples in sampling_strategy_.items()]): - raise ValueError("The specified ratio required to generate new " - "sample in the majority class while trying to " - "remove samples. Please increase the ratio.") + if any( + [ + n_samples > target_stats[target] + for target, n_samples in sampling_strategy_.items() + ] + ): + raise ValueError( + "The specified ratio required to generate new " + "sample in the majority class while trying to " + "remove samples. Please increase the ratio." + ) else: - raise ValueError("'clean-sampling' methods do let the user " - "specify the sampling ratio.") + raise ValueError( + "'clean-sampling' methods do let the user " + "specify the sampling ratio." + ) return sampling_strategy_ @@ -429,52 +489,82 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): """ if sampling_type not in SAMPLING_KIND: - raise ValueError("'sampling_type' should be one of {}. Got '{}'" - " instead.".format(SAMPLING_KIND, sampling_type)) + raise ValueError( + "'sampling_type' should be one of {}. Got '{}'" + " instead.".format(SAMPLING_KIND, sampling_type) + ) if np.unique(y).size <= 1: - raise ValueError("The target 'y' needs to have more than 1 class." - " Got {} class instead".format(np.unique(y).size)) + raise ValueError( + "The target 'y' needs to have more than 1 class." 
+ " Got {} class instead".format(np.unique(y).size) + ) - if sampling_type in ('ensemble', 'bypass'): + if sampling_type in ("ensemble", "bypass"): return sampling_strategy if isinstance(sampling_strategy, str): if sampling_strategy not in SAMPLING_TARGET_KIND.keys(): - raise ValueError("When 'sampling_strategy' is a string, it needs" - " to be one of {}. Got '{}' instead.".format( - SAMPLING_TARGET_KIND, sampling_strategy)) - return OrderedDict(sorted( - SAMPLING_TARGET_KIND[sampling_strategy](y, sampling_type).items())) + raise ValueError( + "When 'sampling_strategy' is a string, it needs" + " to be one of {}. Got '{}' instead.".format( + SAMPLING_TARGET_KIND, sampling_strategy + ) + ) + return OrderedDict( + sorted( + SAMPLING_TARGET_KIND[sampling_strategy]( + y, sampling_type + ).items() + ) + ) elif isinstance(sampling_strategy, dict): - return OrderedDict(sorted( - _sampling_strategy_dict(sampling_strategy, y, sampling_type) - .items())) + return OrderedDict( + sorted( + _sampling_strategy_dict( + sampling_strategy, y, sampling_type + ).items() + ) + ) elif isinstance(sampling_strategy, list): - return OrderedDict(sorted( - _sampling_strategy_list(sampling_strategy, y, sampling_type) - .items())) + return OrderedDict( + sorted( + _sampling_strategy_list( + sampling_strategy, y, sampling_type + ).items() + ) + ) elif isinstance(sampling_strategy, Real): if sampling_strategy <= 0 or sampling_strategy > 1: raise ValueError( "When 'sampling_strategy' is a float, it should be " - "in the range (0, 1]. Got {} instead." - .format(sampling_strategy)) - return OrderedDict(sorted( - _sampling_strategy_float(sampling_strategy, y, sampling_type) - .items())) + "in the range (0, 1]. 
Got {} instead.".format( + sampling_strategy + ) + ) + return OrderedDict( + sorted( + _sampling_strategy_float( + sampling_strategy, y, sampling_type + ).items() + ) + ) elif callable(sampling_strategy): sampling_strategy_ = sampling_strategy(y, **kwargs) - return OrderedDict(sorted( - _sampling_strategy_dict(sampling_strategy_, y, sampling_type) - .items())) + return OrderedDict( + sorted( + _sampling_strategy_dict( + sampling_strategy_, y, sampling_type + ).items() + ) + ) SAMPLING_TARGET_KIND = { - 'minority': _sampling_strategy_minority, - 'majority': _sampling_strategy_majority, - 'not minority': _sampling_strategy_not_minority, - 'not majority': _sampling_strategy_not_majority, - 'all': _sampling_strategy_all, - 'auto': _sampling_strategy_auto + "minority": _sampling_strategy_minority, + "majority": _sampling_strategy_majority, + "not minority": _sampling_strategy_not_minority, + "not majority": _sampling_strategy_not_majority, + "all": _sampling_strategy_all, + "auto": _sampling_strategy_auto, } diff --git a/imblearn/utils/deprecation.py b/imblearn/utils/deprecation.py index 3aa74b2cd..11262ea22 100644 --- a/imblearn/utils/deprecation.py +++ b/imblearn/utils/deprecation.py @@ -6,10 +6,9 @@ import warnings -def deprecate_parameter(sampler, - version_deprecation, - param_deprecated, - new_param=None): +def deprecate_parameter( + sampler, version_deprecation, param_deprecated, new_param=None +): """Helper to deprecate a parameter by another one. Parameters @@ -33,22 +32,31 @@ def deprecate_parameter(sampler, None """ - x, y = version_deprecation.split('.') - version_removed = x + '.' + str(int(y) + 2) + x, y = version_deprecation.split(".") + version_removed = x + "." + str(int(y) + 2) if new_param is None: if getattr(sampler, param_deprecated) is not None: warnings.warn( "'{}' is deprecated from {} and will be removed in" - " {} for the estimator {}." 
- .format(param_deprecated, version_deprecation, - version_removed, sampler.__class__), - category=DeprecationWarning) + " {} for the estimator {}.".format( + param_deprecated, + version_deprecation, + version_removed, + sampler.__class__, + ), + category=DeprecationWarning, + ) else: if getattr(sampler, param_deprecated) is not None: warnings.warn( "'{}' is deprecated from {} and will be removed in" - " {} for the estimator {}. Use '{}' instead." - .format(param_deprecated, version_deprecation, - version_removed, sampler.__class__, new_param), - category=DeprecationWarning) + " {} for the estimator {}. Use '{}' instead.".format( + param_deprecated, + version_deprecation, + version_removed, + sampler.__class__, + new_param, + ), + category=DeprecationWarning, + ) setattr(sampler, new_param, getattr(sampler, param_deprecated)) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 897e7f5e1..51f814764 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -15,11 +15,16 @@ from scipy import sparse from sklearn.base import clone -from sklearn.datasets import make_classification, make_multilabel_classification # noqa +from sklearn.datasets import ( + make_classification, + make_multilabel_classification, +) # noqa from sklearn.cluster import KMeans from sklearn.preprocessing import label_binarize -from sklearn.utils.estimator_checks import check_estimator \ - as sklearn_check_estimator, check_parameters_default_constructible +from sklearn.utils.estimator_checks import ( + check_estimator as sklearn_check_estimator, + check_parameters_default_constructible, +) from sklearn.utils.testing import assert_allclose from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import set_random_state @@ -52,10 +57,10 @@ def _yield_classifier_checks(name, Estimator): def _yield_all_checks(name, estimator): # trigger our checks if this is a SamplerMixin - if hasattr(estimator, 
'fit_resample'): + if hasattr(estimator, "fit_resample"): for check in _yield_sampler_checks(name, estimator): yield check - if hasattr(estimator, 'predict'): + if hasattr(estimator, "predict"): for check in _yield_classifier_checks(name, estimator): yield check @@ -110,7 +115,7 @@ def check_samplers_one_label(name, Sampler): try: sampler.fit_resample(X, y) except ValueError as e: - if 'class' not in repr(e): + if "class" not in repr(e): print(error_string_fit, Sampler, e) traceback.print_exc(file=sys.stdout) raise e @@ -128,14 +133,20 @@ def check_samplers_fit(name, Sampler): X = np.random.random((30, 2)) y = np.array([1] * 20 + [0] * 10) sampler.fit_resample(X, y) - assert hasattr(sampler, 'sampling_strategy_'), \ - "No fitted attribute sampling_strategy_" + assert hasattr( + sampler, "sampling_strategy_" + ), "No fitted attribute sampling_strategy_" def check_samplers_fit_resample(name, Sampler): sampler = Sampler() - X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, - weights=[0.2, 0.3, 0.5], random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) target_stats = Counter(y) X_res, y_res = sampler.fit_resample(X, y) if isinstance(sampler, BaseOverSampler): @@ -144,30 +155,40 @@ def check_samplers_fit_resample(name, Sampler): assert all(value >= n_samples for value in Counter(y_res).values()) elif isinstance(sampler, BaseUnderSampler): n_samples = min(target_stats.values()) - if name == 'InstanceHardnessThreshold': + if name == "InstanceHardnessThreshold": # IHT does not enforce the number of samples but provide a number # of samples the closest to the desired target. 
- assert all(Counter(y_res)[k] <= target_stats[k] - for k in target_stats.keys()) + assert all( + Counter(y_res)[k] <= target_stats[k] + for k in target_stats.keys() + ) else: assert all(value == n_samples for value in Counter(y_res).values()) elif isinstance(sampler, BaseCleaningSampler): target_stats_res = Counter(y_res) class_minority = min(target_stats, key=target_stats.get) - assert all(target_stats[class_sample] > target_stats_res[class_sample] - for class_sample in target_stats.keys() - if class_sample != class_minority) + assert all( + target_stats[class_sample] > target_stats_res[class_sample] + for class_sample in target_stats.keys() + if class_sample != class_minority + ) elif isinstance(sampler, BaseEnsembleSampler): y_ensemble = y_res[0] n_samples = min(target_stats.values()) - assert all(value == n_samples - for value in Counter(y_ensemble).values()) + assert all( + value == n_samples for value in Counter(y_ensemble).values() + ) def check_samplers_sampling_strategy_fit_resample(name, Sampler): # in this test we will force all samplers to not change the class 1 - X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, - weights=[0.2, 0.3, 0.5], random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) sampler = Sampler() expected_stat = Counter(y)[1] if isinstance(sampler, BaseOverSampler): @@ -196,8 +217,13 @@ def check_samplers_sampling_strategy_fit_resample(name, Sampler): def check_samplers_sparse(name, Sampler): # check that sparse matrices can be passed through the sampler leading to # the same results than dense - X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, - weights=[0.2, 0.3, 0.5], random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) X_sparse = sparse.csr_matrix(X) if isinstance(Sampler(), NearMiss): samplers = 
[Sampler(version=version) for version in (1, 2, 3)] @@ -206,8 +232,9 @@ def check_samplers_sparse(name, Sampler): samplers = [ Sampler( random_state=0, - voting='soft', - estimator=KMeans(random_state=1, algorithm='full')) + voting="soft", + estimator=KMeans(random_state=1, algorithm="full"), + ) ] else: samplers = [Sampler()] @@ -221,8 +248,9 @@ def check_samplers_sparse(name, Sampler): assert_allclose(X_res_sparse.A, X_res) assert_allclose(y_res_sparse, y_res) else: - for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, - y_res): + for x_sp, x, y_sp, y in zip( + X_res_sparse, X_res, y_res_sparse, y_res + ): assert sparse.issparse(x_sp) assert_allclose(x_sp.A, x) assert_allclose(y_sp, y) @@ -231,8 +259,13 @@ def check_samplers_sparse(name, Sampler): def check_samplers_pandas(name, Sampler): pd = pytest.importorskip("pandas") # Check that the samplers handle pandas dataframe and pandas series - X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, - weights=[0.2, 0.3, 0.5], random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) X_pd = pd.DataFrame(X) sampler = Sampler() if isinstance(Sampler(), NearMiss): @@ -251,8 +284,13 @@ def check_samplers_pandas(name, Sampler): def check_samplers_multiclass_ova(name, Sampler): # Check that multiclass target lead to the same results than OVA encoding - X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, - weights=[0.2, 0.3, 0.5], random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) y_ova = label_binarize(y, np.unique(y)) sampler = Sampler() set_random_state(sampler) @@ -269,8 +307,13 @@ def check_samplers_multiclass_ova(name, Sampler): def check_samplers_preserve_dtype(name, Sampler): - X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, - weights=[0.2, 0.3, 0.5], 
random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) # Cast X and y to not default dtype X = X.astype(np.float32) y = y.astype(np.int32) @@ -282,15 +325,20 @@ def check_samplers_preserve_dtype(name, Sampler): def check_samplers_sample_indices(name, Sampler): - X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, - weights=[0.2, 0.3, 0.5], random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) sampler = Sampler() sampler.fit_resample(X, y) - sample_indices = sampler._get_tags().get('sample_indices', None) + sample_indices = sampler._get_tags().get("sample_indices", None) if sample_indices: - assert hasattr(sampler, 'sample_indices_') is sample_indices + assert hasattr(sampler, "sample_indices_") is sample_indices else: - assert not hasattr(sampler, 'sample_indices_') + assert not hasattr(sampler, "sample_indices_") def check_classifier_on_multilabel_or_multioutput_targets(name, Estimator): diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py index dd4d61336..5478c2eab 100644 --- a/imblearn/utils/testing.py +++ b/imblearn/utils/testing.py @@ -25,10 +25,12 @@ DONT_TEST = [] -def all_estimators(include_meta_estimators=False, - include_other=False, - type_filter=None, - include_dont_test=False): +def all_estimators( + include_meta_estimators=False, + include_other=False, + type_filter=None, + include_dont_test=False, +): """Get a list of all estimators from imblearn. 
This function crawls the module and gets all classes that inherit @@ -67,7 +69,7 @@ def all_estimators(include_meta_estimators=False, """ def is_abstract(c): - if not (hasattr(c, '__abstractmethods__')): + if not (hasattr(c, "__abstractmethods__")): return False if not len(c.__abstractmethods__): return False @@ -77,8 +79,9 @@ def is_abstract(c): # get parent folder path = imblearn.__path__ for importer, modname, ispkg in pkgutil.walk_packages( - path=path, prefix='imblearn.', onerror=lambda x: None): - if (".tests." in modname): + path=path, prefix="imblearn.", onerror=lambda x: None + ): + if ".tests." in modname: continue module = __import__(modname, fromlist="dummy") classes = inspect.getmembers(module, inspect.isclass) @@ -87,8 +90,9 @@ def is_abstract(c): all_classes = set(all_classes) estimators = [ - c for c in all_classes - if (issubclass(c[1], BaseEstimator) and c[0] != 'BaseEstimator') + c + for c in all_classes + if (issubclass(c[1], BaseEstimator) and c[0] != "BaseEstimator") ] # get rid of abstract base classes estimators = [c for c in estimators if not is_abstract(c[1])] @@ -110,17 +114,20 @@ def is_abstract(c): else: type_filter = list(type_filter) # copy filtered_estimators = [] - filters = {'sampler': SamplerMixin} + filters = {"sampler": SamplerMixin} for name, mixin in filters.items(): if name in type_filter: type_filter.remove(name) filtered_estimators.extend( - [est for est in estimators if issubclass(est[1], mixin)]) + [est for est in estimators if issubclass(est[1], mixin)] + ) estimators = filtered_estimators if type_filter: - raise ValueError("Parameter type_filter must be 'sampler' or " - "None, got" - " %s." % repr(type_filter)) + raise ValueError( + "Parameter type_filter must be 'sampler' or " + "None, got" + " %s." 
% repr(type_filter) + ) # drop duplicates, sort for reproducibility # itemgetter is used to ensure the sort does not extend to the 2nd item of @@ -166,7 +173,8 @@ def warns(expected_warning, match=None): break else: msg = "'{}' pattern not found in {}".format( - match, '{}'.format([str(r.message) for r in record])) + match, "{}".format([str(r.message) for r in record]) + ) assert False, msg else: pass diff --git a/imblearn/utils/tests/test_deprecation.py b/imblearn/utils/tests/test_deprecation.py index 1bc1326b0..4f352aaae 100644 --- a/imblearn/utils/tests/test_deprecation.py +++ b/imblearn/utils/tests/test_deprecation.py @@ -9,12 +9,12 @@ class Sampler: def __init__(self): - self.a = 'something' - self.b = 'something' + self.a = "something" + self.b = "something" def test_deprecate_parameter(): with warns(DeprecationWarning, match="is deprecated from"): - deprecate_parameter(Sampler(), '0.2', 'a') + deprecate_parameter(Sampler(), "0.2", "a") with warns(DeprecationWarning, match="Use 'b' instead."): - deprecate_parameter(Sampler(), '0.2', 'a', 'b') + deprecate_parameter(Sampler(), "0.2", "a", "b") diff --git a/imblearn/utils/tests/test_docstring.py b/imblearn/utils/tests/test_docstring.py index cddce2858..74b3fe0c1 100644 --- a/imblearn/utils/tests/test_docstring.py +++ b/imblearn/utils/tests/test_docstring.py @@ -7,8 +7,7 @@ from imblearn.utils import Substitution -func_docstring = \ - """A function. +func_docstring = """A function. Parameters ---------- @@ -30,8 +29,7 @@ def func(param_1, param_2): return param_1, param_2 -cls_docstring = \ - """A class. +cls_docstring = """A class. 
Parameters ---------- @@ -56,8 +54,9 @@ def __init__(self, param_1, param_2): self.param_2 = param_2 -@pytest.mark.parametrize("obj, obj_docstring", [(func, func_docstring), - (cls, cls_docstring)]) +@pytest.mark.parametrize( + "obj, obj_docstring", [(func, func_docstring), (cls, cls_docstring)] +) def test_docstring_inject(obj, obj_docstring): - obj_injected_docstring = Substitution(param_1='xxx', param_2='yyy')(obj) + obj_injected_docstring = Substitution(param_1="xxx", param_2="yyy")(obj) assert obj_injected_docstring.__doc__ == obj_docstring diff --git a/imblearn/utils/tests/test_estimator_checks.py b/imblearn/utils/tests/test_estimator_checks.py index 7ee712ff9..9a66c29e1 100644 --- a/imblearn/utils/tests/test_estimator_checks.py +++ b/imblearn/utils/tests/test_estimator_checks.py @@ -12,7 +12,8 @@ class BaseBadSampler(BaseEstimator): """Sampler without inputs checking.""" - _sampling_type = 'bypass' + + _sampling_type = "bypass" def fit(self, X, y): return self @@ -25,6 +26,7 @@ def fit_resample(self, X, y): class NotFittedSampler(BaseBadSampler): """Sampler without target checking.""" + def fit(self, X, y): y, _ = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=True) @@ -33,15 +35,16 @@ def fit(self, X, y): class NoAcceptingSparseSampler(BaseBadSampler): """Sampler which does not accept sparse matrix.""" + def fit(self, X, y): y, _ = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=False) - self.sampling_strategy_ = 'sampling_strategy_' + self.sampling_strategy_ = "sampling_strategy_" return self class NotPreservingDtypeSampler(BaseSampler): - _sampling_type = 'bypass' + _sampling_type = "bypass" def _fit_resample(self, X, y): return X.astype(np.float64), y.astype(np.int64) @@ -50,11 +53,17 @@ def _fit_resample(self, X, y): @pytest.mark.filterwarnings("ignore:'y' should be of types") @pytest.mark.filterwarnings("ignore: Can't check dok sparse matrix for nan") @pytest.mark.parametrize( - 
'Estimator, err_type, err_msg', - [(BaseBadSampler, AssertionError, "ValueError not raised by fit"), - (NotFittedSampler, AssertionError, "No fitted attribute"), - (NoAcceptingSparseSampler, TypeError, "A sparse matrix was passed"), - (NotPreservingDtypeSampler, AssertionError, "X dtype is not preserved")] + "Estimator, err_type, err_msg", + [ + (BaseBadSampler, AssertionError, "ValueError not raised by fit"), + (NotFittedSampler, AssertionError, "No fitted attribute"), + (NoAcceptingSparseSampler, TypeError, "A sparse matrix was passed"), + ( + NotPreservingDtypeSampler, + AssertionError, + "X dtype is not preserved", + ), + ], ) def test_check_estimator(Estimator, err_type, err_msg): with pytest.raises(err_type, match=err_msg): diff --git a/imblearn/utils/tests/test_testing.py b/imblearn/utils/tests/test_testing.py index b40d38d9d..dab2ed532 100644 --- a/imblearn/utils/tests/test_testing.py +++ b/imblearn/utils/tests/test_testing.py @@ -13,16 +13,16 @@ def test_all_estimators(): # check if the filtering is working with a list or a single string - type_filter = 'sampler' + type_filter = "sampler" all_estimators(type_filter=type_filter) - type_filter = ['sampler'] + type_filter = ["sampler"] estimators = all_estimators(type_filter=type_filter) for estimator in estimators: # check that all estimators are sampler assert issubclass(estimator[1], SamplerMixin) # check that an error is raised when the type is unknown - type_filter = 'rnd' + type_filter = "rnd" with raises(ValueError, match="Parameter type_filter must be 'sampler'"): all_estimators(type_filter=type_filter) @@ -30,21 +30,21 @@ def test_all_estimators(): def test_warns(): import warnings - with warns(UserWarning, match=r'must be \d+$'): + with warns(UserWarning, match=r"must be \d+$"): warnings.warn("value must be 42", UserWarning) - with raises(AssertionError, match='pattern not found'): - with warns(UserWarning, match=r'must be \d+$'): + with raises(AssertionError, match="pattern not found"): + with 
warns(UserWarning, match=r"must be \d+$"): warnings.warn("this is not here", UserWarning) - with warns(UserWarning, match=r'aaa'): + with warns(UserWarning, match=r"aaa"): warnings.warn("cccccccccc", UserWarning) warnings.warn("bbbbbbbbbb", UserWarning) warnings.warn("aaaaaaaaaa", UserWarning) - a, b, c = ('aaa', 'bbbbbbbbbb', 'cccccccccc') + a, b, c = ("aaa", "bbbbbbbbbb", "cccccccccc") expected_msg = r"'{}' pattern not found in \['{}', '{}'\]".format(a, b, c) with raises(AssertionError, match=expected_msg): - with warns(UserWarning, match=r'aaa'): + with warns(UserWarning, match=r"aaa"): warnings.warn("bbbbbbbbbb", UserWarning) warnings.warn("cccccccccc", UserWarning) diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 9af14163b..8df1dd28b 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -23,7 +23,7 @@ def test_check_neighbors_object(): - name = 'n_neighbors' + name = "n_neighbors" n_neighbors = 1 estimator = check_neighbors_object(name, n_neighbors) assert issubclass(type(estimator), KNeighborsMixin) @@ -34,49 +34,70 @@ def test_check_neighbors_object(): estimator = NearestNeighbors(n_neighbors) estimator_cloned = check_neighbors_object(name, estimator) assert estimator.n_neighbors == estimator_cloned.n_neighbors - n_neighbors = 'rnd' + n_neighbors = "rnd" with pytest.raises(ValueError, match="has to be one of"): check_neighbors_object(name, n_neighbors) -@pytest.mark.parametrize("target, output_target", [(np.array( - [0, 1, 1]), np.array([0, 1, 1])), (np.array([0, 1, 2]), np.array( - [0, 1, 2])), (np.array([[0, 1], [1, 0]]), np.array([1, 0]))]) +@pytest.mark.parametrize( + "target, output_target", + [ + (np.array([0, 1, 1]), np.array([0, 1, 1])), + (np.array([0, 1, 2]), np.array([0, 1, 2])), + (np.array([[0, 1], [1, 0]]), np.array([1, 0])), + ], +) def test_check_target_type(target, output_target): converted_target = check_target_type(target.astype(int)) 
assert_array_equal(converted_target, output_target.astype(int)) -@pytest.mark.parametrize("target, output_target, is_ova", - [(np.array([0, 1, 1]), np.array([0, 1, 1]), False), - (np.array([0, 1, 2]), np.array([0, 1, 2]), - False), (np.array([[0, 1], [1, 0]]), - np.array([1, 0]), True)]) +@pytest.mark.parametrize( + "target, output_target, is_ova", + [ + (np.array([0, 1, 1]), np.array([0, 1, 1]), False), + (np.array([0, 1, 2]), np.array([0, 1, 2]), False), + (np.array([[0, 1], [1, 0]]), np.array([1, 0]), True), + ], +) def test_check_target_type_ova(target, output_target, is_ova): converted_target, binarize_target = check_target_type( - target.astype(int), indicate_one_vs_all=True) + target.astype(int), indicate_one_vs_all=True + ) assert_array_equal(converted_target, output_target.astype(int)) assert binarize_target == is_ova def test_check_sampling_strategy_warning(): - msg = 'dict for cleaning methods is deprecated' + msg = "dict for cleaning methods is deprecated" with pytest.warns(DeprecationWarning, match=msg): - check_sampling_strategy({ - 1: 0, - 2: 0, - 3: 0 - }, multiclass_target, 'clean-sampling') + check_sampling_strategy( + {1: 0, 2: 0, 3: 0}, multiclass_target, "clean-sampling" + ) @pytest.mark.parametrize( "ratio, y, type, err_msg", - [(0.5, binary_target, 'clean-sampling', - "'clean-sampling' methods do let the user specify the sampling ratio"), - (0.1, np.array([0] * 10 + [1] * 20), 'over-sampling', - "remove samples from the minority class while trying to generate new"), - (0.1, np.array([0] * 10 + [1] * 20), 'under-sampling', - "generate new sample in the majority class while trying to remove")] + [ + ( + 0.5, + binary_target, + "clean-sampling", + "'clean-sampling' methods do let the user specify the sampling ratio", + ), + ( + 0.1, + np.array([0] * 10 + [1] * 20), + "over-sampling", + "remove samples from the minority class while trying to generate new", + ), + ( + 0.1, + np.array([0] * 10 + [1] * 20), + "under-sampling", + "generate new sample 
in the majority class while trying to remove", + ), + ], ) def test_check_sampling_strategy_float_error(ratio, y, type, err_msg): with pytest.raises(ValueError, match=err_msg): @@ -85,37 +106,49 @@ def test_check_sampling_strategy_float_error(ratio, y, type, err_msg): def test_check_sampling_strategy_error(): with pytest.raises(ValueError, match="'sampling_type' should be one of"): - check_sampling_strategy('auto', np.array([1, 2, 3]), 'rnd') + check_sampling_strategy("auto", np.array([1, 2, 3]), "rnd") error_regex = "The target 'y' needs to have more than 1 class." with pytest.raises(ValueError, match=error_regex): - check_sampling_strategy('auto', np.ones((10, )), 'over-sampling') + check_sampling_strategy("auto", np.ones((10,)), "over-sampling") error_regex = "When 'sampling_strategy' is a string, it needs to be one of" with pytest.raises(ValueError, match=error_regex): - check_sampling_strategy('rnd', np.array([1, 2, 3]), 'over-sampling') + check_sampling_strategy("rnd", np.array([1, 2, 3]), "over-sampling") -@pytest.mark.parametrize("sampling_strategy, sampling_type, err_msg", - [('majority', 'over-sampling', 'over-sampler'), - ('minority', 'under-sampling', 'under-sampler')]) -def test_check_sampling_strategy_error_wrong_string(sampling_strategy, - sampling_type, err_msg): +@pytest.mark.parametrize( + "sampling_strategy, sampling_type, err_msg", + [ + ("majority", "over-sampling", "over-sampler"), + ("minority", "under-sampling", "under-sampler"), + ], +) +def test_check_sampling_strategy_error_wrong_string( + sampling_strategy, sampling_type, err_msg +): with pytest.raises( - ValueError, - match=("'{}' cannot be used with {}".format( - sampling_strategy, err_msg))): - check_sampling_strategy(sampling_strategy, - np.array([1, 2, 3]), sampling_type) - - -@pytest.mark.parametrize("sampling_strategy, sampling_method", [({ - 10: 10 -}, 'under-sampling'), ({ - 10: 10 -}, 'over-sampling'), ([10], 'clean-sampling')]) -def 
test_sampling_strategy_class_target_unknown(sampling_strategy, - sampling_method): + ValueError, + match=( + "'{}' cannot be used with {}".format(sampling_strategy, err_msg) + ), + ): + check_sampling_strategy( + sampling_strategy, np.array([1, 2, 3]), sampling_type + ) + + +@pytest.mark.parametrize( + "sampling_strategy, sampling_method", + [ + ({10: 10}, "under-sampling"), + ({10: 10}, "over-sampling"), + ([10], "clean-sampling"), + ], +) +def test_sampling_strategy_class_target_unknown( + sampling_strategy, sampling_method +): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="are not present in the data."): check_sampling_strategy(sampling_strategy, y, sampling_method) @@ -125,42 +158,47 @@ def test_sampling_strategy_dict_error(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy = {1: -100, 2: 50, 3: 25} with pytest.raises(ValueError, match="in a class cannot be negative."): - check_sampling_strategy(sampling_strategy, y, 'under-sampling') + check_sampling_strategy(sampling_strategy, y, "under-sampling") sampling_strategy = {1: 45, 2: 100, 3: 70} - error_regex = ("With over-sampling methods, the number of samples in a" - " class should be greater or equal to the original number" - " of samples. Originally, there is 50 samples and 45" - " samples are asked.") + error_regex = ( + "With over-sampling methods, the number of samples in a" + " class should be greater or equal to the original number" + " of samples. Originally, there is 50 samples and 45" + " samples are asked." + ) with pytest.raises(ValueError, match=error_regex): - check_sampling_strategy(sampling_strategy, y, 'over-sampling') - - error_regex = ("With under-sampling methods, the number of samples in a" - " class should be less or equal to the original number of" - " samples. 
Originally, there is 25 samples and 70 samples" - " are asked.") + check_sampling_strategy(sampling_strategy, y, "over-sampling") + + error_regex = ( + "With under-sampling methods, the number of samples in a" + " class should be less or equal to the original number of" + " samples. Originally, there is 25 samples and 70 samples" + " are asked." + ) with pytest.raises(ValueError, match=error_regex): - check_sampling_strategy(sampling_strategy, y, 'under-sampling') + check_sampling_strategy(sampling_strategy, y, "under-sampling") @pytest.mark.parametrize("sampling_strategy", [-10, 10]) def test_sampling_strategy_float_error_not_in_range(sampling_strategy): y = np.array([1] * 50 + [2] * 100) - with pytest.raises(ValueError, match='it should be in the range'): - check_sampling_strategy(sampling_strategy, y, 'under-sampling') + with pytest.raises(ValueError, match="it should be in the range"): + check_sampling_strategy(sampling_strategy, y, "under-sampling") def test_sampling_strategy_float_error_not_binary(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) - with pytest.raises(ValueError, match='the type of target is binary'): + with pytest.raises(ValueError, match="the type of target is binary"): sampling_strategy = 0.5 - check_sampling_strategy(sampling_strategy, y, 'under-sampling') + check_sampling_strategy(sampling_strategy, y, "under-sampling") -@pytest.mark.parametrize("sampling_method", - ['over-sampling', 'under-sampling']) +@pytest.mark.parametrize( + "sampling_method", ["over-sampling", "under-sampling"] +) def test_sampling_strategy_list_error_not_clean_sampling(sampling_method): y = np.array([1] * 50 + [2] * 100 + [3] * 25) - with pytest.raises(ValueError, match='cannot be a list for samplers'): + with pytest.raises(ValueError, match="cannot be a list for samplers"): sampling_strategy = [1, 2, 3] check_sampling_strategy(sampling_strategy, y, sampling_method) @@ -174,93 +212,64 @@ def _sampling_strategy_func(y): @pytest.mark.parametrize( "sampling_strategy, 
sampling_type, expected_sampling_strategy, target", - [('auto', 'under-sampling', { - 1: 25, - 2: 25 - }, multiclass_target), ('auto', 'clean-sampling', { - 1: 25, - 2: 25 - }, multiclass_target), ('auto', 'over-sampling', { - 1: 50, - 3: 75 - }, multiclass_target), ('all', 'over-sampling', { - 1: 50, - 2: 0, - 3: 75 - }, multiclass_target), ('all', 'under-sampling', { - 1: 25, - 2: 25, - 3: 25 - }, multiclass_target), ('all', 'clean-sampling', { - 1: 25, - 2: 25, - 3: 25 - }, multiclass_target), ('majority', 'under-sampling', { - 2: 25 - }, multiclass_target), ('majority', 'clean-sampling', { - 2: 25 - }, multiclass_target), ('minority', 'over-sampling', { - 3: 75 - }, multiclass_target), ('not minority', 'over-sampling', { - 1: 50, - 2: 0 - }, multiclass_target), ('not minority', 'under-sampling', { - 1: 25, - 2: 25 - }, multiclass_target), ('not minority', 'clean-sampling', { - 1: 25, - 2: 25 - }, multiclass_target), ('not majority', 'over-sampling', { - 1: 50, - 3: 75 - }, multiclass_target), ('not majority', 'under-sampling', { - 1: 25, - 3: 25 - }, multiclass_target), ('not majority', 'clean-sampling', { - 1: 25, - 3: 25 - }, multiclass_target), ({ - 1: 70, - 2: 100, - 3: 70 - }, 'over-sampling', { - 1: 20, - 2: 0, - 3: 45 - }, multiclass_target), ({ - 1: 30, - 2: 45, - 3: 25 - }, 'under-sampling', { - 1: 30, - 2: 45, - 3: 25 - }, multiclass_target), ([1], 'clean-sampling', { - 1: 25 - }, multiclass_target), (_sampling_strategy_func, 'over-sampling', { - 1: 50, - 2: 0, - 3: 75 - }, multiclass_target), (0.5, 'over-sampling', { - 1: 25 - }, binary_target), (0.5, 'under-sampling', { - 0: 50 - }, binary_target)]) -def test_check_sampling_strategy(sampling_strategy, sampling_type, - expected_sampling_strategy, target): - sampling_strategy_ = check_sampling_strategy(sampling_strategy, target, - sampling_type) + [ + ("auto", "under-sampling", {1: 25, 2: 25}, multiclass_target), + ("auto", "clean-sampling", {1: 25, 2: 25}, multiclass_target), + ("auto", 
"over-sampling", {1: 50, 3: 75}, multiclass_target), + ("all", "over-sampling", {1: 50, 2: 0, 3: 75}, multiclass_target), + ("all", "under-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target), + ("all", "clean-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target), + ("majority", "under-sampling", {2: 25}, multiclass_target), + ("majority", "clean-sampling", {2: 25}, multiclass_target), + ("minority", "over-sampling", {3: 75}, multiclass_target), + ("not minority", "over-sampling", {1: 50, 2: 0}, multiclass_target), + ("not minority", "under-sampling", {1: 25, 2: 25}, multiclass_target), + ("not minority", "clean-sampling", {1: 25, 2: 25}, multiclass_target), + ("not majority", "over-sampling", {1: 50, 3: 75}, multiclass_target), + ("not majority", "under-sampling", {1: 25, 3: 25}, multiclass_target), + ("not majority", "clean-sampling", {1: 25, 3: 25}, multiclass_target), + ( + {1: 70, 2: 100, 3: 70}, + "over-sampling", + {1: 20, 2: 0, 3: 45}, + multiclass_target, + ), + ( + {1: 30, 2: 45, 3: 25}, + "under-sampling", + {1: 30, 2: 45, 3: 25}, + multiclass_target, + ), + ([1], "clean-sampling", {1: 25}, multiclass_target), + ( + _sampling_strategy_func, + "over-sampling", + {1: 50, 2: 0, 3: 75}, + multiclass_target, + ), + (0.5, "over-sampling", {1: 25}, binary_target), + (0.5, "under-sampling", {0: 50}, binary_target), + ], +) +def test_check_sampling_strategy( + sampling_strategy, sampling_type, expected_sampling_strategy, target +): + sampling_strategy_ = check_sampling_strategy( + sampling_strategy, target, sampling_type + ) assert sampling_strategy_ == expected_sampling_strategy def test_sampling_strategy_dict_over_sampling(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy = {1: 70, 2: 140, 3: 70} - expected_msg = (r"After over-sampling, the number of samples \(140\) in" - r" class 2 will be larger than the number of samples in" - r" the majority class \(class #2 -> 100\)") + expected_msg = ( + r"After over-sampling, the number of samples \(140\) 
in" + r" class 2 will be larger than the number of samples in" + r" the majority class \(class #2 -> 100\)" + ) with warns(UserWarning, expected_msg): - check_sampling_strategy(sampling_strategy, y, 'over-sampling') + check_sampling_strategy(sampling_strategy, y, "over-sampling") def test_sampling_strategy_callable_args(): @@ -276,21 +285,33 @@ def sampling_strategy_func(y, multiplier): } sampling_strategy_ = check_sampling_strategy( - sampling_strategy_func, y, 'over-sampling', multiplier=multiplier) + sampling_strategy_func, y, "over-sampling", multiplier=multiplier + ) assert sampling_strategy_ == {1: 25, 2: 0, 3: 50} @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_result", - [({3: 25, 1: 25, 2: 25}, 'under-sampling', - OrderedDict({1: 25, 2: 25, 3: 25})), - ({3: 100, 1: 100, 2: 100}, 'over-sampling', - OrderedDict({1: 50, 2: 0, 3: 75}))]) -def test_sampling_strategy_check_order(sampling_strategy, sampling_type, - expected_result): + [ + ( + {3: 25, 1: 25, 2: 25}, + "under-sampling", + OrderedDict({1: 25, 2: 25, 3: 25}), + ), + ( + {3: 100, 1: 100, 2: 100}, + "over-sampling", + OrderedDict({1: 50, 2: 0, 3: 75}), + ), + ], +) +def test_sampling_strategy_check_order( + sampling_strategy, sampling_type, expected_result +): # We pass on purpose a non sorted dictionary and check that the resulting # dictionary is sorted. Refer to issue #428. 
y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy_ = check_sampling_strategy( - sampling_strategy, y, sampling_type) + sampling_strategy, y, sampling_type + ) assert sampling_strategy_ == expected_result From f5821829002cb1d85b38d7e4a5cec88577e5c8a2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 17:31:45 +0100 Subject: [PATCH 23/37] flake8 --- imblearn/base.py | 2 -- imblearn/datasets/_imbalance.py | 1 - imblearn/ensemble/_weight_boosting.py | 2 -- imblearn/ensemble/tests/test_forest.py | 2 +- imblearn/keras/_generator.py | 20 +++---------------- .../over_sampling/_random_over_sampler.py | 1 - imblearn/over_sampling/_smote.py | 2 +- imblearn/over_sampling/tests/test_smote.py | 1 - imblearn/pipeline.py | 3 --- imblearn/tensorflow/_generator.py | 19 +++--------------- .../tests/test_cluster_centroids.py | 2 -- .../_condensed_nearest_neighbour.py | 1 - .../_instance_hardness_threshold.py | 3 +-- .../_prototype_selection/_nearmiss.py | 2 +- .../_one_sided_selection.py | 1 - .../_random_under_sampler.py | 1 - .../_prototype_selection/_tomek_links.py | 2 -- .../_prototype_selection/tests/test_allknn.py | 1 - .../tests/test_edited_nearest_neighbours.py | 1 - .../tests/test_instance_hardness_threshold.py | 1 - .../tests/test_nearmiss.py | 1 - .../tests/test_neighbourhood_cleaning_rule.py | 2 -- .../tests/test_random_under_sampler.py | 1 - ...test_repeated_edited_nearest_neighbours.py | 1 - .../tests/test_tomek_links.py | 2 -- imblearn/utils/_validation.py | 1 - imblearn/utils/estimator_checks.py | 4 ---- imblearn/utils/tests/test_validation.py | 4 ++-- 28 files changed, 12 insertions(+), 72 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 091e5e8e0..1c918c09e 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -4,7 +4,6 @@ # Christos Aridas # License: MIT -import warnings from abc import ABCMeta, abstractmethod import numpy as np @@ -15,7 +14,6 @@ from sklearn.utils.multiclass import 
check_classification_targets from .utils import check_sampling_strategy, check_target_type -from .utils.deprecation import deprecate_parameter class SamplerMixin(BaseEstimator, metaclass=ABCMeta): diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py index d483aec69..940a77b39 100644 --- a/imblearn/datasets/_imbalance.py +++ b/imblearn/datasets/_imbalance.py @@ -5,7 +5,6 @@ # Christos Aridas # License: MIT -import warnings from collections import Counter from sklearn.utils import check_X_y diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index 1a84c4c86..a757e6304 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -1,4 +1,3 @@ -import numbers from copy import deepcopy import numpy as np @@ -6,7 +5,6 @@ from sklearn.base import clone from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble._base import _set_random_states -from sklearn.tree import DecisionTreeClassifier from sklearn.utils import safe_indexing from ..under_sampling.base import BaseUnderSampler diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py index 85ad37743..a7aa4860f 100644 --- a/imblearn/ensemble/tests/test_forest.py +++ b/imblearn/ensemble/tests/test_forest.py @@ -114,7 +114,7 @@ def test_balanced_random_forest_oob(imbalanced_dataset): n_samples = X.shape[0] est.fit(X[: n_samples // 2, :], y[: n_samples // 2]) - test_score = est.score(X[n_samples // 2 :, :], y[n_samples // 2 :]) + test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:]) assert abs(test_score - est.oob_score_) < 0.1 diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index 64d2d6325..88ea040e3 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -46,22 +46,12 @@ def import_from_tensforflow(): from sklearn.base import clone from sklearn.utils import safe_indexing from sklearn.utils import check_random_state 
-from sklearn.utils._testing import set_random_state from ..under_sampling import RandomUnderSampler from ..utils import Substitution from ..utils._docstring import _random_state_docstring from ..tensorflow import balanced_batch_generator as tf_bbg -DONT_HAVE_RANDOM_STATE = ( - "NearMiss", - "EditedNearestNeighbours", - "RepeatedEditedNearestNeighbours", - "AllKNN", - "NeighbourhoodCleaningRule", - "TomekLinks", -) - class BalancedBatchGenerator(*ParentClass): """Create balanced batches when training a keras model. @@ -167,10 +157,6 @@ def _sample(self): self.sampler_ = RandomUnderSampler(random_state=random_state) else: self.sampler_ = clone(self.sampler) - # FIXME: Remove in 0.6 - if self.sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE: - set_random_state(self.sampler_, random_state) - self.sampler_.fit_resample(self.X, self.y) if not hasattr(self.sampler_, "sample_indices_"): raise ValueError( @@ -187,13 +173,13 @@ def __getitem__(self, index): X_resampled = safe_indexing( self.X, self.indices_[ - index * self.batch_size : (index + 1) * self.batch_size + index * self.batch_size:(index + 1) * self.batch_size ], ) y_resampled = safe_indexing( self.y, self.indices_[ - index * self.batch_size : (index + 1) * self.batch_size + index * self.batch_size:(index + 1) * self.batch_size ], ) if issparse(X_resampled) and not self.keep_sparse: @@ -202,7 +188,7 @@ def __getitem__(self, index): sample_weight_resampled = safe_indexing( self.sample_weight, self.indices_[ - index * self.batch_size : (index + 1) * self.batch_size + index * self.batch_size:(index + 1) * self.batch_size ], ) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index e5bb757c3..fac23d51b 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -12,7 +12,6 @@ from .base import BaseOverSampler from ..utils import check_target_type from ..utils import Substitution -from 
..utils.deprecation import deprecate_parameter from ..utils._docstring import _random_state_docstring diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 9e8c15022..adf94b161 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -970,7 +970,7 @@ def _fit_resample(self, X, y): X_resampled, y_resampled = super()._fit_resample(X_encoded, y) # reverse the encoding of the categorical features - X_res_cat = X_resampled[:, self.continuous_features_.size :] + X_res_cat = X_resampled[:, self.continuous_features_.size:] X_res_cat.data = np.ones_like(X_res_cat.data) X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat) diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 6d90fd62b..59de36057 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -8,7 +8,6 @@ from sklearn.utils.testing import assert_allclose, assert_array_equal from sklearn.neighbors import NearestNeighbors -from sklearn.svm import SVC from imblearn.over_sampling import SMOTE from imblearn.over_sampling import SVMSMOTE diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index 27c7f94a3..a201dfef5 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -13,9 +13,6 @@ # Guillaume Lemaitre # License: BSD -from collections import defaultdict -from itertools import islice - from sklearn import pipeline from sklearn.base import clone from sklearn.utils.metaestimators import if_delegate_has_method diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index 75521f1e6..d4f11d39f 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -5,21 +5,11 @@ from sklearn.base import clone from sklearn.utils import safe_indexing from sklearn.utils import check_random_state -from sklearn.utils._testing import set_random_state from ..under_sampling import RandomUnderSampler from 
..utils import Substitution from ..utils._docstring import _random_state_docstring -DONT_HAVE_RANDOM_STATE = ( - "NearMiss", - "EditedNearestNeighbours", - "RepeatedEditedNearestNeighbours", - "AllKNN", - "NeighbourhoodCleaningRule", - "TomekLinks", -) - @Substitution(random_state=_random_state_docstring) def balanced_batch_generator( @@ -134,9 +124,6 @@ def balanced_batch_generator( sampler_ = RandomUnderSampler(random_state=random_state) else: sampler_ = clone(sampler) - # FIXME: Remove in 0.6 - if sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE: - set_random_state(sampler_, random_state) sampler_.fit_resample(X, y) if not hasattr(sampler_, "sample_indices_"): raise ValueError( @@ -149,15 +136,15 @@ def balanced_batch_generator( def generator(X, y, sample_weight, indices, batch_size): while True: for index in range(0, len(indices), batch_size): - X_res = safe_indexing(X, indices[index : index + batch_size]) - y_res = safe_indexing(y, indices[index : index + batch_size]) + X_res = safe_indexing(X, indices[index:index + batch_size]) + y_res = safe_indexing(y, indices[index:index + batch_size]) if issparse(X_res) and not keep_sparse: X_res = X_res.toarray() if sample_weight is None: yield X_res, y_res else: sw_res = safe_indexing( - sample_weight, indices[index : index + batch_size] + sample_weight, indices[index:index + batch_size] ) yield X_res, y_res, sw_res diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 67363144c..7091e900d 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -5,8 +5,6 @@ import numpy as np from scipy import sparse -from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import assert_array_equal from sklearn.cluster import KMeans from imblearn.under_sampling 
import ClusterCentroids diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 06d79070c..3deaf7d97 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -17,7 +17,6 @@ from ..base import BaseCleaningSampler from ...utils import Substitution -from ...utils.deprecation import deprecate_parameter from ...utils._docstring import _random_state_docstring diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 7f0cd52b5..2fa8f81ca 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -17,7 +17,6 @@ from ..base import BaseUnderSampler from ...utils import Substitution -from ...utils.deprecation import deprecate_parameter from ...utils._docstring import _random_state_docstring @@ -85,7 +84,7 @@ class InstanceHardnessThreshold(BaseUnderSampler): Original dataset shape Counter({{1: 900, 0: 100}}) >>> iht = InstanceHardnessThreshold(random_state=42) >>> X_res, y_res = iht.fit_resample(X, y) - >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +ELLIPSIS + >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +ELLIPSIS # noqa Resampled dataset shape Counter({{1: 5..., 0: 100}}) """ diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 1d76fc0ee..65e57a6b0 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -132,7 +132,7 @@ def _selection_dist_based( """ # Compute the distance considering the farthest 
neighbour - dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors :], axis=1) + dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors:], axis=1) target_class_indices = np.flatnonzero(y == key) if ( diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 94a08f332..581c138cc 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -15,7 +15,6 @@ from ..base import BaseCleaningSampler from ._tomek_links import TomekLinks from ...utils import Substitution -from ...utils.deprecation import deprecate_parameter from ...utils._docstring import _random_state_docstring diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 5f3c8c7b1..c25b3cd43 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -14,7 +14,6 @@ from ..base import BaseUnderSampler from ...utils import check_target_type from ...utils import Substitution -from ...utils.deprecation import deprecate_parameter from ...utils._docstring import _random_state_docstring diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 04402398a..21a515949 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -11,8 +11,6 @@ from ..base import BaseCleaningSampler from ...utils import Substitution -from ...utils.deprecation import deprecate_parameter -from ...utils._docstring import _random_state_docstring @Substitution( diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py 
b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index e165871cf..c4bd4886e 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -11,7 +11,6 @@ from sklearn.datasets import make_classification from imblearn.under_sampling import AllKNN -from imblearn.utils.testing import warns X = np.array( [ diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index 1378c9f6d..86a4893fe 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -11,7 +11,6 @@ from sklearn.neighbors import NearestNeighbors from imblearn.under_sampling import EditedNearestNeighbours -from imblearn.utils.testing import warns X = np.array( [ diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 913d07f39..f720e7b38 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -6,7 +6,6 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_array_equal from sklearn.ensemble import GradientBoostingClassifier from imblearn.under_sampling import InstanceHardnessThreshold diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py index 69d4e6e57..64c7edf32 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py @@ -10,7 +10,6 @@ from 
sklearn.neighbors import NearestNeighbors from imblearn.under_sampling import NearMiss -from imblearn.utils.testing import warns X = np.array( [ diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py index 5338e9d1b..192f83323 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -7,10 +7,8 @@ import numpy as np from sklearn.utils.testing import assert_array_equal -from sklearn.neighbors import NearestNeighbors from imblearn.under_sampling import NeighbourhoodCleaningRule -from imblearn.utils.testing import warns X = np.array( [ diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index 78514a719..a23185bb3 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -5,7 +5,6 @@ from collections import Counter -import pytest import numpy as np from sklearn.utils.testing import assert_array_equal diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index b11163709..dfc006650 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -10,7 +10,6 @@ from sklearn.neighbors import NearestNeighbors from imblearn.under_sampling import RepeatedEditedNearestNeighbours -from imblearn.utils.testing import warns X = np.array( [ diff --git 
a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py index ad6b81ad6..4fc5388e3 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py @@ -3,12 +3,10 @@ # Christos Aridas # License: MIT -import pytest import numpy as np from sklearn.utils.testing import assert_array_equal from imblearn.under_sampling import TomekLinks -from imblearn.utils.testing import warns X = np.array( [ diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index edf7df70c..a3a78f7d1 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -13,7 +13,6 @@ from sklearn.neighbors.base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors from sklearn.utils.multiclass import type_of_target -from sklearn.utils.deprecation import deprecated from ..exceptions import raise_isinstance_error diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 51f814764..8ae4cf7b2 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -14,7 +14,6 @@ import numpy as np from scipy import sparse -from sklearn.base import clone from sklearn.datasets import ( make_classification, make_multilabel_classification, @@ -26,15 +25,12 @@ check_parameters_default_constructible, ) from sklearn.utils.testing import assert_allclose -from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import set_random_state from sklearn.utils.multiclass import type_of_target -from imblearn.base import BaseSampler from imblearn.over_sampling.base import BaseOverSampler from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler from imblearn.ensemble.base import BaseEnsembleSampler -from imblearn.over_sampling import SMOTE from imblearn.under_sampling import NearMiss, 
ClusterCentroids diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 8df1dd28b..2880f9396 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -83,13 +83,13 @@ def test_check_sampling_strategy_warning(): 0.5, binary_target, "clean-sampling", - "'clean-sampling' methods do let the user specify the sampling ratio", + "'clean-sampling' methods do let the user specify the sampling ratio", # noqa ), ( 0.1, np.array([0] * 10 + [1] * 20), "over-sampling", - "remove samples from the minority class while trying to generate new", + "remove samples from the minority class while trying to generate new", # noqa ), ( 0.1, From 171d361fe75b1043d1fd5bf0d1097f39c64a6ec1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 17:37:14 +0100 Subject: [PATCH 24/37] iter --- .../_instance_hardness_threshold.py | 2 +- imblearn/utils/_validation.py | 12 +++--------- imblearn/utils/tests/test_validation.py | 4 ++-- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 2fa8f81ca..1dc9ea479 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -84,7 +84,7 @@ class InstanceHardnessThreshold(BaseUnderSampler): Original dataset shape Counter({{1: 900, 0: 100}}) >>> iht = InstanceHardnessThreshold(random_state=42) >>> X_res, y_res = iht.fit_resample(X, y) - >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +ELLIPSIS # noqa + >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +ELLIPSIS Resampled dataset shape Counter({{1: 5..., 0: 100}}) """ diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index a3a78f7d1..30a76b731 100644 --- 
a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -303,17 +303,11 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type): sampling_strategy_[class_sample] = n_samples elif sampling_type == "clean-sampling": # FIXME: Turn into an error in 0.6 - warnings.warn( + raise ValueError( "'sampling_strategy' as a dict for cleaning methods is " - "deprecated and will raise an error in version 0.6. " - "Please give a list of the classes to be targeted by the" - " sampling.", - DeprecationWarning, + "not supported. Please give a list of the classes to be " + "targeted by the sampling." ) - # clean-sampling can be more permissive since those samplers do not - # use samples - for class_sample, n_samples in sampling_strategy.items(): - sampling_strategy_[class_sample] = n_samples else: raise NotImplementedError diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 2880f9396..5d34b663d 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -69,8 +69,8 @@ def test_check_target_type_ova(target, output_target, is_ova): def test_check_sampling_strategy_warning(): - msg = "dict for cleaning methods is deprecated" - with pytest.warns(DeprecationWarning, match=msg): + msg = "dict for cleaning methods is not supported" + with pytest.raises(ValueError, match=msg): check_sampling_strategy( {1: 0, 2: 0, 3: 0}, multiclass_target, "clean-sampling" ) From dfb54bf6c53a34534588363412861b3bf28f5bcb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 18:03:26 +0100 Subject: [PATCH 25/37] remove warning --- imblearn/combine/tests/test_smote_enn.py | 3 +- imblearn/combine/tests/test_smote_tomek.py | 3 +- imblearn/datasets/_zenodo.py | 2 +- imblearn/ensemble/_easy_ensemble.py | 2 +- imblearn/ensemble/_forest.py | 6 +-- imblearn/ensemble/_weight_boosting.py | 6 +-- imblearn/ensemble/tests/test_bagging.py | 8 ++-- 
imblearn/ensemble/tests/test_easy_ensemble.py | 4 +- imblearn/ensemble/tests/test_forest.py | 4 +- .../ensemble/tests/test_weight_boosting.py | 2 +- imblearn/keras/_generator.py | 8 ++-- imblearn/metrics/_classification.py | 9 ++-- imblearn/metrics/tests/test_classification.py | 3 +- imblearn/metrics/tests/test_score_objects.py | 10 ++-- imblearn/over_sampling/_adasyn.py | 5 +- .../over_sampling/_random_over_sampler.py | 8 ++-- imblearn/over_sampling/_smote.py | 46 +++++++++---------- imblearn/over_sampling/tests/test_adasyn.py | 3 +- .../tests/test_borderline_smote.py | 4 +- .../over_sampling/tests/test_kmeans_smote.py | 4 +- .../tests/test_random_over_sampler.py | 4 +- imblearn/over_sampling/tests/test_smote.py | 3 +- imblearn/over_sampling/tests/test_smote_nc.py | 2 +- .../over_sampling/tests/test_svm_smote.py | 4 +- imblearn/tensorflow/_generator.py | 8 ++-- imblearn/tests/test_base.py | 2 +- imblearn/tests/test_common.py | 3 +- imblearn/tests/test_pipeline.py | 6 +-- .../_cluster_centroids.py | 8 ++-- .../_condensed_nearest_neighbour.py | 16 +++---- .../_edited_nearest_neighbours.py | 8 ++-- .../_instance_hardness_threshold.py | 12 ++--- .../_prototype_selection/_nearmiss.py | 18 ++++---- .../_neighbourhood_cleaning_rule.py | 10 ++-- .../_one_sided_selection.py | 16 +++---- .../_random_under_sampler.py | 4 +- .../_prototype_selection/_tomek_links.py | 6 +-- .../_prototype_selection/tests/test_allknn.py | 2 +- .../tests/test_condensed_nearest_neighbour.py | 2 +- .../tests/test_edited_nearest_neighbours.py | 2 +- .../tests/test_nearmiss.py | 2 +- .../tests/test_neighbourhood_cleaning_rule.py | 2 +- .../tests/test_one_sided_selection.py | 2 +- .../tests/test_random_under_sampler.py | 2 +- ...test_repeated_edited_nearest_neighbours.py | 2 +- .../tests/test_tomek_links.py | 2 +- imblearn/utils/_validation.py | 1 - imblearn/utils/estimator_checks.py | 2 +- imblearn/utils/tests/test_validation.py | 2 +- 49 files changed, 148 insertions(+), 145 deletions(-) diff --git 
a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py index 8d83a7459..f6aaa52fa 100644 --- a/imblearn/combine/tests/test_smote_enn.py +++ b/imblearn/combine/tests/test_smote_enn.py @@ -6,7 +6,8 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_allclose, assert_array_equal +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from imblearn.combine import SMOTEENN from imblearn.under_sampling import EditedNearestNeighbours diff --git a/imblearn/combine/tests/test_smote_tomek.py b/imblearn/combine/tests/test_smote_tomek.py index 4d3123855..da74b4ee6 100644 --- a/imblearn/combine/tests/test_smote_tomek.py +++ b/imblearn/combine/tests/test_smote_tomek.py @@ -6,7 +6,8 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_allclose, assert_array_equal +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from imblearn.combine import SMOTETomek from imblearn.over_sampling import SMOTE diff --git a/imblearn/datasets/_zenodo.py b/imblearn/datasets/_zenodo.py index bc5bcedb1..3a180ad78 100644 --- a/imblearn/datasets/_zenodo.py +++ b/imblearn/datasets/_zenodo.py @@ -54,7 +54,7 @@ import numpy as np from sklearn.datasets import get_data_home -from sklearn.datasets.base import Bunch +from sklearn.utils import Bunch from sklearn.utils import check_random_state URL = ( diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index eb29d778d..808d36b97 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -10,7 +10,7 @@ from sklearn.base import clone from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble.bagging import BaggingClassifier +from sklearn.ensemble import BaggingClassifier from ..under_sampling import RandomUnderSampler from ..under_sampling.base import BaseUnderSampler diff --git 
a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 12f57e09a..a3066fef5 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -16,13 +16,13 @@ from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier -from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble._base import _set_random_states from sklearn.ensemble._forest import _parallel_build_trees from sklearn.exceptions import DataConversionWarning +from sklearn.tree import DecisionTreeClassifier from sklearn.utils import check_array from sklearn.utils import check_random_state -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from ..pipeline import make_pipeline from ..under_sampling import RandomUnderSampler @@ -48,7 +48,7 @@ def _local_parallel_build_trees( # resample before to fit the tree X_resampled, y_resampled = sampler.fit_resample(X, y) if sample_weight is not None: - sample_weight = safe_indexing(sample_weight, sampler.sample_indices_) + sample_weight = _safe_indexing(sample_weight, sampler.sample_indices_) tree = _parallel_build_trees( tree, forest, diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index a757e6304..a17cc789d 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -5,7 +5,7 @@ from sklearn.base import clone from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble._base import _set_random_states -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from ..under_sampling.base import BaseUnderSampler from ..under_sampling import RandomUnderSampler @@ -201,7 +201,7 @@ def _boost_real(self, iboost, X, y, sample_weight, random_state): ) X_res, y_res = sampler.fit_resample(X, y) - sample_weight_res = safe_indexing( + sample_weight_res = _safe_indexing( sample_weight, sampler.sample_indices_ ) estimator.fit(X_res, y_res, sample_weight=sample_weight_res) @@ 
-271,7 +271,7 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state): ) X_res, y_res = sampler.fit_resample(X, y) - sample_weight_res = safe_indexing( + sample_weight_res = _safe_indexing( sample_weight, sampler.sample_indices_ ) estimator.fit(X_res, y_res, sample_weight=sample_weight_res) diff --git a/imblearn/ensemble/tests/test_bagging.py b/imblearn/ensemble/tests/test_bagging.py index 734070fc6..738266147 100644 --- a/imblearn/ensemble/tests/test_bagging.py +++ b/imblearn/ensemble/tests/test_bagging.py @@ -18,11 +18,9 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.feature_selection import SelectKBest -from sklearn.utils.testing import ( - assert_array_equal, - assert_array_almost_equal, - assert_allclose, -) +from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_allclose from imblearn.datasets import make_imbalance from imblearn.ensemble import BalancedBaggingClassifier diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index db4762e88..68c757a4c 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -8,11 +8,11 @@ from sklearn.datasets import load_iris, make_hastie_10_2 from sklearn.ensemble import AdaBoostClassifier -from sklearn.utils.testing import assert_array_equal from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from sklearn.feature_selection import SelectKBest -from sklearn.utils.testing import assert_allclose +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from imblearn.ensemble import EasyEnsembleClassifier from imblearn.datasets import make_imbalance diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py index 
a7aa4860f..ffe71b0c2 100644 --- a/imblearn/ensemble/tests/test_forest.py +++ b/imblearn/ensemble/tests/test_forest.py @@ -3,9 +3,9 @@ import numpy as np from sklearn.datasets import make_classification -from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import assert_array_equal from sklearn.model_selection import GridSearchCV +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from imblearn.ensemble import BalancedRandomForestClassifier diff --git a/imblearn/ensemble/tests/test_weight_boosting.py b/imblearn/ensemble/tests/test_weight_boosting.py index eef975b28..26facce90 100644 --- a/imblearn/ensemble/tests/test_weight_boosting.py +++ b/imblearn/ensemble/tests/test_weight_boosting.py @@ -4,7 +4,7 @@ from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from imblearn.ensemble import RUSBoostClassifier diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index 88ea040e3..39f036713 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -44,7 +44,7 @@ def import_from_tensforflow(): from scipy.sparse import issparse from sklearn.base import clone -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from sklearn.utils import check_random_state from ..under_sampling import RandomUnderSampler @@ -170,13 +170,13 @@ def __len__(self): return int(self.indices_.size // self.batch_size) def __getitem__(self, index): - X_resampled = safe_indexing( + X_resampled = _safe_indexing( self.X, self.indices_[ index * self.batch_size:(index + 1) * self.batch_size ], ) - y_resampled = safe_indexing( + y_resampled = _safe_indexing( self.y, self.indices_[ index * self.batch_size:(index + 1) * self.batch_size @@ -185,7 +185,7 @@ def __getitem__(self, index): if 
issparse(X_resampled) and not self.keep_sparse: X_resampled = X_resampled.toarray() if self.sample_weight is not None: - sample_weight_resampled = safe_indexing( + sample_weight_resampled = _safe_indexing( self.sample_weight, self.indices_[ index * self.batch_size:(index + 1) * self.batch_size diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index 59e0f0522..b16f45494 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -20,11 +20,10 @@ import numpy as np import scipy as sp -from sklearn.metrics._classification import ( - _check_targets, - _prf_divide, - precision_recall_fscore_support, -) +from sklearn.metrics._classification import _check_targets +from sklearn.metrics._classification import _prf_divide +from sklearn.metrics._classification import precision_recall_fscore_support + from sklearn.preprocessing import LabelEncoder from sklearn.utils.multiclass import unique_labels diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index ea3278451..e7972f03d 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -16,7 +16,8 @@ from sklearn.preprocessing import label_binarize from sklearn.utils.fixes import np_version from sklearn.utils.validation import check_random_state -from sklearn.utils.testing import assert_allclose, assert_array_equal +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_no_warnings from sklearn.metrics import accuracy_score, average_precision_score from sklearn.metrics import brier_score_loss, cohen_kappa_score diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index 4b7636e03..7596b6dbe 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -11,12 
+11,10 @@ from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV -from imblearn.metrics import ( - sensitivity_score, - specificity_score, - geometric_mean_score, - make_index_balanced_accuracy, -) +from imblearn.metrics import sensitivity_score +from imblearn.metrics import specificity_score +from imblearn.metrics import geometric_mean_score +from imblearn.metrics import make_index_balanced_accuracy R_TOL = 1e-2 diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index 09e2c9c15..507ddfd25 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -7,7 +7,8 @@ import numpy as np from scipy import sparse -from sklearn.utils import check_random_state, safe_indexing +from sklearn.utils import check_random_state +from sklearn.utils import _safe_indexing from .base import BaseOverSampler from ..utils import check_neighbors_object @@ -107,7 +108,7 @@ def _fit_resample(self, X, y): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) - X_class = safe_indexing(X, target_class_indices) + X_class = _safe_indexing(X, target_class_indices) self.nn_.fit(X) _, nn_index = self.nn_.kneighbors(X_class) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index fac23d51b..39de3f84a 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -7,7 +7,9 @@ from collections import Counter import numpy as np -from sklearn.utils import check_X_y, check_random_state, safe_indexing +from sklearn.utils import check_X_y +from sklearn.utils import check_random_state +from sklearn.utils import _safe_indexing from .base import BaseOverSampler from ..utils import check_target_type @@ -93,8 +95,8 @@ def _fit_resample(self, X, y): self.sample_indices_ = np.array(sample_indices) return ( - safe_indexing(X, sample_indices), - safe_indexing(y, 
sample_indices), + _safe_indexing(X, sample_indices), + _safe_indexing(y, sample_indices), ) def _more_tags(self): diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index adf94b161..4798efadf 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -18,7 +18,7 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.svm import SVC from sklearn.utils import check_random_state -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from sklearn.utils import check_array from sklearn.utils import check_X_y from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0 @@ -352,7 +352,7 @@ def _fit_resample(self, X, y): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) - X_class = safe_indexing(X, target_class_indices) + X_class = _safe_indexing(X, target_class_indices) self.nn_m_.fit(X) danger_index = self._in_danger_noise( @@ -363,14 +363,14 @@ def _fit_resample(self, X, y): self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors( - safe_indexing(X_class, danger_index), return_distance=False + _safe_indexing(X_class, danger_index), return_distance=False )[:, 1:] # divergence between borderline-1 and borderline-2 if self.kind == "borderline-1": # Create synthetic samples for borderline points. X_new, y_new = self._make_samples( - safe_indexing(X_class, danger_index), + _safe_indexing(X_class, danger_index), y.dtype, class_sample, X_class, @@ -389,7 +389,7 @@ def _fit_resample(self, X, y): # only minority X_new_1, y_new_1 = self._make_samples( - safe_indexing(X_class, danger_index), + _safe_indexing(X_class, danger_index), y.dtype, class_sample, X_class, @@ -402,10 +402,10 @@ def _fit_resample(self, X, y): # new samples will be created considering not only the majority # class but all over classes. 
X_new_2, y_new_2 = self._make_samples( - safe_indexing(X_class, danger_index), + _safe_indexing(X_class, danger_index), y.dtype, class_sample, - safe_indexing(X, np.flatnonzero(y != class_sample)), + _safe_indexing(X, np.flatnonzero(y != class_sample)), nns, int((1 - fractions) * n_samples), step_size=0.5, @@ -553,19 +553,19 @@ def _fit_resample(self, X, y): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) - X_class = safe_indexing(X, target_class_indices) + X_class = _safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) support_index = self.svm_estimator_.support_[ y[self.svm_estimator_.support_] == class_sample ] - support_vector = safe_indexing(X, support_index) + support_vector = _safe_indexing(X, support_index) self.nn_m_.fit(X) noise_bool = self._in_danger_noise( self.nn_m_, support_vector, class_sample, y, kind="noise" ) - support_vector = safe_indexing( + support_vector = _safe_indexing( support_vector, np.flatnonzero(np.logical_not(noise_bool)) ) danger_bool = self._in_danger_noise( @@ -578,12 +578,12 @@ def _fit_resample(self, X, y): n_generated_samples = int(fractions * (n_samples + 1)) if np.count_nonzero(danger_bool) > 0: nns = self.nn_k_.kneighbors( - safe_indexing(support_vector, np.flatnonzero(danger_bool)), + _safe_indexing(support_vector, np.flatnonzero(danger_bool)), return_distance=False, )[:, 1:] X_new_1, y_new_1 = self._make_samples( - safe_indexing(support_vector, np.flatnonzero(danger_bool)), + _safe_indexing(support_vector, np.flatnonzero(danger_bool)), y.dtype, class_sample, X_class, @@ -594,12 +594,12 @@ def _fit_resample(self, X, y): if np.count_nonzero(safety_bool) > 0: nns = self.nn_k_.kneighbors( - safe_indexing(support_vector, np.flatnonzero(safety_bool)), + _safe_indexing(support_vector, np.flatnonzero(safety_bool)), return_distance=False, )[:, 1:] X_new_2, y_new_2 = self._make_samples( - safe_indexing(support_vector, np.flatnonzero(safety_bool)), + _safe_indexing(support_vector, 
np.flatnonzero(safety_bool)), y.dtype, class_sample, X_class, @@ -730,7 +730,7 @@ def _fit_resample(self, X, y): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) - X_class = safe_indexing(X, target_class_indices) + X_class = _safe_indexing(X, target_class_indices) self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:] @@ -930,7 +930,7 @@ def _fit_resample(self, X, y): X_continuous = X[:, self.continuous_features_] X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"]) - X_minority = safe_indexing( + X_minority = _safe_indexing( X_continuous, np.flatnonzero(y == class_minority) ) @@ -1207,7 +1207,7 @@ def _fit_resample(self, X, y): continue # target_class_indices = np.flatnonzero(y == class_sample) - # X_class = safe_indexing(X, target_class_indices) + # X_class = _safe_indexing(X, target_class_indices) X_clusters = self.kmeans_estimator_.fit_predict(X) valid_clusters = [] @@ -1217,8 +1217,8 @@ def _fit_resample(self, X, y): for cluster_idx in range(self.kmeans_estimator_.n_clusters): cluster_mask = np.flatnonzero(X_clusters == cluster_idx) - X_cluster = safe_indexing(X, cluster_mask) - y_cluster = safe_indexing(y, cluster_mask) + X_cluster = _safe_indexing(X, cluster_mask) + y_cluster = _safe_indexing(y, cluster_mask) cluster_class_mean = (y_cluster == class_sample).mean() @@ -1236,7 +1236,7 @@ def _fit_resample(self, X, y): if anticipated_samples < self.nn_k_.n_neighbors: continue - X_cluster_class = safe_indexing( + X_cluster_class = _safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample) ) @@ -1257,10 +1257,10 @@ def _fit_resample(self, X, y): ) for valid_cluster_idx, valid_cluster in enumerate(valid_clusters): - X_cluster = safe_indexing(X, valid_cluster) - y_cluster = safe_indexing(y, valid_cluster) + X_cluster = _safe_indexing(X, valid_cluster) + y_cluster = _safe_indexing(y, valid_cluster) - X_cluster_class = safe_indexing( + X_cluster_class = _safe_indexing( 
X_cluster, np.flatnonzero(y_cluster == class_sample) ) diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index fc72e2a47..f7fcb07c7 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -6,7 +6,8 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_allclose, assert_array_equal +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import NearestNeighbors from imblearn.over_sampling import ADASYN diff --git a/imblearn/over_sampling/tests/test_borderline_smote.py b/imblearn/over_sampling/tests/test_borderline_smote.py index 06421f169..eee2981d7 100644 --- a/imblearn/over_sampling/tests/test_borderline_smote.py +++ b/imblearn/over_sampling/tests/test_borderline_smote.py @@ -2,8 +2,8 @@ import numpy as np from sklearn.neighbors import NearestNeighbors -from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from imblearn.over_sampling import BorderlineSMOTE diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 86899dc38..8b4f06985 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -1,8 +1,8 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from sklearn.cluster import KMeans from sklearn.cluster import MiniBatchKMeans diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index d49a807a8..bae6be0d6 100644 --- 
a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -6,8 +6,8 @@ from collections import Counter import numpy as np -from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from imblearn.over_sampling import RandomOverSampler diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 59de36057..60459256c 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -6,7 +6,8 @@ import numpy as np import pytest -from sklearn.utils.testing import assert_allclose, assert_array_equal +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import NearestNeighbors from imblearn.over_sampling import SMOTE diff --git a/imblearn/over_sampling/tests/test_smote_nc.py b/imblearn/over_sampling/tests/test_smote_nc.py index 1046a717b..a495c775d 100644 --- a/imblearn/over_sampling/tests/test_smote_nc.py +++ b/imblearn/over_sampling/tests/test_smote_nc.py @@ -12,7 +12,7 @@ from scipy import sparse from sklearn.datasets import make_classification -from sklearn.utils.testing import assert_allclose +from sklearn.utils._testing import assert_allclose from imblearn.over_sampling import SMOTENC diff --git a/imblearn/over_sampling/tests/test_svm_smote.py b/imblearn/over_sampling/tests/test_svm_smote.py index eccffc8a8..578ceccde 100644 --- a/imblearn/over_sampling/tests/test_svm_smote.py +++ b/imblearn/over_sampling/tests/test_svm_smote.py @@ -4,8 +4,8 @@ from sklearn.neighbors import NearestNeighbors from sklearn.svm import SVC -from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_allclose +from 
sklearn.utils._testing import assert_array_equal from imblearn.over_sampling import SVMSMOTE diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index d4f11d39f..725014875 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -3,7 +3,7 @@ from scipy.sparse import issparse from sklearn.base import clone -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from sklearn.utils import check_random_state from ..under_sampling import RandomUnderSampler @@ -136,14 +136,14 @@ def balanced_batch_generator( def generator(X, y, sample_weight, indices, batch_size): while True: for index in range(0, len(indices), batch_size): - X_res = safe_indexing(X, indices[index:index + batch_size]) - y_res = safe_indexing(y, indices[index:index + batch_size]) + X_res = _safe_indexing(X, indices[index:index + batch_size]) + y_res = _safe_indexing(y, indices[index:index + batch_size]) if issparse(X_res) and not keep_sparse: X_res = X_res.toarray() if sample_weight is None: yield X_res, y_res else: - sw_res = safe_indexing( + sw_res = _safe_indexing( sample_weight, indices[index:index + batch_size] ) yield X_res, y_res, sw_res diff --git a/imblearn/tests/test_base.py b/imblearn/tests/test_base.py index 721ebbbf5..255bf26f9 100644 --- a/imblearn/tests/test_base.py +++ b/imblearn/tests/test_base.py @@ -8,7 +8,7 @@ from scipy import sparse from sklearn.datasets import load_iris -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose_dense_sparse from imblearn.datasets import make_imbalance diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 7f74d0233..5e1b3bf67 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -5,7 +5,8 @@ import pytest -from imblearn.utils.estimator_checks import check_estimator, _yield_all_checks +from 
imblearn.utils.estimator_checks import check_estimator +from imblearn.utils.estimator_checks import _yield_all_checks from imblearn.utils.testing import all_estimators diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index bdb1b92c9..eb80c692d 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -15,9 +15,9 @@ from joblib import Memory -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_allclose +from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_allclose from sklearn.base import clone, BaseEstimator from sklearn.svm import SVC diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 3de4060eb..98054e4fa 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -12,7 +12,7 @@ from sklearn.base import clone from sklearn.cluster import KMeans from sklearn.neighbors import NearestNeighbors -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from ..base import BaseUnderSampler from ...utils import Substitution @@ -121,7 +121,7 @@ def _generate_sample(self, X, y, centroids, target_class): indices = nearest_neighbors.kneighbors( centroids, return_distance=False ) - X_new = safe_indexing(X, np.squeeze(indices)) + X_new = _safe_indexing(X, np.squeeze(indices)) else: if sparse.issparse(X): X_new = sparse.csr_matrix(centroids, dtype=X.dtype) @@ -161,8 +161,8 @@ def _fit_resample(self, X, y): y_resampled.append(y_new) else: target_class_indices = np.flatnonzero(y == target_class) - X_resampled.append(safe_indexing(X, target_class_indices)) - y_resampled.append(safe_indexing(y, 
target_class_indices)) + X_resampled.append(_safe_indexing(X, target_class_indices)) + y_resampled.append(_safe_indexing(y, target_class_indices)) if sparse.issparse(X): X_resampled = sparse.vstack(X_resampled) diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 3deaf7d97..9b0bea3c8 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -13,7 +13,7 @@ from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier -from sklearn.utils import check_random_state, safe_indexing +from sklearn.utils import check_random_state, _safe_indexing from ..base import BaseCleaningSampler from ...utils import Substitution @@ -149,13 +149,13 @@ def _fit_resample(self, X, y): C_indices = np.append( np.flatnonzero(y == class_minority), idx_maj_sample ) - C_x = safe_indexing(X, C_indices) - C_y = safe_indexing(y, C_indices) + C_x = _safe_indexing(X, C_indices) + C_y = _safe_indexing(y, C_indices) # Create the set S - all majority samples S_indices = np.flatnonzero(y == target_class) - S_x = safe_indexing(X, S_indices) - S_y = safe_indexing(y, S_indices) + S_x = _safe_indexing(X, S_indices) + S_y = _safe_indexing(y, S_indices) # fit knn on C self.estimator_.fit(C_x, C_y) @@ -183,8 +183,8 @@ def _fit_resample(self, X, y): # Update C C_indices = np.append(C_indices, idx_maj[idx_sam]) - C_x = safe_indexing(X, C_indices) - C_y = safe_indexing(y, C_indices) + C_x = _safe_indexing(X, C_indices) + C_y = _safe_indexing(y, C_indices) # fit a knn on C self.estimator_.fit(C_x, C_y) @@ -207,7 +207,7 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - return safe_indexing(X, idx_under), safe_indexing(y, idx_under) + return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return 
{"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index f70042b38..94aef0268 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -11,7 +11,7 @@ import numpy as np from scipy.stats import mode -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from ..base import BaseCleaningSampler from ...utils import check_neighbors_object @@ -121,8 +121,8 @@ def _fit_resample(self, X, y): for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): target_class_indices = np.flatnonzero(y == target_class) - X_class = safe_indexing(X, target_class_indices) - y_class = safe_indexing(y, target_class_indices) + X_class = _safe_indexing(X, target_class_indices) + y_class = _safe_indexing(y, target_class_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False )[:, 1:] @@ -147,7 +147,7 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - return safe_indexing(X, idx_under), safe_indexing(y, idx_under) + return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 1dc9ea479..e595b4c81 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -13,7 +13,7 @@ from sklearn.base import ClassifierMixin, clone from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import StratifiedKFold -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from ..base 
import BaseUnderSampler from ...utils import Substitution @@ -135,10 +135,10 @@ def _fit_resample(self, X, y): probabilities = np.zeros(y.shape[0], dtype=float) for train_index, test_index in skf: - X_train = safe_indexing(X, train_index) - X_test = safe_indexing(X, test_index) - y_train = safe_indexing(y, train_index) - y_test = safe_indexing(y, test_index) + X_train = _safe_indexing(X, train_index) + X_test = _safe_indexing(X, test_index) + y_train = _safe_indexing(y, train_index) + y_test = _safe_indexing(y, test_index) self.estimator_.fit(X_train, y_train) @@ -170,7 +170,7 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - return safe_indexing(X, idx_under), safe_indexing(y, idx_under) + return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 65e57a6b0..5f76a34cf 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -9,7 +9,7 @@ import numpy as np -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from ..base import BaseUnderSampler from ...utils import check_neighbors_object @@ -137,7 +137,7 @@ def _selection_dist_based( target_class_indices = np.flatnonzero(y == key) if ( dist_vec.shape[0] - != safe_indexing(X, target_class_indices).shape[0] + != _safe_indexing(X, target_class_indices).shape[0] ): raise RuntimeError( "The samples to be selected do not correspond" @@ -200,14 +200,14 @@ def _fit_resample(self, X, y): class_minority = min(target_stats, key=target_stats.get) minority_class_indices = np.flatnonzero(y == class_minority) - self.nn_.fit(safe_indexing(X, minority_class_indices)) + self.nn_.fit(_safe_indexing(X, minority_class_indices)) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): 
n_samples = self.sampling_strategy_[target_class] target_class_indices = np.flatnonzero(y == target_class) - X_class = safe_indexing(X, target_class_indices) - y_class = safe_indexing(y, target_class_indices) + X_class = _safe_indexing(X, target_class_indices) + y_class = _safe_indexing(y, target_class_indices) if self.version == 1: dist_vec, idx_vec = self.nn_.kneighbors( @@ -236,11 +236,11 @@ def _fit_resample(self, X, y): elif self.version == 3: self.nn_ver3_.fit(X_class) dist_vec, idx_vec = self.nn_ver3_.kneighbors( - safe_indexing(X, minority_class_indices) + _safe_indexing(X, minority_class_indices) ) idx_vec_farthest = np.unique(idx_vec.reshape(-1)) - X_class_selected = safe_indexing(X_class, idx_vec_farthest) - y_class_selected = safe_indexing(y_class, idx_vec_farthest) + X_class_selected = _safe_indexing(X_class, idx_vec_farthest) + y_class_selected = _safe_indexing(y_class, idx_vec_farthest) dist_vec, idx_vec = self.nn_.kneighbors( X_class_selected, n_neighbors=self.nn_.n_neighbors @@ -269,7 +269,7 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - return safe_indexing(X, idx_under), safe_indexing(y, idx_under) + return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 4c43faf0a..0cd6daea4 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -9,7 +9,7 @@ import numpy as np from scipy.stats import mode -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from ..base import BaseCleaningSampler from ._edited_nearest_neighbours import EditedNearestNeighbours @@ -146,8 +146,8 @@ def _fit_resample(self, X, y): ] self.nn_.fit(X) class_minority_indices = 
np.flatnonzero(y == class_minority) - X_class = safe_indexing(X, class_minority_indices) - y_class = safe_indexing(y, class_minority_indices) + X_class = _safe_indexing(X, class_minority_indices) + y_class = _safe_indexing(y, class_minority_indices) nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] if self.kind_sel == "mode": @@ -170,8 +170,8 @@ def _fit_resample(self, X, y): self.sample_indices_ = np.flatnonzero(selected_samples) return ( - safe_indexing(X, self.sample_indices_), - safe_indexing(y, self.sample_indices_), + _safe_indexing(X, self.sample_indices_), + _safe_indexing(y, self.sample_indices_), ) def _more_tags(self): diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 581c138cc..aa37c0337 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -10,7 +10,7 @@ from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier -from sklearn.utils import check_random_state, safe_indexing +from sklearn.utils import check_random_state, _safe_indexing from ..base import BaseCleaningSampler from ._tomek_links import TomekLinks @@ -141,14 +141,14 @@ def _fit_resample(self, X, y): # create the set composed of all minority samples and one # sample from the current class. 
- C_x = safe_indexing(X, C_indices) - C_y = safe_indexing(y, C_indices) + C_x = _safe_indexing(X, C_indices) + C_y = _safe_indexing(y, C_indices) # create the set S with removing the seed from S # since that it will be added anyway idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0) - S_x = safe_indexing(X, idx_maj_extracted) - S_y = safe_indexing(y, idx_maj_extracted) + S_x = _safe_indexing(X, idx_maj_extracted) + S_y = _safe_indexing(y, idx_maj_extracted) self.estimator_.fit(C_x, C_y) pred_S_y = self.estimator_.predict(S_x) @@ -162,14 +162,14 @@ def _fit_resample(self, X, y): (idx_under, np.flatnonzero(y == target_class)), axis=0 ) - X_resampled = safe_indexing(X, idx_under) - y_resampled = safe_indexing(y, idx_under) + X_resampled = _safe_indexing(X, idx_under) + y_resampled = _safe_indexing(y, idx_under) # apply Tomek cleaning tl = TomekLinks(sampling_strategy=list(self.sampling_strategy_.keys())) X_cleaned, y_cleaned = tl.fit_resample(X_resampled, y_resampled) - self.sample_indices_ = safe_indexing(idx_under, tl.sample_indices_) + self.sample_indices_ = _safe_indexing(idx_under, tl.sample_indices_) return X_cleaned, y_cleaned diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index c25b3cd43..d05359380 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -9,7 +9,7 @@ from sklearn.utils import check_array from sklearn.utils import check_consistent_length from sklearn.utils import check_random_state -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from ..base import BaseUnderSampler from ...utils import check_target_type @@ -113,7 +113,7 @@ def _fit_resample(self, X, y): self.sample_indices_ = idx_under - return safe_indexing(X, idx_under), safe_indexing(y, idx_under) + return _safe_indexing(X, idx_under), 
_safe_indexing(y, idx_under) def _more_tags(self): return {"X_types": ["2darray", "string"], "sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 21a515949..603930050 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -7,7 +7,7 @@ import numpy as np from sklearn.neighbors import NearestNeighbors -from sklearn.utils import safe_indexing +from sklearn.utils import _safe_indexing from ..base import BaseCleaningSampler from ...utils import Substitution @@ -122,8 +122,8 @@ def _fit_resample(self, X, y): self.sample_indices_ = np.flatnonzero(np.logical_not(links)) return ( - safe_indexing(X, self.sample_indices_), - safe_indexing(y, self.sample_indices_), + _safe_indexing(X, self.sample_indices_), + _safe_indexing(y, self.sample_indices_), ) def _more_tags(self): diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py index c4bd4886e..e4e91b91a 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py @@ -6,7 +6,7 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_allclose, assert_array_equal +from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.neighbors import NearestNeighbors from sklearn.datasets import make_classification diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py index 7f0f67c43..c47e5acb9 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py @@ 
-6,7 +6,7 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import KNeighborsClassifier from imblearn.under_sampling import CondensedNearestNeighbour diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index 86a4893fe..57bac0bb1 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -6,7 +6,7 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import NearestNeighbors diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py index 64c7edf32..3e2e8686c 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py @@ -6,7 +6,7 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import NearestNeighbors from imblearn.under_sampling import NearMiss diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py index 192f83323..fc84cb017 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -6,7 +6,7 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing 
import assert_array_equal from imblearn.under_sampling import NeighbourhoodCleaningRule diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py index 612d23cc7..4b27bea19 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py @@ -6,7 +6,7 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import KNeighborsClassifier from imblearn.under_sampling import OneSidedSelection diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index a23185bb3..208aff01f 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -6,7 +6,7 @@ from collections import Counter import numpy as np -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import RandomUnderSampler diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index dfc006650..d8af21350 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -6,7 +6,7 @@ import pytest import numpy as np -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import NearestNeighbors from 
imblearn.under_sampling import RepeatedEditedNearestNeighbours diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py index 4fc5388e3..d678cd19b 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py @@ -4,7 +4,7 @@ # License: MIT import numpy as np -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import TomekLinks diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 30a76b731..82c9a0b93 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -302,7 +302,6 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type): ) sampling_strategy_[class_sample] = n_samples elif sampling_type == "clean-sampling": - # FIXME: Turn into an error in 0.6 raise ValueError( "'sampling_strategy' as a dict for cleaning methods is " "not supported. 
Please give a list of the classes to be " diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 8ae4cf7b2..ce885cf56 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -24,7 +24,7 @@ check_estimator as sklearn_check_estimator, check_parameters_default_constructible, ) -from sklearn.utils.testing import assert_allclose +from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import set_random_state from sklearn.utils.multiclass import type_of_target diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 5d34b663d..b3329bf98 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -11,7 +11,7 @@ from sklearn.neighbors.base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors -from sklearn.utils.testing import assert_array_equal +from sklearn.utils._testing import assert_array_equal from imblearn.utils.testing import warns from imblearn.utils import check_neighbors_object From 46ba016d8518e8e29f6c379ac92c0953eef4e9f0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 18:05:20 +0100 Subject: [PATCH 26/37] allow failure ubuntu --- .travis.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index e062ec44e..89657fff8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,9 +42,7 @@ matrix: NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" OPTIONAL_DEPS="false" allow_failures: - - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" - OPTIONAL_DEPS="false" + - env: DISTRIB="ubuntu" install: source build_tools/travis/install.sh script: bash build_tools/travis/test_script.sh From 7fea797e7f5ea09f54efd5649845281c5ea856dc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 18:11:02 +0100 Subject: [PATCH 27/37] last warning 
--- imblearn/datasets/tests/test_zenodo.py | 2 +- imblearn/utils/_validation.py | 2 +- imblearn/utils/tests/test_validation.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/imblearn/datasets/tests/test_zenodo.py b/imblearn/datasets/tests/test_zenodo.py index 4ac184902..efd4943c2 100644 --- a/imblearn/datasets/tests/test_zenodo.py +++ b/imblearn/datasets/tests/test_zenodo.py @@ -9,7 +9,7 @@ import pytest from imblearn.datasets import fetch_datasets -from sklearn.utils.testing import SkipTest +from sklearn.utils._testing import SkipTest DATASET_SHAPE = { "ecoli": (336, 7), diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 82c9a0b93..8cb505f50 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -10,7 +10,7 @@ import numpy as np from sklearn.base import clone -from sklearn.neighbors.base import KNeighborsMixin +from sklearn.neighbors._base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors from sklearn.utils.multiclass import type_of_target diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index b3329bf98..634f502f0 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -9,7 +9,7 @@ import pytest import numpy as np -from sklearn.neighbors.base import KNeighborsMixin +from sklearn.neighbors._base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_array_equal From 263ff722253f0da5d35b6e17a0cc463d90bae428 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 18:12:49 +0100 Subject: [PATCH 28/37] iter --- imblearn/ensemble/tests/test_easy_ensemble.py | 1 - imblearn/ensemble/tests/test_forest.py | 2 +- imblearn/metrics/tests/test_score_objects.py | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py 
b/imblearn/ensemble/tests/test_easy_ensemble.py index 68c757a4c..1e13b88a7 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -250,6 +250,5 @@ def test_easy_ensemble_classifier_grid_search(): EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()), parameters, cv=5, - iid=False, ) grid_search.fit(X, y) diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py index ffe71b0c2..0f386cb1a 100644 --- a/imblearn/ensemble/tests/test_forest.py +++ b/imblearn/ensemble/tests/test_forest.py @@ -131,6 +131,6 @@ def test_balanced_random_forest_oob(imbalanced_dataset): def test_balanced_random_forest_grid_search(imbalanced_dataset): brf = BalancedRandomForestClassifier() grid = GridSearchCV( - brf, {"n_estimators": (1, 2), "max_depth": (1, 2)}, cv=3, iid=False + brf, {"n_estimators": (1, 2), "max_depth": (1, 2)}, cv=3 ) grid.fit(*imbalanced_dataset) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index 7596b6dbe..88c7d2c93 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -45,7 +45,6 @@ def test_scorer_common_average(data, score, expected_score, average): param_grid={"C": [1, 10]}, scoring=scorer, cv=3, - iid=False, ) grid.fit(X_train, y_train).predict(X_test) @@ -75,7 +74,6 @@ def test_scorer_default_average(data, score, average, expected_score): param_grid={"C": [1, 10]}, scoring=scorer, cv=3, - iid=False, ) grid.fit(X_train, y_train).predict(X_test) From c5ac35d3c8d43b934aa46eca077005fd1b33094b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 18:22:47 +0100 Subject: [PATCH 29/37] iter --- imblearn/tensorflow/_generator.py | 2 +- .../_prototype_selection/_instance_hardness_threshold.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index 
725014875..8214b0de8 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -21,7 +21,7 @@ def balanced_batch_generator( keep_sparse=False, random_state=None, ): - """Create a balanced batch generator to train keras model. + """Create a balanced batch generator to train tensorflow model. Returns a generator --- as well as the number of step per epoch --- which is given to ``fit_generator``. The sampler defines the sampling strategy diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index e595b4c81..16d69056b 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -129,9 +129,7 @@ def _fit_resample(self, X, y): self._validate_estimator() target_stats = Counter(y) - skf = StratifiedKFold( - n_splits=self.cv, shuffle=False, random_state=self.random_state - ).split(X, y) + skf = StratifiedKFold(n_splits=self.cv, shuffle=False).split(X, y) probabilities = np.zeros(y.shape[0], dtype=float) for train_index, test_index in skf: From c5a60a15ef3aab8f4cd04b7a0daa5924927d24f6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 18:29:51 +0100 Subject: [PATCH 30/37] update ubuntu image --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 89657fff8..d499a3f38 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ # make it explicit that we favor the new container-based travis workers -dist: xenial +dist: bionic sudo: false language: python From 54a2e92b2ec880101fbd4e1c2a5013c1f071726f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 18:39:00 +0100 Subject: [PATCH 31/37] iter --- build_tools/circle/build_doc.sh | 3 ++- doc/conf.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git 
a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index d364c74d0..7895fd29e 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -92,9 +92,10 @@ conda create -n $CONDA_ENV_NAME --yes --quiet python=3.7 source activate $CONDA_ENV_NAME conda install --yes pip numpy scipy joblib pillow matplotlib sphinx \ - sphinx_rtd_theme numpydoc pandas tensorflow + sphinx_rtd_theme pandas keras tensorflow=1 pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git +pip install -U git+https://github.com/numpy/numpydoc.git # Build and install imbalanced-learn in dev mode ls -l diff --git a/doc/conf.py b/doc/conf.py index 3b44b341c..26f5d06c7 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -265,6 +265,7 @@ sphinx_gallery_conf = { 'doc_module': 'imblearn', 'backreferences_dir': os.path.join('generated'), + 'show_memory': True, 'reference_url': { 'imblearn': None} } From 05aaa8a932b44b36f4f9fe0321870c470bec8257 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 18:49:49 +0100 Subject: [PATCH 32/37] update example --- .travis.yml | 2 -- .../ensemble/plot_comparison_ensemble_classifier.py | 13 +++++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index d499a3f38..e67b9e017 100644 --- a/.travis.yml +++ b/.travis.yml @@ -41,8 +41,6 @@ matrix: - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" OPTIONAL_DEPS="false" - allow_failures: - - env: DISTRIB="ubuntu" install: source build_tools/travis/install.sh script: bash build_tools/travis/test_script.sh diff --git a/examples/ensemble/plot_comparison_ensemble_classifier.py b/examples/ensemble/plot_comparison_ensemble_classifier.py index ee30789a1..641df2b57 100644 --- a/examples/ensemble/plot_comparison_ensemble_classifier.py +++ 
b/examples/ensemble/plot_comparison_ensemble_classifier.py @@ -111,9 +111,8 @@ def plot_confusion_matrix(cm, classes, ax, # will use a bagging classifier and its counter part which internally uses a # random under-sampling to balanced each boostrap sample. -bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1) -balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0, - n_jobs=-1) +bagging = BaggingClassifier(n_estimators=50, random_state=0) +balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0) bagging.fit(X_train, y_train) balanced_bagging.fit(X_train, y_train) @@ -149,9 +148,8 @@ def plot_confusion_matrix(cm, classes, ax, # outperforming bagging. Here, we used a vanilla random forest and its balanced # counterpart in which each bootstrap sample is balanced. -rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1) -brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, - n_jobs=-1) +rf = RandomForestClassifier(n_estimators=50, random_state=0) +brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0) rf.fit(X_train, y_train) brf.fit(X_train, y_train) @@ -189,8 +187,7 @@ def plot_confusion_matrix(cm, classes, ax, base_estimator = AdaBoostClassifier(n_estimators=10) eec = EasyEnsembleClassifier(n_estimators=10, - base_estimator=base_estimator, - n_jobs=-1) + base_estimator=base_estimator) eec.fit(X_train, y_train) y_pred_eec = eec.predict(X_test) print('Easy ensemble classifier performance:') From e31c6c6a707c1eec464b6a00709cc84baf147e00 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 18:59:49 +0100 Subject: [PATCH 33/37] TST only for tensorflow1 --- imblearn/tensorflow/_generator.py | 55 --------------------- imblearn/tensorflow/tests/test_generator.py | 14 ++++-- 2 files changed, 11 insertions(+), 58 deletions(-) diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index 8214b0de8..6c5b32872 100644 --- 
a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -62,61 +62,6 @@ def balanced_batch_generator( steps_per_epoch : int The number of samples per epoch. - Examples - -------- - >>> import numpy as np - >>> from sklearn.datasets import load_iris - >>> X, y = load_iris(return_X_y=True) - >>> class_dict = dict() - >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 - >>> from imblearn.datasets import make_imbalance - >>> X, y = make_imbalance(X, y, class_dict) - >>> X = X.astype(np.float32) - >>> batch_size, learning_rate, epochs = 10, 0.01, 10 - >>> training_generator, steps_per_epoch = balanced_batch_generator( - ... X, y, sample_weight=None, sampler=None, - ... batch_size=batch_size, random_state=42) - >>> input_size, output_size = X.shape[1], 3 - >>> import tensorflow as tf - >>> def init_weights(shape): - ... return tf.Variable(tf.random_normal(shape, stddev=0.01)) - >>> def accuracy(y_true, y_pred): - ... return np.mean(np.argmax(y_pred, axis=1) == y_true) - >>> # input and output - >>> data = tf.placeholder("float32", shape=[None, input_size]) - >>> targets = tf.placeholder("int32", shape=[None]) - >>> # build the model and weights - >>> W = init_weights([input_size, output_size]) - >>> b = init_weights([output_size]) - >>> out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) - >>> # build the loss, predict, and train operator - >>> cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - ... logits=out_act, labels=targets) - >>> loss = tf.reduce_sum(cross_entropy) - >>> optimizer = tf.train.GradientDescentOptimizer(learning_rate) - >>> train_op = optimizer.minimize(loss) - >>> predict = tf.nn.softmax(out_act) - >>> # Initialization of all variables in the graph - >>> init = tf.global_variables_initializer() - >>> with tf.Session() as sess: - ... print('Starting training') - ... sess.run(init) - ... for e in range(epochs): - ... for i in range(steps_per_epoch): - ... X_batch, y_batch = next(training_generator) - ... 
feed_dict = dict() - ... feed_dict[data] = X_batch; feed_dict[targets] = y_batch - ... sess.run([train_op, loss], feed_dict=feed_dict) - ... # For each epoch, run accuracy on train and test - ... feed_dict = dict() - ... feed_dict[data] = X - ... predicts_train = sess.run(predict, feed_dict=feed_dict) - ... print("epoch: {{}} train accuracy: {{:.3f}}" - ... .format(e, accuracy(y, predicts_train))) - ... # doctest: +ELLIPSIS - Starting training - [... - """ random_state = check_random_state(random_state) diff --git a/imblearn/tensorflow/tests/test_generator.py b/imblearn/tensorflow/tests/test_generator.py index 27d63df3c..73d751c05 100644 --- a/imblearn/tensorflow/tests/test_generator.py +++ b/imblearn/tensorflow/tests/test_generator.py @@ -1,3 +1,5 @@ +from distutils.version import LooseVersion + import pytest import numpy as np from scipy import sparse @@ -21,9 +23,8 @@ def data(): return X, y -@pytest.mark.parametrize("sampler", [None, NearMiss(), RandomOverSampler()]) -def test_balanced_batch_generator(data, sampler): - X, y = data +def check_balanced_batch_generator_tf_1_X_X(dataset, sampler): + X, y = dataset batch_size = 10 training_generator, steps_per_epoch = balanced_batch_generator( X, @@ -87,6 +88,13 @@ def accuracy(y_true, y_pred): ) + +@pytest.mark.parametrize("sampler", [None, NearMiss(), RandomOverSampler()]) +def test_balanced_batch_generator(data, sampler): + if LooseVersion(tf.__version__) < '2': + check_balanced_batch_generator_tf_1_X_X(data, sampler) + + @pytest.mark.parametrize("keep_sparse", [True, False]) def test_balanced_batch_generator_function_sparse(data, keep_sparse): X, y = data From d5b9ca8a51ab239e029b2230a410a90d03bc1c89 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 19:31:49 +0100 Subject: [PATCH 34/37] xxx --- build_tools/circle/build_doc.sh | 2 +- imblearn/tensorflow/tests/test_generator.py | 68 +++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git 
a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 7895fd29e..0e80ed8d7 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -92,7 +92,7 @@ conda create -n $CONDA_ENV_NAME --yes --quiet python=3.7 source activate $CONDA_ENV_NAME conda install --yes pip numpy scipy joblib pillow matplotlib sphinx \ - sphinx_rtd_theme pandas keras tensorflow=1 + memory_profiler sphinx_rtd_theme pandas keras tensorflow=1 pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git pip install -U git+https://github.com/numpy/numpydoc.git diff --git a/imblearn/tensorflow/tests/test_generator.py b/imblearn/tensorflow/tests/test_generator.py index 73d751c05..3aa8864f5 100644 --- a/imblearn/tensorflow/tests/test_generator.py +++ b/imblearn/tensorflow/tests/test_generator.py @@ -88,11 +88,79 @@ def accuracy(y_true, y_pred): ) +def check_balanced_batch_generator_tf_2_X_X_compat_1_X_X(dataset, sampler): + tf.compat.v1.disable_eager_execution() + + X, y = dataset + batch_size = 10 + training_generator, steps_per_epoch = balanced_batch_generator( + X, + y, + sample_weight=None, + sampler=sampler, + batch_size=batch_size, + random_state=42, + ) + + learning_rate = 0.01 + epochs = 10 + input_size = X.shape[1] + output_size = 3 + + # helper functions + def init_weights(shape): + return tf.Variable(tf.random.normal(shape, stddev=0.01)) + + def accuracy(y_true, y_pred): + return np.mean(np.argmax(y_pred, axis=1) == y_true) + + # input and output + data = tf.compat.v1.placeholder("float32", shape=[None, input_size]) + targets = tf.compat.v1.placeholder("int32", shape=[None]) + + # build the model and weights + W = init_weights([input_size, output_size]) + b = init_weights([output_size]) + out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) + + # build the loss, predict, and train operator + cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + 
logits=out_act, labels=targets + ) + loss = tf.reduce_sum(input_tensor=cross_entropy) + optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate) + train_op = optimizer.minimize(loss) + predict = tf.nn.softmax(out_act) + + # Initialization of all variables in the graph + init = tf.compat.v1.global_variables_initializer() + + with tf.compat.v1.Session() as sess: + sess.run(init) + + for e in range(epochs): + for i in range(steps_per_epoch): + X_batch, y_batch = next(training_generator) + sess.run( + [train_op, loss], + feed_dict={data: X_batch, targets: y_batch}, + ) + + # For each epoch, run accuracy on train and test + predicts_train = sess.run(predict, feed_dict={data: X}) + print( + "epoch: {} train accuracy: {:.3f}".format( + e, accuracy(y, predicts_train) + ) + ) + @pytest.mark.parametrize("sampler", [None, NearMiss(), RandomOverSampler()]) def test_balanced_batch_generator(data, sampler): if LooseVersion(tf.__version__) < '2': check_balanced_batch_generator_tf_1_X_X(data, sampler) + else: + check_balanced_batch_generator_tf_2_X_X_compat_1_X_X(data, sampler) @pytest.mark.parametrize("keep_sparse", [True, False]) From 3a570b740e7418adbf5058cd48abb5c26ffe3b04 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 19:34:57 +0100 Subject: [PATCH 35/37] update appveyor --- appveyor.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 5c3316abd..b09063a6a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -10,7 +10,12 @@ environment: - PYTHON: "C:\\Miniconda36-x64" PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" - OPTIONAL_DEP: "pandas keras tensorflow" + OPTIONAL_DEP: "pandas keras tensorflow=1" + + - PYTHON: "C:\\Miniconda36-x64" + PYTHON_VERSION: "3.6" + PYTHON_ARCH: "64" + OPTIONAL_DEP: "pandas tensorflow" - PYTHON: "C:\\Miniconda36-x64" PYTHON_VERSION: "3.7" From 3603cdfcf6f5f590ec63ef15276d80b92657d4ef Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 19:44:07 
+0100 Subject: [PATCH 36/37] update version of documentation for readthedocs --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 80c407a5a..32f5aa15a 100755 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ 'matplotlib', 'pandas', 'keras', - 'tensorflow' + 'tensorflow>=1.0,<2' ] } From 034270f5503166b814326482f2008373f568c329 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 31 Oct 2019 22:47:17 +0100 Subject: [PATCH 37/37] documentation --- doc/whats_new/v0.6.rst | 43 +++++++++++++++++++++++++++++ imblearn/metrics/_classification.py | 2 +- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst index f165eac8a..7b9f86817 100644 --- a/doc/whats_new/v0.6.rst +++ b/doc/whats_new/v0.6.rst @@ -6,3 +6,46 @@ Version 0.6.0 (under-development) Changelog --------- +Changed models +.............. + +The following models might give some different sampling due to changes in +scikit-learn: + +- :class:`imblearn.under_sampling.ClusterCentroids` +- :class:`imblearn.under_sampling.InstanceHardnessThreshold` + +Maintenance +........... + +- Update imports from scikit-learn after some modules have been made private. + The following imports have been changed: + :class:`sklearn.ensemble._base._set_random_states`, + :class:`sklearn.ensemble._forest._parallel_build_trees`, + :class:`sklearn.metrics._classification._check_targets`, + :class:`sklearn.metrics._classification._prf_divide`, + :class:`sklearn.utils.Bunch`, + :class:`sklearn.utils._safe_indexing`, + :class:`sklearn.utils._testing.assert_allclose`, + :class:`sklearn.utils._testing.assert_array_equal`, + :class:`sklearn.utils._testing.SkipTest`. + :pr:`617` by :user:`Guillaume Lemaitre <glemaitre>`. + +Deprecation +........... + +- The following classes have been removed after 2 deprecation cycles: + `ensemble.BalanceCascade` and `ensemble.EasyEnsemble`. + :pr:`617` by :user:`Guillaume Lemaitre <glemaitre>`. 
+ +- The following function has been removed after 2 deprecation cycles: + `utils.check_ratio`. + :pr:`617` by :user:`Guillaume Lemaitre <glemaitre>`. + +- The parameters `ratio` and `return_indices` have been removed from all + samplers. + :pr:`617` by :user:`Guillaume Lemaitre <glemaitre>`. + +- The parameters `m_neighbors`, `out_step`, `kind`, `svm_estimator` + have been removed from :class:`imblearn.over_sampling.SMOTE`. + :pr:`617` by :user:`Guillaume Lemaitre <glemaitre>`. diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index b16f45494..30f8551b4 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -20,9 +20,9 @@ import numpy as np import scipy as sp +from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics._classification import _check_targets from sklearn.metrics._classification import _prf_divide -from sklearn.metrics._classification import precision_recall_fscore_support from sklearn.preprocessing import LabelEncoder from sklearn.utils.multiclass import unique_labels