diff --git a/asv_benchmarks/.gitignore b/asv_benchmarks/.gitignore new file mode 100644 index 0000000000000..a3fecdb98e0d3 --- /dev/null +++ b/asv_benchmarks/.gitignore @@ -0,0 +1,6 @@ +*__pycache__* +env/ +html/ +results/ +scikit-learn/ +benchmarks/cache/ diff --git a/asv_benchmarks/asv.conf.json b/asv_benchmarks/asv.conf.json new file mode 100644 index 0000000000000..7e839a1ecb175 --- /dev/null +++ b/asv_benchmarks/asv.conf.json @@ -0,0 +1,162 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "scikit-learn", + + // The project's homepage + "project_url": "scikit-learn.org/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building, installing, and + // uninstalling the project. See asv.conf.json documentation. + // + // "install_command": ["python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + // "build_command": [ + // "python setup.py build", + // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + // ], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + // "branches": ["master"], // for git + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + // "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "conda", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "https://github.com/scikit-learn/scikit-learn/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["3.6"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + // "conda_channels": ["conda-forge", "defaults"] + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty + // list or empty string indicates to just test against the default + // (latest) version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed via + // pip (with all the conda available packages installed first, + // followed by the pip installed packages). 
+ // + "matrix": { + "numpy": [], + "scipy": [], + "cython": [], + "joblib": [], + "threadpoolctl": [] + }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "six": null}, // don't run without six on conda + // ], + // + // "include": [ + // // additional env for python2.7 + // {"python": "2.7", "numpy": "1.8"}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + // "env_dir": "env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + // "results_dir": "results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + // "html_dir": "html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + // "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. 
+ // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/asv_benchmarks/benchmarks/__init__.py b/asv_benchmarks/benchmarks/__init__.py new file mode 100644 index 0000000000000..27dd4763446f0 --- /dev/null +++ b/asv_benchmarks/benchmarks/__init__.py @@ -0,0 +1 @@ +"""Benchmark suite for scikit-learn using ASV""" diff --git a/asv_benchmarks/benchmarks/cluster.py b/asv_benchmarks/benchmarks/cluster.py new file mode 100644 index 0000000000000..7e92f8cb6ddd2 --- /dev/null +++ b/asv_benchmarks/benchmarks/cluster.py @@ -0,0 +1,100 @@ +from sklearn.cluster import KMeans, MiniBatchKMeans + +from .common import Benchmark, Estimator, Predictor, Transformer +from .datasets import _blobs_dataset, _20newsgroups_highdim_dataset +from .utils import neg_mean_inertia + + +class KMeansBenchmark(Predictor, Transformer, Estimator, Benchmark): + """ + Benchmarks for KMeans. + """ + + param_names = ['representation', 'algorithm', 'init'] + params = (['dense', 'sparse'], ['full', 'elkan'], ['random', 'k-means++']) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, algorithm, init = params + + if representation == 'sparse': + data = _20newsgroups_highdim_dataset(n_samples=8000) + else: + data = _blobs_dataset(n_clusters=20) + + return data + + def make_estimator(self, params): + representation, algorithm, init = params + + max_iter = 30 if representation == 'sparse' else 100 + + estimator = KMeans(n_clusters=20, + algorithm=algorithm, + init=init, + n_init=1, + max_iter=max_iter, + tol=-1, + random_state=0) + + return estimator + + def make_scorers(self): + self.train_scorer = ( + lambda _, __: neg_mean_inertia(self.X, + self.estimator.predict(self.X), + self.estimator.cluster_centers_)) + self.test_scorer = ( + lambda _, __: neg_mean_inertia(self.X_val, + self.estimator.predict(self.X_val), + self.estimator.cluster_centers_)) + + +class MiniBatchKMeansBenchmark(Predictor, Transformer, Estimator, Benchmark): + """ + Benchmarks for MiniBatchKMeans. 
+ """ + + param_names = ['representation', 'init'] + params = (['dense', 'sparse'], ['random', 'k-means++']) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, init = params + + if representation == 'sparse': + data = _20newsgroups_highdim_dataset() + else: + data = _blobs_dataset(n_clusters=20) + + return data + + def make_estimator(self, params): + representation, init = params + + max_iter = 5 if representation == 'sparse' else 2 + + estimator = MiniBatchKMeans(n_clusters=20, + init=init, + n_init=1, + max_iter=max_iter, + batch_size=1000, + max_no_improvement=None, + compute_labels=False, + random_state=0) + + return estimator + + def make_scorers(self): + self.train_scorer = ( + lambda _, __: neg_mean_inertia(self.X, + self.estimator.predict(self.X), + self.estimator.cluster_centers_)) + self.test_scorer = ( + lambda _, __: neg_mean_inertia(self.X_val, + self.estimator.predict(self.X_val), + self.estimator.cluster_centers_)) diff --git a/asv_benchmarks/benchmarks/common.py b/asv_benchmarks/benchmarks/common.py new file mode 100644 index 0000000000000..70760dc47a9b7 --- /dev/null +++ b/asv_benchmarks/benchmarks/common.py @@ -0,0 +1,235 @@ +import os +import json +import timeit +import pickle +import itertools +from abc import ABC, abstractmethod +from pathlib import Path +from multiprocessing import cpu_count + +import numpy as np + + +def get_from_config(): + """Get benchmarks configuration from the config.json file""" + current_path = Path(__file__).resolve().parent + + config_path = current_path / 'config.json' + with open(config_path, 'r') as config_file: + config_file = ''.join(line for line in config_file + if line and '//' not in line) + config = json.loads(config_file) + + profile = os.getenv('SKLBENCH_PROFILE', config['profile']) + + n_jobs_vals_env = os.getenv('SKLBENCH_NJOBS') + if n_jobs_vals_env: + n_jobs_vals = eval(n_jobs_vals_env) + else: + n_jobs_vals = config['n_jobs_vals'] + if not n_jobs_vals: + n_jobs_vals = list(range(1, 1 + cpu_count())) + + cache_path = current_path / 'cache' + cache_path.mkdir(exist_ok=True) + (cache_path / 'estimators').mkdir(exist_ok=True) + (cache_path / 'tmp').mkdir(exist_ok=True) + + save_estimators = os.getenv('SKLBENCH_SAVE_ESTIMATORS', + config['save_estimators']) + save_dir = os.getenv('ASV_COMMIT', 'new')[:8] + + if save_estimators: + (cache_path / 'estimators' / save_dir).mkdir(exist_ok=True) + + base_commit = os.getenv('SKLBENCH_BASE_COMMIT', config['base_commit']) + + bench_predict = os.getenv('SKLBENCH_PREDICT', config['bench_predict']) + bench_transform = os.getenv('SKLBENCH_TRANSFORM', + config['bench_transform']) + + return (profile, n_jobs_vals, save_estimators, save_dir, base_commit, + bench_predict, bench_transform) + + +def get_estimator_path(benchmark, directory, params, save=False): + """Get path of pickled fitted estimator""" + path = Path(__file__).resolve().parent / 'cache' + path = (path / 'estimators' / directory) if save else (path / 'tmp') + + filename = (benchmark.__class__.__name__ + + '_estimator_' + '_'.join(list(map(str, params))) + '.pkl') + + return path / filename + + +def clear_tmp(): + """Clean the tmp directory""" + path = Path(__file__).resolve().parent / 'cache' / 'tmp' + for child in path.iterdir(): + child.unlink() + + +class Benchmark(ABC): + """Abstract base class for all the benchmarks""" + timer = timeit.default_timer # wall time + processes = 1 + timeout = 500 + + (profile, n_jobs_vals, save_estimators, save_dir, base_commit, + bench_predict, 
bench_transform) = get_from_config() + + if profile == 'fast': + warmup_time = 0 + repeat = 1 + number = 1 + min_run_count = 1 + data_size = 'small' + elif profile == 'regular': + warmup_time = 1 + repeat = (3, 100, 30) + data_size = 'small' + elif profile == 'large_scale': + warmup_time = 1 + repeat = 3 + number = 1 + data_size = 'large' + + @property + @abstractmethod + def params(self): + pass + + +class Estimator(ABC): + """Abstract base class for all benchmarks of estimators""" + @abstractmethod + def make_data(self, params): + """Return the dataset for a combination of parameters""" + # The datasets are cached using joblib.Memory so it's fast and can be + # called for each repeat + pass + + @abstractmethod + def make_estimator(self, params): + """Return an instance of the estimator for a combination of parameters + """ + pass + + def skip(self, params): + """Return True if the benchmark should be skipped for these params""" + return False + + def setup_cache(self): + """Pickle a fitted estimator for all combinations of parameters""" + # This is run once per benchmark class. + + clear_tmp() + + param_grid = list(itertools.product(*self.params)) + + for params in param_grid: + if self.skip(params): + continue + + estimator = self.make_estimator(params) + X, _, y, _ = self.make_data(params) + + estimator.fit(X, y) + + est_path = get_estimator_path(self, Benchmark.save_dir, + params, Benchmark.save_estimators) + with est_path.open(mode='wb') as f: + pickle.dump(estimator, f) + + def setup(self, *params): + """Generate dataset and load the fitted estimator""" + # This is run once per combination of parameters and per repeat so we + # need to avoid doing expensive operations there. + + if self.skip(params): + raise NotImplementedError + + self.X, self.X_val, self.y, self.y_val = self.make_data(params) + + est_path = get_estimator_path(self, Benchmark.save_dir, + params, Benchmark.save_estimators) + with est_path.open(mode='rb') as f: + self.estimator = pickle.load(f) + + self.make_scorers() + + def time_fit(self, *args): + self.estimator.fit(self.X, self.y) + + def peakmem_fit(self, *args): + self.estimator.fit(self.X, self.y) + + def track_train_score(self, *args): + if hasattr(self.estimator, 'predict'): + y_pred = self.estimator.predict(self.X) + else: + y_pred = None + return float(self.train_scorer(self.y, y_pred)) + + def track_test_score(self, *args): + if hasattr(self.estimator, 'predict'): + y_val_pred = self.estimator.predict(self.X_val) + else: + y_val_pred = None + return float(self.test_scorer(self.y_val, y_val_pred)) + + +class Predictor(ABC): + """Abstract base class for benchmarks of estimators implementing predict""" + if Benchmark.bench_predict: + def time_predict(self, *args): + self.estimator.predict(self.X) + + def peakmem_predict(self, *args): + self.estimator.predict(self.X) + + if Benchmark.base_commit is not None: + def track_same_prediction(self, *args): + est_path = get_estimator_path(self, Benchmark.base_commit, + args, True) + with est_path.open(mode='rb') as f: + estimator_base = pickle.load(f) + + y_val_pred_base = estimator_base.predict(self.X_val) + y_val_pred = self.estimator.predict(self.X_val) + + return np.allclose(y_val_pred_base, y_val_pred) + + @property + @abstractmethod + def params(self): + pass + + +class Transformer(ABC): + """Abstract base class for benchmarks of estimators implementing transform + """ + if Benchmark.bench_transform: + def time_transform(self, *args): + self.estimator.transform(self.X) + + def peakmem_transform(self, *args): + 
self.estimator.transform(self.X) + + if Benchmark.base_commit is not None: + def track_same_transform(self, *args): + est_path = get_estimator_path(self, Benchmark.base_commit, + args, True) + with est_path.open(mode='rb') as f: + estimator_base = pickle.load(f) + + X_val_t_base = estimator_base.transform(self.X_val) + X_val_t = self.estimator.transform(self.X_val) + + return np.allclose(X_val_t_base, X_val_t) + + @property + @abstractmethod + def params(self): + pass diff --git a/asv_benchmarks/benchmarks/config.json b/asv_benchmarks/benchmarks/config.json new file mode 100644 index 0000000000000..f50827cdbd7b7 --- /dev/null +++ b/asv_benchmarks/benchmarks/config.json @@ -0,0 +1,33 @@ +{ + // "regular": Benchmarks are run on small to medium datasets. Each benchmark + // is run multiple times and averaged. + // "fast": Benchmarks are run on small to medium datasets. Each benchmark + // is run only once. May provide unstable benchmarks. + // "large_scale": Benchmarks are run on large datasets. Each benchmark is + // run multiple times and averaged. This profile is meant to + // benchmark scalability and will take hours on a single core. + // Can be overridden by environment variable SKLBENCH_PROFILE. + "profile": "regular", + + // List of values of n_jobs to use for estimators which accept this + // parameter (-1 means all cores). An empty list means all values from 1 to + // the maximum number of available cores. + // Can be overridden by environment variable SKLBENCH_NJOBS. + "n_jobs_vals": [1], + + // If true, fitted estimators are saved in ./cache/estimators/ + // Can be overridden by environment variable SKLBENCH_SAVE_ESTIMATORS. + "save_estimators": false, + + // Commit hash to compare estimator predictions with. + // If null, predictions are not compared. + // Can be overridden by environment variable SKLBENCH_BASE_COMMIT. + "base_commit": null, + + // If false, the predict (resp. transform) method of the estimators won't + // be benchmarked. + // Can be overridden by environment variables SKLBENCH_PREDICT and + // SKLBENCH_TRANSFORM.
+ "bench_predict": true, + "bench_transform": true +} diff --git a/asv_benchmarks/benchmarks/datasets.py b/asv_benchmarks/benchmarks/datasets.py new file mode 100644 index 0000000000000..709fe5746f8bc --- /dev/null +++ b/asv_benchmarks/benchmarks/datasets.py @@ -0,0 +1,145 @@ +import numpy as np +import scipy.sparse as sp +from joblib import Memory +from pathlib import Path + +from sklearn.decomposition import TruncatedSVD +from sklearn.datasets import (make_blobs, fetch_20newsgroups, + fetch_openml, load_digits, make_regression, + make_classification, fetch_olivetti_faces) +from sklearn.preprocessing import MaxAbsScaler, StandardScaler +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split + +# memory location for caching datasets +M = Memory(location=str(Path(__file__).resolve().parent / 'cache')) + + +@M.cache +def _blobs_dataset(n_samples=500000, n_features=3, n_clusters=100, + dtype=np.float32): + X, _ = make_blobs(n_samples=n_samples, n_features=n_features, + centers=n_clusters, random_state=0) + X = X.astype(dtype, copy=False) + + X, X_val = train_test_split(X, test_size=0.1, random_state=0) + return X, X_val, None, None + + +@M.cache +def _20newsgroups_highdim_dataset(n_samples=None, ngrams=(1, 1), + dtype=np.float32): + newsgroups = fetch_20newsgroups(random_state=0) + vectorizer = TfidfVectorizer(ngram_range=ngrams, dtype=dtype) + X = vectorizer.fit_transform(newsgroups.data[:n_samples]) + y = newsgroups.target[:n_samples] + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), + dtype=np.float32): + newsgroups = fetch_20newsgroups() + vectorizer = TfidfVectorizer(ngram_range=ngrams) + X = vectorizer.fit_transform(newsgroups.data) + X = X.astype(dtype, copy=False) + svd = TruncatedSVD(n_components=n_components) + X = svd.fit_transform(X) + y = newsgroups.target + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _mnist_dataset(dtype=np.float32): + X, y = fetch_openml('mnist_784', version=1, return_X_y=True) + X = X.astype(dtype, copy=False) + X = MaxAbsScaler().fit_transform(X) + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _digits_dataset(n_samples=None, dtype=np.float32): + X, y = load_digits(return_X_y=True) + X = X.astype(dtype, copy=False) + X = MaxAbsScaler().fit_transform(X) + X = X[:n_samples] + y = y[:n_samples] + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _synth_regression_dataset(n_samples=100000, n_features=100, + dtype=np.float32): + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features // 10, noise=50, + random_state=0) + X = X.astype(dtype, copy=False) + X = StandardScaler().fit_transform(X) + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _synth_regression_sparse_dataset(n_samples=10000, n_features=10000, + density=0.01, dtype=np.float32): + X = sp.random(m=n_samples, n=n_features, density=density, format='csr', + random_state=0) + X.data = np.random.RandomState(0).randn(X.getnnz()) + X = X.astype(dtype, copy=False) + coefs = sp.random(m=n_features, n=1, density=0.5, random_state=0) + coefs.data = 
np.random.RandomState(0).randn(coefs.getnnz()) + y = X.dot(coefs.toarray()).reshape(-1) + y += 0.2 * y.std() * np.random.randn(n_samples) + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _synth_classification_dataset(n_samples=1000, n_features=10000, + n_classes=2, dtype=np.float32): + X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_classes=n_classes, random_state=0, + n_informative=n_features, n_redundant=0) + X = X.astype(dtype, copy=False) + X = StandardScaler().fit_transform(X) + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _olivetti_faces_dataset(): + dataset = fetch_olivetti_faces(shuffle=True, random_state=42) + faces = dataset.data + n_samples, n_features = faces.shape + faces_centered = faces - faces.mean(axis=0) + # local centering + faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1) + X = faces_centered + + X, X_val = train_test_split(X, test_size=0.1, random_state=0) + return X, X_val, None, None + + +@M.cache +def _random_dataset(n_samples=1000, n_features=1000, + representation='dense', dtype=np.float32): + if representation == 'dense': + X = np.random.RandomState(0).random_sample((n_samples, n_features)) + X = X.astype(dtype, copy=False) + else: + X = sp.random(n_samples, n_features, density=0.05, format='csr', + dtype=dtype, random_state=0) + + X, X_val = train_test_split(X, test_size=0.1, random_state=0) + return X, X_val, None, None diff --git a/asv_benchmarks/benchmarks/decomposition.py b/asv_benchmarks/benchmarks/decomposition.py new file mode 100644 index 0000000000000..ea23b6d0d4c82 --- /dev/null +++ b/asv_benchmarks/benchmarks/decomposition.py @@ -0,0 +1,94 @@ +from sklearn.decomposition import (PCA, DictionaryLearning, + MiniBatchDictionaryLearning) + +from .common import Benchmark, Estimator, Transformer +from .datasets import _olivetti_faces_dataset, _mnist_dataset +from .utils import make_pca_scorers, make_dict_learning_scorers + + +class PCABenchmark(Transformer, Estimator, Benchmark): + """ + Benchmarks for PCA. + """ + + param_names = ['svd_solver'] + params = (['full', 'arpack', 'randomized'],) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + return _mnist_dataset() + + def make_estimator(self, params): + svd_solver, = params + + estimator = PCA(n_components=32, + svd_solver=svd_solver, + random_state=0) + + return estimator + + def make_scorers(self): + make_pca_scorers(self) + + +class DictionaryLearningBenchmark(Transformer, Estimator, Benchmark): + """ + Benchmarks for DictionaryLearning. 
+ """ + + param_names = ['fit_algorithm', 'n_jobs'] + params = (['lars', 'cd'], Benchmark.n_jobs_vals) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + return _olivetti_faces_dataset() + + def make_estimator(self, params): + fit_algorithm, n_jobs = params + + estimator = DictionaryLearning(n_components=15, + fit_algorithm=fit_algorithm, + alpha=0.1, + max_iter=20, + tol=1e-16, + random_state=0, + n_jobs=n_jobs) + + return estimator + + def make_scorers(self): + make_dict_learning_scorers(self) + + +class MiniBatchDictionaryLearningBenchmark(Transformer, Estimator, Benchmark): + """ + Benchmarks for MiniBatchDictionaryLearning + """ + + param_names = ['fit_algorithm', 'n_jobs'] + params = (['lars', 'cd'], Benchmark.n_jobs_vals) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + return _olivetti_faces_dataset() + + def make_estimator(self, params): + fit_algorithm, n_jobs = params + + estimator = MiniBatchDictionaryLearning(n_components=15, + fit_algorithm=fit_algorithm, + alpha=0.1, + batch_size=3, + random_state=0, + n_jobs=n_jobs) + + return estimator + + def make_scorers(self): + make_dict_learning_scorers(self) diff --git a/asv_benchmarks/benchmarks/ensemble.py b/asv_benchmarks/benchmarks/ensemble.py new file mode 100644 index 0000000000000..2c6995167ac6e --- /dev/null +++ b/asv_benchmarks/benchmarks/ensemble.py @@ -0,0 +1,81 @@ +from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier + +from .common import Benchmark, Estimator, Predictor +from .datasets import (_20newsgroups_highdim_dataset, + _20newsgroups_lowdim_dataset) +from .utils import make_gen_classif_scorers + + +class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmarks for RandomForestClassifier. + """ + + param_names = ['representation', 'n_jobs'] + params = (['dense', 'sparse'], Benchmark.n_jobs_vals) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, n_jobs = params + + if representation == 'sparse': + data = _20newsgroups_highdim_dataset() + else: + data = _20newsgroups_lowdim_dataset() + + return data + + def make_estimator(self, params): + representation, n_jobs = params + + n_estimators = 500 if Benchmark.data_size == 'large' else 100 + + estimator = RandomForestClassifier(n_estimators=n_estimators, + min_samples_split=10, + max_features='log2', + n_jobs=n_jobs, + random_state=0) + + return estimator + + def make_scorers(self): + make_gen_classif_scorers(self) + + +class GradientBoostingClassifierBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmarks for GradientBoostingClassifier. 
+ """ + + param_names = ['representation'] + params = (['dense', 'sparse'],) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, = params + + if representation == 'sparse': + data = _20newsgroups_highdim_dataset() + else: + data = _20newsgroups_lowdim_dataset() + + return data + + def make_estimator(self, params): + representation, = params + + n_estimators = 100 if Benchmark.data_size == 'large' else 10 + + estimator = GradientBoostingClassifier(n_estimators=n_estimators, + max_features='log2', + subsample=0.5, + random_state=0) + + return estimator + + def make_scorers(self): + make_gen_classif_scorers(self) diff --git a/asv_benchmarks/benchmarks/linear_model.py b/asv_benchmarks/benchmarks/linear_model.py new file mode 100644 index 0000000000000..e8f41a97a80cd --- /dev/null +++ b/asv_benchmarks/benchmarks/linear_model.py @@ -0,0 +1,250 @@ +from sklearn.linear_model import (LogisticRegression, Ridge, ElasticNet, Lasso, + LinearRegression, SGDRegressor) + +from .common import Benchmark, Estimator, Predictor +from .datasets import (_20newsgroups_highdim_dataset, + _20newsgroups_lowdim_dataset, + _synth_regression_dataset, + _synth_regression_sparse_dataset) +from .utils import make_gen_classif_scorers, make_gen_reg_scorers + + +class LogisticRegressionBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmarks for LogisticRegression. + """ + + param_names = ['representation', 'solver', 'n_jobs'] + params = (['dense', 'sparse'], ['lbfgs', 'saga'], Benchmark.n_jobs_vals) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, solver, n_jobs = params + + if Benchmark.data_size == 'large': + if representation == 'sparse': + data = _20newsgroups_highdim_dataset(n_samples=10000) + else: + data = _20newsgroups_lowdim_dataset(n_components=1e3) + else: + if representation == 'sparse': + data = _20newsgroups_highdim_dataset(n_samples=2500) + else: + data = _20newsgroups_lowdim_dataset() + + return data + + def make_estimator(self, params): + representation, solver, n_jobs = params + + penalty = 'l2' if solver == 'lbfgs' else 'l1' + + estimator = LogisticRegression(solver=solver, + penalty=penalty, + multi_class='multinomial', + tol=0.01, + n_jobs=n_jobs, + random_state=0) + + return estimator + + def make_scorers(self): + make_gen_classif_scorers(self) + + +class RidgeBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmarks for Ridge. + """ + + param_names = ['representation', 'solver'] + params = (['dense', 'sparse'], + ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, solver = params + + if representation == 'dense': + data = _synth_regression_dataset(n_samples=500000, n_features=100) + else: + data = _synth_regression_sparse_dataset(n_samples=100000, + n_features=10000, + density=0.005) + + return data + + def make_estimator(self, params): + representation, solver = params + + estimator = Ridge(solver=solver, + fit_intercept=False, + random_state=0) + + return estimator + + def make_scorers(self): + make_gen_reg_scorers(self) + + def skip(self, params): + representation, solver = params + + if representation == 'sparse' and solver == 'svd': + return True + return False + + +class LinearRegressionBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmarks for Linear Reagression. 
+ """ + + param_names = ['representation'] + params = (['dense', 'sparse'],) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, = params + + if representation == 'dense': + data = _synth_regression_dataset(n_samples=1000000, n_features=100) + else: + data = _synth_regression_sparse_dataset(n_samples=10000, + n_features=100000, + density=0.01) + + return data + + def make_estimator(self, params): + estimator = LinearRegression() + + return estimator + + def make_scorers(self): + make_gen_reg_scorers(self) + + +class SGDRegressorBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmark for SGD + """ + + param_names = ['representation'] + params = (['dense', 'sparse'],) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, = params + + if representation == 'dense': + data = _synth_regression_dataset(n_samples=100000, n_features=200) + else: + data = _synth_regression_sparse_dataset(n_samples=100000, + n_features=1000, + density=0.01) + + return data + + def make_estimator(self, params): + estimator = SGDRegressor(max_iter=1000, + tol=1e-16, + random_state=0) + + return estimator + + def make_scorers(self): + make_gen_reg_scorers(self) + + +class ElasticNetBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmarks for ElasticNet. + """ + + param_names = ['representation', 'precompute'] + params = (['dense', 'sparse'], [True, False]) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, precompute = params + + if representation == 'dense': + data = _synth_regression_dataset(n_samples=1000000, n_features=100) + else: + data = _synth_regression_sparse_dataset(n_samples=50000, + n_features=5000, + density=0.01) + + return data + + def make_estimator(self, params): + representation, precompute = params + + estimator = ElasticNet(precompute=precompute, + alpha=0.001, + random_state=0) + + return estimator + + def make_scorers(self): + make_gen_reg_scorers(self) + + def skip(self, params): + representation, precompute = params + + if representation == 'sparse' and precompute is False: + return True + return False + + +class LassoBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmarks for Lasso. + """ + + param_names = ['representation', 'precompute'] + params = (['dense', 'sparse'], [True, False]) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, precompute = params + + if representation == 'dense': + data = _synth_regression_dataset(n_samples=1000000, n_features=100) + else: + data = _synth_regression_sparse_dataset(n_samples=50000, + n_features=5000, + density=0.01) + + return data + + def make_estimator(self, params): + representation, precompute = params + + estimator = Lasso(precompute=precompute, + alpha=0.001, + random_state=0) + + return estimator + + def make_scorers(self): + make_gen_reg_scorers(self) + + def skip(self, params): + representation, precompute = params + + if representation == 'sparse' and precompute is False: + return True + return False diff --git a/asv_benchmarks/benchmarks/manifold.py b/asv_benchmarks/benchmarks/manifold.py new file mode 100644 index 0000000000000..26197dc8bbc31 --- /dev/null +++ b/asv_benchmarks/benchmarks/manifold.py @@ -0,0 +1,34 @@ +from sklearn.manifold import TSNE + +from .common import Benchmark, Estimator +from .datasets import _digits_dataset + + +class TSNEBenchmark(Estimator, Benchmark): + """ + Benchmarks for t-SNE. 
+ """ + + param_names = ['method'] + params = (['exact', 'barnes_hut'],) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + method, = params + + n_samples = 500 if method == 'exact' else None + + return _digits_dataset(n_samples=n_samples) + + def make_estimator(self, params): + method, = params + + estimator = TSNE(random_state=0, method=method) + + return estimator + + def make_scorers(self): + self.train_scorer = lambda _, __: self.estimator.kl_divergence_ + self.test_scorer = lambda _, __: self.estimator.kl_divergence_ diff --git a/asv_benchmarks/benchmarks/metrics.py b/asv_benchmarks/benchmarks/metrics.py new file mode 100644 index 0000000000000..4a84cf1941a8f --- /dev/null +++ b/asv_benchmarks/benchmarks/metrics.py @@ -0,0 +1,45 @@ +from sklearn.metrics.pairwise import pairwise_distances + +from .common import Benchmark +from .datasets import _random_dataset + + +class PairwiseDistancesBenchmark(Benchmark): + """ + Benchmarks for pairwise distances. + """ + + param_names = ['representation', 'metric', 'n_jobs'] + params = (['dense', 'sparse'], + ['cosine', 'euclidean', 'manhattan', 'correlation'], + Benchmark.n_jobs_vals) + + def setup(self, *params): + representation, metric, n_jobs = params + + if representation == 'sparse' and metric == 'correlation': + raise NotImplementedError + + if Benchmark.data_size == 'large': + if metric in ('manhattan', 'correlation'): + n_samples = 8000 + else: + n_samples = 24000 + else: + if metric in ('manhattan', 'correlation'): + n_samples = 4000 + else: + n_samples = 12000 + + data = _random_dataset(n_samples=n_samples, + representation=representation) + self.X, self.X_val, self.y, self.y_val = data + + self.pdist_params = {'metric': metric, + 'n_jobs': n_jobs} + + def time_pairwise_distances(self, *args): + pairwise_distances(self.X, **self.pdist_params) + + def peakmem_pairwise_distances(self, *args): + pairwise_distances(self.X, **self.pdist_params) diff --git a/asv_benchmarks/benchmarks/model_selection.py b/asv_benchmarks/benchmarks/model_selection.py new file mode 100644 index 0000000000000..4e7058ffc2262 --- /dev/null +++ b/asv_benchmarks/benchmarks/model_selection.py @@ -0,0 +1,86 @@ +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import GridSearchCV, cross_val_score + +from .common import Benchmark, Estimator, Predictor +from .datasets import _synth_classification_dataset +from .utils import make_gen_classif_scorers + + +class CrossValidationBenchmark(Benchmark): + """ + Benchmarks for Cross Validation. + """ + + timeout = 20000 + + param_names = ['n_jobs'] + params = (Benchmark.n_jobs_vals,) + + def setup(self, *params): + n_jobs, = params + + data = _synth_classification_dataset(n_samples=50000, n_features=100) + self.X, self.X_val, self.y, self.y_val = data + + self.clf = RandomForestClassifier(n_estimators=50, + max_depth=10, + random_state=0) + + cv = 16 if Benchmark.data_size == 'large' else 4 + + self.cv_params = {'n_jobs': n_jobs, + 'cv': cv} + + def time_crossval(self, *args): + cross_val_score(self.clf, self.X, self.y, **self.cv_params) + + def peakmem_crossval(self, *args): + cross_val_score(self.clf, self.X, self.y, **self.cv_params) + + def track_crossval(self, *args): + return float(cross_val_score(self.clf, self.X, + self.y, **self.cv_params).mean()) + + +class GridSearchBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmarks for GridSearch. 
+ """ + + timeout = 20000 + + param_names = ['n_jobs'] + params = (Benchmark.n_jobs_vals,) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + data = _synth_classification_dataset(n_samples=10000, n_features=100) + + return data + + def make_estimator(self, params): + n_jobs, = params + + clf = RandomForestClassifier(random_state=0) + + if Benchmark.data_size == 'large': + n_estimators_list = [10, 25, 50, 100, 500] + max_depth_list = [5, 10, None] + max_features_list = [0.1, 0.4, 0.8, 1.0] + else: + n_estimators_list = [10, 25, 50] + max_depth_list = [5, 10] + max_features_list = [0.1, 0.4, 0.8] + + param_grid = {'n_estimators': n_estimators_list, + 'max_depth': max_depth_list, + 'max_features': max_features_list} + + estimator = GridSearchCV(clf, param_grid, n_jobs=n_jobs, cv=4) + + return estimator + + def make_scorers(self): + make_gen_classif_scorers(self) diff --git a/asv_benchmarks/benchmarks/neighbors.py b/asv_benchmarks/benchmarks/neighbors.py new file mode 100644 index 0000000000000..2be6cc2f09364 --- /dev/null +++ b/asv_benchmarks/benchmarks/neighbors.py @@ -0,0 +1,42 @@ +from sklearn.neighbors import KNeighborsClassifier + +from .common import Benchmark, Estimator, Predictor +from .datasets import _20newsgroups_lowdim_dataset +from .utils import make_gen_classif_scorers + + +class KNeighborsClassifierBenchmark(Predictor, Estimator, Benchmark): + """ + Benchmarks for KNeighborsClassifier. + """ + + param_names = ['algorithm', 'dimension', 'n_jobs'] + params = (['brute', 'kd_tree', 'ball_tree'], + ['low', 'high'], + Benchmark.n_jobs_vals) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + algorithm, dimension, n_jobs = params + + if Benchmark.data_size == 'large': + n_components = 40 if dimension == 'low' else 200 + else: + n_components = 10 if dimension == 'low' else 50 + + data = _20newsgroups_lowdim_dataset(n_components=n_components) + + return data + + def make_estimator(self, params): + algorithm, dimension, n_jobs = params + + estimator = KNeighborsClassifier(algorithm=algorithm, + n_jobs=n_jobs) + + return estimator + + def make_scorers(self): + make_gen_classif_scorers(self) diff --git a/asv_benchmarks/benchmarks/svm.py b/asv_benchmarks/benchmarks/svm.py new file mode 100644 index 0000000000000..bbcc7a27edecf --- /dev/null +++ b/asv_benchmarks/benchmarks/svm.py @@ -0,0 +1,32 @@ +from sklearn.svm import SVC + +from .common import Benchmark, Estimator, Predictor +from .datasets import _synth_classification_dataset +from .utils import make_gen_classif_scorers + + +class SVCBenchmark(Predictor, Estimator, Benchmark): + """Benchmarks for SVC.""" + + param_names = ['kernel'] + params = (['linear', 'poly', 'rbf', 'sigmoid'],) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + return _synth_classification_dataset() + + def make_estimator(self, params): + kernel, = params + + estimator = SVC(max_iter=100, + tol=1e-16, + kernel=kernel, + random_state=0, + gamma='scale') + + return estimator + + def make_scorers(self): + make_gen_classif_scorers(self) diff --git a/asv_benchmarks/benchmarks/utils.py b/asv_benchmarks/benchmarks/utils.py new file mode 100644 index 0000000000000..6a3073a634169 --- /dev/null +++ b/asv_benchmarks/benchmarks/utils.py @@ -0,0 +1,44 @@ +import numpy as np + +from sklearn.metrics import balanced_accuracy_score, r2_score + + +def neg_mean_inertia(X, labels, centers): + return - (np.asarray(X - centers[labels])**2).sum(axis=1).mean() + + +def 
make_gen_classif_scorers(caller): + caller.train_scorer = balanced_accuracy_score + caller.test_scorer = balanced_accuracy_score + + +def make_gen_reg_scorers(caller): + caller.test_scorer = r2_score + caller.train_scorer = r2_score + + +def neg_mean_data_error(X, U, V): + return - np.sqrt(((X - U.dot(V))**2).mean()) + + +def make_dict_learning_scorers(caller): + caller.train_scorer = lambda _, __: ( + neg_mean_data_error(caller.X, + caller.estimator.transform(caller.X), + caller.estimator.components_)) + caller.test_scorer = lambda _, __: ( + neg_mean_data_error(caller.X_val, + caller.estimator.transform(caller.X_val), + caller.estimator.components_)) + + +def explained_variance_ratio(Xt, X): + return np.var(Xt, axis=0).sum() / np.var(X, axis=0).sum() + + +def make_pca_scorers(caller): + caller.train_scorer = ( + lambda _, __: caller.estimator.explained_variance_ratio_.sum()) + caller.test_scorer = lambda _, __: ( + explained_variance_ratio(caller.estimator.transform(caller.X_val), + caller.X_val)) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index aafa06250e273..f24a1ba11bf77 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -396,9 +396,9 @@ complies with the following rules before marking a PR as ``[MRG]``. The the keywords (e.g., ``See also #1234``). 9. PRs should often substantiate the change, through benchmarks of - performance and efficiency or through examples of usage. Examples also - illustrate the features and intricacies of the library to users. Have a - look at other examples in the `examples/ + performance and efficiency (see :ref:`monitoring_performances`) or through + examples of usage. Examples also illustrate the features and intricacies of + the library to users. Have a look at other examples in the `examples/ `_ directory for reference. Examples should demonstrate why the new functionality is useful in practice and, if possible, compare it to other @@ -442,8 +442,7 @@ You can check for common programming errors with the following tools: - on properties with decorators Bonus points for contributions that include a performance analysis with -a benchmark script and profiling output (please report on the mailing -list or on the GitHub issue). +a benchmark script and profiling output (see :ref:`monitoring_performances`). Also check out the :ref:`performance-howto` guide for more details on profiling and Cython optimizations. @@ -810,6 +809,99 @@ To test code coverage, you need to install the `coverage 3. Loop. +.. _monitoring_performances: + +Monitoring performance +====================== + +*This section is heavily inspired by the* `pandas documentation +`_. + +When proposing changes to the existing code base, it's important to make sure +that they don't introduce performance regressions. Scikit-learn uses +`asv benchmarks `_ to monitor the +performance of a selection of common estimators and functions. The benchmark +suite can be found in the `scikit-learn/asv_benchmarks` directory. + +To use all features of asv, you will need either `conda` or `virtualenv`. For +more details, please check the `asv installation webpage +`_. + +First of all, you need to install the development version of asv:: + + pip install git+https://github.com/airspeed-velocity/asv + +and change your directory to `asv_benchmarks/`:: + + cd asv_benchmarks/ + +The benchmark suite is configured to run against your local clone of +scikit-learn.
Make sure it is up to date:: + + git fetch upstream + +In the benchmark suite, the benchmarks are organized following the same +structure as scikit-learn. For example, you can compare the performance of a +specific estimator between upstream/master and the branch you are working on:: + + asv continuous -b LogisticRegression upstream/master HEAD + +The command uses conda by default for creating the benchmark environments. If +you want to use virtualenv instead, use the `-E` flag:: + + asv continuous -E virtualenv -b LogisticRegression upstream/master HEAD + +You can also specify a whole module to benchmark:: + + asv continuous -b linear_model upstream/master HEAD + +You can replace `HEAD` with any local branch. By default, it will only report +the benchmarks that have changed by at least 10%. You can control this ratio +with the `-f` flag. + +To run the full benchmark suite, simply remove the `-b` flag:: + + asv continuous upstream/master HEAD + +However, this can take up to two hours. The `-b` flag also accepts a regular +expression for a more complex subset of benchmarks to run. + +To run the benchmarks without comparing to another branch, use the `run` +command:: + + asv run -b linear_model HEAD^! + +You can also run the benchmark suite using the version of scikit-learn already +installed in your current Python environment:: + + asv run --python=same + +It's particularly useful when you have installed scikit-learn in editable +mode, as it avoids creating a new environment each time you run the +benchmarks. By default the results are not saved when using an existing +installation. To save the results you must specify a commit hash:: + + asv run --python=same --set-commit-hash=<commit hash> + +Benchmarks are saved and organized by machine, environment and commit. To see +the list of all saved benchmarks:: + + asv show + +and to see the report of a specific run:: + + asv show <commit hash> + +When running benchmarks for a pull request you're working on, please report +the results on GitHub. + +The benchmark suite supports additional configurable options which can be set +in the `benchmarks/config.json` configuration file. For example, the +benchmarks can be run for a provided list of values of the `n_jobs` parameter. + +More information on how to write a benchmark and how to use asv can be found +in the `asv documentation `_. + Issue Tracker Tags ================== diff --git a/setup.cfg b/setup.cfg index f086993b26a29..1a09d8872e9b1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,6 +11,7 @@ addopts = --ignore doc --ignore examples --ignore maint_tools + --ignore asv_benchmarks --doctest-modules --disable-pytest-warnings -rxXs
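For orientation, a new benchmark module added under `asv_benchmarks/benchmarks/` would follow the same pattern as the classes introduced in this patch. The sketch below is purely illustrative and is not part of the patch: the estimator (`DecisionTreeClassifier`) and the parameter values are hypothetical choices, while the `Benchmark`, `Estimator` and `Predictor` mixins, the cached dataset helper and the generic scorers are the ones defined above::

    # Hypothetical benchmark module, e.g. asv_benchmarks/benchmarks/tree.py,
    # written against the mixins and helpers defined in this patch.
    from sklearn.tree import DecisionTreeClassifier

    from .common import Benchmark, Estimator, Predictor
    from .datasets import _synth_classification_dataset
    from .utils import make_gen_classif_scorers


    class DecisionTreeClassifierBenchmark(Predictor, Estimator, Benchmark):
        """Illustrative benchmarks for DecisionTreeClassifier."""

        param_names = ['max_depth']
        params = ([5, None],)

        def setup_cache(self):
            # Fits and pickles the estimator once per parameter combination.
            super().setup_cache()

        def make_data(self, params):
            # Datasets are cached with joblib.Memory, so repeated calls are cheap.
            return _synth_classification_dataset(n_samples=10000, n_features=100)

        def make_estimator(self, params):
            max_depth, = params
            return DecisionTreeClassifier(max_depth=max_depth, random_state=0)

        def make_scorers(self):
            # Reuses the balanced-accuracy train/test scorers from utils.py.
            make_gen_classif_scorers(self)

With such a module in place, `asv continuous -b DecisionTreeClassifier upstream/master HEAD` would pick it up like any other benchmark in the suite.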