MNT Add asv benchmark suite (#17026) · jayzed82/scikit-learn@dc7c97c · GitHub

Commit dc7c97c

jeremiedbb authored and jayzed82 committed
MNT Add asv benchmark suite (scikit-learn#17026)
1 parent 3d421d2 commit dc7c97c

18 files changed: +1488 −5 lines

asv_benchmarks/.gitignore

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
*__pycache__*
env/
html/
results/
scikit-learn/
benchmarks/cache/

asv_benchmarks/asv.conf.json

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
{
    // The version of the config file format. Do not change, unless
    // you know what you are doing.
    "version": 1,

    // The name of the project being benchmarked
    "project": "scikit-learn",

    // The project's homepage
    "project_url": "scikit-learn.org/",

    // The URL or local path of the source code repository for the
    // project being benchmarked
    "repo": "..",

    // The Python project's subdirectory in your repo. If missing or
    // the empty string, the project is assumed to be located at the root
    // of the repository.
    // "repo_subdir": "",

    // Customizable commands for building, installing, and
    // uninstalling the project. See asv.conf.json documentation.
    //
    // "install_command": ["python -mpip install {wheel_file}"],
    // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
    // "build_command": [
    //     "python setup.py build",
    //     "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
    // ],

    // List of branches to benchmark. If not provided, defaults to "master"
    // (for git) or "default" (for mercurial).
    // "branches": ["master"], // for git
    // "branches": ["default"], // for mercurial

    // The DVCS being used. If not set, it will be automatically
    // determined from "repo" by looking at the protocol in the URL
    // (if remote), or by looking for special directories, such as
    // ".git" (if local).
    // "dvcs": "git",

    // The tool to use to create environments. May be "conda",
    // "virtualenv" or other value depending on the plugins in use.
    // If missing or the empty string, the tool will be automatically
    // determined by looking for tools on the PATH environment
    // variable.
    "environment_type": "conda",

    // timeout in seconds for installing any dependencies in environment
    // defaults to 10 min
    //"install_timeout": 600,

    // the base URL to show a commit for the project.
    "show_commit_url": "https://github.com/scikit-learn/scikit-learn/commit/",

    // The Pythons you'd like to test against. If not provided, defaults
    // to the current version of Python used to run `asv`.
    // "pythons": ["3.6"],

    // The list of conda channel names to be searched for benchmark
    // dependency packages in the specified order
    // "conda_channels": ["conda-forge", "defaults"]

    // The matrix of dependencies to test. Each key is the name of a
    // package (in PyPI) and the values are version numbers. An empty
    // list or empty string indicates to just test against the default
    // (latest) version. null indicates that the package is to not be
    // installed. If the package to be tested is only available from
    // PyPi, and the 'environment_type' is conda, then you can preface
    // the package name by 'pip+', and the package will be installed via
    // pip (with all the conda available packages installed first,
    // followed by the pip installed packages).
    //
    "matrix": {
        "numpy": [],
        "scipy": [],
        "cython": [],
        "joblib": [],
        "threadpoolctl": []
    },

    // Combinations of libraries/python versions can be excluded/included
    // from the set to test. Each entry is a dictionary containing additional
    // key-value pairs to include/exclude.
    //
    // An exclude entry excludes entries where all values match. The
    // values are regexps that should match the whole string.
    //
    // An include entry adds an environment. Only the packages listed
    // are installed. The 'python' key is required. The exclude rules
    // do not apply to includes.
    //
    // In addition to package names, the following keys are available:
    //
    // - python
    //     Python version, as in the *pythons* variable above.
    // - environment_type
    //     Environment type, as above.
    // - sys_platform
    //     Platform, as in sys.platform. Possible values for the common
    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
    //
    // "exclude": [
    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
    // ],
    //
    // "include": [
    //     // additional env for python2.7
    //     {"python": "2.7", "numpy": "1.8"},
    //     // additional env if run on windows+conda
    //     {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
    // ],

    // The directory (relative to the current directory) that benchmarks are
    // stored in. If not provided, defaults to "benchmarks"
    // "benchmark_dir": "benchmarks",

    // The directory (relative to the current directory) to cache the Python
    // environments in. If not provided, defaults to "env"
    // "env_dir": "env",

    // The directory (relative to the current directory) that raw benchmark
    // results are stored in. If not provided, defaults to "results".
    // "results_dir": "results",

    // The directory (relative to the current directory) that the html tree
    // should be written to. If not provided, defaults to "html".
    // "html_dir": "html",

    // The number of characters to retain in the commit hashes.
    // "hash_length": 8,

    // `asv` will cache results of the recent builds in each
    // environment, making them faster to install next time. This is
    // the number of builds to keep, per environment.
    // "build_cache_size": 2,

    // The commits after which the regression search in `asv publish`
    // should start looking for regressions. Dictionary whose keys are
    // regexps matching to benchmark names, and values corresponding to
    // the commit (exclusive) after which to start looking for
    // regressions. The default is to start from the first commit
    // with results. If the commit is `null`, regression detection is
    // skipped for the matching benchmark.
    //
    // "regressions_first_commits": {
    //     "some_benchmark": "352cdf", // Consider regressions only after this commit
    //     "another_benchmark": null,  // Skip regression detection altogether
    // },

    // The thresholds for relative change in results, after which `asv
    // publish` starts reporting regressions. Dictionary of the same
    // form as in ``regressions_first_commits``, with values
    // indicating the thresholds. If multiple entries match, the
    // maximum is taken. If no entry matches, the default is 5%.
    //
    // "regressions_thresholds": {
    //     "some_benchmark": 0.01,  // Threshold of 1%
    //     "another_benchmark": 0.5, // Threshold of 50%
    // },
}
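Note: asv discovers benchmarks in the directory configured by "benchmark_dir" (here the default, benchmarks/) by importing its modules and collecting methods whose names start with time_, peakmem_ or track_, optionally parametrized through params/param_names. The mixin classes this suite uses below (Benchmark, Estimator, Predictor, Transformer in benchmarks/common.py, which is not part of this excerpt) presumably generate such methods; the hypothetical stand-alone sketch below only illustrates the raw asv conventions and is not part of the commit.

# Hypothetical example of the plain asv conventions this suite builds on;
# it is not part of the scikit-learn benchmark suite itself.
import numpy as np

from sklearn.linear_model import LinearRegression


class LinearRegressionSuite:
    # asv runs each benchmark once per combination of these parameters.
    param_names = ['n_samples']
    params = ([1000, 10000],)

    def setup(self, n_samples):
        rng = np.random.RandomState(0)
        self.X = rng.rand(n_samples, 10)
        self.y = rng.rand(n_samples)

    def time_fit(self, n_samples):
        # Wall-clock time of fitting, discovered through the time_ prefix.
        LinearRegression().fit(self.X, self.y)

    def peakmem_fit(self, n_samples):
        # Peak memory while fitting, discovered through the peakmem_ prefix.
        LinearRegression().fit(self.X, self.y)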

asv_benchmarks/benchmarks/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
"""Benchmark suite for scikit-learn using ASV"""

asv_benchmarks/benchmarks/cluster.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
from sklearn.cluster import KMeans, MiniBatchKMeans

from .common import Benchmark, Estimator, Predictor, Transformer
from .datasets import _blobs_dataset, _20newsgroups_highdim_dataset
from .utils import neg_mean_inertia


class KMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
    """
    Benchmarks for KMeans.
    """

    param_names = ['representation', 'algorithm', 'init']
    params = (['dense', 'sparse'], ['full', 'elkan'], ['random', 'k-means++'])

    def setup_cache(self):
        super().setup_cache()

    def make_data(self, params):
        representation, algorithm, init = params

        if representation == 'sparse':
            data = _20newsgroups_highdim_dataset(n_samples=8000)
        else:
            data = _blobs_dataset(n_clusters=20)

        return data

    def make_estimator(self, params):
        representation, algorithm, init = params

        max_iter = 30 if representation == 'sparse' else 100

        estimator = KMeans(n_clusters=20,
                           algorithm=algorithm,
                           init=init,
                           n_init=1,
                           max_iter=max_iter,
                           tol=-1,
                           random_state=0)

        return estimator

    def make_scorers(self):
        self.train_scorer = (
            lambda _, __: neg_mean_inertia(self.X,
                                           self.estimator.predict(self.X),
                                           self.estimator.cluster_centers_))
        self.test_scorer = (
            lambda _, __: neg_mean_inertia(self.X_val,
                                           self.estimator.predict(self.X_val),
                                           self.estimator.cluster_centers_))


class MiniBatchKMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
    """
    Benchmarks for MiniBatchKMeans.
    """

    param_names = ['representation', 'init']
    params = (['dense', 'sparse'], ['random', 'k-means++'])

    def setup_cache(self):
        super().setup_cache()

    def make_data(self, params):
        representation, init = params

        if representation == 'sparse':
            data = _20newsgroups_highdim_dataset()
        else:
            data = _blobs_dataset(n_clusters=20)

        return data

    def make_estimator(self, params):
        representation, init = params

        max_iter = 5 if representation == 'sparse' else 2

        estimator = MiniBatchKMeans(n_clusters=20,
                                    init=init,
                                    n_init=1,
                                    max_iter=max_iter,
                                    batch_size=1000,
                                    max_no_improvement=None,
                                    compute_labels=False,
                                    random_state=0)

        return estimator

    def make_scorers(self):
        self.train_scorer = (
            lambda _, __: neg_mean_inertia(self.X,
                                           self.estimator.predict(self.X),
                                           self.estimator.cluster_centers_))
        self.test_scorer = (
            lambda _, __: neg_mean_inertia(self.X_val,
                                           self.estimator.predict(self.X_val),
                                           self.estimator.cluster_centers_))
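Note: neg_mean_inertia is imported from benchmarks/utils.py, which is not part of this excerpt. Judging from its name and call signature, neg_mean_inertia(X, labels, centers), it presumably returns the negative mean squared distance of each sample to its assigned cluster center, negated so that higher is better. A rough sketch under that assumption:

# Hypothetical sketch of the neg_mean_inertia helper used above; the actual
# implementation lives in benchmarks/utils.py and may differ.
import numpy as np
from scipy import sparse


def neg_mean_inertia(X, labels, centers):
    # Densify for simplicity; the real helper may treat sparse input differently.
    X = np.asarray(X.todense()) if sparse.issparse(X) else X
    diff = X - centers[labels]
    return -np.mean(np.einsum('ij,ij->i', diff, diff))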
