MNT Applies black formatting to most of the code base by thomasjpfan · Pull Request #18948 · scikit-learn/scikit-learn


Merged · 14 commits · Jun 17, 2021
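black is an opinionated formatter, and the diffs below are mechanical: no behavior change is intended. The transformations that recur throughout are single quotes normalized to double quotes, black's default 88-character line length, short multi-line literals collapsed onto one line, and long call signatures exploded to one argument per line with a trailing comma. A minimal before/after sketch of that shape (a constructed example, not code from this PR):

from sklearn.cluster import KMeans

# Before black: hanging indent aligned to the opening parenthesis.
estimator = KMeans(n_clusters=20,
                   algorithm='full',
                   init='k-means++')

# After black: one argument per line, double quotes, magic trailing comma.
estimator = KMeans(
    n_clusters=20,
    algorithm="full",
    init="k-means++",
)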
10 changes: 2 additions & 8 deletions .github/scripts/label_title_regex.py
@@ -15,15 +15,9 @@
 title = issue.title


-regex_to_labels = [
-    (r"\bDOC\b", "Documentation"),
-    (r"\bCI\b", "Build / CI")
-]
+regex_to_labels = [(r"\bDOC\b", "Documentation"), (r"\bCI\b", "Build / CI")]

-labels_to_add = [
-    label for regex, label in regex_to_labels
-    if re.search(regex, title)
-]
+labels_to_add = [label for regex, label in regex_to_labels if re.search(regex, title)]

 if labels_to_add:
     issue.add_to_labels(*labels_to_add)
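The collapsed one-liners behave exactly like the multi-line originals; a quick self-contained sketch of how the regex-to-label mapping resolves (the PR title here is hypothetical):

import re

regex_to_labels = [(r"\bDOC\b", "Documentation"), (r"\bCI\b", "Build / CI")]

# \b word boundaries keep "DOC" from matching inside a longer token.
title = "DOC Fix typo in the KMeans docstring"
labels_to_add = [label for regex, label in regex_to_labels if re.search(regex, title)]
print(labels_to_add)  # ['Documentation']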
82 changes: 43 additions & 39 deletions asv_benchmarks/benchmarks/cluster.py
@@ -10,16 +10,16 @@ class KMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
     Benchmarks for KMeans.
     """

-    param_names = ['representation', 'algorithm', 'init']
-    params = (['dense', 'sparse'], ['full', 'elkan'], ['random', 'k-means++'])
+    param_names = ["representation", "algorithm", "init"]
+    params = (["dense", "sparse"], ["full", "elkan"], ["random", "k-means++"])

     def setup_cache(self):
         super().setup_cache()

     def make_data(self, params):
         representation, algorithm, init = params

-        if representation == 'sparse':
+        if representation == "sparse":
             data = _20newsgroups_highdim_dataset(n_samples=8000)
         else:
             data = _blobs_dataset(n_clusters=20)
@@ -29,44 +29,46 @@ def make_data(self, params):
     def make_estimator(self, params):
         representation, algorithm, init = params

-        max_iter = 30 if representation == 'sparse' else 100
+        max_iter = 30 if representation == "sparse" else 100

-        estimator = KMeans(n_clusters=20,
-                           algorithm=algorithm,
-                           init=init,
-                           n_init=1,
-                           max_iter=max_iter,
-                           tol=-1,
-                           random_state=0)
+        estimator = KMeans(
+            n_clusters=20,
+            algorithm=algorithm,
+            init=init,
+            n_init=1,
+            max_iter=max_iter,
+            tol=-1,
+            random_state=0,
+        )

         return estimator

     def make_scorers(self):
-        self.train_scorer = (
-            lambda _, __: neg_mean_inertia(self.X,
-                                           self.estimator.predict(self.X),
-                                           self.estimator.cluster_centers_))
-        self.test_scorer = (
-            lambda _, __: neg_mean_inertia(self.X_val,
-                                           self.estimator.predict(self.X_val),
-                                           self.estimator.cluster_centers_))
+        self.train_scorer = lambda _, __: neg_mean_inertia(
+            self.X, self.estimator.predict(self.X), self.estimator.cluster_centers_
+        )
+        self.test_scorer = lambda _, __: neg_mean_inertia(
+            self.X_val,
+            self.estimator.predict(self.X_val),
+            self.estimator.cluster_centers_,
+        )


 class MiniBatchKMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
     """
     Benchmarks for MiniBatchKMeans.
     """

-    param_names = ['representation', 'init']
-    params = (['dense', 'sparse'], ['random', 'k-means++'])
+    param_names = ["representation", "init"]
+    params = (["dense", "sparse"], ["random", "k-means++"])

     def setup_cache(self):
         super().setup_cache()

     def make_data(self, params):
         representation, init = params

-        if representation == 'sparse':
+        if representation == "sparse":
             data = _20newsgroups_highdim_dataset()
         else:
             data = _blobs_dataset(n_clusters=20)
@@ -76,25 +78,27 @@ def make_data(self, params):
     def make_estimator(self, params):
         representation, init = params

-        max_iter = 5 if representation == 'sparse' else 2
+        max_iter = 5 if representation == "sparse" else 2

-        estimator = MiniBatchKMeans(n_clusters=20,
-                                    init=init,
-                                    n_init=1,
-                                    max_iter=max_iter,
-                                    batch_size=1000,
-                                    max_no_improvement=None,
-                                    compute_labels=False,
-                                    random_state=0)
+        estimator = MiniBatchKMeans(
+            n_clusters=20,
+            init=init,
+            n_init=1,
+            max_iter=max_iter,
+            batch_size=1000,
+            max_no_improvement=None,
+            compute_labels=False,
+            random_state=0,
+        )

         return estimator

     def make_scorers(self):
-        self.train_scorer = (
-            lambda _, __: neg_mean_inertia(self.X,
-                                           self.estimator.predict(self.X),
-                                           self.estimator.cluster_centers_))
-        self.test_scorer = (
-            lambda _, __: neg_mean_inertia(self.X_val,
-                                           self.estimator.predict(self.X_val),
-                                           self.estimator.cluster_centers_))
+        self.train_scorer = lambda _, __: neg_mean_inertia(
+            self.X, self.estimator.predict(self.X), self.estimator.cluster_centers_
+        )
+        self.test_scorer = lambda _, __: neg_mean_inertia(
+            self.X_val,
+            self.estimator.predict(self.X_val),
+            self.estimator.cluster_centers_,
+        )
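For context on what the reshaped lambdas compute: neg_mean_inertia is defined in the benchmarks' utils module and does not appear in this diff. A plausible dense-input sketch of it, labeled as an assumption rather than the actual definition:

import numpy as np

def neg_mean_inertia(X, labels, centers):
    # Negative mean squared distance from each sample to its assigned
    # center; negated so that, as with scikit-learn scorers, higher is better.
    return -np.mean(np.sum((X - centers[labels]) ** 2, axis=1))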
125 changes: 73 additions & 52 deletions asv_benchmarks/benchmarks/common.py
@@ -14,86 +14,102 @@ def get_from_config():
"""Get benchmarks configuration from the config.json file"""
current_path = Path(__file__).resolve().parent

config_path = current_path / 'config.json'
with open(config_path, 'r') as config_file:
config_file = ''.join(line for line in config_file
if line and '//' not in line)
config_path = current_path / "config.json"
with open(config_path, "r") as config_file:
config_file = "".join(line for line in config_file if line and "//" not in line)
config = json.loads(config_file)

profile = os.getenv('SKLBENCH_PROFILE', config['profile'])
profile = os.getenv("SKLBENCH_PROFILE", config["profile"])

n_jobs_vals_env = os.getenv('SKLBENCH_NJOBS')
n_jobs_vals_env = os.getenv("SKLBENCH_NJOBS")
if n_jobs_vals_env:
n_jobs_vals = eval(n_jobs_vals_env)
else:
n_jobs_vals = config['n_jobs_vals']
n_jobs_vals = config["n_jobs_vals"]
if not n_jobs_vals:
n_jobs_vals = list(range(1, 1 + cpu_count()))

cache_path = current_path / 'cache'
cache_path = current_path / "cache"
cache_path.mkdir(exist_ok=True)
(cache_path / 'estimators').mkdir(exist_ok=True)
(cache_path / 'tmp').mkdir(exist_ok=True)
(cache_path / "estimators").mkdir(exist_ok=True)
(cache_path / "tmp").mkdir(exist_ok=True)

save_estimators = os.getenv('SKLBENCH_SAVE_ESTIMATORS',
config['save_estimators'])
save_dir = os.getenv('ASV_COMMIT', 'new')[:8]
save_estimators = os.getenv("SKLBENCH_SAVE_ESTIMATORS", config["save_estimators"])
save_dir = os.getenv("ASV_COMMIT", "new")[:8]

if save_estimators:
(cache_path / 'estimators' / save_dir).mkdir(exist_ok=True)
(cache_path / "estimators" / save_dir).mkdir(exist_ok=True)

base_commit = os.getenv('SKLBENCH_BASE_COMMIT', config['base_commit'])
base_commit = os.getenv("SKLBENCH_BASE_COMMIT", config["base_commit"])

bench_predict = os.getenv('SKLBENCH_PREDICT', config['bench_predict'])
bench_transform = os.getenv('SKLBENCH_TRANSFORM',
config['bench_transform'])
bench_predict = os.getenv("SKLBENCH_PREDICT", config["bench_predict"])
bench_transform = os.getenv("SKLBENCH_TRANSFORM", config["bench_transform"])

return (profile, n_jobs_vals, save_estimators, save_dir, base_commit,
bench_predict, bench_transform)
return (
profile,
n_jobs_vals,
save_estimators,
save_dir,
base_commit,
bench_predict,
bench_transform,
)


def get_estimator_path(benchmark, directory, params, save=False):
"""Get path of pickled fitted estimator"""
path = Path(__file__).resolve().parent / 'cache'
path = (path / 'estimators' / directory) if save else (path / 'tmp')
path = Path(__file__).resolve().parent / "cache"
path = (path / "estimators" / directory) if save else (path / "tmp")

filename = (benchmark.__class__.__name__
+ '_estimator_' + '_'.join(list(map(str, params))) + '.pkl')
filename = (
benchmark.__class__.__name__
+ "_estimator_"
+ "_".join(list(map(str, params)))
+ ".pkl"
)

return path / filename


def clear_tmp():
"""Clean the tmp directory"""
path = Path(__file__).resolve().parent / 'cache' / 'tmp'
path = Path(__file__).resolve().parent / "cache" / "tmp"
for child in path.iterdir():
child.unlink()


class Benchmark(ABC):
"""Abstract base class for all the benchmarks"""

timer = timeit.default_timer # wall time
processes = 1
timeout = 500

(profile, n_jobs_vals, save_estimators, save_dir, base_commit,
bench_predict, bench_transform) = get_from_config()

if profile == 'fast':
(
profile,
n_jobs_vals,
save_estimators,
save_dir,
base_commit,
bench_predict,
bench_transform,
) = get_from_config()

if profile == "fast":
warmup_time = 0
repeat = 1
number = 1
min_run_count = 1
data_size = 'small'
elif profile == 'regular':
data_size = "small"
elif profile == "regular":
warmup_time = 1
repeat = (3, 100, 30)
data_size = 'small'
elif profile == 'large_scale':
data_size = "small"
elif profile == "large_scale":
warmup_time = 1
repeat = 3
number = 1
data_size = 'large'
data_size = "large"

@property
@abstractmethod
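get_from_config() resolves every setting with the same pattern: an environment variable, when set, overrides the value parsed from config.json. A minimal self-contained sketch of that pattern (the dict literal stands in for the parsed file):

import os

config = {"profile": "regular"}  # stand-in for json.loads(...) of config.json
profile = os.getenv("SKLBENCH_PROFILE", config["profile"])
print(profile)  # "regular" unless SKLBENCH_PROFILE is set in the environment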
@@ -103,6 +119,7 @@ def params(self):

 class Estimator(ABC):
     """Abstract base class for all benchmarks of estimators"""
+
     @abstractmethod
     def make_data(self, params):
         """Return the dataset for a combination of parameters"""
@@ -112,8 +129,7 @@ def make_data(self, params):

     @abstractmethod
     def make_estimator(self, params):
-        """Return an instance of the estimator for a combination of parameters
-        """
+        """Return an instance of the estimator for a combination of parameters"""
         pass

     def skip(self, params):
@@ -137,9 +153,10 @@ def setup_cache(self):

             estimator.fit(X, y)

-            est_path = get_estimator_path(self, Benchmark.save_dir,
-                                          params, Benchmark.save_estimators)
-            with est_path.open(mode='wb') as f:
+            est_path = get_estimator_path(
+                self, Benchmark.save_dir, params, Benchmark.save_estimators
+            )
+            with est_path.open(mode="wb") as f:
                 pickle.dump(estimator, f)

     def setup(self, *params):
@@ -152,9 +169,10 @@ def setup(self, *params):

         self.X, self.X_val, self.y, self.y_val = self.make_data(params)

-        est_path = get_estimator_path(self, Benchmark.save_dir,
-                                      params, Benchmark.save_estimators)
-        with est_path.open(mode='rb') as f:
+        est_path = get_estimator_path(
+            self, Benchmark.save_dir, params, Benchmark.save_estimators
+        )
+        with est_path.open(mode="rb") as f:
             self.estimator = pickle.load(f)

         self.make_scorers()
@@ -166,14 +184,14 @@ def peakmem_fit(self, *args):
         self.estimator.fit(self.X, self.y)

     def track_train_score(self, *args):
-        if hasattr(self.estimator, 'predict'):
+        if hasattr(self.estimator, "predict"):
             y_pred = self.estimator.predict(self.X)
         else:
             y_pred = None
         return float(self.train_scorer(self.y, y_pred))

     def track_test_score(self, *args):
-        if hasattr(self.estimator, 'predict'):
+        if hasattr(self.estimator, "predict"):
             y_val_pred = self.estimator.predict(self.X_val)
         else:
             y_val_pred = None
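A note on why these method names must survive reformatting untouched: asv (airspeed velocity) discovers benchmarks by name prefix, timing time_* methods, measuring peak memory for peakmem_* methods, and recording the return value of track_* methods. black only changes layout, so the prefixes are preserved. A minimal constructed example of the convention:

class TrackExample:
    def track_answer(self):
        # asv stores and plots the returned number across commits.
        return 42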
@@ -182,18 +200,20 @@ def track_test_score(self, *args):

 class Predictor(ABC):
     """Abstract base class for benchmarks of estimators implementing predict"""
+
     if Benchmark.bench_predict:
+
         def time_predict(self, *args):
             self.estimator.predict(self.X)

         def peakmem_predict(self, *args):
             self.estimator.predict(self.X)

     if Benchmark.base_commit is not None:
+
         def track_same_prediction(self, *args):
-            est_path = get_estimator_path(self, Benchmark.base_commit,
-                                          args, True)
-            with est_path.open(mode='rb') as f:
+            est_path = get_estimator_path(self, Benchmark.base_commit, args, True)
+            with est_path.open(mode="rb") as f:
                 estimator_base = pickle.load(f)

             y_val_pred_base = estimator_base.predict(self.X_val)
@@ -208,20 +228,21 @@ def params(self):


 class Transformer(ABC):
-    """Abstract base class for benchmarks of estimators implementing transform
-    """
+    """Abstract base class for benchmarks of estimators implementing transform"""
+
     if Benchmark.bench_transform:
+
         def time_transform(self, *args):
             self.estimator.transform(self.X)

         def peakmem_transform(self, *args):
             self.estimator.transform(self.X)

     if Benchmark.base_commit is not None:
+
         def track_same_transform(self, *args):
-            est_path = get_estimator_path(self, Benchmark.base_commit,
-                                          args, True)
-            with est_path.open(mode='rb') as f:
+            est_path = get_estimator_path(self, Benchmark.base_commit, args, True)
+            with est_path.open(mode="rb") as f:
                 estimator_base = pickle.load(f)

             X_val_t_base = estimator_base.transform(self.X_val)
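After a sweep like this, projects typically keep formatting from regressing by running black in check mode, which exits nonzero when any file would be reformatted. A sketch of that pattern (this PR's actual CI wiring is not shown on this page):

import subprocess

# Exit code 0 means every file is already black-formatted.
result = subprocess.run(["black", "--check", "."])
print(result.returncode)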