MAINT Enable and run black on examples by thomasjpfan · Pull Request #20502 · scikit-learn/scikit-learn
MAINT Enable and run black on examples #20502

Merged · 2 commits · Oct 7, 2021
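Per the title, the commits enable and run black over examples/, so the hunks below are mechanical reformatting: single-quoted strings become double-quoted, bare decimals like .01 gain a leading zero, backslash continuations disappear, and calls that no longer fit on one line are exploded. As a rough illustration of how any hunk below is produced — a minimal sketch using black's public Python API, not necessarily the invocation this PR used, and assuming black's default Mode() with its 88-character line length:

# A minimal sketch, assuming `pip install black` and default settings; the
# exact configuration this PR enables is not shown on this page.
import black

# Pre-black source taken from the plot_digits_denoising.py hunk below. Note
# that format_str only parses the snippet, so the undefined KernelPCA name
# is harmless: the code is never imported or executed.
old_code = (
    'kernel_pca = KernelPCA(n_components=400, kernel="rbf", gamma=1e-3,\n'
    "                       fit_inverse_transform=True, alpha=5e-3)\n"
)

# Re-emit the snippet in black's canonical style; since the call exceeds the
# line-length limit, black moves the arguments onto their own indented line,
# which should match the "+" side of the corresponding hunk below.
print(black.format_str(old_code, mode=black.Mode()))

From the command line, black examples/ rewrites files in place, while black --check examples/ only reports which files would change.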
examples/applications/plot_digits_denoising.py (29 changes: 17 additions & 12 deletions)
@@ -87,9 +87,9 @@ def plot_digits(X, title):
 # Let's first have a look to see the difference between noise-free and noisy
 # images. We will check the test set in this regard.
 plot_digits(X_test, "Uncorrupted test images")
-plot_digits(X_test_noisy,
-            f"Noisy test images\n"
-            f"MSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}")
+plot_digits(
+    X_test_noisy, f"Noisy test images\nMSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}"
+)
 
 # %%
 # Learn the `PCA` basis
@@ -100,8 +100,9 @@ def plot_digits(X, title):
 from sklearn.decomposition import PCA, KernelPCA
 
 pca = PCA(n_components=32)
-kernel_pca = KernelPCA(n_components=400, kernel="rbf", gamma=1e-3,
-                       fit_inverse_transform=True, alpha=5e-3)
+kernel_pca = KernelPCA(
+    n_components=400, kernel="rbf", gamma=1e-3, fit_inverse_transform=True, alpha=5e-3
+)
 
 pca.fit(X_train_noisy)
 _ = kernel_pca.fit(X_train_noisy)
@@ -118,17 +119,21 @@ def plot_digits(X, title):
 # kernel to learn the PCA basis and a kernel ridge to learn the mapping
 # function.
 X_reconstructed_kernel_pca = kernel_pca.inverse_transform(
-    kernel_pca.transform(X_test_noisy))
+    kernel_pca.transform(X_test_noisy)
+)
 X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test_noisy))
 
 # %%
 plot_digits(X_test, "Uncorrupted test images")
-plot_digits(X_reconstructed_pca,
-            f"PCA reconstruction\n"
-            f"MSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}")
-plot_digits(X_reconstructed_kernel_pca,
-            f"Kernel PCA reconstruction\n"
-            f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}")
+plot_digits(
+    X_reconstructed_pca,
+    f"PCA reconstruction\nMSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}",
+)
+plot_digits(
+    X_reconstructed_kernel_pca,
+    "Kernel PCA reconstruction\n"
+    f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}",
+)
 
 # %%
 # PCA has a lower MSE than kernel PCA. However, the qualitative analysis might
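One detail worth noticing above: when black puts each argument of a plot_digits call on its own line, it also appends a comma after the last one. That is black's "magic trailing comma" (the default behavior since black 20.8b0): a trailing comma already present in the source forces the call to stay exploded on later runs, even when it would fit on one line. A small sketch of the effect, again assuming black's default Mode():

import black

# With the magic trailing comma, the call stays one-argument-per-line even
# though it would fit within 88 characters...
exploded = 'plot_digits(\n    X,\n    "title",\n)\n'
# ...while the same call without the comma collapses back to a single line.
collapsed = 'plot_digits(\n    X,\n    "title"\n)\n'

print(black.format_str(exploded, mode=black.Mode()))
print(black.format_str(collapsed, mode=black.Mode()))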
examples/applications/plot_face_recognition.py (38 changes: 21 additions & 17 deletions)
@@ -43,7 +43,7 @@
 print(__doc__)
 
 # Display progress logs on stdout
-logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
 
 
 # #############################################################################
@@ -75,19 +75,20 @@
 
 # split into a training and testing set
 X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=0.25, random_state=42)
+    X, y, test_size=0.25, random_state=42
+)
 
 
 # #############################################################################
 # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
 # dataset): unsupervised feature extraction / dimensionality reduction
 n_components = 150
 
-print("Extracting the top %d eigenfaces from %d faces"
-      % (n_components, X_train.shape[0]))
+print(
+    "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
+)
 t0 = time()
-pca = PCA(n_components=n_components, svd_solver='randomized',
-          whiten=True).fit(X_train)
+pca = PCA(n_components=n_components, svd_solver="randomized", whiten=True).fit(X_train)
 print("done in %0.3fs" % (time() - t0))
 
 eigenfaces = pca.components_.reshape((n_components, h, w))
@@ -104,11 +105,11 @@
 
 print("Fitting the classifier to the training set")
 t0 = time()
-param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
-              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
-clf = GridSearchCV(
-    SVC(kernel='rbf', class_weight='balanced'), param_grid
-)
+param_grid = {
+    "C": [1e3, 5e3, 1e4, 5e4, 1e5],
+    "gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
+}
+clf = GridSearchCV(SVC(kernel="rbf", class_weight="balanced"), param_grid)
 clf = clf.fit(X_train_pca, y_train)
 print("done in %0.3fs" % (time() - t0))
 print("Best estimator found by grid search:")
@@ -130,10 +131,11 @@
 # #############################################################################
 # Qualitative evaluation of the predictions using matplotlib
 
+
 def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
     """Helper function to plot a gallery of portraits"""
     plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
-    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
+    plt.subplots_adjust(bottom=0, left=0.01, right=0.99, top=0.90, hspace=0.35)
     for i in range(n_row * n_col):
         plt.subplot(n_row, n_col, i + 1)
         plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
@@ -144,14 +146,16 @@ def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
 
 # plot the result of the prediction on a portion of the test set
 
+
 def title(y_pred, y_test, target_names, i):
-    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
-    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
-    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
+    pred_name = target_names[y_pred[i]].rsplit(" ", 1)[-1]
+    true_name = target_names[y_test[i]].rsplit(" ", 1)[-1]
+    return "predicted: %s\ntrue: %s" % (pred_name, true_name)
 
 
-prediction_titles = [title(y_pred, y_test, target_names, i)
-                     for i in range(y_pred.shape[0])]
+prediction_titles = [
+    title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0])
+]
 
 plot_gallery(X_test, prediction_titles, h, w)
 
examples/applications/plot_model_complexity_influence.py (161 changes: 89 additions & 72 deletions)
@@ -72,23 +72,21 @@
 
 def generate_data(case):
     """Generate regression/classification data."""
-    if case == 'regression':
+    if case == "regression":
         X, y = datasets.load_diabetes(return_X_y=True)
-    elif case == 'classification':
-        X, y = datasets.fetch_20newsgroups_vectorized(subset='all',
-                                                      return_X_y=True)
+    elif case == "classification":
+        X, y = datasets.fetch_20newsgroups_vectorized(subset="all", return_X_y=True)
     X, y = shuffle(X, y)
     offset = int(X.shape[0] * 0.8)
     X_train, y_train = X[:offset], y[:offset]
     X_test, y_test = X[offset:], y[offset:]
 
-    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
-            'y_test': y_test}
+    data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
     return data
 
 
-regression_data = generate_data('regression')
-classification_data = generate_data('classification')
+regression_data = generate_data("regression")
+classification_data = generate_data("classification")
 
 
 ##############################################################################
@@ -110,26 +108,33 @@ def benchmark_influence(conf):
     prediction_times = []
     prediction_powers = []
     complexities = []
-    for param_value in conf['changing_param_values']:
-        conf['tuned_params'][conf['changing_param']] = param_value
-        estimator = conf['estimator'](**conf['tuned_params'])
+    for param_value in conf["changing_param_values"]:
+        conf["tuned_params"][conf["changing_param"]] = param_value
+        estimator = conf["estimator"](**conf["tuned_params"])
 
         print("Benchmarking %s" % estimator)
-        estimator.fit(conf['data']['X_train'], conf['data']['y_train'])
-        conf['postfit_hook'](estimator)
-        complexity = conf['complexity_computer'](estimator)
+        estimator.fit(conf["data"]["X_train"], conf["data"]["y_train"])
+        conf["postfit_hook"](estimator)
+        complexity = conf["complexity_computer"](estimator)
         complexities.append(complexity)
         start_time = time.time()
-        for _ in range(conf['n_samples']):
-            y_pred = estimator.predict(conf['data']['X_test'])
-        elapsed_time = (time.time() - start_time) / float(conf['n_samples'])
+        for _ in range(conf["n_samples"]):
+            y_pred = estimator.predict(conf["data"]["X_test"])
+        elapsed_time = (time.time() - start_time) / float(conf["n_samples"])
         prediction_times.append(elapsed_time)
-        pred_score = conf['prediction_performance_computer'](
-            conf['data']['y_test'], y_pred)
+        pred_score = conf["prediction_performance_computer"](
+            conf["data"]["y_test"], y_pred
+        )
         prediction_powers.append(pred_score)
-        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
-            complexity, conf['prediction_performance_label'], pred_score,
-            elapsed_time))
+        print(
+            "Complexity: %d | %s: %.4f | Pred. Time: %fs\n"
+            % (
+                complexity,
+                conf["prediction_performance_label"],
+                pred_score,
+                elapsed_time,
+            )
+        )
     return prediction_powers, prediction_times, complexities
 
 
@@ -147,46 +152,58 @@ def benchmark_influence(conf):
 # different data.
 #
 
+
 def _count_nonzero_coefficients(estimator):
     a = estimator.coef_.toarray()
     return np.count_nonzero(a)
 
 
 configurations = [
-    {'estimator': SGDClassifier,
-     'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':
-                      'modified_huber', 'fit_intercept': True, 'tol': 1e-3},
-     'changing_param': 'l1_ratio',
-     'changing_param_values': [0.25, 0.5, 0.75, 0.9],
-     'complexity_label': 'non_zero coefficients',
-     'complexity_computer': _count_nonzero_coefficients,
-     'prediction_performance_computer': hamming_loss,
-     'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)',
-     'postfit_hook': lambda x: x.sparsify(),
-     'data': classification_data,
-     'n_samples': 30},
-    {'estimator': NuSVR,
-     'tuned_params': {'C': 1e3, 'gamma': 2 ** -15},
-     'changing_param': 'nu',
-     'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9],
-     'complexity_label': 'n_support_vectors',
-     'complexity_computer': lambda x: len(x.support_vectors_),
-     'data': regression_data,
-     'postfit_hook': lambda x: x,
-     'prediction_performance_computer': mean_squared_error,
-     'prediction_performance_label': 'MSE',
-     'n_samples': 30},
-    {'estimator': GradientBoostingRegressor,
-     'tuned_params': {'loss': 'squared_error'},
-     'changing_param': 'n_estimators',
-     'changing_param_values': [10, 50, 100, 200, 500],
-     'complexity_label': 'n_trees',
-     'complexity_computer': lambda x: x.n_estimators,
-     'data': regression_data,
-     'postfit_hook': lambda x: x,
-     'prediction_performance_computer': mean_squared_error,
-     'prediction_performance_label': 'MSE',
-     'n_samples': 30},
+    {
+        "estimator": SGDClassifier,
+        "tuned_params": {
+            "penalty": "elasticnet",
+            "alpha": 0.001,
+            "loss": "modified_huber",
+            "fit_intercept": True,
+            "tol": 1e-3,
+        },
+        "changing_param": "l1_ratio",
+        "changing_param_values": [0.25, 0.5, 0.75, 0.9],
+        "complexity_label": "non_zero coefficients",
+        "complexity_computer": _count_nonzero_coefficients,
+        "prediction_performance_computer": hamming_loss,
+        "prediction_performance_label": "Hamming Loss (Misclassification Ratio)",
+        "postfit_hook": lambda x: x.sparsify(),
+        "data": classification_data,
+        "n_samples": 30,
+    },
+    {
+        "estimator": NuSVR,
+        "tuned_params": {"C": 1e3, "gamma": 2 ** -15},
+        "changing_param": "nu",
+        "changing_param_values": [0.1, 0.25, 0.5, 0.75, 0.9],
+        "complexity_label": "n_support_vectors",
+        "complexity_computer": lambda x: len(x.support_vectors_),
+        "data": regression_data,
+        "postfit_hook": lambda x: x,
+        "prediction_performance_computer": mean_squared_error,
+        "prediction_performance_label": "MSE",
+        "n_samples": 30,
+    },
+    {
+        "estimator": GradientBoostingRegressor,
+        "tuned_params": {"loss": "squared_error"},
+        "changing_param": "n_estimators",
+        "changing_param_values": [10, 50, 100, 200, 500],
+        "complexity_label": "n_trees",
+        "complexity_computer": lambda x: x.n_estimators,
+        "data": regression_data,
+        "postfit_hook": lambda x: x,
+        "prediction_performance_computer": mean_squared_error,
+        "prediction_performance_label": "MSE",
+        "n_samples": 30,
+    },
 ]
 
 
@@ -209,6 +226,7 @@ def _count_nonzero_coefficients(estimator):
 # ensemble is not as detrimental.
 #
 
+
 def plot_influence(conf, mse_values, prediction_times, complexities):
     """
     Plot influence of model complexity on both accuracy and latency.
@@ -219,38 +237,37 @@ def plot_influence(conf, mse_values, prediction_times, complexities):
 
     # first axes (prediction error)
     ax1 = fig.add_subplot(111)
-    line1 = ax1.plot(complexities, mse_values, c='tab:blue', ls='-')[0]
-    ax1.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
-    y1_label = conf['prediction_performance_label']
+    line1 = ax1.plot(complexities, mse_values, c="tab:blue", ls="-")[0]
+    ax1.set_xlabel("Model Complexity (%s)" % conf["complexity_label"])
+    y1_label = conf["prediction_performance_label"]
     ax1.set_ylabel(y1_label)
 
-    ax1.spines['left'].set_color(line1.get_color())
+    ax1.spines["left"].set_color(line1.get_color())
     ax1.yaxis.label.set_color(line1.get_color())
-    ax1.tick_params(axis='y', colors=line1.get_color())
+    ax1.tick_params(axis="y", colors=line1.get_color())
 
     # second axes (latency)
     ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)
-    line2 = ax2.plot(complexities, prediction_times, c='tab:orange', ls='-')[0]
+    line2 = ax2.plot(complexities, prediction_times, c="tab:orange", ls="-")[0]
     ax2.yaxis.tick_right()
     ax2.yaxis.set_label_position("right")
     y2_label = "Time (s)"
     ax2.set_ylabel(y2_label)
-    ax1.spines['right'].set_color(line2.get_color())
+    ax1.spines["right"].set_color(line2.get_color())
     ax2.yaxis.label.set_color(line2.get_color())
-    ax2.tick_params(axis='y', colors=line2.get_color())
+    ax2.tick_params(axis="y", colors=line2.get_color())
 
-    plt.legend((line1, line2), ("prediction error", "latency"),
-               loc='upper right')
+    plt.legend((line1, line2), ("prediction error", "latency"), loc="upper right")
 
-    plt.title("Influence of varying '%s' on %s" % (conf['changing_param'],
-                                                   conf['estimator'].__name__))
+    plt.title(
+        "Influence of varying '%s' on %s"
+        % (conf["changing_param"], conf["estimator"].__name__)
+    )
 
 
 for conf in configurations:
-    prediction_performances, prediction_times, complexities = \
-        benchmark_influence(conf)
-    plot_influence(conf, prediction_performances, prediction_times,
-                   complexities)
+    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
+    plot_influence(conf, prediction_performances, prediction_times, complexities)
 plt.show()
 
 
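After these changes, every file under examples/ should be a fixed point of black: running black --check examples/ should find nothing to reformat. The equivalent check written against black's Python API — a hypothetical verification script, not part of this PR or of scikit-learn's actual CI wiring, assuming default settings:

from pathlib import Path

import black

mode = black.Mode()  # the default 88-character line length is an assumption
for path in sorted(Path("examples").rglob("*.py")):
    src = path.read_text()
    try:
        # format_str parses the source without importing or running it; on
        # already-black-formatted input it returns the input unchanged.
        if black.format_str(src, mode=mode) != src:
            print(f"{path}: would be reformatted by black")
    except black.InvalidInput:
        print(f"{path}: not valid Python")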