From b10a007b9e258736a2bf1795e12b33f588bb6878 Mon Sep 17 00:00:00 2001
From: plagree <paul.lagree@gmail.com>
Date: Thu, 8 Jun 2017 15:26:48 +0200
Subject: [PATCH 1/4] DOC Use correct notebook style in examples

---
 examples/cluster/plot_dict_face_patches.py                   | 2 ++
 examples/feature_selection/plot_feature_selection.py         | 4 ----
 .../plot_permutation_test_for_classification.py              | 2 --
 examples/plot_kernel_ridge_regression.py                     | 5 +----
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/examples/cluster/plot_dict_face_patches.py b/examples/cluster/plot_dict_face_patches.py
index 7fb125ed735e9..654fd3162e87b 100644
--- a/examples/cluster/plot_dict_face_patches.py
+++ b/examples/cluster/plot_dict_face_patches.py
@@ -34,6 +34,7 @@
 
 ###############################################################################
 # Learn the dictionary of images
+# ------------------------------
 
 print('Learning the dictionary... ')
 rng = np.random.RandomState(0)
@@ -68,6 +69,7 @@
 
 ###############################################################################
 # Plot the results
+# ----------------
 plt.figure(figsize=(4.2, 4))
 for i, patch in enumerate(kmeans.cluster_centers_):
     plt.subplot(9, 9, i + 1)
diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py
index 5d123985a01bb..61c17dc87c4e1 100644
--- a/examples/feature_selection/plot_feature_selection.py
+++ b/examples/feature_selection/plot_feature_selection.py
@@ -27,7 +27,6 @@
 from sklearn import datasets, svm
 from sklearn.feature_selection import SelectPercentile, f_classif
 
-###############################################################################
 # import some data to play with
 
 # The iris dataset
@@ -40,13 +39,11 @@
 X = np.hstack((iris.data, E))
 y = iris.target
 
-###############################################################################
 plt.figure(1)
 plt.clf()
 
 X_indices = np.arange(X.shape[-1])
 
-###############################################################################
 # Univariate feature selection with F-test for feature scoring
 # We use the default selection function: the 10% most significant features
 selector = SelectPercentile(f_classif, percentile=10)
@@ -57,7 +54,6 @@
         label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',
         edgecolor='black')
 
-###############################################################################
 # Compare to the weights of an SVM
 clf = svm.SVC(kernel='linear')
 clf.fit(X, y)
diff --git a/examples/feature_selection/plot_permutation_test_for_classification.py b/examples/feature_selection/plot_permutation_test_for_classification.py
index 8cadbfa91ad09..84b1c5a3fca4e 100644
--- a/examples/feature_selection/plot_permutation_test_for_classification.py
+++ b/examples/feature_selection/plot_permutation_test_for_classification.py
@@ -25,7 +25,6 @@
 from sklearn import datasets
 
 
-##############################################################################
 # Loading a dataset
 iris = datasets.load_iris()
 X = iris.data
@@ -47,7 +46,6 @@
 
 print("Classification score %s (pvalue : %s)" % (score, pvalue))
 
-###############################################################################
 # View histogram of permutation scores
 plt.hist(permutation_scores, 20, label='Permutation scores',
          edgecolor='black')
diff --git a/examples/plot_kernel_ridge_regression.py b/examples/plot_kernel_ridge_regression.py
index 85cd9990c1f68..6ad422227bb21 100644
--- a/examples/plot_kernel_ridge_regression.py
+++ b/examples/plot_kernel_ridge_regression.py
@@ -48,7 +48,6 @@
 
 rng = np.random.RandomState(0)
 
-#############################################################################
 # Generate sample data
 X = 5 * rng.rand(10000, 1)
 y = np.sin(X).ravel()
@@ -58,7 +57,6 @@
 
 X_plot = np.linspace(0, 5, 100000)[:, None]
 
-#############################################################################
 # Fit regression model
 train_size = 100
 svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,
@@ -97,8 +95,7 @@
       % (X_plot.shape[0], kr_predict))
 
 
-#############################################################################
-# look at the results
+# Look at the results
 sv_ind = svr.best_estimator_.support_
 plt.scatter(X[sv_ind], y[sv_ind], c='r', s=50, label='SVR support vectors',
             zorder=2, edgecolors=(0, 0, 0))

From 187b51c7235660d318094361526ac7d2212ddcff Mon Sep 17 00:00:00 2001
From: plagree <paul.lagree@gmail.com>
Date: Thu, 8 Jun 2017 17:35:36 +0200
Subject: [PATCH 2/4] Modifications in examples/ to avoid unwanted notebook
 style

---
 examples/applications/plot_face_recognition.py         |  6 ------
 .../applications/plot_model_complexity_influence.py    |  6 ++----
 examples/applications/plot_prediction_latency.py       |  5 ++---
 examples/applications/plot_stock_market.py             |  5 -----
 .../applications/wikipedia_principal_eigenvector.py    |  2 --
 examples/calibration/plot_calibration.py               |  1 -
 examples/calibration/plot_compare_calibration.py       |  1 -
 examples/classification/plot_lda_qda.py                | 10 +++-------
 examples/cluster/plot_affinity_propagation.py          |  3 ---
 examples/cluster/plot_dbscan.py                        |  3 ---
 examples/cluster/plot_dict_face_patches.py             |  4 ----
 examples/cluster/plot_face_ward_segmentation.py        |  4 ----
 ...ot_feature_agglomeration_vs_univariate_selection.py |  3 ---
 examples/cluster/plot_kmeans_digits.py                 |  1 -
 examples/cluster/plot_mean_shift.py                    |  3 ---
 examples/cluster/plot_mini_batch_kmeans.py             |  4 ----
 examples/cluster/plot_segmentation_toy.py              |  3 ---
 .../cluster/plot_ward_structured_vs_unstructured.py    |  6 ------
 examples/covariance/plot_covariance_estimation.py      |  4 ----
 examples/covariance/plot_mahalanobis_distances.py      |  1 -
 examples/covariance/plot_sparse_cov.py                 |  3 ---
 .../plot_compare_cross_decomposition.py                |  5 -----
 examples/decomposition/plot_faces_decomposition.py     |  5 -----
 .../decomposition/plot_ica_blind_source_separation.py  |  2 --
 examples/decomposition/plot_ica_vs_pca.py              |  2 --
 examples/decomposition/plot_image_denoising.py         |  4 ----
 examples/decomposition/plot_pca_3d.py                  |  2 --
 .../decomposition/plot_pca_vs_fa_model_selection.py    |  2 --
 examples/ensemble/plot_gradient_boosting_regression.py |  4 ----
 examples/exercises/plot_cv_diabetes.py                 |  1 -
 examples/feature_selection/plot_feature_selection.py   |  2 +-
 examples/linear_model/plot_ard.py                      |  3 ---
 examples/linear_model/plot_bayesian_ridge.py           |  3 ---
 examples/linear_model/plot_lasso_and_elasticnet.py     |  5 +----
 .../linear_model/plot_lasso_dense_vs_sparse_data.py    |  2 --
 examples/linear_model/plot_lasso_model_selection.py    |  3 +++
 examples/linear_model/plot_logistic_path.py            |  1 -
 examples/linear_model/plot_multi_task_lasso_support.py |  1 -
 examples/linear_model/plot_ols_3d.py                   |  1 -
 examples/linear_model/plot_ridge_path.py               |  2 --
 examples/linear_model/plot_theilsen.py                 |  2 --
 .../grid_search_text_feature_extraction.py             |  4 +---
 examples/model_selection/plot_roc_crossval.py          |  4 +---
 .../model_selection/plot_train_error_vs_test_error.py  |  3 ---
 examples/neighbors/plot_regression.py                  |  2 --
 .../plot_rbm_logistic_classification.py                |  4 ----
 examples/plot_isotonic_regression.py                   |  4 +---
 .../semi_supervised/plot_label_propagation_digits.py   | 10 ++++------
 .../plot_label_propagation_structure.py                |  2 --
 examples/svm/plot_rbf_parameters.py                    |  5 +----
 examples/svm/plot_svm_anova.py                         |  3 ---
 examples/svm/plot_svm_regression.py                    |  6 +-----
 examples/text/document_classification_20newsgroups.py  |  2 --
 examples/text/document_clustering.py                   |  2 --
 54 files changed, 21 insertions(+), 160 deletions(-)

diff --git a/examples/applications/plot_face_recognition.py b/examples/applications/plot_face_recognition.py
index 039af7ea2feb6..123c4b4bdd9b7 100644
--- a/examples/applications/plot_face_recognition.py
+++ b/examples/applications/plot_face_recognition.py
@@ -48,7 +48,6 @@
 logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
 
 
-###############################################################################
 # Download the data, if not already on disk and load it as numpy arrays
 
 lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
@@ -72,7 +71,6 @@
 print("n_classes: %d" % n_classes)
 
 
-###############################################################################
 # Split into a training set and a test set using a stratified k fold
 
 # split into a training and testing set
@@ -80,7 +78,6 @@
     X, y, test_size=0.25, random_state=42)
 
 
-###############################################################################
 # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
 # dataset): unsupervised feature extraction / dimensionality reduction
 n_components = 150
@@ -101,7 +98,6 @@
 print("done in %0.3fs" % (time() - t0))
 
 
-###############################################################################
 # Train a SVM classification model
 
 print("Fitting the classifier to the training set")
@@ -115,7 +111,6 @@
 print(clf.best_estimator_)
 
 
-###############################################################################
 # Quantitative evaluation of the model quality on the test set
 
 print("Predicting people's names on the test set")
@@ -127,7 +122,6 @@
 print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
 
 
-###############################################################################
 # Qualitative evaluation of the predictions using matplotlib
 
 def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
diff --git a/examples/applications/plot_model_complexity_influence.py b/examples/applications/plot_model_complexity_influence.py
index 90fd5c718e78f..f7df14f63b4b7 100644
--- a/examples/applications/plot_model_complexity_influence.py
+++ b/examples/applications/plot_model_complexity_influence.py
@@ -34,11 +34,10 @@
 from sklearn.linear_model.stochastic_gradient import SGDClassifier
 from sklearn.metrics import hamming_loss
 
-###############################################################################
 # Routines
 
 
-# initialize random generator
+# Initialize random generator
 np.random.seed(0)
 
 
@@ -122,8 +121,7 @@ def _count_nonzero_coefficients(estimator):
     a = estimator.coef_.toarray()
     return np.count_nonzero(a)
 
-###############################################################################
-# main code
+# Main code
 regression_data = generate_data('regression')
 classification_data = generate_data('classification', sparse=True)
 configurations = [
diff --git a/examples/applications/plot_prediction_latency.py b/examples/applications/plot_prediction_latency.py
index a375c1cc8f3c3..156a5d33ee2af 100644
--- a/examples/applications/plot_prediction_latency.py
+++ b/examples/applications/plot_prediction_latency.py
@@ -266,12 +266,11 @@ def plot_benchmark_throughput(throughputs, configuration):
     plt.show()
 
 
-###############################################################################
-# main code
+# Main code
 
 start_time = time.time()
 
-# benchmark bulk/atomic prediction speed for various regressors
+# Benchmark bulk/atomic prediction speed for various regressors
 configuration = {
     'n_train': int(1e3),
     'n_test': int(1e2),
diff --git a/examples/applications/plot_stock_market.py b/examples/applications/plot_stock_market.py
index cd1745bb1825f..bd4fd1f5fc9c7 100644
--- a/examples/applications/plot_stock_market.py
+++ b/examples/applications/plot_stock_market.py
@@ -74,7 +74,6 @@
 from sklearn import cluster, covariance, manifold
 
 
-###############################################################################
 # Retrieve the data from Internet
 
 def quotes_historical_google(symbol, date1, date2):
@@ -189,7 +188,6 @@ def quotes_historical_google(symbol, date1, date2):
 variation = close_prices - open_prices
 
 
-###############################################################################
 # Learn a graphical structure from the correlations
 edge_model = covariance.GraphLassoCV()
 
@@ -199,7 +197,6 @@ def quotes_historical_google(symbol, date1, date2):
 X /= X.std(axis=0)
 edge_model.fit(X)
 
-###############################################################################
 # Cluster using affinity propagation
 
 _, labels = cluster.affinity_propagation(edge_model.covariance_)
@@ -208,7 +205,6 @@ def quotes_historical_google(symbol, date1, date2):
 for i in range(n_labels + 1):
     print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
 
-###############################################################################
 # Find a low-dimension embedding for visualization: find the best position of
 # the nodes (the stocks) on a 2D plane
 
@@ -220,7 +216,6 @@ def quotes_historical_google(symbol, date1, date2):
 
 embedding = node_position_model.fit_transform(X.T).T
 
-###############################################################################
 # Visualization
 plt.figure(1, facecolor='w', figsize=(10, 8))
 plt.clf()
diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py
index d60121e8ece31..c0b5529d2e3f6 100644
--- a/examples/applications/wikipedia_principal_eigenvector.py
+++ b/examples/applications/wikipedia_principal_eigenvector.py
@@ -52,7 +52,6 @@
 
 print(__doc__)
 
-###############################################################################
 # Where to download the data, if not already on disk
 redirects_url = "http://downloads.dbpedia.org/3.5.1/en/redirects_en.nt.bz2"
 redirects_filename = redirects_url.rsplit("/", 1)[1]
@@ -73,7 +72,6 @@
         print()
 
 
-###############################################################################
 # Loading the redirect files
 
 memory = Memory(cachedir=".")
diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py
index b38b25812bb7f..174812be4d1e6 100644
--- a/examples/calibration/plot_calibration.py
+++ b/examples/calibration/plot_calibration.py
@@ -83,7 +83,6 @@
 clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid, sw_test)
 print("With sigmoid calibration: %1.3f" % clf_sigmoid_score)
 
-###############################################################################
 # Plot the data and the predicted probabilities
 plt.figure()
 y_unique = np.unique(y)
diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py
index 2e914696fc177..28624090f5da6 100644
--- a/examples/calibration/plot_compare_calibration.py
+++ b/examples/calibration/plot_compare_calibration.py
@@ -81,7 +81,6 @@
 rfc = RandomForestClassifier(n_estimators=100)
 
 
-###############################################################################
 # Plot calibration plots
 
 plt.figure(figsize=(10, 10))
diff --git a/examples/classification/plot_lda_qda.py b/examples/classification/plot_lda_qda.py
index a668e7cc0db0c..74744ab8348f9 100644
--- a/examples/classification/plot_lda_qda.py
+++ b/examples/classification/plot_lda_qda.py
@@ -20,8 +20,7 @@ class has its own standard deviation with QDA.
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 
-###############################################################################
-# colormap
+# Colormap
 cmap = colors.LinearSegmentedColormap(
     'red_blue_classes',
     {'red': [(0, 1, 1), (1, 0.7, 0.7)],
@@ -30,8 +29,7 @@ class has its own standard deviation with QDA.
 plt.cm.register_cmap(cmap=cmap)
 
 
-###############################################################################
-# generate datasets
+# Generate datasets
 def dataset_fixed_cov():
     '''Generate 2 Gaussians samples with the same covariance matrix'''
     n, dim = 300, 2
@@ -54,8 +52,7 @@ def dataset_cov():
     return X, y
 
 
-###############################################################################
-# plot functions
+# Plot functions
 def plot_data(lda, X, y, y_pred, fig_index):
     splot = plt.subplot(2, 2, fig_index)
     if fig_index == 1:
@@ -132,7 +129,6 @@ def plot_qda_cov(qda, splot):
     plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red')
     plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue')
 
-###############################################################################
 for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
     # Linear Discriminant Analysis
     lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py
index 0d6c395a4e4bf..edce07c7206e9 100644
--- a/examples/cluster/plot_affinity_propagation.py
+++ b/examples/cluster/plot_affinity_propagation.py
@@ -14,13 +14,11 @@
 from sklearn import metrics
 from sklearn.datasets.samples_generator import make_blobs
 
-##############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,
                             random_state=0)
 
-##############################################################################
 # Compute Affinity Propagation
 af = AffinityPropagation(preference=-50).fit(X)
 cluster_centers_indices = af.cluster_centers_indices_
@@ -39,7 +37,6 @@
 print("Silhouette Coefficient: %0.3f"
       % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
 
-##############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 from itertools import cycle
diff --git a/examples/cluster/plot_dbscan.py b/examples/cluster/plot_dbscan.py
index a12b3d39128b6..83c07bcbb892a 100644
--- a/examples/cluster/plot_dbscan.py
+++ b/examples/cluster/plot_dbscan.py
@@ -17,7 +17,6 @@
 from sklearn.preprocessing import StandardScaler
 
 
-##############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
@@ -25,7 +24,6 @@
 
 X = StandardScaler().fit_transform(X)
 
-##############################################################################
 # Compute DBSCAN
 db = DBSCAN(eps=0.3, min_samples=10).fit(X)
 core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
@@ -46,7 +44,6 @@
 print("Silhouette Coefficient: %0.3f"
       % metrics.silhouette_score(X, labels))
 
-##############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 
diff --git a/examples/cluster/plot_dict_face_patches.py b/examples/cluster/plot_dict_face_patches.py
index 654fd3162e87b..ed837dbaa0cfd 100644
--- a/examples/cluster/plot_dict_face_patches.py
+++ b/examples/cluster/plot_dict_face_patches.py
@@ -32,9 +32,7 @@
 
 faces = datasets.fetch_olivetti_faces()
 
-###############################################################################
 # Learn the dictionary of images
-# ------------------------------
 
 print('Learning the dictionary... ')
 rng = np.random.RandomState(0)
@@ -67,9 +65,7 @@
 dt = time.time() - t0
 print('done in %.2fs.' % dt)
 
-###############################################################################
 # Plot the results
-# ----------------
 plt.figure(figsize=(4.2, 4))
 for i, patch in enumerate(kmeans.cluster_centers_):
     plt.subplot(9, 9, i + 1)
diff --git a/examples/cluster/plot_face_ward_segmentation.py b/examples/cluster/plot_face_ward_segmentation.py
index 687d87ce7f429..27b464a244641 100644
--- a/examples/cluster/plot_face_ward_segmentation.py
+++ b/examples/cluster/plot_face_ward_segmentation.py
@@ -25,7 +25,6 @@
 from sklearn.cluster import AgglomerativeClustering
 
 
-###############################################################################
 # Generate data
 try:  # SciPy >= 0.16 have face in misc
     from scipy.misc import face
@@ -38,11 +37,9 @@
 
 X = np.reshape(face, (-1, 1))
 
-###############################################################################
 # Define the structure A of the data. Pixels connected to their neighbors.
 connectivity = grid_to_graph(*face.shape)
 
-###############################################################################
 # Compute clustering
 print("Compute structured hierarchical clustering...")
 st = time.time()
@@ -55,7 +52,6 @@
 print("Number of pixels: ", label.size)
 print("Number of clusters: ", np.unique(label).size)
 
-###############################################################################
 # Plot the results on an image
 plt.figure(figsize=(5, 5))
 plt.imshow(face, cmap=plt.cm.gray)
diff --git a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
index ca3eb2a0035be..d6f65d83f959d 100644
--- a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
+++ b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
@@ -34,7 +34,6 @@
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
 
-###############################################################################
 # Generate data
 n_samples = 200
 size = 40  # image size
@@ -58,7 +57,6 @@
 noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
 y += noise_coef * noise  # add noise
 
-###############################################################################
 # Compute the coefs of a Bayesian Ridge with GridSearch
 cv = KFold(2)  # cross-validation generator for model selection
 ridge = BayesianRidge()
@@ -88,7 +86,6 @@
 coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
 coef_selection_ = coef_.reshape(size, size)
 
-###############################################################################
 # Inverse the transformation to plot the results on an image
 plt.close('all')
 plt.figure(figsize=(7.3, 2.7))
diff --git a/examples/cluster/plot_kmeans_digits.py b/examples/cluster/plot_kmeans_digits.py
index 1e6fbbc019923..b08fd6b854fc4 100644
--- a/examples/cluster/plot_kmeans_digits.py
+++ b/examples/cluster/plot_kmeans_digits.py
@@ -84,7 +84,6 @@ def bench_k_means(estimator, name, data):
               data=data)
 print(82 * '_')
 
-###############################################################################
 # Visualize the results on PCA-reduced data
 
 reduced_data = PCA(n_components=2).fit_transform(data)
diff --git a/examples/cluster/plot_mean_shift.py b/examples/cluster/plot_mean_shift.py
index 775cd98e59527..2e2c75b962688 100644
--- a/examples/cluster/plot_mean_shift.py
+++ b/examples/cluster/plot_mean_shift.py
@@ -16,12 +16,10 @@
 from sklearn.cluster import MeanShift, estimate_bandwidth
 from sklearn.datasets.samples_generator import make_blobs
 
-###############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, _ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)
 
-###############################################################################
 # Compute clustering with MeanShift
 
 # The following bandwidth can be automatically detected using
@@ -37,7 +35,6 @@
 
 print("number of estimated clusters : %d" % n_clusters_)
 
-###############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 from itertools import cycle
diff --git a/examples/cluster/plot_mini_batch_kmeans.py b/examples/cluster/plot_mini_batch_kmeans.py
index 56d999c6c846d..9c650be0a44e2 100644
--- a/examples/cluster/plot_mini_batch_kmeans.py
+++ b/examples/cluster/plot_mini_batch_kmeans.py
@@ -23,7 +23,6 @@
 from sklearn.metrics.pairwise import pairwise_distances_argmin
 from sklearn.datasets.samples_generator import make_blobs
 
-##############################################################################
 # Generate sample data
 np.random.seed(0)
 
@@ -32,7 +31,6 @@
 n_clusters = len(centers)
 X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)
 
-##############################################################################
 # Compute clustering with Means
 
 k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
@@ -40,7 +38,6 @@
 k_means.fit(X)
 t_batch = time.time() - t0
 
-##############################################################################
 # Compute clustering with MiniBatchKMeans
 
 mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
@@ -49,7 +46,6 @@
 mbk.fit(X)
 t_mini_batch = time.time() - t0
 
-##############################################################################
 # Plot result
 
 fig = plt.figure(figsize=(8, 3))
diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py
index 96f007400e492..fe24304e5b8cc 100644
--- a/examples/cluster/plot_segmentation_toy.py
+++ b/examples/cluster/plot_segmentation_toy.py
@@ -36,7 +36,6 @@
 from sklearn.feature_extraction import image
 from sklearn.cluster import spectral_clustering
 
-###############################################################################
 l = 100
 x, y = np.indices((l, l))
 
@@ -52,7 +51,6 @@
 circle3 = (x - center3[0]) ** 2 + (y - center3[1]) ** 2 < radius3 ** 2
 circle4 = (x - center4[0]) ** 2 + (y - center4[1]) ** 2 < radius4 ** 2
 
-###############################################################################
 # 4 circles
 img = circle1 + circle2 + circle3 + circle4
 
@@ -81,7 +79,6 @@
 plt.matshow(img)
 plt.matshow(label_im)
 
-###############################################################################
 # 2 circles
 img = circle1 + circle2
 mask = img.astype(bool)
diff --git a/examples/cluster/plot_ward_structured_vs_unstructured.py b/examples/cluster/plot_ward_structured_vs_unstructured.py
index 2471f68a6f8ed..102a78ba82eea 100644
--- a/examples/cluster/plot_ward_structured_vs_unstructured.py
+++ b/examples/cluster/plot_ward_structured_vs_unstructured.py
@@ -33,7 +33,6 @@
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.datasets.samples_generator import make_swiss_roll
 
-###############################################################################
 # Generate data (swiss roll dataset)
 n_samples = 1500
 noise = 0.05
@@ -41,7 +40,6 @@
 # Make it thinner
 X[:, 1] *= .5
 
-###############################################################################
 # Compute clustering
 print("Compute unstructured hierarchical clustering...")
 st = time.time()
@@ -51,7 +49,6 @@
 print("Elapsed time: %.2fs" % elapsed_time)
 print("Number of points: %i" % label.size)
 
-###############################################################################
 # Plot result
 fig = plt.figure()
 ax = p3.Axes3D(fig)
@@ -62,12 +59,10 @@
 plt.title('Without connectivity constraints (time %.2fs)' % elapsed_time)
 
 
-###############################################################################
 # Define the structure A of the data. Here a 10 nearest neighbors
 from sklearn.neighbors import kneighbors_graph
 connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
 
-###############################################################################
 # Compute clustering
 print("Compute structured hierarchical clustering...")
 st = time.time()
@@ -78,7 +73,6 @@
 print("Elapsed time: %.2fs" % elapsed_time)
 print("Number of points: %i" % label.size)
 
-###############################################################################
 # Plot result
 fig = plt.figure()
 ax = p3.Axes3D(fig)
diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py
index 96f637974ee29..8a663fec43fc8 100644
--- a/examples/covariance/plot_covariance_estimation.py
+++ b/examples/covariance/plot_covariance_estimation.py
@@ -52,7 +52,6 @@
 from sklearn.model_selection import GridSearchCV
 
 
-###############################################################################
 # Generate sample data
 n_features, n_samples = 40, 20
 np.random.seed(42)
@@ -64,7 +63,6 @@
 X_train = np.dot(base_X_train, coloring_matrix)
 X_test = np.dot(base_X_test, coloring_matrix)
 
-###############################################################################
 # Compute the likelihood on test data
 
 # spanning a range of possible shrinkage coefficient values
@@ -78,7 +76,6 @@
 emp_cov = empirical_covariance(X_train)
 loglik_real = -log_likelihood(emp_cov, linalg.inv(real_cov))
 
-###############################################################################
 # Compare different approaches to setting the parameter
 
 # GridSearch for an optimal shrinkage coefficient
@@ -94,7 +91,6 @@
 oa = OAS()
 loglik_oa = oa.fit(X_train).score(X_test)
 
-###############################################################################
 # Plot results
 fig = plt.figure()
 plt.title("Regularized covariance: likelihood and shrinkage coefficient")
diff --git a/examples/covariance/plot_mahalanobis_distances.py b/examples/covariance/plot_mahalanobis_distances.py
index 53329aa71b80f..166252fc7f61f 100644
--- a/examples/covariance/plot_mahalanobis_distances.py
+++ b/examples/covariance/plot_mahalanobis_distances.py
@@ -78,7 +78,6 @@
 # compare estimators learnt from the full data set with true parameters
 emp_cov = EmpiricalCovariance().fit(X)
 
-###############################################################################
 # Display results
 fig = plt.figure()
 plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
diff --git a/examples/covariance/plot_sparse_cov.py b/examples/covariance/plot_sparse_cov.py
index d9b7f0808fd75..8d42e7aaef929 100644
--- a/examples/covariance/plot_sparse_cov.py
+++ b/examples/covariance/plot_sparse_cov.py
@@ -59,7 +59,6 @@
 from sklearn.covariance import GraphLassoCV, ledoit_wolf
 import matplotlib.pyplot as plt
 
-##############################################################################
 # Generate the data
 n_samples = 60
 n_features = 20
@@ -79,7 +78,6 @@
 X -= X.mean(axis=0)
 X /= X.std(axis=0)
 
-##############################################################################
 # Estimate the covariance
 emp_cov = np.dot(X.T, X) / n_samples
 
@@ -91,7 +89,6 @@
 lw_cov_, _ = ledoit_wolf(X)
 lw_prec_ = linalg.inv(lw_cov_)
 
-##############################################################################
 # Plot the results
 plt.figure(figsize=(10, 6))
 plt.subplots_adjust(left=0.02, right=0.98)
diff --git a/examples/cross_decomposition/plot_compare_cross_decomposition.py b/examples/cross_decomposition/plot_compare_cross_decomposition.py
index 437c08b056479..65a2980d746e6 100644
--- a/examples/cross_decomposition/plot_compare_cross_decomposition.py
+++ b/examples/cross_decomposition/plot_compare_cross_decomposition.py
@@ -24,7 +24,6 @@
 import matplotlib.pyplot as plt
 from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA
 
-###############################################################################
 # Dataset based latent variables model
 
 n = 500
@@ -46,7 +45,6 @@
 print("Corr(Y)")
 print(np.round(np.corrcoef(Y.T), 2))
 
-###############################################################################
 # Canonical (symmetric) PLS
 
 # Transform data
@@ -106,7 +104,6 @@
 plt.yticks(())
 plt.show()
 
-###############################################################################
 # PLS regression, with multivariate response, a.k.a. PLS2
 
 n = 1000
@@ -126,7 +123,6 @@
 print(np.round(pls2.coef_, 1))
 pls2.predict(X)
 
-###############################################################################
 # PLS regression, with univariate response, a.k.a. PLS1
 
 n = 1000
@@ -139,7 +135,6 @@
 print("Estimated betas")
 print(np.round(pls1.coef_, 1))
 
-###############################################################################
 # CCA (PLS mode B with symmetric deflation)
 
 cca = CCA(n_components=2)
diff --git a/examples/decomposition/plot_faces_decomposition.py b/examples/decomposition/plot_faces_decomposition.py
index fce02751a1b0c..2b84bbf4374b0 100644
--- a/examples/decomposition/plot_faces_decomposition.py
+++ b/examples/decomposition/plot_faces_decomposition.py
@@ -32,7 +32,6 @@
 image_shape = (64, 64)
 rng = RandomState(0)
 
-###############################################################################
 # Load faces data
 dataset = fetch_olivetti_faces(shuffle=True, random_state=rng)
 faces = dataset.data
@@ -48,7 +47,6 @@
 print("Dataset consists of %d faces" % n_samples)
 
 
-###############################################################################
 def plot_gallery(title, images, n_col=n_col, n_row=n_row):
     plt.figure(figsize=(2. * n_col, 2.26 * n_row))
     plt.suptitle(title, size=16)
@@ -62,7 +60,6 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row):
         plt.yticks(())
     plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)
 
-###############################################################################
 # List of the different estimators, whether to center and transpose the
 # problem, and whether the transformer uses the clustering API.
 estimators = [
@@ -102,12 +99,10 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row):
 ]
 
 
-###############################################################################
 # Plot a sample of the input data
 
 plot_gallery("First centered Olivetti faces", faces_centered[:n_components])
 
-###############################################################################
 # Do the estimation and plot it
 
 for name, estimator, center in estimators:
diff --git a/examples/decomposition/plot_ica_blind_source_separation.py b/examples/decomposition/plot_ica_blind_source_separation.py
index 9ba5a1523a3c6..31b3e8a560605 100644
--- a/examples/decomposition/plot_ica_blind_source_separation.py
+++ b/examples/decomposition/plot_ica_blind_source_separation.py
@@ -21,7 +21,6 @@
 
 from sklearn.decomposition import FastICA, PCA
 
-###############################################################################
 # Generate sample data
 np.random.seed(0)
 n_samples = 2000
@@ -51,7 +50,6 @@
 pca = PCA(n_components=3)
 H = pca.fit_transform(X)  # Reconstruct signals based on orthogonal components
 
-###############################################################################
 # Plot results
 
 plt.figure()
diff --git a/examples/decomposition/plot_ica_vs_pca.py b/examples/decomposition/plot_ica_vs_pca.py
index 54655e519257a..cf9ce0925204a 100644
--- a/examples/decomposition/plot_ica_vs_pca.py
+++ b/examples/decomposition/plot_ica_vs_pca.py
@@ -37,7 +37,6 @@
 
 from sklearn.decomposition import PCA, FastICA
 
-###############################################################################
 # Generate sample data
 rng = np.random.RandomState(42)
 S = rng.standard_t(1.5, size=(20000, 2))
@@ -57,7 +56,6 @@
 S_ica_ /= S_ica_.std(axis=0)
 
 
-###############################################################################
 # Plot results
 
 def plot_samples(S, axis_list=None):
diff --git a/examples/decomposition/plot_image_denoising.py b/examples/decomposition/plot_image_denoising.py
index 29bdf6ba65217..68e62a9b6e305 100644
--- a/examples/decomposition/plot_image_denoising.py
+++ b/examples/decomposition/plot_image_denoising.py
@@ -44,7 +44,6 @@
 from sklearn.feature_extraction.image import reconstruct_from_patches_2d
 
 
-###############################################################################
 try:  # SciPy >= 0.16 have face in misc
     from scipy.misc import face
     face = face(gray=True)
@@ -75,7 +74,6 @@
 data /= np.std(data, axis=0)
 print('done in %.2fs.' % (time() - t0))
 
-###############################################################################
 # Learn the dictionary from reference patches
 
 print('Learning the dictionary...')
@@ -98,7 +96,6 @@
 plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
 
 
-###############################################################################
 # Display the distorted image
 
 def show_with_diff(image, reference, title):
@@ -123,7 +120,6 @@ def show_with_diff(image, reference, title):
 
 show_with_diff(distorted, face, 'Distorted image')
 
-###############################################################################
 # Extract noisy patches and reconstruct them using the dictionary
 
 print('Extracting noisy patches... ')
diff --git a/examples/decomposition/plot_pca_3d.py b/examples/decomposition/plot_pca_3d.py
index f26d5d9d1c9bb..06592c0e6b221 100644
--- a/examples/decomposition/plot_pca_3d.py
+++ b/examples/decomposition/plot_pca_3d.py
@@ -26,7 +26,6 @@
 from scipy import stats
 
 
-###############################################################################
 # Create the data
 
 e = np.exp(1)
@@ -55,7 +54,6 @@ def pdf(x):
 b /= norm
 
 
-###############################################################################
 # Plot the figures
 def plot_figs(fig_num, elev, azim):
     fig = plt.figure(fig_num, figsize=(4, 3))
diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py
index 7944f327e3645..5601f9cdd7798 100644
--- a/examples/decomposition/plot_pca_vs_fa_model_selection.py
+++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py
@@ -39,7 +39,6 @@
 
 print(__doc__)
 
-###############################################################################
 # Create the data
 
 n_samples, n_features, rank = 1000, 50, 10
@@ -55,7 +54,6 @@
 sigmas = sigma * rng.rand(n_features) + sigma / 2.
 X_hetero = X + rng.randn(n_samples, n_features) * sigmas
 
-###############################################################################
 # Fit the models
 
 n_components = np.arange(0, n_features, 5)  # options for n_components
diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py
index 0437fd924ef1d..468c338f42f2f 100644
--- a/examples/ensemble/plot_gradient_boosting_regression.py
+++ b/examples/ensemble/plot_gradient_boosting_regression.py
@@ -22,7 +22,6 @@
 from sklearn.utils import shuffle
 from sklearn.metrics import mean_squared_error
 
-###############################################################################
 # Load data
 boston = datasets.load_boston()
 X, y = shuffle(boston.data, boston.target, random_state=13)
@@ -31,7 +30,6 @@
 X_train, y_train = X[:offset], y[:offset]
 X_test, y_test = X[offset:], y[offset:]
 
-###############################################################################
 # Fit regression model
 params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
           'learning_rate': 0.01, 'loss': 'ls'}
@@ -41,7 +39,6 @@
 mse = mean_squared_error(y_test, clf.predict(X_test))
 print("MSE: %.4f" % mse)
 
-###############################################################################
 # Plot training deviance
 
 # compute test set deviance
@@ -61,7 +58,6 @@
 plt.xlabel('Boosting Iterations')
 plt.ylabel('Deviance')
 
-###############################################################################
 # Plot feature importance
 feature_importance = clf.feature_importances_
 # make importances relative to max importance
diff --git a/examples/exercises/plot_cv_diabetes.py b/examples/exercises/plot_cv_diabetes.py
index 6f3736d3c255b..0b41315069b2b 100644
--- a/examples/exercises/plot_cv_diabetes.py
+++ b/examples/exercises/plot_cv_diabetes.py
@@ -52,7 +52,6 @@
 plt.axhline(np.max(scores), linestyle='--', color='.5')
 plt.xlim([alphas[0], alphas[-1]])
 
-##############################################################################
 # Bonus: how much can you trust the selection of alpha?
 
 # To answer this question we use the LassoCV object that sets its alpha
diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py
index 61c17dc87c4e1..73badff72be2e 100644
--- a/examples/feature_selection/plot_feature_selection.py
+++ b/examples/feature_selection/plot_feature_selection.py
@@ -27,7 +27,7 @@
 from sklearn import datasets, svm
 from sklearn.feature_selection import SelectPercentile, f_classif
 
-# import some data to play with
+# Import some data to play with
 
 # The iris dataset
 iris = datasets.load_iris()
diff --git a/examples/linear_model/plot_ard.py b/examples/linear_model/plot_ard.py
index 76d34d3150a5e..9947b110365cb 100644
--- a/examples/linear_model/plot_ard.py
+++ b/examples/linear_model/plot_ard.py
@@ -30,7 +30,6 @@
 
 from sklearn.linear_model import ARDRegression, LinearRegression
 
-###############################################################################
 # Generating simulated data with Gaussian weights
 
 # Parameters of the example
@@ -51,7 +50,6 @@
 # Create the target
 y = np.dot(X, w) + noise
 
-###############################################################################
 # Fit the ARD Regression
 clf = ARDRegression(compute_score=True)
 clf.fit(X, y)
@@ -59,7 +57,6 @@
 ols = LinearRegression()
 ols.fit(X, y)
 
-###############################################################################
 # Plot the true weights, the estimated weights, the histogram of the
 # weights, and predictions with standard deviations
 plt.figure(figsize=(6, 5))
diff --git a/examples/linear_model/plot_bayesian_ridge.py b/examples/linear_model/plot_bayesian_ridge.py
index 0dbc854cf2ee2..c0a4bd8cdcfe7 100644
--- a/examples/linear_model/plot_bayesian_ridge.py
+++ b/examples/linear_model/plot_bayesian_ridge.py
@@ -30,7 +30,6 @@
 
 from sklearn.linear_model import BayesianRidge, LinearRegression
 
-###############################################################################
 # Generating simulated data with Gaussian weights
 np.random.seed(0)
 n_samples, n_features = 100, 100
@@ -48,7 +47,6 @@
 # Create the target
 y = np.dot(X, w) + noise
 
-###############################################################################
 # Fit the Bayesian Ridge Regression and an OLS for comparison
 clf = BayesianRidge(compute_score=True)
 clf.fit(X, y)
@@ -56,7 +54,6 @@
 ols = LinearRegression()
 ols.fit(X, y)
 
-###############################################################################
 # Plot true weights, estimated weights, histogram of the weights, and
 # predictions with standard deviations
 lw = 2
diff --git a/examples/linear_model/plot_lasso_and_elasticnet.py b/examples/linear_model/plot_lasso_and_elasticnet.py
index ca2d2425f9f5d..1c385a162467b 100644
--- a/examples/linear_model/plot_lasso_and_elasticnet.py
+++ b/examples/linear_model/plot_lasso_and_elasticnet.py
@@ -15,8 +15,7 @@
 
 from sklearn.metrics import r2_score
 
-###############################################################################
-# generate some sparse data to play with
+# Generate some sparse data to play with
 np.random.seed(42)
 
 n_samples, n_features = 50, 200
@@ -35,7 +34,6 @@
 X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
 X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
 
-###############################################################################
 # Lasso
 from sklearn.linear_model import Lasso
 
@@ -47,7 +45,6 @@
 print(lasso)
 print("r^2 on test data : %f" % r2_score_lasso)
 
-###############################################################################
 # ElasticNet
 from sklearn.linear_model import ElasticNet
 
diff --git a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
index bc8df42a8490e..bc1d9d2b561c2 100644
--- a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
+++ b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
@@ -17,7 +17,6 @@
 from sklearn.linear_model import Lasso
 
 
-###############################################################################
 # The two Lasso implementations on Dense data
 print("--- Dense matrices")
 
@@ -39,7 +38,6 @@
 print("Distance between coefficients : %s"
       % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_))
 
-###############################################################################
 # The two Lasso implementations on Sparse data
 print("--- Sparse matrices")
 
diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py
index 245c6bd0492c7..7a36ba0672336 100644
--- a/examples/linear_model/plot_lasso_model_selection.py
+++ b/examples/linear_model/plot_lasso_model_selection.py
@@ -66,6 +66,7 @@
 
 ##############################################################################
 # LassoLarsIC: least angle regression with BIC/AIC criterion
+# ----------------------------------------------------------
 
 model_bic = LassoLarsIC(criterion='bic')
 t1 = time.time()
@@ -98,6 +99,7 @@ def plot_ic_criterion(model, name, color):
 
 ##############################################################################
 # LassoCV: coordinate descent
+# ---------------------------
 
 # Compute paths
 print("Computing regularization path using the coordinate descent lasso...")
@@ -127,6 +129,7 @@ def plot_ic_criterion(model, name, color):
 
 ##############################################################################
 # LassoLarsCV: least angle regression
+# -----------------------------------
 
 # Compute paths
 print("Computing regularization path using the Lars lasso...")
diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py
index d1b17948c78e0..220811259f4ac 100644
--- a/examples/linear_model/plot_logistic_path.py
+++ b/examples/linear_model/plot_logistic_path.py
@@ -29,7 +29,6 @@
 
 X -= np.mean(X, 0)
 
-###############################################################################
 # Demo path functions
 
 cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)
diff --git a/examples/linear_model/plot_multi_task_lasso_support.py b/examples/linear_model/plot_multi_task_lasso_support.py
index ea17d752f94a0..58f315b401770 100644
--- a/examples/linear_model/plot_multi_task_lasso_support.py
+++ b/examples/linear_model/plot_multi_task_lasso_support.py
@@ -39,7 +39,6 @@
 coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
 coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_
 
-###############################################################################
 # Plot support and time series
 fig = plt.figure(figsize=(8, 5))
 plt.subplot(1, 2, 1)
diff --git a/examples/linear_model/plot_ols_3d.py b/examples/linear_model/plot_ols_3d.py
index 23dfa01d60ecc..e9af33303ad00 100644
--- a/examples/linear_model/plot_ols_3d.py
+++ b/examples/linear_model/plot_ols_3d.py
@@ -37,7 +37,6 @@
 ols.fit(X_train, y_train)
 
 
-###############################################################################
 # Plot the figure
 def plot_figs(fig_num, elev, azim, X_train, clf):
     fig = plt.figure(fig_num, figsize=(4, 3))
diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py
index 1f2c475f78b7d..6164932996b08 100644
--- a/examples/linear_model/plot_ridge_path.py
+++ b/examples/linear_model/plot_ridge_path.py
@@ -39,7 +39,6 @@
 X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
 y = np.ones(10)
 
-###############################################################################
 # Compute paths
 
 n_alphas = 200
@@ -51,7 +50,6 @@
     ridge.fit(X, y)
     coefs.append(ridge.coef_)
 
-###############################################################################
 # Display results
 
 ax = plt.gca()
diff --git a/examples/linear_model/plot_theilsen.py b/examples/linear_model/plot_theilsen.py
index 747ac63e6a205..e87b64d3c12c0 100644
--- a/examples/linear_model/plot_theilsen.py
+++ b/examples/linear_model/plot_theilsen.py
@@ -51,7 +51,6 @@
 colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', 'RANSAC': 'lightgreen'}
 lw = 2
 
-##############################################################################
 # Outliers only in the y direction
 
 np.random.seed(0)
@@ -80,7 +79,6 @@
 plt.legend(loc='upper left')
 plt.title("Corrupt y")
 
-##############################################################################
 # Outliers in the X direction
 
 np.random.seed(0)
diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index daf82718d42e1..8622b1b021e52 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -67,7 +67,6 @@
                     format='%(asctime)s %(levelname)s %(message)s')
 
 
-###############################################################################
 # Load some categories from the training set
 categories = [
     'alt.atheism',
@@ -84,8 +83,7 @@
 print("%d categories" % len(data.target_names))
 print()
 
-###############################################################################
-# define a pipeline combining a text feature extractor with a simple
+# Define a pipeline combining a text feature extractor with a simple
 # classifier
 pipeline = Pipeline([
     ('vect', CountVectorizer()),
diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 366aa0acbee06..6ecae65877fce 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -40,10 +40,9 @@
 from sklearn.metrics import roc_curve, auc
 from sklearn.model_selection import StratifiedKFold
 
-###############################################################################
 # Data IO and generation
 
-# import some data to play with
+# Import some data to play with
 iris = datasets.load_iris()
 X = iris.data
 y = iris.target
@@ -54,7 +53,6 @@
 random_state = np.random.RandomState(0)
 X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
 
-###############################################################################
 # Classification and ROC analysis
 
 # Run classifier with cross-validation and plot ROC curves
diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py
index 9002a0a3a5f30..26e64f79660d9 100644
--- a/examples/model_selection/plot_train_error_vs_test_error.py
+++ b/examples/model_selection/plot_train_error_vs_test_error.py
@@ -19,7 +19,6 @@
 import numpy as np
 from sklearn import linear_model
 
-###############################################################################
 # Generate sample data
 n_samples_train, n_samples_test, n_features = 75, 150, 500
 np.random.seed(0)
@@ -32,7 +31,6 @@
 X_train, X_test = X[:n_samples_train], X[n_samples_train:]
 y_train, y_test = y[:n_samples_train], y[n_samples_train:]
 
-###############################################################################
 # Compute train and test errors
 alphas = np.logspace(-5, 1, 60)
 enet = linear_model.ElasticNet(l1_ratio=0.7)
@@ -52,7 +50,6 @@
 enet.set_params(alpha=alpha_optim)
 coef_ = enet.fit(X, y).coef_
 
-###############################################################################
 # Plot results functions
 
 import matplotlib.pyplot as plt
diff --git a/examples/neighbors/plot_regression.py b/examples/neighbors/plot_regression.py
index c664d7f173b0e..89c730ec877a8 100644
--- a/examples/neighbors/plot_regression.py
+++ b/examples/neighbors/plot_regression.py
@@ -16,7 +16,6 @@
 # License: BSD 3 clause (C) INRIA
 
 
-###############################################################################
 # Generate sample data
 import numpy as np
 import matplotlib.pyplot as plt
@@ -30,7 +29,6 @@
 # Add noise to targets
 y[::5] += 1 * (0.5 - np.random.rand(8))
 
-###############################################################################
 # Fit regression model
 n_neighbors = 5
 
diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py
index 2b9b15fe3d966..0fd6a075353cd 100644
--- a/examples/neural_networks/plot_rbm_logistic_classification.py
+++ b/examples/neural_networks/plot_rbm_logistic_classification.py
@@ -42,7 +42,6 @@
 from sklearn.pipeline import Pipeline
 
 
-###############################################################################
 # Setting up
 
 def nudge_dataset(X, Y):
@@ -91,7 +90,6 @@ def nudge_dataset(X, Y):
 
 classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
 
-###############################################################################
 # Training
 
 # Hyper-parameters. These were set by cross-validation,
@@ -111,7 +109,6 @@ def nudge_dataset(X, Y):
 logistic_classifier = linear_model.LogisticRegression(C=100.0)
 logistic_classifier.fit(X_train, Y_train)
 
-###############################################################################
 # Evaluation
 
 print()
@@ -125,7 +122,6 @@ def nudge_dataset(X, Y):
         Y_test,
         logistic_classifier.predict(X_test))))
 
-###############################################################################
 # Plotting
 
 plt.figure(figsize=(4.2, 4))
diff --git a/examples/plot_isotonic_regression.py b/examples/plot_isotonic_regression.py
index 4ae207ccedcfd..bac13ac414903 100644
--- a/examples/plot_isotonic_regression.py
+++ b/examples/plot_isotonic_regression.py
@@ -30,7 +30,6 @@
 rs = check_random_state(0)
 y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
 
-###############################################################################
 # Fit IsotonicRegression and LinearRegression models
 
 ir = IsotonicRegression()
@@ -40,8 +39,7 @@
 lr = LinearRegression()
 lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression
 
-###############################################################################
-# plot result
+# Plot result
 
 segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
 lc = LineCollection(segments, zorder=0)
diff --git a/examples/semi_supervised/plot_label_propagation_digits.py b/examples/semi_supervised/plot_label_propagation_digits.py
index 72da021374ad9..1590bf0bf9120 100644
--- a/examples/semi_supervised/plot_label_propagation_digits.py
+++ b/examples/semi_supervised/plot_label_propagation_digits.py
@@ -45,11 +45,10 @@ class will be very good.
 
 unlabeled_set = indices[n_labeled_points:]
 
-# shuffle everything around
+# Shuffle everything around
 y_train = np.copy(y)
 y_train[unlabeled_set] = -1
 
-###############################################################################
 # Learn with LabelSpreading
 lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
 lp_model.fit(X, y_train)
@@ -66,14 +65,13 @@ class will be very good.
 print("Confusion matrix")
 print(cm)
 
-# calculate uncertainty values for each transduced distribution
+# Calculate uncertainty values for each transduced distribution
 pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
 
-# pick the top 10 most uncertain labels
+# Pick the top 10 most uncertain labels
 uncertainty_index = np.argsort(pred_entropies)[-10:]
 
-###############################################################################
-# plot
+# Plot
 f = plt.figure(figsize=(7, 5))
 for index, image_index in enumerate(uncertainty_index):
     image = images[image_index]
diff --git a/examples/semi_supervised/plot_label_propagation_structure.py b/examples/semi_supervised/plot_label_propagation_structure.py
index 2632247984b24..8a1fa6e24c172 100644
--- a/examples/semi_supervised/plot_label_propagation_structure.py
+++ b/examples/semi_supervised/plot_label_propagation_structure.py
@@ -28,12 +28,10 @@
 labels[0] = outer
 labels[-1] = inner
 
-###############################################################################
 # Learn with LabelSpreading
 label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=1.0)
 label_spread.fit(X, labels)
 
-###############################################################################
 # Plot output labels
 output_labels = label_spread.transduction_
 plt.figure(figsize=(8.5, 4))
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index 9bbca6683ce95..045f7cf245b99 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -91,7 +91,6 @@ def __call__(self, value, clip=None):
         x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
         return np.ma.masked_array(np.interp(value, x, y))
 
-##############################################################################
 # Load and prepare data set
 #
 # dataset for grid search
@@ -118,7 +117,6 @@ def __call__(self, value, clip=None):
 X = scaler.fit_transform(X)
 X_2d = scaler.fit_transform(X_2d)
 
-##############################################################################
 # Train classifiers
 #
 # For an initial search, a logarithmic grid with basis
@@ -147,8 +145,7 @@ def __call__(self, value, clip=None):
         clf.fit(X_2d, y_2d)
         classifiers.append((C, gamma, clf))
 
-##############################################################################
-# visualization
+# Visualization
 #
 # draw visualization of parameter effects
 
diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py
index 01938efd593ac..d8b54cd306c1d 100644
--- a/examples/svm/plot_svm_anova.py
+++ b/examples/svm/plot_svm_anova.py
@@ -14,7 +14,6 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
 
-###############################################################################
 # Import some data to play with
 digits = datasets.load_digits()
 y = digits.target
@@ -26,7 +25,6 @@
 # add 200 non-informative features
 X = np.hstack((X, 2 * np.random.random((n_samples, 200))))
 
-###############################################################################
 # Create a feature-selection transform and an instance of SVM that we
 # combine together to have an full-blown estimator
 
@@ -34,7 +32,6 @@
 
 clf = Pipeline([('anova', transform), ('svc', svm.SVC(C=1.0))])
 
-###############################################################################
 # Plot the cross-validation score as a function of percentile of features
 score_means = list()
 score_stds = list()
diff --git a/examples/svm/plot_svm_regression.py b/examples/svm/plot_svm_regression.py
index 15a744e2aa8ca..0093bbcea2a05 100644
--- a/examples/svm/plot_svm_regression.py
+++ b/examples/svm/plot_svm_regression.py
@@ -12,16 +12,13 @@
 from sklearn.svm import SVR
 import matplotlib.pyplot as plt
 
-###############################################################################
 # Generate sample data
 X = np.sort(5 * np.random.rand(40, 1), axis=0)
 y = np.sin(X).ravel()
 
-###############################################################################
 # Add noise to targets
 y[::5] += 3 * (0.5 - np.random.rand(8))
 
-###############################################################################
 # Fit regression model
 svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
 svr_lin = SVR(kernel='linear', C=1e3)
@@ -30,8 +27,7 @@
 y_lin = svr_lin.fit(X, y).predict(X)
 y_poly = svr_poly.fit(X, y).predict(X)
 
-###############################################################################
-# look at the results
+# Look at the results
 lw = 2
 plt.scatter(X, y, color='darkorange', label='data')
 plt.hold('on')
diff --git a/examples/text/document_classification_20newsgroups.py b/examples/text/document_classification_20newsgroups.py
index f34bbd10cbe55..250aa8429ec82 100644
--- a/examples/text/document_classification_20newsgroups.py
+++ b/examples/text/document_classification_20newsgroups.py
@@ -100,7 +100,6 @@ def is_interactive():
 print()
 
 
-###############################################################################
 # Load some categories from the training set
 if opts.all_categories:
     categories = None
@@ -201,7 +200,6 @@ def trim(s):
     return s if len(s) <= 80 else s[:77] + "..."
 
 
-###############################################################################
 # Benchmark classifiers
 def benchmark(clf):
     print('_' * 80)
diff --git a/examples/text/document_clustering.py b/examples/text/document_clustering.py
index 29725cc7ccfb4..ba7a9a8a1daf1 100644
--- a/examples/text/document_clustering.py
+++ b/examples/text/document_clustering.py
@@ -114,7 +114,6 @@ def is_interactive():
     sys.exit(1)
 
 
-###############################################################################
 # Load some categories from the training set
 categories = [
     'alt.atheism',
@@ -183,7 +182,6 @@ def is_interactive():
     print()
 
 
-###############################################################################
 # Do the actual clustering
 
 if opts.minibatch:

From 7bbf4ae47c8610fa5e88d70f345c52eefd5cc42d Mon Sep 17 00:00:00 2001
From: plagree <paul.lagree@gmail.com>
Date: Tue, 20 Jun 2017 10:07:45 +0200
Subject: [PATCH 3/4] Remove last notebook style example

---
 examples/linear_model/plot_lasso_model_selection.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py
index 7a36ba0672336..6b58b55956162 100644
--- a/examples/linear_model/plot_lasso_model_selection.py
+++ b/examples/linear_model/plot_lasso_model_selection.py
@@ -64,9 +64,8 @@
 # normalize data as done by Lars to allow for comparison
 X /= np.sqrt(np.sum(X ** 2, axis=0))
 
-##############################################################################
+# #############################################################################
 # LassoLarsIC: least angle regression with BIC/AIC criterion
-# ----------------------------------------------------------
 
 model_bic = LassoLarsIC(criterion='bic')
 t1 = time.time()
@@ -97,9 +96,8 @@ def plot_ic_criterion(model, name, color):
 plt.title('Information-criterion for model selection (training time %.3fs)'
           % t_bic)
 
-##############################################################################
+# #############################################################################
 # LassoCV: coordinate descent
-# ---------------------------
 
 # Compute paths
 print("Computing regularization path using the coordinate descent lasso...")
@@ -127,9 +125,8 @@ def plot_ic_criterion(model, name, color):
 plt.axis('tight')
 plt.ylim(ymin, ymax)
 
-##############################################################################
+# #############################################################################
 # LassoLarsCV: least angle regression
-# -----------------------------------
 
 # Compute paths
 print("Computing regularization path using the Lars lasso...")

From 895bc522a0c6a751055cb4011f65ee08641c84f0 Mon Sep 17 00:00:00 2001
From: plagree <paul.lagree@gmail.com>
Date: Tue, 20 Jun 2017 11:36:38 +0200
Subject: [PATCH 4/4] Prefix section separators with "# " to avoid notebook style

---
 examples/applications/plot_face_recognition.py              | 6 ++++++
 examples/applications/plot_model_complexity_influence.py    | 2 ++
 examples/applications/plot_prediction_latency.py            | 2 ++
 examples/applications/plot_stock_market.py                  | 5 +++++
 examples/applications/wikipedia_principal_eigenvector.py    | 2 ++
 examples/calibration/plot_calibration.py                    | 1 +
 examples/calibration/plot_compare_calibration.py            | 1 +
 examples/classification/plot_lda_qda.py                     | 3 +++
 examples/cluster/plot_affinity_propagation.py               | 3 +++
 examples/cluster/plot_dbscan.py                             | 3 +++
 examples/cluster/plot_dict_face_patches.py                  | 2 ++
 examples/cluster/plot_face_ward_segmentation.py             | 4 ++++
 .../plot_feature_agglomeration_vs_univariate_selection.py   | 3 +++
 examples/cluster/plot_kmeans_digits.py                      | 1 +
 examples/cluster/plot_mean_shift.py                         | 3 +++
 examples/cluster/plot_mini_batch_kmeans.py                  | 4 ++++
 examples/cluster/plot_segmentation_toy.py                   | 2 ++
 examples/cluster/plot_ward_structured_vs_unstructured.py    | 6 ++++++
 examples/covariance/plot_covariance_estimation.py           | 4 ++++
 examples/covariance/plot_mahalanobis_distances.py           | 1 +
 examples/covariance/plot_sparse_cov.py                      | 3 +++
 .../cross_decomposition/plot_compare_cross_decomposition.py | 4 ++++
 examples/decomposition/plot_faces_decomposition.py          | 4 ++++
 examples/decomposition/plot_ica_blind_source_separation.py  | 2 ++
 examples/decomposition/plot_ica_vs_pca.py                   | 2 ++
 examples/decomposition/plot_image_denoising.py              | 3 +++
 examples/decomposition/plot_pca_3d.py                       | 2 ++
 examples/decomposition/plot_pca_vs_fa_model_selection.py    | 2 ++
 examples/ensemble/plot_gradient_boosting_regression.py      | 4 ++++
 examples/exercises/plot_cv_diabetes.py                      | 1 +
 examples/feature_selection/plot_feature_selection.py        | 3 +++
 .../plot_permutation_test_for_classification.py             | 2 ++
 examples/linear_model/plot_ard.py                           | 3 +++
 examples/linear_model/plot_bayesian_ridge.py                | 3 +++
 examples/linear_model/plot_lasso_and_elasticnet.py          | 3 +++
 examples/linear_model/plot_lasso_dense_vs_sparse_data.py    | 2 ++
 examples/linear_model/plot_logistic_path.py                 | 1 +
 examples/linear_model/plot_multi_task_lasso_support.py      | 1 +
 examples/linear_model/plot_ols_3d.py                        | 1 +
 examples/linear_model/plot_ridge_path.py                    | 2 ++
 examples/linear_model/plot_theilsen.py                      | 2 ++
 .../model_selection/grid_search_text_feature_extraction.py  | 2 ++
 examples/model_selection/plot_roc_crossval.py               | 2 ++
 examples/model_selection/plot_train_error_vs_test_error.py  | 3 +++
 examples/neighbors/plot_regression.py                       | 2 ++
 .../neural_networks/plot_rbm_logistic_classification.py     | 4 ++++
 examples/plot_isotonic_regression.py                        | 2 ++
 examples/plot_kernel_ridge_regression.py                    | 3 +++
 examples/semi_supervised/plot_label_propagation_digits.py   | 5 +++++
 .../semi_supervised/plot_label_propagation_structure.py     | 2 ++
 examples/svm/plot_rbf_parameters.py                         | 3 +++
 examples/svm/plot_svm_anova.py                              | 3 +++
 examples/svm/plot_svm_regression.py                         | 4 ++++
 examples/text/document_classification_20newsgroups.py       | 2 ++
 examples/text/document_clustering.py                        | 2 ++
 55 files changed, 147 insertions(+)

diff --git a/examples/applications/plot_face_recognition.py b/examples/applications/plot_face_recognition.py
index 123c4b4bdd9b7..13a38d13bc00c 100644
--- a/examples/applications/plot_face_recognition.py
+++ b/examples/applications/plot_face_recognition.py
@@ -48,6 +48,7 @@
 logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
 
 
+# #############################################################################
 # Download the data, if not already on disk and load it as numpy arrays
 
 lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
@@ -71,6 +72,7 @@
 print("n_classes: %d" % n_classes)
 
 
+# #############################################################################
 # Split into a training set and a test set using a stratified k fold
 
 # split into a training and testing set
@@ -78,6 +80,7 @@
     X, y, test_size=0.25, random_state=42)
 
 
+# #############################################################################
 # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
 # dataset): unsupervised feature extraction / dimensionality reduction
 n_components = 150
@@ -98,6 +101,7 @@
 print("done in %0.3fs" % (time() - t0))
 
 
+# #############################################################################
 # Train a SVM classification model
 
 print("Fitting the classifier to the training set")
@@ -111,6 +115,7 @@
 print(clf.best_estimator_)
 
 
+# #############################################################################
 # Quantitative evaluation of the model quality on the test set
 
 print("Predicting people's names on the test set")
@@ -122,6 +127,7 @@
 print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
 
 
+# #############################################################################
 # Qualitative evaluation of the predictions using matplotlib
 
 def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
diff --git a/examples/applications/plot_model_complexity_influence.py b/examples/applications/plot_model_complexity_influence.py
index f7df14f63b4b7..359711b995b14 100644
--- a/examples/applications/plot_model_complexity_influence.py
+++ b/examples/applications/plot_model_complexity_influence.py
@@ -34,6 +34,7 @@
 from sklearn.linear_model.stochastic_gradient import SGDClassifier
 from sklearn.metrics import hamming_loss
 
+# #############################################################################
 # Routines
 
 
@@ -121,6 +122,7 @@ def _count_nonzero_coefficients(estimator):
     a = estimator.coef_.toarray()
     return np.count_nonzero(a)
 
+# #############################################################################
 # Main code
 regression_data = generate_data('regression')
 classification_data = generate_data('classification', sparse=True)
diff --git a/examples/applications/plot_prediction_latency.py b/examples/applications/plot_prediction_latency.py
index 156a5d33ee2af..71321b4d39d6e 100644
--- a/examples/applications/plot_prediction_latency.py
+++ b/examples/applications/plot_prediction_latency.py
@@ -266,10 +266,12 @@ def plot_benchmark_throughput(throughputs, configuration):
     plt.show()
 
 
+# #############################################################################
 # Main code
 
 start_time = time.time()
 
+# #############################################################################
 # Benchmark bulk/atomic prediction speed for various regressors
 configuration = {
     'n_train': int(1e3),
diff --git a/examples/applications/plot_stock_market.py b/examples/applications/plot_stock_market.py
index bd4fd1f5fc9c7..c7d627e8148ef 100644
--- a/examples/applications/plot_stock_market.py
+++ b/examples/applications/plot_stock_market.py
@@ -74,6 +74,7 @@
 from sklearn import cluster, covariance, manifold
 
 
+# #############################################################################
 # Retrieve the data from Internet
 
 def quotes_historical_google(symbol, date1, date2):
@@ -188,6 +189,7 @@ def quotes_historical_google(symbol, date1, date2):
 variation = close_prices - open_prices
 
 
+# #############################################################################
 # Learn a graphical structure from the correlations
 edge_model = covariance.GraphLassoCV()
 
@@ -197,6 +199,7 @@ def quotes_historical_google(symbol, date1, date2):
 X /= X.std(axis=0)
 edge_model.fit(X)
 
+# #############################################################################
 # Cluster using affinity propagation
 
 _, labels = cluster.affinity_propagation(edge_model.covariance_)
@@ -205,6 +208,7 @@ def quotes_historical_google(symbol, date1, date2):
 for i in range(n_labels + 1):
     print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
 
+# #############################################################################
 # Find a low-dimension embedding for visualization: find the best position of
 # the nodes (the stocks) on a 2D plane
 
@@ -216,6 +220,7 @@ def quotes_historical_google(symbol, date1, date2):
 
 embedding = node_position_model.fit_transform(X.T).T
 
+# #############################################################################
 # Visualization
 plt.figure(1, facecolor='w', figsize=(10, 8))
 plt.clf()
diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py
index c0b5529d2e3f6..175c10594440e 100644
--- a/examples/applications/wikipedia_principal_eigenvector.py
+++ b/examples/applications/wikipedia_principal_eigenvector.py
@@ -52,6 +52,7 @@
 
 print(__doc__)
 
+# #############################################################################
 # Where to download the data, if not already on disk
 redirects_url = "http://downloads.dbpedia.org/3.5.1/en/redirects_en.nt.bz2"
 redirects_filename = redirects_url.rsplit("/", 1)[1]
@@ -72,6 +73,7 @@
         print()
 
 
+# #############################################################################
 # Loading the redirect files
 
 memory = Memory(cachedir=".")
diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py
index 174812be4d1e6..c6e3c0111b708 100644
--- a/examples/calibration/plot_calibration.py
+++ b/examples/calibration/plot_calibration.py
@@ -83,6 +83,7 @@
 clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid, sw_test)
 print("With sigmoid calibration: %1.3f" % clf_sigmoid_score)
 
+# #############################################################################
 # Plot the data and the predicted probabilities
 plt.figure()
 y_unique = np.unique(y)
diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py
index 28624090f5da6..d935bce4f5bc2 100644
--- a/examples/calibration/plot_compare_calibration.py
+++ b/examples/calibration/plot_compare_calibration.py
@@ -81,6 +81,7 @@
 rfc = RandomForestClassifier(n_estimators=100)
 
 
+# #############################################################################
 # Plot calibration plots
 
 plt.figure(figsize=(10, 10))
diff --git a/examples/classification/plot_lda_qda.py b/examples/classification/plot_lda_qda.py
index 74744ab8348f9..c76ffc1f2c11e 100644
--- a/examples/classification/plot_lda_qda.py
+++ b/examples/classification/plot_lda_qda.py
@@ -20,6 +20,7 @@ class has its own standard deviation with QDA.
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 
+# #############################################################################
 # Colormap
 cmap = colors.LinearSegmentedColormap(
     'red_blue_classes',
@@ -29,6 +30,7 @@ class has its own standard deviation with QDA.
 plt.cm.register_cmap(cmap=cmap)
 
 
+# #############################################################################
 # Generate datasets
 def dataset_fixed_cov():
     '''Generate 2 Gaussians samples with the same covariance matrix'''
@@ -52,6 +54,7 @@ def dataset_cov():
     return X, y
 
 
+# #############################################################################
 # Plot functions
 def plot_data(lda, X, y, y_pred, fig_index):
     splot = plt.subplot(2, 2, fig_index)
diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py
index edce07c7206e9..2c8fc3acc3936 100644
--- a/examples/cluster/plot_affinity_propagation.py
+++ b/examples/cluster/plot_affinity_propagation.py
@@ -14,11 +14,13 @@
 from sklearn import metrics
 from sklearn.datasets.samples_generator import make_blobs
 
+# #############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,
                             random_state=0)
 
+# #############################################################################
 # Compute Affinity Propagation
 af = AffinityPropagation(preference=-50).fit(X)
 cluster_centers_indices = af.cluster_centers_indices_
@@ -37,6 +39,7 @@
 print("Silhouette Coefficient: %0.3f"
       % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
 
+# #############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 from itertools import cycle
diff --git a/examples/cluster/plot_dbscan.py b/examples/cluster/plot_dbscan.py
index 83c07bcbb892a..8b116ed2cfbb0 100644
--- a/examples/cluster/plot_dbscan.py
+++ b/examples/cluster/plot_dbscan.py
@@ -17,6 +17,7 @@
 from sklearn.preprocessing import StandardScaler
 
 
+# #############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
@@ -24,6 +25,7 @@
 
 X = StandardScaler().fit_transform(X)
 
+# #############################################################################
 # Compute DBSCAN
 db = DBSCAN(eps=0.3, min_samples=10).fit(X)
 core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
@@ -44,6 +46,7 @@
 print("Silhouette Coefficient: %0.3f"
       % metrics.silhouette_score(X, labels))
 
+# #############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 
diff --git a/examples/cluster/plot_dict_face_patches.py b/examples/cluster/plot_dict_face_patches.py
index ed837dbaa0cfd..ac2fde3e2cc6a 100644
--- a/examples/cluster/plot_dict_face_patches.py
+++ b/examples/cluster/plot_dict_face_patches.py
@@ -32,6 +32,7 @@
 
 faces = datasets.fetch_olivetti_faces()
 
+# #############################################################################
 # Learn the dictionary of images
 
 print('Learning the dictionary... ')
@@ -65,6 +66,7 @@
 dt = time.time() - t0
 print('done in %.2fs.' % dt)
 
+# #############################################################################
 # Plot the results
 plt.figure(figsize=(4.2, 4))
 for i, patch in enumerate(kmeans.cluster_centers_):
diff --git a/examples/cluster/plot_face_ward_segmentation.py b/examples/cluster/plot_face_ward_segmentation.py
index 27b464a244641..1490b6a110388 100644
--- a/examples/cluster/plot_face_ward_segmentation.py
+++ b/examples/cluster/plot_face_ward_segmentation.py
@@ -25,6 +25,7 @@
 from sklearn.cluster import AgglomerativeClustering
 
 
+# #############################################################################
 # Generate data
 try:  # SciPy >= 0.16 have face in misc
     from scipy.misc import face
@@ -37,9 +38,11 @@
 
 X = np.reshape(face, (-1, 1))
 
+# #############################################################################
 # Define the structure A of the data. Pixels connected to their neighbors.
 connectivity = grid_to_graph(*face.shape)
 
+# #############################################################################
 # Compute clustering
 print("Compute structured hierarchical clustering...")
 st = time.time()
@@ -52,6 +55,7 @@
 print("Number of pixels: ", label.size)
 print("Number of clusters: ", np.unique(label).size)
 
+# #############################################################################
 # Plot the results on an image
 plt.figure(figsize=(5, 5))
 plt.imshow(face, cmap=plt.cm.gray)
diff --git a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
index d6f65d83f959d..0801899f70349 100644
--- a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
+++ b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
@@ -34,6 +34,7 @@
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
 
+# #############################################################################
 # Generate data
 n_samples = 200
 size = 40  # image size
@@ -57,6 +58,7 @@
 noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
 y += noise_coef * noise  # add noise
 
+# #############################################################################
 # Compute the coefs of a Bayesian Ridge with GridSearch
 cv = KFold(2)  # cross-validation generator for model selection
 ridge = BayesianRidge()
@@ -86,6 +88,7 @@
 coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
 coef_selection_ = coef_.reshape(size, size)
 
+# #############################################################################
 # Inverse the transformation to plot the results on an image
 plt.close('all')
 plt.figure(figsize=(7.3, 2.7))
diff --git a/examples/cluster/plot_kmeans_digits.py b/examples/cluster/plot_kmeans_digits.py
index b08fd6b854fc4..f38eb8b4be416 100644
--- a/examples/cluster/plot_kmeans_digits.py
+++ b/examples/cluster/plot_kmeans_digits.py
@@ -84,6 +84,7 @@ def bench_k_means(estimator, name, data):
               data=data)
 print(82 * '_')
 
+# #############################################################################
 # Visualize the results on PCA-reduced data
 
 reduced_data = PCA(n_components=2).fit_transform(data)
diff --git a/examples/cluster/plot_mean_shift.py b/examples/cluster/plot_mean_shift.py
index 2e2c75b962688..730c820c48345 100644
--- a/examples/cluster/plot_mean_shift.py
+++ b/examples/cluster/plot_mean_shift.py
@@ -16,10 +16,12 @@
 from sklearn.cluster import MeanShift, estimate_bandwidth
 from sklearn.datasets.samples_generator import make_blobs
 
+# #############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, _ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)
 
+# #############################################################################
 # Compute clustering with MeanShift
 
 # The following bandwidth can be automatically detected using
@@ -35,6 +37,7 @@
 
 print("number of estimated clusters : %d" % n_clusters_)
 
+# #############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 from itertools import cycle
diff --git a/examples/cluster/plot_mini_batch_kmeans.py b/examples/cluster/plot_mini_batch_kmeans.py
index 9c650be0a44e2..9f84566a3c3a7 100644
--- a/examples/cluster/plot_mini_batch_kmeans.py
+++ b/examples/cluster/plot_mini_batch_kmeans.py
@@ -23,6 +23,7 @@
 from sklearn.metrics.pairwise import pairwise_distances_argmin
 from sklearn.datasets.samples_generator import make_blobs
 
+# #############################################################################
 # Generate sample data
 np.random.seed(0)
 
@@ -31,6 +32,7 @@
 n_clusters = len(centers)
 X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)
 
+# #############################################################################
 # Compute clustering with Means
 
 k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
@@ -38,6 +40,7 @@
 k_means.fit(X)
 t_batch = time.time() - t0
 
+# #############################################################################
 # Compute clustering with MiniBatchKMeans
 
 mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
@@ -46,6 +49,7 @@
 mbk.fit(X)
 t_mini_batch = time.time() - t0
 
+# #############################################################################
 # Plot result
 
 fig = plt.figure(figsize=(8, 3))
diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py
index fe24304e5b8cc..aa66c811eda8d 100644
--- a/examples/cluster/plot_segmentation_toy.py
+++ b/examples/cluster/plot_segmentation_toy.py
@@ -51,6 +51,7 @@
 circle3 = (x - center3[0]) ** 2 + (y - center3[1]) ** 2 < radius3 ** 2
 circle4 = (x - center4[0]) ** 2 + (y - center4[1]) ** 2 < radius4 ** 2
 
+# #############################################################################
 # 4 circles
 img = circle1 + circle2 + circle3 + circle4
 
@@ -79,6 +80,7 @@
 plt.matshow(img)
 plt.matshow(label_im)
 
+# #############################################################################
 # 2 circles
 img = circle1 + circle2
 mask = img.astype(bool)
diff --git a/examples/cluster/plot_ward_structured_vs_unstructured.py b/examples/cluster/plot_ward_structured_vs_unstructured.py
index 102a78ba82eea..fa804d1e50335 100644
--- a/examples/cluster/plot_ward_structured_vs_unstructured.py
+++ b/examples/cluster/plot_ward_structured_vs_unstructured.py
@@ -33,6 +33,7 @@
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.datasets.samples_generator import make_swiss_roll
 
+# #############################################################################
 # Generate data (swiss roll dataset)
 n_samples = 1500
 noise = 0.05
@@ -40,6 +41,7 @@
 # Make it thinner
 X[:, 1] *= .5
 
+# #############################################################################
 # Compute clustering
 print("Compute unstructured hierarchical clustering...")
 st = time.time()
@@ -49,6 +51,7 @@
 print("Elapsed time: %.2fs" % elapsed_time)
 print("Number of points: %i" % label.size)
 
+# #############################################################################
 # Plot result
 fig = plt.figure()
 ax = p3.Axes3D(fig)
@@ -59,10 +62,12 @@
 plt.title('Without connectivity constraints (time %.2fs)' % elapsed_time)
 
 
+# #############################################################################
 # Define the structure A of the data. Here a 10 nearest neighbors
 from sklearn.neighbors import kneighbors_graph
 connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
 
+# #############################################################################
 # Compute clustering
 print("Compute structured hierarchical clustering...")
 st = time.time()
@@ -73,6 +78,7 @@
 print("Elapsed time: %.2fs" % elapsed_time)
 print("Number of points: %i" % label.size)
 
+# #############################################################################
 # Plot result
 fig = plt.figure()
 ax = p3.Axes3D(fig)
diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py
index 8a663fec43fc8..adb57f003cfbb 100644
--- a/examples/covariance/plot_covariance_estimation.py
+++ b/examples/covariance/plot_covariance_estimation.py
@@ -52,6 +52,7 @@
 from sklearn.model_selection import GridSearchCV
 
 
+# #############################################################################
 # Generate sample data
 n_features, n_samples = 40, 20
 np.random.seed(42)
@@ -63,6 +64,7 @@
 X_train = np.dot(base_X_train, coloring_matrix)
 X_test = np.dot(base_X_test, coloring_matrix)
 
+# #############################################################################
 # Compute the likelihood on test data
 
 # spanning a range of possible shrinkage coefficient values
@@ -76,6 +78,7 @@
 emp_cov = empirical_covariance(X_train)
 loglik_real = -log_likelihood(emp_cov, linalg.inv(real_cov))
 
+# #############################################################################
 # Compare different approaches to setting the parameter
 
 # GridSearch for an optimal shrinkage coefficient
@@ -91,6 +94,7 @@
 oa = OAS()
 loglik_oa = oa.fit(X_train).score(X_test)
 
+# #############################################################################
 # Plot results
 fig = plt.figure()
 plt.title("Regularized covariance: likelihood and shrinkage coefficient")
diff --git a/examples/covariance/plot_mahalanobis_distances.py b/examples/covariance/plot_mahalanobis_distances.py
index 166252fc7f61f..21f295ce58305 100644
--- a/examples/covariance/plot_mahalanobis_distances.py
+++ b/examples/covariance/plot_mahalanobis_distances.py
@@ -78,6 +78,7 @@
 # compare estimators learnt from the full data set with true parameters
 emp_cov = EmpiricalCovariance().fit(X)
 
+# #############################################################################
 # Display results
 fig = plt.figure()
 plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
diff --git a/examples/covariance/plot_sparse_cov.py b/examples/covariance/plot_sparse_cov.py
index 8d42e7aaef929..1d6782cb43ef8 100644
--- a/examples/covariance/plot_sparse_cov.py
+++ b/examples/covariance/plot_sparse_cov.py
@@ -59,6 +59,7 @@
 from sklearn.covariance import GraphLassoCV, ledoit_wolf
 import matplotlib.pyplot as plt
 
+# #############################################################################
 # Generate the data
 n_samples = 60
 n_features = 20
@@ -78,6 +79,7 @@
 X -= X.mean(axis=0)
 X /= X.std(axis=0)
 
+# #############################################################################
 # Estimate the covariance
 emp_cov = np.dot(X.T, X) / n_samples
 
@@ -89,6 +91,7 @@
 lw_cov_, _ = ledoit_wolf(X)
 lw_prec_ = linalg.inv(lw_cov_)
 
+# #############################################################################
 # Plot the results
 plt.figure(figsize=(10, 6))
 plt.subplots_adjust(left=0.02, right=0.98)
diff --git a/examples/cross_decomposition/plot_compare_cross_decomposition.py b/examples/cross_decomposition/plot_compare_cross_decomposition.py
index 65a2980d746e6..4a123c04b03a4 100644
--- a/examples/cross_decomposition/plot_compare_cross_decomposition.py
+++ b/examples/cross_decomposition/plot_compare_cross_decomposition.py
@@ -24,6 +24,7 @@
 import matplotlib.pyplot as plt
 from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA
 
+# #############################################################################
 # Dataset based latent variables model
 
 n = 500
@@ -45,6 +46,7 @@
 print("Corr(Y)")
 print(np.round(np.corrcoef(Y.T), 2))
 
+# #############################################################################
 # Canonical (symmetric) PLS
 
 # Transform data
@@ -104,6 +106,7 @@
 plt.yticks(())
 plt.show()
 
+# #############################################################################
 # PLS regression, with multivariate response, a.k.a. PLS2
 
 n = 1000
@@ -135,6 +138,7 @@
 print("Estimated betas")
 print(np.round(pls1.coef_, 1))
 
+# #############################################################################
 # CCA (PLS mode B with symmetric deflation)
 
 cca = CCA(n_components=2)
diff --git a/examples/decomposition/plot_faces_decomposition.py b/examples/decomposition/plot_faces_decomposition.py
index 2b84bbf4374b0..d29af6ad408fb 100644
--- a/examples/decomposition/plot_faces_decomposition.py
+++ b/examples/decomposition/plot_faces_decomposition.py
@@ -32,6 +32,7 @@
 image_shape = (64, 64)
 rng = RandomState(0)
 
+# #############################################################################
 # Load faces data
 dataset = fetch_olivetti_faces(shuffle=True, random_state=rng)
 faces = dataset.data
@@ -60,6 +61,7 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row):
         plt.yticks(())
     plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)
 
+# #############################################################################
 # List of the different estimators, whether to center and transpose the
 # problem, and whether the transformer uses the clustering API.
 estimators = [
@@ -99,10 +101,12 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row):
 ]
 
 
+# #############################################################################
 # Plot a sample of the input data
 
 plot_gallery("First centered Olivetti faces", faces_centered[:n_components])
 
+# #############################################################################
 # Do the estimation and plot it
 
 for name, estimator, center in estimators:
diff --git a/examples/decomposition/plot_ica_blind_source_separation.py b/examples/decomposition/plot_ica_blind_source_separation.py
index 31b3e8a560605..fb7689064dd06 100644
--- a/examples/decomposition/plot_ica_blind_source_separation.py
+++ b/examples/decomposition/plot_ica_blind_source_separation.py
@@ -21,6 +21,7 @@
 
 from sklearn.decomposition import FastICA, PCA
 
+# #############################################################################
 # Generate sample data
 np.random.seed(0)
 n_samples = 2000
@@ -50,6 +51,7 @@
 pca = PCA(n_components=3)
 H = pca.fit_transform(X)  # Reconstruct signals based on orthogonal components
 
+# #############################################################################
 # Plot results
 
 plt.figure()
diff --git a/examples/decomposition/plot_ica_vs_pca.py b/examples/decomposition/plot_ica_vs_pca.py
index cf9ce0925204a..f9ef968babeb1 100644
--- a/examples/decomposition/plot_ica_vs_pca.py
+++ b/examples/decomposition/plot_ica_vs_pca.py
@@ -37,6 +37,7 @@
 
 from sklearn.decomposition import PCA, FastICA
 
+# #############################################################################
 # Generate sample data
 rng = np.random.RandomState(42)
 S = rng.standard_t(1.5, size=(20000, 2))
@@ -56,6 +57,7 @@
 S_ica_ /= S_ica_.std(axis=0)
 
 
+# #############################################################################
 # Plot results
 
 def plot_samples(S, axis_list=None):
diff --git a/examples/decomposition/plot_image_denoising.py b/examples/decomposition/plot_image_denoising.py
index 68e62a9b6e305..33a394a856c91 100644
--- a/examples/decomposition/plot_image_denoising.py
+++ b/examples/decomposition/plot_image_denoising.py
@@ -74,6 +74,7 @@
 data /= np.std(data, axis=0)
 print('done in %.2fs.' % (time() - t0))
 
+# #############################################################################
 # Learn the dictionary from reference patches
 
 print('Learning the dictionary...')
@@ -96,6 +97,7 @@
 plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
 
 
+# #############################################################################
 # Display the distorted image
 
 def show_with_diff(image, reference, title):
@@ -120,6 +122,7 @@ def show_with_diff(image, reference, title):
 
 show_with_diff(distorted, face, 'Distorted image')
 
+# #############################################################################
 # Extract noisy patches and reconstruct them using the dictionary
 
 print('Extracting noisy patches... ')
diff --git a/examples/decomposition/plot_pca_3d.py b/examples/decomposition/plot_pca_3d.py
index 06592c0e6b221..d9db17ffaec39 100644
--- a/examples/decomposition/plot_pca_3d.py
+++ b/examples/decomposition/plot_pca_3d.py
@@ -26,6 +26,7 @@
 from scipy import stats
 
 
+# #############################################################################
 # Create the data
 
 e = np.exp(1)
@@ -54,6 +55,7 @@ def pdf(x):
 b /= norm
 
 
+# #############################################################################
 # Plot the figures
 def plot_figs(fig_num, elev, azim):
     fig = plt.figure(fig_num, figsize=(4, 3))
diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py
index 5601f9cdd7798..b858434d910e3 100644
--- a/examples/decomposition/plot_pca_vs_fa_model_selection.py
+++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py
@@ -39,6 +39,7 @@
 
 print(__doc__)
 
+# #############################################################################
 # Create the data
 
 n_samples, n_features, rank = 1000, 50, 10
@@ -54,6 +55,7 @@
 sigmas = sigma * rng.rand(n_features) + sigma / 2.
 X_hetero = X + rng.randn(n_samples, n_features) * sigmas
 
+# #############################################################################
 # Fit the models
 
 n_components = np.arange(0, n_features, 5)  # options for n_components
diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py
index 468c338f42f2f..9285f8dae0eea 100644
--- a/examples/ensemble/plot_gradient_boosting_regression.py
+++ b/examples/ensemble/plot_gradient_boosting_regression.py
@@ -22,6 +22,7 @@
 from sklearn.utils import shuffle
 from sklearn.metrics import mean_squared_error
 
+# #############################################################################
 # Load data
 boston = datasets.load_boston()
 X, y = shuffle(boston.data, boston.target, random_state=13)
@@ -30,6 +31,7 @@
 X_train, y_train = X[:offset], y[:offset]
 X_test, y_test = X[offset:], y[offset:]
 
+# #############################################################################
 # Fit regression model
 params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
           'learning_rate': 0.01, 'loss': 'ls'}
@@ -39,6 +41,7 @@
 mse = mean_squared_error(y_test, clf.predict(X_test))
 print("MSE: %.4f" % mse)
 
+# #############################################################################
 # Plot training deviance
 
 # compute test set deviance
@@ -58,6 +61,7 @@
 plt.xlabel('Boosting Iterations')
 plt.ylabel('Deviance')
 
+# #############################################################################
 # Plot feature importance
 feature_importance = clf.feature_importances_
 # make importances relative to max importance
diff --git a/examples/exercises/plot_cv_diabetes.py b/examples/exercises/plot_cv_diabetes.py
index 0b41315069b2b..76b0d81b8998c 100644
--- a/examples/exercises/plot_cv_diabetes.py
+++ b/examples/exercises/plot_cv_diabetes.py
@@ -52,6 +52,7 @@
 plt.axhline(np.max(scores), linestyle='--', color='.5')
 plt.xlim([alphas[0], alphas[-1]])
 
+# #############################################################################
 # Bonus: how much can you trust the selection of alpha?
 
 # To answer this question we use the LassoCV object that sets its alpha
diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py
index 73badff72be2e..59ed716660341 100644
--- a/examples/feature_selection/plot_feature_selection.py
+++ b/examples/feature_selection/plot_feature_selection.py
@@ -27,6 +27,7 @@
 from sklearn import datasets, svm
 from sklearn.feature_selection import SelectPercentile, f_classif
 
+# #############################################################################
 # Import some data to play with
 
 # The iris dataset
@@ -44,6 +45,7 @@
 
 X_indices = np.arange(X.shape[-1])
 
+# #############################################################################
 # Univariate feature selection with F-test for feature scoring
 # We use the default selection function: the 10% most significant features
 selector = SelectPercentile(f_classif, percentile=10)
@@ -54,6 +56,7 @@
         label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',
         edgecolor='black')
 
+# #############################################################################
 # Compare to the weights of an SVM
 clf = svm.SVC(kernel='linear')
 clf.fit(X, y)
diff --git a/examples/feature_selection/plot_permutation_test_for_classification.py b/examples/feature_selection/plot_permutation_test_for_classification.py
index 84b1c5a3fca4e..095f743d40803 100644
--- a/examples/feature_selection/plot_permutation_test_for_classification.py
+++ b/examples/feature_selection/plot_permutation_test_for_classification.py
@@ -25,6 +25,7 @@
 from sklearn import datasets
 
 
+# #############################################################################
 # Loading a dataset
 iris = datasets.load_iris()
 X = iris.data
@@ -46,6 +47,7 @@
 
 print("Classification score %s (pvalue : %s)" % (score, pvalue))
 
+# #############################################################################
 # View histogram of permutation scores
 plt.hist(permutation_scores, 20, label='Permutation scores',
          edgecolor='black')
diff --git a/examples/linear_model/plot_ard.py b/examples/linear_model/plot_ard.py
index 9947b110365cb..38c334a217df5 100644
--- a/examples/linear_model/plot_ard.py
+++ b/examples/linear_model/plot_ard.py
@@ -30,6 +30,7 @@
 
 from sklearn.linear_model import ARDRegression, LinearRegression
 
+# #############################################################################
 # Generating simulated data with Gaussian weights
 
 # Parameters of the example
@@ -50,6 +51,7 @@
 # Create the target
 y = np.dot(X, w) + noise
 
+# #############################################################################
 # Fit the ARD Regression
 clf = ARDRegression(compute_score=True)
 clf.fit(X, y)
@@ -57,6 +59,7 @@
 ols = LinearRegression()
 ols.fit(X, y)
 
+# #############################################################################
 # Plot the true weights, the estimated weights, the histogram of the
 # weights, and predictions with standard deviations
 plt.figure(figsize=(6, 5))
diff --git a/examples/linear_model/plot_bayesian_ridge.py b/examples/linear_model/plot_bayesian_ridge.py
index c0a4bd8cdcfe7..4359c421ea866 100644
--- a/examples/linear_model/plot_bayesian_ridge.py
+++ b/examples/linear_model/plot_bayesian_ridge.py
@@ -30,6 +30,7 @@
 
 from sklearn.linear_model import BayesianRidge, LinearRegression
 
+# #############################################################################
 # Generating simulated data with Gaussian weights
 np.random.seed(0)
 n_samples, n_features = 100, 100
@@ -47,6 +48,7 @@
 # Create the target
 y = np.dot(X, w) + noise
 
+# #############################################################################
 # Fit the Bayesian Ridge Regression and an OLS for comparison
 clf = BayesianRidge(compute_score=True)
 clf.fit(X, y)
@@ -54,6 +56,7 @@
 ols = LinearRegression()
 ols.fit(X, y)
 
+# #############################################################################
 # Plot true weights, estimated weights, histogram of the weights, and
 # predictions with standard deviations
 lw = 2
diff --git a/examples/linear_model/plot_lasso_and_elasticnet.py b/examples/linear_model/plot_lasso_and_elasticnet.py
index 1c385a162467b..350cac0a0ad95 100644
--- a/examples/linear_model/plot_lasso_and_elasticnet.py
+++ b/examples/linear_model/plot_lasso_and_elasticnet.py
@@ -15,6 +15,7 @@
 
 from sklearn.metrics import r2_score
 
+# #############################################################################
 # Generate some sparse data to play with
 np.random.seed(42)
 
@@ -34,6 +35,7 @@
 X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
 X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
 
+# #############################################################################
 # Lasso
 from sklearn.linear_model import Lasso
 
@@ -45,6 +47,7 @@
 print(lasso)
 print("r^2 on test data : %f" % r2_score_lasso)
 
+# #############################################################################
 # ElasticNet
 from sklearn.linear_model import ElasticNet
 
diff --git a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
index bc1d9d2b561c2..c54f81d1b8bcd 100644
--- a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
+++ b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
@@ -17,6 +17,7 @@
 from sklearn.linear_model import Lasso
 
 
+# #############################################################################
 # The two Lasso implementations on Dense data
 print("--- Dense matrices")
 
@@ -38,6 +39,7 @@
 print("Distance between coefficients : %s"
       % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_))
 
+# #############################################################################
 # The two Lasso implementations on Sparse data
 print("--- Sparse matrices")
 
diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py
index 220811259f4ac..66a1ab9bd0254 100644
--- a/examples/linear_model/plot_logistic_path.py
+++ b/examples/linear_model/plot_logistic_path.py
@@ -29,6 +29,7 @@
 
 X -= np.mean(X, 0)
 
+# #############################################################################
 # Demo path functions
 
 cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)
diff --git a/examples/linear_model/plot_multi_task_lasso_support.py b/examples/linear_model/plot_multi_task_lasso_support.py
index 58f315b401770..c7a9536383bc2 100644
--- a/examples/linear_model/plot_multi_task_lasso_support.py
+++ b/examples/linear_model/plot_multi_task_lasso_support.py
@@ -39,6 +39,7 @@
 coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
 coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_
 
+# #############################################################################
 # Plot support and time series
 fig = plt.figure(figsize=(8, 5))
 plt.subplot(1, 2, 1)
diff --git a/examples/linear_model/plot_ols_3d.py b/examples/linear_model/plot_ols_3d.py
index e9af33303ad00..d8b0f2b52aa22 100644
--- a/examples/linear_model/plot_ols_3d.py
+++ b/examples/linear_model/plot_ols_3d.py
@@ -37,6 +37,7 @@
 ols.fit(X_train, y_train)
 
 
+# #############################################################################
 # Plot the figure
 def plot_figs(fig_num, elev, azim, X_train, clf):
     fig = plt.figure(fig_num, figsize=(4, 3))
diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py
index 6164932996b08..b16212cbd3718 100644
--- a/examples/linear_model/plot_ridge_path.py
+++ b/examples/linear_model/plot_ridge_path.py
@@ -39,6 +39,7 @@
 X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
 y = np.ones(10)
 
+# #############################################################################
 # Compute paths
 
 n_alphas = 200
@@ -50,6 +51,7 @@
     ridge.fit(X, y)
     coefs.append(ridge.coef_)
 
+# #############################################################################
 # Display results
 
 ax = plt.gca()
diff --git a/examples/linear_model/plot_theilsen.py b/examples/linear_model/plot_theilsen.py
index e87b64d3c12c0..c80b4a409937b 100644
--- a/examples/linear_model/plot_theilsen.py
+++ b/examples/linear_model/plot_theilsen.py
@@ -51,6 +51,7 @@
 colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', 'RANSAC': 'lightgreen'}
 lw = 2
 
+# #############################################################################
 # Outliers only in the y direction
 
 np.random.seed(0)
@@ -79,6 +80,7 @@
 plt.legend(loc='upper left')
 plt.title("Corrupt y")
 
+# #############################################################################
 # Outliers in the X direction
 
 np.random.seed(0)
diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index 8622b1b021e52..bc26ca0719265 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -67,6 +67,7 @@
                     format='%(asctime)s %(levelname)s %(message)s')
 
 
+# #############################################################################
 # Load some categories from the training set
 categories = [
     'alt.atheism',
@@ -83,6 +84,7 @@
 print("%d categories" % len(data.target_names))
 print()
 
+# #############################################################################
 # Define a pipeline combining a text feature extractor with a simple
 # classifier
 pipeline = Pipeline([
diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 6ecae65877fce..eb4664049ee77 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -40,6 +40,7 @@
 from sklearn.metrics import roc_curve, auc
 from sklearn.model_selection import StratifiedKFold
 
+# #############################################################################
 # Data IO and generation
 
 # Import some data to play with
@@ -53,6 +54,7 @@
 random_state = np.random.RandomState(0)
 X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
 
+# #############################################################################
 # Classification and ROC analysis
 
 # Run classifier with cross-validation and plot ROC curves
diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py
index 26e64f79660d9..4a1654d228f0f 100644
--- a/examples/model_selection/plot_train_error_vs_test_error.py
+++ b/examples/model_selection/plot_train_error_vs_test_error.py
@@ -19,6 +19,7 @@
 import numpy as np
 from sklearn import linear_model
 
+# #############################################################################
 # Generate sample data
 n_samples_train, n_samples_test, n_features = 75, 150, 500
 np.random.seed(0)
@@ -31,6 +32,7 @@
 X_train, X_test = X[:n_samples_train], X[n_samples_train:]
 y_train, y_test = y[:n_samples_train], y[n_samples_train:]
 
+# #############################################################################
 # Compute train and test errors
 alphas = np.logspace(-5, 1, 60)
 enet = linear_model.ElasticNet(l1_ratio=0.7)
@@ -50,6 +52,7 @@
 enet.set_params(alpha=alpha_optim)
 coef_ = enet.fit(X, y).coef_
 
+# #############################################################################
 # Plot results functions
 
 import matplotlib.pyplot as plt
diff --git a/examples/neighbors/plot_regression.py b/examples/neighbors/plot_regression.py
index 89c730ec877a8..28c593ceeaf34 100644
--- a/examples/neighbors/plot_regression.py
+++ b/examples/neighbors/plot_regression.py
@@ -16,6 +16,7 @@
 # License: BSD 3 clause (C) INRIA
 
 
+# #############################################################################
 # Generate sample data
 import numpy as np
 import matplotlib.pyplot as plt
@@ -29,6 +30,7 @@
 # Add noise to targets
 y[::5] += 1 * (0.5 - np.random.rand(8))
 
+# #############################################################################
 # Fit regression model
 n_neighbors = 5
 
diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py
index 0fd6a075353cd..aa75ccc06d1f1 100644
--- a/examples/neural_networks/plot_rbm_logistic_classification.py
+++ b/examples/neural_networks/plot_rbm_logistic_classification.py
@@ -42,6 +42,7 @@
 from sklearn.pipeline import Pipeline
 
 
+# #############################################################################
 # Setting up
 
 def nudge_dataset(X, Y):
@@ -90,6 +91,7 @@ def nudge_dataset(X, Y):
 
 classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
 
+# #############################################################################
 # Training
 
 # Hyper-parameters. These were set by cross-validation,
@@ -109,6 +111,7 @@ def nudge_dataset(X, Y):
 logistic_classifier = linear_model.LogisticRegression(C=100.0)
 logistic_classifier.fit(X_train, Y_train)
 
+# #############################################################################
 # Evaluation
 
 print()
@@ -122,6 +125,7 @@ def nudge_dataset(X, Y):
         Y_test,
         logistic_classifier.predict(X_test))))
 
+# #############################################################################
 # Plotting
 
 plt.figure(figsize=(4.2, 4))
diff --git a/examples/plot_isotonic_regression.py b/examples/plot_isotonic_regression.py
index bac13ac414903..fd076b5afad62 100644
--- a/examples/plot_isotonic_regression.py
+++ b/examples/plot_isotonic_regression.py
@@ -30,6 +30,7 @@
 rs = check_random_state(0)
 y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
 
+# #############################################################################
 # Fit IsotonicRegression and LinearRegression models
 
 ir = IsotonicRegression()
@@ -39,6 +40,7 @@
 lr = LinearRegression()
 lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression
 
+# #############################################################################
 # Plot result
 
 segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
diff --git a/examples/plot_kernel_ridge_regression.py b/examples/plot_kernel_ridge_regression.py
index 6ad422227bb21..cb91908ed5f89 100644
--- a/examples/plot_kernel_ridge_regression.py
+++ b/examples/plot_kernel_ridge_regression.py
@@ -48,6 +48,7 @@
 
 rng = np.random.RandomState(0)
 
+# #############################################################################
 # Generate sample data
 X = 5 * rng.rand(10000, 1)
 y = np.sin(X).ravel()
@@ -57,6 +58,7 @@
 
 X_plot = np.linspace(0, 5, 100000)[:, None]
 
+# #############################################################################
 # Fit regression model
 train_size = 100
 svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,
@@ -95,6 +97,7 @@
       % (X_plot.shape[0], kr_predict))
 
 
+# #############################################################################
 # Look at the results
 sv_ind = svr.best_estimator_.support_
 plt.scatter(X[sv_ind], y[sv_ind], c='r', s=50, label='SVR support vectors',
diff --git a/examples/semi_supervised/plot_label_propagation_digits.py b/examples/semi_supervised/plot_label_propagation_digits.py
index 1590bf0bf9120..6b15fc21629bd 100644
--- a/examples/semi_supervised/plot_label_propagation_digits.py
+++ b/examples/semi_supervised/plot_label_propagation_digits.py
@@ -45,10 +45,12 @@ class will be very good.
 
 unlabeled_set = indices[n_labeled_points:]
 
+# #############################################################################
 # Shuffle everything around
 y_train = np.copy(y)
 y_train[unlabeled_set] = -1
 
+# #############################################################################
 # Learn with LabelSpreading
 lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
 lp_model.fit(X, y_train)
@@ -65,12 +67,15 @@ class will be very good.
 print("Confusion matrix")
 print(cm)
 
+# #############################################################################
 # Calculate uncertainty values for each transduced distribution
 pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
 
+# #############################################################################
 # Pick the top 10 most uncertain labels
 uncertainty_index = np.argsort(pred_entropies)[-10:]
 
+# #############################################################################
 # Plot
 f = plt.figure(figsize=(7, 5))
 for index, image_index in enumerate(uncertainty_index):
diff --git a/examples/semi_supervised/plot_label_propagation_structure.py b/examples/semi_supervised/plot_label_propagation_structure.py
index 8a1fa6e24c172..7cc15d73f1b89 100644
--- a/examples/semi_supervised/plot_label_propagation_structure.py
+++ b/examples/semi_supervised/plot_label_propagation_structure.py
@@ -28,10 +28,12 @@
 labels[0] = outer
 labels[-1] = inner
 
+# #############################################################################
 # Learn with LabelSpreading
 label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=1.0)
 label_spread.fit(X, labels)
 
+# #############################################################################
 # Plot output labels
 output_labels = label_spread.transduction_
 plt.figure(figsize=(8.5, 4))
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index 045f7cf245b99..3a909b2b422bf 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -91,6 +91,7 @@ def __call__(self, value, clip=None):
         x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
         return np.ma.masked_array(np.interp(value, x, y))
 
+# #############################################################################
 # Load and prepare data set
 #
 # dataset for grid search
@@ -117,6 +118,7 @@ def __call__(self, value, clip=None):
 X = scaler.fit_transform(X)
 X_2d = scaler.fit_transform(X_2d)
 
+# #############################################################################
 # Train classifiers
 #
 # For an initial search, a logarithmic grid with basis
@@ -145,6 +147,7 @@ def __call__(self, value, clip=None):
         clf.fit(X_2d, y_2d)
         classifiers.append((C, gamma, clf))
 
+# #############################################################################
 # Visualization
 #
 # draw visualization of parameter effects
diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py
index d8b54cd306c1d..e223730eb82bf 100644
--- a/examples/svm/plot_svm_anova.py
+++ b/examples/svm/plot_svm_anova.py
@@ -14,6 +14,7 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
 
+# #############################################################################
 # Import some data to play with
 digits = datasets.load_digits()
 y = digits.target
@@ -25,6 +26,7 @@
 # add 200 non-informative features
 X = np.hstack((X, 2 * np.random.random((n_samples, 200))))
 
+# #############################################################################
 # Create a feature-selection transform and an instance of SVM that we
 # combine together to have an full-blown estimator
 
@@ -32,6 +34,7 @@
 
 clf = Pipeline([('anova', transform), ('svc', svm.SVC(C=1.0))])
 
+# #############################################################################
 # Plot the cross-validation score as a function of percentile of features
 score_means = list()
 score_stds = list()
diff --git a/examples/svm/plot_svm_regression.py b/examples/svm/plot_svm_regression.py
index 0093bbcea2a05..e46675eb0e069 100644
--- a/examples/svm/plot_svm_regression.py
+++ b/examples/svm/plot_svm_regression.py
@@ -12,13 +12,16 @@
 from sklearn.svm import SVR
 import matplotlib.pyplot as plt
 
+# #############################################################################
 # Generate sample data
 X = np.sort(5 * np.random.rand(40, 1), axis=0)
 y = np.sin(X).ravel()
 
+# #############################################################################
 # Add noise to targets
 y[::5] += 3 * (0.5 - np.random.rand(8))
 
+# #############################################################################
 # Fit regression model
 svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
 svr_lin = SVR(kernel='linear', C=1e3)
@@ -27,6 +30,7 @@
 y_lin = svr_lin.fit(X, y).predict(X)
 y_poly = svr_poly.fit(X, y).predict(X)
 
+# #############################################################################
 # Look at the results
 lw = 2
 plt.scatter(X, y, color='darkorange', label='data')
diff --git a/examples/text/document_classification_20newsgroups.py b/examples/text/document_classification_20newsgroups.py
index 250aa8429ec82..4781d28043e21 100644
--- a/examples/text/document_classification_20newsgroups.py
+++ b/examples/text/document_classification_20newsgroups.py
@@ -100,6 +100,7 @@ def is_interactive():
 print()
 
 
+# #############################################################################
 # Load some categories from the training set
 if opts.all_categories:
     categories = None
@@ -200,6 +201,7 @@ def trim(s):
     return s if len(s) <= 80 else s[:77] + "..."
 
 
+# #############################################################################
 # Benchmark classifiers
 def benchmark(clf):
     print('_' * 80)
diff --git a/examples/text/document_clustering.py b/examples/text/document_clustering.py
index ba7a9a8a1daf1..58e0e25a89cff 100644
--- a/examples/text/document_clustering.py
+++ b/examples/text/document_clustering.py
@@ -114,6 +114,7 @@ def is_interactive():
     sys.exit(1)
 
 
+# #############################################################################
 # Load some categories from the training set
 categories = [
     'alt.atheism',
@@ -182,6 +183,7 @@ def is_interactive():
     print()
 
 
+# #############################################################################
 # Do the actual clustering
 
 if opts.minibatch: