diff --git a/examples/applications/plot_face_recognition.py b/examples/applications/plot_face_recognition.py
index 039af7ea2feb6..13a38d13bc00c 100644
--- a/examples/applications/plot_face_recognition.py
+++ b/examples/applications/plot_face_recognition.py
@@ -48,7 +48,7 @@
 logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
 
 
-###############################################################################
+# #############################################################################
 # Download the data, if not already on disk and load it as numpy arrays
 
 lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
@@ -72,7 +72,7 @@
 print("n_classes: %d" % n_classes)
 
 
-###############################################################################
+# #############################################################################
 # Split into a training set and a test set using a stratified k fold
 
 # split into a training and testing set
@@ -80,7 +80,7 @@
     X, y, test_size=0.25, random_state=42)
 
 
-###############################################################################
+# #############################################################################
 # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
 # dataset): unsupervised feature extraction / dimensionality reduction
 n_components = 150
@@ -101,7 +101,7 @@
 print("done in %0.3fs" % (time() - t0))
 
 
-###############################################################################
+# #############################################################################
 # Train a SVM classification model
 
 print("Fitting the classifier to the training set")
@@ -115,7 +115,7 @@
 print(clf.best_estimator_)
 
 
-###############################################################################
+# #############################################################################
 # Quantitative evaluation of the model quality on the test set
 
 print("Predicting people's names on the test set")
@@ -127,7 +127,7 @@
 print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
 
 
-###############################################################################
+# #############################################################################
 # Qualitative evaluation of the predictions using matplotlib
 
 def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
diff --git a/examples/applications/plot_model_complexity_influence.py b/examples/applications/plot_model_complexity_influence.py
index 90fd5c718e78f..359711b995b14 100644
--- a/examples/applications/plot_model_complexity_influence.py
+++ b/examples/applications/plot_model_complexity_influence.py
@@ -34,11 +34,11 @@
 from sklearn.linear_model.stochastic_gradient import SGDClassifier
 from sklearn.metrics import hamming_loss
 
-###############################################################################
+# #############################################################################
 # Routines
 
 
-# initialize random generator
+# Initialize random generator
 np.random.seed(0)
 
 
@@ -122,8 +122,8 @@ def _count_nonzero_coefficients(estimator):
     a = estimator.coef_.toarray()
     return np.count_nonzero(a)
 
-###############################################################################
-# main code
+# #############################################################################
+# Main code
 regression_data = generate_data('regression')
 classification_data = generate_data('classification', sparse=True)
 configurations = [
diff --git a/examples/applications/plot_prediction_latency.py b/examples/applications/plot_prediction_latency.py
index a375c1cc8f3c3..71321b4d39d6e 100644
--- a/examples/applications/plot_prediction_latency.py
+++ b/examples/applications/plot_prediction_latency.py
@@ -266,12 +266,13 @@ def plot_benchmark_throughput(throughputs, configuration):
     plt.show()
 
 
-###############################################################################
-# main code
+# #############################################################################
+# Main code
 
 start_time = time.time()
 
-# benchmark bulk/atomic prediction speed for various regressors
+# #############################################################################
+# Benchmark bulk/atomic prediction speed for various regressors
 configuration = {
     'n_train': int(1e3),
     'n_test': int(1e2),
diff --git a/examples/applications/plot_stock_market.py b/examples/applications/plot_stock_market.py
index cd1745bb1825f..c7d627e8148ef 100644
--- a/examples/applications/plot_stock_market.py
+++ b/examples/applications/plot_stock_market.py
@@ -74,7 +74,7 @@
 from sklearn import cluster, covariance, manifold
 
 
-###############################################################################
+# #############################################################################
 # Retrieve the data from Internet
 
 def quotes_historical_google(symbol, date1, date2):
@@ -189,7 +189,7 @@ def quotes_historical_google(symbol, date1, date2):
 variation = close_prices - open_prices
 
 
-###############################################################################
+# #############################################################################
 # Learn a graphical structure from the correlations
 edge_model = covariance.GraphLassoCV()
 
@@ -199,7 +199,7 @@ def quotes_historical_google(symbol, date1, date2):
 X /= X.std(axis=0)
 edge_model.fit(X)
 
-###############################################################################
+# #############################################################################
 # Cluster using affinity propagation
 
 _, labels = cluster.affinity_propagation(edge_model.covariance_)
@@ -208,7 +208,7 @@ def quotes_historical_google(symbol, date1, date2):
 for i in range(n_labels + 1):
     print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
 
-###############################################################################
+# #############################################################################
 # Find a low-dimension embedding for visualization: find the best position of
 # the nodes (the stocks) on a 2D plane
 
@@ -220,7 +220,7 @@ def quotes_historical_google(symbol, date1, date2):
 
 embedding = node_position_model.fit_transform(X.T).T
 
-###############################################################################
+# #############################################################################
 # Visualization
 plt.figure(1, facecolor='w', figsize=(10, 8))
 plt.clf()
diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py
index d60121e8ece31..175c10594440e 100644
--- a/examples/applications/wikipedia_principal_eigenvector.py
+++ b/examples/applications/wikipedia_principal_eigenvector.py
@@ -52,7 +52,7 @@
 
 print(__doc__)
 
-###############################################################################
+# #############################################################################
 # Where to download the data, if not already on disk
 redirects_url = "http://downloads.dbpedia.org/3.5.1/en/redirects_en.nt.bz2"
 redirects_filename = redirects_url.rsplit("/", 1)[1]
@@ -73,7 +73,7 @@
         print()
 
 
-###############################################################################
+# #############################################################################
 # Loading the redirect files
 
 memory = Memory(cachedir=".")
diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py
index b38b25812bb7f..c6e3c0111b708 100644
--- a/examples/calibration/plot_calibration.py
+++ b/examples/calibration/plot_calibration.py
@@ -83,7 +83,7 @@
 clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid, sw_test)
 print("With sigmoid calibration: %1.3f" % clf_sigmoid_score)
 
-###############################################################################
+# #############################################################################
 # Plot the data and the predicted probabilities
 plt.figure()
 y_unique = np.unique(y)
diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py
index 2e914696fc177..d935bce4f5bc2 100644
--- a/examples/calibration/plot_compare_calibration.py
+++ b/examples/calibration/plot_compare_calibration.py
@@ -81,7 +81,7 @@
 rfc = RandomForestClassifier(n_estimators=100)
 
 
-###############################################################################
+# #############################################################################
 # Plot calibration plots
 
 plt.figure(figsize=(10, 10))
diff --git a/examples/classification/plot_lda_qda.py b/examples/classification/plot_lda_qda.py
index a668e7cc0db0c..c76ffc1f2c11e 100644
--- a/examples/classification/plot_lda_qda.py
+++ b/examples/classification/plot_lda_qda.py
@@ -20,8 +20,8 @@ class has its own standard deviation with QDA.
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 
-###############################################################################
-# colormap
+# #############################################################################
+# Colormap
 cmap = colors.LinearSegmentedColormap(
     'red_blue_classes',
     {'red': [(0, 1, 1), (1, 0.7, 0.7)],
@@ -30,8 +30,8 @@ class has its own standard deviation with QDA.
 plt.cm.register_cmap(cmap=cmap)
 
 
-###############################################################################
-# generate datasets
+# #############################################################################
+# Generate datasets
 def dataset_fixed_cov():
     '''Generate 2 Gaussians samples with the same covariance matrix'''
     n, dim = 300, 2
@@ -54,8 +54,8 @@ def dataset_cov():
     return X, y
 
 
-###############################################################################
-# plot functions
+# #############################################################################
+# Plot functions
 def plot_data(lda, X, y, y_pred, fig_index):
     splot = plt.subplot(2, 2, fig_index)
     if fig_index == 1:
@@ -132,7 +132,6 @@ def plot_qda_cov(qda, splot):
     plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red')
     plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue')
 
-###############################################################################
 for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
     # Linear Discriminant Analysis
     lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py
index 0d6c395a4e4bf..2c8fc3acc3936 100644
--- a/examples/cluster/plot_affinity_propagation.py
+++ b/examples/cluster/plot_affinity_propagation.py
@@ -14,13 +14,13 @@
 from sklearn import metrics
 from sklearn.datasets.samples_generator import make_blobs
 
-##############################################################################
+# #############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,
                             random_state=0)
 
-##############################################################################
+# #############################################################################
 # Compute Affinity Propagation
 af = AffinityPropagation(preference=-50).fit(X)
 cluster_centers_indices = af.cluster_centers_indices_
@@ -39,7 +39,7 @@
 print("Silhouette Coefficient: %0.3f"
       % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
 
-##############################################################################
+# #############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 from itertools import cycle
diff --git a/examples/cluster/plot_dbscan.py b/examples/cluster/plot_dbscan.py
index a12b3d39128b6..8b116ed2cfbb0 100644
--- a/examples/cluster/plot_dbscan.py
+++ b/examples/cluster/plot_dbscan.py
@@ -17,7 +17,7 @@
 from sklearn.preprocessing import StandardScaler
 
 
-##############################################################################
+# #############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
@@ -25,7 +25,7 @@
 
 X = StandardScaler().fit_transform(X)
 
-##############################################################################
+# #############################################################################
 # Compute DBSCAN
 db = DBSCAN(eps=0.3, min_samples=10).fit(X)
 core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
@@ -46,7 +46,7 @@
 print("Silhouette Coefficient: %0.3f"
       % metrics.silhouette_score(X, labels))
 
-##############################################################################
+# #############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 
diff --git a/examples/cluster/plot_dict_face_patches.py b/examples/cluster/plot_dict_face_patches.py
index 7fb125ed735e9..ac2fde3e2cc6a 100644
--- a/examples/cluster/plot_dict_face_patches.py
+++ b/examples/cluster/plot_dict_face_patches.py
@@ -32,7 +32,7 @@
 
 faces = datasets.fetch_olivetti_faces()
 
-###############################################################################
+# #############################################################################
 # Learn the dictionary of images
 
 print('Learning the dictionary... ')
@@ -66,7 +66,7 @@
 dt = time.time() - t0
 print('done in %.2fs.' % dt)
 
-###############################################################################
+# #############################################################################
 # Plot the results
 plt.figure(figsize=(4.2, 4))
 for i, patch in enumerate(kmeans.cluster_centers_):
diff --git a/examples/cluster/plot_face_ward_segmentation.py b/examples/cluster/plot_face_ward_segmentation.py
index 687d87ce7f429..1490b6a110388 100644
--- a/examples/cluster/plot_face_ward_segmentation.py
+++ b/examples/cluster/plot_face_ward_segmentation.py
@@ -25,7 +25,7 @@
 from sklearn.cluster import AgglomerativeClustering
 
 
-###############################################################################
+# #############################################################################
 # Generate data
 try:  # SciPy >= 0.16 have face in misc
     from scipy.misc import face
@@ -38,11 +38,11 @@
 
 X = np.reshape(face, (-1, 1))
 
-###############################################################################
+# #############################################################################
 # Define the structure A of the data. Pixels connected to their neighbors.
 connectivity = grid_to_graph(*face.shape)
 
-###############################################################################
+# #############################################################################
 # Compute clustering
 print("Compute structured hierarchical clustering...")
 st = time.time()
@@ -55,7 +55,7 @@
 print("Number of pixels: ", label.size)
 print("Number of clusters: ", np.unique(label).size)
 
-###############################################################################
+# #############################################################################
 # Plot the results on an image
 plt.figure(figsize=(5, 5))
 plt.imshow(face, cmap=plt.cm.gray)
diff --git a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
index ca3eb2a0035be..0801899f70349 100644
--- a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
+++ b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
@@ -34,7 +34,7 @@
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
 
-###############################################################################
+# #############################################################################
 # Generate data
 n_samples = 200
 size = 40  # image size
@@ -58,7 +58,7 @@
 noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
 y += noise_coef * noise  # add noise
 
-###############################################################################
+# #############################################################################
 # Compute the coefs of a Bayesian Ridge with GridSearch
 cv = KFold(2)  # cross-validation generator for model selection
 ridge = BayesianRidge()
@@ -88,7 +88,7 @@
 coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
 coef_selection_ = coef_.reshape(size, size)
 
-###############################################################################
+# #############################################################################
 # Inverse the transformation to plot the results on an image
 plt.close('all')
 plt.figure(figsize=(7.3, 2.7))
diff --git a/examples/cluster/plot_kmeans_digits.py b/examples/cluster/plot_kmeans_digits.py
index 1e6fbbc019923..f38eb8b4be416 100644
--- a/examples/cluster/plot_kmeans_digits.py
+++ b/examples/cluster/plot_kmeans_digits.py
@@ -84,7 +84,7 @@ def bench_k_means(estimator, name, data):
               data=data)
 print(82 * '_')
 
-###############################################################################
+# #############################################################################
 # Visualize the results on PCA-reduced data
 
 reduced_data = PCA(n_components=2).fit_transform(data)
diff --git a/examples/cluster/plot_mean_shift.py b/examples/cluster/plot_mean_shift.py
index 775cd98e59527..730c820c48345 100644
--- a/examples/cluster/plot_mean_shift.py
+++ b/examples/cluster/plot_mean_shift.py
@@ -16,12 +16,12 @@
 from sklearn.cluster import MeanShift, estimate_bandwidth
 from sklearn.datasets.samples_generator import make_blobs
 
-###############################################################################
+# #############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, _ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)
 
-###############################################################################
+# #############################################################################
 # Compute clustering with MeanShift
 
 # The following bandwidth can be automatically detected using
@@ -37,7 +37,7 @@
 
 print("number of estimated clusters : %d" % n_clusters_)
 
-###############################################################################
+# #############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 from itertools import cycle
diff --git a/examples/cluster/plot_mini_batch_kmeans.py b/examples/cluster/plot_mini_batch_kmeans.py
index 56d999c6c846d..9f84566a3c3a7 100644
--- a/examples/cluster/plot_mini_batch_kmeans.py
+++ b/examples/cluster/plot_mini_batch_kmeans.py
@@ -23,7 +23,7 @@
 from sklearn.metrics.pairwise import pairwise_distances_argmin
 from sklearn.datasets.samples_generator import make_blobs
 
-##############################################################################
+# #############################################################################
 # Generate sample data
 np.random.seed(0)
 
@@ -32,7 +32,7 @@
 n_clusters = len(centers)
 X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)
 
-##############################################################################
+# #############################################################################
 # Compute clustering with Means
 
 k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
@@ -40,7 +40,7 @@
 k_means.fit(X)
 t_batch = time.time() - t0
 
-##############################################################################
+# #############################################################################
 # Compute clustering with MiniBatchKMeans
 
 mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
@@ -49,7 +49,7 @@
 mbk.fit(X)
 t_mini_batch = time.time() - t0
 
-##############################################################################
+# #############################################################################
 # Plot result
 
 fig = plt.figure(figsize=(8, 3))
diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py
index 96f007400e492..aa66c811eda8d 100644
--- a/examples/cluster/plot_segmentation_toy.py
+++ b/examples/cluster/plot_segmentation_toy.py
@@ -36,7 +36,6 @@
 from sklearn.feature_extraction import image
 from sklearn.cluster import spectral_clustering
 
-###############################################################################
 l = 100
 x, y = np.indices((l, l))
 
@@ -52,7 +51,7 @@
 circle3 = (x - center3[0]) ** 2 + (y - center3[1]) ** 2 < radius3 ** 2
 circle4 = (x - center4[0]) ** 2 + (y - center4[1]) ** 2 < radius4 ** 2
 
-###############################################################################
+# #############################################################################
 # 4 circles
 img = circle1 + circle2 + circle3 + circle4
 
@@ -81,7 +80,7 @@
 plt.matshow(img)
 plt.matshow(label_im)
 
-###############################################################################
+# #############################################################################
 # 2 circles
 img = circle1 + circle2
 mask = img.astype(bool)
diff --git a/examples/cluster/plot_ward_structured_vs_unstructured.py b/examples/cluster/plot_ward_structured_vs_unstructured.py
index 2471f68a6f8ed..fa804d1e50335 100644
--- a/examples/cluster/plot_ward_structured_vs_unstructured.py
+++ b/examples/cluster/plot_ward_structured_vs_unstructured.py
@@ -33,7 +33,7 @@
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.datasets.samples_generator import make_swiss_roll
 
-###############################################################################
+# #############################################################################
 # Generate data (swiss roll dataset)
 n_samples = 1500
 noise = 0.05
@@ -41,7 +41,7 @@
 # Make it thinner
 X[:, 1] *= .5
 
-###############################################################################
+# #############################################################################
 # Compute clustering
 print("Compute unstructured hierarchical clustering...")
 st = time.time()
@@ -51,7 +51,7 @@
 print("Elapsed time: %.2fs" % elapsed_time)
 print("Number of points: %i" % label.size)
 
-###############################################################################
+# #############################################################################
 # Plot result
 fig = plt.figure()
 ax = p3.Axes3D(fig)
@@ -62,12 +62,12 @@
 plt.title('Without connectivity constraints (time %.2fs)' % elapsed_time)
 
 
-###############################################################################
+# #############################################################################
 # Define the structure A of the data. Here a 10 nearest neighbors
 from sklearn.neighbors import kneighbors_graph
 connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
 
-###############################################################################
+# #############################################################################
 # Compute clustering
 print("Compute structured hierarchical clustering...")
 st = time.time()
@@ -78,7 +78,7 @@
 print("Elapsed time: %.2fs" % elapsed_time)
 print("Number of points: %i" % label.size)
 
-###############################################################################
+# #############################################################################
 # Plot result
 fig = plt.figure()
 ax = p3.Axes3D(fig)
diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py
index 96f637974ee29..adb57f003cfbb 100644
--- a/examples/covariance/plot_covariance_estimation.py
+++ b/examples/covariance/plot_covariance_estimation.py
@@ -52,7 +52,7 @@
 from sklearn.model_selection import GridSearchCV
 
 
-###############################################################################
+# #############################################################################
 # Generate sample data
 n_features, n_samples = 40, 20
 np.random.seed(42)
@@ -64,7 +64,7 @@
 X_train = np.dot(base_X_train, coloring_matrix)
 X_test = np.dot(base_X_test, coloring_matrix)
 
-###############################################################################
+# #############################################################################
 # Compute the likelihood on test data
 
 # spanning a range of possible shrinkage coefficient values
@@ -78,7 +78,7 @@
 emp_cov = empirical_covariance(X_train)
 loglik_real = -log_likelihood(emp_cov, linalg.inv(real_cov))
 
-###############################################################################
+# #############################################################################
 # Compare different approaches to setting the parameter
 
 # GridSearch for an optimal shrinkage coefficient
@@ -94,7 +94,7 @@
 oa = OAS()
 loglik_oa = oa.fit(X_train).score(X_test)
 
-###############################################################################
+# #############################################################################
 # Plot results
 fig = plt.figure()
 plt.title("Regularized covariance: likelihood and shrinkage coefficient")
diff --git a/examples/covariance/plot_mahalanobis_distances.py b/examples/covariance/plot_mahalanobis_distances.py
index 53329aa71b80f..21f295ce58305 100644
--- a/examples/covariance/plot_mahalanobis_distances.py
+++ b/examples/covariance/plot_mahalanobis_distances.py
@@ -78,7 +78,7 @@
 # compare estimators learnt from the full data set with true parameters
 emp_cov = EmpiricalCovariance().fit(X)
 
-###############################################################################
+# #############################################################################
 # Display results
 fig = plt.figure()
 plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
diff --git a/examples/covariance/plot_sparse_cov.py b/examples/covariance/plot_sparse_cov.py
index d9b7f0808fd75..1d6782cb43ef8 100644
--- a/examples/covariance/plot_sparse_cov.py
+++ b/examples/covariance/plot_sparse_cov.py
@@ -59,7 +59,7 @@
 from sklearn.covariance import GraphLassoCV, ledoit_wolf
 import matplotlib.pyplot as plt
 
-##############################################################################
+# #############################################################################
 # Generate the data
 n_samples = 60
 n_features = 20
@@ -79,7 +79,7 @@
 X -= X.mean(axis=0)
 X /= X.std(axis=0)
 
-##############################################################################
+# #############################################################################
 # Estimate the covariance
 emp_cov = np.dot(X.T, X) / n_samples
 
@@ -91,7 +91,7 @@
 lw_cov_, _ = ledoit_wolf(X)
 lw_prec_ = linalg.inv(lw_cov_)
 
-##############################################################################
+# #############################################################################
 # Plot the results
 plt.figure(figsize=(10, 6))
 plt.subplots_adjust(left=0.02, right=0.98)
diff --git a/examples/cross_decomposition/plot_compare_cross_decomposition.py b/examples/cross_decomposition/plot_compare_cross_decomposition.py
index 437c08b056479..4a123c04b03a4 100644
--- a/examples/cross_decomposition/plot_compare_cross_decomposition.py
+++ b/examples/cross_decomposition/plot_compare_cross_decomposition.py
@@ -24,7 +24,7 @@
 import matplotlib.pyplot as plt
 from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA
 
-###############################################################################
+# #############################################################################
 # Dataset based latent variables model
 
 n = 500
@@ -46,7 +46,7 @@
 print("Corr(Y)")
 print(np.round(np.corrcoef(Y.T), 2))
 
-###############################################################################
+# #############################################################################
 # Canonical (symmetric) PLS
 
 # Transform data
@@ -106,7 +106,7 @@
 plt.yticks(())
 plt.show()
 
-###############################################################################
+# #############################################################################
 # PLS regression, with multivariate response, a.k.a. PLS2
 
 n = 1000
@@ -126,7 +126,6 @@
 print(np.round(pls2.coef_, 1))
 pls2.predict(X)
 
-###############################################################################
 # PLS regression, with univariate response, a.k.a. PLS1
 
 n = 1000
@@ -139,7 +138,7 @@
 print("Estimated betas")
 print(np.round(pls1.coef_, 1))
 
-###############################################################################
+# #############################################################################
 # CCA (PLS mode B with symmetric deflation)
 
 cca = CCA(n_components=2)
diff --git a/examples/decomposition/plot_faces_decomposition.py b/examples/decomposition/plot_faces_decomposition.py
index fce02751a1b0c..d29af6ad408fb 100644
--- a/examples/decomposition/plot_faces_decomposition.py
+++ b/examples/decomposition/plot_faces_decomposition.py
@@ -32,7 +32,7 @@
 image_shape = (64, 64)
 rng = RandomState(0)
 
-###############################################################################
+# #############################################################################
 # Load faces data
 dataset = fetch_olivetti_faces(shuffle=True, random_state=rng)
 faces = dataset.data
@@ -48,7 +48,6 @@
 print("Dataset consists of %d faces" % n_samples)
 
 
-###############################################################################
 def plot_gallery(title, images, n_col=n_col, n_row=n_row):
     plt.figure(figsize=(2. * n_col, 2.26 * n_row))
     plt.suptitle(title, size=16)
@@ -62,7 +61,7 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row):
         plt.yticks(())
     plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)
 
-###############################################################################
+# #############################################################################
 # List of the different estimators, whether to center and transpose the
 # problem, and whether the transformer uses the clustering API.
 estimators = [
@@ -102,12 +101,12 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row):
 ]
 
 
-###############################################################################
+# #############################################################################
 # Plot a sample of the input data
 
 plot_gallery("First centered Olivetti faces", faces_centered[:n_components])
 
-###############################################################################
+# #############################################################################
 # Do the estimation and plot it
 
 for name, estimator, center in estimators:
diff --git a/examples/decomposition/plot_ica_blind_source_separation.py b/examples/decomposition/plot_ica_blind_source_separation.py
index 9ba5a1523a3c6..fb7689064dd06 100644
--- a/examples/decomposition/plot_ica_blind_source_separation.py
+++ b/examples/decomposition/plot_ica_blind_source_separation.py
@@ -21,7 +21,7 @@
 
 from sklearn.decomposition import FastICA, PCA
 
-###############################################################################
+# #############################################################################
 # Generate sample data
 np.random.seed(0)
 n_samples = 2000
@@ -51,7 +51,7 @@
 pca = PCA(n_components=3)
 H = pca.fit_transform(X)  # Reconstruct signals based on orthogonal components
 
-###############################################################################
+# #############################################################################
 # Plot results
 
 plt.figure()
diff --git a/examples/decomposition/plot_ica_vs_pca.py b/examples/decomposition/plot_ica_vs_pca.py
index 54655e519257a..f9ef968babeb1 100644
--- a/examples/decomposition/plot_ica_vs_pca.py
+++ b/examples/decomposition/plot_ica_vs_pca.py
@@ -37,7 +37,7 @@
 
 from sklearn.decomposition import PCA, FastICA
 
-###############################################################################
+# #############################################################################
 # Generate sample data
 rng = np.random.RandomState(42)
 S = rng.standard_t(1.5, size=(20000, 2))
@@ -57,7 +57,7 @@
 S_ica_ /= S_ica_.std(axis=0)
 
 
-###############################################################################
+# #############################################################################
 # Plot results
 
 def plot_samples(S, axis_list=None):
diff --git a/examples/decomposition/plot_image_denoising.py b/examples/decomposition/plot_image_denoising.py
index 29bdf6ba65217..33a394a856c91 100644
--- a/examples/decomposition/plot_image_denoising.py
+++ b/examples/decomposition/plot_image_denoising.py
@@ -44,7 +44,6 @@
 from sklearn.feature_extraction.image import reconstruct_from_patches_2d
 
 
-###############################################################################
 try:  # SciPy >= 0.16 have face in misc
     from scipy.misc import face
     face = face(gray=True)
@@ -75,7 +74,7 @@
 data /= np.std(data, axis=0)
 print('done in %.2fs.' % (time() - t0))
 
-###############################################################################
+# #############################################################################
 # Learn the dictionary from reference patches
 
 print('Learning the dictionary...')
@@ -98,7 +97,7 @@
 plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
 
 
-###############################################################################
+# #############################################################################
 # Display the distorted image
 
 def show_with_diff(image, reference, title):
@@ -123,7 +122,7 @@ def show_with_diff(image, reference, title):
 
 show_with_diff(distorted, face, 'Distorted image')
 
-###############################################################################
+# #############################################################################
 # Extract noisy patches and reconstruct them using the dictionary
 
 print('Extracting noisy patches... ')
diff --git a/examples/decomposition/plot_pca_3d.py b/examples/decomposition/plot_pca_3d.py
index f26d5d9d1c9bb..d9db17ffaec39 100644
--- a/examples/decomposition/plot_pca_3d.py
+++ b/examples/decomposition/plot_pca_3d.py
@@ -26,7 +26,7 @@
 from scipy import stats
 
 
-###############################################################################
+# #############################################################################
 # Create the data
 
 e = np.exp(1)
@@ -55,7 +55,7 @@ def pdf(x):
 b /= norm
 
 
-###############################################################################
+# #############################################################################
 # Plot the figures
 def plot_figs(fig_num, elev, azim):
     fig = plt.figure(fig_num, figsize=(4, 3))
diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py
index 7944f327e3645..b858434d910e3 100644
--- a/examples/decomposition/plot_pca_vs_fa_model_selection.py
+++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py
@@ -39,7 +39,7 @@
 
 print(__doc__)
 
-###############################################################################
+# #############################################################################
 # Create the data
 
 n_samples, n_features, rank = 1000, 50, 10
@@ -55,7 +55,7 @@
 sigmas = sigma * rng.rand(n_features) + sigma / 2.
 X_hetero = X + rng.randn(n_samples, n_features) * sigmas
 
-###############################################################################
+# #############################################################################
 # Fit the models
 
 n_components = np.arange(0, n_features, 5)  # options for n_components
diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py
index 0437fd924ef1d..9285f8dae0eea 100644
--- a/examples/ensemble/plot_gradient_boosting_regression.py
+++ b/examples/ensemble/plot_gradient_boosting_regression.py
@@ -22,7 +22,7 @@
 from sklearn.utils import shuffle
 from sklearn.metrics import mean_squared_error
 
-###############################################################################
+# #############################################################################
 # Load data
 boston = datasets.load_boston()
 X, y = shuffle(boston.data, boston.target, random_state=13)
@@ -31,7 +31,7 @@
 X_train, y_train = X[:offset], y[:offset]
 X_test, y_test = X[offset:], y[offset:]
 
-###############################################################################
+# #############################################################################
 # Fit regression model
 params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
           'learning_rate': 0.01, 'loss': 'ls'}
@@ -41,7 +41,7 @@
 mse = mean_squared_error(y_test, clf.predict(X_test))
 print("MSE: %.4f" % mse)
 
-###############################################################################
+# #############################################################################
 # Plot training deviance
 
 # compute test set deviance
@@ -61,7 +61,7 @@
 plt.xlabel('Boosting Iterations')
 plt.ylabel('Deviance')
 
-###############################################################################
+# #############################################################################
 # Plot feature importance
 feature_importance = clf.feature_importances_
 # make importances relative to max importance
diff --git a/examples/exercises/plot_cv_diabetes.py b/examples/exercises/plot_cv_diabetes.py
index 6f3736d3c255b..76b0d81b8998c 100644
--- a/examples/exercises/plot_cv_diabetes.py
+++ b/examples/exercises/plot_cv_diabetes.py
@@ -52,7 +52,7 @@
 plt.axhline(np.max(scores), linestyle='--', color='.5')
 plt.xlim([alphas[0], alphas[-1]])
 
-##############################################################################
+# #############################################################################
 # Bonus: how much can you trust the selection of alpha?
 
 # To answer this question we use the LassoCV object that sets its alpha
diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py
index 5d123985a01bb..59ed716660341 100644
--- a/examples/feature_selection/plot_feature_selection.py
+++ b/examples/feature_selection/plot_feature_selection.py
@@ -27,8 +27,8 @@
 from sklearn import datasets, svm
 from sklearn.feature_selection import SelectPercentile, f_classif
 
-###############################################################################
-# import some data to play with
+# #############################################################################
+# Import some data to play with
 
 # The iris dataset
 iris = datasets.load_iris()
@@ -40,13 +40,12 @@
 X = np.hstack((iris.data, E))
 y = iris.target
 
-###############################################################################
 plt.figure(1)
 plt.clf()
 
 X_indices = np.arange(X.shape[-1])
 
-###############################################################################
+# #############################################################################
 # Univariate feature selection with F-test for feature scoring
 # We use the default selection function: the 10% most significant features
 selector = SelectPercentile(f_classif, percentile=10)
@@ -57,7 +56,7 @@
         label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',
         edgecolor='black')
 
-###############################################################################
+# #############################################################################
 # Compare to the weights of an SVM
 clf = svm.SVC(kernel='linear')
 clf.fit(X, y)
diff --git a/examples/feature_selection/plot_permutation_test_for_classification.py b/examples/feature_selection/plot_permutation_test_for_classification.py
index 8cadbfa91ad09..095f743d40803 100644
--- a/examples/feature_selection/plot_permutation_test_for_classification.py
+++ b/examples/feature_selection/plot_permutation_test_for_classification.py
@@ -25,7 +25,7 @@
 from sklearn import datasets
 
 
-##############################################################################
+# #############################################################################
 # Loading a dataset
 iris = datasets.load_iris()
 X = iris.data
@@ -47,7 +47,7 @@
 
 print("Classification score %s (pvalue : %s)" % (score, pvalue))
 
-###############################################################################
+# #############################################################################
 # View histogram of permutation scores
 plt.hist(permutation_scores, 20, label='Permutation scores',
          edgecolor='black')
diff --git a/examples/linear_model/plot_ard.py b/examples/linear_model/plot_ard.py
index 76d34d3150a5e..38c334a217df5 100644
--- a/examples/linear_model/plot_ard.py
+++ b/examples/linear_model/plot_ard.py
@@ -30,7 +30,7 @@
 
 from sklearn.linear_model import ARDRegression, LinearRegression
 
-###############################################################################
+# #############################################################################
 # Generating simulated data with Gaussian weights
 
 # Parameters of the example
@@ -51,7 +51,7 @@
 # Create the target
 y = np.dot(X, w) + noise
 
-###############################################################################
+# #############################################################################
 # Fit the ARD Regression
 clf = ARDRegression(compute_score=True)
 clf.fit(X, y)
@@ -59,7 +59,7 @@
 ols = LinearRegression()
 ols.fit(X, y)
 
-###############################################################################
+# #############################################################################
 # Plot the true weights, the estimated weights, the histogram of the
 # weights, and predictions with standard deviations
 plt.figure(figsize=(6, 5))
diff --git a/examples/linear_model/plot_bayesian_ridge.py b/examples/linear_model/plot_bayesian_ridge.py
index 0dbc854cf2ee2..4359c421ea866 100644
--- a/examples/linear_model/plot_bayesian_ridge.py
+++ b/examples/linear_model/plot_bayesian_ridge.py
@@ -30,7 +30,7 @@
 
 from sklearn.linear_model import BayesianRidge, LinearRegression
 
-###############################################################################
+# #############################################################################
 # Generating simulated data with Gaussian weights
 np.random.seed(0)
 n_samples, n_features = 100, 100
@@ -48,7 +48,7 @@
 # Create the target
 y = np.dot(X, w) + noise
 
-###############################################################################
+# #############################################################################
 # Fit the Bayesian Ridge Regression and an OLS for comparison
 clf = BayesianRidge(compute_score=True)
 clf.fit(X, y)
@@ -56,7 +56,7 @@
 ols = LinearRegression()
 ols.fit(X, y)
 
-###############################################################################
+# #############################################################################
 # Plot true weights, estimated weights, histogram of the weights, and
 # predictions with standard deviations
 lw = 2
diff --git a/examples/linear_model/plot_lasso_and_elasticnet.py b/examples/linear_model/plot_lasso_and_elasticnet.py
index ca2d2425f9f5d..350cac0a0ad95 100644
--- a/examples/linear_model/plot_lasso_and_elasticnet.py
+++ b/examples/linear_model/plot_lasso_and_elasticnet.py
@@ -15,8 +15,8 @@
 
 from sklearn.metrics import r2_score
 
-###############################################################################
-# generate some sparse data to play with
+# #############################################################################
+# Generate some sparse data to play with
 np.random.seed(42)
 
 n_samples, n_features = 50, 200
@@ -35,7 +35,7 @@
 X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
 X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
 
-###############################################################################
+# #############################################################################
 # Lasso
 from sklearn.linear_model import Lasso
 
@@ -47,7 +47,7 @@
 print(lasso)
 print("r^2 on test data : %f" % r2_score_lasso)
 
-###############################################################################
+# #############################################################################
 # ElasticNet
 from sklearn.linear_model import ElasticNet
 
diff --git a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
index bc8df42a8490e..c54f81d1b8bcd 100644
--- a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
+++ b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
@@ -17,7 +17,7 @@
 from sklearn.linear_model import Lasso
 
 
-###############################################################################
+# #############################################################################
 # The two Lasso implementations on Dense data
 print("--- Dense matrices")
 
@@ -39,7 +39,7 @@
 print("Distance between coefficients : %s"
       % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_))
 
-###############################################################################
+# #############################################################################
 # The two Lasso implementations on Sparse data
 print("--- Sparse matrices")
 
diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py
index 245c6bd0492c7..6b58b55956162 100644
--- a/examples/linear_model/plot_lasso_model_selection.py
+++ b/examples/linear_model/plot_lasso_model_selection.py
@@ -64,7 +64,7 @@
 # normalize data as done by Lars to allow for comparison
 X /= np.sqrt(np.sum(X ** 2, axis=0))
 
-##############################################################################
+# #############################################################################
 # LassoLarsIC: least angle regression with BIC/AIC criterion
 
 model_bic = LassoLarsIC(criterion='bic')
@@ -96,7 +96,7 @@ def plot_ic_criterion(model, name, color):
 plt.title('Information-criterion for model selection (training time %.3fs)'
           % t_bic)
 
-##############################################################################
+# #############################################################################
 # LassoCV: coordinate descent
 
 # Compute paths
@@ -125,7 +125,7 @@ def plot_ic_criterion(model, name, color):
 plt.axis('tight')
 plt.ylim(ymin, ymax)
 
-##############################################################################
+# #############################################################################
 # LassoLarsCV: least angle regression
 
 # Compute paths
diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py
index d1b17948c78e0..66a1ab9bd0254 100644
--- a/examples/linear_model/plot_logistic_path.py
+++ b/examples/linear_model/plot_logistic_path.py
@@ -29,7 +29,7 @@
 
 X -= np.mean(X, 0)
 
-###############################################################################
+# #############################################################################
 # Demo path functions
 
 cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)
diff --git a/examples/linear_model/plot_multi_task_lasso_support.py b/examples/linear_model/plot_multi_task_lasso_support.py
index ea17d752f94a0..c7a9536383bc2 100644
--- a/examples/linear_model/plot_multi_task_lasso_support.py
+++ b/examples/linear_model/plot_multi_task_lasso_support.py
@@ -39,7 +39,7 @@
 coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
 coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_
 
-###############################################################################
+# #############################################################################
 # Plot support and time series
 fig = plt.figure(figsize=(8, 5))
 plt.subplot(1, 2, 1)
diff --git a/examples/linear_model/plot_ols_3d.py b/examples/linear_model/plot_ols_3d.py
index 23dfa01d60ecc..d8b0f2b52aa22 100644
--- a/examples/linear_model/plot_ols_3d.py
+++ b/examples/linear_model/plot_ols_3d.py
@@ -37,7 +37,7 @@
 ols.fit(X_train, y_train)
 
 
-###############################################################################
+# #############################################################################
 # Plot the figure
 def plot_figs(fig_num, elev, azim, X_train, clf):
     fig = plt.figure(fig_num, figsize=(4, 3))
diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py
index 1f2c475f78b7d..b16212cbd3718 100644
--- a/examples/linear_model/plot_ridge_path.py
+++ b/examples/linear_model/plot_ridge_path.py
@@ -39,7 +39,7 @@
 X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
 y = np.ones(10)
 
-###############################################################################
+# #############################################################################
 # Compute paths
 
 n_alphas = 200
@@ -51,7 +51,7 @@
     ridge.fit(X, y)
     coefs.append(ridge.coef_)
 
-###############################################################################
+# #############################################################################
 # Display results
 
 ax = plt.gca()
diff --git a/examples/linear_model/plot_theilsen.py b/examples/linear_model/plot_theilsen.py
index 747ac63e6a205..c80b4a409937b 100644
--- a/examples/linear_model/plot_theilsen.py
+++ b/examples/linear_model/plot_theilsen.py
@@ -51,7 +51,7 @@
 colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', 'RANSAC': 'lightgreen'}
 lw = 2
 
-##############################################################################
+# #############################################################################
 # Outliers only in the y direction
 
 np.random.seed(0)
@@ -80,7 +80,7 @@
 plt.legend(loc='upper left')
 plt.title("Corrupt y")
 
-##############################################################################
+# #############################################################################
 # Outliers in the X direction
 
 np.random.seed(0)
diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index daf82718d42e1..bc26ca0719265 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -67,7 +67,7 @@
                     format='%(asctime)s %(levelname)s %(message)s')
 
 
-###############################################################################
+# #############################################################################
 # Load some categories from the training set
 categories = [
     'alt.atheism',
@@ -84,8 +84,8 @@
 print("%d categories" % len(data.target_names))
 print()
 
-###############################################################################
-# define a pipeline combining a text feature extractor with a simple
+# #############################################################################
+# Define a pipeline combining a text feature extractor with a simple
 # classifier
 pipeline = Pipeline([
     ('vect', CountVectorizer()),
diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 366aa0acbee06..eb4664049ee77 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -40,10 +40,10 @@
 from sklearn.metrics import roc_curve, auc
 from sklearn.model_selection import StratifiedKFold
 
-###############################################################################
+# #############################################################################
 # Data IO and generation
 
-# import some data to play with
+# Import some data to play with
 iris = datasets.load_iris()
 X = iris.data
 y = iris.target
@@ -54,7 +54,7 @@
 random_state = np.random.RandomState(0)
 X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
 
-###############################################################################
+# #############################################################################
 # Classification and ROC analysis
 
 # Run classifier with cross-validation and plot ROC curves
diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py
index 9002a0a3a5f30..4a1654d228f0f 100644
--- a/examples/model_selection/plot_train_error_vs_test_error.py
+++ b/examples/model_selection/plot_train_error_vs_test_error.py
@@ -19,7 +19,7 @@
 import numpy as np
 from sklearn import linear_model
 
-###############################################################################
+# #############################################################################
 # Generate sample data
 n_samples_train, n_samples_test, n_features = 75, 150, 500
 np.random.seed(0)
@@ -32,7 +32,7 @@
 X_train, X_test = X[:n_samples_train], X[n_samples_train:]
 y_train, y_test = y[:n_samples_train], y[n_samples_train:]
 
-###############################################################################
+# #############################################################################
 # Compute train and test errors
 alphas = np.logspace(-5, 1, 60)
 enet = linear_model.ElasticNet(l1_ratio=0.7)
@@ -52,7 +52,7 @@
 enet.set_params(alpha=alpha_optim)
 coef_ = enet.fit(X, y).coef_
 
-###############################################################################
+# #############################################################################
 # Plot results functions
 
 import matplotlib.pyplot as plt
diff --git a/examples/neighbors/plot_regression.py b/examples/neighbors/plot_regression.py
index c664d7f173b0e..28c593ceeaf34 100644
--- a/examples/neighbors/plot_regression.py
+++ b/examples/neighbors/plot_regression.py
@@ -16,7 +16,7 @@
 # License: BSD 3 clause (C) INRIA
 
 
-###############################################################################
+# #############################################################################
 # Generate sample data
 import numpy as np
 import matplotlib.pyplot as plt
@@ -30,7 +30,7 @@
 # Add noise to targets
 y[::5] += 1 * (0.5 - np.random.rand(8))
 
-###############################################################################
+# #############################################################################
 # Fit regression model
 n_neighbors = 5
 
diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py
index 2b9b15fe3d966..aa75ccc06d1f1 100644
--- a/examples/neural_networks/plot_rbm_logistic_classification.py
+++ b/examples/neural_networks/plot_rbm_logistic_classification.py
@@ -42,7 +42,7 @@
 from sklearn.pipeline import Pipeline
 
 
-###############################################################################
+# #############################################################################
 # Setting up
 
 def nudge_dataset(X, Y):
@@ -91,7 +91,7 @@ def nudge_dataset(X, Y):
 
 classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
 
-###############################################################################
+# #############################################################################
 # Training
 
 # Hyper-parameters. These were set by cross-validation,
@@ -111,7 +111,7 @@ def nudge_dataset(X, Y):
 logistic_classifier = linear_model.LogisticRegression(C=100.0)
 logistic_classifier.fit(X_train, Y_train)
 
-###############################################################################
+# #############################################################################
 # Evaluation
 
 print()
@@ -125,7 +125,7 @@ def nudge_dataset(X, Y):
         Y_test,
         logistic_classifier.predict(X_test))))
 
-###############################################################################
+# #############################################################################
 # Plotting
 
 plt.figure(figsize=(4.2, 4))
diff --git a/examples/plot_isotonic_regression.py b/examples/plot_isotonic_regression.py
index 4ae207ccedcfd..fd076b5afad62 100644
--- a/examples/plot_isotonic_regression.py
+++ b/examples/plot_isotonic_regression.py
@@ -30,7 +30,7 @@
 rs = check_random_state(0)
 y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
 
-###############################################################################
+# #############################################################################
 # Fit IsotonicRegression and LinearRegression models
 
 ir = IsotonicRegression()
@@ -40,8 +40,8 @@
 lr = LinearRegression()
 lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression
 
-###############################################################################
-# plot result
+# #############################################################################
+# Plot result
 
 segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
 lc = LineCollection(segments, zorder=0)
diff --git a/examples/plot_kernel_ridge_regression.py b/examples/plot_kernel_ridge_regression.py
index 85cd9990c1f68..cb91908ed5f89 100644
--- a/examples/plot_kernel_ridge_regression.py
+++ b/examples/plot_kernel_ridge_regression.py
@@ -48,7 +48,7 @@
 
 rng = np.random.RandomState(0)
 
-#############################################################################
+# #############################################################################
 # Generate sample data
 X = 5 * rng.rand(10000, 1)
 y = np.sin(X).ravel()
@@ -58,7 +58,7 @@
 
 X_plot = np.linspace(0, 5, 100000)[:, None]
 
-#############################################################################
+# #############################################################################
 # Fit regression model
 train_size = 100
 svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,
@@ -97,8 +97,8 @@
       % (X_plot.shape[0], kr_predict))
 
 
-#############################################################################
-# look at the results
+# #############################################################################
+# Look at the results
 sv_ind = svr.best_estimator_.support_
 plt.scatter(X[sv_ind], y[sv_ind], c='r', s=50, label='SVR support vectors',
             zorder=2, edgecolors=(0, 0, 0))
diff --git a/examples/semi_supervised/plot_label_propagation_digits.py b/examples/semi_supervised/plot_label_propagation_digits.py
index 72da021374ad9..6b15fc21629bd 100644
--- a/examples/semi_supervised/plot_label_propagation_digits.py
+++ b/examples/semi_supervised/plot_label_propagation_digits.py
@@ -45,11 +45,12 @@ class will be very good.
 
 unlabeled_set = indices[n_labeled_points:]
 
-# shuffle everything around
+# #############################################################################
+# Copy the labels and mark the unlabeled points with -1
 y_train = np.copy(y)
 y_train[unlabeled_set] = -1
 
-###############################################################################
+# #############################################################################
 # Learn with LabelSpreading
 lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
 lp_model.fit(X, y_train)
@@ -66,14 +67,16 @@ class will be very good.
 print("Confusion matrix")
 print(cm)
 
-# calculate uncertainty values for each transduced distribution
+# #############################################################################
+# Calculate uncertainty values for each transduced distribution
 pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
 
-# pick the top 10 most uncertain labels
+# #############################################################################
+# Pick the top 10 most uncertain labels
 uncertainty_index = np.argsort(pred_entropies)[-10:]
 
-###############################################################################
-# plot
+# #############################################################################
+# Plot
 f = plt.figure(figsize=(7, 5))
 for index, image_index in enumerate(uncertainty_index):
     image = images[image_index]
diff --git a/examples/semi_supervised/plot_label_propagation_structure.py b/examples/semi_supervised/plot_label_propagation_structure.py
index 2632247984b24..7cc15d73f1b89 100644
--- a/examples/semi_supervised/plot_label_propagation_structure.py
+++ b/examples/semi_supervised/plot_label_propagation_structure.py
@@ -28,12 +28,12 @@
 labels[0] = outer
 labels[-1] = inner
 
-###############################################################################
+# #############################################################################
 # Learn with LabelSpreading
 label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=1.0)
 label_spread.fit(X, labels)
 
-###############################################################################
+# #############################################################################
 # Plot output labels
 output_labels = label_spread.transduction_
 plt.figure(figsize=(8.5, 4))
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index 9bbca6683ce95..3a909b2b422bf 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -91,7 +91,7 @@ def __call__(self, value, clip=None):
         x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
         return np.ma.masked_array(np.interp(value, x, y))
 
-##############################################################################
+# #############################################################################
 # Load and prepare data set
 #
 # dataset for grid search
@@ -118,7 +118,7 @@ def __call__(self, value, clip=None):
 X = scaler.fit_transform(X)
 X_2d = scaler.fit_transform(X_2d)
 
-##############################################################################
+# #############################################################################
 # Train classifiers
 #
 # For an initial search, a logarithmic grid with basis
@@ -147,8 +147,8 @@ def __call__(self, value, clip=None):
         clf.fit(X_2d, y_2d)
         classifiers.append((C, gamma, clf))
 
-##############################################################################
-# visualization
+# #############################################################################
+# Visualization
 #
 # draw visualization of parameter effects
 
diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py
index 01938efd593ac..e223730eb82bf 100644
--- a/examples/svm/plot_svm_anova.py
+++ b/examples/svm/plot_svm_anova.py
@@ -14,7 +14,7 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
 
-###############################################################################
+# #############################################################################
 # Import some data to play with
 digits = datasets.load_digits()
 y = digits.target
@@ -26,7 +26,7 @@
 # add 200 non-informative features
 X = np.hstack((X, 2 * np.random.random((n_samples, 200))))
 
-###############################################################################
+# #############################################################################
 # Create a feature-selection transform and an instance of SVM that we
 # combine together to have an full-blown estimator
 
@@ -34,7 +34,7 @@
 
 clf = Pipeline([('anova', transform), ('svc', svm.SVC(C=1.0))])
 
-###############################################################################
+# #############################################################################
 # Plot the cross-validation score as a function of percentile of features
 score_means = list()
 score_stds = list()
diff --git a/examples/svm/plot_svm_regression.py b/examples/svm/plot_svm_regression.py
index 15a744e2aa8ca..e46675eb0e069 100644
--- a/examples/svm/plot_svm_regression.py
+++ b/examples/svm/plot_svm_regression.py
@@ -12,16 +12,16 @@
 from sklearn.svm import SVR
 import matplotlib.pyplot as plt
 
-###############################################################################
+# #############################################################################
 # Generate sample data
 X = np.sort(5 * np.random.rand(40, 1), axis=0)
 y = np.sin(X).ravel()
 
-###############################################################################
+# #############################################################################
 # Add noise to targets
 y[::5] += 3 * (0.5 - np.random.rand(8))
 
-###############################################################################
+# #############################################################################
 # Fit regression model
 svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
 svr_lin = SVR(kernel='linear', C=1e3)
@@ -30,8 +30,8 @@
 y_lin = svr_lin.fit(X, y).predict(X)
 y_poly = svr_poly.fit(X, y).predict(X)
 
-###############################################################################
-# look at the results
+# #############################################################################
+# Look at the results
 lw = 2
 plt.scatter(X, y, color='darkorange', label='data')
 plt.hold('on')
diff --git a/examples/text/document_classification_20newsgroups.py b/examples/text/document_classification_20newsgroups.py
index f34bbd10cbe55..4781d28043e21 100644
--- a/examples/text/document_classification_20newsgroups.py
+++ b/examples/text/document_classification_20newsgroups.py
@@ -100,7 +100,7 @@ def is_interactive():
 print()
 
 
-###############################################################################
+# #############################################################################
 # Load some categories from the training set
 if opts.all_categories:
     categories = None
@@ -201,7 +201,7 @@ def trim(s):
     return s if len(s) <= 80 else s[:77] + "..."
 
 
-###############################################################################
+# #############################################################################
 # Benchmark classifiers
 def benchmark(clf):
     print('_' * 80)
diff --git a/examples/text/document_clustering.py b/examples/text/document_clustering.py
index 29725cc7ccfb4..58e0e25a89cff 100644
--- a/examples/text/document_clustering.py
+++ b/examples/text/document_clustering.py
@@ -114,7 +114,7 @@ def is_interactive():
     sys.exit(1)
 
 
-###############################################################################
+# #############################################################################
 # Load some categories from the training set
 categories = [
     'alt.atheism',
@@ -183,7 +183,7 @@ def is_interactive():
     print()
 
 
-###############################################################################
+# #############################################################################
 # Do the actual clustering
 
 if opts.minibatch: