DOC updated to notebook style for grid_search_text_feature_extraction.py by brendo-k · Pull Request #22558 · scikit-learn/scikit-learn

Merged · 7 commits · Mar 12, 2022
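For context: scikit-learn's notebook-style examples are plain Python scripts that sphinx-gallery splits into cells at `# %%` markers, rendering the comment block under each marker as reStructuredText. A minimal sketch of the convention this PR adopts (the section title and text here are illustrative, not from the diff):

```python
# %%
# Section title
# -------------
#
# This comment block is rendered as prose in the HTML gallery.

print("code under the marker runs as one notebook cell")
```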
61 changes: 32 additions & 29 deletions examples/model_selection/grid_search_text_feature_extraction.py
@@ -45,6 +45,11 @@
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause

# %%
# Data loading
# ------------

from pprint import pprint
from time import time
import logging
@@ -59,13 +64,12 @@
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")


# #############################################################################
# Load some categories from the training set
categories = [
"alt.atheism",
"talk.religion.misc",
]

# Uncomment the following to do the analysis on all the categories
# categories = None
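The context collapsed under the next hunk header performs the actual fetch that produces `data`. A minimal sketch of that call, assuming the standard 20 newsgroups loader the surrounding prints refer to:

```python
from sklearn.datasets import fetch_20newsgroups

# categories as defined above; categories=None would load all 20 groups
data = fetch_20newsgroups(subset="train", categories=categories)
```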

@@ -77,9 +81,11 @@
print("%d categories" % len(data.target_names))
print()

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
# %%
# Pipeline with hyperparameter tuning
# -----------------------------------

# Define a pipeline combining a text feature extractor with a simple classifier
pipeline = Pipeline(
[
("vect", CountVectorizer()),
@@ -88,8 +94,9 @@
]
)
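The hunk header above collapses the middle of the pipeline. A sketch of the full estimator chain, inferred from the step names the script prints later (`vect` is visible; `tfidf` and `clf` are assumed, matching the historical version of this example):

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),    # raw text -> token counts
        ("tfidf", TfidfTransformer()),  # counts -> TF-IDF weights
        ("clf", SGDClassifier()),       # linear classifier trained with SGD
    ]
)
```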

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
# Parameters to use for grid search. Uncommenting more parameters will give
# better exploring power but will increase processing time in a combinatorial
# way
parameters = {
"vect__max_df": (0.5, 0.75, 1.0),
# 'vect__max_features': (None, 5000, 10000, 50000),
@@ -102,25 +109,21 @@
# 'clf__max_iter': (10, 50, 80),
}
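Since each uncommented entry multiplies the number of candidates, it can be worth counting the grid before fitting. A quick sketch using `ParameterGrid` (only `vect__max_df` is active in the visible part of the dict, so the count here is a lower bound on what the full file may define):

```python
from sklearn.model_selection import ParameterGrid

# Number of parameter combinations GridSearchCV will evaluate per CV split.
print(len(ParameterGrid(parameters)))  # 3 with only max_df uncommented
```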

if __name__ == "__main__":
# multiprocessing requires the fork to happen in a __main__ protected
# block

# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data.data, data.target)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
# Find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data.data, data.target)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))