From aa8f15cee09e072ecab47ffef350b557a25c2db0 Mon Sep 17 00:00:00 2001
From: Brenden <brenden.kadota@gmail.com>
Date: Sun, 20 Feb 2022 12:35:59 -0500
Subject: [PATCH 1/6] updated to notebook style for
 grid_search_text_feature_extraction.py

---
 .../grid_search_text_feature_extraction.py    | 24 ++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index dc2618473ecb1..d2ff1235c2f12 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -45,6 +45,9 @@
 #         Peter Prettenhofer <peter.prettenhofer@gmail.com>
 #         Mathieu Blondel <mathieu@mblondel.org>
 # License: BSD 3 clause
+# %%
+# Load categories from the training set
+# -------------------------------------
 from pprint import pprint
 from time import time
 import logging
@@ -59,9 +62,6 @@
 # Display progress logs on stdout
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 
-
-# #############################################################################
-# Load some categories from the training set
 categories = [
     "alt.atheism",
     "talk.religion.misc",
@@ -77,7 +77,9 @@
 print("%d categories" % len(data.target_names))
 print()
 
-# #############################################################################
+# %%
+# Build Pipeline
+# --------------
 # Define a pipeline combining a text feature extractor with a simple
 # classifier
 pipeline = Pipeline(
@@ -88,8 +90,12 @@
     ]
 )
 
-# uncommenting more parameters will give better exploring power but will
-# increase processing time in a combinatorial way
+
+# %%
+# Grid Search
+# -----------
+# Parameters to use for grid search. Uncommenting more parameters will give
+# better exploring power but will increase processing time in a combinatorial way
 parameters = {
     "vect__max_df": (0.5, 0.75, 1.0),
     # 'vect__max_features': (None, 5000, 10000, 50000),
@@ -102,12 +108,14 @@
     # 'clf__max_iter': (10, 50, 80),
 }
 
+
+# %%
+# Find the best parameters for both the feature extraction and the
+# classifier
 if __name__ == "__main__":
     # multiprocessing requires the fork to happen in a __main__ protected
     # block
 
-    # find the best parameters for both the feature extraction and the
-    # classifier
     grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
 
     print("Performing grid search...")

From ac2cf8ca08c241fcc9200c8bfd08704af21376e7 Mon Sep 17 00:00:00 2001
From: Brenden <brenden.kadota@gmail.com>
Date: Sun, 20 Feb 2022 12:42:00 -0500
Subject: [PATCH 2/6] removed empty lines

---
 examples/model_selection/grid_search_text_feature_extraction.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index d2ff1235c2f12..299c52d14973d 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -90,7 +90,6 @@
     ]
 )
 
-
 # %%
 # Grid Search
 # -----------
@@ -108,7 +107,6 @@
     # 'clf__max_iter': (10, 50, 80),
 }
 
-
 # %%
 # Find the best parameters for both the feature extraction and the
 # classifier

From b82cbaecd33f64564600526e3412072b82ee211b Mon Sep 17 00:00:00 2001
From: Brenden Kadota <brenden.kadota@gmail.com>
Date: Sun, 27 Feb 2022 13:09:34 -0500
Subject: [PATCH 3/6] Update
 examples/model_selection/grid_search_text_feature_extraction.py

Rename the "Build Pipeline" section header to "Tuning a pipeline".

Co-authored-by: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
---
 .../model_selection/grid_search_text_feature_extraction.py    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index 299c52d14973d..aa569b1ee58a4 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -78,8 +78,8 @@
 print()
 
 # %%
-# Build Pipeline
-# --------------
+# Tuning a pipeline
+# -----------------
 # Define a pipeline combining a text feature extractor with a simple
 # classifier
 pipeline = Pipeline(

From 0f20954700bbc36b4f28293cde9864872da184dc Mon Sep 17 00:00:00 2001
From: Brenden <brenden.kadota@gmail.com>
Date: Sun, 27 Feb 2022 13:24:30 -0500
Subject: [PATCH 4/6] Remove the "Grid Search" heading so the parameter grid
 falls under the "Tuning a pipeline" section

---
 examples/model_selection/grid_search_text_feature_extraction.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index aa569b1ee58a4..a02a291f28516 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -91,8 +91,6 @@
 )
 
 # %%
-# Grid Search
-# -----------
 # Parameters to use for grid search. Uncommenting more parameters will give
 # better exploring power but will increase processing time in a combinatorial way
 parameters = {

From 2e4b81eeddcda71246705fd36a35d56d0daf3cac Mon Sep 17 00:00:00 2001
From: Brenden <brenden.kadota@gmail.com>
Date: Fri, 11 Mar 2022 21:19:35 -0500
Subject: [PATCH 5/6] Remove the `if __name__ == "__main__":` guard

---
 .../grid_search_text_feature_extraction.py    | 39 +++++++++----------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index a02a291f28516..d3e7572c52dc1 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -108,23 +108,22 @@
 # %%
 # Find the best parameters for both the feature extraction and the
 # classifier
-if __name__ == "__main__":
-    # multiprocessing requires the fork to happen in a __main__ protected
-    # block
-
-    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
-
-    print("Performing grid search...")
-    print("pipeline:", [name for name, _ in pipeline.steps])
-    print("parameters:")
-    pprint(parameters)
-    t0 = time()
-    grid_search.fit(data.data, data.target)
-    print("done in %0.3fs" % (time() - t0))
-    print()
-
-    print("Best score: %0.3f" % grid_search.best_score_)
-    print("Best parameters set:")
-    best_parameters = grid_search.best_estimator_.get_params()
-    for param_name in sorted(parameters.keys()):
-        print("\t%s: %r" % (param_name, best_parameters[param_name]))
+# multiprocessing requires the fork to happen in a __main__ protected
+# block
+
+grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
+
+print("Performing grid search...")
+print("pipeline:", [name for name, _ in pipeline.steps])
+print("parameters:")
+pprint(parameters)
+t0 = time()
+grid_search.fit(data.data, data.target)
+print("done in %0.3fs" % (time() - t0))
+print()
+
+print("Best score: %0.3f" % grid_search.best_score_)
+print("Best parameters set:")
+best_parameters = grid_search.best_estimator_.get_params()
+for param_name in sorted(parameters.keys()):
+    print("\t%s: %r" % (param_name, best_parameters[param_name]))

From d9f5f4d330dab2db488efc9c7bf18bdd4db5adc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Sat, 12 Mar 2022 10:03:32 +0100
Subject: [PATCH 6/6] Tweak section titles, comments and cell boundaries

---
 .../grid_search_text_feature_extraction.py    | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index d3e7572c52dc1..91801b361265b 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -45,9 +45,11 @@
 #         Peter Prettenhofer <peter.prettenhofer@gmail.com>
 #         Mathieu Blondel <mathieu@mblondel.org>
 # License: BSD 3 clause
+
 # %%
-# Load categories from the training set
-# -------------------------------------
+# Data loading
+# ------------
+
 from pprint import pprint
 from time import time
 import logging
@@ -62,10 +64,12 @@
 # Display progress logs on stdout
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 
+# Load some categories from the training set
 categories = [
     "alt.atheism",
     "talk.religion.misc",
 ]
+
 # Uncomment the following to do the analysis on all the categories
 # categories = None
 
@@ -78,10 +82,10 @@
 print()
 
 # %%
-# Tuning a pipeline
-# -----------------
-# Define a pipeline combining a text feature extractor with a simple
-# classifier
+# Pipeline with hyperparameter tuning
+# -----------------------------------
+
+# Define a pipeline combining a text feature extractor with a simple classifier
 pipeline = Pipeline(
     [
         ("vect", CountVectorizer()),
@@ -90,9 +94,9 @@
     ]
 )
 
-# %%
 # Parameters to use for grid search. Uncommenting more parameters will give
-# better exploring power but will increase processing time in a combinatorial way
+# better exploring power but will increase processing time in a combinatorial
+# way
 parameters = {
     "vect__max_df": (0.5, 0.75, 1.0),
     # 'vect__max_features': (None, 5000, 10000, 50000),
@@ -105,12 +109,8 @@
     # 'clf__max_iter': (10, 50, 80),
 }
 
-# %%
 # Find the best parameters for both the feature extraction and the
 # classifier
-# multiprocessing requires the fork to happen in a __main__ protected
-# block
-
 grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
 
 print("Performing grid search...")