From aa8f15cee09e072ecab47ffef350b557a25c2db0 Mon Sep 17 00:00:00 2001 From: Brenden Date: Sun, 20 Feb 2022 12:35:59 -0500 Subject: [PATCH 1/6] updated to notebook style for grid_search_text_feature_extraction.py --- .../grid_search_text_feature_extraction.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py index dc2618473ecb1..d2ff1235c2f12 100644 --- a/examples/model_selection/grid_search_text_feature_extraction.py +++ b/examples/model_selection/grid_search_text_feature_extraction.py @@ -45,6 +45,9 @@ # Peter Prettenhofer # Mathieu Blondel # License: BSD 3 clause +# %% +# Load categories from the training set +# ------------------------------------- from pprint import pprint from time import time import logging @@ -59,9 +62,6 @@ # Display progress logs on stdout logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") - -# ############################################################################# -# Load some categories from the training set categories = [ "alt.atheism", "talk.religion.misc", @@ -77,7 +77,9 @@ print("%d categories" % len(data.target_names)) print() -# ############################################################################# +# %% +# Build Pipeline +# -------------- # Define a pipeline combining a text feature extractor with a simple # classifier pipeline = Pipeline( @@ -88,8 +90,12 @@ ] ) -# uncommenting more parameters will give better exploring power but will -# increase processing time in a combinatorial way + +# %% +# Grid Search +# ----------- +# Parameters to use for grid search. Uncommenting more parameters will give +# better exploring power but will increase processing time in a combinatorial way parameters = { "vect__max_df": (0.5, 0.75, 1.0), # 'vect__max_features': (None, 5000, 10000, 50000), @@ -102,12 +108,14 @@ # 'clf__max_iter': (10, 50, 80), } + +# %% +# Find the best parameters for both the feature extraction and the +# classifier if __name__ == "__main__": # multiprocessing requires the fork to happen in a __main__ protected # block - # find the best parameters for both the feature extraction and the - # classifier grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1) print("Performing grid search...") From ac2cf8ca08c241fcc9200c8bfd08704af21376e7 Mon Sep 17 00:00:00 2001 From: Brenden Date: Sun, 20 Feb 2022 12:42:00 -0500 Subject: [PATCH 2/6] removed empty lines --- examples/model_selection/grid_search_text_feature_extraction.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py index d2ff1235c2f12..299c52d14973d 100644 --- a/examples/model_selection/grid_search_text_feature_extraction.py +++ b/examples/model_selection/grid_search_text_feature_extraction.py @@ -90,7 +90,6 @@ ] ) - # %% # Grid Search # ----------- @@ -108,7 +107,6 @@ # 'clf__max_iter': (10, 50, 80), } - # %% # Find the best parameters for both the feature extraction and the # classifier From b82cbaecd33f64564600526e3412072b82ee211b Mon Sep 17 00:00:00 2001 From: Brenden Kadota Date: Sun, 27 Feb 2022 13:09:34 -0500 Subject: [PATCH 3/6] Update examples/model_selection/grid_search_text_feature_extraction.py Changed header Co-authored-by: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> --- .../model_selection/grid_search_text_feature_extraction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py index 299c52d14973d..aa569b1ee58a4 100644 --- a/examples/model_selection/grid_search_text_feature_extraction.py +++ b/examples/model_selection/grid_search_text_feature_extraction.py @@ -78,8 +78,8 @@ print() # %% -# Build Pipeline -# -------------- +# Tuning a pipeline +# ----------------- # Define a pipeline combining a text feature extractor with a simple # classifier pipeline = Pipeline( From 0f20954700bbc36b4f28293cde9864872da184dc Mon Sep 17 00:00:00 2001 From: Brenden Date: Sun, 27 Feb 2022 13:24:30 -0500 Subject: [PATCH 4/6] removed grid search block and combined it with tuning a model block --- examples/model_selection/grid_search_text_feature_extraction.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py index aa569b1ee58a4..a02a291f28516 100644 --- a/examples/model_selection/grid_search_text_feature_extraction.py +++ b/examples/model_selection/grid_search_text_feature_extraction.py @@ -91,8 +91,6 @@ ) # %% -# Grid Search -# ----------- # Parameters to use for grid search. Uncommenting more parameters will give # better exploring power but will increase processing time in a combinatorial way parameters = { From 2e4b81eeddcda71246705fd36a35d56d0daf3cac Mon Sep 17 00:00:00 2001 From: Brenden Date: Fri, 11 Mar 2022 21:19:35 -0500 Subject: [PATCH 5/6] removed main --- .../grid_search_text_feature_extraction.py | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py index a02a291f28516..d3e7572c52dc1 100644 --- a/examples/model_selection/grid_search_text_feature_extraction.py +++ b/examples/model_selection/grid_search_text_feature_extraction.py @@ -108,23 +108,22 @@ # %% # Find the best parameters for both the feature extraction and the # classifier -if __name__ == "__main__": - # multiprocessing requires the fork to happen in a __main__ protected - # block - - grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1) - - print("Performing grid search...") - print("pipeline:", [name for name, _ in pipeline.steps]) - print("parameters:") - pprint(parameters) - t0 = time() - grid_search.fit(data.data, data.target) - print("done in %0.3fs" % (time() - t0)) - print() - - print("Best score: %0.3f" % grid_search.best_score_) - print("Best parameters set:") - best_parameters = grid_search.best_estimator_.get_params() - for param_name in sorted(parameters.keys()): - print("\t%s: %r" % (param_name, best_parameters[param_name])) +# multiprocessing requires the fork to happen in a __main__ protected +# block + +grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1) + +print("Performing grid search...") +print("pipeline:", [name for name, _ in pipeline.steps]) +print("parameters:") +pprint(parameters) +t0 = time() +grid_search.fit(data.data, data.target) +print("done in %0.3fs" % (time() - t0)) +print() + +print("Best score: %0.3f" % grid_search.best_score_) +print("Best parameters set:") +best_parameters = grid_search.best_estimator_.get_params() +for param_name in sorted(parameters.keys()): + print("\t%s: %r" % (param_name, best_parameters[param_name])) From d9f5f4d330dab2db488efc9c7bf18bdd4db5adc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Sat, 12 Mar 2022 10:03:32 +0100 Subject: [PATCH 6/6] tweaks --- .../grid_search_text_feature_extraction.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py index d3e7572c52dc1..91801b361265b 100644 --- a/examples/model_selection/grid_search_text_feature_extraction.py +++ b/examples/model_selection/grid_search_text_feature_extraction.py @@ -45,9 +45,11 @@ # Peter Prettenhofer # Mathieu Blondel # License: BSD 3 clause + # %% -# Load categories from the training set -# ------------------------------------- +# Data loading +# ------------ + from pprint import pprint from time import time import logging @@ -62,10 +64,12 @@ # Display progress logs on stdout logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +# Load some categories from the training set categories = [ "alt.atheism", "talk.religion.misc", ] + # Uncomment the following to do the analysis on all the categories # categories = None @@ -78,10 +82,10 @@ print() # %% -# Tuning a pipeline -# ----------------- -# Define a pipeline combining a text feature extractor with a simple -# classifier +# Pipeline with hyperparameter tuning +# ----------------------------------- + +# Define a pipeline combining a text feature extractor with a simple classifier pipeline = Pipeline( [ ("vect", CountVectorizer()), @@ -90,9 +94,9 @@ ] ) -# %% # Parameters to use for grid search. Uncommenting more parameters will give -# better exploring power but will increase processing time in a combinatorial way +# better exploring power but will increase processing time in a combinatorial +# way parameters = { "vect__max_df": (0.5, 0.75, 1.0), # 'vect__max_features': (None, 5000, 10000, 50000), @@ -105,12 +109,8 @@ # 'clf__max_iter': (10, 50, 80), } -# %% # Find the best parameters for both the feature extraction and the # classifier -# multiprocessing requires the fork to happen in a __main__ protected -# block - grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1) print("Performing grid search...")