diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index dc2618473ecb1..91801b361265b 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -45,6 +45,11 @@
 # Peter Prettenhofer
 # Mathieu Blondel
 # License: BSD 3 clause
+
+# %%
+# Data loading
+# ------------
+
 from pprint import pprint
 from time import time
 import logging
@@ -59,13 +64,12 @@
 
 # Display progress logs on stdout
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
-
-# #############################################################################
 # Load some categories from the training set
 categories = [
     "alt.atheism",
     "talk.religion.misc",
 ]
+
 # Uncomment the following to do the analysis on all the categories
 # categories = None
 
@@ -77,9 +81,11 @@
 print("%d categories" % len(data.target_names))
 print()
 
-# #############################################################################
-# Define a pipeline combining a text feature extractor with a simple
-# classifier
+# %%
+# Pipeline with hyperparameter tuning
+# -----------------------------------
+
+# Define a pipeline combining a text feature extractor with a simple classifier
 pipeline = Pipeline(
     [
         ("vect", CountVectorizer()),
@@ -88,8 +94,9 @@
     ]
 )
 
-# uncommenting more parameters will give better exploring power but will
-# increase processing time in a combinatorial way
+# Parameters to use for grid search. Uncommenting more parameters will give
+# better exploring power but will increase processing time in a combinatorial
+# way
 parameters = {
     "vect__max_df": (0.5, 0.75, 1.0),
     # 'vect__max_features': (None, 5000, 10000, 50000),
@@ -102,25 +109,21 @@
     # 'clf__max_iter': (10, 50, 80),
 }
 
-if __name__ == "__main__":
-    # multiprocessing requires the fork to happen in a __main__ protected
-    # block
-
-    # find the best parameters for both the feature extraction and the
-    # classifier
-    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
-
-    print("Performing grid search...")
-    print("pipeline:", [name for name, _ in pipeline.steps])
-    print("parameters:")
-    pprint(parameters)
-    t0 = time()
-    grid_search.fit(data.data, data.target)
-    print("done in %0.3fs" % (time() - t0))
-    print()
-
-    print("Best score: %0.3f" % grid_search.best_score_)
-    print("Best parameters set:")
-    best_parameters = grid_search.best_estimator_.get_params()
-    for param_name in sorted(parameters.keys()):
-        print("\t%s: %r" % (param_name, best_parameters[param_name]))
+# Find the best parameters for both the feature extraction and the
+# classifier
+grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
+
+print("Performing grid search...")
+print("pipeline:", [name for name, _ in pipeline.steps])
+print("parameters:")
+pprint(parameters)
+t0 = time()
+grid_search.fit(data.data, data.target)
+print("done in %0.3fs" % (time() - t0))
+print()
+
+print("Best score: %0.3f" % grid_search.best_score_)
+print("Best parameters set:")
+best_parameters = grid_search.best_estimator_.get_params()
+for param_name in sorted(parameters.keys()):
+    print("\t%s: %r" % (param_name, best_parameters[param_name]))