DOC updated to notebook style for grid_search_text_feature_extraction.py by brendo-k · Pull Request #22558 · scikit-learn/scikit-learn

Merged · 7 commits · Mar 12, 2022
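For context: scikit-learn's notebook-style examples are plain Python scripts that sphinx-gallery splits into cells at `# %%` markers, rendering the comment block under each marker as reStructuredText. A minimal sketch of the convention this PR adopts (the section title and text here are illustrative, not from the diff):

```python
# %%
# Section title
# -------------
#
# This comment block is rendered as prose in the HTML gallery.

print("code under the marker runs as one notebook cell")
```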
61 changes: 32 additions & 29 deletions examples/model_selection/grid_search_text_feature_extraction.py
@@ -45,6 +45,11 @@
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause

# %%
# Data loading
# ------------

from pprint import pprint
from time import time
import logging
@@ -59,13 +64,12 @@
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")


# #############################################################################
# Load some categories from the training set
categories = [
"alt.atheism",
"talk.religion.misc",
]

# Uncomment the following to do the analysis on all the categories
# categories = None
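The context collapsed under the next hunk header performs the actual fetch that produces `data`. A minimal sketch of that call, assuming the standard 20 newsgroups loader the surrounding prints refer to:

```python
from sklearn.datasets import fetch_20newsgroups

# categories as defined above; categories=None would load all 20 groups
data = fetch_20newsgroups(subset="train", categories=categories)
```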

@@ -77,9 +81,11 @@
print("%d categories" % len(data.target_names))
print()

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
# %%
# Pipeline with hyperparameter tuning
# -----------------------------------

# Define a pipeline combining a text feature extractor with a simple classifier
pipeline = Pipeline(
[
("vect", CountVectorizer()),
@@ -88,8 +94,9 @@
]
)
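The hunk header above collapses the middle of the pipeline. A sketch of the full estimator chain, inferred from the step names the script prints later (`vect` is visible; `tfidf` and `clf` are assumed, matching the historical version of this example):

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),    # raw text -> token counts
        ("tfidf", TfidfTransformer()),  # counts -> TF-IDF weights
        ("clf", SGDClassifier()),       # linear classifier trained with SGD
    ]
)
```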

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
# Parameters to use for grid search. Uncommenting more parameters will give
# better exploring power but will increase processing time in a combinatorial
# way
parameters = {
"vect__max_df": (0.5, 0.75, 1.0),
# 'vect__max_features': (None, 5000, 10000, 50000),
@@ -102,25 +109,21 @@
# 'clf__max_iter': (10, 50, 80),
}
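Since each uncommented entry multiplies the number of candidates, it can be worth counting the grid before fitting. A quick sketch using `ParameterGrid` (only `vect__max_df` is active in the visible part of the dict, so the count here is a lower bound on what the full file may define):

```python
from sklearn.model_selection import ParameterGrid

# Number of parameter combinations GridSearchCV will evaluate per CV split.
print(len(ParameterGrid(parameters)))  # 3 with only max_df uncommented
```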

if __name__ == "__main__":
# multiprocessing requires the fork to happen in a __main__ protected
# block

# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data.data, data.target)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
# Find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data.data, data.target)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))