|
45 | 45 | # Peter Prettenhofer <peter.prettenhofer@gmail.com>
|
46 | 46 | # Mathieu Blondel <mathieu@mblondel.org>
|
47 | 47 | # License: BSD 3 clause
|
| 48 | + |
| 49 | +# %% |
| 50 | +# Data loading |
| 51 | +# ------------ |
| 52 | + |
48 | 53 | from pprint import pprint
|
49 | 54 | from time import time
|
50 | 55 | import logging
|
|
# Emit INFO-level progress logs on stdout so the example's steps are visible
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# Restrict the analysis to two newsgroups from the training set
categories = [
    "alt.atheism",
    "talk.religion.misc",
]

# Uncomment the following to do the analysis on all the categories
# categories = None
|
|
# Report how many target categories were loaded.
# NOTE(review): `data` is presumably the fetch_20newsgroups result built
# just above this chunk — confirm against the full file.
print("%d categories" % len(data.target_names))
print()
|
79 | 83 |
|
80 |
| -# ############################################################################# |
81 |
| -# Define a pipeline combining a text feature extractor with a simple |
82 |
| -# classifier |
| 84 | +# %% |
| 85 | +# Pipeline with hyperparameter tuning |
| 86 | +# ----------------------------------- |
| 87 | + |
| 88 | +# Define a pipeline combining a text feature extractor with a simple classifier |
83 | 89 | pipeline = Pipeline(
|
84 | 90 | [
|
85 | 91 | ("vect", CountVectorizer()),
|
|
88 | 94 | ]
|
89 | 95 | )
|
90 | 96 |
|
91 |
| -# uncommenting more parameters will give better exploring power but will |
92 |
| -# increase processing time in a combinatorial way |
# Parameters to use for the grid search. Uncommenting more parameters gives
# better exploring power, but increases processing time in a combinatorial
# way.
93 | 100 | parameters = {
|
94 | 101 | "vect__max_df": (0.5, 0.75, 1.0),
|
95 | 102 | # 'vect__max_features': (None, 5000, 10000, 50000),
|
|
102 | 109 | # 'clf__max_iter': (10, 50, 80),
|
103 | 110 | }
|
104 | 111 |
|
105 |
| -if __name__ == "__main__": |
106 |
| - # multiprocessing requires the fork to happen in a __main__ protected |
107 |
| - # block |
108 |
| - |
109 |
| - # find the best parameters for both the feature extraction and the |
110 |
| - # classifier |
111 |
| - grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1) |
112 |
| - |
113 |
| - print("Performing grid search...") |
114 |
| - print("pipeline:", [name for name, _ in pipeline.steps]) |
115 |
| - print("parameters:") |
116 |
| - pprint(parameters) |
117 |
| - t0 = time() |
118 |
| - grid_search.fit(data.data, data.target) |
119 |
| - print("done in %0.3fs" % (time() - t0)) |
120 |
| - print() |
121 |
| - |
122 |
| - print("Best score: %0.3f" % grid_search.best_score_) |
123 |
| - print("Best parameters set:") |
124 |
| - best_parameters = grid_search.best_estimator_.get_params() |
125 |
| - for param_name in sorted(parameters.keys()): |
126 |
| - print("\t%s: %r" % (param_name, best_parameters[param_name])) |
# Exhaustively search the parameter grid for the combination of feature
# extraction and classifier settings with the best cross-validated score.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [step_name for step_name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)

# Time the fit: this is the expensive part of the example.
t0 = time()
grid_search.fit(data.data, data.target)
elapsed = time() - t0
print("done in %0.3fs" % elapsed)
print()

# Report the best cross-validated score and the winning settings for every
# parameter that was part of the search grid.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
0 commit comments