scikit-learn · GaelVaroquaux · Jun 22, 2016 · Jun 18, 2016 · Jun 19, 2016 · Jun 19, 2016
diff --git a/doc/datasets/twenty_newsgroups.rst b/doc/datasets/twenty_newsgroups.rst
@@ -132,8 +132,8 @@ which is fast to train and achieves a decent F-score::
   >>> clf = MultinomialNB(alpha=.01)
   >>> clf.fit(vectors, newsgroups_train.target)
   >>> pred = clf.predict(vectors_test)
-  >>> metrics.f1_score(newsgroups_test.target, pred, average='weighted')
-  0.88251152461278892
+  >>> metrics.f1_score(newsgroups_test.target, pred, average='macro')
+  0.88213592402729568
 
 (The example :ref:`example_text_document_classification_20newsgroups.py` shuffles
 the training and test data, instead of segmenting by time, and in that case
@@ -182,8 +182,8 @@ blocks, and quotation blocks respectively.
   ...                                      categories=categories)
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> pred = clf.predict(vectors_test)
-  >>> metrics.f1_score(pred, newsgroups_test.target, average='weighted')
-  0.78409163025839435
+  >>> metrics.f1_score(pred, newsgroups_test.target, average='macro')
+  0.77310350681274775
 
 This classifier lost a lot of its F-score, just because we removed
 metadata that has little to do with topic classification.
@@ -193,12 +193,12 @@ It loses even more if we also strip this metadata from the training data:
   ...                                       remove=('headers', 'footers', 'quotes'),
   ...                                       categories=categories)
   >>> vectors = vectorizer.fit_transform(newsgroups_train.data)
-  >>> clf = BernoulliNB(alpha=.01)
+  >>> clf = MultinomialNB(alpha=.01)
   >>> clf.fit(vectors, newsgroups_train.target)
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> pred = clf.predict(vectors_test)
-  >>> metrics.f1_score(newsgroups_test.target, pred, average='weighted')
-  0.73160869205141166
+  >>> metrics.f1_score(newsgroups_test.target, pred, average='macro')
+  0.76995175184521725
 
 Some other classifiers cope better with this harder version of the task. Try
 running :ref:`example_model_selection_grid_search_text_feature_extraction.py` with and without

diff --git a/examples/model_selection/grid_search_digits.py b/examples/model_selection/grid_search_digits.py
@@ -51,7 +51,7 @@
     print()
 
     clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
-                       scoring='%s_weighted' % score)
+                       scoring='%s_macro' % score)
     clf.fit(X_train, y_train)
 
     print("Best parameters set found on development set:")

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
@@ -469,7 +469,7 @@ def test_precision_recall_f1_score_multiclass_pos_label_none():
     # compute scores with default labels introspection
     p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                  pos_label=None,
-                                                 average='weighted')
+                                                 average='macro')
 
 
 def test_zero_precision_recall():
@@ -482,10 +482,10 @@ def test_zero_precision_recall():
         y_pred = np.array([2, 0, 1, 1, 2, 0])
 
         assert_almost_equal(precision_score(y_true, y_pred,
-                                            average='weighted'), 0.0, 2)
-        assert_almost_equal(recall_score(y_true, y_pred, average='weighted'),
+                                            average='macro'), 0.0, 2)
+        assert_almost_equal(recall_score(y_true, y_pred, average='macro'),
                             0.0, 2)
-        assert_almost_equal(f1_score(y_true, y_pred, average='weighted'),
+        assert_almost_equal(f1_score(y_true, y_pred, average='macro'),
                             0.0, 2)
 
     finally:

diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py
@@ -439,9 +439,9 @@ def test_auto_weight():
         y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
         clf.set_params(class_weight='balanced')
         y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],).predict(X)
-        assert_true(metrics.f1_score(y, y_pred, average='weighted')
+        assert_true(metrics.f1_score(y, y_pred, average='macro')
                     <= metrics.f1_score(y, y_pred_balanced,
-                                        average='weighted'))
+                                        average='macro'))
 
 
 def test_bad_input():