Semi-supervised Classification on a Text Dataset
================================================

- In this example, semi-supervised classifiers are trained on the 20 newsgroups
- dataset (which will be automatically downloaded).
+ This example demonstrates the effectiveness of semi-supervised learning
+ in text classification when labeled data is scarce.
+ We compare four different approaches:

- You can adjust the number of categories by giving their names to the dataset
- loader or setting them to `None` to get all 20 of them.
+ 1. Supervised learning using 100% of the labeled data (baseline)
+
+    - Uses SGDClassifier with TF-IDF features
+    - Represents the best possible performance under full supervision
+
+ 2. Supervised learning using only 20% of the labeled data
+
+    - Same model as the baseline, but trained on far fewer labeled samples
+    - Shows how much performance degrades when labels are scarce
+
+ 3. SelfTrainingClassifier (semi-supervised)
+
+    - Uses 20% labeled data + 80% unlabeled data
+    - Iteratively assigns pseudo-labels to the unlabeled data
+    - Demonstrates how self-training can improve performance
+
+ 4. LabelSpreading (semi-supervised)
+
+    - Uses 20% labeled data + 80% unlabeled data
+    - Propagates labels through the data manifold
+    - Shows how graph-based methods can leverage unlabeled data
+
+ The example uses the 20 newsgroups dataset, restricted to five categories.
+ The results show how semi-supervised methods can outperform purely
+ supervised learning when labeled data is limited, by making effective
+ use of the unlabeled samples.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
+ import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_20newsgroups
        "comp.sys.mac.hardware",
    ],
)
- print("%d documents" % len(data.filenames))
- print("%d categories" % len(data.target_names))
- print()
# Parameters
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log_loss")
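# loss="log_loss" gives SGDClassifier a predict_proba method, which
# SelfTrainingClassifier needs in order to score candidate pseudo-labels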
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
-       ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
+       ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=False)),
    ]
)
# LabelSpreading Pipeline
)
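# %%
# For intuition, a minimal sketch of the loop that SelfTrainingClassifier
# automates (illustrative only; names such as `base_clf` and `X_unlab` are
# hypothetical, inputs are assumed to be numpy arrays rather than raw text,
# and the real estimator adds extra stopping criteria and bookkeeping):
# fit on the labeled subset, pseudo-label predictions whose confidence
# exceeds a threshold (0.75 is the estimator's default), grow the labeled
# set, and repeat until nothing new qualifies.


def self_training_sketch(base_clf, X_lab, y_lab, X_unlab, threshold=0.75):
    """Illustrative self-training loop; base_clf must expose predict_proba."""
    while len(X_unlab) > 0:
        base_clf.fit(X_lab, y_lab)
        proba = base_clf.predict_proba(X_unlab)
        confident = proba.max(axis=1) >= threshold
        if not confident.any():
            break  # no remaining prediction is confident enough
        pseudo_labels = base_clf.classes_[proba[confident].argmax(axis=1)]
        X_lab = np.concatenate([X_lab, X_unlab[confident]])
        y_lab = np.concatenate([y_lab, pseudo_labels])
        X_unlab = X_unlab[~confident]
    return base_clf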
- def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
-     print("Number of training samples:", len(X_train))
-     print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1))
+ def eval_and_get_f1(clf, X_train, y_train, X_test, y_test):
+     """Evaluate model performance and return the micro-averaged F1 score."""
+     print(f"Number of training samples: {len(X_train)}")
+     print(f"Unlabeled samples in training set: {sum(1 for x in y_train if x == -1)}")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
-     print(
-         "Micro-averaged F1 score on test set: %0.3f"
-         % f1_score(y_test, y_pred, average="micro")
-     )
-     print("-" * 10)
-     print()
+     f1 = f1_score(y_test, y_pred, average="micro")
+     print(f"Micro-averaged F1 score on test set: {f1:.3f}")
+     print("\n")
+     return f1
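# Note: for single-label multiclass data such as this, the micro-averaged
# F1 score is equivalent to accuracy computed over all test samples.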

- if __name__ == "__main__":
-     X, y = data.data, data.target
-     X_train, X_test, y_train, y_test = train_test_split(X, y)
+ X, y = data.data, data.target
+ X_train, X_test, y_train, y_test = train_test_split(X, y)

-     print("Supervised SGDClassifier on 100% of the data:")
-     eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)
+ f1_scores = {}

-     # select a mask of 20% of the train dataset
-     y_mask = np.random.rand(len(y_train)) < 0.2
+ # Evaluate the supervised model with 100% of the training data
+ print("1. Supervised SGDClassifier on 100% of the data:")
+ f1_scores["Supervised (100%)"] = eval_and_get_f1(
+     pipeline, X_train, y_train, X_test, y_test
+ )
-     # X_20 and y_20 are the subset of the train dataset indicated by the mask
-     X_20, y_20 = map(
-         list, zip(*((x, y) for x, y, m in zip(X_train, y_train, y_mask) if m))
+ # Evaluate the supervised model with only 20% of the training data
+ print("2. Supervised SGDClassifier on 20% of the training data:")
+ y_mask = np.random.rand(len(y_train)) < 0.2
+ # X_20 and y_20 are the subset of the train dataset indicated by the mask
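# (zip(*pairs) transposes the kept (x, y) pairs into two parallel sequences)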
+ X_20, y_20 = map(list, zip(*((x, y) for x, y, m in zip(X_train, y_train, y_mask) if m)))
+ f1_scores["Supervised (20%)"] = eval_and_get_f1(pipeline, X_20, y_20, X_test, y_test)
+
+ # Evaluate the semi-supervised approaches
+ print(
+     "3. SelfTrainingClassifier (semi-supervised) using 20% labeled "
+     "+ 80% unlabeled data:"
+ )
+ y_train_semi = y_train.copy()
+ y_train_semi[~y_mask] = -1  # mark unlabeled data with -1
+ f1_scores["SelfTraining"] = eval_and_get_f1(
+     st_pipeline, X_train, y_train_semi, X_test, y_test
+ )
+ print("4. LabelSpreading (semi-supervised) using 20% labeled + 80% unlabeled data:")
+ f1_scores["LabelSpreading"] = eval_and_get_f1(
+     ls_pipeline, X_train, y_train_semi, X_test, y_test
+ )
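# %%
# For intuition, a tiny self-contained sketch (illustrative only; the toy
# arrays below are hypothetical) of the -1 convention used above:
# LabelSpreading propagates the known labels along a nearest-neighbor
# graph, so each toy cluster inherits the class of its single labeled point.
from sklearn.semi_supervised import LabelSpreading

toy_X = np.array([[0.0], [0.1], [0.2], [5.0], [5.1], [5.2]])
toy_y = np.array([0, -1, -1, 1, -1, -1])  # -1 marks the unlabeled samples
toy_model = LabelSpreading(kernel="knn", n_neighbors=2)
toy_model.fit(toy_X, toy_y)
print(toy_model.transduction_)  # inferred labels for all six samples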
+ # %%
+ # Plot results
+ # ------------
+ # Visualize the performance of each classification approach with a bar chart.
+ # This makes it easy to compare the methods by their micro-averaged F1 scores.
+
+ plt.figure(figsize=(10, 6))
+
+ models = list(f1_scores.keys())
+ scores = list(f1_scores.values())
+
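# a contrasting color highlights the SelfTraining bar singled out in the caption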
+ colors = ["royalblue", "royalblue", "forestgreen", "royalblue"]
+ bars = plt.bar(models, scores, color=colors)
+
+ plt.title("Comparison of Classification Approaches")
+ plt.ylabel("Micro-averaged F1 Score")
+ plt.xticks()
+
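# annotate each bar with its F1 score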
+ for bar in bars:
+     height = bar.get_height()
+     plt.text(
+         bar.get_x() + bar.get_width() / 2.0,
+         height,
+         f"{height:.2f}",
+         ha="center",
+         va="bottom",
    )
-     print("Supervised SGDClassifier on 20% of the training data:")
-     eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)

-     # set the non-masked subset to be unlabeled
-     y_train[~y_mask] = -1
-     print("SelfTrainingClassifier on 20% of the training data (rest is unlabeled):")
-     eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)
+ plt.figtext(
+     0.5,
+     0.02,
+     "SelfTraining classifier shows improved performance over "
+     "supervised learning with limited data",
+     ha="center",
+     va="bottom",
+     fontsize=10,
+     style="italic",
+ )

-     print("LabelSpreading on 20% of the data (rest is unlabeled):")
-     eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)
+ plt.tight_layout()
+ plt.subplots_adjust(bottom=0.15)
+ plt.show()