Self-training metaclassifier · scikit-learn/scikit-learn@af65887 · GitHub

Commit af65887

plbecker authored and orausch committed
Self-training metaclassifier
Co-authored-by: pr0duktiv <patrice@5becker.de>
1 parent 5045c9b commit af65887

File tree: 2 files changed, +194 -0 lines

@@ -0,0 +1,80 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.semi_supervised.self_training import SelfTraining
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.semi_supervised.label_propagation import LabelPropagation
from sklearn.metrics import f1_score
from sklearn.base import clone

supervised_score = []
self_training_score = []
label_propagation_score = []
x_values = []

clf = SVC(probability=True, C=100, gamma=0.7, kernel='rbf')
self_training_clf = SelfTraining(
    clone(clf, safe=True), max_iter=100, threshold=0.8
)
ls = LabelPropagation()

for t in range(20, 80):
    x_values.append(t)
    X, y = load_iris(return_X_y=True)
    X, y = shuffle(X, y, random_state=42)
    y_true = y.copy()

    # Keep the first t labels and mark the remaining samples as unlabeled (-1)
    lim = t
    y[lim:] = -1

    supervised_score_temp = []
    self_training_score_temp = []
    label_propagation_score_temp = []

    skfolds = StratifiedKFold(n_splits=3, random_state=42)
    for train_index, test_index in skfolds.split(X, y):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        y_test_true = y_true[test_index]

        # Labeled subset used by the purely supervised baseline
        X_train_filtered = X_train[np.where(y_train != -1)]
        y_train_filtered = y_train[np.where(y_train != -1)]

        clf.fit(X_train_filtered, y_train_filtered)
        y_pred = clf.predict(X_test)
        supervised_score_temp.append(
            f1_score(y_test_true, y_pred, average='macro')
        )

        # Self-training and label propagation also see the unlabeled (-1) samples
        self_training_clf.fit(X_train, y_train)
        y_pred = self_training_clf.predict(X_test)
        self_training_score_temp.append(
            f1_score(y_test_true, y_pred, average='macro')
        )

        ls.fit(X_train, y_train)
        y_pred = ls.predict(X_test)
        label_propagation_score_temp.append(
            f1_score(y_test_true, y_pred, average='macro')
        )

    supervised_score.append(np.array(supervised_score_temp).mean())
    self_training_score.append(np.array(self_training_score_temp).mean())
    label_propagation_score.append(
        np.array(label_propagation_score_temp).mean()
    )


plt.figure(1)
plt.plot(x_values, supervised_score, label='Supervised')
plt.plot(x_values, self_training_score, label='Self-training')
plt.plot(x_values, label_propagation_score, label='Label Propagation')
plt.legend()
plt.ylabel("F1 score (macro)")
plt.title("Comparison of classifiers on limited labeled data")
plt.xlabel("Number of labeled samples")
plt.show()
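
For a quick look at the same comparison on a single train/test split, a minimal sketch could be the following. It assumes the SelfTraining class added in this commit is importable from sklearn.semi_supervised.self_training; the split size and the 30-label cutoff are arbitrary illustrative choices.

from sklearn.base import clone
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised.self_training import SelfTraining
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=y, random_state=42)

# Mark all but the first 30 training samples as unlabeled (-1)
y_train_masked = y_train.copy()
y_train_masked[30:] = -1

base = SVC(probability=True, C=100, gamma=0.7, kernel='rbf')

# Supervised baseline: only the labeled samples are used
base.fit(X_train[y_train_masked != -1], y_train_masked[y_train_masked != -1])
print("supervised    %.3f"
      % f1_score(y_test, base.predict(X_test), average='macro'))

# Self-training: labeled and unlabeled samples are used together
st = SelfTraining(clone(base), threshold=0.8, max_iter=100)
st.fit(X_train, y_train_masked)
print("self-training %.3f"
      % f1_score(y_test, st.predict(X_test), average='macro'))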
@@ -0,0 +1,114 @@
import numpy as np

from ..base import BaseEstimator
from ..utils.validation import check_X_y, check_array, check_is_fitted
from ..utils import safe_mask


def _check_estimator(estimator):
    """Make sure that an estimator implements the necessary methods."""
    if not hasattr(estimator, "predict_proba"):
        raise ValueError("The base estimator should implement predict_proba!")


class SelfTraining(BaseEstimator):
    """Self-Training classifier.

    Unlabeled samples must be marked with the label -1 in ``y``.

    Parameters
    ----------
    estimator : estimator object
        An estimator object implementing `fit` and `predict_proba`.

    threshold : float
        Predicted probability above which a prediction on an unlabeled
        sample is added to the labeled dataset.

    max_iter : integer
        Maximum number of self-training iterations allowed.
    """
    def __init__(self, estimator, threshold=0.7, max_iter=500):
        self.estimator = estimator
        self.threshold = threshold
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit the SelfTraining estimator to the dataset.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            array representing the data
        y : array-like, shape = (n_samples,)
            array representing the labels; unlabeled samples are marked
            with -1

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y)
        _check_estimator(self.estimator)

        # Data usable for supervised training
        labeled = np.where(y != -1)[0]
        X_labeled = X[safe_mask(X, labeled)]
        y_labeled = y[labeled]

        # Unlabeled data
        unlabeled = np.where(y == -1)[0]
        X_unlabeled = X[safe_mask(X, unlabeled)]

        n_iter = 0
        while len(X_labeled) < len(X) and n_iter < self.max_iter:
            n_iter += 1
            self.estimator.fit(X_labeled, y_labeled)

            # Select predictions where the confidence is above the threshold
            pred = self.predict(X_unlabeled)
            max_proba = np.max(self.predict_proba(X_unlabeled), axis=1)
            confident = np.where(max_proba > self.threshold)[0]
            if confident.size == 0:
                # No prediction is confident enough; further iterations
                # would not add any new labels
                break

            # Add newly labeled confident predictions to the labeled dataset
            X_labeled = np.append(X_labeled, X_unlabeled[confident], axis=0)
            y_labeled = np.append(y_labeled, pred[confident], axis=0)

            # Remove the newly labeled data from the unlabeled dataset
            X_unlabeled = np.delete(X_unlabeled, confident, axis=0)

        self.estimator.fit(X_labeled, y_labeled)
        return self

    def predict(self, X):
        """Predict on a dataset.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            array representing the data

        Returns
        -------
        y : array-like, shape = (n_samples,)
            array with predicted labels
        """
        check_is_fitted(self, 'estimator')
        X = check_array(X)
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Predict probability for each possible outcome.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            array representing the data

        Returns
        -------
        y : array-like, shape = (n_samples, n_classes)
            array with prediction probabilities
        """
        _check_estimator(self.estimator)
        check_is_fitted(self, 'estimator')
        return self.estimator.predict_proba(X)
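
The threshold parameter decides how confident the base classifier has to be before an unlabeled sample gets pseudo-labeled. One illustrative way to pick a starting value, sketched below and not part of this commit (the variable names and the 30-label cutoff are hypothetical), is to fit the base classifier on the labeled portion only and look at its maximum predicted class probabilities on the unlabeled portion:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.utils import shuffle

X, y = load_iris(return_X_y=True)
X, y = shuffle(X, y, random_state=42)
y[30:] = -1  # unlabeled samples carry the label -1

labeled = y != -1
base = SVC(probability=True, C=100, gamma=0.7, kernel='rbf')
base.fit(X[labeled], y[labeled])

# Highest class probability per unlabeled sample; samples above the
# threshold would be pseudo-labeled in the first self-training iteration
max_proba = np.max(base.predict_proba(X[~labeled]), axis=1)
for threshold in (0.6, 0.7, 0.8, 0.9):
    n_confident = int(np.sum(max_proba > threshold))
    print("threshold %.1f: %d of %d unlabeled samples above it"
          % (threshold, n_confident, max_proba.size))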
