diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index 35ed4c515dd81..bb73b3ad3e22f 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -23,7 +23,6 @@ "PassiveAggressiveClassifier", "PassiveAggressiveRegressor", "QuadraticDiscriminantAnalysis", - "SelfTrainingClassifier", "SparseRandomProjection", "SpectralBiclustering", "SpectralCoclustering", diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index ad627c6f98574..71d2a7b32236b 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -37,30 +37,30 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): Parameters ---------- base_estimator : estimator object - An estimator object implementing ``fit`` and ``predict_proba``. - Invoking the ``fit`` method will fit a clone of the passed estimator, - which will be stored in the ``base_estimator_`` attribute. + An estimator object implementing `fit` and `predict_proba`. + Invoking the `fit` method will fit a clone of the passed estimator, + which will be stored in the `base_estimator_` attribute. threshold : float, default=0.75 The decision threshold for use with `criterion='threshold'`. - Should be in [0, 1). When using the 'threshold' criterion, a + Should be in [0, 1). When using the `'threshold'` criterion, a :ref:`well calibrated classifier <calibration>` should be used. criterion : {'threshold', 'k_best'}, default='threshold' The selection criterion used to select which labels to add to the - training set. If 'threshold', pseudo-labels with prediction - probabilities above `threshold` are added to the dataset. If 'k_best', + training set. If `'threshold'`, pseudo-labels with prediction - probabilities above `threshold` are added to the dataset. If `'k_best'`, the `k_best` pseudo-labels with highest prediction probabilities are added to the dataset. 
When using the 'threshold' criterion, a :ref:`well calibrated classifier <calibration>` should be used. k_best : int, default=10 The amount of samples to add in each iteration. Only used when - `criterion` is k_best'. + `criterion='k_best'`. max_iter : int or None, default=10 Maximum number of iterations allowed. Should be greater than or equal - to 0. If it is ``None``, the classifier will continue to predict labels + to 0. If it is `None`, the classifier will continue to predict labels until no new pseudo-labels are added, or all unlabeled samples have been labeled. @@ -74,7 +74,7 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): classes_ : ndarray or list of ndarray of shape (n_classes,) Class labels for each output. (Taken from the trained - ``base_estimator_``). + `base_estimator_`). transduction_ : ndarray of shape (n_samples,) The labels used for the final fit of the classifier, including @@ -104,11 +104,24 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): termination_condition_ : {'max_iter', 'no_change', 'all_labeled'} The reason that fitting was stopped. - - 'max_iter': `n_iter_` reached `max_iter`. - - 'no_change': no new labels were predicted. - - 'all_labeled': all unlabeled samples were labeled before `max_iter` + - `'max_iter'`: `n_iter_` reached `max_iter`. + - `'no_change'`: no new labels were predicted. + - `'all_labeled'`: all unlabeled samples were labeled before `max_iter` was reached. + See Also + -------- + LabelPropagation : Label propagation classifier. + LabelSpreading : Label spreading model for semi-supervised learning. + + References + ---------- + David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling + supervised methods. In Proceedings of the 33rd annual meeting on + Association for Computational Linguistics (ACL '95). Association for + Computational Linguistics, Stroudsburg, PA, USA, 189-196. 
DOI: + https://doi.org/10.3115/981658.981684 + Examples -------- >>> import numpy as np @@ -123,14 +136,6 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): >>> self_training_model = SelfTrainingClassifier(svc) >>> self_training_model.fit(iris.data, iris.target) SelfTrainingClassifier(...) - - References - ---------- - David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling - supervised methods. In Proceedings of the 33rd annual meeting on - Association for Computational Linguistics (ACL '95). Association for - Computational Linguistics, Stroudsburg, PA, USA, 189-196. DOI: - https://doi.org/10.3115/981658.981684 """ _estimator_type = "classifier" @@ -153,7 +158,7 @@ def __init__( def fit(self, X, y): """ - Fits this ``SelfTrainingClassifier`` to a dataset. + Fit self-training classifier using `X`, `y` as training data. Parameters ---------- @@ -167,7 +172,7 @@ def fit(self, X, y): Returns ------- self : object - Returns an instance of self. + Fitted estimator. """ # we need row slicing support for sparce matrices, but costly finiteness check # can be delegated to the base estimator. @@ -281,7 +286,7 @@ def fit(self, X, y): @if_delegate_has_method(delegate="base_estimator") def predict(self, X): - """Predict the classes of X. + """Predict the classes of `X`. Parameters ---------- @@ -326,7 +331,7 @@ def predict_proba(self, X): @if_delegate_has_method(delegate="base_estimator") def decision_function(self, X): - """Calls decision function of the `base_estimator`. + """Call decision function of the `base_estimator`. Parameters ---------- @@ -372,7 +377,7 @@ def predict_log_proba(self, X): @if_delegate_has_method(delegate="base_estimator") def score(self, X, y): - """Calls score on the `base_estimator`. + """Call score on the `base_estimator`. Parameters ----------