diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst
index ee7028f469b5f..a6c0f430c2638 100644
--- a/doc/modules/classification_threshold.rst
+++ b/doc/modules/classification_threshold.rst
@@ -1,6 +1,6 @@
 .. currentmodule:: sklearn.model_selection
 
-.. _TunedThresholdClassifierCV:
+.. _threshold_tuning:
 
 ==================================================
 Tuning the decision threshold for class prediction
@@ -63,7 +63,7 @@ Post-tuning the decision threshold
 
 One solution to address the problem stated in the introduction is to tune the decision
 threshold of the classifier once the model has been trained. The
-:class:`~sklearn.model_selection.TunedThresholdClassifierCV` tunes this threshold using
+:class:`TunedThresholdClassifierCV` tunes this threshold using
 an internal cross-validation. The optimum threshold is chosen to maximize a given
 metric.
 
@@ -80,6 +80,15 @@ a utility metric defined by the business (in this case an insurance company).
    :target: ../auto_examples/model_selection/plot_cost_sensitive_learning.html
    :align: center
 
+Plotting a metric across thresholds
+-----------------------------------
+
+The final plot above shows the value of a utility metric of interest across a range
+of threshold values. This can be a useful visualization when tuning the decision
+threshold, especially if there is more than one metric of interest.
+:func:`~sklearn.metrics.decision_threshold_curve` generates the values required for
+each axis of such a plot: the score at each threshold and the threshold values.
+
 Options to tune the decision threshold
 --------------------------------------
@@ -120,7 +129,7 @@ a meaningful metric for their use case.
 Important notes regarding the internal cross-validation
 -------------------------------------------------------
 
-By default :class:`~sklearn.model_selection.TunedThresholdClassifierCV` uses a 5-fold
+By default :class:`TunedThresholdClassifierCV` uses a 5-fold
 stratified cross-validation to tune the decision threshold. The parameter `cv` allows to
 control the cross-validation strategy. It is possible to bypass cross-validation by
 setting `cv="prefit"` and providing a fitted classifier. In this case, the decision
@@ -143,7 +152,7 @@ Manually setting the decision threshold
 
 The previous sections discussed strategies to find an optimal decision threshold. It is
 also possible to manually set the decision threshold using the class
-:class:`~sklearn.model_selection.FixedThresholdClassifier`. In case that you don't want
+:class:`FixedThresholdClassifier`. In case that you don't want
 to refit the model when calling `fit`, wrap your sub-estimator with a
 :class:`~sklearn.frozen.FrozenEstimator` and do
 ``FixedThresholdClassifier(FrozenEstimator(estimator), ...)``.
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index cf168295a6024..cc15b4ecce185 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -63,7 +63,7 @@ The most common decisions are done on binary classification tasks, where the res
 probability of rain a decision is made on how to act (whether to take mitigating
 measures like an umbrella or not). For classifiers, this is what :term:`predict` returns.
-See also :ref:`TunedThresholdClassifierCV`.
+See also :ref:`threshold_tuning`.
 There are many scoring functions which measure different aspects of such a decision,
 most of them are covered with or derived from the :func:`metrics.confusion_matrix`.
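A minimal plotting sketch of the workflow described by the new user-guide paragraph,
assuming the `decision_threshold_curve` signature introduced in this PR; the dataset,
estimator, metrics and matplotlib usage below are illustrative only and not part of
the patch:

    import matplotlib.pyplot as plt

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import balanced_accuracy_score, decision_threshold_curve, f1_score

    X, y = make_classification(random_state=0)
    y_score = LogisticRegression().fit(X, y).predict_proba(X)[:, 1]

    for metric in (balanced_accuracy_score, f1_score):
        # The function returns the y values (scores) and x values (thresholds) of the plot.
        scores, thresholds = decision_threshold_curve(metric, y, y_score, thresholds=50)
        plt.plot(thresholds, scores, label=metric.__name__)

    plt.xlabel("decision threshold")
    plt.ylabel("score")
    plt.legend()
    plt.show()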
diff --git a/doc/whats_new/upcoming_changes/sklearn.metrics/31338.major-feature.rst b/doc/whats_new/upcoming_changes/sklearn.metrics/31338.major-feature.rst
new file mode 100644
index 0000000000000..52ebf021f0a7f
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.metrics/31338.major-feature.rst
@@ -0,0 +1,4 @@
+- :func:`metrics.decision_threshold_curve` has been added to assess performance
+  across a range of decision thresholds by computing a threshold-dependent
+  metric of interest at each threshold. By
+  :user:`Carlo Lemos ` and :user:`Lucy Liu `.
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index ce86525acc368..acf135c427c8c 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -26,6 +26,7 @@
     recall_score,
     zero_one_loss,
 )
+from ._decision_threshold import decision_threshold_curve
 from ._dist_metrics import DistanceMetric
 from ._plot.confusion_matrix import ConfusionMatrixDisplay
 from ._plot.det_curve import DetCurveDisplay
@@ -124,6 +125,7 @@
     "d2_tweedie_score",
     "davies_bouldin_score",
     "dcg_score",
+    "decision_threshold_curve",
     "det_curve",
     "euclidean_distances",
     "explained_variance_score",
diff --git a/sklearn/metrics/_decision_threshold.py b/sklearn/metrics/_decision_threshold.py
new file mode 100644
index 0000000000000..9dbadb22edc38
--- /dev/null
+++ b/sklearn/metrics/_decision_threshold.py
@@ -0,0 +1,117 @@
+"""Metric per threshold curve to assess binary classification performance.
+
+Compute a metric at each threshold, over a range of threshold values, to aid
+visualization of threshold-dependent metric behavior.
+
+Utilizes `_CurveScorer` methods to do all the computation.
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+from numbers import Integral
+
+from ..utils._param_validation import Interval, validate_params
+
+
+@validate_params(
+    {
+        "score_func": [callable],
+        "y_true": ["array-like"],
+        "y_score": ["array-like"],
+        "thresholds": [
+            Interval(Integral, 2, None, closed="left"),
+            "array-like",
+        ],
+        "greater_is_better": ["boolean"],
+        "labels": ["array-like", None],
+    },
+    prefer_skip_nested_validation=True,
+)
+def decision_threshold_curve(
+    score_func,
+    y_true,
+    y_score,
+    # TODO: should the two parameters below have default values?
+    thresholds=20,
+    greater_is_better=True,
+    labels=None,
+    **kwargs,
+):
+    """Compute a threshold-dependent metric of interest at each threshold.
+
+    Note: this implementation is restricted to the binary classification task.
+
+    Read more in the :ref:`User Guide <threshold_tuning>`.
+
+    .. versionadded:: 1.8
+
+    Parameters
+    ----------
+    score_func : callable
+        The score function to use. It will be called as
+        `score_func(y_true, y_pred, **kwargs)`.
+
+    y_true : array-like of shape (n_samples,)
+        Ground truth (correct) target labels.
+
+    y_score : array-like of shape (n_samples,)
+        Continuous response scores.
+
+    thresholds : int or array-like, default=20
+        Specifies the number of decision thresholds to compute the score for. If an
+        integer, it will be used to generate `thresholds` thresholds uniformly
+        distributed between the minimum and maximum of `y_score`. If an array-like,
+        it will be used as the thresholds.
+
+    greater_is_better : bool, default=True
+        Whether `score_func` is a score function (default), meaning high is
+        good, or a loss function, meaning low is good. In the latter case, the
+        output of `score_func` will be sign-flipped.
+
+    labels : array-like, default=None
+        Class labels. If `None`, inferred from `y_true`.
+        TODO: `labels` is used instead of `classes` to be consistent with other
+        metrics.
+
+    **kwargs : dict
+        Parameters to pass to `score_func`.
+
+    Returns
+    -------
+    score_thresholds : ndarray of shape (n_thresholds,)
+        The scores associated with each threshold.
+
+    thresholds : ndarray of shape (n_thresholds,)
+        The thresholds used to compute the scores.
+
+    See Also
+    --------
+    precision_recall_curve : Compute precision-recall pairs for different
+        probability thresholds.
+    det_curve : Compute error rates for different probability thresholds.
+    roc_curve : Compute Receiver operating characteristic (ROC) curve.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics import accuracy_score, decision_threshold_curve
+    >>> y_true = np.array([0, 0, 1, 1])
+    >>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
+    >>> score_thresholds, thresholds = decision_threshold_curve(
+    ...     accuracy_score, y_true, y_score, thresholds=4)
+    >>> thresholds
+    array([0.1, 0.33333333, 0.56666667, 0.8])
+    >>> score_thresholds
+    array([0.5, 0.75, 0.75, 0.75])
+    """
+    # Import here to prevent a circular import.
+    from ._scorer import _CurveScorer
+
+    sign = 1 if greater_is_better else -1
+    curve_scorer = _CurveScorer(score_func, sign, {}, thresholds)
+    return curve_scorer._scores_from_predictions(
+        y_true,
+        y_score,
+        labels,
+        **kwargs,
+    )
diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index 08e5a20187de7..14391314356fa 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -30,6 +30,7 @@
 from ..utils import Bunch
 from ..utils._param_validation import HasMethods, Hidden, StrOptions, validate_params
 from ..utils._response import _get_response_values
+from ..utils._unique import cached_unique
 from ..utils.metadata_routing import (
     MetadataRequest,
     MetadataRouter,
@@ -1071,23 +1072,24 @@ class _CurveScorer(_BaseScorer):
         `score_func(y_true, y_pred, **kwargs)`.
 
     sign : int
-        Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`.
-        Thus, `sign` defined if higher scores are better or worse.
+        Either 1 or -1. Score is returned as `sign * score_func(estimator, X, y)`.
+        Thus, `sign` defines whether higher scores are better or worse.
 
     kwargs : dict
         Additional parameters to pass to the score function.
 
     thresholds : int or array-like
-        Related to the number of decision thresholds for which we want to compute the
-        score. If an integer, it will be used to generate `thresholds` thresholds
-        uniformly distributed between the minimum and maximum predicted scores. If an
-        array-like, it will be used as the thresholds.
+        Specifies the number of decision thresholds to compute the score for. If an
+        integer, it will be used to generate `thresholds` thresholds uniformly
+        distributed between the minimum and maximum of `y_score`. If an array-like,
+        it will be used as the thresholds.
 
-    response_method : str
+    response_method : str, default=None
         The method to call on the estimator to get the response values.
+        If set to `None`, the `_score` method cannot be used.
     """
 
-    def __init__(self, score_func, sign, kwargs, thresholds, response_method):
+    def __init__(self, score_func, sign, kwargs, thresholds, response_method=None):
         super().__init__(
             score_func=score_func,
             sign=sign,
@@ -1110,8 +1112,75 @@ def from_scorer(cls, scorer, response_method, thresholds):
         instance._metadata_request = scorer._get_metadata_request()
         return instance
 
+    def _scores_from_predictions(
+        self,
+        y_true,
+        y_score,
+        classes=None,
+        **kwargs,
+    ):
+        """Compute scores per threshold, given continuous response and true labels.
+
+        Parameters
+        ----------
+        y_true : array-like of shape (n_samples,)
+            Ground truth (correct) target labels.
+
+        y_score : array-like of shape (n_samples,)
+            Continuous response scores.
+
+        classes : array-like, default=None
+            Class labels. If `None`, inferred from `y_true`.
+
+        **kwargs : dict
+            Parameters to pass to `self.score_func`.
+
+        Returns
+        -------
+        score_thresholds : ndarray of shape (thresholds,)
+            The scores associated with each threshold.
+
+        thresholds : ndarray of shape (thresholds,)
+            The thresholds used to compute the scores.
+        """
+        # Note: this validation could also be done in `decision_threshold_curve`;
+        # not sure which is better.
+        y_true_unique = cached_unique(y_true)
+        if classes is None:
+            classes = y_true_unique
+        # Not sure if this separate error message is needed; the check on
+        # `set(classes) != set(y_true_unique)` below could also fail simply
+        # because `y_true` contains only one class.
+        if len(y_true_unique) == 1:
+            raise ValueError("`y_true` only contains one class label.")
+        if set(classes) != set(y_true_unique):
+            raise ValueError(
+                f"`classes` ({classes}) is not equal to the unique values found in "
+                f"`y_true` ({y_true_unique})."
+            )
+
+        if isinstance(self._thresholds, Integral):
+            potential_thresholds = np.linspace(
+                np.min(y_score), np.max(y_score), self._thresholds
+            )
+        else:
+            potential_thresholds = np.asarray(self._thresholds)
+
+        score_thresholds = [
+            self._sign
+            * self._score_func(
+                y_true,
+                _threshold_scores_to_class_labels(
+                    y_score, th, classes, self._get_pos_label()
+                ),
+                **{**self._kwargs, **kwargs},
+            )
+            for th in potential_thresholds
+        ]
+        return np.array(score_thresholds), potential_thresholds
+
     def _score(self, method_caller, estimator, X, y_true, **kwargs):
-        """Evaluate predicted target values for X relative to y_true.
+        """Compute scores per threshold, given an estimator, X and true labels.
 
         Parameters
         ----------
@@ -1140,27 +1209,21 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs):
         potential_thresholds : ndarray of shape (thresholds,)
             The potential thresholds used to compute the scores.
         """
-        pos_label = self._get_pos_label()
+        if self._response_method is None:
+            raise ValueError(
+                "This method cannot be used when `_CurveScorer` is initialized "
+                "with `response_method=None`."
+            )
+
         y_score = method_caller(
-            estimator, self._response_method, X, pos_label=pos_label
+            estimator, self._response_method, X, pos_label=self._get_pos_label()
         )
-        scoring_kwargs = {**self._kwargs, **kwargs}
-        if isinstance(self._thresholds, Integral):
-            potential_thresholds = np.linspace(
-                np.min(y_score), np.max(y_score), self._thresholds
-            )
-        else:
-            potential_thresholds = np.asarray(self._thresholds)
-        score_thresholds = [
-            self._sign
-            * self._score_func(
-                y_true,
-                _threshold_scores_to_class_labels(
-                    y_score, th, estimator.classes_, pos_label
-                ),
-                **scoring_kwargs,
-            )
-            for th in potential_thresholds
-        ]
-        return np.array(score_thresholds), potential_thresholds
+        # why 'potential' ?
+        score_thresholds, potential_thresholds = self._scores_from_predictions(
+            y_true,
+            y_score,
+            estimator.classes_,
+            **kwargs,
+        )
+        return score_thresholds, potential_thresholds
diff --git a/sklearn/metrics/tests/test_decision_threshold.py b/sklearn/metrics/tests/test_decision_threshold.py
new file mode 100644
index 0000000000000..950ea9e28c916
--- /dev/null
+++ b/sklearn/metrics/tests/test_decision_threshold.py
@@ -0,0 +1,133 @@
+from functools import partial
+
+import pytest
+
+from sklearn.metrics import (
+    accuracy_score,
+    f1_score,
+    fbeta_score,
+    precision_score,
+    recall_score,
+)
+
+
+# TODO(Carlo): Update tests.
+def test_grid_int_bigger_than_set_then_all():
+    # """When `thresholds` parameter is bigger than the number of unique
+    # `y_score` then `len(thresholds)` should be equal to `len(set(y_score))`.
+    # """
+
+    # X, y = make_classification()
+    # clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X, y)
+    # y_score = clf.predict_proba(X)[:, 1]
+
+    # _, thresholds_big_int = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=len(set(y_score)) + 1000
+    # )
+
+    # assert len(thresholds_big_int) == len(set(y_score))
+    assert True
+
+
+def test_binary_clf_curve_multiclass_error():
+    # rng = check_random_state(404)
+    # y_true = rng.randint(0, 3, size=10)
+    # y_pred = rng.rand(10)
+    # msg = "In a multiclass scenario, you must pass "
+    # with pytest.raises(ValueError, match=msg):
+    #     decision_threshold_curve(y_true, y_pred, accuracy_score)
+    assert True
+
+
+@pytest.mark.parametrize(
+    "metric",
+    [
+        # make_scorer(fbeta_score, beta=3),
+        # make_scorer(fbeta_score, beta=0.5),
+        f1_score,
+        precision_score,
+        recall_score,
+        accuracy_score,
+    ],
+)
+def test_decision_threshold_curve_end_points(metric):
+    # rng = check_random_state(0)
+    # y_true = np.array([0] * 50 + [1] * 50)
+    # y_score = rng.normal(3, size=100)
+    # min_pred, max_score = min(y_score), max(y_score)
+
+    # metric_values, _ = decision_threshold_curve(y_true, y_score, metric)
+
+    # assert metric_values[0] == metric(y_true, (y_score > min_pred) * 1)
+    # assert metric_values[-1] == metric(y_true, (y_score > max_score) * 1)
+    assert True
+
+
+@pytest.mark.parametrize(
+    "metric",
+    [partial(fbeta_score, beta=3), precision_score, recall_score],
+)
+def test_zero_sample_weight_equals_excluding(metric):
+    # rng = check_random_state(0)
+    # y_true = np.array([0] * 50 + [1] * 50)
+    # y_score = rng.normal(3, size=100)
+
+    # sample_weight = np.array([0] * 20 + [1] * 80)
+    # scoring_kwargs = {"sample_weight": sample_weight}
+    # metric_values_sw, _ = decision_threshold_curve(
+    #     y_true, y_score, metric, scoring_kwargs=scoring_kwargs
+    # )
+
+    # y_true_exclude = y_true[sample_weight != 0]
+    # y_score_exclude = y_score[sample_weight != 0]
+    # metric_values_exclude, _ = decision_threshold_curve(
+    #     y_true_exclude, y_score_exclude, metric
+    # )
+
+    # assert_allclose(metric_values_sw, metric_values_exclude)
+    assert True
+
+
+def test_len_of_threshold_when_passing_int():
+    # y = [0] * 500 + [1] * 500
+    # y_score = list(range(1000))
+    # _, thresholds = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=13
+    # )
+
+    # assert len(thresholds) == 13
+    assert True
+
+
+@pytest.mark.parametrize(
+    "metric, scoring_kwargs",
+    [
+        (f1_score, None),
+        (f1_score, {}),
+        (fbeta_score, {"beta": 4}),
+    ],
+)
+def test_scoring_kwargs(metric, scoring_kwargs):
+    # y_true = np.array([0] * 50 + [1] * 50)
+    # decision_threshold_curve(y_true, y_true, metric, scoring_kwargs=scoring_kwargs)
+    assert True
+
+
+def test_passing_the_grid():
+    # y = [0] * 500 + [1] * 500
+    # y_score = list(range(1000))
+
+    # grid_sorted = np.array(list(range(200, 300)))
+    # _, thresholds_sorted = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=grid_sorted
+    # )
+
+    # assert_allclose(grid_sorted, thresholds_sorted)
+
+    # grid_not_sorted = grid_sorted[::-1]
+    # _, thresholds_not_sorted = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=grid_not_sorted
+    # )
+
+    # assert_allclose(grid_sorted, thresholds_not_sorted)
+    assert True
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 672ed8ae7eecc..8d4a8574f4a77 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -1653,6 +1653,24 @@ def test_curve_scorer_pos_label(global_random_seed):
     assert scores_pos_label_1.max() == pytest.approx(1.0)
 
 
+def test_curve_scorer_scores_from_predictions():
+    """Check behavior of `_CurveScorer._scores_from_predictions`."""
+    X, y = make_classification(random_state=0)
+    lr = LogisticRegression().fit(X, y)
+    y_score = lr.predict_proba(X)
+
+    curve_scorer = _CurveScorer(
+        balanced_accuracy_score, sign=1, kwargs={}, thresholds=10
+    )
+    score_thresholds, potential_thresholds = curve_scorer._scores_from_predictions(
+        y_true=y,
+        y_score=y_score[:, 1],
+    )
+    # One score per requested threshold.
+    assert score_thresholds.shape == (10,)
+    assert potential_thresholds.shape == (10,)
+
+
 # TODO(1.8): remove
 def test_make_scorer_reponse_method_default_warning():
     with pytest.warns(FutureWarning, match="response_method=None is deprecated"):
diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py
index c68ed38b8819d..707f28c3bc64e 100644
--- a/sklearn/model_selection/_classification_threshold.py
+++ b/sklearn/model_selection/_classification_threshold.py
@@ -510,7 +510,7 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier):
     into a class label. The tuning is done by optimizing a binary metric,
     potentially constrained by a another metric.
 
-    Read more in the :ref:`User Guide <TunedThresholdClassifierCV>`.
+    Read more in the :ref:`User Guide <threshold_tuning>`.
 
     .. versionadded:: 1.5
 
diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py
index 707aa37737c1b..1ddb6421e4834 100644
--- a/sklearn/tests/test_public_functions.py
+++ b/sklearn/tests/test_public_functions.py
@@ -239,6 +239,7 @@ def _check_function_param_validation(
     "sklearn.metrics.d2_tweedie_score",
     "sklearn.metrics.davies_bouldin_score",
     "sklearn.metrics.dcg_score",
+    "sklearn.metrics.decision_threshold_curve",
     "sklearn.metrics.det_curve",
     "sklearn.metrics.explained_variance_score",
     "sklearn.metrics.f1_score",
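A rough consistency sketch for the `_CurveScorer` refactor above, assuming this branch
is installed; `_CurveScorer` is private API and the estimator and metric choices are
arbitrary. The estimator-based `_score` path (invoked by calling the scorer) and the
new `_scores_from_predictions` path should produce the same curve:

    import numpy as np

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import balanced_accuracy_score
    from sklearn.metrics._scorer import _CurveScorer

    X, y = make_classification(random_state=0)
    clf = LogisticRegression().fit(X, y)

    # Estimator path: the scorer extracts the response values via `predict_proba`.
    scorer = _CurveScorer(
        balanced_accuracy_score, sign=1, kwargs={}, thresholds=10,
        response_method="predict_proba",
    )
    scores_est, thresholds_est = scorer(clf, X, y)

    # Prediction path: positive-class scores are passed in directly.
    scores_pred, thresholds_pred = _CurveScorer(
        balanced_accuracy_score, sign=1, kwargs={}, thresholds=10
    )._scores_from_predictions(y, clf.predict_proba(X)[:, 1])

    np.testing.assert_allclose(scores_est, scores_pred)
    np.testing.assert_allclose(thresholds_est, thresholds_pred)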