diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 0e4b3580c5735..80e2516eec2da 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -208,8 +208,8 @@ two ways:
 - It allows specifying multiple metrics for evaluation.
 
 - It returns a dict containing fit-times, score-times
-  (and optionally training scores as well as fitted estimators) in
-  addition to the test score.
+  (and optionally training scores, fitted estimators and train-test split
+  indices) in addition to the test score.
 
 For single metric evaluation, where the scoring parameter is a string,
 callable or None, the keys will be - ``['test_score', 'fit_time', 'score_time']``
@@ -220,10 +220,10 @@ following keys -
 
 ``return_train_score`` is set to ``False`` by default to save computation time.
 To evaluate the scores on the training set as well you need to set it to
-``True``.
-
-You may also retain the estimator fitted on each training set by setting
-``return_estimator=True``.
+``True``. You may also retain the estimator fitted on each training set by
+setting ``return_estimator=True``. Similarly, you may set
+``return_indices=True`` to retain the training and testing indices used to
+split the dataset into train and test sets for each cv split.
 
 The multiple metrics can be specified either as a list, tuple or set of
 predefined scorer names::
diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 3aabed6214771..a09a8c19c1afa 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -38,6 +38,15 @@ Changelog
    :pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
    where 123456 is the *pull request* number, not the issue number.
 
+:mod:`sklearn.calibration`
+..........................
+
+- |Feature| :class:`calibration.CalibrationDisplay` has a new method
+  :meth:`~calibration.CalibrationDisplay.from_cv_results` to plot calibration
+  curves from cross-validation results, together with a standard-deviation
+  uncertainty estimate.
+  :pr:`21211` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.linear_model`
 ...........................
 
@@ -45,6 +54,13 @@ Changelog
   message when the solver does not support sparse matrices with int64 indices.
   :pr:`21093` by `Tom Dupre la Tour`_.
 
+:mod:`sklearn.model_selection`
+..............................
+
+- |Enhancement| :func:`model_selection.cross_validate` accepts a new parameter
+  `return_indices` to return the train-test indices of each cv split.
+  :pr:`21211` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.utils`
 ....................
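For instance (a minimal sketch; ``return_indices`` only exists with this patch
applied), the returned dictionary gains an ``indices`` entry alongside
``estimator``::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate

    X, y = load_iris(return_X_y=True)
    cv_results = cross_validate(
        LogisticRegression(max_iter=1000), X, y, cv=5,
        return_estimator=True, return_indices=True,
    )

    # "indices" maps "train"/"test" to one array of row indices per CV split.
    first_test_fold = cv_results["indices"]["test"][0]
    first_estimator = cv_results["estimator"][0]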
diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 0785938135513..1ca64e39dec2a 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -8,8 +8,10 @@
 #  License: BSD 3 clause
 
 import warnings
+from collections.abc import Iterable
 from inspect import signature
 from functools import partial
+from numbers import Integral
 from math import log
 
 import numpy as np
@@ -29,6 +31,7 @@
 )
 from .preprocessing import label_binarize, LabelEncoder
 from .utils import (
+    _safe_indexing,
     column_or_1d,
     deprecated,
     indexable,
@@ -39,7 +42,6 @@
 from .utils.fixes import delayed
 from .utils.validation import check_is_fitted, check_consistent_length
 from .utils.validation import _check_sample_weight, _num_samples
-from .utils import _safe_indexing
 from .isotonic import IsotonicRegression
 from .svm import LinearSVC
 from .model_selection import check_cv, cross_val_predict
@@ -866,7 +868,54 @@ def predict(self, T):
         return expit(-(self.a_ * T + self.b_))
 
 
-def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5, strategy="uniform"):
+def _compute_bins(y_prob, *, n_bins, strategy):
+    """Helper to determine the bin edges given the number of bins and strategy.
+
+    .. versionadded:: 1.1
+
+    Parameters
+    ----------
+    y_prob : ndarray of shape (n_samples,)
+        The predicted probabilities of the positive class.
+
+    n_bins : int
+        The number of bins used to discretize the [0, 1] interval.
+
+    strategy : {"uniform", "quantile"}
+        Strategy used to define the width of the bins.
+
+        - `"uniform"`: the bins have identical widths;
+        - `"quantile"`: the bins have the same number of samples and depend on
+          `y_prob`.
+
+    Returns
+    -------
+    bins : ndarray of shape (n_bins + 1,)
+        The edges of the bins.
+    """
+    if strategy == "quantile":
+        quantiles = np.linspace(0, 1, n_bins + 1)
+        bins = np.percentile(y_prob, quantiles * 100)
+        bins[-1] = bins[-1] + 1e-8
+    elif strategy == "uniform":
+        bins = np.linspace(0.0, 1.0 + 1e-8, n_bins + 1)
+    else:
+        raise ValueError(
+            f"Invalid entry {strategy} to 'strategy' input. Strategy "
+            "must be either 'quantile' or 'uniform'."
+        )
+    return bins
+
+
+def calibration_curve(
+    y_true,
+    y_prob,
+    *,
+    normalize=False,
+    n_bins=5,
+    strategy="uniform",
+    drop_empty_bins=True,
+):
     """Compute true and predicted probabilities for a calibration curve.
 
     The method assumes the inputs come from a binary classifier, and
@@ -889,11 +938,15 @@ def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5, strategy="un
         is not a proper probability. If True, the smallest value in y_prob
         is linearly mapped onto 0 and the largest one onto 1.
 
-    n_bins : int, default=5
+    n_bins : int or ndarray, default=5
         Number of bins to discretize the [0, 1] interval. A bigger number
         requires more data. Bins with no samples (i.e. without
         corresponding values in `y_prob`) will not be returned, thus the
-        returned arrays may have less than `n_bins` values.
+        returned arrays may have less than `n_bins` values, unless
+        `drop_empty_bins=False`. If a NumPy array is passed, it is
+        interpreted as the bin edges and `strategy` is ignored.
+
+        .. versionadded:: 1.1
+           Added support for passing a custom array of bin edges.
 
     strategy : {'uniform', 'quantile'}, default='uniform'
         Strategy used to define the widths of the bins.
@@ -903,13 +956,19 @@ def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5, strategy="un
         quantile
             The bins have the same number of samples and depend on `y_prob`.
 
+    drop_empty_bins : bool, default=True
+        Whether to drop bins in which no samples fall. If `False`, empty bins
+        are kept and reported as `np.nan` in the returned arrays.
+
+        .. versionadded:: 1.1
+
     Returns
     -------
     prob_true : ndarray of shape (n_bins,) or smaller
         The proportion of samples whose class is the positive class, in each
         bin (fraction of positives).
 
     prob_pred : ndarray of shape (n_bins,) or smaller
         The mean predicted probability in each bin.
 
     References
     ----------
@@ -945,31 +1004,31 @@ def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5, strategy="un
     labels = np.unique(y_true)
     if len(labels) > 2:
         raise ValueError(
-            "Only binary classification is supported. Provided labels %s." % labels
+            f"Only binary classification is supported. Provided labels {labels}."
         )
     y_true = label_binarize(y_true, classes=labels)[:, 0]
 
-    if strategy == "quantile":  # Determine bin edges by distribution of data
-        quantiles = np.linspace(0, 1, n_bins + 1)
-        bins = np.percentile(y_prob, quantiles * 100)
-        bins[-1] = bins[-1] + 1e-8
-    elif strategy == "uniform":
-        bins = np.linspace(0.0, 1.0 + 1e-8, n_bins + 1)
+    if isinstance(n_bins, Integral):
+        bins = _compute_bins(y_prob, n_bins=n_bins, strategy=strategy)
     else:
-        raise ValueError(
-            "Invalid entry to 'strategy' input. Strategy "
-            "must be either 'quantile' or 'uniform'."
-        )
+        bins = n_bins
 
     binids = np.digitize(y_prob, bins) - 1
 
-    bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins))
-    bin_true = np.bincount(binids, weights=y_true, minlength=len(bins))
-    bin_total = np.bincount(binids, minlength=len(bins))
+    bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins) - 1)
+    bin_true = np.bincount(binids, weights=y_true, minlength=len(bins) - 1)
+    bin_total = np.bincount(binids, minlength=len(bins) - 1)
 
     nonzero = bin_total != 0
-    prob_true = bin_true[nonzero] / bin_total[nonzero]
-    prob_pred = bin_sums[nonzero] / bin_total[nonzero]
+    if drop_empty_bins:
+        prob_true = bin_true[nonzero] / bin_total[nonzero]
+        prob_pred = bin_sums[nonzero] / bin_total[nonzero]
+    else:
+        # pre-allocate so that the outputs always have one value per bin
+        prob_true = np.full_like(bin_true, fill_value=np.nan)
+        prob_pred = np.full_like(prob_true, fill_value=np.nan)
+        prob_true[nonzero] = bin_true[nonzero] / bin_total[nonzero]
+        prob_pred[nonzero] = bin_sums[nonzero] / bin_total[nonzero]
 
     return prob_true, prob_pred
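As a quick sanity check of the two ``calibration_curve`` additions above
(custom bin edges and ``drop_empty_bins``; both only exist with this patch
applied), a minimal sketch on made-up data::

    import numpy as np
    from sklearn.calibration import calibration_curve

    y_true = np.array([0, 0, 0, 1, 1, 1])
    y_prob = np.array([0.1, 0.2, 0.3, 0.7, 0.85, 0.9])

    # Pass explicit bin edges instead of an integer number of bins, and keep
    # empty bins: the middle bin contains no samples and is reported as NaN
    # instead of being dropped.
    edges = np.linspace(0.0, 1.0 + 1e-8, 6)
    prob_true, prob_pred = calibration_curve(
        y_true, y_prob, n_bins=edges, drop_empty_bins=False
    )
    # prob_true -> array([0., 0., nan, 1., 1.])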
@@ -989,14 +1048,14 @@ class CalibrationDisplay:
 
     Parameters
     ----------
-    prob_true : ndarray of shape (n_bins,)
+    prob_true : ndarray of shape (n_bins,) or list of such arrays
         The proportion of samples whose class is the positive class (fraction
         of positives), in each bin.
 
-    prob_pred : ndarray of shape (n_bins,)
+    prob_pred : ndarray of shape (n_bins,) or list of such arrays
         The mean predicted probability in each bin.
 
-    y_prob : ndarray of shape (n_samples,)
+    y_prob : ndarray of shape (n_samples,) or list of such arrays
         Probability estimates for the positive class, for each sample.
 
     estimator_name : str, default=None
@@ -1047,7 +1106,16 @@ def __init__(self, prob_true, prob_pred, y_prob, *, estimator_name=None):
         self.y_prob = y_prob
         self.estimator_name = estimator_name
 
-    def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):
+    def plot(
+        self,
+        *,
+        ax=None,
+        name=None,
+        ref_line=True,
+        plot_uncertainty_style="errorbar",
+        uncertainty_kwargs=None,
+        **kwargs,
+    ):
         """Plot visualization.
 
         Extra keyword arguments will be passed to
@@ -1067,6 +1135,22 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):
             If `True`, plots a reference line representing a perfectly
             calibrated classifier.
 
+        plot_uncertainty_style : {"errorbar", "fill_between", "lines"}, \
+                default="errorbar"
+            Style used to plot the uncertainty information. Possibilities are:
+
+            - "errorbar": error bars representing one standard deviation;
+            - "fill_between": filled area representing one standard deviation;
+            - "lines": plot the calibration curve of each CV fold separately.
+
+            .. versionadded:: 1.1
+
+        uncertainty_kwargs : dict, default=None
+            Keyword arguments to be passed to the uncertainty plotting
+            function.
+
+            .. versionadded:: 1.1
+
         **kwargs : dict
             Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.
 
@@ -1092,7 +1176,62 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):
         existing_ref_line = ref_line_label in ax.get_legend_handles_labels()[1]
         if ref_line and not existing_ref_line:
             ax.plot([0, 1], [0, 1], "k:", label=ref_line_label)
-        self.line_ = ax.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[0]
+
+        if isinstance(self.prob_pred[0], Iterable):
+            prob_true_mean = np.nanmean(self.prob_true, axis=0)
+            prob_pred_mean = np.nanmean(self.prob_pred, axis=0)
+            if plot_uncertainty_style in ("errorbar", "fill_between"):
+                prob_true_std = np.nanstd(self.prob_true, axis=0)
+                if plot_uncertainty_style == "errorbar":
+                    errorbar_kwargs = dict(line_kwargs)
+                    if uncertainty_kwargs is not None:
+                        errorbar_kwargs.update(**uncertainty_kwargs)
+
+                    self.line_ = ax.errorbar(
+                        x=prob_pred_mean,
+                        y=prob_true_mean,
+                        yerr=prob_true_std,
+                        **errorbar_kwargs,
+                    )
+                else:
+                    fill_between_kwargs = {"alpha": 0.2}
+                    if uncertainty_kwargs is not None:
+                        fill_between_kwargs.update(**uncertainty_kwargs)
+
+                    self.line_ = ax.plot(
+                        prob_pred_mean, prob_true_mean, **line_kwargs
+                    )[0]
+                    self.uncertainty_ = ax.fill_between(
+                        x=prob_pred_mean,
+                        y1=prob_true_mean - prob_true_std,
+                        y2=prob_true_mean + prob_true_std,
+                        **fill_between_kwargs,
+                    )
+            elif plot_uncertainty_style == "lines":
+                self.line_ = ax.plot(prob_pred_mean, prob_true_mean, **line_kwargs)[0]
+
+                uncertainty_lines_kwargs = {
+                    "alpha": 0.2,
+                    "color": self.line_.get_color(),
+                    "linestyle": "--",
+                }
+                if uncertainty_kwargs is not None:
+                    uncertainty_lines_kwargs.update(**uncertainty_kwargs)
+
+                self.uncertainty_ = []
+                for pred, truth in zip(self.prob_pred, self.prob_true):
+                    line = ax.plot(pred, truth, **uncertainty_lines_kwargs)
+                    self.uncertainty_.append(line[0])
+            else:
+                raise ValueError(
+                    "plot_uncertainty_style must be one of ('errorbar', "
+                    f"'fill_between', 'lines'). Got {plot_uncertainty_style} "
+                    "instead."
+                )
+        else:
+            self.line_ = ax.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[0]
 
         if "label" in line_kwargs:
             ax.legend(loc="lower right")
@@ -1232,6 +1371,8 @@ def from_predictions(
         strategy="uniform",
         name=None,
         ref_line=True,
+        plot_uncertainty_style="errorbar",
+        uncertainty_kwargs=None,
         ax=None,
         **kwargs,
     ):
@@ -1252,10 +1393,10 @@ def from_predictions(
 
         Parameters
         ----------
-        y_true : array-like of shape (n_samples,)
+        y_true : array-like of shape (n_samples,) or list of such arrays
            True labels.
 
-        y_prob : array-like of shape (n_samples,)
+        y_prob : array-like of shape (n_samples,) or list of such arrays
            The predicted probabilities of the positive class.
 
         n_bins : int, default=5
@@ -1277,6 +1418,22 @@ def from_predictions(
             If `True`, plots a reference line representing a perfectly
             calibrated classifier.
 
+        plot_uncertainty_style : {"errorbar", "fill_between", "lines"}, \
+                default="errorbar"
+            Style used to plot the uncertainty information. Possibilities are:
+
+            - "errorbar": error bars representing one standard deviation;
+            - "fill_between": filled area representing one standard deviation;
+            - "lines": plot the calibration curve of each CV fold separately.
+
+            .. versionadded:: 1.1
+
+        uncertainty_kwargs : dict, default=None
+            Dictionary with keyword arguments passed to the uncertainty
+            plotting function.
+
+            .. versionadded:: 1.1
+
         ax : matplotlib axes, default=None
             Axes object to plot on. If `None`, a new figure and axes is
             created.
@@ -1314,12 +1471,209 @@ def from_predictions(
         method_name = f"{cls.__name__}.from_estimator"
         check_matplotlib_support(method_name)
 
-        prob_true, prob_pred = calibration_curve(
-            y_true, y_prob, n_bins=n_bins, strategy=strategy
+        if not isinstance(y_prob[0], Iterable):
+            y_true, y_prob = [y_true], [y_prob]
+            bins = n_bins
+        else:
+            # precompute the bin edges based on all predicted probabilities
+            bins = _compute_bins(np.hstack(y_prob), n_bins=n_bins, strategy=strategy)
+
+        prob_true, prob_pred = zip(
+            *[
+                calibration_curve(
+                    truth, proba, n_bins=bins, strategy=strategy, drop_empty_bins=False
+                )
+                for truth, proba in zip(y_true, y_prob)
+            ]
         )
+
+        if len(prob_true) == 1:
+            # a single set of predictions => take the first element and drop
+            # the empty bins
+            prob_true, prob_pred, y_prob = prob_true[0], prob_pred[0], y_prob[0]
+            mask_nan = np.isnan(prob_pred)
+            prob_true, prob_pred = prob_true[~mask_nan], prob_pred[~mask_nan]
+
         name = name if name is not None else "Classifier"
         disp = cls(
             prob_true=prob_true, prob_pred=prob_pred, y_prob=y_prob, estimator_name=name
         )
-        return disp.plot(ax=ax, ref_line=ref_line, **kwargs)
+        return disp.plot(
+            ax=ax,
+            ref_line=ref_line,
+            plot_uncertainty_style=plot_uncertainty_style,
+            uncertainty_kwargs=uncertainty_kwargs,
+            **kwargs,
+        )
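Since ``from_predictions`` now accepts one array of labels and probabilities
per CV fold, here is a minimal usage sketch of the new uncertainty plotting
(``plot_uncertainty_style`` and ``uncertainty_kwargs`` only exist with this
patch applied; the manual cross-validation loop is just one way to produce
per-fold predictions). The ``from_cv_results`` method added next wraps this
pattern::

    import matplotlib.pyplot as plt
    from sklearn.calibration import CalibrationDisplay
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold

    X, y = make_classification(n_samples=1_000, random_state=0)
    clf = LogisticRegression()

    # One array of test labels and predicted probabilities per fold.
    y_test_folds, y_prob_folds = [], []
    for train_idx, test_idx in StratifiedKFold(n_splits=5).split(X, y):
        clf.fit(X[train_idx], y[train_idx])
        y_test_folds.append(y[test_idx])
        y_prob_folds.append(clf.predict_proba(X[test_idx])[:, 1])

    CalibrationDisplay.from_predictions(
        y_test_folds,
        y_prob_folds,
        plot_uncertainty_style="fill_between",
        uncertainty_kwargs={"alpha": 0.3},
    )
    plt.show()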
+
+    @classmethod
+    def from_cv_results(
+        cls,
+        cv_results,
+        X,
+        y,
+        *,
+        n_bins=5,
+        strategy="uniform",
+        name=None,
+        ref_line=True,
+        plot_uncertainty_style="errorbar",
+        uncertainty_kwargs=None,
+        ax=None,
+        **kwargs,
+    ):
+        """Plot calibration curve using cross-validation results.
+
+        A calibration curve, also known as a reliability diagram, uses inputs
+        from a binary classifier and plots the average predicted probability
+        for each bin against the fraction of positive classes, on the
+        y-axis.
+
+        Extra keyword arguments will be passed to
+        :func:`matplotlib.pyplot.plot`.
+
+        Read more about calibration in the :ref:`User Guide <calibration>` and
+        more about the scikit-learn visualization API in :ref:`visualizations`.
+
+        .. versionadded:: 1.1
+
+        Parameters
+        ----------
+        cv_results : dict
+            Dictionary returned by
+            :func:`~sklearn.model_selection.cross_validate`. You need to
+            explicitly set `return_estimator=True` and `return_indices=True`
+            since both are required by this method.
+
+        X : array-like of shape (n_samples, n_features)
+            The dataset used during cross-validation.
+
+        y : array-like of shape (n_samples,)
+            The target used during cross-validation.
+
+        n_bins : int, default=5
+            Number of bins to discretize the [0, 1] interval into when
+            calculating the calibration curve. A bigger number requires more
+            data.
+
+        strategy : {'uniform', 'quantile'}, default='uniform'
+            Strategy used to define the widths of the bins.
+
+            - `'uniform'`: The bins have identical widths.
+            - `'quantile'`: The bins have the same number of samples and
+              depend on predicted probabilities.
+
+        name : str, default=None
+            Name for labeling curve.
+
+        ref_line : bool, default=True
+            If `True`, plots a reference line representing a perfectly
+            calibrated classifier.
+
+        plot_uncertainty_style : {"errorbar", "fill_between", "lines"}, \
+                default="errorbar"
+            Style used to plot the uncertainty information. Possibilities are:
+
+            - "errorbar": error bars representing one standard deviation;
+            - "fill_between": filled area representing one standard deviation;
+            - "lines": plot the calibration curve of each CV fold separately.
+
+        uncertainty_kwargs : dict, default=None
+            Dictionary with keyword arguments passed to the uncertainty
+            plotting function.
+
+        ax : matplotlib axes, default=None
+            Axes object to plot on. If `None`, a new figure and axes is
+            created.
+
+        **kwargs : dict
+            Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.
+
+        Returns
+        -------
+        display : :class:`~sklearn.calibration.CalibrationDisplay`
+            Object that stores computed values.
+
+        See Also
+        --------
+        CalibrationDisplay.from_estimator : Plot calibration curve using an
+            estimator and data.
+
+        Examples
+        --------
+        >>> import matplotlib.pyplot as plt
+        >>> from sklearn.datasets import make_classification
+        >>> from sklearn.model_selection import cross_validate
+        >>> from sklearn.linear_model import LogisticRegression
+        >>> from sklearn.calibration import CalibrationDisplay
+        >>> X, y = make_classification(random_state=0)
+        >>> clf = LogisticRegression(random_state=0)
+        >>> cv_results = cross_validate(clf, X, y,
+        ...                             return_estimator=True,
+        ...                             return_indices=True)
+        >>> disp = CalibrationDisplay.from_cv_results(cv_results, X, y)
+        >>> plt.show()
+        """
+        method_name = f"{cls.__name__}.from_cv_results"
+        check_matplotlib_support(method_name)
+
+        required_keys = {"estimator", "indices"}
+        if not all(key in cv_results for key in required_keys):
+            raise ValueError(
+                "cv_results does not contain one of the following required "
+                f"keys: {required_keys}. Make sure to explicitly set "
+                "return_estimator=True and return_indices=True when calling "
+                "cross_validate."
+            )
+
+        train_size, test_size = (
+            len(cv_results["indices"]["train"][0]),
+            len(cv_results["indices"]["test"][0]),
+        )
+
+        if _num_samples(X) != train_size + test_size:
+            raise ValueError(
+                "X does not contain the correct number of samples. "
+                f"Expected {train_size + test_size}, got {_num_samples(X)}."
+            )
+
+        if not all(is_classifier(estimator) for estimator in cv_results["estimator"]):
+            raise ValueError(
+                "The estimators in cv_results['estimator'] must be fitted "
+                "classifiers."
+            )
+
+        # FIXME: `pos_label` should not be set to None
+        # We should allow any int or string in `calibration_curve`.
+        y_prob = [
+            _get_response(
+                _safe_indexing(X, test_indices),
+                estimator,
+                response_method="predict_proba",
+                pos_label=None,
+            )[0]
+            for estimator, test_indices in zip(
+                cv_results["estimator"], cv_results["indices"]["test"]
+            )
+        ]
+        y_test = [
+            _safe_indexing(y, test_indices)
+            for test_indices in cv_results["indices"]["test"]
+        ]
+
+        name = (
+            name if name is not None else cv_results["estimator"][0].__class__.__name__
+        )
+
+        return cls.from_predictions(
+            y_test,
+            y_prob,
+            n_bins=n_bins,
+            strategy=strategy,
+            name=name,
+            ref_line=ref_line,
+            plot_uncertainty_style=plot_uncertainty_style,
+            uncertainty_kwargs=uncertainty_kwargs,
+            ax=ax,
+            **kwargs,
+        )
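The two new pieces of information returned by ``cross_validate`` (see the
``_validation.py`` changes below) are also enough to recompute per-fold test
predictions by hand. The following sketch is a rough equivalent of what
``from_cv_results`` derives before delegating to ``from_predictions``; it
ignores the ``pos_label`` handling done by ``_get_response`` and assumes
``estimator.classes_ == [0, 1]``::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate

    X, y = make_classification(n_samples=1_000, random_state=0)
    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=5,
        return_estimator=True, return_indices=True,
    )

    # Per-fold test labels and positive-class probabilities, recomputed from
    # the fitted estimators and the stored test indices.
    y_prob = [
        est.predict_proba(X[test_idx])[:, 1]
        for est, test_idx in zip(
            cv_results["estimator"], cv_results["indices"]["test"]
        )
    ]
    y_test = [y[test_idx] for test_idx in cv_results["indices"]["test"]]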
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 760418b7d8f54..c9bdfe9c606c6 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -16,7 +16,7 @@
 import time
 from traceback import format_exc
 from contextlib import suppress
-from collections import Counter
+from collections import Counter, defaultdict
 
 import numpy as np
 import scipy.sparse as sp
@@ -59,6 +59,7 @@ def cross_validate(
     pre_dispatch="2*n_jobs",
     return_train_score=False,
     return_estimator=False,
+    return_indices=False,
     error_score=np.nan,
 ):
     """Evaluate metric(s) by cross-validation and also record fit/score times.
@@ -169,6 +170,11 @@ def cross_validate(
 
         .. versionadded:: 0.20
 
+    return_indices : bool, default=False
+        Whether to return the train-test indices selected for each split.
+
+        .. versionadded:: 1.1
+
     error_score : 'raise' or numeric, default=np.nan
         Value to assign to the score if an error occurs in estimator fitting.
         If set to 'raise', the error is raised.
@@ -207,6 +213,11 @@ def cross_validate(
                 The estimator objects for each cv split.
                 This is available only if ``return_estimator`` parameter
                 is set to ``True``.
+            ``indices``
+                The train-test indices for each cv split, as a dictionary
+                whose keys are ``"train"`` and ``"test"`` and whose values
+                contain one NumPy array of indices per split. This is
+                available only if ``return_indices`` is set to ``True``.
 
     Examples
     --------
@@ -278,6 +289,7 @@ def cross_validate(
             return_train_score=return_train_score,
             return_times=True,
             return_estimator=return_estimator,
+            return_indices=return_indices,
             error_score=error_score,
         )
         for train, test in cv.split(X, y, groups)
@@ -300,6 +312,10 @@ def cross_validate(
     if return_estimator:
         ret["estimator"] = results["estimator"]
 
+    if return_indices:
+        ret["indices"] = defaultdict(list)
+        ret["indices"]["train"], ret["indices"]["test"] = zip(*results["indices"])
+
     test_scores_dict = _normalize_score_results(results["test_scores"])
     if return_train_score:
         train_scores_dict = _normalize_score_results(results["train_scores"])
@@ -538,6 +554,7 @@ def _fit_and_score(
     return_n_test_samples=False,
     return_times=False,
     return_estimator=False,
+    return_indices=False,
     split_progress=None,
     candidate_progress=None,
     error_score=np.nan,
@@ -609,6 +626,9 @@ def _fit_and_score(
     return_estimator : bool, default=False
         Whether to return the fitted estimator.
 
+    return_indices : bool, default=False
+        Whether to return the train-test indices of the cv split.
+
     Returns
     -------
     result : dict with the following attributes
@@ -742,6 +762,8 @@ def _fit_and_score(
         result["parameters"] = parameters
     if return_estimator:
         result["estimator"] = estimator
+    if return_indices:
+        result["indices"] = (train, test)
 
     return result
diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py
index 215ceb5877669..c06e5c49621e8 100644
--- a/sklearn/model_selection/tests/test_validation.py
+++ b/sklearn/model_selection/tests/test_validation.py
@@ -2357,3 +2357,27 @@ def _more_tags(self):
     msg = "_pairwise was deprecated in 0.24 and will be removed in 1.1"
     with pytest.warns(FutureWarning, match=msg):
         cross_validate(svm, linear_kernel, y, cv=2)
+
+
+def test_cross_validate_return_indices():
+    """Check the behaviour of `return_indices` in `cross_validate`."""
+    X, y = load_iris(return_X_y=True)
+    clf = SVC(gamma="auto")
+    grid = GridSearchCV(clf, param_grid={"C": [1, 10]})
+
+    cv = KFold(n_splits=3, shuffle=False)
+    cv_results = cross_validate(grid, X, y, cv=cv, n_jobs=2, return_indices=False)
+    assert "indices" not in cv_results
+
+    cv_results = cross_validate(grid, X, y, cv=cv, n_jobs=2, return_indices=True)
+    assert "indices" in cv_results
+    train_indices = cv_results["indices"]["train"]
+    test_indices = cv_results["indices"]["test"]
+    assert len(train_indices) == cv.n_splits
+    assert len(test_indices) == cv.n_splits
+
+    assert_array_equal([indices.size for indices in train_indices], 100)
+    assert_array_equal([indices.size for indices in test_indices], 50)
+
+    assert_array_equal(np.unique(np.concatenate(train_indices)), np.arange(y.size))
+    assert_array_equal(np.concatenate(test_indices), np.arange(y.size))
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index 7b8d656bef939..6fc0db8e61c48 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -20,7 +20,7 @@
 from sklearn.exceptions import NotFittedError
 from sklearn.datasets import make_classification, make_blobs, load_iris
 from sklearn.preprocessing import LabelEncoder
-from sklearn.model_selection import KFold, cross_val_predict
+from sklearn.model_selection import KFold, cross_validate, cross_val_predict
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.ensemble import (
     RandomForestClassifier,
@@ -878,3 +878,13 @@ def test_calibrated_classifier_cv_zeros_sample_weights_equivalence(method, ensem
     y_pred_without_weights = calibrated_clf_without_weights.predict_proba(X)
 
     assert_allclose(y_pred_with_weights, y_pred_without_weights)
+
+
+def test_calibration_display_from_cv_results(pyplot):
+    # FIXME: extend with proper checks of the plotted uncertainty
+    X, y = make_classification(random_state=0)
+    clf = LogisticRegression(random_state=0)
+    cv_results = cross_validate(clf, X, y, return_estimator=True, return_indices=True)
+
+    disp = CalibrationDisplay.from_cv_results(cv_results, X, y)
+    assert isinstance(disp, CalibrationDisplay)
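A possible follow-up for the FIXME above (a suggestion only, not part of this
patch): a parametrized smoke test covering the three uncertainty styles exposed
by ``from_cv_results``::

    @pytest.mark.parametrize(
        "plot_uncertainty_style", ["errorbar", "fill_between", "lines"]
    )
    def test_calibration_display_from_cv_results_uncertainty_style(
        pyplot, plot_uncertainty_style
    ):
        """Smoke test the uncertainty styles supported by `from_cv_results`."""
        X, y = make_classification(random_state=0)
        clf = LogisticRegression(random_state=0)
        cv_results = cross_validate(
            clf, X, y, return_estimator=True, return_indices=True
        )
        disp = CalibrationDisplay.from_cv_results(
            cv_results, X, y, plot_uncertainty_style=plot_uncertainty_style
        )
        assert isinstance(disp, CalibrationDisplay)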