From 272e75fde2558f7bb0037fda06b75c3a10c05479 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Wed, 15 Dec 2021 00:58:45 +0100
Subject: [PATCH 01/20] callback API

---
 sklearn/__init__.py                           |   1 +
 sklearn/base.py                               | 136 +++++++++
 sklearn/callback/__init__.py                  |  25 ++
 sklearn/callback/_base.py                     | 126 ++++++++
 sklearn/callback/_computation_tree.py         | 268 ++++++++++++++++++
 sklearn/callback/_convergence_monitor.py      | 118 ++++++++
 sklearn/callback/_early_stopping.py           |  48 ++++
 sklearn/callback/_progressbar.py              | 257 +++++++++++++++++
 sklearn/callback/_snapshot.py                 |  82 ++++++
 sklearn/callback/_text_verbose.py             |  44 +++
 .../callback/tests/test_computation_tree.py   |  98 +++++++
 sklearn/decomposition/_nmf.py                 |  95 ++++++-
 sklearn/linear_model/_logistic.py             |  62 +++-
 sklearn/linear_model/_sag.py                  |   4 +
 sklearn/linear_model/_sag_fast.pyx.tp         |  21 +-
 sklearn/pipeline.py                           |  30 +-
 sklearn/utils/optimize.py                     |  21 +-
 17 files changed, 1416 insertions(+), 20 deletions(-)
 create mode 100644 sklearn/callback/__init__.py
 create mode 100644 sklearn/callback/_base.py
 create mode 100644 sklearn/callback/_computation_tree.py
 create mode 100644 sklearn/callback/_convergence_monitor.py
 create mode 100644 sklearn/callback/_early_stopping.py
 create mode 100644 sklearn/callback/_progressbar.py
 create mode 100644 sklearn/callback/_snapshot.py
 create mode 100644 sklearn/callback/_text_verbose.py
 create mode 100644 sklearn/callback/tests/test_computation_tree.py

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 77ee28271bfaf..0e667babf1cee 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -84,6 +84,7 @@
 __all__ = [
     "calibration",
+    "callback",
     "cluster",
     "covariance",
     "cross_decomposition",
diff --git a/sklearn/base.py b/sklearn/base.py
index 06e9a63630923..4f6b63cb2add1 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -9,6 +9,7 @@
 import platform
 import inspect
 import re
+import pickle
 
 import numpy as np
 
@@ -28,6 +29,9 @@
 from .utils.validation import check_is_fitted
 from .utils._estimator_html_repr import estimator_html_repr
 from .utils.validation import _get_feature_names
+from .callback import BaseCallback
+from .callback import AutoPropagatedMixin
+from .callback import ComputationTree
 
 
 def clone(estimator, *, safe=True):
@@ -84,6 +88,10 @@ def clone(estimator, *, safe=True):
     new_object = klass(**new_object_params)
     params_set = new_object.get_params(deep=False)
 
+    # copy callbacks
+    if hasattr(estimator, "_callbacks"):
+        new_object._callbacks = clone(estimator._callbacks, safe=False)
+
     # quick sanity check of the parameters of the clone
     for name in new_object_params:
         param1 = new_object_params[name]
@@ -597,6 +605,134 @@ def _validate_data(
 
         return out
 
+    def _set_callbacks(self, callbacks):
+        """Set callbacks for the estimator.
+
+        Parameters
+        ----------
+        callbacks : callback or list of callbacks
+            The callbacks to set.
+        """
+        if not isinstance(callbacks, list):
+            callbacks = [callbacks]
+
+        if not all(isinstance(callback, BaseCallback) for callback in callbacks):
+            raise TypeError("callbacks must be subclasses of BaseCallback.")
+
+        self._callbacks = callbacks
+
+    # XXX should be a method of MetaEstimatorMixin but this mixin can't handle all
+    # meta-estimators.
+    def _propagate_callbacks(self, sub_estimator, parent_node):
+        """Propagate the auto-propagated callbacks to a sub-estimator
+
+        Parameters
+        ----------
+        sub_estimator : estimator instance
+            The sub-estimator to propagate the callbacks to.
+
+        parent_node : ComputationNode instance
+            The computation node in this estimator to set as parent_node to the
+            computation tree of the sub-estimator. It must be the node where the fit
+            method of the sub-estimator is called.
+        """
+        if not hasattr(self, "_callbacks"):
+            return
+
+        if hasattr(sub_estimator, "_callbacks") and any(
+            isinstance(callback, AutoPropagatedMixin)
+            for callback in sub_estimator._callbacks
+        ):
+            bad_callbacks = [
+                callback.__class__.__name__
+                for callback in sub_estimator._callbacks
+                if isinstance(callback, AutoPropagatedMixin)
+            ]
+            raise TypeError(
+                f"The sub-estimators ({sub_estimator.__class__.__name__}) of a"
+                f" meta-estimator ({self.__class__.__name__}) can't have"
+                f" auto-propagated callbacks ({bad_callbacks})."
+                " Set them directly on the meta-estimator."
+            )
+
+        propagated_callbacks = [
+            callback
+            for callback in self._callbacks
+            if isinstance(callback, AutoPropagatedMixin)
+        ]
+
+        if not propagated_callbacks:
+            return
+
+        sub_estimator._parent_node = parent_node
+
+        if not hasattr(sub_estimator, "_callbacks"):
+            sub_estimator._callbacks = propagated_callbacks
+        else:
+            sub_estimator._callbacks.extend(propagated_callbacks)
+
+    def _eval_callbacks_on_fit_begin(self, *, levels, X=None, y=None):
+        """Evaluate the on_fit_begin method of the callbacks
+
+        The computation tree is also built at this point.
+
+        This method should be called after all data and parameter validation.
+
+        Parameters
+        ----------
+        levels : list of dict
+            A description of the nested levels of computation of the estimator to build
+            the computation tree. It's a list of dicts with "descr" and "max_iter" keys.
+
+        X : ndarray or sparse matrix, default=None
+            The training data.
+
+        y : ndarray, default=None
+            The target.
+
+        Returns
+        -------
+        root : ComputationNode instance
+            The root of the computation tree.
+        """
+        self._computation_tree = ComputationTree(
+            estimator_name=self.__class__.__name__,
+            levels=levels,
+            parent_node=getattr(self, "_parent_node", None),
+        )
+
+        if hasattr(self, "_callbacks"):
+            file_path = self._computation_tree.tree_dir / "computation_tree.pkl"
+            with open(file_path, "wb") as f:
+                pickle.dump(self._computation_tree, f)
+
+            for callback in self._callbacks:
+                is_propagated = hasattr(self, "_parent_node") and isinstance(
+                    callback, AutoPropagatedMixin
+                )
+                if not is_propagated:
+                    # Only call the on_fit_begin method of callbacks that are not
+                    # propagated from a meta-estimator.
+                    callback.on_fit_begin(estimator=self, X=X, y=y)
+
+        return self._computation_tree.root
+
+    def _eval_callbacks_on_fit_end(self):
+        """Evaluate the on_fit_end method of the callbacks"""
+        if not hasattr(self, "_callbacks"):
+            return
+
+        self._computation_tree._tree_status[0] = True
+
+        for callback in self._callbacks:
+            is_propagated = isinstance(callback, AutoPropagatedMixin) and hasattr(
+                self, "_parent_node"
+            )
+            if not is_propagated:
+                # Only call the on_fit_end method of callbacks that are not
+                # propagated from a meta-estimator.
+                callback.on_fit_end()
+
     @property
     def _repr_html_(self):
         """HTML representation of estimator.
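
Taken together, the hooks added to `BaseEstimator` above are meant to be driven from an estimator's `fit` roughly as in the sketch below. This is a minimal illustration, not part of the patch: `TrivialEstimator` and its two-level `levels` description are made up, and only the plumbing introduced in this diff (`_set_callbacks`, `_eval_callbacks_on_fit_begin`, `_eval_callbacks_on_fit_iter_end`, `_eval_callbacks_on_fit_end`) is assumed.

    import numpy as np

    from sklearn.base import BaseEstimator
    from sklearn.callback import ProgressBar
    from sklearn.callback._base import _eval_callbacks_on_fit_iter_end

    class TrivialEstimator(BaseEstimator):
        def __init__(self, max_iter=10):
            self.max_iter = max_iter

        def fit(self, X, y=None):
            # Builds the computation tree (one child of the root per iteration)
            # and calls on_fit_begin on each registered callback.
            root = self._eval_callbacks_on_fit_begin(
                levels=[
                    {"descr": "fit", "max_iter": self.max_iter},
                    {"descr": "iter", "max_iter": None},
                ],
                X=X,
                y=y,
            )

            for i in range(self.max_iter):
                # Marks the node as done; returns True if a callback asks to stop.
                if _eval_callbacks_on_fit_iter_end(estimator=self, node=root.children[i]):
                    break

            self.n_iter_ = i + 1
            self._eval_callbacks_on_fit_end()
            return self

    est = TrivialEstimator()
    est._set_callbacks(ProgressBar())  # any BaseCallback subclass
    est.fit(np.zeros((10, 2)))

The same pattern, including a meta-estimator that calls `_propagate_callbacks` on its sub-estimators, appears in the `test_callbacks.py` file added by PATCH 03 at the end of this series.
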
diff --git a/sklearn/callback/__init__.py b/sklearn/callback/__init__.py
new file mode 100644
index 0000000000000..1f0f3f7215a18
--- /dev/null
+++ b/sklearn/callback/__init__.py
@@ -0,0 +1,25 @@
+# License: BSD 3 clause
+
+from ._base import AutoPropagatedMixin
+from ._base import BaseCallback
+from ._computation_tree import ComputationNode
+from ._computation_tree import ComputationTree
+from ._computation_tree import load_computation_tree
+from ._convergence_monitor import ConvergenceMonitor
+from ._early_stopping import EarlyStopping
+from ._progressbar import ProgressBar
+from ._snapshot import Snapshot
+from ._text_verbose import TextVerbose
+
+__all__ = [
+    "AutoPropagatedMixin",
+    "BaseCallback",
+    "ComputationNode",
+    "ComputationTree",
+    "load_computation_tree",
+    "ConvergenceMonitor",
+    "EarlyStopping",
+    "ProgressBar",
+    "Snapshot",
+    "TextVerbose",
+]
diff --git a/sklearn/callback/_base.py b/sklearn/callback/_base.py
new file mode 100644
index 0000000000000..604a450336610
--- /dev/null
+++ b/sklearn/callback/_base.py
@@ -0,0 +1,126 @@
+# License: BSD 3 clause
+
+from abc import ABC, abstractmethod
+
+
+# Not a method of BaseEstimator because it might be called from an external function
+def _eval_callbacks_on_fit_iter_end(**kwargs):
+    """Evaluate the on_fit_iter_end method of the callbacks
+
+    This function should be called at the end of each computation node.
+
+    Parameters
+    ----------
+    kwargs : dict
+        Arguments passed to the callback.
+
+    Returns
+    -------
+    stop : bool
+        Whether or not to stop the fit at this node.
+    """
+    estimator = kwargs.get("estimator")
+    node = kwargs.get("node")
+
+    if not hasattr(estimator, "_callbacks") or node is None:
+        return False
+
+    estimator._computation_tree._tree_status[node.tree_status_idx] = True
+
+    # stopping_criterion and reconstruction_attributes can be costly to compute. They
+    # are passed as lambdas for lazy evaluation. We only actually compute them if a
+    # callback requests it.
+    if any(
+        getattr(callback, "request_stopping_criterion", False)
+        for callback in estimator._callbacks
+    ):
+        kwarg = kwargs.pop("stopping_criterion", lambda: None)()
+        kwargs["stopping_criterion"] = kwarg
+
+    if any(
+        getattr(callback, "request_reconstruction_attributes", False)
+        for callback in estimator._callbacks
+    ):
+        kwarg = kwargs.pop("reconstruction_attributes", lambda: None)()
+        kwargs["reconstruction_attributes"] = kwarg
+
+    return any(callback.on_fit_iter_end(**kwargs) for callback in estimator._callbacks)
+
+
+class BaseCallback(ABC):
+    """Abstract class for the callbacks"""
+
+    @abstractmethod
+    def on_fit_begin(self, estimator, *, X=None, y=None):
+        """Method called at the beginning of the fit method of the estimator
+
+        Parameters
+        ----------
+        estimator : estimator instance
+            The estimator the callback is set on.
+
+        X : ndarray or sparse matrix, default=None
+            The training data.
+
+        y : ndarray, default=None
+            The target.
+        """
+        pass
+
+    @abstractmethod
+    def on_fit_end(self):
+        """Method called at the end of the fit method of the estimator"""
+        pass
+
+    @abstractmethod
+    def on_fit_iter_end(self, estimator, node, **kwargs):
+        """Method called at the end of each computation node of the estimator
+
+        Parameters
+        ----------
+        estimator : estimator instance
+            The caller estimator. It might differ from the estimator passed to the
+            `on_fit_begin` method for auto-propagated callbacks.
+
+        node : ComputationNode instance
+            The caller computation node.
+
+        kwargs : dict
+            Arguments passed to the callback.
Possible keys are + + - stopping_criterion: float + Usually iterations stop when `stopping_criterion <= tol`. + This is only provided at the innermost level of iterations. + + - tol: float + Tolerance for the stopping criterion. + This is only provided at the innermost level of iterations. + + - reconstruction_attributes: dict + Necessary attributes to construct an estimator (by copying this + estimator and setting these as attributes) which will behave as if + the fit stopped at this node. + This is only provided at the outermost level of iterations. + + - fit_state: dict + Model specific quantities updated during fit. This is not meant to be + used by generic callbacks but by a callback designed for a specific + estimator instead. + + Returns + ------- + stop : bool or None + Whether or not to stop the current level of iterations at this node. + """ + pass + + +class AutoPropagatedMixin: + """Mixin for auto-propagated callbacks + + An auto-propagated callback (from a meta-estimator to its sub-estimators) must be + set on the meta-estimator. Its `on_fit_begin` and `on_fit_end` methods will only be + called at the beginning and end of the fit method of the meta-estimator, while its + `on_fit_iter_end` method will be called at each computation node of the + meta-estimator and its sub-estimators. + """ + + pass diff --git a/sklearn/callback/_computation_tree.py b/sklearn/callback/_computation_tree.py new file mode 100644 index 0000000000000..edd3c8f1f657f --- /dev/null +++ b/sklearn/callback/_computation_tree.py @@ -0,0 +1,268 @@ +# License: BSD 3 clause + +from tempfile import mkdtemp +from pathlib import Path +import pickle +import os + +import numpy as np + + +class ComputationNode: + """A node in a ComputationTree + + Parameters + ---------- + computation_tree : ComputationTree instance + The computation tree it belongs to. + + parent : ComputationNode instance, default=None + The parent node. None means this is the root. + + max_iter : int, default=None + The number of its children. None means it's a leaf. + + description : str, default=None + A description of this computation node. None means it's a leaf. + + tree_status_idx : int, default=0 + The index of the status of this node in the `tree_status` array of its + computation tree. + + idx : int, default=0 + The index of this node in the children list of its parent. + + Attributes + ---------- + children : list + The list of its children nodes. For a leaf, it's an empty list + + depth : int + The depth of this node in its computation tree. The root has a depth of 0. + """ + + def __init__( + self, + computation_tree, + parent=None, + max_iter=None, + description=None, + tree_status_idx=0, + idx=0, + ): + self.computation_tree = computation_tree + self.parent = parent + self.max_iter = max_iter + self.description = description + self.tree_status_idx = tree_status_idx + self.idx = idx + self.children = [] + self.depth = 0 if self.parent is None else self.parent.depth + 1 + + def get_ancestors(self, include_ancestor_trees=True): + """Get the list of all nodes in the path from the node to the root + + Parameters + ---------- + include_ancestor_trees : bool, default=True + If True, propagate to the tree of the `parent_node` of this tree if it + exists and so on. + + Returns + ------- + ancestors : list + The list of ancestors of this node (included). 
+ """ + node = self + ancestors = [node] + + while node.parent is not None: + node = node.parent + ancestors.append(node) + + if include_ancestor_trees: + node_parent_tree = node.computation_tree.parent_node + if node_parent_tree is not None: + ancestors.extend(node_parent_tree.get_ancestors()) + + return ancestors + + +class ComputationTree: + """Data structure to store the computation tree of an estimator + + Parameters + ---------- + estimator_name : str + The name of the estimator. + + levels : list of dict + A description of the nested levels of computation of the estimator to build the + tree. It's a list of dict with "descr" and "max_iter" keys. + + parent_node : ComputationNode, default=None + The node where the estimator is used in the computation tree of a + meta-estimator. This node is not set to be the parent of the root of this tree. + + Attributes + ---------- + depth : int + The depth of the tree. It corresponds to the depth of its deepest leaf. + + root : ComputationNode instance + The root of the computation tree. + + tree_dir : pathlib.Path instance + The path of the directory where the computation tree is dumped during the fit of + its estimator. If it has a parent tree, this is a sub-directory of the + `tree_dir` of its parent. + """ + + def __init__(self, estimator_name, levels, *, parent_node=None): + self.estimator_name = estimator_name + self.parent_node = parent_node + + self.depth = len(levels) - 1 + self.root, self.n_nodes = self._build_tree(levels) + + parent_tree_dir = ( + None + if self.parent_node is None + else self.parent_node.computation_tree.tree_dir + ) + if parent_tree_dir is None: + self.tree_dir = Path(mkdtemp()) + else: + # This tree has a parent tree. Place it in a subdir of its parent dir + # and give it a name that allows from the parent tree to find the sub dir + # of the sub tree of a given leaf. 
+ self.tree_dir = parent_tree_dir / str(parent_node.tree_status_idx) + self.tree_dir.mkdir() + self._filename = self.tree_dir / "tree_status.memmap" + + self._set_tree_status(mode="w+") + self._tree_status[:] = False + + def _build_tree(self, levels): + """Build the computation tree from the description of the levels""" + root = ComputationNode( + computation_tree=self, + max_iter=levels[0]["max_iter"], + description=levels[0]["descr"], + ) + + n_nodes = self._recursive_build_tree(root, levels) + + return root, n_nodes + + def _recursive_build_tree(self, parent, levels, n_nodes=1): + """Recursively build the tree from the root the leaves""" + if parent.depth == self.depth: + return n_nodes + + for i in range(parent.max_iter): + children_max_iter = levels[parent.depth + 1]["max_iter"] + description = levels[parent.depth + 1]["descr"] + + node = ComputationNode( + computation_tree=self, + parent=parent, + max_iter=children_max_iter, + description=description, + tree_status_idx=n_nodes, + idx=i, + ) + parent.children.append(node) + + n_nodes = self._recursive_build_tree(node, levels, n_nodes + 1) + + return n_nodes + + def _set_tree_status(self, mode): + """Create a memory-map to the tree_status array stored on the disk""" + # This has to be done each time we unpickle the tree + self._tree_status = np.memmap( + self._filename, dtype=bool, mode=mode, shape=(self.n_nodes,) + ) + + def get_progress(self, node): + """Return the number of finished child nodes of this node""" + if self._tree_status[node.tree_status_idx]: + return node.max_iter + + # Since the children of a node are not ordered (to account for parallel + # execution), we can't rely on the highest index for which the status is True. + return sum( + [self._tree_status[child.tree_status_idx] for child in node.children] + ) + + def iterate(self, include_leaves=False): + """Return an iterable over the nodes of the computation tree + + Nodes are discovered in a depth first search manner. + + Parameters + ---------- + include_leaves : bool + Whether or not to include the leaves of the tree in the iterable + + Returns + ------- + nodes_list : list + A list of the nodes of the computation tree. 
+ """ + return self._recursive_iterate(include_leaves=include_leaves) + + def _recursive_iterate(self, node=None, include_leaves=False, node_list=None): + """Recursively constructs the iterable""" + # TODO make it a generator + if node is None: + node = self.root + node_list = [] + + if node.children or include_leaves: + node_list.append(node) + + for child in node.children: + self._recursive_iterate(child, include_leaves, node_list) + + return node_list + + def __repr__(self): + res = ( + f"[{self.estimator_name}] {self.root.description} : progress " + f"{self.get_progress(self.root)} / {self.root.max_iter}\n" + ) + for node in self.iterate(include_leaves=False): + if node is not self.root: + res += ( + f"{' ' * node.depth}{node.description} {node.idx}: progress " + f"{self.get_progress(node)} / {node.max_iter}\n" + ) + return res + + +def load_computation_tree(directory): + """load the computation tree of a directory + + Parameters + ---------- + directory : pathlib.Path instance + The directory where the computation tree is dumped + + Returns + ------- + computation_tree : ComputationTree instance + The loaded computation tree + """ + file_path = directory / "computation_tree.pkl" + if not file_path.exists() or not os.path.getsize(file_path) > 0: + # Do not try to load the tree when it's created but not yet written + return + + with open(file_path, "rb") as f: + computation_tree = pickle.load(f) + + computation_tree._set_tree_status(mode="r") + + return computation_tree diff --git a/sklearn/callback/_convergence_monitor.py b/sklearn/callback/_convergence_monitor.py new file mode 100644 index 0000000000000..9f53d657cc75a --- /dev/null +++ b/sklearn/callback/_convergence_monitor.py @@ -0,0 +1,118 @@ +# License: BSD 3 clause + +from copy import copy +from pathlib import Path +from tempfile import mkdtemp +import time + +import matplotlib.pyplot as plt +import pandas as pd + +from . import BaseCallback + + +class ConvergenceMonitor(BaseCallback): + """Monitor model convergence. + + Parameters + ---------- + monitor : + + X_val : ndarray, default=None + Validation data + + y_val : ndarray, default=None + Validation target + + Attributes + ---------- + data : pandas.DataFrame + The monitored quantities at each iteration. 
+ """ + + request_reconstruction_attributes = True + + def __init__(self, *, monitor="objective_function", X_val=None, y_val=None): + self.X_val = X_val + self.y_val = y_val + self._data_file = Path(mkdtemp()) / "convergence_monitor.csv" + + def on_fit_begin(self, estimator, *, X=None, y=None): + self.estimator = estimator + self.X_train = X + self.y_train = y + self._start_time = {} + + def on_fit_iter_end(self, *, node, **kwargs): + if node.depth != node.computation_tree.depth: + return + + reconstruction_attributes = kwargs.get("reconstruction_attributes", None) + if reconstruction_attributes is None: + return + + new_estimator = copy(self.estimator) + for key, val in reconstruction_attributes.items(): + setattr(new_estimator, key, val) + + if node.idx == 0: + self._start_time[node.parent] = time.perf_counter() + curr_time = 0 + else: + curr_time = time.perf_counter() - self._start_time[node.parent] + + obj_train, *_ = new_estimator.objective_function(self.X_train, self.y_train, normalize=True) + if self.X_val is not None: + obj_val, *_ = new_estimator.objective_function(self.X_val, self.y_val, normalize=True) + else: + obj_val = None + + ancestors = node.get_ancestors()[:0:-1] + ancestors_desc = [ + f"{n.computation_tree.estimator_name}-{n.description}" for n in ancestors + ] + ancestors_idx = [f"{n.idx}" for n in ancestors] + + if not self._data_file.exists(): + with open(self._data_file, "w") as f: + f.write( + f"{','.join(ancestors_desc)},iteration,time,obj_train,obj_val\n" + ) + + with open(self._data_file, "a") as f: + f.write( + f"{','.join(ancestors_idx)},{node.idx},{curr_time},{obj_train},{obj_val}\n" + ) + + def on_fit_end(self): + pass + + def get_data(self): + if not hasattr(self, "data"): + self.data = pd.read_csv(self._data_file) + return self.data + + def plot(self, x="iteration"): + data = self.get_data() + + # all columns but iteration, time, obj_train, obj_val + group_by_columns = list(data.columns[:-4]) + groups = data.groupby(group_by_columns) + + for key in groups.groups.keys(): + group = groups.get_group(key) + fig, ax = plt.subplots() + + ax.plot(group[x], group["obj_train"], label="obj_train") + if self.X_val is not None: + ax.plot(group[x], group["obj_val"], label="obj_val") + + if x == "iteration": + x_label = "Number of iterations" + elif x == "time": + x_label = "Time (s)" + ax.set_xlabel(x_label) + ax.set_ylabel("objective function") + + ax.legend() + plt.show() diff --git a/sklearn/callback/_early_stopping.py b/sklearn/callback/_early_stopping.py new file mode 100644 index 0000000000000..44a0108e04b26 --- /dev/null +++ b/sklearn/callback/_early_stopping.py @@ -0,0 +1,48 @@ +# License: BSD 3 clause + +from . 
import BaseCallback + + +class EarlyStopping(BaseCallback): + def __init__( + self, + X_val=None, + y_val=None, + monitor="objective_function", + max_no_improvement=10, + tol=1e-2, + ): + self.X_val = X_val + self.y_val = y_val + self.monitor = monitor + self.max_no_improvement = max_no_improvement + self.tol = tol + + def on_fit_begin(self, estimator, X=None, y=None): + self.estimator = estimator + self._no_improvement = {} + self._last_monitored = {} + + def on_fit_iter_end(self, *, node, **kwargs): + if node.depth != self.estimator._computation_tree.depth: + return + + if self.monitor == "objective_function": + objective_function = kwargs.get("objective_function", None) + monitored, *_ = objective_function(self.X_val) + elif self.monitor == "TODO": + pass + + if node.parent not in self._last_monitored or monitored < self._last_monitored[ + node.parent + ] * (1 - self.tol): + self._no_improvement[node.parent] = 0 + self._last_monitored[node.parent] = monitored + else: + self._no_improvement[node.parent] += 1 + + if self._no_improvement[node.parent] >= self.max_no_improvement: + return True + + def on_fit_end(self): + pass diff --git a/sklearn/callback/_progressbar.py b/sklearn/callback/_progressbar.py new file mode 100644 index 0000000000000..ae11e67d59f57 --- /dev/null +++ b/sklearn/callback/_progressbar.py @@ -0,0 +1,257 @@ +# License: BSD 3 clause + +from copy import copy +import pickle +from threading import Thread, Event + +import numpy as np +from tqdm import tqdm +from rich.progress import Progress +from rich.progress import BarColumn, TimeRemainingColumn, TextColumn +from rich.style import Style + +from . import BaseCallback +from . import AutoPropagatedMixin +from . import load_computation_tree + + +class ProgressBar(BaseCallback, AutoPropagatedMixin): + """Callback that displays progress bars for each iterative steps of the estimator + + Parameters + ---------- + backend: {"rich"}, default="rich" + The backend for the progress bars display. + + max_depth_show : int, default=None + The maximum nested level of progress bars to display. + + max_depth_keep : int, default=None + The maximum nested level of progress bars to keep displayed when they are + finished. + """ + + def __init__(self, backend="rich", max_depth_show=None, max_depth_keep=None): + self.backend = backend + if max_depth_show is not None and max_depth_show < 0: + raise ValueError(f"max_depth_show should be >= 0.") + if max_depth_keep is not None and max_depth_keep < 0: + raise ValueError(f"max_depth_keep should be >= 0.") + self.max_depth_show = max_depth_show + self.max_depth_keep = max_depth_keep + + def on_fit_begin(self, estimator, X=None, y=None): + self._stop_event = Event() + + if self.backend == "rich": + self.progress_monitor = _RichProgressMonitor( + estimator=estimator, + event=self._stop_event, + max_depth_show=self.max_depth_show, + max_depth_keep=self.max_depth_keep, + ) + else: + raise ValueError(f"backend should be 'rich', got {self.backend} instead.") + + self.progress_monitor.start() + + def on_fit_iter_end(self, *, estimator, node, **kwargs): + pass + + def on_fit_end(self): + self._stop_event.set() + self.progress_monitor.join() + + def __getstate__(self): + state = self.__dict__.copy() + if "_stop_event" in state: + del state["_stop_event"] + if "progress_monitor" in state: + del state["progress_monitor"] + return state + + +# Custom Progress class to allow showing the tasks in a given order (given by setting +# the _ordered_tasks attribute). 
In particular it allows to dynamically create and +# insert tasks between existing tasks. +class _Progress(Progress): + def get_renderables(self): + table = self.make_tasks_table(getattr(self, "_ordered_tasks", [])) + yield table + + +class _RichProgressMonitor(Thread): + """Thread monitoring the progress of an estimator with rich based display + + The display is a list of nested rich tasks using rich.Progress. There is one for + each node in the computation tree of the estimator and in the computation trees of + estimators used in the estimator. + + Parameters + ---------- + estimator : estimator instance + The estimator to monitor + + event : threading.Event instance + This thread will run until event is set. + + max_depth_show : int, default=None + The maximum nested level of progress bars to display. + + max_depth_keep : int, default=None + The maximum nested level of progress bars to keep displayed when they are + finished. + """ + + def __init__(self, estimator, event, max_depth_show=None, max_depth_keep=None): + Thread.__init__(self) + self.estimator = estimator + self.event = event + self.max_depth_show = max_depth_show + self.max_depth_keep = max_depth_keep + + # _computation_trees is a dict `directory: tuple` where + # - tuple[0] is the computation tree of the directory + # - tuple[1] is a dict `node.tree_status_idx: task_id` + self._computation_trees = {} + + def run(self): + with _Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn( + complete_style=Style(color="dark_orange"), + finished_style=Style(color="cyan"), + ), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeRemainingColumn(), + auto_refresh=False, + ) as progress_ctx: + self._progress_ctx = progress_ctx + + while not self.event.wait(0.05): + self._recursive_update_tasks() + self._progress_ctx.refresh() + + self._recursive_update_tasks() + self._progress_ctx.refresh() + + def _recursive_update_tasks(self, this_dir=None, depth=0): + """Recursively loop through directories and init or update tasks + + Parameters + ---------- + this_dir : pathlib.Path instance + The directory to + + depth : int + The current depth + """ + if self.max_depth_show is not None and depth > self.max_depth_show: + # Fast exit if this dir is deeper than what we want to show anyway + return + + if this_dir is None: + this_dir = self.estimator._computation_tree.tree_dir + # _ordered_tasks holds the list of the tasks in the order we want them to + # be displayed. 
+ self._progress_ctx._ordered_tasks = [] + + if this_dir not in self._computation_trees: + # First time we discover this directory -> store the computation tree + # If the computation tree is not readable yet, skip and try again next time + computation_tree = load_computation_tree(this_dir) + if computation_tree is None: + return + + self._computation_trees[this_dir] = (computation_tree, {}) + + computation_tree, task_ids = self._computation_trees[this_dir] + + for node in computation_tree.iterate(include_leaves=True): + if node.children: + # node is not a leaf, create or update its task + if node.tree_status_idx not in task_ids: + visible = True + if ( + self.max_depth_show is not None + and depth + node.depth > self.max_depth_show + ): + # If this node is deeper than what we want to show, we create + # the task anyway but make it not visible + visible = False + + task_ids[node.tree_status_idx] = self._progress_ctx.add_task( + self._format_task_description(node, computation_tree, depth), + total=node.max_iter, + visible=visible, + ) + + task_id = task_ids[node.tree_status_idx] + task = self._progress_ctx.tasks[task_id] + self._progress_ctx._ordered_tasks.append(task) + + parent_task = self._get_parent_task(node, computation_tree, task_ids) + if parent_task is not None and parent_task.finished: + # If the task of the parent node is finished, make this task + # finished. It can happen if some computations are stopped + # before reaching max_iter. + visible = True + if ( + self.max_depth_keep is not None + and depth + node.depth > self.max_depth_keep + ): + # If this node is deeper than what we want to keep in the output + # make it not visible + visible = False + self._progress_ctx.update( + task_id, completed=node.max_iter, visible=visible, refresh=False + ) + else: + node_progress = computation_tree.get_progress(node) + if node_progress != task.completed: + self._progress_ctx.update( + task_id, completed=node_progress, refresh=False + ) + else: + # node is a leaf, look for tasks of its sub computation tree before + # going to the next node + child_dir = this_dir / str(node.tree_status_idx) + if child_dir.exists(): + self._recursive_update_tasks( + child_dir, depth + computation_tree.depth + ) + + def _format_task_description(self, node, computation_tree, depth): + """Return a formatted description for the task of the node""" + colors = ["red", "green", "blue", "yellow"] + + indent = f"{' ' * (depth + node.depth)}" + style = f"[{colors[(depth + node.depth)%len(colors)]}]" + + description = f"{computation_tree.estimator_name} - {node.description}" + if node.parent is None and computation_tree.parent_node is not None: + description = ( + f"{computation_tree.parent_node.description} {computation_tree.parent_node.idx} |" + f" {description}" + ) + if node.parent is not None: + description = f"{description} {node.idx}" + + return f"{style}{indent}{description}" + + def _get_parent_task(self, node, computation_tree, task_ids): + """Get the task of the parent node""" + if node.parent is not None: + # node is not the root, return the task of its parent + task_id = task_ids[node.parent.tree_status_idx] + return self._progress_ctx.tasks[task_id] + if computation_tree.parent_node is not None: + # node is the root, return the task of the parent of the parent_node of + # its computation tree + parent_dir = computation_tree.parent_node.computation_tree.tree_dir + _, parent_tree_task_ids = self._computation_trees[parent_dir] + task_id = parent_tree_task_ids[ + 
computation_tree.parent_node.parent.tree_status_idx + ] + return self._progress_ctx._tasks[task_id] + return diff --git a/sklearn/callback/_snapshot.py b/sklearn/callback/_snapshot.py new file mode 100644 index 0000000000000..231eafc8cbb9e --- /dev/null +++ b/sklearn/callback/_snapshot.py @@ -0,0 +1,82 @@ +# License: BSD 3 clause + +from copy import copy +from datetime import datetime +from pathlib import Path +import pickle + +import numpy as np + +from . import BaseCallback + + +class Snapshot(BaseCallback): + """Take regular snapshots of an estimator + + Parameters + ---------- + keep_last_n : int or None, default=1 + Only the last `keep_last_n` snapshots are kept on the disk. None means all + snapshots are kept. + + base_dir : str or pathlib.Path instance, default=None + The directory where the snapshots should be stored. If None, they are stored in + the current directory. + + Attributes + ---------- + directory : pathlib.Path instance + The directory where the snapshots are saved. It's a sub-directory of `base_dir`. + """ + + request_reconstruction_attributes = True + + def __init__(self, keep_last_n=1, base_dir=None): + self.keep_last_n = keep_last_n + if keep_last_n is not None and keep_last_n <= 0: + raise ValueError( + "keep_last_n must be a positive integer, got" + f" {self.keep_last_n} instead." + ) + + self.base_dir = Path("." if base_dir is None else base_dir) + + def on_fit_begin(self, estimator, X=None, y=None): + self.estimator = estimator + + # Use a hash in the name of this directory to avoid name collision if several + # clones of this estimator are fitted in parallel in a meta-estimator for + # instance. + dir_name = ( + "snapshots_" + f"{self.estimator.__class__.__name__}_" + f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')}_" + f"{hash(self.estimator._computation_tree)}" + ) + + self.directory = self.base_dir / dir_name + self.directory.mkdir() + + def on_fit_iter_end(self, *, node, **kwargs): + reconstruction_attributes = kwargs.get("reconstruction_attributes", None) + if reconstruction_attributes is None: + return + + new_estimator = copy(self.estimator) + for key, val in reconstruction_attributes.items(): + setattr(new_estimator, key, val) + + file_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')}.pkl" + file_path = self.directory / file_name + + with open(file_path, "wb") as f: + pickle.dump(new_estimator, f) + + if self.keep_last_n is not None: + for snapshot in sorted(self.directory.iterdir())[: -self.keep_last_n]: + snapshot.unlink(missing_ok=True) + + def on_fit_end(self): + if self.keep_last_n is not None: + for snapshot in sorted(self.directory.iterdir())[: -self.keep_last_n]: + snapshot.unlink() diff --git a/sklearn/callback/_text_verbose.py b/sklearn/callback/_text_verbose.py new file mode 100644 index 0000000000000..b857ff592c87c --- /dev/null +++ b/sklearn/callback/_text_verbose.py @@ -0,0 +1,44 @@ +# License: BSD 3 clause + +import time + +from . import BaseCallback +from . 
import AutoPropagatedMixin
+
+
+class TextVerbose(BaseCallback, AutoPropagatedMixin):
+    request_stopping_criterion = True
+
+    def __init__(self, min_time_between_calls=0):
+        self.min_time_between_calls = min_time_between_calls
+
+    def on_fit_begin(self, estimator, X=None, y=None):
+        self.estimator = estimator
+        self._start_time = time.perf_counter()
+
+    def on_fit_iter_end(self, *, node, **kwargs):
+        if node.depth != node.computation_tree.depth:
+            return
+
+        stopping_criterion = kwargs.get("stopping_criterion", None)
+        tol = kwargs.get("tol", None)
+
+        current_time = time.perf_counter() - self._start_time
+
+        s = f"{node.description} {node.idx}"
+        parent = node.parent
+        while parent is not None and parent.parent is not None:
+            s = f"{parent.description} {parent.idx} - {s}"
+            parent = parent.parent
+
+        msg = (
+            f"[{parent.computation_tree.estimator_name}] {s} | time {current_time:.5f}s"
+        )
+
+        if stopping_criterion is not None and tol is not None:
+            msg += f" | stopping_criterion={stopping_criterion:.3E} | tol={tol:.3E}"
+
+        print(msg)
+
+    def on_fit_end(self):
+        pass
diff --git a/sklearn/callback/tests/test_computation_tree.py b/sklearn/callback/tests/test_computation_tree.py
new file mode 100644
index 0000000000000..b726177a342ec
--- /dev/null
+++ b/sklearn/callback/tests/test_computation_tree.py
@@ -0,0 +1,98 @@
+# License: BSD 3 clause
+
+import numpy as np
+import pytest
+
+from sklearn.callback import ComputationTree
+from sklearn.callback import ComputationNode
+from sklearn.callback import load_computation_tree
+
+
+levels = [
+    {"descr": "level0", "max_iter": 3},
+    {"descr": "level1", "max_iter": 5},
+    {"descr": "level2", "max_iter": 7},
+    {"descr": "level3", "max_iter": None},
+]
+
+
+def test_computation_tree():
+    # Check the construction of the computation tree
+    computation_tree = ComputationTree(estimator_name="estimator", levels=levels)
+    assert computation_tree.estimator_name == "estimator"
+
+    root = computation_tree.root
+    assert root.parent is None
+    assert root.idx == 0
+
+    assert len(root.children) == root.max_iter == 3
+    assert [node.idx for node in root.children] == list(range(3))
+
+    for node1 in root.children:
+        assert len(node1.children) == 5
+        assert [n.idx for n in node1.children] == list(range(5))
+
+        for node2 in node1.children:
+            assert len(node2.children) == 7
+            assert [n.idx for n in node2.children] == list(range(7))
+
+            for node3 in node2.children:
+                assert not node3.children
+
+
+def test_n_nodes():
+    # Check that the number of nodes in a computation tree corresponds to what we
+    # expect from the level descriptions
+    computation_tree = ComputationTree(estimator_name="", levels=levels)
+
+    max_iter_per_level = [level["max_iter"] for level in levels[:-1]]
+    expected_n_nodes = 1 + np.sum(np.cumprod(max_iter_per_level))
+
+    assert computation_tree.n_nodes == expected_n_nodes
+    assert len(computation_tree.iterate(include_leaves=True)) == expected_n_nodes
+    assert computation_tree._tree_status.shape == (expected_n_nodes,)
+
+
+def test_tree_status_idx():
+    # Check that each node has a unique index in the _tree_status array and that their
+    # order corresponds to the order given by a depth first search.
+ computation_tree = ComputationTree(estimator_name="", levels=levels) + + indexes = [ + node.tree_status_idx for node in computation_tree.iterate(include_leaves=True) + ] + assert indexes == list(range(computation_tree.n_nodes)) + + +def test_get_ancestors(): + # Check that the ancestor search excludes the root and can propagate to parent trees + parent_levels = [ + {"descr": "parent_level0", "max_iter": 2}, + {"descr": "parent_level1", "max_iter": 4}, + {"descr": "parent_level2", "max_iter": None}, + ] + + parent_computation_tree = ComputationTree( + estimator_name="parent_estimator", levels=parent_levels + ) + parent_node = parent_computation_tree.root.children[0].children[2] + + computation_tree = ComputationTree( + estimator_name="estimator", levels=levels, parent_node=parent_node + ) + node = computation_tree.root.children[1].children[3].children[5] + + ancestors = node.get_ancestors(include_ancestor_trees=False) + assert ancestors == [node, node.parent, node.parent.parent] + assert [n.idx for n in ancestors] == [5, 3, 1] + assert computation_tree.root not in ancestors + + ancestors = node.get_ancestors(include_ancestor_trees=True) + assert ancestors == [ + node, + node.parent, + node.parent.parent, + parent_node, + parent_node.parent, + ] + assert [n.idx for n in ancestors] == [5, 3, 1, 2, 0] diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index cc1451be54567..f53f33c6b804a 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -6,6 +6,7 @@ # Tom Dupre la Tour # License: BSD 3 clause +from functools import partial import numbers import numpy as np import scipy.sparse as sp @@ -23,6 +24,7 @@ check_is_fitted, check_non_negative, ) +from ..callback._base import _eval_callbacks_on_fit_iter_end EPSILON = np.finfo(np.float32).eps @@ -424,6 +426,8 @@ def _fit_coordinate_descent( verbose=0, shuffle=False, random_state=None, + estimator=None, + parent_node=None, ): """Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent @@ -500,7 +504,9 @@ def _fit_coordinate_descent( rng = check_random_state(random_state) - for n_iter in range(1, max_iter + 1): + nodes = parent_node.children if parent_node is not None else [None] * max_iter + + for n_iter, node in enumerate(nodes, 1): violation = 0.0 # Update W @@ -519,6 +525,21 @@ def _fit_coordinate_descent( if violation_init == 0: break + if _eval_callbacks_on_fit_iter_end( + estimator=estimator, + node=node, + stopping_criterion=lambda: violation / violation_init, + tol=tol, + fit_state={"H": Ht.T, "W": W}, + reconstruction_attributes=lambda: { + "n_components_": Ht.T.shape[0], + "components_": H, + "n_iter_": n_iter, + "reconstruction_err_": _beta_divergence(X, W, Ht.T, 2, True), + }, + ): + break + if verbose: print("violation:", violation / violation_init) @@ -731,6 +752,8 @@ def _fit_multiplicative_update( l2_reg_H=0, update_H=True, verbose=0, + estimator=None, + parent_node=None, ): """Compute Non-negative Matrix Factorization with Multiplicative Update. 
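
Both NMF solvers above pass `stopping_criterion` and `reconstruction_attributes` as lambdas; `_eval_callbacks_on_fit_iter_end` only evaluates them when at least one registered callback sets the corresponding `request_*` class attribute. A minimal consumer of that mechanism could look like the following sketch (`CriterionLogger` is hypothetical and only shown to illustrate the opt-in contract defined in `_base.py`):

    from sklearn.callback import BaseCallback

    class CriterionLogger(BaseCallback):
        # Opt in: the estimator will evaluate the stopping_criterion lambda
        # before dispatching the call to on_fit_iter_end.
        request_stopping_criterion = True

        def on_fit_begin(self, estimator, *, X=None, y=None):
            pass

        def on_fit_iter_end(self, estimator, node, **kwargs):
            criterion = kwargs.get("stopping_criterion")
            tol = kwargs.get("tol")
            if criterion is not None and tol is not None:
                print(f"node {node.idx}: stopping_criterion={criterion:.3e}, tol={tol:.1e}")
            return False  # never request early stopping

        def on_fit_end(self):
            pass
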
@@ -815,8 +838,10 @@ def _fit_multiplicative_update( error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True) previous_error = error_at_init + nodes = parent_node.children if parent_node is not None else [None] * max_iter + H_sum, HHt, XHt = None, None, None - for n_iter in range(1, max_iter + 1): + for n_iter, node in enumerate(nodes, 1): # update W # H_sum, HHt and XHt are saved and reused if not update_H delta_W, H_sum, HHt, XHt = _multiplicative_update_w( @@ -842,6 +867,27 @@ def _fit_multiplicative_update( if beta_loss <= 1: H[H < np.finfo(np.float64).eps] = 0.0 + if _eval_callbacks_on_fit_iter_end( + estimator=estimator, + node=node, + stopping_criterion=lambda: ( + ( + previous_error + - _beta_divergence(X, W, H, beta_loss, square_root=True) + ) + / error_at_init + ), + tol=tol, + fit_state={"H": H, "W": W}, + reconstruction_attributes=lambda: { + "n_components_": H.shape[0], + "components_": H, + "n_iter_": n_iter, + "reconstruction_err_": _beta_divergence(X, W, H, 2, True), + }, + ): + break + # test convergence criterion every 10 iterations if tol > 0 and n_iter % 10 == 0: error = _beta_divergence(X, W, H, beta_loss, square_root=True) @@ -1538,20 +1584,27 @@ def fit_transform(self, X, y=None, W=None, H=None): X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] ) - with config_context(assume_finite=True): - W, H, n_iter = self._fit_transform(X, W=W, H=H) - - self.reconstruction_err_ = _beta_divergence( - X, W, H, self._beta_loss, square_root=True + root = self._eval_callbacks_on_fit_begin( + levels=[ + {"descr": "fit", "max_iter": self.max_iter}, + {"descr": "iter", "max_iter": None}, + ], + X=X, ) + W, H, n_iter = self._fit_transform(X, W=W, H=H, parent_node=root) + self.n_components_ = H.shape[0] self.components_ = H self.n_iter_ = n_iter + self._eval_callbacks_on_fit_end() + return W - def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): + def _fit_transform( + self, X, y=None, W=None, H=None, update_H=True, parent_node=None + ): """Learn a NMF model for the data X and returns the transformed data. Parameters @@ -1618,6 +1671,8 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): verbose=self.verbose, shuffle=self.shuffle, random_state=self.random_state, + estimator=self, + parent_node=parent_node, ) elif self.solver == "mu": W, H, n_iter = _fit_multiplicative_update( @@ -1633,6 +1688,8 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): l2_reg_H, update_H=update_H, verbose=self.verbose, + estimator=self, + parent_node=parent_node, ) else: raise ValueError("Invalid solver parameter '%s'." 
% self.solver) @@ -1713,6 +1770,28 @@ def inverse_transform(self, W): check_is_fitted(self) return np.dot(W, self.components_) + def objective_function(self, X, y=None, *, W=None, H=None, normalize=False): + if W is None: + W = self.transform(X) + if H is None: + H = self.components_ + + data_fit = _beta_divergence(X, W, H, self._beta_loss) + + l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = self._scale_regularization(X) + penalization = ( + l1_reg_W * W.sum() + + l1_reg_H * H.sum() + + l2_reg_W * (W ** 2).sum() + + l2_reg_H * (H ** 2).sum() + ) + + if normalize: + data_fit /= X.shape[0] + penalization /= X.shape[0] + + return data_fit + penalization, data_fit, penalization + @property def _n_features_out(self): """Number of transformed output features.""" diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 08e71edbc69ab..82063f36d0434 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -33,6 +33,7 @@ from ..utils.fixes import delayed from ..model_selection import check_cv from ..metrics import get_scorer +from ..callback._base import _eval_callbacks_on_fit_iter_end _LOGISTIC_SOLVER_CONVERGENCE_MSG = ( @@ -505,6 +506,8 @@ def _logistic_regression_path( max_squared_sum=None, sample_weight=None, l1_ratio=None, + estimator=None, + parent_node=None, ): """Compute a Logistic Regression model for a list of regularization parameters. @@ -796,13 +799,20 @@ def grad(x, *args): hess = _logistic_grad_hess warm_start_sag = {"coef": np.expand_dims(w0, axis=1)} + # Distinguish between LogReg and LogRegCV + if parent_node is not None: + nodes = [parent_node] if len(Cs) == 1 else parent_node.children + else: + nodes = [None] * len(Cs) + coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) - for i, C in enumerate(Cs): + for i, (C, node) in enumerate(zip(Cs, nodes)): if solver == "lbfgs": iprint = [-1, 50, 1, 100, 101][ np.searchsorted(np.array([0, 1, 2, 3]), verbose) ] + children = iter(node.children) if node is not None else None opt_res = optimize.minimize( func, w0, @@ -810,6 +820,10 @@ def grad(x, *args): jac=True, args=(X, target, 1.0 / C, sample_weight), options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}, + callback=lambda xk: _eval_callbacks_on_fit_iter_end( + estimator=estimator, + node=next(children) if children is not None else None, + ), ) n_iter_i = _check_optimize_result( solver, @@ -821,7 +835,15 @@ def grad(x, *args): elif solver == "newton-cg": args = (X, target, 1.0 / C, sample_weight) w0, n_iter_i = _newton_cg( - hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol + hess, + func, + grad, + w0, + args=args, + maxiter=max_iter, + tol=tol, + estimator=estimator, + parent_node=node, ) elif solver == "liblinear": coef_, intercept_, n_iter_i, = _fit_liblinear( @@ -876,6 +898,8 @@ def grad(x, *args): max_squared_sum, warm_start_sag, is_saga=(solver == "saga"), + estimator=estimator, + parent_node=node, ) else: @@ -893,8 +917,20 @@ def grad(x, *args): else: coefs.append(w0.copy()) + if len(Cs) > 1: + _eval_callbacks_on_fit_iter_end( + estimator=estimator, + node=node, + ) + n_iter[i] = n_iter_i + if multi_class == "ovr": + _eval_callbacks_on_fit_iter_end( + estimator=estimator, + node=parent_node, + ) + return np.array(coefs), np.array(Cs), n_iter @@ -1578,6 +1614,22 @@ def fit(self, X, y, sample_weight=None): if warm_start_coef is None: warm_start_coef = [None] * n_classes + if len(classes_) == 1: + levels = [ + {"descr": "fit", "max_iter": self.max_iter}, + {"descr": "iter", "max_iter": None}, + ] + else: 
+ levels = [ + {"descr": "fit", "max_iter": len(classes_)}, + {"descr": "class", "max_iter": self.max_iter}, + {"descr": "iter", "max_iter": None}, + ] + root = self._eval_callbacks_on_fit_begin(levels=levels, X=X, y=y) + + # distinguish between multinomial and ovr + nodes = [root] if len(classes_) == 1 else root.children + path_func = delayed(_logistic_regression_path) # The SAG solver releases the GIL so it's more efficient to use @@ -1610,8 +1662,10 @@ def fit(self, X, y, sample_weight=None): penalty=penalty, max_squared_sum=max_squared_sum, sample_weight=sample_weight, + estimator=self, + parent_node=node, ) - for class_, warm_start_coef_ in zip(classes_, warm_start_coef) + for class_, warm_start_coef_, node in zip(classes_, warm_start_coef, nodes) ) fold_coefs_, _, n_iter_ = zip(*fold_coefs_) @@ -1632,6 +1686,8 @@ def fit(self, X, y, sample_weight=None): else: self.intercept_ = np.zeros(n_classes) + self._eval_callbacks_on_fit_end() + return self def predict_proba(self, X): diff --git a/sklearn/linear_model/_sag.py b/sklearn/linear_model/_sag.py index 48dcd7aef8ad3..7307ca76c4408 100644 --- a/sklearn/linear_model/_sag.py +++ b/sklearn/linear_model/_sag.py @@ -101,6 +101,8 @@ def sag_solver( max_squared_sum=None, warm_start_mem=None, is_saga=False, + estimator=None, + parent_node=None, ): """SAG solver for Ridge and LogisticRegression. @@ -346,6 +348,8 @@ def sag_solver( intercept_decay, is_saga, verbose, + estimator=estimator, + parent_node=parent_node, ) if n_iter_ == max_iter: diff --git a/sklearn/linear_model/_sag_fast.pyx.tp b/sklearn/linear_model/_sag_fast.pyx.tp index 756a048eea999..8144c98df3012 100644 --- a/sklearn/linear_model/_sag_fast.pyx.tp +++ b/sklearn/linear_model/_sag_fast.pyx.tp @@ -47,6 +47,7 @@ from ._sgd_fast cimport LossFunction from ._sgd_fast cimport Log, SquaredLoss from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64 +from ..callback._base import _eval_callbacks_on_fit_iter_end from libc.stdio cimport printf @@ -231,7 +232,9 @@ def sag{{name_suffix}}(SequentialDataset{{name_suffix}} dataset, np.ndarray[{{c_type}}, ndim=1, mode='c'] intercept_sum_gradient_init, double intercept_decay, bint saga, - bint verbose): + bint verbose, + estimator, + parent_node): """Stochastic Average Gradient (SAG) and SAGA solvers. Used in Ridge and LogisticRegression. 
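
As an aside on the `levels` declared in `LogisticRegression.fit` above: the computation tree built from such a description has one node per class and one leaf per solver iteration, so its size is 1 (the root) plus the cumulative product of `max_iter` over the non-leaf levels. This is exactly what `test_n_nodes` checks. A quick sanity check, with made-up numbers:

    import numpy as np

    from sklearn.callback import ComputationTree

    levels = [
        {"descr": "fit", "max_iter": 3},      # e.g. 3 classes in the ovr case
        {"descr": "class", "max_iter": 10},   # e.g. max_iter=10 per class
        {"descr": "iter", "max_iter": None},  # leaves, one per solver iteration
    ]
    tree = ComputationTree(estimator_name="LogisticRegression", levels=levels)

    max_iter_per_level = [level["max_iter"] for level in levels[:-1]]
    # 1 root + 3 class nodes + 3 * 10 iteration leaves = 34
    assert tree.n_nodes == 1 + np.sum(np.cumprod(max_iter_per_level))
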
@@ -515,6 +518,22 @@ def sag{{name_suffix}}(SequentialDataset{{name_suffix}} dataset, fabs(weights[idx] - previous_weights[idx])) previous_weights[idx] = weights[idx] + + with gil: + if _eval_callbacks_on_fit_iter_end( + estimator=estimator, + node=parent_node.children[n_iter] if parent_node is not None else None, + stopping_criterion = ( + lambda: max_change / max_weight + if max_weight != 0 + else 0 + if max_weight == max_change == 0 + else np.inf + ), + tol=tol, + ): + break + if ((max_weight != 0 and max_change / max_weight <= tol) or max_weight == 0 and max_change == 0): if verbose: diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6134b6318c838..47553d07ac169 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -26,12 +26,13 @@ ) from .utils.deprecation import deprecated from .utils._tags import _safe_tags +from .utils.metaestimators import _BaseComposition from .utils.validation import check_memory from .utils.validation import check_is_fitted from .utils.fixes import delayed from .exceptions import NotFittedError +from .callback._base import _eval_callbacks_on_fit_iter_end -from .utils.metaestimators import _BaseComposition __all__ = ["Pipeline", "FeatureUnion", "make_pipeline", "make_union"] @@ -318,15 +319,24 @@ def _fit(self, X, y=None, **fit_params_steps): # Setup the memory memory = check_memory(self.memory) + root = self._eval_callbacks_on_fit_begin( + levels=[ + {"descr": "fit", "max_iter": len(self.steps)}, + {"descr": "step", "max_iter": None}, + ], + X=X, + y=y, + ) + fit_transform_one_cached = memory.cache(_fit_transform_one) - for (step_idx, name, transformer) in self._iter( + for (step_idx, name, transformer), node in zip(self._iter( with_final=False, filter_passthrough=False - ): + ), root.children[:-1]): if transformer is None or transformer == "passthrough": + _eval_callbacks_on_fit_iter_end(estimator=self, node=node) with _print_elapsed_time("Pipeline", self._log_message(step_idx)): continue - if hasattr(memory, "location"): # joblib >= 0.12 if memory.location is None: @@ -346,6 +356,7 @@ def _fit(self, X, y=None, **fit_params_steps): else: cloned_transformer = clone(transformer) # Fit or load from cache the current transformer + self._propagate_callbacks(cloned_transformer, parent_node=node) X, fitted_transformer = fit_transform_one_cached( cloned_transformer, X, @@ -359,6 +370,9 @@ def _fit(self, X, y=None, **fit_params_steps): # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) + + _eval_callbacks_on_fit_iter_end(estimator=self, node=node) + return X def fit(self, X, y=None, **fit_params): @@ -388,12 +402,20 @@ def fit(self, X, y=None, **fit_params): Pipeline with fitted steps. 
""" fit_params_steps = self._check_fit_params(**fit_params) + Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": + node = self._computation_tree.root.children[-1] + self._propagate_callbacks(self._final_estimator, parent_node=node) + fit_params_last_step = fit_params_steps[self.steps[-1][0]] self._final_estimator.fit(Xt, y, **fit_params_last_step) + _eval_callbacks_on_fit_iter_end(estimator=self, node=node) + + self._eval_callbacks_on_fit_end() + return self def fit_transform(self, X, y=None, **fit_params): diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index bd2ac8bdfd27d..2e3b6eb1c125b 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -18,6 +18,7 @@ from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1 from ..exceptions import ConvergenceWarning +from ..callback._base import _eval_callbacks_on_fit_iter_end class _LineSearchError(RuntimeError): @@ -120,6 +121,8 @@ def _newton_cg( maxinner=200, line_search=True, warn=True, + estimator=None, + parent_node=None, ): """ Minimization of scalar function of one or more variables using the @@ -168,20 +171,31 @@ def _newton_cg( """ x0 = np.asarray(x0).flatten() xk = x0 - k = 0 if line_search: old_fval = func(x0, *args) old_old_fval = None + nodes = parent_node.children if parent_node is not None else [None] * maxiter + # Outer loop: our Newton iteration - while k < maxiter: + for k, node in enumerate(nodes, 1): # Compute a search direction pk by applying the CG method to # del2 f(xk) p = - fgrad f(xk) starting from 0. fgrad, fhess_p = grad_hess(xk, *args) absgrad = np.abs(fgrad) - if np.max(absgrad) <= tol: + max_absgrad = np.max(absgrad) + + if _eval_callbacks_on_fit_iter_end( + estimator=estimator, + node=node, + stopping_criterion=lambda: max_absgrad, + tol=tol, + ): + break + + if max_absgrad <= tol: break maggrad = np.sum(absgrad) @@ -204,7 +218,6 @@ def _newton_cg( break xk = xk + alphak * xsupi # upcast if necessary - k += 1 if warn and k >= maxiter: warnings.warn( From 584bdf72f1dfa969eceb0b7ab3ffbba5cfcf6aea Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 17 Dec 2021 18:18:35 +0100 Subject: [PATCH 02/20] cln nmf and test reconstruction attributes --- sklearn/decomposition/_nmf.py | 18 +++++++-------- sklearn/decomposition/tests/test_nmf.py | 29 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index f53f33c6b804a..4fa46dd2cb12c 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -6,7 +6,6 @@ # Tom Dupre la Tour # License: BSD 3 clause -from functools import partial import numbers import numpy as np import scipy.sparse as sp @@ -504,9 +503,7 @@ def _fit_coordinate_descent( rng = check_random_state(random_state) - nodes = parent_node.children if parent_node is not None else [None] * max_iter - - for n_iter, node in enumerate(nodes, 1): + for n_iter in range(1, max_iter + 1): violation = 0.0 # Update W @@ -527,7 +524,7 @@ def _fit_coordinate_descent( if _eval_callbacks_on_fit_iter_end( estimator=estimator, - node=node, + node=parent_node.children[n_iter - 1] if parent_node is not None else None, stopping_criterion=lambda: violation / violation_init, tol=tol, fit_state={"H": Ht.T, "W": W}, @@ -838,10 +835,8 @@ def _fit_multiplicative_update( error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True) 
previous_error = error_at_init - nodes = parent_node.children if parent_node is not None else [None] * max_iter - H_sum, HHt, XHt = None, None, None - for n_iter, node in enumerate(nodes, 1): + for n_iter in range(1, max_iter + 1): # update W # H_sum, HHt and XHt are saved and reused if not update_H delta_W, H_sum, HHt, XHt = _multiplicative_update_w( @@ -869,7 +864,7 @@ def _fit_multiplicative_update( if _eval_callbacks_on_fit_iter_end( estimator=estimator, - node=node, + node=parent_node.children[n_iter - 1] if parent_node is not None else None, stopping_criterion=lambda: ( ( previous_error @@ -883,7 +878,7 @@ def _fit_multiplicative_update( "n_components_": H.shape[0], "components_": H, "n_iter_": n_iter, - "reconstruction_err_": _beta_divergence(X, W, H, 2, True), + "reconstruction_err_": _beta_divergence(X, W, H, beta_loss, True), }, ): break @@ -1594,6 +1589,9 @@ def fit_transform(self, X, y=None, W=None, H=None): W, H, n_iter = self._fit_transform(X, W=W, H=H, parent_node=root) + self.reconstruction_err_ = _beta_divergence( + X, W, H, self._beta_loss, square_root=True + ) self.n_components_ = H.shape[0] self.components_ = H self.n_iter_ = n_iter diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index c95b7ceb737db..7a58b64d6464d 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -1,4 +1,6 @@ +import pickle import re +import tempfile import numpy as np import scipy.sparse as sp @@ -18,6 +20,7 @@ from sklearn.utils.extmath import squared_norm from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning +from sklearn.callback import Snapshot @pytest.mark.parametrize("solver", ["cd", "mu"]) @@ -719,3 +722,29 @@ def test_feature_names_out(): names = nmf.get_feature_names_out() assert_array_equal([f"nmf{i}" for i in range(3)], names) + + +@pytest.mark.parametrize("solver, beta_loss", [("mu", 0), ("mu", 2), ("cd", 2)]) +def test_nmf_callback_reconstruction_attributes(solver, beta_loss): + # Check that the reconstruction attributes passed to the callback allow to make + # a new estimator as if the fit ended when the callback is called. 
+    X = np.random.RandomState(0).random_sample((100, 100))
+
+    nmf = NMF(n_components=3, solver=solver, beta_loss=beta_loss, random_state=0)
+    nmf.fit(X)
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        callback = Snapshot(base_dir=tmp_dir)
+        nmf._set_callbacks(callback)
+        nmf.fit(X)
+
+        # load model from last iteration
+        snapshot = sorted(callback.directory.iterdir())[-1]
+        with open(snapshot, "rb") as f:
+            loaded_nmf = pickle.load(f)
+
+        # The model loaded from the last iteration is the same as the original model
+        assert nmf.n_iter_ == loaded_nmf.n_iter_
+        assert_allclose(nmf.components_, loaded_nmf.components_)
+        assert_allclose(nmf.reconstruction_err_, loaded_nmf.reconstruction_err_)
+        assert_allclose(nmf.transform(X), loaded_nmf.transform(X))

From bb32ff3bbcd798f1cd2e204c2437dc38359a36a0 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Mon, 20 Dec 2021 19:12:27 +0100
Subject: [PATCH 03/20] cln snapshot + test snapshot + uuid for computation
 tree

---
 sklearn/callback/_computation_tree.py    |  10 +-
 sklearn/callback/_snapshot.py            |  46 ++++-----
 sklearn/callback/tests/test_callbacks.py | 120 +++++++++++++++++++++++
 sklearn/decomposition/tests/test_nmf.py  |   9 +-
 4 files changed, 151 insertions(+), 34 deletions(-)
 create mode 100644 sklearn/callback/tests/test_callbacks.py

diff --git a/sklearn/callback/_computation_tree.py b/sklearn/callback/_computation_tree.py
index edd3c8f1f657f..161891ca32004 100644
--- a/sklearn/callback/_computation_tree.py
+++ b/sklearn/callback/_computation_tree.py
@@ -1,9 +1,10 @@
 # License: BSD 3 clause

-from tempfile import mkdtemp
+import os
 from pathlib import Path
 import pickle
-import os
+from tempfile import mkdtemp
+from uuid import uuid4

 import numpy as np

@@ -116,6 +117,9 @@ class ComputationTree:
         The path of the directory where the computation tree is dumped during the fit
         of its estimator. If it has a parent tree, this is a sub-directory of the
         `tree_dir` of its parent.
+
+    uid : uuid.UUID
+        Unique identifier for a ComputationTree instance.
     """

     def __init__(self, estimator_name, levels, *, parent_node=None):
@@ -125,6 +129,8 @@ def __init__(self, estimator_name, levels, *, parent_node=None):
         self.depth = len(levels) - 1
         self.root, self.n_nodes = self._build_tree(levels)

+        self.uid = uuid4()
+
         parent_tree_dir = (
             None
             if self.parent_node is None
diff --git a/sklearn/callback/_snapshot.py b/sklearn/callback/_snapshot.py
index 231eafc8cbb9e..99a1bcc0ce68a 100644
--- a/sklearn/callback/_snapshot.py
+++ b/sklearn/callback/_snapshot.py
@@ -22,11 +22,6 @@ class Snapshot(BaseCallback):
     base_dir : str or pathlib.Path instance, default=None
         The directory where the snapshots should be stored. If None, they are stored in
         the current directory.
-
-    Attributes
-    ----------
-    directory : pathlib.Path instance
-        The directory where the snapshots are saved. It's a sub-directory of `base_dir`.
     """

     request_reconstruction_attributes = True
@@ -42,41 +37,36 @@ def __init__(self, keep_last_n=1, base_dir=None):
         self.base_dir = Path("." if base_dir is None else base_dir)

     def on_fit_begin(self, estimator, X=None, y=None):
-        self.estimator = estimator
-
-        # Use a hash in the name of this directory to avoid name collision if several
-        # clones of this estimator are fitted in parallel in a meta-estimator for
-        # instance.
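The uuid introduced above replaces the hash-based directory naming removed below. A sketch of the resulting layout, with an illustrative estimator name and uid value:

    from uuid import uuid4

    uid = uuid4()  # stored once per fit as ComputationTree.uid
    subdir = f"snapshots_NMF_{uid}"
    # e.g. 'snapshots_NMF_9f1c...'; one directory per computation tree, so
    # clones fitted in parallel by a meta-estimator cannot collide.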
- dir_name = ( - "snapshots_" - f"{self.estimator.__class__.__name__}_" - f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')}_" - f"{hash(self.estimator._computation_tree)}" - ) - - self.directory = self.base_dir / dir_name - self.directory.mkdir() + subdir = self._get_subdir(estimator._computation_tree) + subdir.mkdir() - def on_fit_iter_end(self, *, node, **kwargs): + def on_fit_iter_end(self, *, estimator, node, **kwargs): reconstruction_attributes = kwargs.get("reconstruction_attributes", None) if reconstruction_attributes is None: return - new_estimator = copy(self.estimator) + new_estimator = copy(estimator) for key, val in reconstruction_attributes.items(): setattr(new_estimator, key, val) - file_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')}.pkl" - file_path = self.directory / file_name + subdir = self._get_subdir(node.computation_tree) + snapshot_filename = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')}.pkl" - with open(file_path, "wb") as f: + with open(subdir / snapshot_filename, "wb") as f: pickle.dump(new_estimator, f) if self.keep_last_n is not None: - for snapshot in sorted(self.directory.iterdir())[: -self.keep_last_n]: + for snapshot in sorted(subdir.iterdir())[: -self.keep_last_n]: snapshot.unlink(missing_ok=True) def on_fit_end(self): - if self.keep_last_n is not None: - for snapshot in sorted(self.directory.iterdir())[: -self.keep_last_n]: - snapshot.unlink() + pass + + def _get_subdir(self, computation_tree): + """Return the sub directory containing the snapshots of the estimator""" + subdir = ( + self.base_dir + / f"snapshots_{computation_tree.estimator_name}_{str(computation_tree.uid)}" + ) + + return subdir diff --git a/sklearn/callback/tests/test_callbacks.py b/sklearn/callback/tests/test_callbacks.py new file mode 100644 index 0000000000000..17dafc616f457 --- /dev/null +++ b/sklearn/callback/tests/test_callbacks.py @@ -0,0 +1,120 @@ +# License: BSD 3 clause + +import pickle +import pytest +import tempfile +from time import sleep + +from joblib import Parallel, delayed + +from sklearn.base import BaseEstimator, clone +from sklearn.callback import Snapshot +from sklearn.callback._base import _eval_callbacks_on_fit_iter_end +from sklearn.datasets import make_classification + + +class Estimator(BaseEstimator): + def __init__(self, max_iter=20): + self.max_iter = max_iter + + def fit(self, X, y): + root = self._eval_callbacks_on_fit_begin( + levels=[ + {"descr": "fit", "max_iter": self.max_iter}, + {"descr": "iter", "max_iter": None}, + ], + X=X, + y=y, + ) + + for i in range(self.max_iter): + if _eval_callbacks_on_fit_iter_end( + estimator=self, + node=root.children[i], + reconstruction_attributes=lambda: {"n_iter_": i + 1}, + ): + break + + self.n_iter_ = i + 1 + + self._eval_callbacks_on_fit_end() + + return self + + +class MetaEstimator(BaseEstimator): + def __init__( + self, estimator, n_outer=4, n_inner=3, n_jobs=None, prefer="processes" + ): + self.estimator = estimator + self.n_outer = n_outer + self.n_inner = n_inner + self.n_jobs = n_jobs + self.prefer = prefer + + def fit(self, X, y): + root = self._eval_callbacks_on_fit_begin( + levels=[ + {"descr": "fit", "max_iter": self.n_outer}, + {"descr": "outer", "max_iter": self.n_inner}, + {"descr": "inner", "max_iter": None}, + ], + X=X, + y=y, + ) + + res = Parallel(n_jobs=self.n_jobs, prefer=self.prefer)( + delayed(self._func)(self.estimator, X, y, node, i) + for i, node in enumerate(root.children) + ) + + self._eval_callbacks_on_fit_end() + + return self + + def _func(self, estimator, X, y, 
parent_node, i): + for j, node in enumerate(parent_node.children): + est = clone(estimator) + self._propagate_callbacks(est, parent_node=node) + est.fit(X, y) + + _eval_callbacks_on_fit_iter_end(estimator=self, node=node) + + _eval_callbacks_on_fit_iter_end(estimator=self, node=parent_node) + + return + + +@pytest.mark.parametrize("n_jobs", (1, 2)) +@pytest.mark.parametrize("prefer", ("threads", "processes")) +def test_snapshot_meta_estimator(n_jobs, prefer): + # Test for the Snapshot callback + X, y = make_classification() + estimator = Estimator(max_iter=20) + + with tempfile.TemporaryDirectory() as tmp_dir: + keep_last_n = 5 + callback = Snapshot(keep_last_n=keep_last_n, base_dir=tmp_dir) + estimator._set_callbacks(callback) + metaestimator = MetaEstimator( + estimator=estimator, n_outer=4, n_inner=3, n_jobs=n_jobs, prefer=prefer + ) + + metaestimator.fit(X, y) + + # There's a subdir of base_dir for each clone of estimator fitted in + # metaestimator. There are n_outer * n_inner such clones + snapshot_dirs = list(callback.base_dir.iterdir()) + assert len(snapshot_dirs) == metaestimator.n_outer * metaestimator.n_inner + + for snapshot_dir in snapshot_dirs: + snapshots = sorted(snapshot_dir.iterdir()) + assert len(snapshots) == keep_last_n + + for i, snapshot in enumerate(snapshots): + with open(snapshot, "rb") as f: + loaded_estimator = pickle.load(f) + + # We kept last 5 snapshots out of 20 iterations. + # This one is the 16 + i-th. + assert loaded_estimator.n_iter_ == 16 + i diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 7a58b64d6464d..a1ef1e90792af 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -728,9 +728,9 @@ def test_feature_names_out(): def test_nmf_callback_reconstruction_attributes(solver, beta_loss): # Check that the reconstruction attributes passed to the callback allow to make # a new estimator as if the fit ended when the callback is called. 
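Outside the test suite, the workflow being verified looks roughly as follows; a sketch assuming the private `_set_callbacks` API introduced in this series (not a public interface):

    import pickle
    import tempfile
    from pathlib import Path

    import numpy as np
    from sklearn.callback import Snapshot
    from sklearn.decomposition import NMF

    X = np.random.RandomState(0).random_sample((100, 20))
    nmf = NMF(n_components=5, random_state=0)

    with tempfile.TemporaryDirectory() as tmp_dir:
        nmf._set_callbacks(Snapshot(keep_last_n=3, base_dir=tmp_dir))
        nmf.fit(X)

        snapshot_dir = next(Path(tmp_dir).iterdir())        # snapshots_NMF_<uid>
        last_snapshot = sorted(snapshot_dir.iterdir())[-1]  # timestamps sort in fit order
        with open(last_snapshot, "rb") as f:
            partial_nmf = pickle.load(f)                    # fitted as of that iteration

        partial_nmf.transform(X)  # usable without refitting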
- X = np.random.RandomState(0).random_sample((100, 100)) + X = np.random.RandomState(0).random_sample((100, 20)) - nmf = NMF(n_components=3, solver=solver, beta_loss=beta_loss, random_state=0) + nmf = NMF(n_components=5, solver=solver, beta_loss=beta_loss, random_state=0) nmf.fit(X) with tempfile.TemporaryDirectory() as tmp_dir: @@ -739,11 +739,12 @@ def test_nmf_callback_reconstruction_attributes(solver, beta_loss): nmf.fit(X) # load model from last iteration - snapshot = sorted(callback.directory.iterdir())[-1] + snapshot_dir = next(callback.base_dir.iterdir()) + snapshot = sorted(snapshot_dir.iterdir())[-1] with open(snapshot, "rb") as f: loaded_nmf = pickle.load(f) - # The model loaded from the last iteration is the same as the original model + # The model saved during the last iteration is the same as the original model assert nmf.n_iter_ == loaded_nmf.n_iter_ assert_allclose(nmf.components_, loaded_nmf.components_) assert_allclose(nmf.reconstruction_err_, loaded_nmf.reconstruction_err_) From 7a1825db4c9d2a3a7170235fd95fdd7747c3ff96 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 31 Dec 2021 17:20:55 +0100 Subject: [PATCH 04/20] cln --- sklearn/base.py | 14 +++++++++++ sklearn/callback/_base.py | 16 ++++++------ sklearn/callback/_snapshot.py | 11 +++------ sklearn/callback/tests/test_callbacks.py | 6 ++++- sklearn/decomposition/_nmf.py | 31 +++++++++++++++--------- sklearn/linear_model/_logistic.py | 11 +++------ sklearn/utils/optimize.py | 8 +++--- 7 files changed, 57 insertions(+), 40 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 4f6b63cb2add1..7823e61f63c1e 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -733,6 +733,20 @@ def _eval_callbacks_on_fit_end(self): # propagated from a meta-estimator. callback.on_fit_end() + def _from_reconstruction_attributes(self, *, reconstruction_attributes): + """ + + Parameters + ---------- + reconstruction_attributes : callable + The necessary fitted attributes to create a working fitted estimator from + this instance. + """ + new_estimator = copy.copy(self) + for key, val in reconstruction_attributes().items(): + setattr(new_estimator, key, val) + return new_estimator + @property def _repr_html_(self): """HTML representation of estimator. diff --git a/sklearn/callback/_base.py b/sklearn/callback/_base.py index 604a450336610..a473f172fd575 100644 --- a/sklearn/callback/_base.py +++ b/sklearn/callback/_base.py @@ -38,11 +38,11 @@ def _eval_callbacks_on_fit_iter_end(**kwargs): kwargs["stopping_criterion"] = kwarg if any( - getattr(callback, "request_reconstruction_attributes", False) + getattr(callback, "request_from_reconstruction_attributes", False) for callback in estimator._callbacks ): - kwarg = kwargs.pop("reconstruction_attributes", lambda: None)() - kwargs["reconstruction_attributes"] = kwarg + kwarg = kwargs.pop("from_reconstruction_attributes", lambda: None)() + kwargs["from_reconstruction_attributes"] = kwarg return any(callback.on_fit_iter_end(**kwargs) for callback in estimator._callbacks) @@ -94,11 +94,11 @@ def on_fit_iter_end(self, estimator, node, **kwargs): Tolerance for the stopping criterion. This is only provided at the innermost level of iterations. - - reconstruction_attributes: dict - Necessary attributes to construct an estimator (by copying this - estimator and setting these as attributes) which will behave as if - the fit stopped at this node. - This is only provided at the outermost level of iterations. 
+ - from_reconstruction_attributes: estimator instance + A ready to predict, transform, etc ... estimator as if the fit stopped + at this node. Usually it's a copy of the caller estimator with the + necessary attributes set but it can sometimes be an instance of another + class (e.g. LogisticRegressionCV -> LogisticRegression) - fit_state: dict Model specific quantities updated during fit. This is not meant to be diff --git a/sklearn/callback/_snapshot.py b/sklearn/callback/_snapshot.py index 99a1bcc0ce68a..cbf200336c749 100644 --- a/sklearn/callback/_snapshot.py +++ b/sklearn/callback/_snapshot.py @@ -1,6 +1,5 @@ # License: BSD 3 clause -from copy import copy from datetime import datetime from pathlib import Path import pickle @@ -24,7 +23,7 @@ class Snapshot(BaseCallback): the current directory. """ - request_reconstruction_attributes = True + request_from_reconstruction_attributes = True def __init__(self, keep_last_n=1, base_dir=None): self.keep_last_n = keep_last_n @@ -41,14 +40,10 @@ def on_fit_begin(self, estimator, X=None, y=None): subdir.mkdir() def on_fit_iter_end(self, *, estimator, node, **kwargs): - reconstruction_attributes = kwargs.get("reconstruction_attributes", None) - if reconstruction_attributes is None: + new_estimator = kwargs.get("from_reconstruction_attributes", None) + if new_estimator is None: return - new_estimator = copy(estimator) - for key, val in reconstruction_attributes.items(): - setattr(new_estimator, key, val) - subdir = self._get_subdir(node.computation_tree) snapshot_filename = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')}.pkl" diff --git a/sklearn/callback/tests/test_callbacks.py b/sklearn/callback/tests/test_callbacks.py index 17dafc616f457..c43241f469f8c 100644 --- a/sklearn/callback/tests/test_callbacks.py +++ b/sklearn/callback/tests/test_callbacks.py @@ -1,5 +1,6 @@ # License: BSD 3 clause +from functools import partial import pickle import pytest import tempfile @@ -31,7 +32,10 @@ def fit(self, X, y): if _eval_callbacks_on_fit_iter_end( estimator=self, node=root.children[i], - reconstruction_attributes=lambda: {"n_iter_": i + 1}, + from_reconstruction_attributes=partial( + self._from_reconstruction_attributes, + reconstruction_attributes=lambda : {"n_iter_": i + 1}, + ) ): break diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 4fa46dd2cb12c..154dbb3db6532 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -6,6 +6,7 @@ # Tom Dupre la Tour # License: BSD 3 clause +from functools import partial import numbers import numpy as np import scipy.sparse as sp @@ -528,12 +529,15 @@ def _fit_coordinate_descent( stopping_criterion=lambda: violation / violation_init, tol=tol, fit_state={"H": Ht.T, "W": W}, - reconstruction_attributes=lambda: { - "n_components_": Ht.T.shape[0], - "components_": H, - "n_iter_": n_iter, - "reconstruction_err_": _beta_divergence(X, W, Ht.T, 2, True), - }, + from_reconstruction_attributes=partial( + estimator._from_reconstruction_attributes, + reconstruction_attributes=lambda : { + "n_components_": Ht.T.shape[0], + "components_": H, + "n_iter_": n_iter, + "reconstruction_err_": _beta_divergence(X, W, Ht.T, 2, True), + } + ), ): break @@ -874,12 +878,15 @@ def _fit_multiplicative_update( ), tol=tol, fit_state={"H": H, "W": W}, - reconstruction_attributes=lambda: { - "n_components_": H.shape[0], - "components_": H, - "n_iter_": n_iter, - "reconstruction_err_": _beta_divergence(X, W, H, beta_loss, True), - }, + from_reconstruction_attributes=partial( + 
estimator._from_reconstruction_attributes, + reconstruction_attributes=lambda { + "n_components_": H.shape[0], + "components_": H, + "n_iter_": n_iter, + "reconstruction_err_": _beta_divergence(X, W, H, beta_loss, True), + } + ), ): break diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 82063f36d0434..540f1a656c077 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -799,15 +799,12 @@ def grad(x, *args): hess = _logistic_grad_hess warm_start_sag = {"coef": np.expand_dims(w0, axis=1)} - # Distinguish between LogReg and LogRegCV - if parent_node is not None: - nodes = [parent_node] if len(Cs) == 1 else parent_node.children - else: - nodes = [None] * len(Cs) - coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) - for i, (C, node) in enumerate(zip(Cs, nodes)): + for i, C in enumerate(Cs): + # Distinguish between LogReg and LogRegCV + node = None if parent_node is None else parent_node if len(Cs) == 1 else parent_node.children + if solver == "lbfgs": iprint = [-1, 50, 1, 100, 101][ np.searchsorted(np.array([0, 1, 2, 3]), verbose) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 2e3b6eb1c125b..b634f457bd287 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -171,15 +171,14 @@ def _newton_cg( """ x0 = np.asarray(x0).flatten() xk = x0 + k = 0 if line_search: old_fval = func(x0, *args) old_old_fval = None - nodes = parent_node.children if parent_node is not None else [None] * maxiter - # Outer loop: our Newton iteration - for k, node in enumerate(nodes, 1): + while k < maxiter: # Compute a search direction pk by applying the CG method to # del2 f(xk) p = - fgrad f(xk) starting from 0. fgrad, fhess_p = grad_hess(xk, *args) @@ -189,7 +188,7 @@ def _newton_cg( if _eval_callbacks_on_fit_iter_end( estimator=estimator, - node=node, + node=None if parent_node is None else parent_node.children[k], stopping_criterion=lambda: max_absgrad, tol=tol, ): @@ -218,6 +217,7 @@ def _newton_cg( break xk = xk + alphak * xsupi # upcast if necessary + k += 1 if warn and k >= maxiter: warnings.warn( From 3e3b25f3d5202a3a56a7fdfcc22e373a538a30bd Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 31 Dec 2021 17:25:57 +0100 Subject: [PATCH 05/20] black --- sklearn/callback/tests/test_callbacks.py | 4 ++-- sklearn/decomposition/_nmf.py | 8 ++++---- sklearn/decomposition/tests/test_nmf.py | 2 +- sklearn/linear_model/_logistic.py | 8 +++++++- sklearn/pipeline.py | 5 +++-- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/sklearn/callback/tests/test_callbacks.py b/sklearn/callback/tests/test_callbacks.py index c43241f469f8c..1f5fcf6bdd3c4 100644 --- a/sklearn/callback/tests/test_callbacks.py +++ b/sklearn/callback/tests/test_callbacks.py @@ -34,8 +34,8 @@ def fit(self, X, y): node=root.children[i], from_reconstruction_attributes=partial( self._from_reconstruction_attributes, - reconstruction_attributes=lambda : {"n_iter_": i + 1}, - ) + reconstruction_attributes=lambda: {"n_iter_": i + 1}, + ), ): break diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 154dbb3db6532..f63146dc11250 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -531,12 +531,12 @@ def _fit_coordinate_descent( fit_state={"H": Ht.T, "W": W}, from_reconstruction_attributes=partial( estimator._from_reconstruction_attributes, - reconstruction_attributes=lambda : { + reconstruction_attributes=lambda: { "n_components_": Ht.T.shape[0], 
"components_": H, "n_iter_": n_iter, "reconstruction_err_": _beta_divergence(X, W, Ht.T, 2, True), - } + }, ), ): break @@ -880,12 +880,12 @@ def _fit_multiplicative_update( fit_state={"H": H, "W": W}, from_reconstruction_attributes=partial( estimator._from_reconstruction_attributes, - reconstruction_attributes=lambda { + reconstruction_attributes=lambda: { "n_components_": H.shape[0], "components_": H, "n_iter_": n_iter, "reconstruction_err_": _beta_divergence(X, W, H, beta_loss, True), - } + }, ), ): break diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index a1ef1e90792af..c84ee43175df4 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -746,6 +746,6 @@ def test_nmf_callback_reconstruction_attributes(solver, beta_loss): # The model saved during the last iteration is the same as the original model assert nmf.n_iter_ == loaded_nmf.n_iter_ - assert_allclose(nmf.components_, loaded_nmf.components_) + assert_allclose(nmf.components_, loaded_nmf.components_) assert_allclose(nmf.reconstruction_err_, loaded_nmf.reconstruction_err_) assert_allclose(nmf.transform(X), loaded_nmf.transform(X)) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 540f1a656c077..1d4bbc815bb3d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -803,7 +803,13 @@ def grad(x, *args): n_iter = np.zeros(len(Cs), dtype=np.int32) for i, C in enumerate(Cs): # Distinguish between LogReg and LogRegCV - node = None if parent_node is None else parent_node if len(Cs) == 1 else parent_node.children + node = ( + None + if parent_node is None + else parent_node + if len(Cs) == 1 + else parent_node.children + ) if solver == "lbfgs": iprint = [-1, 50, 1, 100, 101][ diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 47553d07ac169..657ba79307ce3 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -330,9 +330,10 @@ def _fit(self, X, y=None, **fit_params_steps): fit_transform_one_cached = memory.cache(_fit_transform_one) - for (step_idx, name, transformer), node in zip(self._iter( + for (step_idx, name, transformer), in self._iter( with_final=False, filter_passthrough=False - ), root.children[:-1]): + ): + node = root.children[step_idx] if transformer is None or transformer == "passthrough": _eval_callbacks_on_fit_iter_end(estimator=self, node=node) with _print_elapsed_time("Pipeline", self._log_message(step_idx)): From 26dbb6954c4155daf2ce7b09b91911379bf4705f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 31 Dec 2021 17:33:47 +0100 Subject: [PATCH 06/20] lint --- sklearn/base.py | 4 ++-- sklearn/callback/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 7823e61f63c1e..c14a5d314a502 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -617,7 +617,7 @@ def _set_callbacks(self, callbacks): callbacks = [callbacks] if not all(isinstance(callback, BaseCallback) for callback in callbacks): - raise TypeError(f"callbacks must be subclasses of BaseCallback.") + raise TypeError("callbacks must be subclasses of BaseCallback.") self._callbacks = callbacks @@ -734,7 +734,7 @@ def _eval_callbacks_on_fit_end(self): callback.on_fit_end() def _from_reconstruction_attributes(self, *, reconstruction_attributes): - """ + """Return a as if fitted copy of this estimator Parameters ---------- diff --git a/sklearn/callback/__init__.py b/sklearn/callback/__init__.py index 
1f0f3f7215a18..c8d5ea0bf0606 100644 --- a/sklearn/callback/__init__.py +++ b/sklearn/callback/__init__.py @@ -13,7 +13,7 @@ __all__ = [ "AutoPropagatedMixin", - "Basecallback", + "BaseCallback", "ComputationNode", "ComputationTree", "load_computation_tree", From eb7b8246d5fc1e770cc0d5d98b1f6130d6fba461 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 14 Feb 2022 16:02:42 +0100 Subject: [PATCH 07/20] wip --- sklearn/callback/_convergence_monitor.py | 35 ++++++++++++++---------- sklearn/pipeline.py | 2 +- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/sklearn/callback/_convergence_monitor.py b/sklearn/callback/_convergence_monitor.py index 9f53d657cc75a..ac04335e04661 100644 --- a/sklearn/callback/_convergence_monitor.py +++ b/sklearn/callback/_convergence_monitor.py @@ -3,20 +3,21 @@ from copy import copy from pathlib import Path from tempfile import mkdtemp -import time import matplotlib.pyplot as plt import pandas as pd from . import BaseCallback +# import ..metrics as metrics + class ConvergenceMonitor(BaseCallback): """Monitor model convergence. Parameters ---------- - monitor : + monitor : X_val : ndarray, default=None Validation data @@ -33,37 +34,41 @@ class ConvergenceMonitor(BaseCallback): request_reconstruction_attributes = True def __init__(self, *, monitor="objective_function", X_val=None, y_val=None): + if monitor == "objective_function": + self._monitor = "objective_function" + else: + self._monitor = getattr(metrics, monitor, None) + if self._monitor is None: + raise ValueError(f"unknown metric {monitor}") + self.X_val = X_val self.y_val = y_val + self._data_file = Path(mkdtemp()) / "convergence_monitor.csv" def on_fit_begin(self, estimator, *, X=None, y=None): self.estimator = estimator self.X_train = X self.y_train = y - self._start_time = {} - - def on_fit_iter_end(self, *, node, **kwargs): - if node.depth != node.computation_tree.depth: - return + def on_fit_iter_end(self, *, estimator, node, **kwargs): reconstruction_attributes = kwargs.get("reconstruction_attributes", None) if reconstruction_attributes is None: return - new_estimator = copy(self.estimator) + new_estimator = copy(estimator) for key, val in reconstruction_attributes.items(): setattr(new_estimator, key, val) - if node.idx == 0: - self._start_time[node.parent] = time.perf_counter() - curr_time = 0 - else: - curr_time = time.perf_counter() - self._start_time[node.parent] + # if self._monitor = - obj_train, *_ = new_estimator.objective_function(self.X_train, self.y_train, normalize=True) + obj_train, *_ = new_estimator.objective_function( + self.X_train, self.y_train, normalize=True + ) if self.X_val is not None: - obj_val, *_ = new_estimator.objective_function(self.X_val, self.y_val, normalize=True) + obj_val, *_ = new_estimator.objective_function( + self.X_val, self.y_val, normalize=True + ) else: obj_val = None diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 657ba79307ce3..96a4738a9196a 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -330,7 +330,7 @@ def _fit(self, X, y=None, **fit_params_steps): fit_transform_one_cached = memory.cache(_fit_transform_one) - for (step_idx, name, transformer), in self._iter( + for (step_idx, name, transformer) in self._iter( with_final=False, filter_passthrough=False ): node = root.children[step_idx] From f78442ebc9895210f34905c96e6ca7fd4d2b6e3a Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 23 Feb 2022 14:46:17 +0100 Subject: [PATCH 08/20] class --- sklearn/model_selection/_search.py | 73 
+++++++++++++++++++++++--- sklearn/model_selection/_validation.py | 8 +++ 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 5ceb71569b932..fc16eefe8070f 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -15,6 +15,7 @@ from collections.abc import Mapping, Sequence, Iterable from functools import partial, reduce from itertools import product +from itertools import cycle import numbers import operator import time @@ -23,6 +24,7 @@ import numpy as np from numpy.ma import MaskedArray from scipy.stats import rankdata +from joblib import Parallel from ..base import BaseEstimator, is_classifier, clone from ..base import MetaEstimatorMixin @@ -33,7 +35,6 @@ from ._validation import _normalize_score_results from ._validation import _warn_or_raise_about_fit_failures from ..exceptions import NotFittedError -from joblib import Parallel from ..utils import check_random_state from ..utils.random import sample_without_replacement from ..utils._tags import _safe_tags @@ -783,7 +784,7 @@ def fit(self, X, y=None, *, groups=None, **fit_params): X, y, groups = indexable(X, y, groups) fit_params = _check_fit_params(X, fit_params) - cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator)) + cv_orig = self._checked_cv_orig n_splits = cv_orig.get_n_splits(X, y, groups) base_estimator = clone(self.estimator) @@ -806,7 +807,7 @@ def fit(self, X, y=None, *, groups=None, **fit_params): all_out = [] all_more_results = defaultdict(list) - def evaluate_candidates(candidate_params, cv=None, more_results=None): + def evaluate_candidates(candidate_params, cv=None, more_results=None, parent_node=None): cv = cv or cv_orig candidate_params = list(candidate_params) n_candidates = len(candidate_params) @@ -819,6 +820,11 @@ def evaluate_candidates(candidate_params, cv=None, more_results=None): ) ) + if parent_node is not None: + nodes = parent_node.children + else: + nodes = cycle([None]) + out = parallel( delayed(_fit_and_score)( clone(base_estimator), @@ -830,10 +836,11 @@ def evaluate_candidates(candidate_params, cv=None, more_results=None): split_progress=(split_idx, n_splits), candidate_progress=(cand_idx, n_candidates), **fit_and_score_kwargs, + caller=self, + node=node, ) - for (cand_idx, parameters), (split_idx, (train, test)) in product( - enumerate(candidate_params), enumerate(cv.split(X, y, groups)) - ) + for ((cand_idx, parameters), (split_idx, (train, test))), node in zip(product( + enumerate(candidate_params), enumerate(cv.split(X, y, groups))), nodes) ) if len(out) < 1: @@ -1370,10 +1377,60 @@ def __init__( ) self.param_grid = param_grid + def fit(self, X, y=None, *, groups=None, **fit_params): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples, n_output) or (n_samples,), default=None + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). + + **fit_params : dict of str -> object + Parameters passed to the `fit` method of the estimator. 
+ + If a fit parameter is an array-like whose length is equal to + `num_samples` then it will be split across CV groups along with `X` + and `y`. For example, the :term:`sample_weight` parameter is split + because `len(sample_weights) = len(X)`. + + Returns + ------- + self : object + Instance of fitted estimator. + """ + self._param_grid = ParameterGrid(self.param_grid) + + self._checked_cv_orig = check_cv( + self.cv, y, classifier=is_classifier(self.estimator) + ) + n_splits = self._checked_cv_orig.get_n_splits(X, y, groups) + + self._eval_callbacks_on_fit_begin( + levels=[ + {"descr": "fit", "max_iter": len(self._param_grid) * n_splits}, + {"descr": "param - fold", "max_iter": None}, + ], + X=X, + y=y, + ) + super().fit(X, y=y, groups=groups, **fit_params) + + self._eval_callbacks_on_fit_end() + def _run_search(self, evaluate_candidates): """Search all candidates in param_grid""" - evaluate_candidates(ParameterGrid(self.param_grid)) - + evaluate_candidates(self._param_grid, parent_node=self._computation_tree.root) class RandomizedSearchCV(BaseSearchCV): """Randomized search on hyper parameters. diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 927fe7a2cc452..6bf61bf246302 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -33,6 +33,7 @@ from ..exceptions import FitFailedWarning from ._split import check_cv from ..preprocessing import LabelEncoder +from ..callback._base import _eval_callbacks_on_fit_iter_end __all__ = [ @@ -547,6 +548,8 @@ def _fit_and_score( split_progress=None, candidate_progress=None, error_score=np.nan, + caller=None, + node=None, ): """Fit estimator and compute scores for a given dataset split. @@ -673,6 +676,9 @@ def _fit_and_score( cloned_parameters[k] = clone(v, safe=False) estimator = estimator.set_params(**cloned_parameters) + + if caller is not None: + caller._propagate_callbacks(estimator, parent_node=node) start_time = time.time() @@ -736,6 +742,8 @@ def _fit_and_score( end_msg += result_msg print(end_msg) + _eval_callbacks_on_fit_iter_end(estimator=caller, node=node) + result["test_scores"] = test_scores if return_train_score: result["train_scores"] = train_scores From 34bab15d7feb1dc191df80d731f03c5454244011 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 23 Feb 2022 17:57:36 +0100 Subject: [PATCH 09/20] more tests --- sklearn/base.py | 47 +++++---- sklearn/callback/_computation_tree.py | 8 +- .../test_base_estimator_callback_methods.py | 95 +++++++++++++++++++ sklearn/callback/tests/test_callbacks.py | 89 ++--------------- .../callback/tests/test_computation_tree.py | 33 ++++--- 5 files changed, 161 insertions(+), 111 deletions(-) create mode 100644 sklearn/callback/tests/test_base_estimator_callback_methods.py diff --git a/sklearn/base.py b/sklearn/base.py index cf3459267d13a..c542332280a07 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -616,6 +616,11 @@ def _set_callbacks(self, callbacks): ---------- callbacks : callback or list of callbacks the callbacks to set. + + Returns + ------- + self : estimator instance + The estimator instance itself. """ if not isinstance(callbacks, list): callbacks = [callbacks] @@ -625,9 +630,11 @@ def _set_callbacks(self, callbacks): self._callbacks = callbacks + return self + # XXX should be a method of MetaEstimatorMixin but this mixin can't handle all # meta-estimators. 
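For reference, the contract that `_propagate_callbacks` (reworked below) gives a meta-estimator; a minimal sketch of a per-fit helper, modeled on the `MetaEstimator` test helper from this series (the helper name `_fit_one` is hypothetical):

    from sklearn.base import clone

    def _fit_one(meta_estimator, estimator, X, y, node):
        # `node` is the ComputationNode of the meta-estimator where this
        # sub-fit happens; it becomes the parent of the clone's own tree.
        est = clone(estimator)
        meta_estimator._propagate_callbacks(est, parent_node=node)
        est.fit(X, y)
        return est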
- def _propagate_callbacks(self, sub_estimator, parent_node): + def _propagate_callbacks(self, sub_estimator, *, parent_node): """Propagate the auto-propagated callbacks to a sub-estimator Parameters @@ -640,9 +647,6 @@ def _propagate_callbacks(self, sub_estimator, parent_node): computation tree of the sub-estimator. It must be the node where the fit method of the sub-estimator is called. """ - if not hasattr(self, "_callbacks"): - return - if hasattr(sub_estimator, "_callbacks") and any( isinstance(callback, AutoPropagatedMixin) for callback in sub_estimator._callbacks @@ -659,6 +663,9 @@ def _propagate_callbacks(self, sub_estimator, parent_node): " Set them directly on the meta-estimator." ) + if not hasattr(self, "_callbacks"): + return + propagated_callbacks = [ callback for callback in self._callbacks @@ -668,7 +675,7 @@ def _propagate_callbacks(self, sub_estimator, parent_node): if not propagated_callbacks: return - sub_estimator._parent_node = parent_node + sub_estimator._parent_ct_node = parent_node if not hasattr(sub_estimator, "_callbacks"): sub_estimator._callbacks = propagated_callbacks @@ -702,7 +709,7 @@ def _eval_callbacks_on_fit_begin(self, *, levels, X=None, y=None): self._computation_tree = ComputationTree( estimator_name=self.__class__.__name__, levels=levels, - parent_node=getattr(self, "_parent_node", None), + parent_node=getattr(self, "_parent_ct_node", None), ) if hasattr(self, "_callbacks"): @@ -710,13 +717,13 @@ def _eval_callbacks_on_fit_begin(self, *, levels, X=None, y=None): with open(file_path, "wb") as f: pickle.dump(self._computation_tree, f) + # Only call the on_fit_begin method of callbacks that are not + # propagated from a meta-estimator. for callback in self._callbacks: - is_propagated = hasattr(self, "_parent_node") and isinstance( + is_propagated = hasattr(self, "_parent_ct_node") and isinstance( callback, AutoPropagatedMixin ) if not is_propagated: - # Only call the on_fit_begin method of callbacks that are not - # propagated from a meta-estimator. callback.on_fit_begin(estimator=self, X=X, y=y) return self._computation_tree.root @@ -728,25 +735,33 @@ def _eval_callbacks_on_fit_end(self): self._computation_tree._tree_status[0] = True + # Only call the on_fit_end method of callbacks that are not + # propagated from a meta-estimator. for callback in self._callbacks: is_propagated = isinstance(callback, AutoPropagatedMixin) and hasattr( - self, "_parent_node" + self, "_parent_ct_node" ) if not is_propagated: - # Only call the on_fit_end method of callbacks that are not - # propagated from a meta-estimator. callback.on_fit_end() def _from_reconstruction_attributes(self, *, reconstruction_attributes): - """Return a as if fitted copy of this estimator + """Return an as if fitted copy of this estimator Parameters ---------- reconstruction_attributes : callable - The necessary fitted attributes to create a working fitted estimator from - this instance. + A callable that has no arguments and returns the necessary fitted attributes + to create a working fitted estimator from this instance. + + Using a callable allows lazy evaluation of the potentially costly + reconstruction attributes. + + Returns + ------- + fitted_estimator : estimator instance + The fitted copy of this estimator. """ - new_estimator = copy.copy(self) + new_estimator = copy.copy(self) # XXX deepcopy ? 
for key, val in reconstruction_attributes().items(): setattr(new_estimator, key, val) return new_estimator diff --git a/sklearn/callback/_computation_tree.py b/sklearn/callback/_computation_tree.py index 161891ca32004..a69a8788e26c5 100644 --- a/sklearn/callback/_computation_tree.py +++ b/sklearn/callback/_computation_tree.py @@ -88,6 +88,12 @@ def get_ancestors(self, include_ancestor_trees=True): return ancestors + def __repr__(self): + return ( + f"ComputationNode(description={self.description}, " + f"depth={self.depth}, idx={self.idx})" + ) + class ComputationTree: """Data structure to store the computation tree of an estimator @@ -221,7 +227,7 @@ def iterate(self, include_leaves=False): def _recursive_iterate(self, node=None, include_leaves=False, node_list=None): """Recursively constructs the iterable""" - # TODO make it a generator + # TODO make it an iterator ? if node is None: node = self.root node_list = [] diff --git a/sklearn/callback/tests/test_base_estimator_callback_methods.py b/sklearn/callback/tests/test_base_estimator_callback_methods.py new file mode 100644 index 0000000000000..676f0a5cfdd0e --- /dev/null +++ b/sklearn/callback/tests/test_base_estimator_callback_methods.py @@ -0,0 +1,95 @@ +# License: BSD 3 clause + +from pathlib import Path +import pytest + +from sklearn.callback.tests._utils import TestingCallback +from sklearn.callback.tests._utils import TestingAutoPropagatedCallback +from sklearn.callback.tests._utils import NotValidCallback +from sklearn.callback.tests._utils import Estimator +from sklearn.callback.tests._utils import MetaEstimator + + +@pytest.mark.parametrize("callbacks", + [ + TestingCallback(), + [TestingCallback()], + [TestingCallback(), TestingAutoPropagatedCallback()], + ] +) +def test_set_callbacks(callbacks): + """Sanity check for the _set_callbacks method""" + estimator = Estimator() + + set_callbacks_return = estimator._set_callbacks(callbacks) + assert hasattr(estimator, "_callbacks") + assert estimator._callbacks in (callbacks, [callbacks]) + assert set_callbacks_return is estimator + + +@pytest.mark.parametrize("callbacks", [None, NotValidCallback()]) +def test_set_callbacks_error(callbacks): + """Check the error message when not passing a valid callback to _set_callbacks""" + estimator = Estimator() + + with pytest.raises(TypeError, match="callbacks must be subclasses of BaseCallback"): + estimator._set_callbacks(callbacks) + + +def test_propagate_callbacks(): + """Sanity check for the _propagate_callbacks method""" + not_propagated_callback = TestingCallback() + propagated_callback = TestingAutoPropagatedCallback() + + estimator = Estimator() + estimator._set_callbacks([not_propagated_callback, propagated_callback]) + + sub_estimator = Estimator() + estimator._propagate_callbacks(sub_estimator, parent_node=None) + + assert hasattr(sub_estimator, "_parent_ct_node") + assert not_propagated_callback not in sub_estimator._callbacks + assert propagated_callback in sub_estimator._callbacks + + +def test_propagate_callback_no_callback(): + """Check that no callback is propagated if there's no callback""" + estimator = Estimator() + sub_estimator = Estimator() + estimator._propagate_callbacks(sub_estimator, parent_node=None) + + assert not hasattr(estimator, "_callbacks") + assert not hasattr(sub_estimator, "_callbacks") + + +def test_auto_propagated_callbacks(): + """Check that it's not possible to set an auto-propagated callback on the + sub-estimator of a meta-estimator. 
+ """ + estimator = Estimator() + estimator._set_callbacks(TestingAutoPropagatedCallback()) + + meta_estimator = MetaEstimator(estimator=estimator) + + match = ( + r"sub-estimators .*of a meta-estimator .*can't have auto-propagated callbacks" + ) + with pytest.raises(TypeError, match=match): + meta_estimator.fit(X=None, y=None) + + +def test_eval_callbacks_on_fit_begin(): + """Check that _eval_callbacks_on_fit_begin creates and dumps the computation tree""" + estimator = Estimator()._set_callbacks(TestingCallback()) + assert not hasattr(estimator, "_computation_tree") + + levels = [ + {"descr": "fit", "max_iter": 10}, + {"descr": "iter", "max_iter": None}, + ] + ct_root = estimator._eval_callbacks_on_fit_begin(levels=levels) + assert hasattr(estimator, "_computation_tree") + assert ct_root is estimator._computation_tree.root + + ct_pickle = Path(estimator._computation_tree.tree_dir) / "computation_tree.pkl" + assert ct_pickle.exists() diff --git a/sklearn/callback/tests/test_callbacks.py b/sklearn/callback/tests/test_callbacks.py index 1f5fcf6bdd3c4..a87cdbcbf3199 100644 --- a/sklearn/callback/tests/test_callbacks.py +++ b/sklearn/callback/tests/test_callbacks.py @@ -1,99 +1,24 @@ # License: BSD 3 clause -from functools import partial import pickle import pytest import tempfile -from time import sleep -from joblib import Parallel, delayed +import numpy as np -from sklearn.base import BaseEstimator, clone from sklearn.callback import Snapshot -from sklearn.callback._base import _eval_callbacks_on_fit_iter_end -from sklearn.datasets import make_classification +from sklearn.callback.tests._utils import Estimator +from sklearn.callback.tests._utils import MetaEstimator -class Estimator(BaseEstimator): - def __init__(self, max_iter=20): - self.max_iter = max_iter - - def fit(self, X, y): - root = self._eval_callbacks_on_fit_begin( - levels=[ - {"descr": "fit", "max_iter": self.max_iter}, - {"descr": "iter", "max_iter": None}, - ], - X=X, - y=y, - ) - - for i in range(self.max_iter): - if _eval_callbacks_on_fit_iter_end( - estimator=self, - node=root.children[i], - from_reconstruction_attributes=partial( - self._from_reconstruction_attributes, - reconstruction_attributes=lambda: {"n_iter_": i + 1}, - ), - ): - break - - self.n_iter_ = i + 1 - - self._eval_callbacks_on_fit_end() - - return self - - -class MetaEstimator(BaseEstimator): - def __init__( - self, estimator, n_outer=4, n_inner=3, n_jobs=None, prefer="processes" - ): - self.estimator = estimator - self.n_outer = n_outer - self.n_inner = n_inner - self.n_jobs = n_jobs - self.prefer = prefer - - def fit(self, X, y): - root = self._eval_callbacks_on_fit_begin( - levels=[ - {"descr": "fit", "max_iter": self.n_outer}, - {"descr": "outer", "max_iter": self.n_inner}, - {"descr": "inner", "max_iter": None}, - ], - X=X, - y=y, - ) - - res = Parallel(n_jobs=self.n_jobs, prefer=self.prefer)( - delayed(self._func)(self.estimator, X, y, node, i) - for i, node in enumerate(root.children) - ) - - self._eval_callbacks_on_fit_end() - - return self - - def _func(self, estimator, X, y, parent_node, i): - for j, node in enumerate(parent_node.children): - est = clone(estimator) - self._propagate_callbacks(est, parent_node=node) - est.fit(X, y) - - _eval_callbacks_on_fit_iter_end(estimator=self, node=node) - - _eval_callbacks_on_fit_iter_end(estimator=self, node=parent_node) - - return +X = np.zeros((100, 3)) +y = np.zeros(100, dtype=int) @pytest.mark.parametrize("n_jobs", (1, 2)) @pytest.mark.parametrize("prefer", ("threads", "processes")) def 
test_snapshot_meta_estimator(n_jobs, prefer):
-    # Test for the Snapshot callback
-    X, y = make_classification()
+    """Test for the Snapshot callback"""
     estimator = Estimator(max_iter=20)

     with tempfile.TemporaryDirectory() as tmp_dir:
@@ -122,3 +47,5 @@ def test_snapshot_meta_estimator(n_jobs, prefer):
         # We kept last 5 snapshots out of 20 iterations.
         # This one is the 16 + i-th.
         assert loaded_estimator.n_iter_ == 16 + i
+
+
diff --git a/sklearn/callback/tests/test_computation_tree.py b/sklearn/callback/tests/test_computation_tree.py
index b726177a342ec..902175b71a250 100644
--- a/sklearn/callback/tests/test_computation_tree.py
+++ b/sklearn/callback/tests/test_computation_tree.py
@@ -1,11 +1,8 @@
 # License: BSD 3 clause

 import numpy as np
-import pytest

 from sklearn.callback import ComputationTree
-from sklearn.callback import ComputationNode
-from sklearn.callback import load_computation_tree


 levels = [
@@ -17,7 +14,7 @@


 def test_computation_tree():
-    # Check the construction of the computation tree
+    """Check the construction of the computation tree"""
     computation_tree = ComputationTree(estimator_name="estimator", levels=levels)

     assert computation_tree.estimator_name == "estimator"
@@ -41,8 +38,9 @@ def test_computation_tree():


 def test_n_nodes():
-    # Check that the number of nodes in a computation tree corresponds to what we expect
-    # from the level descriptions
+    """Check that the number of nodes in a computation tree corresponds to what we expect
+    from the level descriptions
+    """
     computation_tree = ComputationTree(estimator_name="", levels=levels)

     max_iter_per_level = [level["max_iter"] for level in levels[:-1]]
@@ -54,8 +52,9 @@ def test_n_nodes():


 def test_tree_status_idx():
-    # Check that each node has a unique index in the _tree_status array and that their
-    # order corresponds to the order given by a depth first search.
+    """Check that each node has a unique index in the _tree_status array and that their
+    order corresponds to the order given by a depth first search.
+    """
     computation_tree = ComputationTree(estimator_name="", levels=levels)

     indexes = [
@@ -65,7 +64,7 @@ def test_tree_status_idx():


 def test_get_ancestors():
-    # Check that the ancestor search excludes the root and can propagate to parent trees
+    """Check the ancestor search and its propagation to parent trees"""
     parent_levels = [
         {"descr": "parent_level0", "max_iter": 2},
         {"descr": "parent_level1", "max_iter": 4},
@@ -76,23 +75,31 @@
         estimator_name="parent_estimator", levels=parent_levels
     )
     parent_node = parent_computation_tree.root.children[0].children[2]
+    # indices of each node (in its parent's children) in this chain are 0, 0, 2.
+    # (root is always 0).
+    expected_parent_indices = [2, 0, 0]

     computation_tree = ComputationTree(
         estimator_name="estimator", levels=levels, parent_node=parent_node
     )
     node = computation_tree.root.children[1].children[3].children[5]
+    expected_node_indices = [5, 3, 1, 0]

     ancestors = node.get_ancestors(include_ancestor_trees=False)
-    assert ancestors == [node, node.parent, node.parent.parent]
-    assert [n.idx for n in ancestors] == [5, 3, 1]
-    assert computation_tree.root not in ancestors
+    assert ancestors == [
+        node, node.parent, node.parent.parent, node.parent.parent.parent
+    ]
+    assert [n.idx for n in ancestors] == expected_node_indices
+    assert computation_tree.root in ancestors

     ancestors = node.get_ancestors(include_ancestor_trees=True)
     assert ancestors == [
         node,
         node.parent,
         node.parent.parent,
+        node.parent.parent.parent,
         parent_node,
         parent_node.parent,
+        parent_node.parent.parent,
     ]
-    assert [n.idx for n in ancestors] == [5, 3, 1, 2, 0]
+    assert [n.idx for n in ancestors] == expected_node_indices + expected_parent_indices

From 596a58ef39815701346c47769d2bb14ab7814da9 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Wed, 23 Feb 2022 17:57:49 +0100
Subject: [PATCH 10/20] cln

---
 sklearn/pipeline.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index c845f684a7945..433b9e4d57c56 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -344,8 +344,10 @@ def _fit(self, X, y=None, **fit_params_steps):
                 cloned_transformer = transformer
             else:
                 cloned_transformer = clone(transformer)
-            # Fit or load from cache the current transformer
+
             self._propagate_callbacks(cloned_transformer, parent_node=node)
+
+            # Fit or load from cache the current transformer
             X, fitted_transformer = fit_transform_one_cached(
                 cloned_transformer,
                 X,

From 4f9363cf7ec622bab3cd320c9fcadc128ffcbb47 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Mon, 12 Sep 2022 18:29:10 +0200
Subject: [PATCH 11/20] wip

---
 sklearn/callback/_base.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/sklearn/callback/_base.py b/sklearn/callback/_base.py
index a473f172fd575..0e11acd4f54ef 100644
--- a/sklearn/callback/_base.py
+++ b/sklearn/callback/_base.py
@@ -3,7 +3,8 @@
 from abc import ABC, abstractmethod


-# Not a method of BaseEstimator because it might be called from an extern function
+# Not a method of BaseEstimator because it might not be directly called from fit but
+# by a non-method function called by fit
 def _eval_callbacks_on_fit_iter_end(**kwargs):
     """Evaluate the on_fit_iter_end method of the callbacks

@@ -54,6 +55,8 @@ class BaseCallback(ABC):
     def on_fit_begin(self, estimator, *, X=None, y=None):
         """Method called at the beginning of the fit method of the estimator

+        Only called by the estimator this callback was set on, not by estimators it was auto-propagated to.
+
         Parameters
         ----------
         estimator: estimator instance
@@ -105,6 +108,11 @@ class (e.g. LogisticRegressionCV -> LogisticRegression)
             used by generic callbacks but by a callback designed for a specific
             estimator instead.

+        - extra_verbose: dict
+            Model specific quantities meant for verbose output. This is not meant to be
+            used by generic callbacks but by a callback designed for a specific
+            estimator instead.
+ Returns ------- stop : bool or None From 35c5284239faf6f962d8d3f66436889da0020291 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 16 Sep 2022 11:05:15 +0200 Subject: [PATCH 12/20] wip --- sklearn/base.py | 8 ++ sklearn/callback/_base.py | 11 ++ sklearn/callback/tests/_utils.py | 109 ++++++++++++++++++ .../test_base_estimator_callback_methods.py | 29 +++++ sklearn/callback/tests/test_callbacks.py | 17 +++ 5 files changed, 174 insertions(+) create mode 100644 sklearn/callback/tests/_utils.py diff --git a/sklearn/base.py b/sklearn/base.py index 11de1ecfb1fd2..e8938f1c134e8 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -10,6 +10,8 @@ import inspect import re import pickle +from shutil import rmtree +from functools import partial import numpy as np @@ -32,6 +34,7 @@ from .callback import BaseCallback from .callback import AutoPropagatedMixin from .callback import ComputationTree +from .callback._base import CallbackContext def clone(estimator, *, safe=True): @@ -678,6 +681,11 @@ def _eval_callbacks_on_fit_begin(self, *, levels, X=None, y=None): ) if hasattr(self, "_callbacks"): + # + #if self._computation_tree.parent_node is None: + CallbackContext(self._callbacks, finalizer=partial(rmtree, ignore_errors=True), finalizer_args=self._computation_tree.tree_dir) + + # file_path = self._computation_tree.tree_dir / "computation_tree.pkl" with open(file_path, "wb") as f: pickle.dump(self._computation_tree, f) diff --git a/sklearn/callback/_base.py b/sklearn/callback/_base.py index 0e11acd4f54ef..ea0b28be5f937 100644 --- a/sklearn/callback/_base.py +++ b/sklearn/callback/_base.py @@ -1,6 +1,7 @@ # License: BSD 3 clause from abc import ABC, abstractmethod +import weakref # Not a method of BaseEstimator because it might not be directly called from fit but @@ -120,6 +121,9 @@ class (e.g. 
LogisticRegressionCV -> LogisticRegression)
         """
         pass
 
+    def _set_context(self, context):
+        self._callback_context = context
+
 
 class AutoPropagatedMixin:
     """Mixin for auto-propagated callbacks
@@ -132,3 +136,10 @@ class AutoPropagatedMixin:
     """
 
     pass
+
+
+class CallbackContext:
+    def __init__(self, callbacks, finalizer, finalizer_args):
+        for callback in callbacks:
+            callback._set_context(self)
+        weakref.finalize(self, finalizer, finalizer_args)
diff --git a/sklearn/callback/tests/_utils.py b/sklearn/callback/tests/_utils.py
new file mode 100644
index 0000000000000..84e94fce16e7c
--- /dev/null
+++ b/sklearn/callback/tests/_utils.py
@@ -0,0 +1,109 @@
+from functools import partial
+
+from joblib.parallel import Parallel, delayed
+
+from sklearn.base import BaseEstimator, clone
+from sklearn.callback import BaseCallback
+from sklearn.callback import AutoPropagatedMixin
+from sklearn.callback._base import _eval_callbacks_on_fit_iter_end
+
+
+class TestingCallback(BaseCallback):
+    def on_fit_begin(self, estimator, *, X=None, y=None):
+        pass
+
+    def on_fit_end(self):
+        pass
+
+    def on_fit_iter_end(self, estimator, node, **kwargs):
+        pass
+
+
+class TestingAutoPropagatedCallback(TestingCallback, AutoPropagatedMixin):
+    pass
+
+
+class NotValidCallback:
+    def on_fit_begin(self, estimator, *, X=None, y=None):
+        pass
+
+    def on_fit_end(self):
+        pass
+
+    def on_fit_iter_end(self, estimator, node, **kwargs):
+        pass
+
+
+class Estimator(BaseEstimator):
+    def __init__(self, max_iter=20):
+        self.max_iter = max_iter
+
+    def fit(self, X, y):
+        root = self._eval_callbacks_on_fit_begin(
+            levels=[
+                {"descr": "fit", "max_iter": self.max_iter},
+                {"descr": "iter", "max_iter": None},
+            ],
+            X=X,
+            y=y,
+        )
+
+        for i in range(self.max_iter):
+            if _eval_callbacks_on_fit_iter_end(
+                estimator=self,
+                node=root.children[i],
+                from_reconstruction_attributes=partial(
+                    self._from_reconstruction_attributes,
+                    reconstruction_attributes=lambda: {"n_iter_": i + 1},
+                ),
+            ):
+                break
+
+        self.n_iter_ = i + 1
+
+        self._eval_callbacks_on_fit_end()
+
+        return self
+
+
+class MetaEstimator(BaseEstimator):
+    def __init__(
+        self, estimator, n_outer=4, n_inner=3, n_jobs=None, prefer="processes"
+    ):
+        self.estimator = estimator
+        self.n_outer = n_outer
+        self.n_inner = n_inner
+        self.n_jobs = n_jobs
+        self.prefer = prefer
+
+    def fit(self, X, y):
+        root = self._eval_callbacks_on_fit_begin(
+            levels=[
+                {"descr": "fit", "max_iter": self.n_outer},
+                {"descr": "outer", "max_iter": self.n_inner},
+                {"descr": "inner", "max_iter": None},
+            ],
+            X=X,
+            y=y,
+        )
+
+        res = Parallel(n_jobs=self.n_jobs, prefer=self.prefer)(
+            delayed(self._func)(self.estimator, X, y, node, i)
+            for i, node in enumerate(root.children)
+        )
+
+        self._eval_callbacks_on_fit_end()
+
+        return self
+
+    def _func(self, estimator, X, y, parent_node, i):
+        for j, node in enumerate(parent_node.children):
+            est = clone(estimator)
+            self._propagate_callbacks(est, parent_node=node)
+            est.fit(X, y)
+
+            _eval_callbacks_on_fit_iter_end(estimator=self, node=node)
+
+        _eval_callbacks_on_fit_iter_end(estimator=self, node=parent_node)
+
+        return
\ No newline at end of file
diff --git a/sklearn/callback/tests/test_base_estimator_callback_methods.py b/sklearn/callback/tests/test_base_estimator_callback_methods.py
index 676f0a5cfdd0e..ea750abbcf890 100644
--- a/sklearn/callback/tests/test_base_estimator_callback_methods.py
+++ b/sklearn/callback/tests/test_base_estimator_callback_methods.py
@@ -9,6 +9,8 @@
 from sklearn.callback.tests._utils import Estimator
 from sklearn.callback.tests._utils import MetaEstimator
 
+from sklearn.callback import ProgressBar
+
 
 @pytest.mark.parametrize("callbacks",
     [
@@ -93,3 +95,30 @@ def test_eval_callbacks_on_fit_begin():
 
     ct_pickle = Path(estimator._computation_tree.tree_dir) / "computation_tree.pkl"
     assert ct_pickle.exists()
+
+
+def test_callback_context_finalize():
+    """Check that the folder containing the computation tree of the estimator is
+    deleted when there are no reference left to its callbacks.
+    """
+    callback = TestingCallback()
+
+    # estimator is not fitted, its computation tree is not built yet
+    est = Estimator()._set_callbacks(callbacks=callback)
+    assert not hasattr(est, "_computation_tree")
+
+    # estimator is fitted, a folder has been created to hold its computation tree
+    est.fit(X=None, y=None)
+    assert hasattr(est, "_computation_tree")
+    tree_dir = est._computation_tree.tree_dir
+    assert tree_dir.is_dir()
+
+    # there is no more reference to the estimator, but there is still a reference to the
+    # callback which might need to access the computation tree
+    del est
+    assert tree_dir.is_dir()
+
+    # there is no more reference to the callback, the computation tree folder must be
+    # deleted
+    del callback
+    assert not tree_dir.is_dir()
diff --git a/sklearn/callback/tests/test_callbacks.py b/sklearn/callback/tests/test_callbacks.py
index a87cdbcbf3199..bd76325e0af28 100644
--- a/sklearn/callback/tests/test_callbacks.py
+++ b/sklearn/callback/tests/test_callbacks.py
@@ -6,7 +6,11 @@
 
 import numpy as np
 
+from sklearn.callback import ConvergenceMonitor
+from sklearn.callback import EarlyStopping
+from sklearn.callback import ProgressBar
 from sklearn.callback import Snapshot
+from sklearn.callback import TextVerbose
 
 from sklearn.callback.tests._utils import Estimator
 from sklearn.callback.tests._utils import MetaEstimator
@@ -15,6 +19,19 @@
 y = np.zeros(100, dtype=int)
 
 
+@pytest.mark.parametrize("Callback", [ConvergenceMonitor, EarlyStopping, ProgressBar, Snapshot, TextVerbose,])
+def test_callback_doesnt_hold_ref_to_estimator(Callback):
+    callback = Callback()
+    est = Estimator()._set_callbacks(callbacks=callback)
+    est.fit(X, y)
+
+    tree_dir = est._computation_tree.tree_dir
+
+    del est
+    del callback
+    assert not tree_dir.is_dir()
+
+
 @pytest.mark.parametrize("n_jobs", (1, 2))
 @pytest.mark.parametrize("prefer", ("threads", "processes"))
 def test_snapshot_meta_estimator(n_jobs, prefer):

From 115e1840122e542fc940c2dbec056273e6440965 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Fri, 16 Sep 2022 17:48:42 +0200
Subject: [PATCH 13/20] wip

---
 sklearn/base.py                          | 15 ++++++++-------
 sklearn/callback/_base.py                |  5 ++++-
 sklearn/callback/_progressbar.py         |  4 ++--
 sklearn/callback/tests/test_callbacks.py | 16 +++++++++-------
 4 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index e8938f1c134e8..14da63c1b9cb2 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -3,6 +3,7 @@
 # Author: Gael Varoquaux
 # License: BSD 3 clause
 
+from codecs import ignore_errors
 import copy
 import warnings
 from collections import defaultdict
@@ -645,10 +646,11 @@ def _propagate_callbacks(self, sub_estimator, *, parent_node):
 
         sub_estimator._parent_ct_node = parent_node
 
-        if not hasattr(sub_estimator, "_callbacks"):
-            sub_estimator._callbacks = propagated_callbacks
-        else:
-            sub_estimator._callbacks.extend(propagated_callbacks)
+        sub_estimator._set_callbacks(getattr(sub_estimator, "_callbacks", []) + propagated_callbacks)
 
     def _eval_callbacks_on_fit_begin(self, *, levels, X=None, y=None):
         """Evaluate the on_fit_begin method of the callbacks
@@ -681,11 +683,10 @@ def _eval_callbacks_on_fit_begin(self, *, levels, X=None, y=None):
         )
 
         if hasattr(self, "_callbacks"):
-            #
-            #if self._computation_tree.parent_node is None:
+            #
+            CallbackContext(self._callbacks, finalizer=partial(rmtree, ignore_errors=True), finalizer_args=self._computation_tree.tree_dir)
-            #
+            #
             file_path = self._computation_tree.tree_dir / "computation_tree.pkl"
             with open(file_path, "wb") as f:
                 pickle.dump(self._computation_tree, f)
diff --git a/sklearn/callback/_base.py b/sklearn/callback/_base.py
index ea0b28be5f937..045065801cbbd 100644
--- a/sklearn/callback/_base.py
+++ b/sklearn/callback/_base.py
@@ -122,7 +122,10 @@ class (e.g. LogisticRegressionCV -> LogisticRegression)
         pass
 
     def _set_context(self, context):
-        self._callback_context = context
+        if not hasattr(self, "_callback_contexts"):
+            self._callback_contexts = []
+
+        self._callback_contexts.append(context)
 
 
 class AutoPropagatedMixin:
diff --git a/sklearn/callback/_progressbar.py b/sklearn/callback/_progressbar.py
index ae11e67d59f57..fd7201de5c918 100644
--- a/sklearn/callback/_progressbar.py
+++ b/sklearn/callback/_progressbar.py
@@ -105,7 +105,7 @@ class _RichProgressMonitor(Thread):
 
     def __init__(self, estimator, event, max_depth_show=None, max_depth_keep=None):
         Thread.__init__(self)
-        self.estimator = estimator
+        self.computation_tree = estimator._computation_tree
         self.event = event
         self.max_depth_show = max_depth_show
         self.max_depth_keep = max_depth_keep
@@ -151,7 +151,7 @@ def _recursive_update_tasks(self, this_dir=None, depth=0):
             return
 
         if this_dir is None:
-            this_dir = self.estimator._computation_tree.tree_dir
+            this_dir = self.computation_tree.tree_dir
 
         # _ordered_tasks holds the list of the tasks in the order we want them to
         # be displayed.
         self._progress_ctx._ordered_tasks = []
diff --git a/sklearn/callback/tests/test_callbacks.py b/sklearn/callback/tests/test_callbacks.py
index bd76325e0af28..2a457d354077e 100644
--- a/sklearn/callback/tests/test_callbacks.py
+++ b/sklearn/callback/tests/test_callbacks.py
@@ -2,6 +2,7 @@
 import pickle
 
 import pytest
+import sys
 import tempfile
 
 import numpy as np
@@ -22,14 +23,15 @@
 @pytest.mark.parametrize("Callback", [ConvergenceMonitor, EarlyStopping, ProgressBar, Snapshot, TextVerbose,])
 def test_callback_doesnt_hold_ref_to_estimator(Callback):
     callback = Callback()
-    est = Estimator()._set_callbacks(callbacks=callback)
+    est = Estimator()
+    callback_refcount = sys.getrefcount(callback)
+    est_refcount = sys.getrefcount(est)
+
+    est._set_callbacks(callbacks=callback)
     est.fit(X, y)
-
-    tree_dir = est._computation_tree.tree_dir
-
-    del est
-    del callback
-    assert not tree_dir.is_dir()
+    # estimator has a ref on the callback but the callback has no ref to the estimator
+    assert sys.getrefcount(est) == est_refcount
+    assert sys.getrefcount(callback) == callback_refcount + 1
 
 
 @pytest.mark.parametrize("n_jobs", (1, 2))

From bdb49901b778faf670e08a8025e8e0d727466608 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Wed, 21 Sep 2022 12:11:49 +0200
Subject: [PATCH 14/20] wip

---
 sklearn/base.py                  |  7 +++++++
 sklearn/callback/_base.py        | 21 +++++++++++++++++++++
 sklearn/callback/_progressbar.py |  3 +++
 sklearn/decomposition/_nmf.py    |  5 +++++
 sklearn/pipeline.py              |  2 ++
 5 files changed, 38 insertions(+)

diff --git a/sklearn/base.py b/sklearn/base.py
index 14da63c1b9cb2..78dfa06178bc7 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -718,6 +718,13 @@ def _eval_callbacks_on_fit_end(self):
             if not is_propagated:
                 callback.on_fit_end()
 
+    def _eval_callbacks_on_fit_exception(self):
+        if not hasattr(self, "_callbacks"):
+            return
+
+        for callback in self._callbacks:
+            callback.on_fit_exception()
+
     def _from_reconstruction_attributes(self, *, reconstruction_attributes):
         """Return an as if fitted copy of this estimator
 
diff --git a/sklearn/callback/_base.py b/sklearn/callback/_base.py
index 045065801cbbd..c78fb3c773b61 100644
--- a/sklearn/callback/_base.py
+++ b/sklearn/callback/_base.py
@@ -1,6 +1,7 @@
 # License: BSD 3 clause
 
 from abc import ABC, abstractmethod
+from functools import wraps
 import weakref
 
 
@@ -121,6 +122,10 @@ class (e.g. LogisticRegressionCV -> LogisticRegression)
         """
         pass
 
+    @abstractmethod
+    def on_fit_exception(self):
+        pass
+
     def _set_context(self, context):
         if not hasattr(self, "_callback_contexts"):
             self._callback_contexts = []
@@ -146,3 +151,19 @@ def __init__(self, callbacks, finalizer, finalizer_args):
         for callback in callbacks:
             callback._set_context(self)
         weakref.finalize(self, finalizer, finalizer_args)
+
+
+def callback_aware(fit_method):
+    """Decorator ...
+ """ + @wraps(fit_method) + def inner(self, *args, **kwargs): + try: + return fit_method(self, *args, **kwargs) + except BaseException: + self._eval_callbacks_on_fit_exception() + raise + finally: + self._eval_callbacks_on_fit_end() + + return inner diff --git a/sklearn/callback/_progressbar.py b/sklearn/callback/_progressbar.py index fd7201de5c918..713ab995e169a 100644 --- a/sklearn/callback/_progressbar.py +++ b/sklearn/callback/_progressbar.py @@ -62,6 +62,9 @@ def on_fit_end(self): self._stop_event.set() self.progress_monitor.join() + def on_fit_exception(self): + pass + def __getstate__(self): state = self.__dict__.copy() if "_stop_event" in state: diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index f54fd2d18e690..759946afc8c5a 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -9,6 +9,7 @@ from abc import ABC from functools import partial from numbers import Integral, Real +from subprocess import call import numpy as np import scipy.sparse as sp import time @@ -34,6 +35,7 @@ validate_params, ) from ..callback._base import _eval_callbacks_on_fit_iter_end +from ..callback._base import callback_aware EPSILON = np.finfo(np.float32).eps @@ -869,6 +871,8 @@ def _fit_multiplicative_update( H_sum, HHt, XHt = None, None, None for n_iter in range(1, max_iter + 1): + if n_iter == 30: + raise ValueError("eh ouais") # update W # H_sum, HHt and XHt are saved and reused if not update_H W, H_sum, HHt, XHt = _multiplicative_update_w( @@ -1726,6 +1730,7 @@ def _check_params(self, X): return self + @callback_aware def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index b93d412020bef..81f4500726ff3 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -32,6 +32,7 @@ from .utils.fixes import delayed from .exceptions import NotFittedError from .callback._base import _eval_callbacks_on_fit_iter_end +from .callback._base import callback_aware __all__ = ["Pipeline", "FeatureUnion", "make_pipeline", "make_union"] @@ -366,6 +367,7 @@ def _fit(self, X, y=None, **fit_params_steps): return X + @callback_aware def fit(self, X, y=None, **fit_params): """Fit the model. From 7a43c306b6f57b847b3f7348905c47d2786757fc Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 23 Sep 2022 18:15:57 +0200 Subject: [PATCH 15/20] wip --- sklearn/base.py | 26 +++------ sklearn/callback/__init__.py | 2 - sklearn/callback/_base.py | 49 +++++++++------- sklearn/callback/_computation_tree.py | 5 ++ sklearn/callback/_progressbar.py | 84 +++++++++++++++++++++------ sklearn/callback/_snapshot.py | 2 - sklearn/callback/_text_verbose.py | 5 +- sklearn/callback/tests/_utils.py | 6 +- sklearn/decomposition/_nmf.py | 2 - 9 files changed, 112 insertions(+), 69 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 78dfa06178bc7..9b4e659d8647a 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -33,7 +33,6 @@ from .utils._estimator_html_repr import estimator_html_repr from .utils._param_validation import validate_parameter_constraints from .callback import BaseCallback -from .callback import AutoPropagatedMixin from .callback import ComputationTree from .callback._base import CallbackContext @@ -617,13 +616,12 @@ def _propagate_callbacks(self, sub_estimator, *, parent_node): method of the sub-estimator is called. 
""" if hasattr(sub_estimator, "_callbacks") and any( - isinstance(callback, AutoPropagatedMixin) - for callback in sub_estimator._callbacks + callback.auto_propagate for callback in sub_estimator._callbacks ): bad_callbacks = [ callback.__class__.__name__ for callback in sub_estimator._callbacks - if isinstance(callback, AutoPropagatedMixin) + if callback.auto_propagate ] raise TypeError( f"The sub-estimators ({sub_estimator.__class__.__name__}) of a" @@ -638,7 +636,7 @@ def _propagate_callbacks(self, sub_estimator, *, parent_node): propagated_callbacks = [ callback for callback in self._callbacks - if isinstance(callback, AutoPropagatedMixin) + if callback.auto_propagate ] if not propagated_callbacks: @@ -646,11 +644,9 @@ def _propagate_callbacks(self, sub_estimator, *, parent_node): sub_estimator._parent_ct_node = parent_node - # if not hasattr(sub_estimator, "_callbacks"): - # sub_estimator._callbacks = propagated_callbacks - # else: - # sub_estimator._callbacks.extend(propagated_callbacks) - sub_estimator._set_callbacks(getattr(sub_estimator, "_callbacks", []) + propagated_callbacks) + sub_estimator._set_callbacks( + getattr(sub_estimator, "_callbacks", []) + propagated_callbacks + ) def _eval_callbacks_on_fit_begin(self, *, levels, X=None, y=None): """Evaluate the on_fit_begin method of the callbacks @@ -694,10 +690,7 @@ def _eval_callbacks_on_fit_begin(self, *, levels, X=None, y=None): # Only call the on_fit_begin method of callbacks that are not # propagated from a meta-estimator. for callback in self._callbacks: - is_propagated = hasattr(self, "_parent_ct_node") and isinstance( - callback, AutoPropagatedMixin - ) - if not is_propagated: + if not callback._is_propagated(estimator=self): callback.on_fit_begin(estimator=self, X=X, y=y) return self._computation_tree.root @@ -712,10 +705,7 @@ def _eval_callbacks_on_fit_end(self): # Only call the on_fit_end method of callbacks that are not # propagated from a meta-estimator. for callback in self._callbacks: - is_propagated = isinstance(callback, AutoPropagatedMixin) and hasattr( - self, "_parent_ct_node" - ) - if not is_propagated: + if not callback._is_propagated(estimator=self): callback.on_fit_end() def _eval_callbacks_on_fit_exception(self): diff --git a/sklearn/callback/__init__.py b/sklearn/callback/__init__.py index c8d5ea0bf0606..9767411b6c934 100644 --- a/sklearn/callback/__init__.py +++ b/sklearn/callback/__init__.py @@ -1,6 +1,5 @@ # License: BSD 3 clause -from ._base import AutoPropagatedMixin from ._base import BaseCallback from ._computation_tree import ComputationNode from ._computation_tree import ComputationTree @@ -12,7 +11,6 @@ from ._text_verbose import TextVerbose __all__ = [ - "AutoPropagatedMixin", "BaseCallback", "ComputationNode", "ComputationTree", diff --git a/sklearn/callback/_base.py b/sklearn/callback/_base.py index c78fb3c773b61..65b24dc85e9bb 100644 --- a/sklearn/callback/_base.py +++ b/sklearn/callback/_base.py @@ -33,17 +33,11 @@ def _eval_callbacks_on_fit_iter_end(**kwargs): # stopping_criterion and reconstruction_attributes can be costly to compute. They # are passed as lambdas for lazy evaluation. We only actually compute them if a # callback requests it. 
-    if any(
-        getattr(callback, "request_stopping_criterion", False)
-        for callback in estimator._callbacks
-    ):
+    if any(cb.request_stopping_criterion for cb in estimator._callbacks):
         kwarg = kwargs.pop("stopping_criterion", lambda: None)()
         kwargs["stopping_criterion"] = kwarg
 
-    if any(
-        getattr(callback, "request_from_reconstruction_attributes", False)
-        for callback in estimator._callbacks
-    ):
+    if any(cb.request_from_reconstruction_attributes for cb in estimator._callbacks):
         kwarg = kwargs.pop("from_reconstruction_attributes", lambda: None)()
         kwargs["from_reconstruction_attributes"] = kwarg
 
@@ -126,6 +120,32 @@ class (e.g. LogisticRegressionCV -> LogisticRegression)
     def on_fit_exception(self):
         pass
 
+    @property
+    def auto_propagate(self):
+        """Whether or not this callback should be propagated to sub-estimators.
+
+        An auto-propagated callback (from a meta-estimator to its sub-estimators) must
+        be set on the meta-estimator. Its `on_fit_begin` and `on_fit_end` methods will
+        only be called at the beginning and end of the fit method of the meta-estimator,
+        while its `on_fit_iter_end` method will be called at each computation node of
+        the meta-estimator and its sub-estimators.
+        """
+        return False
+
+    def _is_propagated(self, estimator):
+        """Check if this callback attached to estimator has been propagated from a
+        meta-estimator.
+        """
+        return self.auto_propagate and hasattr(estimator, "_parent_ct_node")
+
+    @property
+    def request_stopping_criterion(self):
+        return False
+
+    @property
+    def request_from_reconstruction_attributes(self):
+        return False
+
     def _set_context(self, context):
         if not hasattr(self, "_callback_contexts"):
             self._callback_contexts = []
@@ -133,19 +153,6 @@ def _set_context(self, context):
         self._callback_contexts.append(context)
 
 
-class AutoPropagatedMixin:
-    """Mixin for auto-propagated callbacks
-
-    An auto-propagated callback (from a meta-estimator to its sub-estimators) must be
-    set on the meta-estimator. Its `on_fit_begin` and `on_fit_end` methods will only be
-    called at the beginning and end of the fit method of the meta-estimator, while its
-    `on_fit_iter_end` method will be called at each computation node of the
-    meta-estimator and its sub-estimators.
-    """
-
-    pass
-
-
 class CallbackContext:
     def __init__(self, callbacks, finalizer, finalizer_args):
         for callback in callbacks:
diff --git a/sklearn/callback/_computation_tree.py b/sklearn/callback/_computation_tree.py
index a69a8788e26c5..a6eb739580446 100644
--- a/sklearn/callback/_computation_tree.py
+++ b/sklearn/callback/_computation_tree.py
@@ -208,6 +208,11 @@ def get_progress(self, node):
             [self._tree_status[child.tree_status_idx] for child in node.children]
         )
 
+    def get_child_computation_tree_dir(self, node):
+        if node.children:
+            raise ValueError("node is not a leaf")
+        return self.tree_dir / str(node.tree_status_idx)
+
     def iterate(self, include_leaves=False):
         """Return an iterable over the nodes of the computation tree
 
diff --git a/sklearn/callback/_progressbar.py b/sklearn/callback/_progressbar.py
index 713ab995e169a..1de13c87f2a8f 100644
--- a/sklearn/callback/_progressbar.py
+++ b/sklearn/callback/_progressbar.py
@@ -1,21 +1,30 @@
 # License: BSD 3 clause
 
-from copy import copy
-import pickle
+import importlib
 from threading import Thread, Event
 
-import numpy as np
-from tqdm import tqdm
-from rich.progress import Progress
-from rich.progress import BarColumn, TimeRemainingColumn, TextColumn
-from rich.style import Style
-
 from . import BaseCallback
-from . import AutoPropagatedMixin
 from . import load_computation_tree
 
 
-class ProgressBar(BaseCallback, AutoPropagatedMixin):
+def _check_backend_support(backend, caller_name):
+    """Raise ImportError with detailed error message if backend is not installed.
+
+    Parameters
+    ----------
+    backend : {"rich", "tqdm"}
+        The requested backend.
+
+    caller_name : str
+        The name of the caller that requires the backend.
+    """
+    try:
+        importlib.import_module(backend)  # noqa
+    except ImportError as e:
+        raise ImportError(f"{caller_name} requires {backend} installed.") from e
+
+
+class ProgressBar(BaseCallback):
     """Callback that displays progress bars for each iterative steps of the estimator
 
     Parameters
@@ -31,13 +40,20 @@ class ProgressBar(BaseCallback):
         finished.
     """
 
+    auto_propagate = True
+
     def __init__(self, backend="rich", max_depth_show=None, max_depth_keep=None):
+        if backend not in ("rich", "tqdm"):
+            raise ValueError(f"backend should be 'rich' or 'tqdm', got {self.backend} instead.")
+        _check_backend_support(backend, caller_name="Progressbar")
         self.backend = backend
+
         if max_depth_show is not None and max_depth_show < 0:
             raise ValueError(f"max_depth_show should be >= 0.")
+        self.max_depth_show = max_depth_show
+
         if max_depth_keep is not None and max_depth_keep < 0:
             raise ValueError(f"max_depth_keep should be >= 0.")
-        self.max_depth_show = max_depth_show
         self.max_depth_keep = max_depth_keep
 
     def on_fit_begin(self, estimator, X=None, y=None):
@@ -50,8 +66,11 @@ def on_fit_begin(self, estimator, X=None, y=None):
                 max_depth_show=self.max_depth_show,
                 max_depth_keep=self.max_depth_keep,
             )
-        else:
-            raise ValueError(f"backend should be 'rich', got {self.backend} instead.")
+        elif self.backend == "tqdm":
+            self.progress_monitor = _TqdmProgressMonitor(
+                estimator=estimator,
+                event=self._stop_event,
+            )
 
         self.progress_monitor.start()
 
@@ -77,10 +96,15 @@ def __getstate__(self):
 # Custom Progress class to allow showing the tasks in a given order (given by setting
 # the _ordered_tasks attribute). In particular it allows to dynamically create and
 # insert tasks between existing tasks.
-class _Progress(Progress):
-    def get_renderables(self):
-        table = self.make_tasks_table(getattr(self, "_ordered_tasks", []))
-        yield table
+
+try:
+    from rich.progress import Progress
+    class _Progress(Progress):
+        def get_renderables(self):
+            table = self.make_tasks_table(getattr(self, "_ordered_tasks", []))
+            yield table
+except ImportError:
+    pass
 
 
 class _RichProgressMonitor(Thread):
@@ -119,6 +143,9 @@ def __init__(self, estimator, event, max_depth_show=None, max_depth_keep=None):
         self._computation_trees = {}
 
     def run(self):
+        from rich.progress import BarColumn, TimeRemainingColumn, TextColumn
+        from rich.style import Style
+
         with _Progress(
             TextColumn("[progress.description]{task.description}"),
             BarColumn(
@@ -218,7 +245,8 @@ def _recursive_update_tasks(self, this_dir=None, depth=0):
             else:
                 # node is a leaf, look for tasks of its sub computation tree before
                 # going to the next node
-                child_dir = this_dir / str(node.tree_status_idx)
+                child_dir = computation_tree.get_child_computation_tree_dir(node)
+                # child_dir = this_dir / str(node.tree_status_idx)
                 if child_dir.exists():
                     self._recursive_update_tasks(
                         child_dir, depth + computation_tree.depth
@@ -258,3 +286,23 @@ def _get_parent_task(self, node, computation_tree, task_ids):
             ]
             return self._progress_ctx._tasks[task_id]
         return
+
+
+class _TqdmProgressMonitor(Thread):
+    def __init__(self, estimator, event):
+        Thread.__init__(self)
+        self.computation_tree = estimator._computation_tree
+        self.event = event
+
+    def run(self):
+        from tqdm import tqdm
+
+        root = self.computation_tree.root
+
+        with tqdm(total=len(root.children)) as pbar:
+            while not self.event.wait(0.05):
+                node_progress = self.computation_tree.get_progress(root)
+                if node_progress != pbar.total:
+                    pbar.update(node_progress - pbar.n)
+
+            pbar.update(pbar.total - pbar.n)
diff --git a/sklearn/callback/_snapshot.py b/sklearn/callback/_snapshot.py
index cbf200336c749..238bc29cf8543 100644
--- a/sklearn/callback/_snapshot.py
+++ b/sklearn/callback/_snapshot.py
@@ -4,8 +4,6 @@
 from pathlib import Path
 import pickle
 
-import numpy as np
-
 from . import BaseCallback
 
 
diff --git a/sklearn/callback/_text_verbose.py b/sklearn/callback/_text_verbose.py
index b857ff592c87c..0064ec97f2052 100644
--- a/sklearn/callback/_text_verbose.py
+++ b/sklearn/callback/_text_verbose.py
@@ -3,10 +3,11 @@
 import time
 
 from . import BaseCallback
-from . import AutoPropagatedMixin
 
 
-class TextVerbose(BaseCallback, AutoPropagatedMixin):
+class TextVerbose(BaseCallback):
+
+    auto_propagate = True
     request_stopping_criterion = True
 
     def __init__(self, min_time_between_calls=0):
diff --git a/sklearn/callback/tests/_utils.py b/sklearn/callback/tests/_utils.py
index 84e94fce16e7c..888a5649c19bb 100644
--- a/sklearn/callback/tests/_utils.py
+++ b/sklearn/callback/tests/_utils.py
@@ -4,7 +4,6 @@
 
 from sklearn.base import BaseEstimator, clone
 from sklearn.callback import BaseCallback
-from sklearn.callback import AutoPropagatedMixin
 from sklearn.callback._base import _eval_callbacks_on_fit_iter_end
 
 
@@ -19,9 +18,8 @@ def on_fit_iter_end(self, estimator, node, **kwargs):
         pass
 
 
-class TestingAutoPropagatedCallback(TestingCallback, AutoPropagatedMixin):
-    pass
-
+class TestingAutoPropagatedCallback(TestingCallback):
+    auto_propagate = True
 
 class NotValidCallback:
     def on_fit_begin(self, estimator, *, X=None, y=None):
diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py
index 7d0ae56f09c31..75185df49de5f 100644
--- a/sklearn/decomposition/_nmf.py
+++ b/sklearn/decomposition/_nmf.py
@@ -847,8 +847,6 @@ def _fit_multiplicative_update(
     H_sum, HHt, XHt = None, None, None
 
     for n_iter in range(1, max_iter + 1):
-        if n_iter == 30:
-            raise ValueError("eh ouais")
         # update W
         # H_sum, HHt and XHt are saved and reused if not update_H
         W, H_sum, HHt, XHt = _multiplicative_update_w(

From a218068ef0a6627e1e1436b6f7fd6c02186b1dd8 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Thu, 13 Oct 2022 09:55:20 +0200
Subject: [PATCH 16/20] wip

---
 sklearn/base.py                                       | 7 -------
 sklearn/callback/_base.py                             | 7 -------
 sklearn/callback/_early_stopping.py                   | 8 +++++++-
 sklearn/callback/_progressbar.py                      | 5 +----
 .../tests/test_base_estimator_callback_methods.py     | 2 --
 sklearn/callback/tests/test_callbacks.py              | 2 --
 6 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 9b4e659d8647a..687c1a9954ab8 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -708,13 +708,6 @@ def _eval_callbacks_on_fit_end(self):
             if not callback._is_propagated(estimator=self):
                 callback.on_fit_end()
 
-    def _eval_callbacks_on_fit_exception(self):
-        if not hasattr(self, "_callbacks"):
-            return
-
-        for callback in self._callbacks:
-            callback.on_fit_exception()
-
     def _from_reconstruction_attributes(self, *, reconstruction_attributes):
         """Return an as if fitted copy of this estimator
 
diff --git a/sklearn/callback/_base.py b/sklearn/callback/_base.py
index 65b24dc85e9bb..96cc1619651dd 100644
--- a/sklearn/callback/_base.py
+++ b/sklearn/callback/_base.py
@@ -116,10 +116,6 @@ class (e.g. LogisticRegressionCV -> LogisticRegression)
         """
         pass
 
-    @abstractmethod
-    def on_fit_exception(self):
-        pass
-
     @property
     def auto_propagate(self):
         """Whether or not this callback should be propagated to sub-estimators.
@@ -167,9 +163,6 @@ def callback_aware(fit_method):
     def inner(self, *args, **kwargs):
         try:
             return fit_method(self, *args, **kwargs)
-        except BaseException:
-            self._eval_callbacks_on_fit_exception()
-            raise
         finally:
             self._eval_callbacks_on_fit_end()
 
diff --git a/sklearn/callback/_early_stopping.py b/sklearn/callback/_early_stopping.py
index 44a0108e04b26..dc45da6379a52 100644
--- a/sklearn/callback/_early_stopping.py
+++ b/sklearn/callback/_early_stopping.py
@@ -1,9 +1,13 @@
 # License: BSD 3 clause
 
+from urllib import request
 from . import BaseCallback
 
 
 class EarlyStopping(BaseCallback):
+
+    request_from_reconstruction_attributes = True
+
     def __init__(
         self,
         X_val=None,
@@ -23,7 +27,9 @@ def on_fit_begin(self, estimator, X=None, y=None):
         self._no_improvement = {}
         self._last_monitored = {}
 
-    def on_fit_iter_end(self, *, node, **kwargs):
+    def on_fit_iter_end(self, *, estimator, node, **kwargs):
+        new_estimator = kwargs.get("from_reconstruction_attributes", None)
+
         if node.depth != self.estimator._computation_tree.depth:
             return
 
diff --git a/sklearn/callback/_progressbar.py b/sklearn/callback/_progressbar.py
index 1de13c87f2a8f..bd371bc1c3a7c 100644
--- a/sklearn/callback/_progressbar.py
+++ b/sklearn/callback/_progressbar.py
@@ -29,7 +29,7 @@ class ProgressBar(BaseCallback):
 
     Parameters
     ----------
-    backend: {"rich"}, default="rich"
+    backend: {"rich", "tqdm"}, default="rich"
         The backend for the progress bars display.
 
     max_depth_show : int, default=None
@@ -81,9 +81,6 @@ def on_fit_end(self):
         self._stop_event.set()
         self.progress_monitor.join()
 
-    def on_fit_exception(self):
-        pass
-
     def __getstate__(self):
         state = self.__dict__.copy()
         if "_stop_event" in state:
diff --git a/sklearn/callback/tests/test_base_estimator_callback_methods.py b/sklearn/callback/tests/test_base_estimator_callback_methods.py
index ea750abbcf890..c77d88b68ce3d 100644
--- a/sklearn/callback/tests/test_base_estimator_callback_methods.py
+++ b/sklearn/callback/tests/test_base_estimator_callback_methods.py
@@ -9,8 +9,6 @@
 from sklearn.callback.tests._utils import Estimator
 from sklearn.callback.tests._utils import MetaEstimator
 
-from sklearn.callback import ProgressBar
-
 
 @pytest.mark.parametrize("callbacks",
     [
diff --git a/sklearn/callback/tests/test_callbacks.py b/sklearn/callback/tests/test_callbacks.py
index 2a457d354077e..fb99003eb3b09 100644
--- a/sklearn/callback/tests/test_callbacks.py
+++ b/sklearn/callback/tests/test_callbacks.py
@@ -66,5 +66,3 @@ def test_snapshot_meta_estimator(n_jobs, prefer):
             # We kept last 5 snapshots out of 20 iterations.
             # This one is the 16 + i-th.
             assert loaded_estimator.n_iter_ == 16 + i
-
-

From f794694ce9fea32213503d926a314511bade5e8e Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Thu, 13 Oct 2022 16:27:27 +0200
Subject: [PATCH 17/20] update poor_score

---
 doc/developers/develop.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst
index 0e4b8258476da..3329649d20513 100644
--- a/doc/developers/develop.rst
+++ b/doc/developers/develop.rst
@@ -553,8 +553,9 @@ preserves_dtype (default=``[np.float64]``)
 
 poor_score (default=False)
     whether the estimator fails to provide a "reasonable" test-set score, which
-    currently for regression is an R2 of 0.5 on a subset of the boston housing
-    dataset, and for classification an accuracy of 0.83 on
+    currently for regression is an R2 of 0.5 on ``make_regression(n_samples=200,
+    n_features=10, n_informative=1, bias=5.0, noise=20, random_state=42)``, and
+    for classification an accuracy of 0.83 on
    ``make_blobs(n_samples=300, random_state=0)``.
    These datasets and values are based on current estimators in sklearn and
    might be replaced by something more systematic.
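[Editor's note: before the larger refactor in the next patch, here is a minimal usage
sketch of the API as it stands at this point in the series. It is not part of any
patch; it assumes this branch is installed and uses only names introduced in the
diffs above (`_set_callbacks` is still a private method, and `ProgressBar` accepts
`backend="rich"` or `"tqdm"` since patch 15):

    import numpy as np
    from sklearn.decomposition import NMF
    from sklearn.callback import ProgressBar

    # Any estimator instrumented in this branch works; NMF's coordinate-descent
    # and multiplicative-update solvers are wired up in the patches above.
    X = np.random.RandomState(0).rand(100, 20)  # non-negative data for NMF

    nmf = NMF(n_components=5, solver="mu", max_iter=200)
    # _set_callbacks returns the estimator itself, so calls can be chained.
    nmf._set_callbacks(ProgressBar(backend="tqdm"))
    nmf.fit(X)

Because `ProgressBar` sets `auto_propagate = True`, the same instance attached to a
meta-estimator (a `Pipeline`, or the `MetaEstimator` test helper) is forwarded to
every sub-estimator fit, whose computation tree is then hooked under the
meta-estimator's node.]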
From 37e569b13a7202aa79d7b5aa1b8d30de323139ca Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Wed, 21 Jun 2023 10:24:55 +0200
Subject: [PATCH 18/20] wip

---
 sklearn/base.py                                       | 60 +++++++++-----
 sklearn/callback/_base.py                             | 27 +++----
 sklearn/callback/_convergence_monitor.py              | 11 ++-
 sklearn/callback/_early_stopping.py                   | 65 +++++++++++-----
 sklearn/callback/_progressbar.py                      |  6 +-
 sklearn/callback/_text_verbose.py                     |  1 -
 sklearn/callback/tests/_utils.py                      | 13 +++-
 .../tests/test_base_estimator_callback_methods.py     |  7 +-
 sklearn/callback/tests/test_callbacks.py              | 13 +++-
 sklearn/callback/tests/test_computation_tree.py       |  5 +-
 sklearn/decomposition/_nmf.py                         | 78 ++++++++++++-------
 sklearn/linear_model/_logistic.py                     |  8 +-
 sklearn/model_selection/_search.py                    | 17 +++-
 sklearn/model_selection/_validation.py                |  2 +-
 sklearn/pipeline.py                                   |  4 +-
 15 files changed, 205 insertions(+), 112 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index feb0fb4e31a57..9c802b536f89d 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -3,7 +3,6 @@
 # Author: Gael Varoquaux
 # License: BSD 3 clause
 
-from codecs import ignore_errors
 import copy
 import functools
 import warnings
@@ -659,7 +658,7 @@ def _set_callbacks(self, callbacks):
         Returns
         -------
         self : estimator instance
-            The estimator instance itself. 
+            The estimator instance itself.
         """
         if not isinstance(callbacks, list):
             callbacks = [callbacks]
 
@@ -705,9 +704,7 @@ def _propagate_callbacks(self, sub_estimator, *, parent_node):
             return
 
         propagated_callbacks = [
-            callback
-            for callback in self._callbacks
-            if callback.auto_propagate
+            callback for callback in self._callbacks if callback.auto_propagate
         ]
 
         if not propagated_callbacks:
@@ -749,28 +746,50 @@ def _eval_callbacks_on_fit_begin(self, *, levels, X=None, y=None):
             parent_node=getattr(self, "_parent_ct_node", None),
         )
 
-        if hasattr(self, "_callbacks"):
-            #
-            CallbackContext(self._callbacks, finalizer=partial(rmtree, ignore_errors=True), finalizer_args=self._computation_tree.tree_dir)
+        if not hasattr(self, "_callbacks"):
+            return self._computation_tree.root, X, y, None, None
+
+        X_val, y_val = None, None
 
-            #
-            file_path = self._computation_tree.tree_dir / "computation_tree.pkl"
-            with open(file_path, "wb") as f:
-                pickle.dump(self._computation_tree, f)
+        if any(callback.request_validation_split for callback in self._callbacks):
+            splitter = next(
+                callback.validation_split for callback in self._callbacks if hasattr(callback, "validation_split")
+            )
 
-            # Only call the on_fit_begin method of callbacks that are not
-            # propagated from a meta-estimator.
+            train, val = next(splitter.split(X))
+            if X is not None:
+                X, X_val = X[train], X[val]
+            if y is not None:
+                y, y_val = y[train], y[val]
+
+        #
+        CallbackContext(
+            self._callbacks,
+            finalizer=partial(rmtree, ignore_errors=True),
+            finalizer_args=self._computation_tree.tree_dir,
+        )
 
-            for callback in self._callbacks:
-                if not callback._is_propagated(estimator=self):
-                    callback.on_fit_begin(estimator=self, X=X, y=y)
+        #
+        file_path = self._computation_tree.tree_dir / "computation_tree.pkl"
+        with open(file_path, "wb") as f:
+            pickle.dump(self._computation_tree, f)
+
+        # Only call the on_fit_begin method of callbacks that are not
+        # propagated from a meta-estimator.
+        for callback in self._callbacks:
+            if not callback._is_propagated(estimator=self):
+                callback.on_fit_begin(estimator=self, X=X, y=y)
+
+        return self._computation_tree.root, X, y, X_val, y_val
 
     def _eval_callbacks_on_fit_end(self):
         """Evaluate the on_fit_end method of the callbacks"""
         if not hasattr(self, "_callbacks"):
             return
 
+        if not hasattr(self, "_computation_tree"):
+            return
+
         self._computation_tree._tree_status[0] = True
 
         # Only call the on_fit_end method of callbacks that are not
@@ -1309,7 +1328,10 @@ def wrapper(estimator, *args, **kwargs):
                 prefer_skip_nested_validation or global_skip_validation
             )
         ):
-            return fit_method(estimator, *args, **kwargs)
+            try:
+                return fit_method(estimator, *args, **kwargs)
+            finally:
+                estimator._eval_callbacks_on_fit_end()
 
     return wrapper
 
diff --git a/sklearn/callback/_base.py b/sklearn/callback/_base.py
index 96cc1619651dd..a07115f7e4e0c 100644
--- a/sklearn/callback/_base.py
+++ b/sklearn/callback/_base.py
@@ -30,9 +30,9 @@ def _eval_callbacks_on_fit_iter_end(**kwargs):
 
     estimator._computation_tree._tree_status[node.tree_status_idx] = True
 
-    # stopping_criterion and reconstruction_attributes can be costly to compute. They
-    # are passed as lambdas for lazy evaluation. We only actually compute them if a
-    # callback requests it.
+    # stopping_criterion and reconstruction_attributes can be costly to compute.
+    # They are passed as lambdas for lazy evaluation. We only actually
+    # compute them if a callback requests it.
     if any(cb.request_stopping_criterion for cb in estimator._callbacks):
         kwarg = kwargs.pop("stopping_criterion", lambda: None)()
         kwargs["stopping_criterion"] = kwarg
@@ -51,7 +51,7 @@ class BaseCallback(ABC):
     def on_fit_begin(self, estimator, *, X=None, y=None):
         """Method called at the beginning of the fit method of the estimator
 
-        Only called 
+        Only called
 
         Parameters
         ----------
@@ -141,11 +141,15 @@ def request_stopping_criterion(self):
     @property
     def request_from_reconstruction_attributes(self):
         return False
+
+    @property
+    def request_validation_split(self):
+        return False
 
     def _set_context(self, context):
         if not hasattr(self, "_callback_contexts"):
             self._callback_contexts = []
-            
+
         self._callback_contexts.append(context)
 
 
@@ -154,16 +158,3 @@ def __init__(self, callbacks, finalizer, finalizer_args):
         for callback in callbacks:
             callback._set_context(self)
         weakref.finalize(self, finalizer, finalizer_args)
-
-
-def callback_aware(fit_method):
-    """Decorator ...
- """ - @wraps(fit_method) - def inner(self, *args, **kwargs): - try: - return fit_method(self, *args, **kwargs) - finally: - self._eval_callbacks_on_fit_end() - - return inner diff --git a/sklearn/callback/_convergence_monitor.py b/sklearn/callback/_convergence_monitor.py index ac04335e04661..98fec496d6eb7 100644 --- a/sklearn/callback/_convergence_monitor.py +++ b/sklearn/callback/_convergence_monitor.py @@ -33,7 +33,13 @@ class ConvergenceMonitor(BaseCallback): request_reconstruction_attributes = True - def __init__(self, *, monitor="objective_function", X_val=None, y_val=None): + def __init__( + self, + *, + monitor="objective_function", + on="val", + higher_is_better=False, + ): if monitor == "objective_function": self._monitor = "objective_function" else: @@ -41,9 +47,6 @@ def __init__(self, *, monitor="objective_function", X_val=None, y_val=None): if self._monitor is None: raise ValueError(f"unknown metric {monitor}") - self.X_val = X_val - self.y_val = y_val - self._data_file = Path(mkdtemp()) / "convergence_monitor.csv" def on_fit_begin(self, estimator, *, X=None, y=None): diff --git a/sklearn/callback/_early_stopping.py b/sklearn/callback/_early_stopping.py index dc45da6379a52..6d408dda8c960 100644 --- a/sklearn/callback/_early_stopping.py +++ b/sklearn/callback/_early_stopping.py @@ -1,54 +1,77 @@ # License: BSD 3 clause -from urllib import request from . import BaseCallback class EarlyStopping(BaseCallback): - request_from_reconstruction_attributes = True def __init__( self, - X_val=None, - y_val=None, monitor="objective_function", + on="validation_set", + higher_is_better=False, + validation_split="auto", max_no_improvement=10, - tol=1e-2, + threshold=1e-2, ): - self.X_val = X_val - self.y_val = y_val + from ..model_selection import KFold + self.validation_split = validation_split + if validation_split == "auto": + self.validation_split = KFold(n_splits=5, shuffle=True, random_state=42) self.monitor = monitor + self.on = on + self.higher_is_better = higher_is_better self.max_no_improvement = max_no_improvement - self.tol = tol + self.threshold = threshold def on_fit_begin(self, estimator, X=None, y=None): - self.estimator = estimator self._no_improvement = {} self._last_monitored = {} + self.early_stopped_ = None def on_fit_iter_end(self, *, estimator, node, **kwargs): - new_estimator = kwargs.get("from_reconstruction_attributes", None) - - if node.depth != self.estimator._computation_tree.depth: + if node.depth != estimator._computation_tree.depth: return + reconstructed_estimator = kwargs.pop("from_reconstruction_attributes") + data = kwargs.pop("data") + + X = data["X_val"] if self.on == "validation_set" else data["X"] + y = data["y_val"] if self.on == "validation_set" else data["y"] + if self.monitor == "objective_function": - objective_function = kwargs.get("objective_function", None) - monitored, *_ = objective_function(self.X_val) - elif self.monitor == "TODO": - pass - - if node.parent not in self._last_monitored or monitored < self._last_monitored[ - node.parent - ] * (1 - self.tol): + new_monitored, *_ = reconstructed_estimator.objective_function(X, y, normalize=True) + elif callable(self.monitor): + new_monitored = self.monitor(reconstructed_estimator, X, y) + elif self.monitor is None or isinstance(self.monitor, str): + from ..metrics import check_scoring + scorer = check_scoring(estimator, self.monitor) + new_monitored = scorer(estimator, X, y) + + if self._score_improved(node, new_monitored): self._no_improvement[node.parent] = 0 - 
self._last_monitored[node.parent] = monitored + self._last_monitored[node.parent] = new_monitored else: self._no_improvement[node.parent] += 1 if self._no_improvement[node.parent] >= self.max_no_improvement: + self.early_stopped_ = node.idx + return True + + def _score_improved(self, node, new_monitored): + if node.parent not in self._last_monitored: return True + + last_monitored = self._last_monitored[node.parent] + if self.higher_is_better: + return new_monitored > last_monitored * (1 + self.threshold) + else: + return new_monitored < last_monitored * (1 - self.threshold) def on_fit_end(self): pass + + @property + def request_validation_split(self): + return self.on == "val" diff --git a/sklearn/callback/_progressbar.py b/sklearn/callback/_progressbar.py index bd371bc1c3a7c..738e8f897ce4a 100644 --- a/sklearn/callback/_progressbar.py +++ b/sklearn/callback/_progressbar.py @@ -44,7 +44,9 @@ class ProgressBar(BaseCallback): def __init__(self, backend="rich", max_depth_show=None, max_depth_keep=None): if backend not in ("rich", "tqdm"): - raise ValueError(f"backend should be 'rich' or 'tqdm', got {self.backend} instead.") + raise ValueError( + f"backend should be 'rich' or 'tqdm', got {self.backend} instead." + ) _check_backend_support(backend, caller_name="Progressbar") self.backend = backend @@ -96,10 +98,12 @@ def __getstate__(self): try: from rich.progress import Progress + class _Progress(Progress): def get_renderables(self): table = self.make_tasks_table(getattr(self, "_ordered_tasks", [])) yield table + except: pass diff --git a/sklearn/callback/_text_verbose.py b/sklearn/callback/_text_verbose.py index 0064ec97f2052..93f783a297d30 100644 --- a/sklearn/callback/_text_verbose.py +++ b/sklearn/callback/_text_verbose.py @@ -6,7 +6,6 @@ class TextVerbose(BaseCallback): - auto_propagate = True request_stopping_criterion = True diff --git a/sklearn/callback/tests/_utils.py b/sklearn/callback/tests/_utils.py index 888a5649c19bb..f61ffc4077dff 100644 --- a/sklearn/callback/tests/_utils.py +++ b/sklearn/callback/tests/_utils.py @@ -21,6 +21,7 @@ def on_fit_iter_end(self, estimator, node, **kwargs): class TestingAutoPropagatedCallback(TestingCallback): auto_propagate = True + class NotValidCallback: def on_fit_begin(self, estimator, *, X=None, y=None): pass @@ -37,7 +38,7 @@ def __init__(self, max_iter=20): self.max_iter = max_iter def fit(self, X, y): - root = self._eval_callbacks_on_fit_begin( + root, X, y, X_val, y_val = self._eval_callbacks_on_fit_begin( levels=[ {"descr": "fit", "max_iter": self.max_iter}, {"descr": "iter", "max_iter": None}, @@ -54,6 +55,7 @@ def fit(self, X, y): self._from_reconstruction_attributes, reconstruction_attributes=lambda: {"n_iter_": i + 1}, ), + data={"X": X, "y": y, "X_val": X_val, "y_val": y_val"}, ): break @@ -63,6 +65,9 @@ def fit(self, X, y): return self + def objective_function(self, X, y=None): + return 0, 0, 0 + class MetaEstimator(BaseEstimator): def __init__( @@ -75,7 +80,7 @@ def __init__( self.prefer = prefer def fit(self, X, y): - root = self._eval_callbacks_on_fit_begin( + root, *_ = self._eval_callbacks_on_fit_begin( levels=[ {"descr": "fit", "max_iter": self.n_outer}, {"descr": "outer", "max_iter": self.n_inner}, @@ -93,7 +98,7 @@ def fit(self, X, y): self._eval_callbacks_on_fit_end() return self - + def _func(self, estimator, X, y, parent_node, i): for j, node in enumerate(parent_node.children): est = clone(estimator) @@ -104,4 +109,4 @@ def _func(self, estimator, X, y, parent_node, i): _eval_callbacks_on_fit_iter_end(estimator=self, 
node=parent_node) - return \ No newline at end of file + return diff --git a/sklearn/callback/tests/test_base_estimator_callback_methods.py b/sklearn/callback/tests/test_base_estimator_callback_methods.py index c77d88b68ce3d..01669a5494dde 100644 --- a/sklearn/callback/tests/test_base_estimator_callback_methods.py +++ b/sklearn/callback/tests/test_base_estimator_callback_methods.py @@ -10,12 +10,13 @@ from sklearn.callback.tests._utils import MetaEstimator -@pytest.mark.parametrize("callbacks", +@pytest.mark.parametrize( + "callbacks", [ TestingCallback(), [TestingCallback()], [TestingCallback(), TestingAutoPropagatedCallback()], - ] + ], ) def test_set_callbacks(callbacks): """Sanity check for the _set_callbacks method""" @@ -49,7 +50,7 @@ def test_propagate_callbacks(): assert hasattr(sub_estimator, "_parent_ct_node") assert not_propagated_callback not in sub_estimator._callbacks - assert propagated_callback in sub_estimator._callbacks + assert propagated_callback in sub_estimator._callbacks def test_propagate_callback_no_callback(): diff --git a/sklearn/callback/tests/test_callbacks.py b/sklearn/callback/tests/test_callbacks.py index fb99003eb3b09..aa79503545acb 100644 --- a/sklearn/callback/tests/test_callbacks.py +++ b/sklearn/callback/tests/test_callbacks.py @@ -20,13 +20,22 @@ y = np.zeros(100, dtype=int) -@pytest.mark.parametrize("Callback", [ConvergenceMonitor, EarlyStopping, ProgressBar, Snapshot, TextVerbose,]) +@pytest.mark.parametrize( + "Callback", + [ + ConvergenceMonitor, + EarlyStopping, + ProgressBar, + Snapshot, + TextVerbose, + ], +) def test_callback_doesnt_hold_ref_to_estimator(Callback): callback = Callback() est = Estimator() callback_refcount = sys.getrefcount(callback) est_refcount = sys.getrefcount(est) - + est._set_callbacks(callbacks=callback) est.fit(X, y) # estimator has a ref on the callback but the callback has no ref to the estimator diff --git a/sklearn/callback/tests/test_computation_tree.py b/sklearn/callback/tests/test_computation_tree.py index 902175b71a250..2fe3766eba489 100644 --- a/sklearn/callback/tests/test_computation_tree.py +++ b/sklearn/callback/tests/test_computation_tree.py @@ -87,7 +87,10 @@ def test_get_ancestors(): ancestors = node.get_ancestors(include_ancestor_trees=False) assert ancestors == [ - node, node.parent, node.parent.parent, node.parent.parent.parent + node, + node.parent, + node.parent.parent, + node.parent.parent.parent, ] assert [n.idx for n in ancestors] == expected_node_indices assert computation_tree.root in ancestors diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 003dbec919033..a3eac9c7e3468 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -36,7 +36,6 @@ ) from ..utils import metadata_routing from ..callback._base import _eval_callbacks_on_fit_iter_end -from ..callback._base import callback_aware EPSILON = np.finfo(np.float32).eps @@ -403,6 +402,7 @@ def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle, random_state): def _fit_coordinate_descent( X, + X_val, W, H, tol=1e-4, @@ -429,6 +429,9 @@ def _fit_coordinate_descent( X : array-like of shape (n_samples, n_features) Constant matrix. + X_val : array-like of shape (n_samples_val, n_features) + Constant validation matrix. + W : array-like of shape (n_samples, n_components) Initial guess for the solution. @@ -469,6 +472,12 @@ def _fit_coordinate_descent( results across multiple function calls. See :term:`Glossary `. 
+    estimator : estimator instance, default=None
+        The estimator calling this function. Used by callbacks.
+
+    parent_node : ComputationNode instance, default=None
+        The parent node of the current node. Used by callbacks.
+
     Returns
     -------
     W : ndarray of shape (n_samples, n_components)
@@ -490,6 +499,8 @@ def _fit_coordinate_descent(
     # so W and Ht are both in C order in memory
     Ht = check_array(H.T, order="C")
     X = check_array(X, accept_sparse="csr")
+    if X_val is not None:
+        X_val = check_array(X_val, accept_sparse="csr")
 
     rng = check_random_state(random_state)
 
@@ -527,6 +538,7 @@ def _fit_coordinate_descent(
                     "reconstruction_err_": _beta_divergence(X, W, Ht.T, 2, True),
                 },
             ),
+            data={"X": X, "y": None, "X_val": X_val, "y_val": None},
         ):
             break
 
@@ -748,6 +760,7 @@ def _multiplicative_update_h(
 
 def _fit_multiplicative_update(
     X,
+    X_val,
     W,
     H,
     beta_loss="frobenius",
@@ -773,6 +786,9 @@ def _fit_multiplicative_update(
     X : array-like of shape (n_samples, n_features)
         Constant input matrix.
 
+    X_val : array-like of shape (n_samples_val, n_features)
+        Constant validation matrix.
+
     W : array-like of shape (n_samples, n_components)
         Initial guess for the solution.
 
@@ -813,6 +829,12 @@ def _fit_multiplicative_update(
     verbose : int, default=0
         The verbosity level.
 
+    estimator : estimator instance, default=None
+        The estimator calling this function. Used by callbacks.
+
+    parent_node : ComputationNode instance, default=None
+        The parent node of the current node. Used by callbacks.
+
     Returns
     -------
     W : ndarray of shape (n_samples, n_components)
@@ -909,6 +931,7 @@ def _fit_multiplicative_update(
                     "reconstruction_err_": _beta_divergence(X, W, H, beta_loss, True),
                 },
             ),
+            data={"X": X, "y": None, "X_val": X_val, "y_val": None},
         ):
             break
 
@@ -1340,6 +1363,28 @@ def inverse_transform(self, Xt=None, W=None):
         check_is_fitted(self)
         return Xt @ self.components_
 
+    def objective_function(self, X, y=None, *, W=None, H=None, normalize=False):
+        if W is None:
+            W = self.transform(X)
+        if H is None:
+            H = self.components_
+
+        data_fit = _beta_divergence(X, W, H, self._beta_loss)
+
+        l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = self._compute_regularization(X)
+        penalization = (
+            l1_reg_W * W.sum()
+            + l1_reg_H * H.sum()
+            + l2_reg_W * (W**2).sum()
+            + l2_reg_H * (H**2).sum()
+        )
+
+        if normalize:
+            data_fit /= X.shape[0]
+            penalization /= X.shape[0]
+
+        return data_fit + penalization, data_fit, penalization
+
     @property
     def _n_features_out(self):
@@ -1617,7 +1662,6 @@ def _check_params(self, X):
 
         return self
 
-    @callback_aware
     @_fit_context(prefer_skip_nested_validation=True)
     def fit_transform(self, X, y=None, W=None, H=None):
         """Learn a NMF model for the data X and returns the transformed data.
@@ -1650,7 +1694,7 @@ def fit_transform(self, X, y=None, W=None, H=None):
             X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32]
         )
 
-        root = self._eval_callbacks_on_fit_begin(
+        root, X, _, X_val, _ = self._eval_callbacks_on_fit_begin(
             levels=[
                 {"descr": "fit", "max_iter": self.max_iter},
                 {"descr": "iter", "max_iter": None},
             ],
             X=X,
         )
 
-        W, H, n_iter = self._fit_transform(X, W=W, H=H, parent_node=root)
+        W, H, n_iter = self._fit_transform(X, X_val, W=W, H=H, parent_node=root)
 
         self.reconstruction_err_ = _beta_divergence(
             X, W, H, self._beta_loss, square_root=True
@@ -1672,7 +1716,7 @@ def fit_transform(self, X, y=None, W=None, H=None):
         return W
 
     def _fit_transform(
-        self, X, y=None, W=None, H=None, update_H=True, parent_node=None
+        self, X, X_val=None, W=None, H=None, update_H=True, parent_node=None
     ):
         """Learn a NMF model for the data X and returns the transformed data.
 
@@ -1733,6 +1777,7 @@ def _fit_transform(
         if self.solver == "cd":
             W, H, n_iter = _fit_coordinate_descent(
                 X,
+                X_val,
                 W,
                 H,
                 self.tol,
@@ -1751,6 +1796,7 @@ def _fit_transform(
         elif self.solver == "mu":
             W, H, n_iter, *_ = _fit_multiplicative_update(
                 X,
+                X_val,
                 W,
                 H,
                 self._beta_loss,
@@ -2439,28 +2485,6 @@ def partial_fit(self, X, y=None, W=None, H=None):
 
         return self
 
-    def objective_function(self, X, y=None, *, W=None, H=None, normalize=False):
-        if W is None:
-            W = self.transform(X)
-        if H is None:
-            H = self.components_
-
-        data_fit = _beta_divergence(X, W, H, self._beta_loss)
-
-        l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = self._scale_regularization(X)
-        penalization = (
-            l1_reg_W * W.sum()
-            + l1_reg_H * H.sum()
-            + l2_reg_W * (W ** 2).sum()
-            + l2_reg_H * (H ** 2).sum()
-        )
-
-        if normalize:
-            data_fit /= X.shape[0]
-            penalization /= X.shape[0]
-
-        return data_fit + penalization, data_fit, penalization
-
     @property
     def _n_features_out(self):
         """Number of transformed output features."""
diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py
index 732f3c25a93eb..b949a35b28d02 100644
--- a/sklearn/linear_model/_logistic.py
+++ b/sklearn/linear_model/_logistic.py
@@ -449,9 +449,7 @@ def _logistic_regression_path(
     node = (
         None
         if parent_node is None
-        else parent_node
-        if len(Cs) == 1
-        else parent_node.children
+        else parent_node if len(Cs) == 1 else parent_node.children
     )
 
     if solver == "lbfgs":
@@ -1324,7 +1322,9 @@ def fit(self, X, y, sample_weight=None):
             {"descr": "class", "max_iter": self.max_iter},
             {"descr": "iter", "max_iter": None},
         ]
-        root = self._eval_callbacks_on_fit_begin(levels=levels, X=X, y=y)
+        root, X, y, X_val, y_val = self._eval_callbacks_on_fit_begin(
+            levels=levels, X=X, y=y
+        )
 
         # distinguish between multinomial and ovr
         nodes = [root] if len(classes_) == 1 else root.children
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index 9b9450cfee0ec..9ec5ce8414201 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -831,7 +831,9 @@ def fit(self, X, y=None, *, groups=None, **fit_params):
         all_out = []
         all_more_results = defaultdict(list)
 
-        def evaluate_candidates(candidate_params, cv=None, more_results=None, parent_node=None):
+        def evaluate_candidates(
+            candidate_params, cv=None, more_results=None, parent_node=None
+        ):
             cv = cv or cv_orig
             candidate_params = list(candidate_params)
             n_candidates = len(candidate_params)
@@ -863,8 +865,16 @@ def evaluate_candidates(candidate_params, cv=None, more_results=None, parent_nod
                     caller=self,
                    node=node,
                )
-                for ((cand_idx, parameters), (split_idx, (train, test))), node in zip(product(
-                    enumerate(candidate_params), enumerate(cv.split(X, y, groups))), nodes)
+                for (
+                    (cand_idx, parameters),
+                    (split_idx, (train, test)),
+                ), node in zip(
+                    product(
+                        enumerate(candidate_params),
+                        enumerate(cv.split(X, y, groups)),
+                    ),
+                    nodes,
+                )
             )
 
             if len(out) < 1:
@@ -1477,6 +1487,7 @@ def _run_search(self, evaluate_candidates):
         """Search all candidates in param_grid"""
         evaluate_candidates(self._param_grid, parent_node=self._computation_tree.root)
 
+
 class RandomizedSearchCV(BaseSearchCV):
     """Randomized search on hyper parameters.
 
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index cb3563723027c..30fc160880d89 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -723,7 +723,7 @@ def _fit_and_score(
                 cloned_parameters[k] = clone(v, safe=False)
 
         estimator = estimator.set_params(**cloned_parameters)
-    
+
     if caller is not None:
         caller._propagate_callbacks(estimator, parent_node=node)
 
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 9002bfcb0d8ad..0eb02009ecf91 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -34,7 +34,6 @@
 from .utils.parallel import delayed, Parallel
 from .exceptions import NotFittedError
 from .callback._base import _eval_callbacks_on_fit_iter_end
-from .callback._base import callback_aware
 
 __all__ = ["Pipeline", "FeatureUnion", "make_pipeline", "make_union"]
 
@@ -357,7 +356,7 @@ def _fit(self, X, y=None, **fit_params_steps):
         # Setup the memory
         memory = check_memory(self.memory)
 
-        root = self._eval_callbacks_on_fit_begin(
+        root, *_ = self._eval_callbacks_on_fit_begin(
             levels=[
                 {"descr": "fit", "max_iter": len(self.steps)},
                 {"descr": "step", "max_iter": None},
@@ -405,7 +404,6 @@ def _fit(self, X, y=None, **fit_params_steps):
 
         return X
 
-    @callback_aware
     @_fit_context(
         # estimators in Pipeline.steps are not validated yet
         prefer_skip_nested_validation=False

From d7208facafece078b0c8e687dc066b432eac2cbc Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Thu, 29 Jun 2023 15:04:29 +0200
Subject: [PATCH 19/20] wip

---
 sklearn/base.py                                       |   2 +-
 sklearn/callback/__init__.py                          |   4 +-
 sklearn/callback/_convergence_monitor.py              | 126 ------------------
 sklearn/callback/_early_stopping.py                   |  16 +--
 sklearn/callback/_monitoring.py                       | 111 +++++++++++++++
 sklearn/callback/_text_verbose.py                     |   4 -
 sklearn/callback/tests/_utils.py                      |  18 +--
 .../tests/test_base_estimator_callback_methods.py     |   2 +-
 sklearn/callback/tests/test_callbacks.py              |   4 +-
 sklearn/decomposition/_nmf.py                         |   2 -
 .../gradient_boosting.py                              |  46 +++++++
 sklearn/linear_model/_logistic.py                     |   2 -
 sklearn/model_selection/_search.py                    |   2 -
 sklearn/pipeline.py                                   |   2 -
 14 files changed, 181 insertions(+), 112 deletions(-)
 delete mode 100644 sklearn/callback/_convergence_monitor.py
 create mode 100644 sklearn/callback/_monitoring.py

diff --git a/sklearn/base.py b/sklearn/base.py
index 9c802b536f89d..09c76277b986e 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -119,7 +119,7 @@ def _clone_parametrized(estimator, *, safe=True):
 
     # copy callbacks
     if hasattr(estimator, "_callbacks"):
-        new_object._callbacks = clone(estimator._callbacks, safe=False)
+        new_object._callbacks = estimator._callbacks
 
     # quick sanity check of the parameters of the clone
     for name in new_object_params:
diff --git a/sklearn/callback/__init__.py b/sklearn/callback/__init__.py
index 9767411b6c934..b74126e1ce327 100644
--- a/sklearn/callback/__init__.py
+++ b/sklearn/callback/__init__.py
@@ -4,7 +4,7 @@
 from ._computation_tree import ComputationNode
 from ._computation_tree import ComputationTree
 from ._computation_tree import load_computation_tree
-from ._convergence_monitor import ConvergenceMonitor
+from ._monitoring import Monitoring
 from ._early_stopping import EarlyStopping
 from ._progressbar import ProgressBar
 from ._snapshot import Snapshot
@@ -15,7 +15,7 @@
     "ComputationNode",
     "ComputationTree",
     "load_computation_tree",
-    "ConvergenceMonitor",
+    "Monitoring",
     "EarlyStopping",
     "ProgressBar",
     "Snapshot",
diff --git a/sklearn/callback/_convergence_monitor.py b/sklearn/callback/_convergence_monitor.py
deleted file mode 100644
index 98fec496d6eb7..0000000000000
--- a/sklearn/callback/_convergence_monitor.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# License: BSD 3 clause
-
-from copy import copy
-from pathlib import Path
-from tempfile import mkdtemp
-
-import matplotlib.pyplot as plt
-import pandas as pd
-
-from . import BaseCallback
-
-# import ..metrics as metrics
-
-
-class ConvergenceMonitor(BaseCallback):
-    """Monitor model convergence.
-
-    Parameters
-    ----------
-    monitor :
-
-    X_val : ndarray, default=None
-        Validation data
-
-    y_val : ndarray, default=None
-        Validation target
-
-    Attributes
-    ----------
-    data : pandas.DataFrame
-        The monitored quantities at each iteration.
-    """
-
-    request_reconstruction_attributes = True
-
-    def __init__(
-        self,
-        *,
-        monitor="objective_function",
-        on="val",
-        higher_is_better=False,
-    ):
-        if monitor == "objective_function":
-            self._monitor = "objective_function"
-        else:
-            self._monitor = getattr(metrics, monitor, None)
-            if self._monitor is None:
-                raise ValueError(f"unknown metric {monitor}")
-
-        self._data_file = Path(mkdtemp()) / "convergence_monitor.csv"
-
-    def on_fit_begin(self, estimator, *, X=None, y=None):
-        self.estimator = estimator
-        self.X_train = X
-        self.y_train = y
-
-    def on_fit_iter_end(self, *, estimator, node, **kwargs):
-        reconstruction_attributes = kwargs.get("reconstruction_attributes", None)
-        if reconstruction_attributes is None:
-            return
-
-        new_estimator = copy(estimator)
-        for key, val in reconstruction_attributes.items():
-            setattr(new_estimator, key, val)
-
-        # if self._monitor =
-
-        obj_train, *_ = new_estimator.objective_function(
-            self.X_train, self.y_train, normalize=True
-        )
-        if self.X_val is not None:
-            obj_val, *_ = new_estimator.objective_function(
-                self.X_val, self.y_val, normalize=True
-            )
-        else:
-            obj_val = None
-
-        ancestors = node.get_ancestors()[:0:-1]
-        ancestors_desc = [
-            f"{n.computation_tree.estimator_name}-{n.description}" for n in ancestors
-        ]
-        ancestors_idx = [f"{n.idx}" for n in ancestors]
-
-        if not self._data_file.exists():
-            with open(self._data_file, "w") as f:
-                f.write(
-                    f"{','.join(ancestors_desc)},iteration,time,obj_train,obj_val\n"
-                )
-
-        with open(self._data_file, "a") as f:
-            f.write(
-                f"{','.join(ancestors_idx)},{node.idx},{curr_time},{obj_train},{obj_val}\n"
-            )
-
-    def on_fit_end(self):
-        pass
-
-    def get_data(self):
-        if not hasattr(self, "data"):
-            self.data = pd.read_csv(self._data_file)
-        return self.data
-
-    def plot(self, x="iteration"):
-        data = self.get_data()
-
-        # all columns but iteration, time, obj_train, obj_val
-        group_by_columns = list(data.columns[:-4])
-        groups = data.groupby(group_by_columns)
-
-        for key in groups.groups.keys():
-            group = groups.get_group(key)
-            fig, ax = plt.subplots()
-
-            ax.plot(group[x], group["obj_train"], label="obj_train")
-            if self.X_val is not None:
-                ax.plot(group[x], group["obj_val"], label="obj_val")
-
-            if x == "iteration":
-                x_label = "Number of iterations"
-            elif x == "time":
-                x_label = "Time (s)"
-            ax.set_xlabel(x_label)
-            ax.set_ylabel("objective function")
-
-            ax.legend()
-            plt.show()
diff --git a/sklearn/callback/_early_stopping.py b/sklearn/callback/_early_stopping.py
index 6d408dda8c960..1ad9ad8437d37 100644
--- a/sklearn/callback/_early_stopping.py
+++ b/sklearn/callback/_early_stopping.py
@@ -25,13 +25,14 @@ def __init__(
         self.max_no_improvement = max_no_improvement
         self.threshold = threshold
 
-    def on_fit_begin(self, estimator, X=None, y=None):
         self._no_improvement = {}
         self._last_monitored = {}
-        self.early_stopped_ = None
+
+    def on_fit_begin(self, estimator, X=None, y=None):
+        pass
 
     def on_fit_iter_end(self, *, estimator, node, **kwargs):
-        if node.depth != estimator._computation_tree.depth:
+        if node.depth != node.computation_tree.depth:
             return
 
@@ -46,8 +47,8 @@ def on_fit_iter_end(self, *, estimator, node, **kwargs):
         elif self.monitor is None or isinstance(self.monitor, str):
             from ..metrics import check_scoring
-            scorer = check_scoring(estimator, self.monitor)
-            new_monitored = scorer(estimator, X, y)
+            scorer = check_scoring(reconstructed_estimator, self.monitor)
+            new_monitored = scorer(reconstructed_estimator, X, y)
 
         if self._score_improved(node, new_monitored):
             self._no_improvement[node.parent] = 0
@@ -56,9 +57,8 @@ def on_fit_iter_end(self, *, estimator, node, **kwargs):
 
         if self._no_improvement[node.parent] >= self.max_no_improvement:
-            self.early_stopped_ = node.idx
             return True
-    
+
     def _score_improved(self, node, new_monitored):
@@ -74,4 +74,4 @@ def on_fit_end(self):
 
     @property
     def request_validation_split(self):
-        return self.on == "val"
+        return self.on == "validation_set"
diff --git a/sklearn/callback/_monitoring.py b/sklearn/callback/_monitoring.py
new file mode 100644
index 0000000000000..2eb7c8de5885f
--- /dev/null
+++ b/sklearn/callback/_monitoring.py
@@ -0,0 +1,111 @@
+# License: BSD 3 clause
+
+# import os
+from pathlib import Path
+from tempfile import NamedTemporaryFile, TemporaryDirectory
+from tempfile import mkdtemp
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from . import BaseCallback
+
+
+class Monitoring(BaseCallback):
+    """Monitor model convergence.
+
+    Parameters
+    ----------
+    monitor :
+
+    X_val : ndarray, default=None
+        Validation data
+
+    y_val : ndarray, default=None
+        Validation target
+
+    Attributes
+    ----------
+    data : pandas.DataFrame
+        The monitored quantities at each iteration.
+ """ + + request_from_reconstruction_attributes = True + + def __init__( + self, + *, + monitor="objective_function", + on="validation_set", + validation_split="auto", + ): + from ..model_selection import KFold + self.validation_split = validation_split + if validation_split == "auto": + self.validation_split = KFold(n_splits=5, shuffle=True, random_state=42) + self.monitor = monitor + self.on = on + + self._data_dir = TemporaryDirectory() + self._data_files = {} + + if isinstance(self.monitor, str): + self.monitor_name = self.monitor + elif callable(self.monitor): + self.monitor_name = self.monitor.__name__ + + def on_fit_begin(self, estimator, *, X=None, y=None): + fname = Path(self._data_dir.name) / f"{estimator._computation_tree.uid}.csv" + with open(fname, "w") as file: + file.write(f"iteration,{self.monitor_name}_train,{self.monitor_name}_val\n") + self._data_files[estimator._computation_tree] = fname + + def on_fit_iter_end(self, *, estimator, node, from_reconstruction_attributes, data, **kwargs): + if node.depth != node.computation_tree.depth: + return + + new_estimator = from_reconstruction_attributes + + X, y, X_val, y_val = data["X"], data["y"], data["X_val"], data["y_val"] + + if self.monitor == "objective_function": + new_monitored_train, *_ = new_estimator.objective_function(X, y, normalize=True) + if X_val is not None: + new_monitored_val, *_ = new_estimator.objective_function(X_val, y_val, normalize=True) + elif callable(self.monitor): + new_monitored_train = self.monitor(new_estimator, X, y) + if X_val is not None: + new_monitored_val = self.monitor(new_estimator, X_val, y_val) + elif self.monitor is None or isinstance(self.monitor, str): + from ..metrics import check_scoring + scorer = check_scoring(new_estimator, self.monitor) + new_monitored_train = scorer(new_estimator, X, y) + if X_val is not None: + new_monitored_val = scorer(new_estimator, X_val, y_val) + + if X_val is None: + new_monitored_val = None + + with open(self._data_files[node.computation_tree], "a") as f: + f.write(f"{node.idx},{new_monitored_train},{new_monitored_val}\n") + + def on_fit_end(self): + pass + + # @property + # def data(self): + + def plot(self): + data_files = [p for p in Path(self._data_dir.name).iterdir() if p.is_file()] + for f in data_files: + data = pd.read_csv(f) + fig, ax = plt.subplots() + ax.plot(data["iteration"], data[f"{self.monitor_name}_train"], label=f"train set") + if self.on != "train_set": + ax.plot(data["iteration"], data[f"{self.monitor_name}_val"], label=f"validation set") + + ax.set_xlabel("Number of iterations") + ax.set_ylabel(self.monitor_name) + + ax.legend() + plt.show() diff --git a/sklearn/callback/_text_verbose.py b/sklearn/callback/_text_verbose.py index 93f783a297d30..9773f1c8a6f51 100644 --- a/sklearn/callback/_text_verbose.py +++ b/sklearn/callback/_text_verbose.py @@ -9,11 +9,7 @@ class TextVerbose(BaseCallback): auto_propagate = True request_stopping_criterion = True - def __init__(self, min_time_between_calls=0): - self.min_time_between_calls = min_time_between_calls - def on_fit_begin(self, estimator, X=None, y=None): - self.estimator = estimator self._start_time = time.perf_counter() def on_fit_iter_end(self, *, node, **kwargs): diff --git a/sklearn/callback/tests/_utils.py b/sklearn/callback/tests/_utils.py index f61ffc4077dff..4144ba3ddf3f3 100644 --- a/sklearn/callback/tests/_utils.py +++ b/sklearn/callback/tests/_utils.py @@ -2,7 +2,7 @@ from joblib.parallel import Parallel, delayed -from sklearn.base import BaseEstimator, clone +from sklearn.base 
diff --git a/sklearn/callback/tests/_utils.py b/sklearn/callback/tests/_utils.py
index f61ffc4077dff..4144ba3ddf3f3 100644
--- a/sklearn/callback/tests/_utils.py
+++ b/sklearn/callback/tests/_utils.py
@@ -2,7 +2,7 @@
 
 from joblib.parallel import Parallel, delayed
 
-from sklearn.base import BaseEstimator, clone
+from sklearn.base import BaseEstimator, clone, _fit_context
 from sklearn.callback import BaseCallback
 from sklearn.callback._base import _eval_callbacks_on_fit_iter_end
 
@@ -34,9 +34,12 @@ def on_fit_iter_end(self, estimator, node, **kwargs):
 
 
 class Estimator(BaseEstimator):
+    _parameter_constraints = {}
+
     def __init__(self, max_iter=20):
         self.max_iter = max_iter
 
+    @_fit_context(prefer_skip_nested_validation=False)
     def fit(self, X, y):
         root, X, y, X_val, y_val = self._eval_callbacks_on_fit_begin(
             levels=[
@@ -55,21 +58,21 @@ def fit(self, X, y):
                     self._from_reconstruction_attributes,
                     reconstruction_attributes=lambda: {"n_iter_": i + 1},
                 ),
-                data={"X": X, "y": y, "X_val": X_val, "y_val": y_val"},
+                data={"X": X, "y": y, "X_val": X_val, "y_val": y_val},
             ):
                 break
 
         self.n_iter_ = i + 1
 
-        self._eval_callbacks_on_fit_end()
-
         return self
 
-    def objective_function(self, X, y=None):
+    def objective_function(self, X, y=None, normalize=False):
         return 0, 0, 0
 
 
 class MetaEstimator(BaseEstimator):
+    _parameter_constraints = {}
+
     def __init__(
         self, estimator, n_outer=4, n_inner=3, n_jobs=None, prefer="processes"
     ):
@@ -79,8 +82,9 @@ def __init__(
         self.n_jobs = n_jobs
         self.prefer = prefer
 
+    @_fit_context(prefer_skip_nested_validation=False)
     def fit(self, X, y):
-        root, *_ = self._eval_callbacks_on_fit_begin(
+        root, X, y, _, _ = self._eval_callbacks_on_fit_begin(
             levels=[
                 {"descr": "fit", "max_iter": self.n_outer},
                 {"descr": "outer", "max_iter": self.n_inner},
@@ -95,8 +99,6 @@ def fit(self, X, y):
             for i, node in enumerate(root.children)
         )
 
-        self._eval_callbacks_on_fit_end()
-
         return self
 
     def _func(self, estimator, X, y, parent_node, i):
diff --git a/sklearn/callback/tests/test_base_estimator_callback_methods.py b/sklearn/callback/tests/test_base_estimator_callback_methods.py
index 01669a5494dde..2f554101dcfa3 100644
--- a/sklearn/callback/tests/test_base_estimator_callback_methods.py
+++ b/sklearn/callback/tests/test_base_estimator_callback_methods.py
@@ -88,7 +88,7 @@ def test_eval_callbacks_on_fit_begin():
         {"descr": "fit", "max_iter": 10},
         {"descr": "iter", "max_iter": None},
     ]
-    ct_root = estimator._eval_callbacks_on_fit_begin(levels=levels)
+    ct_root, *_ = estimator._eval_callbacks_on_fit_begin(levels=levels)
 
     assert hasattr(estimator, "_computation_tree")
     assert ct_root is estimator._computation_tree.root
diff --git a/sklearn/callback/tests/test_callbacks.py b/sklearn/callback/tests/test_callbacks.py
index aa79503545acb..e453705b637e1 100644
--- a/sklearn/callback/tests/test_callbacks.py
+++ b/sklearn/callback/tests/test_callbacks.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 
-from sklearn.callback import ConvergenceMonitor
+from sklearn.callback import Monitoring
 from sklearn.callback import EarlyStopping
 from sklearn.callback import ProgressBar
 from sklearn.callback import Snapshot
@@ -23,7 +23,7 @@
 @pytest.mark.parametrize(
     "Callback",
     [
-        ConvergenceMonitor,
+        Monitoring,
         EarlyStopping,
         ProgressBar,
         Snapshot,
diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py
index a3eac9c7e3468..8cd485114ac9c 100644
--- a/sklearn/decomposition/_nmf.py
+++ b/sklearn/decomposition/_nmf.py
@@ -1711,8 +1711,6 @@ def fit_transform(self, X, y=None, W=None, H=None):
         self.components_ = H
         self.n_iter_ = n_iter
 
-        self._eval_callbacks_on_fit_end()
-
         return W
 
     def _fit_transform(
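
[editor's note, not part of the patch] The hunks above establish the data
passed to _eval_callbacks_on_fit_iter_end: a data dict with "X", "y", "X_val"
and "y_val" keys, plus a lazy from_reconstruction_attributes factory that
yields a usable copy of the estimator mid-fit. A sketch of a callback
consuming both; it assumes, as Monitoring's signature suggests, that the
dispatcher materializes the factory before invoking callbacks that set
request_from_reconstruction_attributes:

    from sklearn.callback import BaseCallback

    class ValidationScore(BaseCallback):
        """Track estimator.score on the validation set at each iteration."""

        request_from_reconstruction_attributes = True

        def on_fit_begin(self, estimator, X=None, y=None):
            self.scores_ = []

        def on_fit_iter_end(
            self, *, estimator, node, from_reconstruction_attributes=None,
            data=None, **kwargs
        ):
            if from_reconstruction_attributes is None or data is None:
                return
            model = from_reconstruction_attributes
            if data.get("X_val") is not None:
                self.scores_.append(model.score(data["X_val"], data["y_val"]))

        def on_fit_end(self):
            pass
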
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index e44b6428f8f4e..e5df230279b59 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -19,6 +19,7 @@
 )
 from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier
 from ...base import _fit_context
+from ...callback._base import _eval_callbacks_on_fit_iter_end
 from ...utils import check_random_state, resample, compute_sample_weight
 from ...utils.validation import (
     check_is_fitted,
@@ -462,6 +463,17 @@ def fit(self, X, y, sample_weight=None):
             X_train, y_train, sample_weight_train = X, y, sample_weight
             X_val = y_val = sample_weight_val = None
 
+        begin_at_stage = 0 if not (self._is_fitted() and self.warm_start) else self.n_iter_
+
+        root, X_train, y_train, X_val, y_val = self._eval_callbacks_on_fit_begin(
+            levels=[
+                {"descr": "fit", "max_iter": self.max_iter - begin_at_stage},
+                {"descr": "iter", "max_iter": None},
+            ],
+            X=X,
+            y=y,
+        )
+
         # Bin the data
         # For ease of use of the API, the user-facing GBDT classes accept the
         # parameter max_bins, which doesn't take into account the bin for
@@ -756,6 +768,26 @@ def fit(self, X, y, sample_weight=None):
             if should_early_stop:
                 break
 
+            if _eval_callbacks_on_fit_iter_end(
+                estimator=self,
+                node=root.children[iteration - begin_at_stage],
+                fit_state={},
+                from_reconstruction_attributes=partial(
+                    self._from_reconstruction_attributes,
+                    reconstruction_attributes=lambda: {
+                        "train_score_": np.asarray(self.train_score_),
+                        "validation_score_": np.asarray(self.validation_score_),
+                    },
+                ),
+                data={
+                    "X": X_binned_train,
+                    "y": y_train,
+                    "X_val": X_binned_val,
+                    "y_val": y_val
+                },
+            ):
+                break
+
         if self.verbose:
             duration = time() - fit_start_time
             n_total_leaves = sum(
@@ -794,8 +826,22 @@ def fit(self, X, y, sample_weight=None):
         self.train_score_ = np.asarray(self.train_score_)
         self.validation_score_ = np.asarray(self.validation_score_)
         del self._in_fit  # hard delete so we're sure it can't be used anymore
+
         return self
 
+    def objective_function(self, X, y, *, raw_predictions=None, normalize=False):
+        if raw_predictions is None:
+            raw_predictions = self._raw_predict(X)
+
+        loss = self._loss(
+            y_true=y,
+            raw_prediction=raw_predictions,
+        )
+        if normalize:
+            loss /= raw_predictions.shape[0]
+
+        return loss, loss, 0
+
     def _is_fitted(self):
         return len(getattr(self, "_predictors", [])) > 0
diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py
index b949a35b28d02..dcaf6377dbe1f 100644
--- a/sklearn/linear_model/_logistic.py
+++ b/sklearn/linear_model/_logistic.py
@@ -1397,8 +1397,6 @@ def fit(self, X, y, sample_weight=None):
         else:
             self.intercept_ = np.zeros(n_classes)
 
-        self._eval_callbacks_on_fit_end()
-
         return self
 
     def predict_proba(self, X):
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index 9ec5ce8414201..c6c718ca0d4fa 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -1481,8 +1481,6 @@ def fit(self, X, y=None, *, groups=None, **fit_params):
         )
         super().fit(X, y=y, groups=groups, **fit_params)
 
-        self._eval_callbacks_on_fit_end()
-
     def _run_search(self, evaluate_candidates):
         """Search all candidates in param_grid"""
         evaluate_candidates(self._param_grid, parent_node=self._computation_tree.root)
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 0eb02009ecf91..0d563cbb10c12 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -447,8 +447,6 @@ def fit(self, X, y=None, **fit_params):
 
             _eval_callbacks_on_fit_iter_end(estimator=self, node=node)
 
-        self._eval_callbacks_on_fit_end()
-
         return self
 
     def _can_fit_transform(self):

From b8ac1a5e86aeee791675aebd36758c23831a3efa Mon Sep 17 00:00:00 2001
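
[editor's note, not part of the patch] The objective_function methods added in
this series return a three-element tuple whose first element is the monitored
value, so callers unpack it as `obj, *_ = estimator.objective_function(X, y,
normalize=True)`, which is how Monitoring consumes it. With the hunks above,
histogram gradient boosting can be monitored end to end; a sketch against this
branch's private API:

    from sklearn.callback import Monitoring
    from sklearn.datasets import make_classification
    from sklearn.ensemble import HistGradientBoostingClassifier

    X, y = make_classification(n_samples=500, random_state=0)

    monitoring = Monitoring(monitor="objective_function", on="validation_set")
    hgbt = HistGradientBoostingClassifier(max_iter=50, random_state=0)
    hgbt._set_callbacks(monitoring)
    hgbt.fit(X, y)
    monitoring.plot()
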
b8ac1a5e86aeee791675aebd36758c23831a3efa Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Wed, 18 Oct 2023 12:10:21 +0200
Subject: [PATCH 20/20] cln

---
 sklearn/callback/_progressbar.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/callback/_progressbar.py b/sklearn/callback/_progressbar.py
index f802cc63b3b9a..f8ed251add34a 100644
--- a/sklearn/callback/_progressbar.py
+++ b/sklearn/callback/_progressbar.py
@@ -103,7 +103,7 @@ def get_renderables(self):
             table = self.make_tasks_table(getattr(self, "_ordered_tasks", []))
             yield table
 
-except:
+except ImportError:
     pass
 
 
@@ -262,7 +262,8 @@ def _format_task_description(self, node, computation_tree, depth):
         description = f"{computation_tree.estimator_name} - {node.description}"
         if node.parent is None and computation_tree.parent_node is not None:
             description = (
-                f"{computation_tree.parent_node.description} {computation_tree.parent_node.idx} |"
+                f"{computation_tree.parent_node.description} "
+                f"{computation_tree.parent_node.idx} |"
                 f" {description}"
             )
         if node.parent is not None:
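
[editor's note, not part of the patch] This last cleanup commit narrows a bare
`except:` to `except ImportError:` (the guarded block subclasses rich's
Progress renderer, so only a missing optional dependency should be silenced)
and splits an overlong f-string. A closing sketch; it assumes rich is
installed and, like the other auto-propagated callbacks in this series, that a
ProgressBar set on a meta-estimator propagates to its sub-estimators:

    from sklearn.callback import ProgressBar

    estimator._set_callbacks(ProgressBar())
    estimator.fit(X, y)  # one progress bar per level of the computation tree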