From ae216ce3ac3f20d46b723fe0083b4c32c30337af Mon Sep 17 00:00:00 2001 From: Jovan Stojanovic Date: Wed, 3 Jul 2024 15:45:37 +0200 Subject: [PATCH 1/2] add parameter description --- hazardous/data/_competing_weibull.py | 68 +++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 2b15f9c..d72ddbc 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -20,6 +20,25 @@ def _censor(y, relative_scale, random_state=None): + """Censoring a population based on a relative scale. + + Individuals are censored by sampling a censoring time from + a Weibull distribution with shape 1 and scale equal to + the mean duration of the target event times the + ``censoring_relative_scale``. + + Parameters + ---------- + y: ndarray + The target population. + relative_scale: float + Relative scale of the censoring. Setting it to 0 or None + disables censoring, setting it to a small value (e.g. 0.5 + instead of 1.5) will result in a larger fraction of + censored individuals. + + """ + if relative_scale == 0 or relative_scale is None: return y @@ -59,13 +78,48 @@ def make_synthetic_competing_weibull( event (competing risks setting) and its event identifier and duration are returned as the target dataframe. - A fraction of the individuals are censored by sampling a censoring time - from a Weibull distribution with shape 1 and scale equal to the mean - duration of the target event times the ``censoring_relative_scale``. + A fraction of the individuals are censored if ``censoring_relative_scale`` + is not None or 0. + + Parameters + ---------- + n_events: int, default=3 + Number of events. + n_samples: int, default=3000 + Number of samples by event type. + return_X_y: bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. + feature_rounding: int or None, default=2 + Round the feature values. 
If None, no rounding will be applied. + target_rounding: int or None, default=1 + Round the target values. If None, no rounding will be applied. + shape_ranges: tuple of shape (2, n_events) + The lower and upper boundary of the shape, `n_samples` shape + values for `n_events` will be drawn from a uniform distribution. + scale_ranges: tuple of shape (2, n_events) + The lower and upper boundary of the scale, `n_samples` scale + values for `n_events` will be drawn from a uniform distribution. + base_scale: int, default=1000 + Scaling parameter of the ``scale_range``. + censoring_relative_scale: float, default=1.5 + Relative scale of the censoring level. Individuals are censored by + sampling a censoring time from a Weibull distribution with shape 1 + and scale equal to the mean duration of the target event times + the ``censoring_relative_scale``. + Setting ``censoring_relative_scale`` to 0 or None disables censoring. + Setting it to a small value (e.g. 0.5 instead of 1.5) will result in a + larger fraction of censored individuals. + random_state : int, RandomState instance or None, default=None + Controls the randomness of the uniform time sampler. + + Returns + ------- + (data, target): tuple if ``return_X_y`` is True + A tuple of two ndarray. The first containing a 2D array of shape + (n_samples, n_features) with each row representing one sample + and each column representing the events. The second ndarray + of shape (n_samples,) containing the target samples. - Setting ``censoring_relative_scale`` to 0 or None disables censoring. - Setting it to a small value (e.g. 0.5 instead of 1.5) will result in a - larger fraction of censored individuals. 
""" rng = check_random_state(random_state) all_features = [] @@ -101,4 +155,4 @@ def make_synthetic_competing_weibull( return X, y frame = pd.concat([X, y], axis=1) - return Bunch(data=frame[X.columns], target=X[y.columns], frame=frame) + return Bunch(data=frame[X.columns], target=frame[y.columns], frame=frame) From 05e08cac1d11eaf5da55c39cdc8d63acab17784a Mon Sep 17 00:00:00 2001 From: Jovan Stojanovic Date: Wed, 3 Jul 2024 16:27:19 +0200 Subject: [PATCH 2/2] improve explanations --- hazardous/data/_competing_weibull.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index d72ddbc..e6d6893 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -25,7 +25,7 @@ def _censor(y, relative_scale, random_state=None): Individuals are censored by sampling a censoring time from a Weibull distribution with shape 1 and scale equal to the mean duration of the target event times the - ``censoring_relative_scale``. + ``relative_scale``. Parameters ---------- @@ -86,17 +86,17 @@ def make_synthetic_competing_weibull( n_events: int, default=3 Number of events. n_samples: int, default=3000 - Number of samples by event type. + Number of individuals in the population. return_X_y: bool, default=False If True, returns ``(data, target)`` instead of a Bunch object. feature_rounding: int or None, default=2 Round the feature values. If None, no rounding will be applied. target_rounding: int or None, default=1 Round the target values. If None, no rounding will be applied. - shape_ranges: tuple of shape (2, n_events) + shape_ranges: tuple of shape (n_events, 2) The lower and upper boundary of the shape, `n_samples` shape values for `n_events` will be drawn from a uniform distribution. 
-    scale_ranges: tuple of shape (2, n_events)
+    scale_ranges: tuple of shape (n_events, 2)
         The lower and upper boundary of the scale, `n_samples` scale
         values for `n_events` will be drawn from a uniform distribution.
     base_scale: int, default=1000
@@ -115,10 +115,10 @@ def make_synthetic_competing_weibull(
     Returns
     -------
     (data, target): tuple if ``return_X_y`` is True
-        A tuple of two ndarray. The first containing a 2D array of shape
+        A tuple of two dataframes. The first containing a 2D array of shape
         (n_samples, n_features) with each row representing one sample
-        and each column representing the events. The second ndarray
-        of shape (n_samples,) containing the target samples.
+        and each column representing the features. The second dataframe
+        of shape (n_samples, 2) containing the target samples.
     """
     rng = check_random_state(random_state)