From 686d7581496fde08dc0fdcba59e46d47c89580d6 Mon Sep 17 00:00:00 2001 From: RianneSchouten Date: Wed, 27 Jun 2018 15:21:12 +0200 Subject: [PATCH 001/163] add example multiple imputation --- examples/plot_multiple_imputation.py | 389 +++++++++++++++++++++++++++ 1 file changed, 389 insertions(+) create mode 100644 examples/plot_multiple_imputation.py diff --git a/examples/plot_multiple_imputation.py b/examples/plot_multiple_imputation.py new file mode 100644 index 0000000000000..ac3c8bf1d6d08 --- /dev/null +++ b/examples/plot_multiple_imputation.py @@ -0,0 +1,389 @@ +""" +================================================= +Imputing missing values using multiple imputation +================================================= + +By default, the ChainedImputer performs single imputation: a method where every +missing value is replaced with one imputed value. The strength of the method is +that it allows for finding unbiased statistical estimates due to its chained +character. However, the disadvantage is that every imputed value is treated as +if the value was observed, leading to an imputed dataset that does not reflect +the uncertainty that occurs due to the presence of missing values. This makes it +hard to find valid statistical inferences because the variance (and standard error) +of statistical estimates become too small. + +An alternative is using the ChainedImputer to perform multiple imputation: a method +where every missing value is imputed multiple times. The procedure results in +multiple datasets where the observed data is similar in every dataset, but the imputed +data is different. All desired steps after imputation are performed on every dataset, +including the analysis. Then, Rubin's pooling rules are used to combine the estimates +into one final result. + +In this example we will show how to use the ChainedImputer to perform multiple imputation, +what the effect is on the standard error of beta coefficients and how to set up a prediction +model using multiple imputation. 
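+
+As a brief sketch of the pooling step (the notation below is introduced only
+for this explanation and follows the multiple imputation literature): if the
+m imputed datasets yield estimates Q_1, ..., Q_m with variances U_1, ..., U_m,
+Rubin's rules pool them as Qbar = (1/m) * sum(Q_i), with within-imputation
+variance Ubar = (1/m) * sum(U_i), between-imputation variance
+B = (1/(m-1)) * sum((Q_i - Qbar)^2), and total variance T = Ubar + B + B/m.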
+""" + +import numpy as np +from scipy import stats +import matplotlib.pyplot as plt + +from sklearn.datasets import load_boston +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.impute import SimpleImputer, ChainedImputer +from sklearn.metrics import mean_squared_error as mse + +rng = np.random.RandomState(0) + +def ampute(X, missing_rate = 0.75, mech = "MCAR"): + + n_samples = X.shape[0] + n_features = X.shape[1] + X_incomplete = X.copy() + + # MCAR mechanism + if mech == 'MCAR': + for i in np.arange(n_features): + dropped_indices = np.array(np.random.choice(np.arange(n_samples), size=int(missing_rate * n_samples), replace=False)) + X_incomplete[dropped_indices[:, None], i] = None + + # MNAR mechanism + if mech == "MNAR": + for i in np.arange(n_features): + data_values = -np.mean(X[:, i]) + X[:, i] + weights = list(map(lambda x: math.exp(x) / (1 + math.exp(x)), data_values)) + probs = np.array(weights) / np.sum(np.array(weights)) + dropped_indices = np.array(np.random.choice(np.arange(n_samples), size=int(missing_rate * n_samples), p=probs, replace=False)) + X_incomplete[dropped_indices[:, None], i] = None + + return X_incomplete + +def calculate_variance_of_beta_estimates(y_true, y_pred, X): + + residuals = np.sum((y_true - y_pred)**2) + sigma_hat_squared = (1 / (len(y_true) - 2)) * residuals + X_prime_X = np.dot(X.T, X) + covariance_matrix = sigma_hat_squared / X_prime_X + vars = np.diag(covariance_matrix) + + return vars + +### EXAMPLE 1. +### COMPARE STATISTICAL ESTIMATES AND THEIR VARIANCE FOR LINEAR REGRESSION MODEL + +def get_results_full_dataset(X, y): + + # Perform linear regression on full data as a way of comparison + estimator = LinearRegression() + estimator.fit(X, y) + y_predict = estimator.predict(X) + + # Save the beta estimates + # The variance of these estimates + # And 1.96 * standard error of the estimates (useful to know the 95% confidence interval) + full_coefs = estimator.coef_ + full_vars = calculate_variance_of_beta_estimates(y, y_predict, X) + full_errorbar = 1.96 * np.sqrt(full_vars) + + return full_coefs, full_vars, full_errorbar + +def get_results_chained_imputation(X_incomplete, y): + + # Impute incomplete data with ChainedImputer + # Setting burnin at 99 and using only the last imputation + imputer = ChainedImputer(n_burn_in=99, n_imputations=1) + imputer.fit(X_incomplete) + X_imputed = imputer.transform(X_incomplete) + + # Perform linear regression on chained single imputed data + # Estimate beta estimates and their variances + estimator = LinearRegression() + estimator.fit(X_imputed, y) + y_predict = estimator.predict(X_imputed) + + # Save the beta estimates + # The variance of these estimates + # And 1.96 * standard error of the estimates + chained_coefs = estimator.coef_ + chained_vars = calculate_variance_of_beta_estimates(y, y_predict, X_imputed) + chained_errorbar = 1.96 * np.sqrt(chained_vars) + + return chained_coefs, chained_vars, chained_errorbar + +def get_results_mice_imputation(X_incomplete, y): + + # Impute incomplete data using the ChainedImputer as a MICEImputer + # Setting burnin at 99, using only last imputation and loop this procedure m times + m = 5 + multiple_imputations = [] + + for i in range(m): + + imputer = ChainedImputer(n_burn_in=99, n_imputations=1,random_state=i) + imputer.fit(X_incomplete) + X_imputed = imputer.transform(X_incomplete) + multiple_imputations.append(X_imputed) + + # Perform a model on each of 
the m imputed datasets + # Estimate the estimates for each model/dataset + m_coefs = [] + m_vars = [] + for i in range(m): + + estimator = LinearRegression() + estimator.fit(multiple_imputations[i], y) + y_predict = estimator.predict(multiple_imputations[i]) + + m_coefs.append(estimator.coef_) + m_vars.append(calculate_variance_of_beta_estimates(y, y_predict, multiple_imputations[i])) + + # Calculate the end estimates by applying Rubin's rules + # Rubin's rules can be slightly different for different types of estimates + # In case of linear regression, these are the rules: + # The value of every estimate is the mean of estimates in each of the m datasets + # The variance of these estimates is a combination of the variance of each of the m estimates (Ubar) + # And the variance between the m estimates (B) + + Qbar = np.mean(m_coefs, axis = 0) + Ubar = np.mean(m_vars, axis = 0) + B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis = 0) + T = Ubar + B + (B/m) + + # The final 1.96 * standard error is then the sqrt of the variance + mice_errorbar = 1.96 * np.sqrt(T) + + return Qbar, T, mice_errorbar + +# The original MICE procedure includes all variables inluding the output variable in the imputation +# process. The idea is that the imputation model should at least contain the analysis model to +# result in unbiased estimates +def get_results_mice_imputation_includingy(X_incomplete, y): + + # Impute incomplete data using the ChainedImputer as a MICEImputer + # Now using the output variable in the imputation loop + m = 5 + multiple_imputations = [] + + for i in range(m): + + Xy = np.column_stack((X_incomplete, y)) + imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=i) + imputer.fit(Xy) + data_imputed = imputer.transform(Xy) + + # We save only the X imputed data because we don't want to use y to predict y later on + X_imputed = data_imputed[:, :-1] + multiple_imputations.append(X_imputed) + + # Perform linear regression on mice multiple imputed data + # Estimate beta estimates and their variances + m_coefs = [] + m_vars = [] + for i in range(m): + + estimator = LinearRegression() + estimator.fit(multiple_imputations[i], y) + y_predict = estimator.predict(multiple_imputations[i]) + + m_coefs.append(estimator.coef_) + m_vars.append(calculate_variance_of_beta_estimates(y, y_predict, multiple_imputations[i])) + + # Calculate the end results by applying Rubin's rules + # The value of every estimate is the mean of the values over the m datasets + # The variance of these estimates is a combination of the variance of each of the m estimates (Ubar) + # And the variance between the m estimates (B) + + Qbar = np.mean(m_coefs, axis = 0) + Ubar = np.mean(m_vars, axis = 0) + B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis = 0) + T = Ubar + B + (B/m) + + # The final 1.96 * standard error is then the sqrt of the variance + mice_errorbar = 1.96 * np.sqrt(T) + + return Qbar, T, mice_errorbar + +# Now lets run these imputation procedures +# We use the Boston dataset and analyze the outcomes of the beta coefficients and their standard errors +# We standardize the data before running the procedure to be able to compare the coefficients +# We run the procedure for 3 missingness mechanisms (MCAR, MAR and MNAR) + +dataset = load_boston() +X_full, y = dataset.data, dataset.target + +scaler = StandardScaler() +X_scaled = scaler.fit_transform(X_full) +y_scaled = stats.zscore(y) + +print("Executing Example 1 MCAR Missingness") +Boston_X_incomplete_MCAR = ampute(X_scaled, mech = "MCAR") + 
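+# A quick sanity check, added for illustration (not strictly needed): ampute
+# assigns None into a float array, which numpy stores as np.nan, so the
+# realized missing rate per feature should be close to the requested 0.75.
+print(np.isnan(Boston_X_incomplete_MCAR).mean(axis=0))
+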
+full_coefs, full_vars, full_errorbar = get_results_full_dataset(X_scaled, y_scaled) +chained_coefs, chained_vars, chained_errorbar = get_results_chained_imputation(Boston_X_incomplete_MCAR, y_scaled) +mice_coefs, mice_vars, mice_errorbar = get_results_mice_imputation(Boston_X_incomplete_MCAR, y_scaled) +mice_y_coefs, mice_y_vars, mice_y_errorbar = get_results_mice_imputation_includingy(Boston_X_incomplete_MCAR, y_scaled) + +coefs = (full_coefs, chained_coefs, mice_coefs, mice_y_coefs) +vars = (full_vars, chained_vars, mice_vars, mice_y_vars) +errorbars = (full_errorbar, chained_errorbar, mice_errorbar, mice_y_errorbar) + +# We plot the results +n_situations = 4 +n = np.arange(n_situations) +n_labels = ['Full Data', 'Chained Imputer', 'Mice Imputer', 'Mice Imputer with y'] +colors = ['r', 'orange', 'b', 'purple'] +width = 0.3 +plt.figure(figsize=(24, 16)) + +plt1 = plt.subplot(211) +for j in n: + plt1.bar(np.arange(len(coefs[j])) + (3*j*(width/n_situations)), coefs[j], width = width, color = colors[j]) +plt.legend(n_labels) + +plt2 = plt.subplot(212) +for j in n: + plt2.bar(np.arange(len(errorbars[j])) + (3*j*(width/n_situations)), errorbars[j], width = width, color = colors[j]) + +plt1.set_title("MCAR Missingness") +plt1.set_ylabel("Beta Coefficients") +plt2.set_ylabel("Standard Errors") +plt1.set_xlabel("Features") +plt2.set_xlabel("Features") + +plt.show() + +### EXAMPLE 2. ### +### SHOW MULTIPLE IMPUTATION IN PREDICTION CONTEXT ### + +# In this example, we show how to apply the imputer in a train/test situation +# There are two approaches to get the end result of the prediction model +# In approach 1 you calculate the evaluation metric for every i in m and later average these values +# In approach 2 you average the predictions of every i in m and then calculate the evaluation metric + +def get_results_full_data(X_train, X_test, y_train, y_test): + + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train) + X_test_scaled = scaler.transform(X_test) + + estimator = LinearRegression() + estimator.fit(X_train_scaled, y_train) + y_predict = estimator.predict(X_test_scaled) + mse_full = mse(y_test, y_predict) + + return mse_full + +# Perform pipeline for i in m +# Approach 1: pool the mse values of the m datasets +def get_results_multiple_imputation_approach1(X_train, X_test, y_train, y_test): + + m = 5 + multiple_mses = [] + + for i in range(m): + + # Fit the imputer for every i in im + # Be aware that you fit the imputer on the train data + # And apply to the test data + imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=i) + X_train_imputed = imputer.fit_transform(X_train) + X_test_imputed = imputer.transform(X_test) + + # Perform the steps you wish to take before fitting the estimator + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train_imputed) + X_test_scaled = scaler.transform(X_test_imputed) + + # Finally fit the estimator and calculate the error metric for every i in m + estimator = LinearRegression() + estimator.fit(X_train_scaled, y_train) + y_predict = estimator.predict(X_test_scaled) + mse_approach1 = mse(y_test, y_predict) + multiple_mses.append(mse_approach1) + + # Average the error metric over the m loops to get a final result + mse_approach1 = np.mean(multiple_mses, axis=0) + + return mse_approach1 + +# Approach 2: average the predictions of the m datasets and then calculate the mse +def get_results_multiple_imputation_approach2(X_train, X_test, y_train, y_test): + + m = 5 + multiple_predictions = [] + + for i in 
range(m): + + # Fit the imputer for every i in im + # Be aware that you fit the imputer on the train data + # And apply to the test data + imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=i) + X_train_imputed = imputer.fit_transform(X_train) + X_test_imputed = imputer.transform(X_test) + + # Perform the steps you wish to take before fitting the estimator + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train_imputed) + X_test_scaled = scaler.transform(X_test_imputed) + + # Finally fit the estimator and calculate the predictions for every i in m + estimator = LinearRegression() + estimator.fit(X_train_scaled, y_train) + y_predict = estimator.predict(X_test_scaled) + multiple_predictions.append(y_predict) + + # Average the predictions over the m loops + # Then calculate the error metric + predictions_average = np.mean(multiple_predictions, axis=0) + mse_approach2 = mse(y_test, predictions_average) + + return mse_approach2 + +def perform_simulation(dataset, X_incomplete, nsim = 10): + + X_full, y = dataset.data, dataset.target + outcome = [] + + for j in np.arange(nsim): + + train_indices, test_indices = train_test_split(np.arange(X_full.shape[0])) + + X_incomplete_train = X_incomplete[train_indices] + X_full_train = X_full[train_indices] + X_incomplete_test = X_incomplete[test_indices] + X_full_test = X_full[test_indices] + y_train = y[train_indices] + y_test = y[test_indices] + + mse_full = get_results_full_data(X_full_train, X_full_test, y_train, y_test) + mse_approach1 = get_results_multiple_imputation_approach1(X_incomplete_train, X_incomplete_test, y_train, y_test) + mse_approach2 = get_results_multiple_imputation_approach2(X_incomplete_train, X_incomplete_test, y_train, y_test) + + outcome.append((mse_full, mse_approach1, mse_approach2)) + + return np.mean(outcome, axis = 0), np.std(outcome, axis = 0) + +# Execute +print("Executing Example 1 MCAR Missingness") +Boston_X_incomplete_MCAR = ampute(X_scaled, mech = "MCAR") +mse_means, mse_std = perform_simulation(load_boston(), Boston_X_incomplete_MCAR, nsim=10) + +# Plot results +n_situations = 3 +n = np.arange(n_situations) +n_labels = ['Full Data', 'Average MSE', 'Average Predictions'] +colors = ['r', 'green', 'yellow'] +width = 0.3 +plt.figure(figsize=(6, 6)) + +plt1 = plt.subplot(111) +for j in n: + plt1.bar(j, mse_means[j], yerr = mse_std[j], + width = width, color = colors[j]) + +plt1.set_title("MCAR Missingness") +plt1.set_ylabel("Mean Squared Error") +plt.legend(n_labels) +plt.show() From 965ae8ef42b6870a85eaa3f596a266fdca424ae0 Mon Sep 17 00:00:00 2001 From: RianneSchouten Date: Wed, 27 Jun 2018 17:36:17 +0200 Subject: [PATCH 002/163] adjust figure widths and legends --- examples/plot_multiple_imputation.py | 48 ++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/examples/plot_multiple_imputation.py b/examples/plot_multiple_imputation.py index ac3c8bf1d6d08..9528ee9a017ee 100644 --- a/examples/plot_multiple_imputation.py +++ b/examples/plot_multiple_imputation.py @@ -235,7 +235,7 @@ def get_results_mice_imputation_includingy(X_incomplete, y): n_labels = ['Full Data', 'Chained Imputer', 'Mice Imputer', 'Mice Imputer with y'] colors = ['r', 'orange', 'b', 'purple'] width = 0.3 -plt.figure(figsize=(24, 16)) +plt.figure(figsize=(12, 16)) plt1 = plt.subplot(211) for j in n: @@ -275,6 +275,23 @@ def get_results_full_data(X_train, X_test, y_train, y_test): return mse_full +def get_results_single_imputation(X_train, X_test, y_train, y_test): + + imputer = 
ChainedImputer(n_burn_in=99, n_imputations=1, random_state=0) + X_train_imputed = imputer.fit_transform(X_train) + X_test_imputed = imputer.transform(X_test) + + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train_imputed) + X_test_scaled = scaler.transform(X_test_imputed) + + estimator = LinearRegression() + estimator.fit(X_train_scaled, y_train) + y_predict = estimator.predict(X_test_scaled) + mse_single = mse(y_test, y_predict) + + return mse_single + # Perform pipeline for i in m # Approach 1: pool the mse values of the m datasets def get_results_multiple_imputation_approach1(X_train, X_test, y_train, y_test): @@ -358,32 +375,35 @@ def perform_simulation(dataset, X_incomplete, nsim = 10): y_test = y[test_indices] mse_full = get_results_full_data(X_full_train, X_full_test, y_train, y_test) + mse_single = get_results_single_imputation(X_incomplete_train, X_incomplete_test, y_train, y_test) mse_approach1 = get_results_multiple_imputation_approach1(X_incomplete_train, X_incomplete_test, y_train, y_test) mse_approach2 = get_results_multiple_imputation_approach2(X_incomplete_train, X_incomplete_test, y_train, y_test) - outcome.append((mse_full, mse_approach1, mse_approach2)) + outcome.append((mse_full, mse_single, mse_approach1, mse_approach2)) return np.mean(outcome, axis = 0), np.std(outcome, axis = 0) # Execute -print("Executing Example 1 MCAR Missingness") +print("Executing Example 2 MCAR Missingness") Boston_X_incomplete_MCAR = ampute(X_scaled, mech = "MCAR") mse_means, mse_std = perform_simulation(load_boston(), Boston_X_incomplete_MCAR, nsim=10) # Plot results -n_situations = 3 +n_situations = 4 n = np.arange(n_situations) -n_labels = ['Full Data', 'Average MSE', 'Average Predictions'] -colors = ['r', 'green', 'yellow'] -width = 0.3 -plt.figure(figsize=(6, 6)) +n_labels = ['Full Data', 'Single Imputation', 'MI Average MSE', 'MI Average Predictions'] +colors = ['r', 'orange', 'green', 'yellow'] -plt1 = plt.subplot(111) +plt.figure(figsize=(12, 6)) +ax1 = plt.subplot(111) for j in n: - plt1.bar(j, mse_means[j], yerr = mse_std[j], - width = width, color = colors[j]) + ax1.barh(j, mse_means[j], xerr=mse_std[j], + color=colors[j], alpha=0.6, align='center') + +ax1.set_title('MCAR Missingness') +ax1.set_yticks(n) +ax1.set_xlabel('Mean Squared Error') +ax1.invert_yaxis() +ax1.set_yticklabels(n_labels) -plt1.set_title("MCAR Missingness") -plt1.set_ylabel("Mean Squared Error") -plt.legend(n_labels) plt.show() From fa082de111fd68d211a4e8473264938a1e1db00b Mon Sep 17 00:00:00 2001 From: RianneSchouten Date: Thu, 28 Jun 2018 12:53:44 +0200 Subject: [PATCH 003/163] changed code according pep rules and increased figure size --- examples/plot_multiple_imputation.py | 276 ++++++++++++++++----------- 1 file changed, 160 insertions(+), 116 deletions(-) diff --git a/examples/plot_multiple_imputation.py b/examples/plot_multiple_imputation.py index 9528ee9a017ee..2384fe00d49fb 100644 --- a/examples/plot_multiple_imputation.py +++ b/examples/plot_multiple_imputation.py @@ -8,25 +8,26 @@ that it allows for finding unbiased statistical estimates due to its chained character. However, the disadvantage is that every imputed value is treated as if the value was observed, leading to an imputed dataset that does not reflect -the uncertainty that occurs due to the presence of missing values. This makes it -hard to find valid statistical inferences because the variance (and standard error) -of statistical estimates become too small. 
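+Intuitively, a regression-based single imputation adds no new information for
+the imputed entries, yet the analysis afterwards treats them as fully observed
+data points, so standard errors are computed as if the sample carried more
+information than it really does.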
- -An alternative is using the ChainedImputer to perform multiple imputation: a method -where every missing value is imputed multiple times. The procedure results in -multiple datasets where the observed data is similar in every dataset, but the imputed -data is different. All desired steps after imputation are performed on every dataset, -including the analysis. Then, Rubin's pooling rules are used to combine the estimates -into one final result. - -In this example we will show how to use the ChainedImputer to perform multiple imputation, -what the effect is on the standard error of beta coefficients and how to set up a prediction -model using multiple imputation. +the uncertainty that occurs due to the presence of missing values. This makes +it hard to find valid statistical inferences because the variance (and standard +error) of statistical estimates become too small. + +An alternative is using the ChainedImputer to perform multiple imputation: a +method where every missing value is imputed multiple times. The procedure +results in multiple datasets where the observed data is similar in every +dataset, but the imputed data is different. All desired steps after imputation +are performed on every dataset, including the analysis. Then, Rubin's pooling +rules are used to combine the estimates into one final result. + +In this example we will show how to use the ChainedImputer to perform multiple +imputation, what the effect is on the standard error of beta coefficients and +how to set up a prediction model using multiple imputation. """ +import math import numpy as np -from scipy import stats import matplotlib.pyplot as plt +from scipy import stats from sklearn.datasets import load_boston from sklearn.linear_model import LinearRegression @@ -38,7 +39,6 @@ rng = np.random.RandomState(0) def ampute(X, missing_rate = 0.75, mech = "MCAR"): - n_samples = X.shape[0] n_features = X.shape[1] X_incomplete = X.copy() @@ -46,22 +46,29 @@ def ampute(X, missing_rate = 0.75, mech = "MCAR"): # MCAR mechanism if mech == 'MCAR': for i in np.arange(n_features): - dropped_indices = np.array(np.random.choice(np.arange(n_samples), size=int(missing_rate * n_samples), replace=False)) + dropped_indices = np.array(np.random.choice(np.arange(n_samples), + size=int(missing_rate + * n_samples), + replace=False)) X_incomplete[dropped_indices[:, None], i] = None # MNAR mechanism if mech == "MNAR": for i in np.arange(n_features): data_values = -np.mean(X[:, i]) + X[:, i] - weights = list(map(lambda x: math.exp(x) / (1 + math.exp(x)), data_values)) + weights = list(map(lambda x: math.exp(x) / (1 + math.exp(x)), + data_values)) probs = np.array(weights) / np.sum(np.array(weights)) - dropped_indices = np.array(np.random.choice(np.arange(n_samples), size=int(missing_rate * n_samples), p=probs, replace=False)) + dropped_indices = np.array(np.random.choice(np.arange(n_samples), + size=int(missing_rate + * n_samples), + p=probs, + replace=False)) X_incomplete[dropped_indices[:, None], i] = None return X_incomplete def calculate_variance_of_beta_estimates(y_true, y_pred, X): - residuals = np.sum((y_true - y_pred)**2) sigma_hat_squared = (1 / (len(y_true) - 2)) * residuals X_prime_X = np.dot(X.T, X) @@ -70,19 +77,22 @@ def calculate_variance_of_beta_estimates(y_true, y_pred, X): return vars -### EXAMPLE 1. -### COMPARE STATISTICAL ESTIMATES AND THEIR VARIANCE FOR LINEAR REGRESSION MODEL +############################################################################### -def get_results_full_dataset(X, y): +# EXAMPLE 1. 
COMPARE STATISTICAL ESTIMATES AND THEIR VARIANCE USING MULTIPLE +# IMPUTATION IN A LINEAR REGRESSION MODEL. + +############################################################################### +def get_results_full_dataset(X, y): # Perform linear regression on full data as a way of comparison estimator = LinearRegression() estimator.fit(X, y) y_predict = estimator.predict(X) - # Save the beta estimates - # The variance of these estimates - # And 1.96 * standard error of the estimates (useful to know the 95% confidence interval) + # Save the beta estimates, the variance of these estimates and 1.96 * + # standard error of the estimates. The latter is useful to know the 95% + # confidence interval. full_coefs = estimator.coef_ full_vars = calculate_variance_of_beta_estimates(y, y_predict, X) full_errorbar = 1.96 * np.sqrt(full_vars) @@ -90,9 +100,8 @@ def get_results_full_dataset(X, y): return full_coefs, full_vars, full_errorbar def get_results_chained_imputation(X_incomplete, y): - # Impute incomplete data with ChainedImputer - # Setting burnin at 99 and using only the last imputation + # Setting n_burn_in at 99 and using only the last imputation imputer = ChainedImputer(n_burn_in=99, n_imputations=1) imputer.fit(X_incomplete) X_imputed = imputer.transform(X_incomplete) @@ -103,24 +112,22 @@ def get_results_chained_imputation(X_incomplete, y): estimator.fit(X_imputed, y) y_predict = estimator.predict(X_imputed) - # Save the beta estimates - # The variance of these estimates - # And 1.96 * standard error of the estimates + # Save the beta estimates, the variance of these estimates and 1.96 * + # standard error of the estimates chained_coefs = estimator.coef_ - chained_vars = calculate_variance_of_beta_estimates(y, y_predict, X_imputed) + chained_vars = calculate_variance_of_beta_estimates( + y, y_predict, X_imputed) chained_errorbar = 1.96 * np.sqrt(chained_vars) return chained_coefs, chained_vars, chained_errorbar def get_results_mice_imputation(X_incomplete, y): - # Impute incomplete data using the ChainedImputer as a MICEImputer - # Setting burnin at 99, using only last imputation and loop this procedure m times + # Setting n_burn_in at 99 and using only last imputation and loop this + # procedure m times. m = 5 multiple_imputations = [] - for i in range(m): - imputer = ChainedImputer(n_burn_in=99, n_imputations=1,random_state=i) imputer.fit(X_incomplete) X_imputed = imputer.transform(X_incomplete) @@ -131,49 +138,46 @@ def get_results_mice_imputation(X_incomplete, y): m_coefs = [] m_vars = [] for i in range(m): - estimator = LinearRegression() estimator.fit(multiple_imputations[i], y) y_predict = estimator.predict(multiple_imputations[i]) - m_coefs.append(estimator.coef_) - m_vars.append(calculate_variance_of_beta_estimates(y, y_predict, multiple_imputations[i])) + m_vars.append(calculate_variance_of_beta_estimates( + y, y_predict, multiple_imputations[i])) - # Calculate the end estimates by applying Rubin's rules + # Calculate the end estimates by applying Rubin's rules. # Rubin's rules can be slightly different for different types of estimates # In case of linear regression, these are the rules: - # The value of every estimate is the mean of estimates in each of the m datasets - # The variance of these estimates is a combination of the variance of each of the m estimates (Ubar) - # And the variance between the m estimates (B) - + # + # The value of every estimate is the mean of the estimates in each of the m + # datasets. 
The variance of these estimates is a combination of the + # variance of each of the m estimates (Ubar) and the variance between the m + # estimates (B). The standard error is the sqrt of the variance. Qbar = np.mean(m_coefs, axis = 0) Ubar = np.mean(m_vars, axis = 0) B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis = 0) T = Ubar + B + (B/m) - - # The final 1.96 * standard error is then the sqrt of the variance mice_errorbar = 1.96 * np.sqrt(T) return Qbar, T, mice_errorbar -# The original MICE procedure includes all variables inluding the output variable in the imputation -# process. The idea is that the imputation model should at least contain the analysis model to -# result in unbiased estimates +# The original MICE procedure includes all variables inluding the output +# variable in the imputation process. The idea is that the imputation model +# should at least contain the analysis model to result in unbiased estimates. +# In this function, we will also include y in the imputation process. def get_results_mice_imputation_includingy(X_incomplete, y): - # Impute incomplete data using the ChainedImputer as a MICEImputer # Now using the output variable in the imputation loop m = 5 multiple_imputations = [] - for i in range(m): - Xy = np.column_stack((X_incomplete, y)) imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=i) imputer.fit(Xy) data_imputed = imputer.transform(Xy) - # We save only the X imputed data because we don't want to use y to predict y later on + # We save only the X imputed data because we do not want to use y to + # predict y later on. X_imputed = data_imputed[:, :-1] multiple_imputations.append(X_imputed) @@ -182,92 +186,115 @@ def get_results_mice_imputation_includingy(X_incomplete, y): m_coefs = [] m_vars = [] for i in range(m): - estimator = LinearRegression() estimator.fit(multiple_imputations[i], y) y_predict = estimator.predict(multiple_imputations[i]) - m_coefs.append(estimator.coef_) - m_vars.append(calculate_variance_of_beta_estimates(y, y_predict, multiple_imputations[i])) - - # Calculate the end results by applying Rubin's rules - # The value of every estimate is the mean of the values over the m datasets - # The variance of these estimates is a combination of the variance of each of the m estimates (Ubar) - # And the variance between the m estimates (B) + m_vars.append(calculate_variance_of_beta_estimates( + y, y_predict, multiple_imputations[i])) + # Calculate the end estimates by applying Rubin's rules. + # Rubin's rules can be slightly different for different types of estimates + # In case of linear regression, these are the rules: + # + # The value of every estimate is the mean of the estimates in each of the m + # datasets. The variance of these estimates is a combination of the + # variance of each of the m estimates (Ubar) and the variance between the m + # estimates (B). The standard error is the sqrt of the variance. 
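+    # As an illustrative numeric check (numbers invented for this comment,
+    # with Rubin's between-imputation variance B = sum((Q_i - Qbar)**2) /
+    # (m - 1)): three estimates 0.5, 0.6 and 0.7, each with variance 0.01,
+    # pool to Qbar = 0.6, Ubar = 0.01 and B = 0.01, so T = 0.01 + 0.01
+    # + 0.01/3, clearly larger than the single-imputation variance alone.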
Qbar = np.mean(m_coefs, axis = 0) Ubar = np.mean(m_vars, axis = 0) B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis = 0) T = Ubar + B + (B/m) - - # The final 1.96 * standard error is then the sqrt of the variance mice_errorbar = 1.96 * np.sqrt(T) return Qbar, T, mice_errorbar -# Now lets run these imputation procedures -# We use the Boston dataset and analyze the outcomes of the beta coefficients and their standard errors -# We standardize the data before running the procedure to be able to compare the coefficients -# We run the procedure for 3 missingness mechanisms (MCAR, MAR and MNAR) - +# Now lets run all these imputation procedures. +# We use the Boston dataset and analyze the outcomes of the beta coefficients +# and their standard errors. We standardize the data before running the +# procedure to be able to compare the coefficients. We run the procedure for +# MCAR missingness only. This can easily be changed to MNAR by setting the +# `mech` argument. +# +# Loading the data dataset = load_boston() X_full, y = dataset.data, dataset.target +# Standardizing the data scaler = StandardScaler() X_scaled = scaler.fit_transform(X_full) y_scaled = stats.zscore(y) +# Start the procedure print("Executing Example 1 MCAR Missingness") -Boston_X_incomplete_MCAR = ampute(X_scaled, mech = "MCAR") -full_coefs, full_vars, full_errorbar = get_results_full_dataset(X_scaled, y_scaled) -chained_coefs, chained_vars, chained_errorbar = get_results_chained_imputation(Boston_X_incomplete_MCAR, y_scaled) -mice_coefs, mice_vars, mice_errorbar = get_results_mice_imputation(Boston_X_incomplete_MCAR, y_scaled) -mice_y_coefs, mice_y_vars, mice_y_errorbar = get_results_mice_imputation_includingy(Boston_X_incomplete_MCAR, y_scaled) +# First, make the data incomplete with a MCAR mechanism. +Boston_X_incomplete_MCAR = ampute(X_scaled, mech = "MCAR") +# Second, run all the imputation procedures as described above. +full_coefs, full_vars, full_errorbar = get_results_full_dataset( + X_scaled, y_scaled) +chained_coefs, chained_vars, chained_errorbar = get_results_chained_imputation( + Boston_X_incomplete_MCAR, y_scaled) +mice_coefs, mice_vars, mice_errorbar = get_results_mice_imputation( + Boston_X_incomplete_MCAR, y_scaled) +mice_y_coefs, mice_y_vars, mice_y_errorbar = \ + get_results_mice_imputation_includingy( + Boston_X_incomplete_MCAR, y_scaled) + +# Combine the results from the four imputation procedures. 
coefs = (full_coefs, chained_coefs, mice_coefs, mice_y_coefs) vars = (full_vars, chained_vars, mice_vars, mice_y_vars) errorbars = (full_errorbar, chained_errorbar, mice_errorbar, mice_y_errorbar) -# We plot the results +# And plot the results n_situations = 4 n = np.arange(n_situations) -n_labels = ['Full Data', 'Chained Imputer', 'Mice Imputer', 'Mice Imputer with y'] +n_labels = ['Full Data', 'Chained Imputer', + 'Mice Imputer', 'Mice Imputer with y'] colors = ['r', 'orange', 'b', 'purple'] width = 0.3 -plt.figure(figsize=(12, 16)) +plt.figure(figsize=(24, 32)) plt1 = plt.subplot(211) for j in n: - plt1.bar(np.arange(len(coefs[j])) + (3*j*(width/n_situations)), coefs[j], width = width, color = colors[j]) + plt1.bar(np.arange(len(coefs[j])) + (3*j*(width/n_situations)), + coefs[j], width = width, color = colors[j]) plt.legend(n_labels) plt2 = plt.subplot(212) for j in n: - plt2.bar(np.arange(len(errorbars[j])) + (3*j*(width/n_situations)), errorbars[j], width = width, color = colors[j]) + plt2.bar(np.arange(len(errorbars[j])) + (3*j*(width/n_situations)), + errorbars[j], width = width, color = colors[j]) plt1.set_title("MCAR Missingness") plt1.set_ylabel("Beta Coefficients") plt2.set_ylabel("Standard Errors") plt1.set_xlabel("Features") plt2.set_xlabel("Features") - plt.show() -### EXAMPLE 2. ### -### SHOW MULTIPLE IMPUTATION IN PREDICTION CONTEXT ### +############################################################################### -# In this example, we show how to apply the imputer in a train/test situation -# There are two approaches to get the end result of the prediction model -# In approach 1 you calculate the evaluation metric for every i in m and later average these values -# In approach 2 you average the predictions of every i in m and then calculate the evaluation metric +# EXAMPLE 2. SHOW MULTIPLE IMPUTATION IN A PREDICTION CONTEXT. -def get_results_full_data(X_train, X_test, y_train, y_test): +############################################################################### +# In this example, we show how to apply MICE imputation in a train/test +# situation. There are two approaches to get the end result of the prediction +# model. In approach 1 you calculate the evaluation metric for every i in m and +# later average these values. In approach 2 you average the predictions of +# every i in m and then calculate the evaluation metric. We test both +# approaches. +# +# Apply the regression model on the full dataset as a way of comparison. +def get_results_full_data(X_train, X_test, y_train, y_test): + # Standardize data scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) + # Perform estimation and prediction estimator = LinearRegression() estimator.fit(X_train_scaled, y_train) y_predict = estimator.predict(X_test_scaled) @@ -275,16 +302,19 @@ def get_results_full_data(X_train, X_test, y_train, y_test): return mse_full +# Use the ChainedImputer as a single imputation procedure. 
def get_results_single_imputation(X_train, X_test, y_train, y_test): - + # Apply imputation imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=0) X_train_imputed = imputer.fit_transform(X_train) X_test_imputed = imputer.transform(X_test) + # Standardize data scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train_imputed) X_test_scaled = scaler.transform(X_test_imputed) + # Perform estimation and prediction estimator = LinearRegression() estimator.fit(X_train_scaled, y_train) y_predict = estimator.predict(X_test_scaled) @@ -292,15 +322,13 @@ def get_results_single_imputation(X_train, X_test, y_train, y_test): return mse_single -# Perform pipeline for i in m -# Approach 1: pool the mse values of the m datasets -def get_results_multiple_imputation_approach1(X_train, X_test, y_train, y_test): - +# Now use the IterativeImputer as a MICE Imputer by looping over i in m. +# Approach 1: pool the mse values of the m datasets. +def get_results_multiple_imputation_approach1(X_train, X_test, + y_train, y_test): m = 5 multiple_mses = [] - for i in range(m): - # Fit the imputer for every i in im # Be aware that you fit the imputer on the train data # And apply to the test data @@ -309,31 +337,32 @@ def get_results_multiple_imputation_approach1(X_train, X_test, y_train, y_test): X_test_imputed = imputer.transform(X_test) # Perform the steps you wish to take before fitting the estimator + # Such as standardization. scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train_imputed) X_test_scaled = scaler.transform(X_test_imputed) - # Finally fit the estimator and calculate the error metric for every i in m + # Finally fit the estimator and calculate the error metric for every i + # in m. Save all error metric values. estimator = LinearRegression() estimator.fit(X_train_scaled, y_train) y_predict = estimator.predict(X_test_scaled) mse_approach1 = mse(y_test, y_predict) multiple_mses.append(mse_approach1) - # Average the error metric over the m loops to get a final result + # Average the error metric values over the m loops to get a final result. mse_approach1 = np.mean(multiple_mses, axis=0) return mse_approach1 -# Approach 2: average the predictions of the m datasets and then calculate the mse -def get_results_multiple_imputation_approach2(X_train, X_test, y_train, y_test): - +# Approach 2: average the predictions of the m datasets and then calculate the +# error metric. +def get_results_multiple_imputation_approach2(X_train, X_test, + y_train, y_test): m = 5 multiple_predictions = [] - for i in range(m): - - # Fit the imputer for every i in im + # Fit the imputer for every i in m # Be aware that you fit the imputer on the train data # And apply to the test data imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=i) @@ -341,32 +370,34 @@ def get_results_multiple_imputation_approach2(X_train, X_test, y_train, y_test): X_test_imputed = imputer.transform(X_test) # Perform the steps you wish to take before fitting the estimator + # Such as standardization scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train_imputed) X_test_scaled = scaler.transform(X_test_imputed) - # Finally fit the estimator and calculate the predictions for every i in m + # Finally fit the estimator and calculate the predictions for every i + # in m. Save the predictions. 
estimator = LinearRegression() estimator.fit(X_train_scaled, y_train) y_predict = estimator.predict(X_test_scaled) multiple_predictions.append(y_predict) # Average the predictions over the m loops - # Then calculate the error metric + # Then calculate the error metric. predictions_average = np.mean(multiple_predictions, axis=0) mse_approach2 = mse(y_test, predictions_average) return mse_approach2 def perform_simulation(dataset, X_incomplete, nsim = 10): - X_full, y = dataset.data, dataset.target outcome = [] + # Start a simulation process that executes the process nsim times. for j in np.arange(nsim): - - train_indices, test_indices = train_test_split(np.arange(X_full.shape[0])) - + # First, split the data in train and test dataset. + train_indices, test_indices = train_test_split( + np.arange(X_full.shape[0])) X_incomplete_train = X_incomplete[train_indices] X_full_train = X_full[train_indices] X_incomplete_test = X_incomplete[test_indices] @@ -374,19 +405,33 @@ def perform_simulation(dataset, X_incomplete, nsim = 10): y_train = y[train_indices] y_test = y[test_indices] - mse_full = get_results_full_data(X_full_train, X_full_test, y_train, y_test) - mse_single = get_results_single_imputation(X_incomplete_train, X_incomplete_test, y_train, y_test) - mse_approach1 = get_results_multiple_imputation_approach1(X_incomplete_train, X_incomplete_test, y_train, y_test) - mse_approach2 = get_results_multiple_imputation_approach2(X_incomplete_train, X_incomplete_test, y_train, y_test) - + # Second, perform the imputation procedures and calculation of the + # error metric for every one of the four situations. + mse_full = get_results_full_data( + X_full_train, X_full_test, y_train, y_test) + mse_single = get_results_single_imputation( + X_incomplete_train, X_incomplete_test, y_train, y_test) + mse_approach1 = get_results_multiple_imputation_approach1( + X_incomplete_train, X_incomplete_test, y_train, y_test) + mse_approach2 = get_results_multiple_imputation_approach2( + X_incomplete_train, X_incomplete_test, y_train, y_test) + + # Save the outcome of every simulation round outcome.append((mse_full, mse_single, mse_approach1, mse_approach2)) + # Return the mean and standard deviation of the nsim outcome values return np.mean(outcome, axis = 0), np.std(outcome, axis = 0) -# Execute +# Execute the simulation print("Executing Example 2 MCAR Missingness") + +# Generate missing values with a MCAR mechanism Boston_X_incomplete_MCAR = ampute(X_scaled, mech = "MCAR") -mse_means, mse_std = perform_simulation(load_boston(), Boston_X_incomplete_MCAR, nsim=10) + +# Perform the simulation +mse_means, mse_std = perform_simulation(load_boston(), + Boston_X_incomplete_MCAR, + nsim=10) # Plot results n_situations = 4 @@ -394,7 +439,7 @@ def perform_simulation(dataset, X_incomplete, nsim = 10): n_labels = ['Full Data', 'Single Imputation', 'MI Average MSE', 'MI Average Predictions'] colors = ['r', 'orange', 'green', 'yellow'] -plt.figure(figsize=(12, 6)) +plt.figure(figsize=(24, 12)) ax1 = plt.subplot(111) for j in n: ax1.barh(j, mse_means[j], xerr=mse_std[j], @@ -405,5 +450,4 @@ def perform_simulation(dataset, X_incomplete, nsim = 10): ax1.set_xlabel('Mean Squared Error') ax1.invert_yaxis() ax1.set_yticklabels(n_labels) - plt.show() From e3e2465fc10bf6859c5a5f758017de832ea1dd57 Mon Sep 17 00:00:00 2001 From: RianneSchouten Date: Thu, 28 Jun 2018 13:07:25 +0200 Subject: [PATCH 004/163] solve two issues from lgtm and improve introduction text --- examples/plot_multiple_imputation.py | 72 +++++++++++++++------------- 1 
file changed, 39 insertions(+), 33 deletions(-)

diff --git a/examples/plot_multiple_imputation.py b/examples/plot_multiple_imputation.py
index 2384fe00d49fb..bb2b6cb225560 100644
--- a/examples/plot_multiple_imputation.py
+++ b/examples/plot_multiple_imputation.py
@@ -3,25 +3,26 @@
 Imputing missing values using multiple imputation
 =================================================
 
-By default, the ChainedImputer performs single imputation: a method where every
-missing value is replaced with one imputed value. The strength of the method is
-that it allows for finding unbiased statistical estimates due to its chained
-character. However, the disadvantage is that every imputed value is treated as
-if the value was observed, leading to an imputed dataset that does not reflect
-the uncertainty that occurs due to the presence of missing values. This makes
-it hard to find valid statistical inferences because the variance (and standard
-error) of statistical estimates become too small.
-
-An alternative is using the ChainedImputer to perform multiple imputation: a
+By default, the IterativeImputer performs single imputation: a method where
+every missing value is replaced with one imputed value. The chained character
+of the method and the possibility to draw imputation values from the posterior
+distribution of a Bayesian imputation model allows for finding unbiased
+statistical estimates. However, the disadvantage is that every imputed value is
+treated as if the value was observed, leading to an imputed dataset that does
+not reflect the uncertainty that occurs due to the presence of missing values.
+This makes it hard to draw valid statistical inferences because the variance
+(and standard error) of statistical estimates becomes too small.
+
+An alternative is using the IterativeImputer to perform multiple imputation: a
 method where every missing value is imputed multiple times. The procedure
 results in multiple datasets where the observed data is similar in every
 dataset, but the imputed data is different. All desired steps after imputation
 are performed on every dataset, including the analysis. Then, Rubin's pooling
 rules are used to combine the estimates into one final result.
 
-In this example we will show how to use the ChainedImputer to perform multiple
-imputation, what the effect is on the standard error of beta coefficients and
-how to set up a prediction model using multiple imputation.
+In this example we will show how to use the ITerativeImputer to perform
+multiple imputation, what the effect is on the standard error of beta
+coefficients and how to set up a prediction model using multiple imputation.
 """
 
 import math
 import numpy as np
 import matplotlib.pyplot as plt
 from scipy import stats
@@ -33,11 +34,12 @@
 from sklearn.linear_model import LinearRegression
 from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split
-from sklearn.impute import SimpleImputer, ChainedImputer
+from sklearn.impute import ChainedImputer
 from sklearn.metrics import mean_squared_error as mse
 
 rng = np.random.RandomState(0)
 
+# Start by defining a basic amputation function
 def ampute(X, missing_rate = 0.75, mech = "MCAR"):
     n_samples = X.shape[0]
     n_features = X.shape[1]
     X_incomplete = X.copy()
@@ -68,6 +70,9 @@ def ampute(X, missing_rate = 0.75, mech = "MCAR"):
 
     return X_incomplete
 
+# Make a function that calculates the variance of the beta estimates. This is
+# necessary because the linear regression model from sklearn does not provide
+# these values.
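+# For reference, the textbook OLS result is Var(beta_hat) = sigma^2 (X'X)^-1;
+# the per-coefficient variances would then be read off the diagonal, e.g.
+# np.diag(sigma_hat_squared * np.linalg.inv(np.dot(X.T, X))) (a sketch of
+# that alternative, not what the function below computes).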
def calculate_variance_of_beta_estimates(y_true, y_pred, X): residuals = np.sum((y_true - y_pred)**2) sigma_hat_squared = (1 / (len(y_true) - 2)) * residuals @@ -100,8 +105,8 @@ def get_results_full_dataset(X, y): return full_coefs, full_vars, full_errorbar def get_results_chained_imputation(X_incomplete, y): - # Impute incomplete data with ChainedImputer - # Setting n_burn_in at 99 and using only the last imputation + # Impute incomplete data with IterativeImputer using single imputation + # We set n_burn_in at 99 and use only the last imputation imputer = ChainedImputer(n_burn_in=99, n_imputations=1) imputer.fit(X_incomplete) X_imputed = imputer.transform(X_incomplete) @@ -122,9 +127,9 @@ def get_results_chained_imputation(X_incomplete, y): return chained_coefs, chained_vars, chained_errorbar def get_results_mice_imputation(X_incomplete, y): - # Impute incomplete data using the ChainedImputer as a MICEImputer - # Setting n_burn_in at 99 and using only last imputation and loop this - # procedure m times. + # Impute incomplete data using the IterativeImputer to perform multiple + # imputation. We set n_burn_in at 99 and use only last imputation and + # loop this procedure m times. m = 5 multiple_imputations = [] for i in range(m): @@ -161,12 +166,13 @@ def get_results_mice_imputation(X_incomplete, y): return Qbar, T, mice_errorbar -# The original MICE procedure includes all variables inluding the output -# variable in the imputation process. The idea is that the imputation model -# should at least contain the analysis model to result in unbiased estimates. -# In this function, we will also include y in the imputation process. +# The original multiple imputation procedure as developed under the name +# MICE includes all variables in the imputation process; including the output +# variable. The reason to do this is that the imputation model should at least +# contain the analysis model to result in unbiased estimates. In this function, +# we will also include y in the imputation process. def get_results_mice_imputation_includingy(X_incomplete, y): - # Impute incomplete data using the ChainedImputer as a MICEImputer + # Impute incomplete data using the IterativeImputer as a MICEImputer # Now using the output variable in the imputation loop m = 5 multiple_imputations = [] @@ -213,8 +219,7 @@ def get_results_mice_imputation_includingy(X_incomplete, y): # We use the Boston dataset and analyze the outcomes of the beta coefficients # and their standard errors. We standardize the data before running the # procedure to be able to compare the coefficients. We run the procedure for -# MCAR missingness only. This can easily be changed to MNAR by setting the -# `mech` argument. +# MCAR missingness only. # # Loading the data dataset = load_boston() @@ -280,7 +285,7 @@ def get_results_mice_imputation_includingy(X_incomplete, y): ############################################################################### -# In this example, we show how to apply MICE imputation in a train/test +# In this example, we show how to apply multiple imputation in a train/test # situation. There are two approaches to get the end result of the prediction # model. In approach 1 you calculate the evaluation metric for every i in m and # later average these values. In approach 2 you average the predictions of @@ -322,8 +327,8 @@ def get_results_single_imputation(X_train, X_test, y_train, y_test): return mse_single -# Now use the IterativeImputer as a MICE Imputer by looping over i in m. 
-# Approach 1: pool the mse values of the m datasets. +# Now use the IterativeImputer to perform multiple imputation by looping over +# i in m. Approach 1: pool the mse values of the m datasets. def get_results_multiple_imputation_approach1(X_train, X_test, y_train, y_test): m = 5 @@ -355,8 +360,8 @@ def get_results_multiple_imputation_approach1(X_train, X_test, return mse_approach1 -# Approach 2: average the predictions of the m datasets and then calculate the -# error metric. +# Approach 2: We average the predictions of the m datasets and then calculate +# the error metric. def get_results_multiple_imputation_approach2(X_train, X_test, y_train, y_test): m = 5 @@ -397,7 +402,7 @@ def perform_simulation(dataset, X_incomplete, nsim = 10): for j in np.arange(nsim): # First, split the data in train and test dataset. train_indices, test_indices = train_test_split( - np.arange(X_full.shape[0])) + np.arange(X_full.shape[0]), random_state=j) X_incomplete_train = X_incomplete[train_indices] X_full_train = X_full[train_indices] X_incomplete_test = X_incomplete[test_indices] @@ -436,7 +441,8 @@ def perform_simulation(dataset, X_incomplete, nsim = 10): # Plot results n_situations = 4 n = np.arange(n_situations) -n_labels = ['Full Data', 'Single Imputation', 'MI Average MSE', 'MI Average Predictions'] +n_labels = ['Full Data', 'Single Imputation', + 'MI Average MSE', 'MI Average Predictions'] colors = ['r', 'orange', 'green', 'yellow'] plt.figure(figsize=(24, 12)) From 15b3b914fe893d79f5fa2e162d80e6f639d146f4 Mon Sep 17 00:00:00 2001 From: RianneSchouten Date: Thu, 28 Jun 2018 14:41:22 +0200 Subject: [PATCH 005/163] remove spaces in arguments and add lines for definitions --- examples/plot_multiple_imputation.py | 41 ++++++++++++++++++---------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/examples/plot_multiple_imputation.py b/examples/plot_multiple_imputation.py index bb2b6cb225560..2bcc8c9fd2c27 100644 --- a/examples/plot_multiple_imputation.py +++ b/examples/plot_multiple_imputation.py @@ -39,8 +39,9 @@ rng = np.random.RandomState(0) + # Start by defining a basic amputation function -def ampute(X, missing_rate = 0.75, mech = "MCAR"): +def ampute(X, missing_rate=0.75, mech="MCAR"): n_samples = X.shape[0] n_features = X.shape[1] X_incomplete = X.copy() @@ -70,6 +71,7 @@ def ampute(X, missing_rate = 0.75, mech = "MCAR"): return X_incomplete + # Make a function that calculates the variance of the beta estimates. This is # necessary because the linear regression model from sklearn does not provide # these values. @@ -89,6 +91,7 @@ def calculate_variance_of_beta_estimates(y_true, y_pred, X): ############################################################################### + def get_results_full_dataset(X, y): # Perform linear regression on full data as a way of comparison estimator = LinearRegression() @@ -104,6 +107,7 @@ def get_results_full_dataset(X, y): return full_coefs, full_vars, full_errorbar + def get_results_chained_imputation(X_incomplete, y): # Impute incomplete data with IterativeImputer using single imputation # We set n_burn_in at 99 and use only the last imputation @@ -126,6 +130,7 @@ def get_results_chained_imputation(X_incomplete, y): return chained_coefs, chained_vars, chained_errorbar + def get_results_mice_imputation(X_incomplete, y): # Impute incomplete data using the IterativeImputer to perform multiple # imputation. 
We set n_burn_in at 99 and use only last imputation and @@ -133,7 +138,7 @@ def get_results_mice_imputation(X_incomplete, y): m = 5 multiple_imputations = [] for i in range(m): - imputer = ChainedImputer(n_burn_in=99, n_imputations=1,random_state=i) + imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=i) imputer.fit(X_incomplete) X_imputed = imputer.transform(X_incomplete) multiple_imputations.append(X_imputed) @@ -158,14 +163,15 @@ def get_results_mice_imputation(X_incomplete, y): # datasets. The variance of these estimates is a combination of the # variance of each of the m estimates (Ubar) and the variance between the m # estimates (B). The standard error is the sqrt of the variance. - Qbar = np.mean(m_coefs, axis = 0) - Ubar = np.mean(m_vars, axis = 0) - B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis = 0) + Qbar = np.mean(m_coefs, axis=0) + Ubar = np.mean(m_vars, axis=0) + B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis=0) T = Ubar + B + (B/m) mice_errorbar = 1.96 * np.sqrt(T) return Qbar, T, mice_errorbar + # The original multiple imputation procedure as developed under the name # MICE includes all variables in the imputation process; including the output # variable. The reason to do this is that the imputation model should at least @@ -207,14 +213,15 @@ def get_results_mice_imputation_includingy(X_incomplete, y): # datasets. The variance of these estimates is a combination of the # variance of each of the m estimates (Ubar) and the variance between the m # estimates (B). The standard error is the sqrt of the variance. - Qbar = np.mean(m_coefs, axis = 0) - Ubar = np.mean(m_vars, axis = 0) - B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis = 0) + Qbar = np.mean(m_coefs, axis=0) + Ubar = np.mean(m_vars, axis=0) + B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis=0) T = Ubar + B + (B/m) mice_errorbar = 1.96 * np.sqrt(T) return Qbar, T, mice_errorbar + # Now lets run all these imputation procedures. # We use the Boston dataset and analyze the outcomes of the beta coefficients # and their standard errors. We standardize the data before running the @@ -234,7 +241,7 @@ def get_results_mice_imputation_includingy(X_incomplete, y): print("Executing Example 1 MCAR Missingness") # First, make the data incomplete with a MCAR mechanism. -Boston_X_incomplete_MCAR = ampute(X_scaled, mech = "MCAR") +Boston_X_incomplete_MCAR = ampute(X_scaled, mech="MCAR") # Second, run all the imputation procedures as described above. full_coefs, full_vars, full_errorbar = get_results_full_dataset( @@ -264,13 +271,13 @@ def get_results_mice_imputation_includingy(X_incomplete, y): plt1 = plt.subplot(211) for j in n: plt1.bar(np.arange(len(coefs[j])) + (3*j*(width/n_situations)), - coefs[j], width = width, color = colors[j]) + coefs[j], width=width, color=colors[j]) plt.legend(n_labels) plt2 = plt.subplot(212) for j in n: plt2.bar(np.arange(len(errorbars[j])) + (3*j*(width/n_situations)), - errorbars[j], width = width, color = colors[j]) + errorbars[j], width=width, color=colors[j]) plt1.set_title("MCAR Missingness") plt1.set_ylabel("Beta Coefficients") @@ -285,6 +292,7 @@ def get_results_mice_imputation_includingy(X_incomplete, y): ############################################################################### + # In this example, we show how to apply multiple imputation in a train/test # situation. There are two approaches to get the end result of the prediction # model. 
In approach 1 you calculate the evaluation metric for every i in m and @@ -307,6 +315,7 @@ def get_results_full_data(X_train, X_test, y_train, y_test): return mse_full + # Use the ChainedImputer as a single imputation procedure. def get_results_single_imputation(X_train, X_test, y_train, y_test): # Apply imputation @@ -327,6 +336,7 @@ def get_results_single_imputation(X_train, X_test, y_train, y_test): return mse_single + # Now use the IterativeImputer to perform multiple imputation by looping over # i in m. Approach 1: pool the mse values of the m datasets. def get_results_multiple_imputation_approach1(X_train, X_test, @@ -360,6 +370,7 @@ def get_results_multiple_imputation_approach1(X_train, X_test, return mse_approach1 + # Approach 2: We average the predictions of the m datasets and then calculate # the error metric. def get_results_multiple_imputation_approach2(X_train, X_test, @@ -394,7 +405,8 @@ def get_results_multiple_imputation_approach2(X_train, X_test, return mse_approach2 -def perform_simulation(dataset, X_incomplete, nsim = 10): + +def perform_simulation(dataset, X_incomplete, nsim=10): X_full, y = dataset.data, dataset.target outcome = [] @@ -425,13 +437,14 @@ def perform_simulation(dataset, X_incomplete, nsim = 10): outcome.append((mse_full, mse_single, mse_approach1, mse_approach2)) # Return the mean and standard deviation of the nsim outcome values - return np.mean(outcome, axis = 0), np.std(outcome, axis = 0) + return np.mean(outcome, axis=0), np.std(outcome, axis=0) + # Execute the simulation print("Executing Example 2 MCAR Missingness") # Generate missing values with a MCAR mechanism -Boston_X_incomplete_MCAR = ampute(X_scaled, mech = "MCAR") +Boston_X_incomplete_MCAR = ampute(X_scaled, mech="MCAR") # Perform the simulation mse_means, mse_std = perform_simulation(load_boston(), From ed1db8bb6d5e33c50e354de2246324d7c7f19210 Mon Sep 17 00:00:00 2001 From: RianneSchouten Date: Fri, 29 Jun 2018 16:19:29 +0200 Subject: [PATCH 006/163] put rules in separate functions and include explanation --- examples/plot_multiple_imputation.py | 106 ++++++++++++++++++--------- 1 file changed, 71 insertions(+), 35 deletions(-) diff --git a/examples/plot_multiple_imputation.py b/examples/plot_multiple_imputation.py index 2bcc8c9fd2c27..901781cb294bd 100644 --- a/examples/plot_multiple_imputation.py +++ b/examples/plot_multiple_imputation.py @@ -17,15 +17,44 @@ method where every missing value is imputed multiple times. The procedure results in multiple datasets where the observed data is similar in every dataset, but the imputed data is different. All desired steps after imputation -are performed on every dataset, including the analysis. Then, Rubin's pooling -rules are used to combine the estimates into one final result. - -In this example we will show how to use the ITerativeImputer to perform -multiple imputation, what the effect is on the standard error of beta -coefficients and how to set up a prediction model using multiple imputation. +are performed on every dataset, such as standardization and other feature +engineering steps. The estimation model is also fitted on each of the datasets. + +One final model is obtained by combining the estimates of each model with +Rubin's pooling rules. These rules assume that the parameters of interest are +normally distributed which is the case with, for example, estimates of the mean +and regression coefficients. Other parameters, such as correlation +coefficients need transformation to suit the assumption of normality. 
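+A correlation coefficient, for instance, is commonly pooled on the Fisher
+z scale (z = arctanh(r)) and transformed back with tanh afterwards.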
+If it is not possible to approximate a normal distribution, it is better to use +robust summary measures such as medians or ranges instead of using Rubin’s +pooling rules. This applies to an estimate like explained variance. + +In sum, Rubin’s pooling rules are as follows. The overall point estimate after +multiple imputation (denoted by Qbar) is the average of all the m point +estimates. The variance of the overall point estimate is a combination of +so-called within imputation variance (Ubar) and between imputation +variance (B). Ubar is the average of the m variances of the m point estimates. +Both Qbar and Ubar are corrected with a factor 1 / m to account for sampling +variance. The between imputation variance (B) is the sum of the squared +difference between Qbar and the m point estimates, corrected with a factor +1 / (m – 1). Then, the total variance (T) of the MI overall point estimate is +Ubar + B + B/m. + +In this document we will show how to use the IterativeImputer to perform +multiple imputation. In example 1 we show the effect of Rubin’s pooling +rules on the variance of regression estimates. Due to the between imputation +variance, the standard errors of all regression coefficients are larger with +multiple imputation than with single imputation. This allows for valid +statistical inference making. + +In example 2 we show how to set up a prediction model using multiple imputation. +We compare two approaches. In one approach, we make predictions for each of the +m datasets and combine the m evaluation error metrics into one overall value. +In the other approach, we combine the predictions and calculate one evaluation +error metric over the averaged predictions. A short simulation study shows that +the second approach results in the smallest Mean Squared Error. """ -import math import numpy as np import matplotlib.pyplot as plt from scipy import stats @@ -59,8 +88,7 @@ def ampute(X, missing_rate=0.75, mech="MCAR"): if mech == "MNAR": for i in np.arange(n_features): data_values = -np.mean(X[:, i]) + X[:, i] - weights = list(map(lambda x: math.exp(x) / (1 + math.exp(x)), - data_values)) + weights = 1 / (1 + np.exp(-data_values)) probs = np.array(weights) / np.sum(np.array(weights)) dropped_indices = np.array(np.random.choice(np.arange(n_samples), size=int(missing_rate @@ -84,6 +112,31 @@ def calculate_variance_of_beta_estimates(y_true, y_pred, X): return vars + +# Apply Rubin's pooling rules as follows. +# The value of every estimate is the mean of the estimates in each of the m +# datasets (Qbar). The variance of these estimates is a combination of the +# variance of each of the m estimates (Ubar) and the variance between the m +# estimates (B). +# +# Make a function that calculates Qbar from m estimates +def calculate_Qbar(m_estimates): + m = len(m_estimates) + Qbar = 1/m * np.sum(m_estimates, axis=0) + + return Qbar + + +# Make a function that calculates T from m estimates and their variances +def calculate_T(m_estimates, m_variances, Qbar): + m = len(m_estimates) + Ubar = 1/m * np.sum(m_variances, axis=0) + B = 1/(m - 1) * np.sum((Qbar - m_estimates) ** 2, axis=0) + T = Ubar + B + (B/m) + + return T + + ############################################################################### # EXAMPLE 1. COMPARE STATISTICAL ESTIMATES AND THEIR VARIANCE USING MULTIPLE @@ -156,17 +209,8 @@ def get_results_mice_imputation(X_incomplete, y): y, y_predict, multiple_imputations[i])) # Calculate the end estimates by applying Rubin's rules. 
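# A minimal numeric sketch of the pooling rules wired up above, using
# made-up values for m = 3 analyses (illustrative only; it mirrors
# calculate_Qbar and calculate_T as defined in this patch):
import numpy as np

m_estimates = np.array([0.52, 0.48, 0.55])       # hypothetical point estimates
m_variances = np.array([0.010, 0.012, 0.011])    # their estimated variances

Qbar = np.mean(m_estimates)                      # pooled point estimate
Ubar = np.mean(m_variances)                      # within-imputation variance
B = np.sum((Qbar - m_estimates) ** 2) / (3 - 1)  # between-imputation variance
T = Ubar + B + B / 3                             # total variance; SE = sqrt(T)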
- # Rubin's rules can be slightly different for different types of estimates - # In case of linear regression, these are the rules: - # - # The value of every estimate is the mean of the estimates in each of the m - # datasets. The variance of these estimates is a combination of the - # variance of each of the m estimates (Ubar) and the variance between the m - # estimates (B). The standard error is the sqrt of the variance. - Qbar = np.mean(m_coefs, axis=0) - Ubar = np.mean(m_vars, axis=0) - B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis=0) - T = Ubar + B + (B/m) + Qbar = calculate_Qbar(m_coefs) + T = calculate_T(m_coefs, m_vars, Qbar) mice_errorbar = 1.96 * np.sqrt(T) return Qbar, T, mice_errorbar @@ -206,17 +250,8 @@ def get_results_mice_imputation_includingy(X_incomplete, y): y, y_predict, multiple_imputations[i])) # Calculate the end estimates by applying Rubin's rules. - # Rubin's rules can be slightly different for different types of estimates - # In case of linear regression, these are the rules: - # - # The value of every estimate is the mean of the estimates in each of the m - # datasets. The variance of these estimates is a combination of the - # variance of each of the m estimates (Ubar) and the variance between the m - # estimates (B). The standard error is the sqrt of the variance. - Qbar = np.mean(m_coefs, axis=0) - Ubar = np.mean(m_vars, axis=0) - B = (1 / (m-1)) * np.mean((Qbar - m_coefs) ** 2, axis=0) - T = Ubar + B + (B/m) + Qbar = calculate_Qbar(m_coefs) + T = calculate_T(m_coefs, m_vars, Qbar) mice_errorbar = 1.96 * np.sqrt(T) return Qbar, T, mice_errorbar @@ -238,7 +273,7 @@ def get_results_mice_imputation_includingy(X_incomplete, y): y_scaled = stats.zscore(y) # Start the procedure -print("Executing Example 1 MCAR Missingness") +print("Executing Example 1 MCAR Missingness...") # First, make the data incomplete with a MCAR mechanism. Boston_X_incomplete_MCAR = ampute(X_scaled, mech="MCAR") @@ -434,14 +469,15 @@ def perform_simulation(dataset, X_incomplete, nsim=10): X_incomplete_train, X_incomplete_test, y_train, y_test) # Save the outcome of every simulation round - outcome.append((mse_full, mse_single, mse_approach1, mse_approach2)) + outcome.append((mse_full, mse_single, mse_approach1, + mse_approach2)) # Return the mean and standard deviation of the nsim outcome values return np.mean(outcome, axis=0), np.std(outcome, axis=0) # Execute the simulation -print("Executing Example 2 MCAR Missingness") +print("Executing Example 2 MCAR Missingness...") # Generate missing values with a MCAR mechanism Boston_X_incomplete_MCAR = ampute(X_scaled, mech="MCAR") @@ -449,7 +485,7 @@ def perform_simulation(dataset, X_incomplete, nsim=10): # Perform the simulation mse_means, mse_std = perform_simulation(load_boston(), Boston_X_incomplete_MCAR, - nsim=10) + nsim=2) # Plot results n_situations = 4 From 7c3fb9705a21f34bd392361233111ea6976054d8 Mon Sep 17 00:00:00 2001 From: RianneSchouten Date: Fri, 29 Jun 2018 17:01:27 +0200 Subject: [PATCH 007/163] line from 80 to 79 characters --- examples/plot_multiple_imputation.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/plot_multiple_imputation.py b/examples/plot_multiple_imputation.py index 901781cb294bd..781d29db0fd4b 100644 --- a/examples/plot_multiple_imputation.py +++ b/examples/plot_multiple_imputation.py @@ -47,12 +47,13 @@ multiple imputation than with single imputation. This allows for valid statistical inference making. 
-In example 2 we show how to set up a prediction model using multiple imputation. -We compare two approaches. In one approach, we make predictions for each of the -m datasets and combine the m evaluation error metrics into one overall value. -In the other approach, we combine the predictions and calculate one evaluation -error metric over the averaged predictions. A short simulation study shows that -the second approach results in the smallest Mean Squared Error. +In example 2 we show how to set up a prediction model using multiple +imputation. We compare two approaches. In one approach, we make predictions for +each of the m datasets and combine the m evaluation error metrics into one +overall value. In the other approach, we combine the predictions and calculate +one evaluation error metric over the averaged predictions. A short simulation +study shows that the second approach results in the smallest Mean Squared +Error. """ import numpy as np From 40dd0bf6b62ef3824ff68af5d3c51d4412fd4e67 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Mon, 3 Sep 2018 16:05:59 +0800 Subject: [PATCH 008/163] DOC Format in DBSCAN --- sklearn/cluster/dbscan_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index f10890e10f2c8..c1239b1388dce 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -233,7 +233,7 @@ class DBSCAN(BaseEstimator, ClusterMixin): n_jobs : int or None, optional (default=None) The number of parallel jobs to run. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. From 721ebaece0a1829eecaf0ab0a597b52de97b0d8c Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 3 Sep 2018 16:36:40 +0200 Subject: [PATCH 009/163] MNT Change max_bound -> max_eps in OPTICS (#11984) --- doc/modules/clustering.rst | 10 +++---- sklearn/cluster/optics_.py | 40 ++++++++++++++-------------- sklearn/cluster/tests/test_optics.py | 10 +++---- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 968a66e67fdcf..1f8210f35ffb4 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -838,9 +838,9 @@ algorithm builds a *reachability* graph, which assigns each sample both a ``reachability_`` distance, and a spot within the cluster ``ordering_`` attribute; these two attributes are assigned when the model is fitted, and are used to determine cluster membership. If OPTICS is run with the default value -of *inf* set for ``max_bound``, then DBSCAN style cluster extraction can be +of *inf* set for ``max_eps``, then DBSCAN style cluster extraction can be performed in linear time for any given ``eps`` value using the -``extract_dbscan`` method. Setting ``max_bound`` to a lower value will result +``extract_dbscan`` method. Setting ``max_eps`` to a lower value will result in shorter run times, and can be thought of as the maximum cluster object size (in diameter) that OPTICS will be able to extract. @@ -892,10 +892,10 @@ larger parent cluster. shorter run time than OPTICS; however, for repeated runs at varying ``eps`` values, a single run of OPTICS may require less cumulative runtime than DBSCAN. It is also important to note that OPTICS output can be unstable at - ``eps`` values very close to the initial ``max_bound`` value. 
OPTICS seems + ``eps`` values very close to the initial ``max_eps`` value. OPTICS seems to produce near identical results to DBSCAN provided that ``eps`` passed to ``extract_dbscan`` is a half order of magnitude less than the inital - ``max_bound`` that was used to fit; using a value close to ``max_bound`` + ``max_eps`` that was used to fit; using a value close to ``max_eps`` will throw a warning, and using a value larger will result in an exception. .. topic:: Computational Complexity @@ -909,7 +909,7 @@ larger parent cluster. multithreaded, and has better algorithmic runtime complexity than OPTICS-- at the cost of worse memory scaling. For extremely large datasets that exhaust system memory using HDBSCAN, OPTICS will maintain *n* (as opposed - to *n^2* memory scaling); however, tuning of the ``max_bound`` parameter + to *n^2* memory scaling); however, tuning of the ``max_eps`` parameter will likely need to be used to give a solution in a reasonable amount of wall time. diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index e10a92a7590e6..bc0fe5bfe7ceb 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -21,7 +21,7 @@ from ._optics_inner import quick_scan -def optics(X, min_samples=5, max_bound=np.inf, metric='euclidean', +def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size_ratio=.005, @@ -45,11 +45,11 @@ def optics(X, min_samples=5, max_bound=np.inf, metric='euclidean', The number of samples in a neighborhood for a point to be considered as a core point. - max_bound : float, optional + max_eps : float, optional The maximum distance between two samples for them to be considered as in the same neighborhood. This is also the largest object size expected within the dataset. Default value of "np.inf" will identify - clusters across all scales; reducing `max_bound` will result in + clusters across all scales; reducing `max_eps` will result in shorter run times. metric : string or callable, optional @@ -147,7 +147,7 @@ def optics(X, min_samples=5, max_bound=np.inf, metric='euclidean', Record 28, no. 2 (1999): 49-60. """ - clust = OPTICS(min_samples, max_bound, metric, p, metric_params, + clust = OPTICS(min_samples, max_eps, metric, p, metric_params, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, min_cluster_size_ratio, min_maxima_ratio, @@ -172,11 +172,11 @@ class OPTICS(BaseEstimator, ClusterMixin): The number of samples in a neighborhood for a point to be considered as a core point. - max_bound : float, optional + max_eps : float, optional The maximum distance between two samples for them to be considered as in the same neighborhood. This is also the largest object size expected within the dataset. Default value of "np.inf" will identify - clusters across all scales; reducing `max_bound` will result in + clusters across all scales; reducing `max_eps` will result in shorter run times. metric : string or callable, optional @@ -284,14 +284,14 @@ class OPTICS(BaseEstimator, ClusterMixin): Record 28, no. 2 (1999): 49-60. 
""" - def __init__(self, min_samples=5, max_bound=np.inf, metric='euclidean', + def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size_ratio=.005, min_maxima_ratio=0.001, algorithm='ball_tree', leaf_size=30, n_jobs=None): - self.max_bound = max_bound + self.max_eps = max_eps self.min_samples = min_samples self.maxima_ratio = maxima_ratio self.rejection_ratio = rejection_ratio @@ -310,7 +310,7 @@ def fit(self, X, y=None): """Perform OPTICS clustering Extracts an ordered list of points and reachability distances, and - performs initial clustering using `max_bound` distance specified at + performs initial clustering using `max_eps` distance specified at OPTICS object instantiation. Parameters @@ -378,7 +378,7 @@ def fit(self, X, y=None): def _expand_cluster_order(self, point, X, nbrs): # As above, not parallelizable. Parallelizing would allow items in # the 'unprocessed' list to switch to 'processed' - if self.core_distances_[point] <= self.max_bound: + if self.core_distances_[point] <= self.max_eps: while not self._processed[point]: self._processed[point] = True self.ordering_.append(point) @@ -389,7 +389,7 @@ def _expand_cluster_order(self, point, X, nbrs): def _set_reach_dist(self, point_index, X, nbrs): P = np.array(X[point_index]).reshape(1, -1) - indices = nbrs.radius_neighbors(P, radius=self.max_bound, + indices = nbrs.radius_neighbors(P, radius=self.max_eps, return_distance=False)[0] # Getting indices of neighbors that have not been processed @@ -416,17 +416,17 @@ def _set_reach_dist(self, point_index, X, nbrs): def extract_dbscan(self, eps): """Performs DBSCAN extraction for an arbitrary epsilon. - Extraction runs in linear time. Note that if the `max_bound` OPTICS + Extraction runs in linear time. Note that if the `max_eps` OPTICS parameter was set to < inf for extracting reachability and ordering arrays, DBSCAN extractions will be unstable for `eps` values close to - `max_bound`. Setting `eps` < (`max_bound` / 5.0) will guarantee + `max_eps`. Setting `eps` < (`max_eps` / 5.0) will guarantee extraction parity with DBSCAN. Parameters ---------- eps : float or int, required - DBSCAN `eps` parameter. Must be set to < `max_bound`. Equivalence - with DBSCAN algorithm is achieved if `eps` is < (`max_bound` / 5) + DBSCAN `eps` parameter. Must be set to < `max_eps`. Equivalence + with DBSCAN algorithm is achieved if `eps` is < (`max_eps` / 5) Returns ------- @@ -438,14 +438,14 @@ def extract_dbscan(self, eps): """ check_is_fitted(self, 'reachability_') - if eps > self.max_bound: + if eps > self.max_eps: raise ValueError('Specify an epsilon smaller than %s. Got %s.' - % (self.max_bound, eps)) + % (self.max_eps, eps)) - if eps * 5.0 > (self.max_bound * 1.05): + if eps * 5.0 > (self.max_eps * 1.05): warnings.warn( - "Warning, max_bound (%s) is close to eps (%s): " - "Output may be unstable." % (self.max_bound, eps), + "Warning, max_eps (%s) is close to eps (%s): " + "Output may be unstable." % (self.max_eps, eps), RuntimeWarning, stacklevel=2) # Stability warning is documented in _extract_dbscan method... 
diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 2116e75bf4a54..5a89cb7a0c439 100755 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -27,7 +27,7 @@ def test_correct_number_of_clusters(): X = generate_clustered_data(n_clusters=n_clusters) # Parameters chosen specifically for this task. # Compute OPTICS - clust = OPTICS(max_bound=5.0 * 6.0, min_samples=4, metric='euclidean') + clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, metric='euclidean') clust.fit(X) # number of clusters, ignoring noise if present n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) @@ -41,7 +41,7 @@ def test_minimum_number_of_sample_check(): # Compute OPTICS X = [[1, 1]] - clust = OPTICS(max_bound=5.0 * 0.3, min_samples=10) + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10) # Run the fit assert_raise_message(ValueError, msg, clust.fit, X) @@ -51,7 +51,7 @@ def test_empty_extract(): # Test extract where fit() has not yet been run. msg = ("This OPTICS instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") - clust = OPTICS(max_bound=5.0 * 0.3, min_samples=10) + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10) assert_raise_message(ValueError, msg, clust.extract_dbscan, 0.01) @@ -63,7 +63,7 @@ def test_bad_extract(): cluster_std=0.4, random_state=0) # Compute OPTICS - clust = OPTICS(max_bound=5.0 * 0.003, min_samples=10) + clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10) clust2 = clust.fit(X) assert_raise_message(ValueError, msg, clust2.extract_dbscan, 0.3) @@ -76,7 +76,7 @@ def test_close_extract(): cluster_std=0.4, random_state=0) # Compute OPTICS - clust = OPTICS(max_bound=1.0, min_samples=10) + clust = OPTICS(max_eps=1.0, min_samples=10) clust3 = clust.fit(X) # check warning when centers are passed assert_warns(RuntimeWarning, clust3.extract_dbscan, .3) From 84c4e544a0496bf382232e5c5bc7abf7eb699d70 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 4 Sep 2018 01:39:18 +1000 Subject: [PATCH 010/163] COSMIT remove unnecessary _TreeNode methods (#11983) --- sklearn/cluster/optics_.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index bc0fe5bfe7ceb..306fec73939e5 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -612,12 +612,6 @@ def __init__(self, points, start, end, parent_node): self.children = [] self.split_point = -1 - def assign_split_point(self, split_point): - self.split_point = split_point - - def add_child(self, child): - self.children.append(child) - def _is_local_maxima(index, reachability_plot, neighborhood_size): right_idx = slice(index + 1, index + neighborhood_size + 1) @@ -661,7 +655,7 @@ def _cluster_tree(node, parent_node, local_maxima_points, # take largest local maximum as possible separation between clusters s = local_maxima_points[0] - node.assign_split_point(s) + node.split_point = s local_maxima_points = local_maxima_points[1:] # create two new nodes and add to list of nodes @@ -683,7 +677,7 @@ def _cluster_tree(node, parent_node, local_maxima_points, node_list.append((node_2, local_max_2)) if reachability_plot[s] < significant_min: - node.assign_split_point(-1) + node.split_point = -1 # if split_point is not significant, ignore this split and continue _cluster_tree(node, parent_node, local_maxima_points, reachability_plot, reachability_ordering, @@ -715,7 +709,7 @@ def _cluster_tree(node, parent_node, local_maxima_points, 
(avg_reach2 / reachability_plot[s]) >= rejection_ratio): # since split_point is not significant, # ignore this split and continue (reject both child nodes) - node.assign_split_point(-1) + node.split_point = -1 _cluster_tree(node, parent_node, local_maxima_points, reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, @@ -733,7 +727,7 @@ def _cluster_tree(node, parent_node, local_maxima_points, node_list.remove((node_2, local_max_2)) if not node_list: # parent_node will be a leaf - node.assign_split_point(-1) + node.split_point = -1 return # Check if nodes can be moved up one level - the new cluster created @@ -748,13 +742,13 @@ def _cluster_tree(node, parent_node, local_maxima_points, for nl in node_list: if bypass_node == 1: - parent_node.add_child(nl[0]) + parent_node.children.append(nl[0]) _cluster_tree(nl[0], parent_node, nl[1], reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, similarity_threshold, significant_min) else: - node.add_child(nl[0]) + node.children.append(nl[0]) _cluster_tree(nl[0], node, nl[1], reachability_plot, reachability_ordering, min_cluster_size, maxima_ratio, rejection_ratio, From 07051bc04cc6746fb3370cb7ba0e246784372014 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 4 Sep 2018 00:23:16 +0200 Subject: [PATCH 011/163] DOC OPTICS: improve docstring and add default values. (#11987) --- sklearn/cluster/optics_.py | 54 ++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 306fec73939e5..272f987dc9177 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -41,20 +41,19 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', X : array, shape (n_samples, n_features) The data. - min_samples : int + min_samples : int (default=5) The number of samples in a neighborhood for a point to be considered as a core point. - max_eps : float, optional + max_eps : float, optional (default=np.inf) The maximum distance between two samples for them to be considered - as in the same neighborhood. This is also the largest object size - expected within the dataset. Default value of "np.inf" will identify + as in the same neighborhood. Default value of "np.inf" will identify clusters across all scales; reducing `max_eps` will result in shorter run times. - metric : string or callable, optional + metric : string or callable, optional (default='euclidean') The distance metric to use for neighborhood lookups. Default is - "minkowski". Other options include "euclidean", "manhattan", + "euclidean". Other options include "minkowski", "manhattan", "chebyshev", "haversine", "seuclidean", "hamming", "canberra", and "braycurtis". The "wminkowski" and "mahalanobis" metrics are also valid with an additional argument. @@ -68,20 +67,20 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', metric_params : dict, optional (default=None) Additional keyword arguments for the metric function. - maxima_ratio : float, optional + maxima_ratio : float, optional (default=.75) The maximum ratio we allow of average height of clusters on the right and left to the local maxima in question. The higher the ratio, the more generous the algorithm is to preserving local minima, and the more cuts the resulting tree will have. - rejection_ratio : float, optional + rejection_ratio : float, optional (default=.7) Adjusts the fitness of the clustering. 
When the maxima_ratio is exceeded, determine which of the clusters to the left and right to reject based on rejection_ratio. Higher values will result in points being more readily classified as noise; conversely, lower values will result in more points being clustered. - similarity_threshold : float, optional + similarity_threshold : float, optional (default=.4) Used to check if nodes can be moved up one level, that is, if the new cluster created is too "similar" to its parent, given the similarity threshold. Similarity can be determined by 1) the size @@ -91,19 +90,21 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', node. A lower value for the similarity threshold means less levels in the tree. - significant_min : float, optional + significant_min : float, optional (default=.003) Sets a lower threshold on how small a significant maxima can be. - min_cluster_size_ratio : float, optional + min_cluster_size_ratio : float, optional (default=.005) Minimum percentage of dataset expected for cluster membership. - min_maxima_ratio : float, optional + min_maxima_ratio : float, optional (default=.001) Used to determine neighborhood size for minimum cluster membership. + Each local maxima should be a largest value in a neighborhood + of the `size min_maxima_ratio * len(X)` from left and right. algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - - 'ball_tree' will use :class:`BallTree` + - 'ball_tree' will use :class:`BallTree` (default) - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm @@ -168,20 +169,19 @@ class OPTICS(BaseEstimator, ClusterMixin): Parameters ---------- - min_samples : int + min_samples : int (default=5) The number of samples in a neighborhood for a point to be considered as a core point. - max_eps : float, optional + max_eps : float, optional (default=np.inf) The maximum distance between two samples for them to be considered - as in the same neighborhood. This is also the largest object size - expected within the dataset. Default value of "np.inf" will identify + as in the same neighborhood. Default value of "np.inf" will identify clusters across all scales; reducing `max_eps` will result in shorter run times. - metric : string or callable, optional + metric : string or callable, optional (default='euclidean') The distance metric to use for neighborhood lookups. Default is - "minkowski". Other options include "euclidean", "manhattan", + "euclidean". Other options include "minkowski", "manhattan", "chebyshev", "haversine", "seuclidean", "hamming", "canberra", and "braycurtis". The "wminkowski" and "mahalanobis" metrics are also valid with an additional argument. @@ -195,20 +195,20 @@ class OPTICS(BaseEstimator, ClusterMixin): metric_params : dict, optional (default=None) Additional keyword arguments for the metric function. - maxima_ratio : float, optional + maxima_ratio : float, optional (default=.75) The maximum ratio we allow of average height of clusters on the right and left to the local maxima in question. The higher the ratio, the more generous the algorithm is to preserving local minima, and the more cuts the resulting tree will have. - rejection_ratio : float, optional + rejection_ratio : float, optional (default=.7) Adjusts the fitness of the clustering. When the maxima_ratio is exceeded, determine which of the clusters to the left and right to reject based on rejection_ratio. 
Higher values will result in points being more readily classified as noise; conversely, lower values will result in more points being clustered. - similarity_threshold : float, optional + similarity_threshold : float, optional (default=.4) Used to check if nodes can be moved up one level, that is, if the new cluster created is too "similar" to its parent, given the similarity threshold. Similarity can be determined by 1) the size @@ -218,19 +218,21 @@ class OPTICS(BaseEstimator, ClusterMixin): node. A lower value for the similarity threshold means less levels in the tree. - significant_min : float, optional + significant_min : float, optional (default=.003) Sets a lower threshold on how small a significant maxima can be. - min_cluster_size_ratio : float, optional + min_cluster_size_ratio : float, optional (default=.005) Minimum percentage of dataset expected for cluster membership. - min_maxima_ratio : float, optional + min_maxima_ratio : float, optional (default=.001) Used to determine neighborhood size for minimum cluster membership. + Each local maxima should be a largest value in a neighborhood + of the `size min_maxima_ratio * len(X)` from left and right. algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - - 'ball_tree' will use :class:`BallTree` + - 'ball_tree' will use :class:`BallTree` (default) - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm From ddf37c75c7b912104df56e1325363cd94a4fdd5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Szyma=C5=84ski?= Date: Tue, 4 Sep 2018 01:49:42 +0200 Subject: [PATCH 012/163] DOC adding scikit-multilearn to related projects list (#11988) --- doc/related_projects.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 9e5d5a32c0575..ce5f5c24dbf3a 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -183,7 +183,10 @@ and tasks. - `multiisotonic `_ Isotonic regression on multidimensional features. - + +- `scikit-multilearn `_ Multi-label classification with + focus on label space manipulation. + - `seglearn `_ Time series and sequence learning using sliding window segmentation. From a0418215fe34b68d96f611de08ba2558c4d791fb Mon Sep 17 00:00:00 2001 From: William de Vazelhes <31916524+wdevazelhes@users.noreply.github.com> Date: Tue, 4 Sep 2018 23:36:15 +0200 Subject: [PATCH 013/163] TST FIX use match rather than message in pytest.raises (#12001) Previously these assertions would pass without matching. 
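To illustrate the difference (a hypothetical test, not from this patch; note
that regex metacharacters such as parentheses must be escaped, hence the
`\(0\)` changes below):

    import pytest

    def test_match_checks_exception_text():
        # `match` is re.search-ed against str(exc), so this fails if the
        # raised exception's text does not match the pattern
        with pytest.raises(TypeError,
                           match=r"Parameter grid is not a dict \(0\)"):
            raise TypeError("Parameter grid is not a dict (0)")

With `message=...`, pytest only customizes the failure report shown when no
exception is raised at all, so the exception text was never actually checked.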
--- sklearn/model_selection/tests/test_search.py | 8 ++++---- sklearn/utils/tests/test_validation.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 0409794bf08eb..969b6288a71e8 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -133,13 +133,13 @@ def assert_grid_iter_equals_getitem(grid): @pytest.mark.parametrize( "input, error_type, error_message", - [(0, TypeError, 'Parameter grid is not a dict or a list (0)'), - ([{'foo': [0]}, 0], TypeError, 'Parameter grid is not a dict (0)'), + [(0, TypeError, 'Parameter grid is not a dict or a list \(0\)'), + ([{'foo': [0]}, 0], TypeError, 'Parameter grid is not a dict \(0\)'), ({'foo': 0}, TypeError, "Parameter grid value is not iterable " - "(key='foo', value=0)")] + "\(key='foo', value=0\)")] ) def test_validate_parameter_grid_input(input, error_type, error_message): - with pytest.raises(error_type, message=error_message): + with pytest.raises(error_type, match=error_message): ParameterGrid(input) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 3e577ebaa8eec..5b32d9e2115d3 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -172,7 +172,7 @@ def test_check_array_force_all_finite_valid(value, force_all_finite, retype): (np.inf, 'allow-nan', 'Input contains infinity'), (np.nan, True, 'Input contains NaN, infinity'), (np.nan, 'allow-inf', 'force_all_finite should be a bool or "allow-nan"'), - (np.nan, 1, 'force_all_finite should be a bool or "allow-nan"')] + (np.nan, 1, 'Input contains NaN, infinity')] ) @pytest.mark.parametrize( "retype", @@ -182,7 +182,7 @@ def test_check_array_force_all_finiteinvalid(value, force_all_finite, match_msg, retype): X = retype(np.arange(4).reshape(2, 2).astype(np.float)) X[0, 0] = value - with pytest.raises(ValueError, message=match_msg): + with pytest.raises(ValueError, match=match_msg): check_array(X, force_all_finite=force_all_finite, accept_sparse=True) From efeb23dbba80d03ba96d39c262f49c9cf9e279e4 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 5 Sep 2018 19:02:23 +1000 Subject: [PATCH 014/163] DOC note controversy on multiclass balanced accuracy definition (#11994) --- sklearn/metrics/classification.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 52a07df9aea29..60f47980d6a17 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -1403,6 +1403,13 @@ def balanced_accuracy_score(y_true, y_pred, sample_weight=None, -------- recall_score, roc_auc_score + Notes + ----- + Some literature promotes alternative definitions of balanced accuracy. Our + definition is equivalent to :func:`accuracy_score` with class-balanced + sample weights, and shares desirable properties with the binary case. + See the :ref:`User Guide `. + References ---------- .. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010). From dff84c81949374ca49eae2b1b2d267c5b5f12505 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 5 Sep 2018 05:11:00 -0400 Subject: [PATCH 015/163] MNT Use `fmax` when finding the maximum (#12005) Instead of adding an `if` to check for values that become the new max, simply use `fmax` to get the maximum and update the value. This improves readability. 
It may improve performance as `fmax` can be a single assembly instruction. Though most compilers can probably figure this out anyways. --- sklearn/linear_model/cd_fast.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx index cd044824b4b7a..c75ad0f667d46 100644 --- a/sklearn/linear_model/cd_fast.pyx +++ b/sklearn/linear_model/cd_fast.pyx @@ -251,11 +251,9 @@ def enet_coordinate_descent(floating[::1] w, # update the maximum absolute coefficient update d_w_ii = fabs(w[ii] - w_ii) - if d_w_ii > d_w_max: - d_w_max = d_w_ii + d_w_max = fmax(d_w_max, d_w_ii) - if fabs(w[ii]) > w_max: - w_max = fabs(w[ii]) + w_max = fmax(w_max, fabs(w[ii])) if (w_max == 0.0 or d_w_max / w_max < d_w_tol or From 8d5b08d94db2e19382f67edaff2a4b80ab6605de Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Wed, 5 Sep 2018 11:15:35 +0200 Subject: [PATCH 016/163] DOC small changes in outlier detection documentation (#12003) --- doc/modules/outlier_detection.rst | 34 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 9dbe013bef5d7..3482d4246cda7 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -8,9 +8,9 @@ Novelty and Outlier Detection Many applications require being able to decide whether a new observation belongs to the same distribution as existing observations (it is an -`inlier`), or should be considered as different (it is an outlier). +*inlier*), or should be considered as different (it is an *outlier*). Often, this ability is used to clean real data sets. Two important -distinction must be made: +distinctions must be made: :outlier detection: The training data contains outliers which are defined as observations that @@ -35,7 +35,7 @@ a low density region of the training data, considered as normal in this context. The scikit-learn project provides a set of machine learning tools that -can be used both for novelty or outliers detection. This strategy is +can be used both for novelty or outlier detection. This strategy is implemented with objects learning in an unsupervised way from the data:: estimator.fit(X_train) @@ -77,6 +77,18 @@ not available. The scores of abnormality of the training samples are always accessible through the ``negative_outlier_factor_`` attribute. +The behavior of :class:`neighbors.LocalOutlierFactor` is summarized in the +following table. + +===================== ================================ ===================== +Method Outlier detection Novelty detection +===================== ================================ ===================== +``fit_predict`` OK Not available +``predict`` Not available Use only on new data +``decision_function`` Not available Use only on new data +``score_samples`` Use ``negative_outlier_factor_`` Use only on new data +===================== ================================ ===================== + Overview of outlier detection methods ===================================== @@ -162,7 +174,7 @@ Outlier Detection Outlier detection is similar to novelty detection in the sense that the goal is to separate a core of regular observations from some -polluting ones, called "outliers". Yet, in the case of outlier +polluting ones, called *outliers*. Yet, in the case of outlier detection, we don't have a clean data set representing the population of regular observations that can be used to train any tool. 
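A minimal sketch of the novelty workflow summarized in the table above
(assuming `LocalOutlierFactor` with the `novelty` parameter as described in
this document):

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    rng = np.random.RandomState(42)
    X_train = rng.randn(100, 2)                  # clean training data
    X_new = np.array([[0.0, 0.0], [6.0, 6.0]])   # new observations to score

    lof = LocalOutlierFactor(n_neighbors=20, novelty=True).fit(X_train)
    pred = lof.predict(X_new)              # +1 for inliers, -1 for outliers
    scores = lof.decision_function(X_new)  # use only on new data
    train_scores = lof.negative_outlier_factor_  # training-sample scores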
@@ -341,19 +353,7 @@ Note that ``fit_predict`` is not available in this case. The scores of abnormality of the training samples are always accessible through the ``negative_outlier_factor_`` attribute. -The behavior of LOF is summarized in the following table. - -==================== ================================ ===================== -Method Outlier detection Novelty detection -==================== ================================ ===================== -`fit_predict` OK Not available -`predict` Not available Use only on test data -`decision_function` Not available Use only on test data -`score_samples` Use `negative_outlier_factor_` Use only on test data -==================== ================================ ===================== - - -This strategy is illustrated below. +Novelty detection with Local Outlier Factor is illustrated below. .. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_novelty_detection_001.png :target: ../auto_examples/neighbors/sphx_glr_plot_lof_novelty_detection.html From c2682309206a9e6b298a00479af8b82b80e444f4 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 5 Sep 2018 17:51:49 +0800 Subject: [PATCH 017/163] MNT Remove n_clusters_ in OPTICS (#11981) --- sklearn/cluster/optics_.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 272f987dc9177..5c20ddb421845 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -372,7 +372,6 @@ def fit(self, X, y=None): self.min_cluster_size_ratio, self.min_maxima_ratio) self.core_sample_indices_ = indices_ - self.n_clusters_ = np.max(self.labels_) return self # OPTICS helper functions; these should not be public # From 5e101a2a07ea3586fe663598495cdc3893cb7665 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 5 Sep 2018 13:02:48 +0200 Subject: [PATCH 018/163] Joblib 0.12.4 (#12007) This should fix #11971 (fixed PyPy support, pypy3 is now part of the joblib build matrix on travis). It should also be backported to 0.20.X. --- sklearn/externals/copy_joblib.sh | 2 +- sklearn/externals/joblib/__init__.py | 2 +- .../joblib/externals/loky/__init__.py | 20 +- .../externals/joblib/externals/loky/_base.py | 1055 +++++++++-------- .../joblib/externals/loky/backend/compat.py | 4 +- .../joblib/externals/loky/backend/context.py | 24 +- .../externals/loky/backend/reduction.py | 11 +- .../joblib/externals/loky/process_executor.py | 30 +- sklearn/externals/joblib/memory.py | 34 +- 9 files changed, 617 insertions(+), 565 deletions(-) diff --git a/sklearn/externals/copy_joblib.sh b/sklearn/externals/copy_joblib.sh index 878413297759f..f2c4ab3ed359b 100755 --- a/sklearn/externals/copy_joblib.sh +++ b/sklearn/externals/copy_joblib.sh @@ -11,7 +11,7 @@ else JOBLIB=$1 fi -pip install $JOBLIB --target $INSTALL_FOLDER +pip install --no-cache $JOBLIB --target $INSTALL_FOLDER cp -r $INSTALL_FOLDER/joblib joblib rm -rf $INSTALL_FOLDER diff --git a/sklearn/externals/joblib/__init__.py b/sklearn/externals/joblib/__init__.py index 5953feeb92a52..a42646eb4c754 100644 --- a/sklearn/externals/joblib/__init__.py +++ b/sklearn/externals/joblib/__init__.py @@ -106,7 +106,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
# 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.12.3' +__version__ = '0.12.4' from .memory import Memory, MemorizedResult, register_store_backend diff --git a/sklearn/externals/joblib/externals/loky/__init__.py b/sklearn/externals/joblib/externals/loky/__init__.py index 6c5296210e427..18c01d0a6aa04 100644 --- a/sklearn/externals/joblib/externals/loky/__init__.py +++ b/sklearn/externals/joblib/externals/loky/__init__.py @@ -3,10 +3,20 @@ :class:`ProcessPoolExecutor` and a function :func:`get_reusable_executor` which hide the pool management under the hood. """ -from .reusable_executor import get_reusable_executor # noqa: F401 -from .process_executor import ProcessPoolExecutor # noqa: F401 -from .process_executor import BrokenProcessPool # noqa: F401 +from ._base import Executor, Future +from ._base import wait, as_completed +from ._base import TimeoutError, CancelledError +from ._base import ALL_COMPLETED, FIRST_COMPLETED, FIRST_EXCEPTION -from .backend.context import cpu_count # noqa: F401 +from .backend.context import cpu_count +from .reusable_executor import get_reusable_executor +from .process_executor import BrokenProcessPool, ProcessPoolExecutor -__version__ = '2.2.2' + +__all__ = ["get_reusable_executor", "cpu_count", "wait", "as_completed", + "Future", "Executor", "ProcessPoolExecutor", + "BrokenProcessPool", "CancelledError", "TimeoutError", + "FIRST_COMPLETED", "FIRST_EXCEPTION", "ALL_COMPLETED", ] + + +__version__ = '2.3.0' diff --git a/sklearn/externals/joblib/externals/loky/_base.py b/sklearn/externals/joblib/externals/loky/_base.py index ff4ac92cf402d..92422bbf3f2a4 100644 --- a/sklearn/externals/joblib/externals/loky/_base.py +++ b/sklearn/externals/joblib/externals/loky/_base.py @@ -11,46 +11,58 @@ # Licensed to PSF under a Contributor Agreement. import sys -import collections +import time import logging import threading -import time +import collections + + +if sys.version_info[:2] >= (3, 3): + + from concurrent.futures import wait, as_completed + from concurrent.futures import TimeoutError, CancelledError + from concurrent.futures import Executor, Future as _BaseFuture + + from concurrent.futures import FIRST_EXCEPTION + from concurrent.futures import ALL_COMPLETED, FIRST_COMPLETED + + from concurrent.futures._base import LOGGER + from concurrent.futures._base import PENDING, RUNNING, CANCELLED + from concurrent.futures._base import CANCELLED_AND_NOTIFIED, FINISHED +else: -FIRST_COMPLETED = 'FIRST_COMPLETED' -FIRST_EXCEPTION = 'FIRST_EXCEPTION' -ALL_COMPLETED = 'ALL_COMPLETED' -_AS_COMPLETED = '_AS_COMPLETED' - -# Possible future states (for internal use by the futures package). -PENDING = 'PENDING' -RUNNING = 'RUNNING' -# The future was cancelled by the user... -CANCELLED = 'CANCELLED' -# ...and _Waiter.add_cancelled() was called by a worker. -CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED' -FINISHED = 'FINISHED' - -_FUTURE_STATES = [ - PENDING, - RUNNING, - CANCELLED, - CANCELLED_AND_NOTIFIED, - FINISHED -] - -_STATE_TO_DESCRIPTION_MAP = { - PENDING: "pending", - RUNNING: "running", - CANCELLED: "cancelled", - CANCELLED_AND_NOTIFIED: "cancelled", - FINISHED: "finished" -} - -# Logger for internal use by the futures package. -LOGGER = logging.getLogger("concurrent.futures") - - -if sys.version_info[:2] < (3, 3): + FIRST_COMPLETED = 'FIRST_COMPLETED' + FIRST_EXCEPTION = 'FIRST_EXCEPTION' + ALL_COMPLETED = 'ALL_COMPLETED' + _AS_COMPLETED = '_AS_COMPLETED' + + # Possible future states (for internal use by the futures package). 
+ PENDING = 'PENDING' + RUNNING = 'RUNNING' + # The future was cancelled by the user... + CANCELLED = 'CANCELLED' + # ...and _Waiter.add_cancelled() was called by a worker. + CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED' + FINISHED = 'FINISHED' + + _FUTURE_STATES = [ + PENDING, + RUNNING, + CANCELLED, + CANCELLED_AND_NOTIFIED, + FINISHED + ] + + _STATE_TO_DESCRIPTION_MAP = { + PENDING: "pending", + RUNNING: "running", + CANCELLED: "cancelled", + CANCELLED_AND_NOTIFIED: "cancelled", + FINISHED: "finished" + } + + # Logger for internal use by the futures package. + LOGGER = logging.getLogger("concurrent.futures") class Error(Exception): """Base class for all future-related exceptions.""" @@ -63,548 +75,553 @@ class CancelledError(Error): class TimeoutError(Error): """The operation exceeded the given deadline.""" pass -else: - from concurrent.futures import CancelledError, TimeoutError + class _Waiter(object): + """Provides the event that wait() and as_completed() block on.""" + def __init__(self): + self.event = threading.Event() + self.finished_futures = [] -class _Waiter(object): - """Provides the event that wait() and as_completed() block on.""" - def __init__(self): - self.event = threading.Event() - self.finished_futures = [] + def add_result(self, future): + self.finished_futures.append(future) - def add_result(self, future): - self.finished_futures.append(future) + def add_exception(self, future): + self.finished_futures.append(future) - def add_exception(self, future): - self.finished_futures.append(future) + def add_cancelled(self, future): + self.finished_futures.append(future) - def add_cancelled(self, future): - self.finished_futures.append(future) + class _AsCompletedWaiter(_Waiter): + """Used by as_completed().""" + def __init__(self): + super(_AsCompletedWaiter, self).__init__() + self.lock = threading.Lock() -class _AsCompletedWaiter(_Waiter): - """Used by as_completed().""" + def add_result(self, future): + with self.lock: + super(_AsCompletedWaiter, self).add_result(future) + self.event.set() - def __init__(self): - super(_AsCompletedWaiter, self).__init__() - self.lock = threading.Lock() + def add_exception(self, future): + with self.lock: + super(_AsCompletedWaiter, self).add_exception(future) + self.event.set() - def add_result(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_result(future) - self.event.set() + def add_cancelled(self, future): + with self.lock: + super(_AsCompletedWaiter, self).add_cancelled(future) + self.event.set() - def add_exception(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_exception(future) - self.event.set() + class _FirstCompletedWaiter(_Waiter): + """Used by wait(return_when=FIRST_COMPLETED).""" - def add_cancelled(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_cancelled(future) + def add_result(self, future): + super(_FirstCompletedWaiter, self).add_result(future) self.event.set() + def add_exception(self, future): + super(_FirstCompletedWaiter, self).add_exception(future) + self.event.set() -class _FirstCompletedWaiter(_Waiter): - """Used by wait(return_when=FIRST_COMPLETED).""" - - def add_result(self, future): - super(_FirstCompletedWaiter, self).add_result(future) - self.event.set() - - def add_exception(self, future): - super(_FirstCompletedWaiter, self).add_exception(future) - self.event.set() + def add_cancelled(self, future): + super(_FirstCompletedWaiter, self).add_cancelled(future) + self.event.set() - def add_cancelled(self, future): - 
super(_FirstCompletedWaiter, self).add_cancelled(future) - self.event.set() + class _AllCompletedWaiter(_Waiter): + """Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED).""" + def __init__(self, num_pending_calls, stop_on_exception): + self.num_pending_calls = num_pending_calls + self.stop_on_exception = stop_on_exception + self.lock = threading.Lock() + super(_AllCompletedWaiter, self).__init__() -class _AllCompletedWaiter(_Waiter): - """Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED).""" + def _decrement_pending_calls(self): + with self.lock: + self.num_pending_calls -= 1 + if not self.num_pending_calls: + self.event.set() - def __init__(self, num_pending_calls, stop_on_exception): - self.num_pending_calls = num_pending_calls - self.stop_on_exception = stop_on_exception - self.lock = threading.Lock() - super(_AllCompletedWaiter, self).__init__() + def add_result(self, future): + super(_AllCompletedWaiter, self).add_result(future) + self._decrement_pending_calls() - def _decrement_pending_calls(self): - with self.lock: - self.num_pending_calls -= 1 - if not self.num_pending_calls: + def add_exception(self, future): + super(_AllCompletedWaiter, self).add_exception(future) + if self.stop_on_exception: self.event.set() + else: + self._decrement_pending_calls() - def add_result(self, future): - super(_AllCompletedWaiter, self).add_result(future) - self._decrement_pending_calls() - - def add_exception(self, future): - super(_AllCompletedWaiter, self).add_exception(future) - if self.stop_on_exception: - self.event.set() - else: + def add_cancelled(self, future): + super(_AllCompletedWaiter, self).add_cancelled(future) self._decrement_pending_calls() - def add_cancelled(self, future): - super(_AllCompletedWaiter, self).add_cancelled(future) - self._decrement_pending_calls() - - -class _AcquireFutures(object): - """A context manager that does an ordered acquire of Future conditions.""" - - def __init__(self, futures): - self.futures = sorted(futures, key=id) - - def __enter__(self): - for future in self.futures: - future._condition.acquire() + class _AcquireFutures(object): + """A context manager that does an ordered acquire of Future conditions. + """ - def __exit__(self, *args): - for future in self.futures: - future._condition.release() + def __init__(self, futures): + self.futures = sorted(futures, key=id) + def __enter__(self): + for future in self.futures: + future._condition.acquire() -def _create_and_install_waiters(fs, return_when): - if return_when == _AS_COMPLETED: - waiter = _AsCompletedWaiter() - elif return_when == FIRST_COMPLETED: - waiter = _FirstCompletedWaiter() - else: - pending_count = sum( - f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] for f in fs) + def __exit__(self, *args): + for future in self.futures: + future._condition.release() - if return_when == FIRST_EXCEPTION: - waiter = _AllCompletedWaiter(pending_count, stop_on_exception=True) - elif return_when == ALL_COMPLETED: - waiter = _AllCompletedWaiter(pending_count, - stop_on_exception=False) + def _create_and_install_waiters(fs, return_when): + if return_when == _AS_COMPLETED: + waiter = _AsCompletedWaiter() + elif return_when == FIRST_COMPLETED: + waiter = _FirstCompletedWaiter() else: - raise ValueError("Invalid return condition: %r" % return_when) - - for f in fs: - f._waiters.append(waiter) - - return waiter - - -def as_completed(fs, timeout=None): - """An iterator over the given futures that yields each as it completes. 
- - Args: - fs: The sequence of Futures (possibly created by different Executors) - to iterate over. - timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - - Returns: - An iterator that yields the given Futures as they complete (finished or - cancelled). If any given Futures are duplicated, they will be returned - once. - - Raises: - TimeoutError: If the entire result iterator could not be generated - before the given timeout. - """ - if timeout is not None: - end_time = timeout + time.time() - - fs = set(fs) - with _AcquireFutures(fs): - finished = set( - f for f in fs - if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) - pending = fs - finished - waiter = _create_and_install_waiters(fs, _AS_COMPLETED) - - try: - for future in finished: - yield future - - while pending: - if timeout is None: - wait_timeout = None + pending_count = sum( + f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] + for f in fs) + + if return_when == FIRST_EXCEPTION: + waiter = _AllCompletedWaiter(pending_count, + stop_on_exception=True) + elif return_when == ALL_COMPLETED: + waiter = _AllCompletedWaiter(pending_count, + stop_on_exception=False) else: - wait_timeout = end_time - time.time() - if wait_timeout < 0: - raise TimeoutError('%d (of %d) futures unfinished' % ( - len(pending), len(fs))) - - waiter.event.wait(wait_timeout) - - with waiter.lock: - finished = waiter.finished_futures - waiter.finished_futures = [] - waiter.event.clear() + raise ValueError("Invalid return condition: %r" % return_when) - for future in finished: - yield future - pending.remove(future) - - finally: for f in fs: - with f._condition: - f._waiters.remove(waiter) - - -DoneAndNotDoneFutures = collections.namedtuple( - 'DoneAndNotDoneFutures', 'done not_done') - - -def wait(fs, timeout=None, return_when=ALL_COMPLETED): - """Wait for the futures in the given sequence to complete. - - Args: - fs: The sequence of Futures (possibly created by different Executors) - to wait upon. - timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - return_when: Indicates when this function should return. The options - are: - - FIRST_COMPLETED - Return when any future finishes or is - cancelled. - FIRST_EXCEPTION - Return when any future finishes by raising an - exception. If no future raises an exception - then it is equivalent to ALL_COMPLETED. - ALL_COMPLETED - Return when all futures finish or are cancelled. - - Returns: - A named 2-tuple of sets. The first set, named 'done', contains the - futures that completed (is finished or cancelled) before the wait - completed. The second set, named 'not_done', contains uncompleted - futures. 
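For illustration, a minimal sketch of this API through the loky package
vendored here (a hypothetical script; the names are those exported by the
loky `__init__.py` shown above):

    from sklearn.externals.joblib.externals.loky import (
        FIRST_COMPLETED, get_reusable_executor, wait)

    executor = get_reusable_executor(max_workers=2)
    futures = [executor.submit(pow, 2, i) for i in range(4)]
    done, not_done = wait(futures, return_when=FIRST_COMPLETED)
    # 'done' holds at least one finished future; fetch values via f.result()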
- """ - with _AcquireFutures(fs): - done = set(f for f in fs - if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) - not_done = set(fs) - done - - if (return_when == FIRST_COMPLETED) and done: - return DoneAndNotDoneFutures(done, not_done) - elif (return_when == FIRST_EXCEPTION) and done: - if any(f for f in done - if not f.cancelled() and f.exception() is not None): - return DoneAndNotDoneFutures(done, not_done) - - if len(done) == len(fs): - return DoneAndNotDoneFutures(done, not_done) - - waiter = _create_and_install_waiters(fs, return_when) - - waiter.event.wait(timeout) - for f in fs: - with f._condition: - f._waiters.remove(waiter) - - done.update(waiter.finished_futures) - return DoneAndNotDoneFutures(done, set(fs) - done) - - -class Future(object): - """Represents the result of an asynchronous computation.""" - - def __init__(self): - """Initializes the future. Should not be called by clients.""" - self._condition = threading.Condition() - self._state = PENDING - self._result = None - self._exception = None - self._waiters = [] - self._done_callbacks = [] - - def _invoke_callbacks(self): - for callback in self._done_callbacks: - try: - callback(self) - except BaseException: - LOGGER.exception('exception calling callback for %r', self) - - def __repr__(self): - with self._condition: - if self._state == FINISHED: - if self._exception: - return '<%s at %#x state=%s raised %s>' % ( - self.__class__.__name__, - id(self), - _STATE_TO_DESCRIPTION_MAP[self._state], - self._exception.__class__.__name__) - else: - return '<%s at %#x state=%s returned %s>' % ( - self.__class__.__name__, - id(self), - _STATE_TO_DESCRIPTION_MAP[self._state], - self._result.__class__.__name__) - return '<%s at %#x state=%s>' % ( - self.__class__.__name__, - id(self), - _STATE_TO_DESCRIPTION_MAP[self._state]) - - def cancel(self): - """Cancel the future if possible. - - Returns True if the future was cancelled, False otherwise. A future - cannot be cancelled if it is running or has already completed. - """ - with self._condition: - if self._state in [RUNNING, FINISHED]: - return False - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - return True - - self._state = CANCELLED - self._condition.notify_all() - - self._invoke_callbacks() - return True - - def cancelled(self): - """Return True if the future was cancelled.""" - with self._condition: - return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED] - - def running(self): - """Return True if the future is currently executing.""" - with self._condition: - return self._state == RUNNING - - def done(self): - """Return True of the future was cancelled or finished executing.""" - with self._condition: - return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED] - - def __get_result(self): - if self._exception: - raise self._exception - else: - return self._result - - def add_done_callback(self, fn): - """Attaches a callable that will be called when the future finishes. - - Args: - fn: A callable that will be called with this future as its only - argument when the future completes or is cancelled. The - callable will always be called by a thread in the same process - in which it was added. If the future has already completed or - been cancelled then the callable will be called immediately. - These callables are called in the order that they were added. 
- """ - with self._condition: - if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, - FINISHED]: - self._done_callbacks.append(fn) - return - fn(self) - - def result(self, timeout=None): - """Return the result of the call that the future represents. - - Args: - timeout: The number of seconds to wait for the result if the future - isn't done. If None, then there is no limit on the wait time. + f._waiters.append(waiter) - Returns: - The result of the call that the future represents. - - Raises: - CancelledError: If the future was cancelled. - TimeoutError: If the future didn't finish executing before the - given timeout. - Exception: If the call raised then that exception will be raised. - """ - with self._condition: - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self.__get_result() - - self._condition.wait(timeout) - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self.__get_result() - else: - raise TimeoutError() + return waiter - def exception(self, timeout=None): - """Return the exception raised by the call that the future represents. + def as_completed(fs, timeout=None): + """An iterator over the given futures that yields each as it completes. Args: - timeout: The number of seconds to wait for the exception if the - future isn't done. If None, then there is no limit on the wait - time. + fs: The sequence of Futures (possibly created by different + Executors) to iterate over. + timeout: The maximum number of seconds to wait. If None, then there + is no limit on the wait time. Returns: - The exception raised by the call that the future represents or None - if the call completed without raising. + An iterator that yields the given Futures as they complete + (finished or cancelled). If any given Futures are duplicated, they + will be returned once. Raises: - CancelledError: If the future was cancelled. - TimeoutError: If the future didn't finish executing before the - given timeout. + TimeoutError: If the entire result iterator could not be generated + before the given timeout. """ + if timeout is not None: + end_time = timeout + time.time() - with self._condition: - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self._exception - - self._condition.wait(timeout) - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self._exception - else: - raise TimeoutError() - - # The following methods should only be used by Executors and in tests. - def set_running_or_notify_cancel(self): - """Mark the future as running or process any cancel notifications. - - Should only be used by Executor implementations and unit tests. - - If the future has been cancelled (cancel() was called and returned - True) then any threads waiting on the future completing (though calls - to as_completed() or wait()) are notified and False is returned. - - If the future was not cancelled then it is put in the running state - (future calls to running() will return True) and True is returned. - - This method should be called by Executor implementations before - executing the work associated with this future. If this method returns - False then the work should not be executed. 
+ fs = set(fs) + with _AcquireFutures(fs): + finished = set( + f for f in fs + if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) + pending = fs - finished + waiter = _create_and_install_waiters(fs, _AS_COMPLETED) - Returns: - False if the Future was cancelled, True otherwise. + try: + for future in finished: + yield future - Raises: - RuntimeError: if this method was already called or if set_result() - or set_exception() was called. - """ - with self._condition: - if self._state == CANCELLED: - self._state = CANCELLED_AND_NOTIFIED - for waiter in self._waiters: - waiter.add_cancelled(self) - # self._condition.notify_all() is not necessary because - # self.cancel() triggers a notification. - return False - elif self._state == PENDING: - self._state = RUNNING - return True - else: - LOGGER.critical('Future %s in unexpected state: %s', - id(self), - self._state) - raise RuntimeError('Future in unexpected state') + while pending: + if timeout is None: + wait_timeout = None + else: + wait_timeout = end_time - time.time() + if wait_timeout < 0: + raise TimeoutError('%d (of %d) futures unfinished' % ( + len(pending), len(fs))) - def set_result(self, result): - """Sets the return value of work associated with the future. + waiter.event.wait(wait_timeout) - Should only be used by Executor implementations and unit tests. - """ - with self._condition: - self._result = result - self._state = FINISHED - for waiter in self._waiters: - waiter.add_result(self) - self._condition.notify_all() - self._invoke_callbacks() - - def set_exception(self, exception): - """Sets the result of the future as being the given exception. - - Should only be used by Executor implementations and unit tests. - """ - with self._condition: - self._exception = exception - self._state = FINISHED - for waiter in self._waiters: - waiter.add_exception(self) - self._condition.notify_all() - self._invoke_callbacks() + with waiter.lock: + finished = waiter.finished_futures + waiter.finished_futures = [] + waiter.event.clear() + for future in finished: + yield future + pending.remove(future) -class Executor(object): - """This is an abstract base class for concrete asynchronous executors.""" + finally: + for f in fs: + with f._condition: + f._waiters.remove(waiter) - def submit(self, fn, *args, **kwargs): - """Submits a callable to be executed with the given arguments. + DoneAndNotDoneFutures = collections.namedtuple( + 'DoneAndNotDoneFutures', 'done not_done') - Schedules the callable to be executed as fn(*args, **kwargs) and - returns a Future instance representing the execution of the callable. - - Returns: - A Future representing the given call. - """ - raise NotImplementedError() - - def map(self, fn, *iterables, **kwargs): - """Returns an iterator equivalent to map(fn, iter). + def wait(fs, timeout=None, return_when=ALL_COMPLETED): + """Wait for the futures in the given sequence to complete. Args: - fn: A callable that will take as many arguments as there are - passed iterables. + fs: The sequence of Futures (possibly created by different + Executors) to wait upon. timeout: The maximum number of seconds to wait. If None, then there is no limit on the wait time. - chunksize: The size of the chunks the iterable will be broken into - before being passed to a child process. This argument is only - used by ProcessPoolExecutor; it is ignored by - ThreadPoolExecutor. + return_when: Indicates when this function should return. 
The + options are: - Returns: - An iterator equivalent to: map(func, *iterables) but the calls may - be evaluated out-of-order. + FIRST_COMPLETED - Return when any future finishes or is + cancelled. + FIRST_EXCEPTION - Return when any future finishes by raising an + exception. If no future raises an exception + then it is equivalent to ALL_COMPLETED. + ALL_COMPLETED - Return when all futures finish or are + cancelled. - Raises: - TimeoutError: If the entire result iterator could not be generated - before the given timeout. - Exception: If fn(*args) raises for any values. + Returns: + A named 2-tuple of sets. The first set, named 'done', contains the + futures that completed (is finished or cancelled) before the wait + completed. The second set, named 'not_done', contains uncompleted + futures. """ - timeout = kwargs.get('timeout') - if timeout is not None: - end_time = timeout + time.time() + with _AcquireFutures(fs): + done = set(f for f in fs + if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) + not_done = set(fs) - done - fs = [self.submit(fn, *args) for args in zip(*iterables)] + if (return_when == FIRST_COMPLETED) and done: + return DoneAndNotDoneFutures(done, not_done) + elif (return_when == FIRST_EXCEPTION) and done: + if any(f for f in done + if not f.cancelled() and f.exception() is not None): + return DoneAndNotDoneFutures(done, not_done) - # Yield must be hidden in closure so that the futures are submitted - # before the first iterator value is required. - def result_iterator(): - try: - for future in fs: - if timeout is None: - yield future.result() - else: - yield future.result(end_time - time.time()) - finally: - for future in fs: - future.cancel() - return result_iterator() + if len(done) == len(fs): + return DoneAndNotDoneFutures(done, not_done) - def shutdown(self, wait=True): - """Clean-up the resources associated with the Executor. + waiter = _create_and_install_waiters(fs, return_when) - It is safe to call this method several times. Otherwise, no other - methods can be called after this one. + waiter.event.wait(timeout) + for f in fs: + with f._condition: + f._waiters.remove(waiter) - Args: - wait: If True then shutdown will not return until all running - futures have finished executing and the resources used by the - executor have been reclaimed. - """ - pass + done.update(waiter.finished_futures) + return DoneAndNotDoneFutures(done, set(fs) - done) + + class _BaseFuture(object): + """Represents the result of an asynchronous computation.""" + + def __init__(self): + """Initializes the future. Should not be called by clients.""" + self._condition = threading.Condition() + self._state = PENDING + self._result = None + self._exception = None + self._waiters = [] + self._done_callbacks = [] + + def __repr__(self): + with self._condition: + if self._state == FINISHED: + if self._exception: + return '<%s at %#x state=%s raised %s>' % ( + self.__class__.__name__, + id(self), + _STATE_TO_DESCRIPTION_MAP[self._state], + self._exception.__class__.__name__) + else: + return '<%s at %#x state=%s returned %s>' % ( + self.__class__.__name__, + id(self), + _STATE_TO_DESCRIPTION_MAP[self._state], + self._result.__class__.__name__) + return '<%s at %#x state=%s>' % ( + self.__class__.__name__, + id(self), + _STATE_TO_DESCRIPTION_MAP[self._state]) + + def cancel(self): + """Cancel the future if possible. + + Returns True if the future was cancelled, False otherwise. A future + cannot be cancelled if it is running or has already completed. 
+ """ + with self._condition: + if self._state in [RUNNING, FINISHED]: + return False + + if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: + return True + + self._state = CANCELLED + self._condition.notify_all() + + self._invoke_callbacks() + return True + + def cancelled(self): + """Return True if the future was cancelled.""" + with self._condition: + return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED] + + def running(self): + """Return True if the future is currently executing.""" + with self._condition: + return self._state == RUNNING + + def done(self): + """Return True of the future was cancelled or finished executing. + """ + with self._condition: + return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, + FINISHED] + + def __get_result(self): + if self._exception: + raise self._exception + else: + return self._result + + def add_done_callback(self, fn): + """Attaches a callable that will be called when the future finishes. + + Args: + fn: A callable that will be called with this future as its only + argument when the future completes or is cancelled. The + callable will always be called by a thread in the same + process in which it was added. If the future has already + completed or been cancelled then the callable will be + called immediately. These callables are called in the order + that they were added. + """ + with self._condition: + if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, + FINISHED]: + self._done_callbacks.append(fn) + return + fn(self) + + def result(self, timeout=None): + """Return the result of the call that the future represents. + + Args: + timeout: The number of seconds to wait for the result if the + future isn't done. If None, then there is no limit on the + wait time. + + Returns: + The result of the call that the future represents. + + Raises: + CancelledError: If the future was cancelled. + TimeoutError: If the future didn't finish executing before the + given timeout. + Exception: If the call raised then that exception will be + raised. + """ + with self._condition: + if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: + raise CancelledError() + elif self._state == FINISHED: + return self.__get_result() + + self._condition.wait(timeout) + + if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: + raise CancelledError() + elif self._state == FINISHED: + return self.__get_result() + else: + raise TimeoutError() + + def exception(self, timeout=None): + """Return the exception raised by the call that the future + represents. + + Args: + timeout: The number of seconds to wait for the exception if the + future isn't done. If None, then there is no limit on the + wait time. + + Returns: + The exception raised by the call that the future represents or + None if the call completed without raising. + + Raises: + CancelledError: If the future was cancelled. + TimeoutError: If the future didn't finish executing before the + given timeout. + """ + + with self._condition: + if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: + raise CancelledError() + elif self._state == FINISHED: + return self._exception + + self._condition.wait(timeout) + + if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: + raise CancelledError() + elif self._state == FINISHED: + return self._exception + else: + raise TimeoutError() + + # The following methods should only be used by Executors and in tests. + def set_running_or_notify_cancel(self): + """Mark the future as running or process any cancel notifications. 
+ + Should only be used by Executor implementations and unit tests. + + If the future has been cancelled (cancel() was called and returned + True) then any threads waiting on the future completing (though + calls to as_completed() or wait()) are notified and False is + returned. + + If the future was not cancelled then it is put in the running state + (future calls to running() will return True) and True is returned. + + This method should be called by Executor implementations before + executing the work associated with this future. If this method + returns False then the work should not be executed. + + Returns: + False if the Future was cancelled, True otherwise. + + Raises: + RuntimeError: if this method was already called or if + set_result() or set_exception() was called. + """ + with self._condition: + if self._state == CANCELLED: + self._state = CANCELLED_AND_NOTIFIED + for waiter in self._waiters: + waiter.add_cancelled(self) + # self._condition.notify_all() is not necessary because + # self.cancel() triggers a notification. + return False + elif self._state == PENDING: + self._state = RUNNING + return True + else: + LOGGER.critical('Future %s in unexpected state: %s', + id(self), + self._state) + raise RuntimeError('Future in unexpected state') + + def set_result(self, result): + """Sets the return value of work associated with the future. + + Should only be used by Executor implementations and unit tests. + """ + with self._condition: + self._result = result + self._state = FINISHED + for waiter in self._waiters: + waiter.add_result(self) + self._condition.notify_all() + self._invoke_callbacks() + + def set_exception(self, exception): + """Sets the result of the future as being the given exception. + + Should only be used by Executor implementations and unit tests. + """ + with self._condition: + self._exception = exception + self._state = FINISHED + for waiter in self._waiters: + waiter.add_exception(self) + self._condition.notify_all() + self._invoke_callbacks() - def __enter__(self): - return self + class Executor(object): + """This is an abstract base class for concrete asynchronous executors. + """ - def __exit__(self, exc_type, exc_val, exc_tb): - self.shutdown(wait=True) - return False + def submit(self, fn, *args, **kwargs): + """Submits a callable to be executed with the given arguments. + + Schedules the callable to be executed as fn(*args, **kwargs) and + returns a Future instance representing the execution of the + callable. + + Returns: + A Future representing the given call. + """ + raise NotImplementedError() + + def map(self, fn, *iterables, **kwargs): + """Returns an iterator equivalent to map(fn, iter). + + Args: + fn: A callable that will take as many arguments as there are + passed iterables. + timeout: The maximum number of seconds to wait. If None, then + there is no limit on the wait time. + chunksize: The size of the chunks the iterable will be broken + into before being passed to a child process. This argument + is only used by ProcessPoolExecutor; it is ignored by + ThreadPoolExecutor. + + Returns: + An iterator equivalent to: map(func, *iterables) but the calls + may be evaluated out-of-order. + + Raises: + TimeoutError: If the entire result iterator could not be + generated before the given timeout. + Exception: If fn(*args) raises for any values. 
+ """ + timeout = kwargs.get('timeout') + if timeout is not None: + end_time = timeout + time.time() + + fs = [self.submit(fn, *args) for args in zip(*iterables)] + + # Yield must be hidden in closure so that the futures are submitted + # before the first iterator value is required. + def result_iterator(): + try: + for future in fs: + if timeout is None: + yield future.result() + else: + yield future.result(end_time - time.time()) + finally: + for future in fs: + future.cancel() + return result_iterator() + + def shutdown(self, wait=True): + """Clean-up the resources associated with the Executor. + + It is safe to call this method several times. Otherwise, no other + methods can be called after this one. + + Args: + wait: If True then shutdown will not return until all running + futures have finished executing and the resources used by + the executor have been reclaimed. + """ + pass + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.shutdown(wait=True) + return False + + +# To make loky._base.Future instances awaitable by concurrent.futures.wait, +# derive our custom Future class from _BaseFuture. _invoke_callback is the only +# modification made to this class in loky. +class Future(_BaseFuture): + def _invoke_callbacks(self): + for callback in self._done_callbacks: + try: + callback(self) + except BaseException: + LOGGER.exception('exception calling callback for %r', self) diff --git a/sklearn/externals/joblib/externals/loky/backend/compat.py b/sklearn/externals/joblib/externals/loky/backend/compat.py index 6366b23d9f380..729c77c7d9bca 100644 --- a/sklearn/externals/joblib/externals/loky/backend/compat.py +++ b/sklearn/externals/joblib/externals/loky/backend/compat.py @@ -9,10 +9,10 @@ if sys.version_info[:2] >= (3, 3): import queue - from _pickle import PicklingError else: import Queue as queue - from pickle import PicklingError + +from pickle import PicklingError if sys.version_info >= (3, 4): from multiprocessing.process import BaseProcess diff --git a/sklearn/externals/joblib/externals/loky/backend/context.py b/sklearn/externals/joblib/externals/loky/backend/context.py index b38787efb5d0d..0f744c5918b5c 100644 --- a/sklearn/externals/joblib/externals/loky/backend/context.py +++ b/sklearn/externals/joblib/externals/loky/backend/context.py @@ -106,12 +106,13 @@ def cpu_count(): The returned number of CPUs accounts for: * the number of CPUs in the system, as given by - ``multiprocessing.cpu_count`` + ``multiprocessing.cpu_count``; * the CPU affinity settings of the current process - (available with Python 3.4+ on some Unix systems) - * CFS scheduler CPU bandwidth limit - (available on Linux only) - and is given as the minimum of these three constraints. + (available with Python 3.4+ on some Unix systems); + * CFS scheduler CPU bandwidth limit (available on Linux only, typically + set by docker and similar container orchestration systems); + * the value of the LOKY_MAX_CPU_COUNT environment variable if defined. + and is given as the minimum of these constraints. It is also always larger or equal to 1. """ import math @@ -141,10 +142,15 @@ def cpu_count(): cfs_period_us = int(fh.read()) if cfs_quota_us > 0 and cfs_period_us > 0: - cpu_count_cfs = math.ceil(cfs_quota_us / cfs_period_us) - cpu_count_cfs = max(cpu_count_cfs, 1) - - return min(cpu_count_mp, cpu_count_affinity, cpu_count_cfs) + # Make sure this quantity is an int as math.ceil returns a + # float in python2.7. 
(See issue #165)
+            cpu_count_cfs = int(math.ceil(cfs_quota_us / cfs_period_us))
+
+    # User-defined soft limit passed as a loky-specific environment variable.
+    cpu_count_loky = int(os.environ.get('LOKY_MAX_CPU_COUNT', cpu_count_mp))
+    aggregate_cpu_count = min(cpu_count_mp, cpu_count_affinity, cpu_count_cfs,
+                              cpu_count_loky)
+    return max(aggregate_cpu_count, 1)
 
 
 class LokyContext(BaseContext):
diff --git a/sklearn/externals/joblib/externals/loky/backend/reduction.py b/sklearn/externals/joblib/externals/loky/backend/reduction.py
index 20eb581cbfce7..b621a92930c92 100644
--- a/sklearn/externals/joblib/externals/loky/backend/reduction.py
+++ b/sklearn/externals/joblib/externals/loky/backend/reduction.py
@@ -181,12 +181,13 @@ def h(cls):
 register(type(_C.h), _reduce_method)
 
 
-def _reduce_method_descriptor(m):
-    return getattr, (m.__objclass__, m.__name__)
+if not hasattr(sys, "pypy_version_info"):
+    # PyPy uses functions instead of method_descriptors and wrapper_descriptors
+    def _reduce_method_descriptor(m):
+        return getattr, (m.__objclass__, m.__name__)
 
-
-register(type(list.append), _reduce_method_descriptor)
-register(type(int.__add__), _reduce_method_descriptor)
+    register(type(list.append), _reduce_method_descriptor)
+    register(type(int.__add__), _reduce_method_descriptor)
 
 
 # Make partial func pickable
diff --git a/sklearn/externals/joblib/externals/loky/process_executor.py b/sklearn/externals/joblib/externals/loky/process_executor.py
index c3072453109d9..57a7617d9ab7e 100644
--- a/sklearn/externals/joblib/externals/loky/process_executor.py
+++ b/sklearn/externals/joblib/externals/loky/process_executor.py
@@ -117,14 +117,16 @@
 MAX_DEPTH = int(os.environ.get("LOKY_MAX_DEPTH", 10))
 _CURRENT_DEPTH = 0
 
-# Minimum time interval between two consecutive memory usage checks.
-_MEMORY_CHECK_DELAY = 1.
+# Minimum time interval between two consecutive memory leak protection checks.
+_MEMORY_LEAK_CHECK_DELAY = 1.
 
 # Number of bytes of memory usage allowed over the reference process size.
_MAX_MEMORY_LEAK_SIZE = int(1e8)
+
 
 try:
     from psutil import Process
+    _USE_PSUTIL = True
 
     def _get_memory_usage(pid, force_gc=False):
         if force_gc:
@@ -133,7 +135,7 @@ def _get_memory_usage(pid, force_gc=False):
         return Process(pid).memory_info().rss
 
 except ImportError:
-    _get_memory_usage = None
+    _USE_PSUTIL = False
 
 
 class _ThreadWakeup:
@@ -383,7 +385,7 @@ def _process_worker(call_queue, result_queue, initializer, initargs,
     global _CURRENT_DEPTH
     _CURRENT_DEPTH = current_depth
     _process_reference_size = None
-    _process_last_memory_check = None
+    _last_memory_leak_check = None
     pid = os.getpid()
 
     mp.util.debug('Worker started with timeout=%s' % timeout)
@@ -422,20 +424,21 @@ def _process_worker(call_queue, result_queue, initializer, initargs,
                 result_queue.put(_ResultItem(call_item.work_id,
                                              exception=exc))
             else:
                 _sendback_result(result_queue, call_item.work_id, result=r)
+                del r
 
             # Free the resource as soon as possible, to avoid holding onto
             # open files or shared memory that is not needed anymore
             del call_item
 
-            if _get_memory_usage is not None:
+            if _USE_PSUTIL:
                 if _process_reference_size is None:
                     # Make reference measurement after the first call
                     _process_reference_size = _get_memory_usage(pid,
                                                                 force_gc=True)
-                    _process_last_memory_check = time()
+                    _last_memory_leak_check = time()
                     continue
-                if time() - _process_last_memory_check > _MEMORY_CHECK_DELAY:
+                if time() - _last_memory_leak_check > _MEMORY_LEAK_CHECK_DELAY:
                     mem_usage = _get_memory_usage(pid)
-                    _process_last_memory_check = time()
+                    _last_memory_leak_check = time()
                     if mem_usage - _process_reference_size < _MAX_MEMORY_LEAK_SIZE:
                         # Memory usage stays within bounds: everything is fine.
                         continue
@@ -444,7 +447,7 @@ def _process_worker(call_queue, result_queue, initializer, initargs,
                     # after a forced garbage collection to break any reference
                     # cycles.
                     mem_usage = _get_memory_usage(pid, force_gc=True)
-                    _process_last_memory_check = time()
+                    _last_memory_leak_check = time()
                     if mem_usage - _process_reference_size < _MAX_MEMORY_LEAK_SIZE:
                         # The GC managed to free the memory: everything is fine.
                         continue
@@ -455,6 +458,14 @@ def _process_worker(call_queue, result_queue, initializer, initargs,
                     result_queue.put(pid)
                     with worker_exit_lock:
                         return
+            else:
+                # if psutil is not installed, trigger gc.collect events
+                # regularly to limit potential memory leaks due to reference cycles
+                if ((_last_memory_leak_check is None) or
+                        (time() - _last_memory_leak_check >
+                         _MEMORY_LEAK_CHECK_DELAY)):
+                    gc.collect()
+                    _last_memory_leak_check = time()
 
 
 def _add_call_item_to_queue(pending_work_items,
diff --git a/sklearn/externals/joblib/memory.py b/sklearn/externals/joblib/memory.py
index 5ae6940f91776..e31ba2edb72eb 100644
--- a/sklearn/externals/joblib/memory.py
+++ b/sklearn/externals/joblib/memory.py
@@ -454,12 +454,16 @@ def _cached_call(self, args, kwargs, shelving=False):
         metadata: dict
             Some metadata about wrapped function call (see _persist_input()).
""" - # Compare the function code with the previous to see if the - # function code has changed func_id, args_id = self._get_output_identifiers(*args, **kwargs) metadata = None msg = None + + # Wether or not the memorized function must be called + must_call = False + # FIXME: The statements below should be try/excepted + # Compare the function code with the previous to see if the + # function code has changed if not (self._check_previous_func_code(stacklevel=4) and self.store_backend.contains_item([func_id, args_id])): if self._verbose > 10: @@ -469,16 +473,7 @@ def _cached_call(self, args, kwargs, shelving=False): .format(name, args_id, self.store_backend. get_cached_func_info([func_id])['location'])) - out, metadata = self.call(*args, **kwargs) - if self.mmap_mode is not None: - # Memmap the output at the first call to be consistent with - # later calls - if self._verbose: - msg = _format_load_msg(func_id, args_id, - timestamp=self.timestamp, - metadata=metadata) - out = self.store_backend.load_item([func_id, args_id], msg=msg, - verbose=self._verbose) + must_call = True else: try: t0 = time.time() @@ -507,8 +502,19 @@ def _cached_call(self, args, kwargs, shelving=False): self.warn('Exception while loading results for ' '{}\n {}'.format(signature, traceback.format_exc())) - out, metadata = self.call(*args, **kwargs) - args_id = None + must_call = True + + if must_call: + out, metadata = self.call(*args, **kwargs) + if self.mmap_mode is not None: + # Memmap the output at the first call to be consistent with + # later calls + if self._verbose: + msg = _format_load_msg(func_id, args_id, + timestamp=self.timestamp, + metadata=metadata) + out = self.store_backend.load_item([func_id, args_id], msg=msg, + verbose=self._verbose) return (out, args_id, metadata) From 242410f7cd93e7af8786dd57d6537376b6c0b36d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 5 Sep 2018 17:33:06 +0200 Subject: [PATCH 019/163] MAINT make pytest collection ignore folders with Python scripts (#12011) Some IDEs such as VS Code use the pytest command to collect all the tests of the workspace in the background. This can cause unexpected execution of arbitrary Python scripts in the workspace (examples, benchmarks...). The doc folder is also ignored because it has python scripts for sphinx along with copies of the examples. To safely run pytest in the doc folder, we need to used the find command to find all "*.rst" files as done in the project Makefile. 
--- setup.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.cfg b/setup.cfg index 09c5c9829ae21..93aca4a44f9e1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,6 +5,10 @@ test = pytest # disable-pytest-warnings should be removed once we rewrite tests # using yield with parametrize addopts = + --ignore build_tools + --ignore benchmarks + --ignore doc + --ignore examples --doctest-modules --disable-pytest-warnings -rs From a9c6ad9baf878015653569109091828ceaf2db8e Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Wed, 5 Sep 2018 17:55:07 +0200 Subject: [PATCH 020/163] [MRG+1] break the tie in Meanshift in case cluster intensities are the same (#11901) --- doc/whats_new/v0.20.rst | 6 ++++++ sklearn/cluster/mean_shift_.py | 8 +++++--- sklearn/cluster/tests/test_mean_shift.py | 12 ++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 2ed336b782174..46b262896145c 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -63,6 +63,7 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. +- :class:`cluster.MeanShift` (bug fix) - :class:`decomposition.IncrementalPCA` in Python 2 (bug fix) - :class:`decomposition.SparsePCA` (bug fix) - :class:`ensemble.GradientBoostingClassifier` (bug fix affecting feature importances) @@ -151,6 +152,11 @@ Support for Python 3.3 has been officially dropped. ``n_iter_`` attribute in the docstring of :class:`cluster.KMeans`. :issue:`11353` by :user:`Jeremie du Boisberranger `. +- |Fix| Fixed a bug in :func:`cluster.mean_shift` where the assigned labels + were not deterministic if there were multiple clusters with the same + intensities. + :issue:`11901` by :user:`Adrin Jalali `. + - |API| Deprecate ``pooling_func`` unused parameter in :class:`cluster.AgglomerativeClustering`. :issue:`9875` by :user:`Kumar Ashutosh `. diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py index 487545ac039d3..800c85c365988 100644 --- a/sklearn/cluster/mean_shift_.py +++ b/sklearn/cluster/mean_shift_.py @@ -215,8 +215,10 @@ def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False, # If the distance between two kernels is less than the bandwidth, # then we have to remove one because it is a duplicate. Remove the # one with fewer points. + sorted_by_intensity = sorted(center_intensity_dict.items(), - key=lambda tup: tup[1], reverse=True) + key=lambda tup: (tup[1], tup[0]), + reverse=True) sorted_centers = np.array([tup[0] for tup in sorted_by_intensity]) unique = np.ones(len(sorted_centers), dtype=np.bool) nbrs = NearestNeighbors(radius=bandwidth, @@ -359,9 +361,9 @@ class MeanShift(BaseEstimator, ClusterMixin): ... 
[4, 7], [3, 5], [3, 6]]) >>> clustering = MeanShift(bandwidth=2).fit(X) >>> clustering.labels_ - array([0, 0, 0, 1, 1, 1]) + array([1, 1, 1, 0, 0, 0]) >>> clustering.predict([[0, 0], [5, 5]]) - array([0, 1]) + array([1, 0]) >>> clustering # doctest: +NORMALIZE_WHITESPACE MeanShift(bandwidth=2, bin_seeding=False, cluster_all=True, min_bin_freq=1, n_jobs=None, seeds=None) diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py index 1d6940a947dc2..441f822cdbded 100644 --- a/sklearn/cluster/tests/test_mean_shift.py +++ b/sklearn/cluster/tests/test_mean_shift.py @@ -101,6 +101,18 @@ def test_unfitted(): assert_false(hasattr(ms, "labels_")) +def test_cluster_intensity_tie(): + X = np.array([[1, 1], [2, 1], [1, 0], + [4, 7], [3, 5], [3, 6]]) + c1 = MeanShift(bandwidth=2).fit(X) + + X = np.array([[4, 7], [3, 5], [3, 6], + [1, 1], [2, 1], [1, 0]]) + c2 = MeanShift(bandwidth=2).fit(X) + assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0]) + assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1]) + + def test_bin_seeds(): # Test the bin seeding technique which can be used in the mean shift # algorithm From e726f7a3e6a89f898de2e22880aa653fe43949c4 Mon Sep 17 00:00:00 2001 From: Gabriele Calvo Date: Wed, 5 Sep 2018 21:50:20 +0100 Subject: [PATCH 021/163] DOC fix minor spacing issue in the iris dataset description (#12019) --- sklearn/datasets/descr/iris.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/descr/iris.rst b/sklearn/datasets/descr/iris.rst index a35edc728c7d9..e05206454d218 100644 --- a/sklearn/datasets/descr/iris.rst +++ b/sklearn/datasets/descr/iris.rst @@ -25,7 +25,7 @@ Iris plants dataset sepal length: 4.3 7.9 5.84 0.83 0.7826 sepal width: 2.0 4.4 3.05 0.43 -0.4194 petal length: 1.0 6.9 3.76 1.76 0.9490 (high!) - petal width: 0.1 2.5 1.20 0.76 0.9565 (high!) + petal width: 0.1 2.5 1.20 0.76 0.9565 (high!) ============== ==== ==== ======= ===== ==================== :Missing Attribute Values: None From 1fafc5c56d496728ec276e99382efa8e84034b13 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 6 Sep 2018 17:29:37 +1000 Subject: [PATCH 022/163] TST use urlopen monkeypatch for test_decode_* (#12020) Avoid requiring internet for test suite. Examples will still run with internet (as long as cache is occasionally cleared). 
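The underlying pattern is roughly the following minimal sketch; the canned
payload and the patched attribute name are illustrative assumptions here,
while the real helper in the test module below also serves gzipped
responses and several endpoint types::

    import io

    def _mock_urlopen(request, *args, **kwargs):
        # Return a canned file-like payload instead of opening a network
        # connection to openml.org.
        return io.BytesIO(b'{"mock": "openml response"}')

    def test_decode_offline(monkeypatch):
        import sklearn.datasets.openml
        # Redirect the urlopen name looked up inside the openml module.
        monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                            _mock_urlopen)
        # fetch_openml(...) can now run without internet access.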
--- sklearn/datasets/tests/test_openml.py | 71 ++++++++++----------------- 1 file changed, 27 insertions(+), 44 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 3f5716cb96784..cf9cfcdc81ede 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -24,7 +24,6 @@ currdir = os.path.dirname(os.path.abspath(__file__)) # if True, urlopen will be monkey patched to only use local files test_offline = True -test_gzip = True def _test_features_list(data_id): @@ -138,18 +137,14 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, def _monkey_patch_webbased_functions(context, data_id, - gziped_files, gzip_response): url_prefix_data_description = "https://openml.org/api/v1/json/data/" url_prefix_data_features = "https://openml.org/api/v1/json/data/features/" url_prefix_download_data = "https://openml.org/data/v1/" url_prefix_data_list = "https://openml.org/api/v1/json/data/list/" - path_suffix = '' - read_fn = open - if gziped_files: - path_suffix = '.gz' - read_fn = gzip.open + path_suffix = '.gz' + read_fn = gzip.open class MockHTTPResponse(object): def __init__(self, data, is_gzip): @@ -264,8 +259,7 @@ def test_fetch_openml_iris(monkeypatch, gzip_response): expected_features = 4 expected_missing = 0 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) assert_warns_message( UserWarning, "Multiple active versions of the dataset matching the name" @@ -285,8 +279,9 @@ def test_fetch_openml_iris(monkeypatch, gzip_response): ) -def test_decode_iris(): +def test_decode_iris(monkeypatch): data_id = 61 + _monkey_patch_webbased_functions(monkeypatch, data_id, False) _test_features_list(data_id) @@ -301,8 +296,7 @@ def test_fetch_openml_iris_multitarget(monkeypatch, gzip_response): expected_features = 3 expected_missing = 0 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, @@ -321,8 +315,7 @@ def test_fetch_openml_anneal(monkeypatch, gzip_response): expected_observations = 11 expected_features = 38 expected_missing = 267 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, @@ -330,8 +323,9 @@ def test_fetch_openml_anneal(monkeypatch, gzip_response): compare_default_target=True) -def test_decode_anneal(): +def test_decode_anneal(monkeypatch): data_id = 2 + _monkey_patch_webbased_functions(monkeypatch, data_id, False) _test_features_list(data_id) @@ -346,8 +340,7 @@ def test_fetch_openml_anneal_multitarget(monkeypatch, gzip_response): expected_observations = 11 expected_features = 36 expected_missing = 267 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, @@ -365,8 +358,7 @@ def test_fetch_openml_cpu(monkeypatch, gzip_response): expected_observations = 209 expected_features = 7 
expected_missing = 0 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, @@ -374,8 +366,9 @@ def test_fetch_openml_cpu(monkeypatch, gzip_response): compare_default_target=True) -def test_decode_cpu(): +def test_decode_cpu(monkeypatch): data_id = 561 + _monkey_patch_webbased_functions(monkeypatch, data_id, False) _test_features_list(data_id) @@ -393,8 +386,7 @@ def test_fetch_openml_australian(monkeypatch, gzip_response): expected_observations = 85 expected_features = 14 expected_missing = 0 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) assert_warns_message( UserWarning, "Version 1 of dataset Australian is inactive,", @@ -426,8 +418,7 @@ def test_fetch_openml_miceprotein(monkeypatch, gzip_response): expected_observations = 7 expected_features = 77 expected_missing = 7 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, @@ -446,8 +437,7 @@ def test_fetch_openml_emotions(monkeypatch, gzip_response): expected_observations = 13 expected_features = 72 expected_missing = 0 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, @@ -456,8 +446,9 @@ def test_fetch_openml_emotions(monkeypatch, gzip_response): compare_default_target=True) -def test_decode_emotions(): +def test_decode_emotions(monkeypatch): data_id = 40589 + _monkey_patch_webbased_functions(monkeypatch, data_id, False) _test_features_list(data_id) @@ -466,7 +457,7 @@ def test_open_openml_url_cache(monkeypatch, gzip_response): data_id = 61 _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + monkeypatch, data_id, gzip_response) openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id) test_directory = os.path.join(os.path.expanduser('~'), 'scikit_learn_data') # first fill the cache @@ -486,8 +477,7 @@ def test_fetch_openml_notarget(monkeypatch, gzip_response): expected_observations = 150 expected_features = 5 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) data = fetch_openml(data_id=data_id, target_column=target_column, cache=False) assert data.data.shape == (expected_observations, expected_features) @@ -498,8 +488,7 @@ def test_fetch_openml_notarget(monkeypatch, gzip_response): def test_fetch_openml_inactive(monkeypatch, gzip_response): # fetch inactive dataset by id data_id = 40675 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) glas2 = assert_warns_message( UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, data_id=data_id, cache=False) @@ -515,8 +504,7 @@ def test_fetch_openml_inactive(monkeypatch, gzip_response): def test_fetch_nonexiting(monkeypatch, 
gzip_response): # there is no active version of glass2 data_id = 40675 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) assert_raise_message(ValueError, "No active dataset glass2 found", fetch_openml, name='glass2', cache=False) @@ -526,8 +514,7 @@ def test_fetch_nonexiting(monkeypatch, gzip_response): def test_raises_illegal_multitarget(monkeypatch, gzip_response): data_id = 61 targets = ['sepalwidth', 'class'] - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) assert_raise_message(ValueError, "Can only handle homogeneous multi-target datasets,", @@ -540,8 +527,7 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response): data_id = 40966 expected_row_id_msg = "target_column={} has flag is_row_identifier." expected_ignore_msg = "target_column={} has flag is_ignore." - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # single column test assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), fetch_openml, data_id=data_id, @@ -565,8 +551,7 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response): @pytest.mark.parametrize('gzip_response', [True, False]) def test_string_attribute(monkeypatch, gzip_response): data_id = 40945 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # single column test assert_raise_message(ValueError, 'STRING attributes are not yet supported', @@ -576,8 +561,7 @@ def test_string_attribute(monkeypatch, gzip_response): @pytest.mark.parametrize('gzip_response', [True, False]) def test_illegal_column(monkeypatch, gzip_response): data_id = 61 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) assert_raise_message(KeyError, "Could not find target_column=", fetch_openml, data_id=data_id, target_column='undefined', cache=False) @@ -591,8 +575,7 @@ def test_illegal_column(monkeypatch, gzip_response): @pytest.mark.parametrize('gzip_response', [True, False]) def test_fetch_openml_raises_missing_values_target(monkeypatch, gzip_response): data_id = 2 - _monkey_patch_webbased_functions( - monkeypatch, data_id, test_gzip, gzip_response) + _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) assert_raise_message(ValueError, "Target column ", fetch_openml, data_id=data_id, target_column='family') From 3a80162b422fab6d569d3462cc1f0c047cd53e04 Mon Sep 17 00:00:00 2001 From: Vivek Kumar Date: Thu, 6 Sep 2018 14:08:48 +0530 Subject: [PATCH 023/163] DOC gradient boosting fit() supports sparse X (#12022) --- sklearn/ensemble/gradient_boosting.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index ec2800ac669d5..c6e0fbee3fe51 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1358,9 +1358,10 @@ def fit(self, X, y, sample_weight=None, monitor=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) - Training vectors, where n_samples is the 
number of samples
-            and n_features is the number of features.
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csr_matrix``.
 
         y : array-like, shape (n_samples,)
             Target values (strings or integers in classification, real numbers

From b4bf033104436ca0789a2e1607d09e7d98ff0b3d Mon Sep 17 00:00:00 2001
From: William de Vazelhes <31916524+wdevazelhes@users.noreply.github.com>
Date: Thu, 6 Sep 2018 10:46:56 +0200
Subject: [PATCH 024/163] DOC: Add pytest version in documentation (#12002)

---
 README.rst                               | 2 +-
 conftest.py                              | 5 +++++
 doc/developers/advanced_installation.rst | 9 +++++----
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/README.rst b/README.rst
index eb1957686acaf..fa2ef793b9e26 100644
--- a/README.rst
+++ b/README.rst
@@ -120,7 +120,7 @@ Testing
 ~~~~~~~
 
 After installation, you can launch the test suite from outside the
-source directory (you will need to have the ``pytest`` package installed)::
+source directory (you will need to have ``pytest`` >= 3.3.0 installed)::
 
     pytest sklearn
 
diff --git a/conftest.py b/conftest.py
index 621097bfc47ab..bad99b5c99272 100644
--- a/conftest.py
+++ b/conftest.py
@@ -11,6 +11,11 @@
 import pytest
 from _pytest.doctest import DoctestItem
 
+PYTEST_MIN_VERSION = '3.3.0'
+
+if LooseVersion(pytest.__version__) < PYTEST_MIN_VERSION:
+    raise ImportError('Your version of pytest is too old, you should have '
+                      'at least pytest >= {} installed.'.format(PYTEST_MIN_VERSION))
 
 def pytest_collection_modifyitems(config, items):
 
diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst
index 720c11ed98f4c..e146363d0ac4e 100644
--- a/doc/developers/advanced_installation.rst
+++ b/doc/developers/advanced_installation.rst
@@ -50,7 +50,9 @@ Building Scikit-learn also requires
 
 Running tests requires
 
-- pytest
+.. |PytestMinVersion| replace:: 3.3.0
+
+- pytest >=\ |PytestMinVersion|
 
 Some tests also require `pandas `_.
 
@@ -276,9 +278,8 @@ Testing
 Testing scikit-learn once installed
 -----------------------------------
 
-Testing requires having the `pytest
-`_ library. Some tests also require having
-`pandas ` installed.
+Testing requires having `pytest `_ >=\ |PytestMinVersion|\ .
+Some tests also require having `pandas ` installed.
 
 After installation, the package can be tested by executing *from outside* the
 source directory::

From a056a573252adebeba98fe84ede93ef3b1e3f5a1 Mon Sep 17 00:00:00 2001
From: Umar Farouk Umar
Date: Thu, 6 Sep 2018 12:10:17 +0100
Subject: [PATCH 025/163] DOC fix for linnerud dataset (#12024)

The descriptions were the wrong way around
---
 sklearn/datasets/descr/linnerud.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/datasets/descr/linnerud.rst b/sklearn/datasets/descr/linnerud.rst
index 848ee193e1adc..5585b50a7e42b 100644
--- a/sklearn/datasets/descr/linnerud.rst
+++ b/sklearn/datasets/descr/linnerud.rst
@@ -11,12 +11,12 @@ Linnerrud dataset
 
 The Linnerrud dataset constains two small dataset:
 
-- *exercise*: A list containing the following components: exercise data with
-  20 observations on 3 exercise variables: Weight, Waist and Pulse.
+- *physiological* - CSV containing 20 observations on 3 physiological
+  variables: Weight, Waist and Pulse.
 
-- *physiological*: Data frame with 20 observations on 3 physiological variables:
+- *exercise* - CSV containing 20 observations on 3 exercise variables:
   Chins, Situps and Jumps.
 
 ..
topic:: References - * Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris: Editions Technic. \ No newline at end of file + * Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris: Editions Technic. From d3d09c383cf25f987e54c063c546b3bfeac971cb Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 6 Sep 2018 14:44:10 +0200 Subject: [PATCH 026/163] MAINT skip joblib vendor test on debian (#12027) --- sklearn/tests/test_site_joblib.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/tests/test_site_joblib.py b/sklearn/tests/test_site_joblib.py index 7ceb80a281661..bffd43cc1416f 100644 --- a/sklearn/tests/test_site_joblib.py +++ b/sklearn/tests/test_site_joblib.py @@ -1,4 +1,6 @@ import os +import pytest +from sklearn import externals from sklearn.externals import joblib as joblib_vendored from sklearn.utils import Parallel, delayed, Memory, parallel_backend @@ -9,6 +11,11 @@ def test_old_pickle(tmpdir): + vendored_joblib_home = os.path.dirname(joblib_vendored.__file__) + sklearn_externals_home = os.path.dirname(externals.__file__) + if not vendored_joblib_home.startswith(sklearn_externals_home): + pytest.skip("joblib is physically unvendored (e.g. as in debian)") + # Check that a pickle that references sklearn.external.joblib can load f = tmpdir.join('foo.pkl') f.write(b'\x80\x02csklearn.externals.joblib.numpy_pickle\nNumpyArrayWrappe' From 121dd5ab3bb03203480941ccef2df72cf9cf791d Mon Sep 17 00:00:00 2001 From: jeremiedbb <34657725+jeremiedbb@users.noreply.github.com> Date: Fri, 7 Sep 2018 15:58:21 +0200 Subject: [PATCH 027/163] MNT Fix utils.sparse import in neural_network.rbm (#12032) --- sklearn/neural_network/rbm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/neural_network/rbm.py b/sklearn/neural_network/rbm.py index ccf933ed19b56..c35e8840d23f7 100644 --- a/sklearn/neural_network/rbm.py +++ b/sklearn/neural_network/rbm.py @@ -19,7 +19,6 @@ from ..utils import check_array from ..utils import check_random_state from ..utils import gen_even_slices -from ..utils import issparse from ..utils.extmath import safe_sparse_dot from ..utils.extmath import log_logistic from ..utils.validation import check_is_fitted @@ -310,7 +309,7 @@ def score_samples(self, X): # Randomly corrupt one feature in each sample in v. 
ind = (np.arange(v.shape[0]), rng.randint(0, v.shape[1], v.shape[0])) - if issparse(v): + if sp.issparse(v): data = -2 * v[ind] + 1 v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape) else: From 79f5d147ea2c79efc78cbb8d380b64e15d7bd3ad Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 9 Sep 2018 00:43:21 +1000 Subject: [PATCH 028/163] MNT Revert the deprecation of min_samples_leaf and min_weight_fraction_leaf (#11998) --- doc/modules/ensemble.rst | 5 +- doc/modules/tree.rst | 27 ++- doc/whats_new/v0.20.rst | 13 -- .../ensemble/plot_adaboost_hastie_10_2.py | 4 +- .../ensemble/plot_gradient_boosting_oob.py | 2 +- .../plot_gradient_boosting_quantile.py | 3 +- sklearn/ensemble/forest.py | 167 ++++++++---------- sklearn/ensemble/gradient_boosting.py | 72 +++----- sklearn/ensemble/tests/test_forest.py | 20 +-- .../ensemble/tests/test_gradient_boosting.py | 34 ++-- sklearn/tree/tests/test_tree.py | 69 +++----- sklearn/tree/tree.py | 162 +++++++---------- 12 files changed, 229 insertions(+), 349 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index a41c8201a3fa1..5399f13dbc9f4 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -218,7 +218,7 @@ setting ``oob_score=True``. The size of the model with the default parameters is :math:`O( M * N * log (N) )`, where :math:`M` is the number of trees and :math:`N` is the number of samples. In order to reduce the size of the model, you can change these parameters: - ``min_samples_split``, ``max_leaf_nodes`` and ``max_depth``. + ``min_samples_split``, ``max_leaf_nodes``, ``max_depth`` and ``min_samples_leaf``. Parallelization --------------- @@ -393,7 +393,8 @@ The number of weak learners is controlled by the parameter ``n_estimators``. The the final combination. By default, weak learners are decision stumps. Different weak learners can be specified through the ``base_estimator`` parameter. The main parameters to tune to obtain good results are ``n_estimators`` and -the complexity of the base estimators (e.g., its depth ``max_depth``). +the complexity of the base estimators (e.g., its depth ``max_depth`` or +minimum required number of samples to consider a split ``min_samples_split``). .. topic:: Examples: diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 5d448f86a3f11..86f8b2f6fabdf 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -330,18 +330,31 @@ Tips on practical use for each additional level the tree grows to. Use ``max_depth`` to control the size of the tree to prevent overfitting. - * Use ``min_samples_split`` to control the number of samples at a leaf node. - A very small number will usually mean the tree will overfit, whereas a - large number will prevent the tree from learning the data. If the sample - size varies greatly, a float number can be used as percentage in this - parameter. Note that ``min_samples_split`` can create arbitrarily - small leaves. + * Use ``min_samples_split`` or ``min_samples_leaf`` to ensure that multiple + samples inform every decision in the tree, by controlling which splits will + be considered. A very small number will usually mean the tree will overfit, + whereas a large number will prevent the tree from learning the data. Try + ``min_samples_leaf=5`` as an initial value. If the sample size varies + greatly, a float number can be used as percentage in these two parameters. 
+ While ``min_samples_split`` can create arbitrarily small leaves, + ``min_samples_leaf`` guarantees that each leaf has a minimum size, avoiding + low-variance, over-fit leaf nodes in regression problems. For + classification with few classes, ``min_samples_leaf=1`` is often the best + choice. * Balance your dataset before training to prevent the tree from being biased toward the classes that are dominant. Class balancing can be done by sampling an equal number of samples from each class, or preferably by normalizing the sum of the sample weights (``sample_weight``) for each - class to the same value. + class to the same value. Also note that weight-based pre-pruning criteria, + such as ``min_weight_fraction_leaf``, will then be less biased toward + dominant classes than criteria that are not aware of the sample weights, + like ``min_samples_leaf``. + + * If the samples are weighted, it will be easier to optimize the tree + structure using weight-based pre-pruning criterion such as + ``min_weight_fraction_leaf``, which ensure that leaf nodes contain at least + a fraction of the overall sum of the sample weights. * All decision trees use ``np.float32`` arrays internally. If training data is not in this format, a copy of the dataset will be made. diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 46b262896145c..0fe95de46eb42 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -343,12 +343,6 @@ Support for Python 3.3 has been officially dropped. while mask does not allow this functionality. :issue:`9524` by :user:`Guillaume Lemaitre `. -- |API| The parameters ``min_samples_leaf`` and ``min_weight_fraction_leaf`` in - tree-based ensembles are deprecated and will be removed (fixed to 1 and 0 - respectively) in version 0.22. These parameters were not effective for - regularization and at worst would produce bad splits. :issue:`10773` by - :user:`Bob Chen ` and `Joel Nothman`_. - - |Fix| :class:`ensemble.BaseBagging` where one could not deterministically reproduce ``fit`` result using the object attributes when ``random_state`` is set. :issue:`9723` by :user:`Guillaume Lemaitre `. @@ -1035,13 +1029,6 @@ Support for Python 3.3 has been officially dropped. considered all samples to be of equal weight importance. :issue:`11464` by :user:`John Stott `. -- |API| The parameters ``min_samples_leaf`` and ``min_weight_fraction_leaf`` in - :class:`tree.DecisionTreeClassifier` and :class:`tree.DecisionTreeRegressor` - are deprecated and will be removed (fixed to 1 and 0 respectively) in version - 0.22. These parameters were not effective for regularization and at worst - would produce bad splits. :issue:`10773` by :user:`Bob Chen ` - and `Joel Nothman`_. - :mod:`sklearn.utils` .................... 
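As a minimal sketch of the guidance restored in doc/modules/tree.rst above
(the dataset and the parameter values here are arbitrary, chosen only for
illustration)::

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)

    # Each leaf must cover at least 5 training samples, pruning away
    # low-support, high-variance splits.
    clf = DecisionTreeClassifier(min_samples_leaf=5).fit(X, y)

    # The weight-aware counterpart: each leaf must hold at least 5% of
    # the total sample weight, which is less biased toward dominant
    # classes when sample_weight is used.
    clf_w = DecisionTreeClassifier(min_weight_fraction_leaf=0.05).fit(X, y)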
diff --git a/examples/ensemble/plot_adaboost_hastie_10_2.py b/examples/ensemble/plot_adaboost_hastie_10_2.py index 7fc00a77e3eab..4d48d13dd24f2 100644 --- a/examples/ensemble/plot_adaboost_hastie_10_2.py +++ b/examples/ensemble/plot_adaboost_hastie_10_2.py @@ -43,11 +43,11 @@ X_test, y_test = X[2000:], y[2000:] X_train, y_train = X[:2000], y[:2000] -dt_stump = DecisionTreeClassifier(max_depth=1) +dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1) dt_stump.fit(X_train, y_train) dt_stump_err = 1.0 - dt_stump.score(X_test, y_test) -dt = DecisionTreeClassifier(max_depth=9) +dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1) dt.fit(X_train, y_train) dt_err = 1.0 - dt.score(X_test, y_test) diff --git a/examples/ensemble/plot_gradient_boosting_oob.py b/examples/ensemble/plot_gradient_boosting_oob.py index 99f30e750b7ed..ea38b326ce5c9 100644 --- a/examples/ensemble/plot_gradient_boosting_oob.py +++ b/examples/ensemble/plot_gradient_boosting_oob.py @@ -55,7 +55,7 @@ # Fit classifier with out-of-bag estimates params = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5, - 'learning_rate': 0.01, 'random_state': 3} + 'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3} clf = ensemble.GradientBoostingClassifier(**params) clf.fit(X_train, y_train) diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py index 99e7289710e35..6fb2731a513ec 100644 --- a/examples/ensemble/plot_gradient_boosting_quantile.py +++ b/examples/ensemble/plot_gradient_boosting_quantile.py @@ -41,7 +41,8 @@ def f(x): clf = GradientBoostingRegressor(loss='quantile', alpha=alpha, n_estimators=250, max_depth=3, - learning_rate=.1, min_samples_split=9) + learning_rate=.1, min_samples_leaf=9, + min_samples_split=9) clf.fit(X, y) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 125f48d5b0da6..542f7ca8043f1 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -784,8 +784,8 @@ class RandomForestClassifier(ForestClassifier): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -793,30 +793,25 @@ class RandomForestClassifier(ForestClassifier): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. 
deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: @@ -963,10 +958,9 @@ class labels (multi-output problem). RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, - min_samples_leaf='deprecated', min_samples_split=2, - min_weight_fraction_leaf='deprecated', n_estimators=100, - n_jobs=None, oob_score=False, random_state=0, verbose=0, - warm_start=False) + min_samples_leaf=1, min_samples_split=2, + min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, + oob_score=False, random_state=0, verbose=0, warm_start=False) >>> print(clf.feature_importances_) [0.14205973 0.76664038 0.0282433 0.06305659] >>> print(clf.predict([[0, 0, 0, 0]])) @@ -975,7 +969,7 @@ class labels (multi-output problem). Notes ----- The default values for the parameters controlling the size of the trees - (e.g. ``max_depth``, ``min_samples_split``, etc.) lead to fully grown and + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. @@ -1001,8 +995,8 @@ def __init__(self, criterion="gini", max_depth=None, min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, + min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0., @@ -1079,8 +1073,8 @@ class RandomForestRegressor(ForestRegressor): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -1088,30 +1082,25 @@ class RandomForestRegressor(ForestRegressor): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. 
+ - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: @@ -1220,10 +1209,9 @@ class RandomForestRegressor(ForestRegressor): RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, - min_samples_leaf='deprecated', min_samples_split=2, - min_weight_fraction_leaf='deprecated', n_estimators=100, - n_jobs=None, oob_score=False, random_state=0, verbose=0, - warm_start=False) + min_samples_leaf=1, min_samples_split=2, + min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, + oob_score=False, random_state=0, verbose=0, warm_start=False) >>> print(regr.feature_importances_) [0.18146984 0.81473937 0.00145312 0.00233767] >>> print(regr.predict([[0, 0, 0, 0]])) @@ -1232,7 +1220,7 @@ class RandomForestRegressor(ForestRegressor): Notes ----- The default values for the parameters controlling the size of the trees - (e.g. ``max_depth``, ``min_samples_split``, etc.) lead to fully grown and + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. @@ -1265,8 +1253,8 @@ def __init__(self, criterion="mse", max_depth=None, min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, + min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0., @@ -1334,8 +1322,8 @@ class ExtraTreesClassifier(ForestClassifier): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -1343,30 +1331,25 @@ class ExtraTreesClassifier(ForestClassifier): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. 
+ A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: @@ -1501,7 +1484,7 @@ class labels (multi-output problem). Notes ----- The default values for the parameters controlling the size of the trees - (e.g. ``max_depth``, ``min_samples_split``, etc.) lead to fully grown and + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. @@ -1523,8 +1506,8 @@ def __init__(self, criterion="gini", max_depth=None, min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, + min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0., @@ -1599,8 +1582,8 @@ class ExtraTreesRegressor(ForestRegressor): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -1608,30 +1591,25 @@ class ExtraTreesRegressor(ForestRegressor): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. 
deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: @@ -1729,7 +1707,7 @@ class ExtraTreesRegressor(ForestRegressor): Notes ----- The default values for the parameters controlling the size of the trees - (e.g. ``max_depth``, ``min_samples_split``, etc.) lead to fully grown and + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. @@ -1750,8 +1728,8 @@ def __init__(self, criterion="mse", max_depth=None, min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, + min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0., @@ -1820,8 +1798,8 @@ class RandomTreesEmbedding(BaseForest): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split. @@ -1829,30 +1807,25 @@ class RandomTreesEmbedding(BaseForest): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. 
Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_leaf_nodes : int or None, optional (default=None) Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. @@ -1928,8 +1901,8 @@ def __init__(self, n_estimators='warn', max_depth=5, min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, + min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_decrease=0., min_impurity_split=None, diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index c6e0fbee3fe51..6ae4f6fd1b277 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -25,7 +25,6 @@ from abc import ABCMeta from abc import abstractmethod -import warnings from .base import BaseEnsemble from ..base import ClassifierMixin @@ -1125,13 +1124,13 @@ class BaseGradientBoosting(six.with_metaclass(ABCMeta, BaseEnsemble)): @abstractmethod def __init__(self, loss, learning_rate, n_estimators, criterion, - min_samples_split, min_weight_fraction_leaf, + min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, min_impurity_decrease, min_impurity_split, init, subsample, max_features, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, - min_samples_leaf='deprecated', warm_start=False, - presort='auto', validation_fraction=0.1, - n_iter_no_change=None, tol=1e-4): + warm_start=False, presort='auto', + validation_fraction=0.1, n_iter_no_change=None, + tol=1e-4): self.n_estimators = n_estimators self.learning_rate = learning_rate @@ -1491,17 +1490,9 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state, n_inbag = max(1, int(self.subsample * n_samples)) loss_ = self.loss_ - if self.min_weight_fraction_leaf != 'deprecated': - warnings.warn("'min_weight_fraction_leaf' is deprecated in 0.20 " - "and will be fixed to a value of 0 in 0.22.", - DeprecationWarning) - min_weight_fraction_leaf = self.min_weight_fraction_leaf - else: - min_weight_fraction_leaf = 0. - # Set min_weight_leaf from min_weight_fraction_leaf - if min_weight_fraction_leaf != 0. and sample_weight is not None: - min_weight_leaf = (min_weight_fraction_leaf * + if self.min_weight_fraction_leaf != 0. and sample_weight is not None: + min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) else: min_weight_leaf = 0. @@ -1739,8 +1730,8 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -1748,30 +1739,25 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. 
+ A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_depth : integer, optional (default=3) maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter @@ -1948,8 +1934,7 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, @@ -2204,8 +2189,8 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -2213,19 +2198,19 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. 
min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all @@ -2403,8 +2388,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., min_impurity_split=None, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index a470913f5f327..d7586c2866571 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -762,16 +762,13 @@ def check_min_samples_leaf(name): ForestEstimator = FOREST_ESTIMATORS[name] # test boundary value - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - assert_raises(ValueError, - ForestEstimator(min_samples_leaf=-1).fit, X, y) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - assert_raises(ValueError, - ForestEstimator(min_samples_leaf=0).fit, X, y) + assert_raises(ValueError, + ForestEstimator(min_samples_leaf=-1).fit, X, y) + assert_raises(ValueError, + ForestEstimator(min_samples_leaf=0).fit, X, y) est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - est.fit(X, y) + est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes @@ -781,8 +778,7 @@ def check_min_samples_leaf(name): est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, random_state=0) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - est.fit(X, y) + est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes @@ -815,9 +811,7 @@ def check_min_weight_fraction_leaf(name): if "RandomForest" in name: est.bootstrap = False - with pytest.warns(DeprecationWarning, - match='min_weight_fraction_leaf'): - est.fit(X, y, sample_weight=weights) + est.fit(X, y, sample_weight=weights) out = est.estimators_[0].tree_.apply(X) node_weights = np.bincount(out, weights=weights) # drop inner nodes diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 332ab89317e1c..6f7654c7d6061 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -106,29 +106,17 @@ def test_classifier_parameter_checks(): assert_raises(ValueError, GradientBoostingClassifier(min_samples_split=1.1).fit, X, y) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - assert_raises( - ValueError, - GradientBoostingClassifier(min_samples_leaf=0).fit, - X, y - ) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - assert_raises( - ValueError, - GradientBoostingClassifier(min_samples_leaf=-1.0).fit, - X, y - ) - - with pytest.warns(DeprecationWarning, match='min_weight_fraction_leaf'): - assert_raises(ValueError, - GradientBoostingClassifier( - min_weight_fraction_leaf=-1.).fit, - X, y) - with pytest.warns(DeprecationWarning, match='min_weight_fraction_leaf'): - assert_raises(ValueError, - GradientBoostingClassifier( - min_weight_fraction_leaf=0.6).fit, - X, y) + assert_raises(ValueError, + GradientBoostingClassifier(min_samples_leaf=0).fit, X, 
y) + assert_raises(ValueError, + GradientBoostingClassifier(min_samples_leaf=-1.0).fit, X, y) + + assert_raises(ValueError, + GradientBoostingClassifier(min_weight_fraction_leaf=-1.).fit, + X, y) + assert_raises(ValueError, + GradientBoostingClassifier(min_weight_fraction_leaf=0.6).fit, + X, y) assert_raises(ValueError, GradientBoostingClassifier(subsample=0.0).fit, X, y) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 68b5040374290..37eb6582c7023 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -507,28 +507,16 @@ def test_error(): assert_raises(ValueError, est.predict_proba, X2) for name, TreeEstimator in ALL_TREES.items(): - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - assert_raises(ValueError, - TreeEstimator(min_samples_leaf=-1).fit, X, y) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - assert_raises(ValueError, - TreeEstimator(min_samples_leaf=.6).fit, X, y) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - assert_raises(ValueError, - TreeEstimator(min_samples_leaf=0.).fit, X, y) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - assert_raises(ValueError, - TreeEstimator(min_samples_leaf=3.).fit, X, y) - with pytest.warns(DeprecationWarning, - match='min_weight_fraction_leaf'): - assert_raises(ValueError, - TreeEstimator(min_weight_fraction_leaf=-1).fit, - X, y) - with pytest.warns(DeprecationWarning, - match='min_weight_fraction_leaf'): - assert_raises(ValueError, - TreeEstimator(min_weight_fraction_leaf=0.51).fit, - X, y) + assert_raises(ValueError, TreeEstimator(min_samples_leaf=-1).fit, X, y) + assert_raises(ValueError, TreeEstimator(min_samples_leaf=.6).fit, X, y) + assert_raises(ValueError, TreeEstimator(min_samples_leaf=0.).fit, X, y) + assert_raises(ValueError, TreeEstimator(min_samples_leaf=3.).fit, X, y) + assert_raises(ValueError, + TreeEstimator(min_weight_fraction_leaf=-1).fit, + X, y) + assert_raises(ValueError, + TreeEstimator(min_weight_fraction_leaf=0.51).fit, + X, y) assert_raises(ValueError, TreeEstimator(min_samples_split=-1).fit, X, y) assert_raises(ValueError, TreeEstimator(min_samples_split=0.0).fit, @@ -631,8 +619,7 @@ def test_min_samples_leaf(): est = TreeEstimator(min_samples_leaf=5, max_leaf_nodes=max_leaf_nodes, random_state=0) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - est.fit(X, y) + est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes @@ -644,8 +631,7 @@ def test_min_samples_leaf(): est = TreeEstimator(min_samples_leaf=0.1, max_leaf_nodes=max_leaf_nodes, random_state=0) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - est.fit(X, y) + est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes @@ -674,9 +660,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): est = TreeEstimator(min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0) - with pytest.warns(DeprecationWarning, - match='min_weight_fraction_leaf'): - est.fit(X, y, sample_weight=weights) + est.fit(X, y, sample_weight=weights) if sparse: out = est.tree_.apply(X.tocsr()) @@ -701,9 +685,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): est = TreeEstimator(min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0) - with pytest.warns(DeprecationWarning, - match='min_weight_fraction_leaf'): - est.fit(X, y) + est.fit(X, y) if sparse: out = 
est.tree_.apply(X.tocsr()) @@ -749,8 +731,7 @@ def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets, max_leaf_nodes=max_leaf_nodes, min_samples_leaf=5, random_state=0) - with pytest.warns(DeprecationWarning): - est.fit(X, y) + est.fit(X, y) if sparse: out = est.tree_.apply(X.tocsr()) @@ -775,8 +756,7 @@ def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets, max_leaf_nodes=max_leaf_nodes, min_samples_leaf=.1, random_state=0) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - est.fit(X, y) + est.fit(X, y) if sparse: out = est.tree_.apply(X.tocsr()) @@ -1432,16 +1412,10 @@ def check_sparse_parameters(tree, dataset): assert_array_almost_equal(s.predict(X), d.predict(X)) # Check min_samples_leaf - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - d = TreeEstimator( - random_state=0, - min_samples_leaf=X_sparse.shape[0] // 2 - ).fit(X, y) - with pytest.warns(DeprecationWarning, match='min_samples_leaf'): - s = TreeEstimator( - random_state=0, - min_samples_leaf=X_sparse.shape[0] // 2 - ).fit(X_sparse, y) + d = TreeEstimator(random_state=0, + min_samples_leaf=X_sparse.shape[0] // 2).fit(X, y) + s = TreeEstimator(random_state=0, + min_samples_leaf=X_sparse.shape[0] // 2).fit(X_sparse, y) assert_tree_equal(d.tree_, s.tree_, "{0} with dense and sparse format gave different " "trees".format(tree)) @@ -1586,8 +1560,7 @@ def _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight): assert_equal(est.tree_.max_depth, 1) est = TreeEstimator(random_state=0, min_weight_fraction_leaf=0.4) - with pytest.warns(DeprecationWarning, match='min_weight_fraction_leaf'): - est.fit(X, y, sample_weight=sample_weight) + est.fit(X, y, sample_weight=sample_weight) assert_equal(est.tree_.max_depth, 0) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 437dc197c7a04..9985cee2eef77 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -85,26 +85,26 @@ def __init__(self, splitter, max_depth, min_samples_split, + min_samples_leaf, min_weight_fraction_leaf, max_features, max_leaf_nodes, random_state, min_impurity_decrease, min_impurity_split, - min_samples_leaf='deprecated', class_weight=None, presort=False): self.criterion = criterion self.splitter = splitter self.max_depth = max_depth self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split - self.min_samples_leaf = min_samples_leaf self.class_weight = class_weight self.presort = presort @@ -173,24 +173,18 @@ def fit(self, X, y, sample_weight=None, check_input=True, max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) - if self.min_samples_leaf != 'deprecated': - warnings.warn("'min_samples_leaf' is deprecated in 0.20 and " - "will be fixed to a value of 1 in 0.22.", - DeprecationWarning) - min_samples_leaf = self.min_samples_leaf - else: - min_samples_leaf = 1 - if isinstance(min_samples_leaf, (numbers.Integral, np.integer)): - if not 1 <= min_samples_leaf: + if isinstance(self.min_samples_leaf, (numbers.Integral, np.integer)): + if not 1 <= self.min_samples_leaf: raise ValueError("min_samples_leaf must be at least 1 " "or in (0, 0.5], got %s" - % min_samples_leaf) + % self.min_samples_leaf) + min_samples_leaf = self.min_samples_leaf else: # float - if not 0. 
< min_samples_leaf <= 0.5: + if not 0. < self.min_samples_leaf <= 0.5: raise ValueError("min_samples_leaf must be at least 1 " "or in (0, 0.5], got %s" - % min_samples_leaf) - min_samples_leaf = int(ceil(min_samples_leaf * n_samples)) + % self.min_samples_leaf) + min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) if isinstance(self.min_samples_split, (numbers.Integral, np.integer)): if not 2 <= self.min_samples_split: @@ -240,15 +234,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, if len(y) != n_samples: raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples)) - - if self.min_weight_fraction_leaf != 'deprecated': - warnings.warn("'min_weight_fraction_leaf' is deprecated in 0.20 " - "and will be fixed to a value of 0 in 0.22.", - DeprecationWarning) - min_weight_fraction_leaf = self.min_weight_fraction_leaf - else: - min_weight_fraction_leaf = 0 - if not 0 <= min_weight_fraction_leaf <= 0.5: + if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if max_depth <= 0: raise ValueError("max_depth must be greater than zero. ") @@ -283,10 +269,10 @@ def fit(self, X, y, sample_weight=None, check_input=True, # Set min_weight_leaf from min_weight_fraction_leaf if sample_weight is None: - min_weight_leaf = (min_weight_fraction_leaf * + min_weight_leaf = (self.min_weight_fraction_leaf * n_samples) else: - min_weight_leaf = (min_weight_fraction_leaf * + min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) if self.min_impurity_split is not None: @@ -553,8 +539,8 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -562,30 +548,25 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. 
deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_features : int, float, string or None, optional (default=None) The number of features to consider when looking for the best split: @@ -703,7 +684,7 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): Notes ----- The default values for the parameters controlling the size of the trees - (e.g. ``max_depth``, ``min_samples_split``, etc.) lead to fully grown and + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. @@ -751,8 +732,8 @@ def __init__(self, splitter="best", max_depth=None, min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, + min_weight_fraction_leaf=0., max_features=None, random_state=None, max_leaf_nodes=None, @@ -930,8 +911,8 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -939,30 +920,25 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_features : int, float, string or None, optional (default=None) The number of features to consider when looking for the best split: @@ -1051,7 +1027,7 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): Notes ----- The default values for the parameters controlling the size of the trees - (e.g. 
``max_depth``, ``min_samples_split``, etc.) lead to fully grown and + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. @@ -1099,8 +1075,8 @@ def __init__(self, splitter="best", max_depth=None, min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, + min_weight_fraction_leaf=0., max_features=None, random_state=None, max_leaf_nodes=None, @@ -1197,8 +1173,8 @@ class ExtraTreeClassifier(DecisionTreeClassifier): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -1206,30 +1182,25 @@ class ExtraTreeClassifier(DecisionTreeClassifier): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: @@ -1313,7 +1284,7 @@ class ExtraTreeClassifier(DecisionTreeClassifier): Notes ----- The default values for the parameters controlling the size of the trees - (e.g. ``max_depth``, ``min_samples_split``, etc.) lead to fully grown and + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. 
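To make the int-versus-float semantics restored in these docstrings concrete, a small sketch (assuming the patched estimators; the sample count and the 0.05 fraction are arbitrary example values):

from math import ceil
import numpy as np
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, n_features=4, random_state=0)

# An int is an absolute count; a float is a fraction of n_samples that is
# converted internally via ceil(min_samples_leaf * n_samples).
assert ceil(0.05 * len(X)) == 10  # so 0.05 and 10 coincide for this dataset
reg = DecisionTreeRegressor(min_samples_leaf=0.05, random_state=0).fit(X, y)

# Every leaf ends up holding at least 10 training samples.
leaf_ids = reg.apply(X)
leaf_sizes = np.bincount(leaf_ids)
assert leaf_sizes[np.unique(leaf_ids)].min() >= 10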
@@ -1329,8 +1300,8 @@ def __init__(self, splitter="random", max_depth=None, min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, + min_weight_fraction_leaf=0., max_features="auto", random_state=None, max_leaf_nodes=None, @@ -1390,8 +1361,8 @@ class ExtraTreeRegressor(DecisionTreeRegressor): min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: - - If int, then consider ``min_samples_split`` as the minimum number. - - If float, then ``min_samples_split`` is a fraction and + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. @@ -1399,30 +1370,25 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Added float values for fractions. min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node: - - - If int, then consider ``min_samples_leaf`` as the minimum number. - - If float, then ``min_samples_leaf`` is a fraction and + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. - .. deprecated:: 0.20 - The parameter ``min_samples_leaf`` is deprecated in version 0.20 and - will be fixed to a value of 1 in version 0.22. It was not effective - for regularization and empirically, 1 is the best value. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - .. deprecated:: 0.20 - The parameter ``min_weight_fraction_leaf`` is deprecated in version - 0.20. Its implementation, like ``min_samples_leaf``, is ineffective - for regularization. - max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: @@ -1486,7 +1452,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Notes ----- The default values for the parameters controlling the size of the trees - (e.g. ``max_depth``, ``min_samples_split``, etc.) lead to fully grown and + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. 
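Similarly, the weight threshold behind ``min_weight_fraction_leaf`` follows the conversion visible in ``BaseDecisionTree.fit`` above; a sketch of that arithmetic with made-up weights:

import numpy as np

# Mirrors the conversion in BaseDecisionTree.fit shown earlier: the fraction
# becomes an absolute per-leaf weight threshold.
min_weight_fraction_leaf = 0.1
sample_weight = np.array([1.0, 1.0, 2.0, 2.0, 4.0])  # hypothetical weights

min_weight_leaf = min_weight_fraction_leaf * np.sum(sample_weight)
print(min_weight_leaf)  # 1.0 -- each leaf must carry total weight >= 1.0

# With sample_weight=None, n_samples takes the place of the weight sum:
n_samples = len(sample_weight)
print(min_weight_fraction_leaf * n_samples)  # 0.5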
@@ -1502,8 +1468,8 @@ def __init__(self, splitter="random", max_depth=None, min_samples_split=2, - min_samples_leaf='deprecated', - min_weight_fraction_leaf='deprecated', + min_samples_leaf=1, + min_weight_fraction_leaf=0., max_features="auto", random_state=None, min_impurity_decrease=0., From e5333f5dfe61a69bede562c20a055359adad7e51 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Sat, 8 Sep 2018 16:46:48 +0200 Subject: [PATCH 029/163] OPTICS remove redundant recursion (#11985) --- sklearn/cluster/optics_.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 5c20ddb421845..1d7a677b51fb7 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -680,10 +680,6 @@ def _cluster_tree(node, parent_node, local_maxima_points, if reachability_plot[s] < significant_min: node.split_point = -1 # if split_point is not significant, ignore this split and continue - _cluster_tree(node, parent_node, local_maxima_points, - reachability_plot, reachability_ordering, - min_cluster_size, maxima_ratio, rejection_ratio, - similarity_threshold, significant_min) return # only check a certain ratio of points in the child From 2242f4c1d00d11de8fa01e67647f8a5188269fcb Mon Sep 17 00:00:00 2001 From: Max Copeland Date: Sat, 8 Sep 2018 07:52:01 -0700 Subject: [PATCH 030/163] EXA use openml fetcher in plot_gpr_co2.py example (#12004) --- examples/gaussian_process/plot_gpr_co2.py | 34 ++++++++++------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 8170de01898dc..4c438ce821284 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -66,7 +66,7 @@ import numpy as np from matplotlib import pyplot as plt - +from sklearn.datasets import fetch_openml from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels \ import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared @@ -79,29 +79,25 @@ print(__doc__) -def load_mauna_loa_atmospheric_c02(): - url = ('http://cdiac.ess-dive.lbl.gov/' - 'ftp/trends/co2/sio-keel-flask/maunaloa_c.dat') +def load_mauna_loa_atmospheric_co2(): + ml_data = fetch_openml(data_id=41187) months = [] ppmv_sums = [] counts = [] - for line in urlopen(url): - line = line.decode('utf8') - if not line.startswith('MLO'): - # ignore headers - continue - station, date, weight, flag, ppmv = line.split() - y = date[:2] - m = date[2:4] - month_float = (int(('20' if y < '20' else '19') + y) + - (int(m) - 1) / 12) - if not months or month_float != months[-1]: - months.append(month_float) - ppmv_sums.append(float(ppmv)) + + y = ml_data.data[:, 0] + m = ml_data.data[:, 1] + month_float = y + (m - 1) / 12 + ppmvs = ml_data.target + + for month, ppmv in zip(month_float, ppmvs): + if not months or month != months[-1]: + months.append(month) + ppmv_sums.append(ppmv) counts.append(1) else: # aggregate monthly sum to produce average - ppmv_sums[-1] += float(ppmv) + ppmv_sums[-1] += ppmv counts[-1] += 1 months = np.asarray(months).reshape(-1, 1) @@ -109,7 +105,7 @@ def load_mauna_loa_atmospheric_c02(): return months, avg_ppmvs -X, y = load_mauna_loa_atmospheric_c02() +X, y = load_mauna_loa_atmospheric_co2() # Kernel with parameters given in GPML book k1 = 66.0**2 * RBF(length_scale=67.0) # long term smooth rising trend From 177900bca7f53cb44b02dfed9d21efb6fda7c434 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 9 Sep 2018 04:19:38 +0200 
Subject: [PATCH 031/163] CI Workaround to test numpy 1.8.2 and scipy 0.13.3 (#12042) --- build_tools/travis/install.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index d41e746a1ab2e..b15e76ea397ce 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -84,7 +84,11 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # and scipy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install pytest pytest-cov cython==$CYTHON_VERSION + # FIXME: Importing scipy.sparse with numpy 1.8.2 and scipy 0.13.3 produces + # a deprecation warning and the test suite fails on such warnings. + # To test these numpy/scipy versions, we use pytest<3.8 as it has + # a known limitation/bug of not capturing warnings during test collection. + pip install pytest==3.7.4 pytest-cov cython==$CYTHON_VERSION elif [[ "$DISTRIB" == "scipy-dev" ]]; then make_conda python=3.7 From 251e58b9e2c098aa805b58dd128864ec66ec782e Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 9 Sep 2018 12:30:16 +1000 Subject: [PATCH 032/163] FIX ordering_ type and cosmetic changes to structure for OPTICS main loop (#11986) --- sklearn/cluster/optics_.py | 79 ++++++++++++++-------------- sklearn/cluster/tests/test_optics.py | 18 +++++++ 2 files changed, 58 insertions(+), 39 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 1d7a677b51fb7..165102d0a52bc 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -331,14 +331,12 @@ def fit(self, X, y=None): n_samples = len(X) # Start all points as 'unprocessed' ## - self._processed = np.zeros((n_samples, 1), dtype=bool) self.reachability_ = np.empty(n_samples) self.reachability_.fill(np.inf) self.core_distances_ = np.empty(n_samples) self.core_distances_.fill(np.nan) # Start all points as noise ## self.labels_ = np.full(n_samples, -1, dtype=int) - self.ordering_ = [] # Check for valid n_samples relative to min_samples if self.min_samples > n_samples: @@ -357,11 +355,7 @@ def fit(self, X, y=None): self.core_distances_[:] = nbrs.kneighbors(X, self.min_samples)[0][:, -1] - # Main OPTICS loop. Not parallelizable. The order that entries are - # written to the 'ordering_' list is important! - for point in range(n_samples): - if not self._processed[point]: - self._expand_cluster_order(point, X, nbrs) + self.ordering_ = self._calculate_optics_order(X, nbrs) indices_, self.labels_ = _extract_optics(self.ordering_, self.reachability_, @@ -374,46 +368,53 @@ def fit(self, X, y=None): self.core_sample_indices_ = indices_ return self - # OPTICS helper functions; these should not be public # - - def _expand_cluster_order(self, point, X, nbrs): - # As above, not parallelizable. Parallelizing would allow items in - # the 'unprocessed' list to switch to 'processed' - if self.core_distances_[point] <= self.max_eps: - while not self._processed[point]: - self._processed[point] = True - self.ordering_.append(point) - point = self._set_reach_dist(point, X, nbrs) - else: # For very noisy points - self.ordering_.append(point) - self._processed[point] = True - - def _set_reach_dist(self, point_index, X, nbrs): - P = np.array(X[point_index]).reshape(1, -1) + # OPTICS helper functions + + def _calculate_optics_order(self, X, nbrs): + # Main OPTICS loop. Not parallelizable. The order that entries are + # written to the 'ordering_' list is important! 
+ processed = np.zeros(X.shape[0], dtype=bool) + ordering = np.zeros(X.shape[0], dtype=int) + ordering_idx = 0 + for point in range(X.shape[0]): + if processed[point]: + continue + if self.core_distances_[point] <= self.max_eps: + while not processed[point]: + processed[point] = True + ordering[ordering_idx] = point + ordering_idx += 1 + point = self._set_reach_dist(point, processed, X, nbrs) + else: # For very noisy points + ordering[ordering_idx] = point + ordering_idx += 1 + processed[point] = True + return ordering + + def _set_reach_dist(self, point_index, processed, X, nbrs): + P = X[point_index:point_index + 1] indices = nbrs.radius_neighbors(P, radius=self.max_eps, return_distance=False)[0] # Getting indices of neighbors that have not been processed - unproc = np.compress((~np.take(self._processed, indices)).ravel(), + unproc = np.compress((~np.take(processed, indices)).ravel(), indices, axis=0) # Keep n_jobs = 1 in the following lines...please - if len(unproc) > 0: - dists = pairwise_distances(P, np.take(X, unproc, axis=0), - self.metric, n_jobs=None).ravel() - - rdists = np.maximum(dists, self.core_distances_[point_index]) - new_reach = np.minimum(np.take(self.reachability_, unproc), rdists) - self.reachability_[unproc] = new_reach - - # Checks to see if everything is already processed; - # if so, return control to main loop - if unproc.size > 0: - # Define return order based on reachability distance - return(unproc[quick_scan(np.take(self.reachability_, unproc), - dists)]) - else: + if not unproc.size: + # Everything is already processed. Return to main loop return point_index + dists = pairwise_distances(P, np.take(X, unproc, axis=0), + self.metric, n_jobs=1).ravel() + + rdists = np.maximum(dists, self.core_distances_[point_index]) + new_reach = np.minimum(np.take(self.reachability_, unproc), rdists) + self.reachability_[unproc] = new_reach + + # Define return order based on reachability distance + return (unproc[quick_scan(np.take(self.reachability_, unproc), + dists)]) + def extract_dbscan(self, eps): """Performs DBSCAN extraction for an arbitrary epsilon. 
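Putting the refactored ordering loop to use, a minimal sketch, assuming the development OPTICS API shown in this patch (the attribute names and the ``extract_dbscan`` return value follow the code above and may differ in released versions):

import numpy as np
from sklearn.cluster import OPTICS

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + [5, 5]])

clust = OPTICS(min_samples=10).fit(X)

# ordering_ is a permutation of all sample indices (see the new test below);
# indexing reachability_ by it yields the reachability plot that cluster
# extraction works from.
assert set(clust.ordering_) == set(range(len(X)))
reachability_plot = clust.reachability_[clust.ordering_]

# DBSCAN-style extraction at a fixed eps reuses the precomputed ordering.
core_indices, labels = clust.extract_dbscan(eps=1.0)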
diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 5a89cb7a0c439..545ffbf0ba797 100755 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -33,6 +33,24 @@ def test_correct_number_of_clusters(): n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) assert_equal(n_clusters_1, n_clusters) + # check attribute types and sizes + assert clust.core_sample_indices_.ndim == 1 + assert clust.core_sample_indices_.size > 0 + assert clust.core_sample_indices_.dtype.kind == 'i' + + assert clust.labels_.shape == (len(X),) + assert clust.labels_.dtype.kind == 'i' + + assert clust.reachability_.shape == (len(X),) + assert clust.reachability_.dtype.kind == 'f' + + assert clust.core_distances_.shape == (len(X),) + assert clust.core_distances_.dtype.kind == 'f' + + assert clust.ordering_.shape == (len(X),) + assert clust.ordering_.dtype.kind == 'i' + assert set(clust.ordering_) == set(range(len(X))) + def test_minimum_number_of_sample_check(): # test that we check a minimum number of samples From a86709fdc379f7d7db76a75f39572890e4ddcad1 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 9 Sep 2018 12:36:00 +1000 Subject: [PATCH 033/163] [MRG] MNT rename min_cluster_size_ratio to min_cluster_size (#11913) --- sklearn/cluster/optics_.py | 66 +++++++++++++++----------- sklearn/cluster/tests/test_optics.py | 69 +++++++++++++++------------- 2 files changed, 77 insertions(+), 58 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 165102d0a52bc..899da518ae796 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -24,7 +24,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, - significant_min=.003, min_cluster_size_ratio=.005, + significant_min=.003, min_cluster_size=.005, min_maxima_ratio=0.001, algorithm='ball_tree', leaf_size=30, n_jobs=None): """Perform OPTICS clustering from vector array @@ -93,8 +93,10 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', significant_min : float, optional (default=.003) Sets a lower threshold on how small a significant maxima can be. - min_cluster_size_ratio : float, optional (default=.005) - Minimum percentage of dataset expected for cluster membership. + min_cluster_size : int > 1 or float between 0 and 1 (default=0.005) + Minimum number of samples in an OPTICS cluster, expressed as an + absolute number or a fraction of the number of samples (rounded + to be at least 2). min_maxima_ratio : float, optional (default=.001) Used to determine neighborhood size for minimum cluster membership. @@ -151,7 +153,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', clust = OPTICS(min_samples, max_eps, metric, p, metric_params, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, - min_cluster_size_ratio, min_maxima_ratio, + min_cluster_size, min_maxima_ratio, algorithm, leaf_size, n_jobs) clust.fit(X) return clust.core_sample_indices_, clust.labels_ @@ -221,8 +223,10 @@ class OPTICS(BaseEstimator, ClusterMixin): significant_min : float, optional (default=.003) Sets a lower threshold on how small a significant maxima can be. - min_cluster_size_ratio : float, optional (default=.005) - Minimum percentage of dataset expected for cluster membership. 
+ min_cluster_size : int > 1 or float between 0 and 1 (default=0.005) + Minimum number of samples in an OPTICS cluster, expressed as an + absolute number or a fraction of the number of samples (rounded + to be at least 2). min_maxima_ratio : float, optional (default=.001) Used to determine neighborhood size for minimum cluster membership. @@ -289,7 +293,7 @@ class OPTICS(BaseEstimator, ClusterMixin): def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, - significant_min=.003, min_cluster_size_ratio=.005, + significant_min=.003, min_cluster_size=.005, min_maxima_ratio=0.001, algorithm='ball_tree', leaf_size=30, n_jobs=None): @@ -299,7 +303,7 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', self.rejection_ratio = rejection_ratio self.similarity_threshold = similarity_threshold self.significant_min = significant_min - self.min_cluster_size_ratio = min_cluster_size_ratio + self.min_cluster_size = min_cluster_size self.min_maxima_ratio = min_maxima_ratio self.algorithm = algorithm self.metric = metric @@ -330,6 +334,24 @@ def fit(self, X, y=None): X = check_array(X, dtype=np.float) n_samples = len(X) + + if self.min_samples > n_samples: + raise ValueError("Number of training samples (n_samples=%d) must " + "be greater than min_samples (min_samples=%d) " + "used for clustering." % + (n_samples, self.min_samples)) + + if self.min_cluster_size <= 0 or (self.min_cluster_size != + int(self.min_cluster_size) + and self.min_cluster_size > 1): + raise ValueError('min_cluster_size must be a positive integer or ' + 'a float between 0 and 1. Got %r' % + self.min_cluster_size) + elif self.min_cluster_size > n_samples: + raise ValueError('min_cluster_size must be no greater than the ' + 'number of samples (%d). Got %d' % + (n_samples, self.min_cluster_size)) + # Start all points as 'unprocessed' ## self.reachability_ = np.empty(n_samples) self.reachability_.fill(np.inf) @@ -338,13 +360,6 @@ def fit(self, X, y=None): # Start all points as noise ## self.labels_ = np.full(n_samples, -1, dtype=int) - # Check for valid n_samples relative to min_samples - if self.min_samples > n_samples: - raise ValueError("Number of training samples (n_samples=%d) must " - "be greater than min_samples (min_samples=%d) " - "used for clustering." % - (n_samples, self.min_samples)) - nbrs = NearestNeighbors(n_neighbors=self.min_samples, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric, @@ -363,7 +378,7 @@ def fit(self, X, y=None): self.rejection_ratio, self.similarity_threshold, self.significant_min, - self.min_cluster_size_ratio, + self.min_cluster_size, self.min_maxima_ratio) self.core_sample_indices_ = indices_ return self @@ -492,7 +507,7 @@ def _extract_dbscan(ordering, core_distances, reachability, eps): def _extract_optics(ordering, reachability, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, - significant_min=.003, min_cluster_size_ratio=.005, + significant_min=.003, min_cluster_size=.005, min_maxima_ratio=0.001): """Performs automatic cluster extraction for variable density data. @@ -530,8 +545,10 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, significant_min : float, optional Sets a lower threshold on how small a significant maxima can be. - min_cluster_size_ratio : float, optional - Minimum percentage of dataset expected for cluster membership. 
+ min_cluster_size : int > 1 or float between 0 and 1 + Minimum number of samples in an OPTICS cluster, expressed as an + absolute number or a fraction of the number of samples (rounded + to be at least 2). min_maxima_ratio : float, optional Used to determine neighborhood size for minimum cluster membership. @@ -551,7 +568,7 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, root_node = _automatic_cluster(reachability_plot, ordering, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, - min_cluster_size_ratio, min_maxima_ratio) + min_cluster_size, min_maxima_ratio) leaves = _get_leaves(root_node, []) # Start cluster id's at 0 clustid = 0 @@ -570,7 +587,7 @@ def _extract_optics(ordering, reachability, maxima_ratio=.75, def _automatic_cluster(reachability_plot, ordering, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, - min_cluster_size_ratio, min_maxima_ratio): + min_cluster_size, min_maxima_ratio): """Converts reachability plot to cluster tree and returns root node. Parameters @@ -582,13 +599,10 @@ def _automatic_cluster(reachability_plot, ordering, """ min_neighborhood_size = 2 - min_cluster_size = int(min_cluster_size_ratio * len(ordering)) + if min_cluster_size <= 1: + min_cluster_size = max(2, min_cluster_size * len(ordering)) neighborhood_size = int(min_maxima_ratio * len(ordering)) - # Should this check for < min_samples? Should this be public? - if min_cluster_size < 5: - min_cluster_size = 5 - # Again, should this check < min_samples, should the parameter be public? if neighborhood_size < min_neighborhood_size: neighborhood_size = min_neighborhood_size diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 545ffbf0ba797..bddf57ec7b5d1 100755 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -2,6 +2,7 @@ # Amy X. 
Zhang # License: BSD 3 clause +from __future__ import print_function, division import numpy as np import pytest @@ -20,6 +21,17 @@ from sklearn.cluster.tests.common import generate_clustered_data +rng = np.random.RandomState(0) +n_points_per_cluster = 250 +C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2) +C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2) +C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2) +C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2) +C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2) +C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2) +X = np.vstack((C1, C2, C3, C4, C5, C6)) + + def test_correct_number_of_clusters(): # in 'auto' mode @@ -135,27 +147,36 @@ def test_dbscan_optics_parity(eps, min_samples): def test_auto_extract_hier(): # Tests auto extraction gets correct # of clusters with varying density + clust = OPTICS(min_samples=9).fit(X) + assert_equal(len(set(clust.labels_)), 6) - # Generate sample data - rng = np.random.RandomState(0) - n_points_per_cluster = 250 - C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2) - C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2) - C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2) - C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2) - C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2) - C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2) - X = np.vstack((C1, C2, C3, C4, C5, C6)) +# try arbitrary minimum sizes +@pytest.mark.parametrize('min_cluster_size', range(2, X.shape[0] // 10, 23)) +def test_min_cluster_size(min_cluster_size): + redX = X[::10] # reduce for speed + clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(redX) + cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1]) + if cluster_sizes.size: + assert min(cluster_sizes) >= min_cluster_size + # check behaviour is the same when min_cluster_size is a fraction + clust_frac = OPTICS(min_samples=9, + min_cluster_size=min_cluster_size / redX.shape[0]) + clust_frac.fit(redX) + assert_array_equal(clust.labels_, clust_frac.labels_) - # Compute OPTICS - clust = OPTICS(min_samples=9) +@pytest.mark.parametrize('min_cluster_size', [0, -1, 1.1, 2.2]) +def test_min_cluster_size_invalid(min_cluster_size): + clust = OPTICS(min_cluster_size=min_cluster_size) + with pytest.raises(ValueError, match="must be a positive integer or a "): + clust.fit(X) - # Run the fit - clust.fit(X) - assert_equal(len(set(clust.labels_)), 6) +def test_min_cluster_size_invalid2(): + clust = OPTICS(min_cluster_size=len(X) + 1) + with pytest.raises(ValueError, match="must be no greater than the "): + clust.fit(X) @pytest.mark.parametrize("reach, n_child, members", [ @@ -187,23 +208,7 @@ def test_cluster_sigmin_pruning(reach, n_child, members): def test_reach_dists(): # Tests against known extraction array - rng = np.random.RandomState(0) - n_points_per_cluster = 250 - - C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2) - C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2) - C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2) - C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2) - C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2) - C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2) - X = np.vstack((C1, C2, C3, C4, C5, C6)) - - # Compute OPTICS - - clust = OPTICS(min_samples=10, metric='minkowski') - - # Run the fit - clust.fit(X) + clust = OPTICS(min_samples=10, metric='minkowski').fit(X) # Expected values, matches 'RD' results from: # http://chemometria.us.edu.pl/download/optics.py From 
5ec0001e93d3d0e071689e9b898850b7c35b0851 Mon Sep 17 00:00:00 2001 From: vqean3 Date: Tue, 11 Sep 2018 16:27:55 -0700 Subject: [PATCH 034/163] DOC `sample_weight` removed from the docs in `SVR` class. (#12046) --- sklearn/svm/classes.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index a2d96c322b332..1028843a9bf19 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -841,9 +841,6 @@ class SVR(BaseLibSVM, RegressorMixin): intercept_ : array, shape = [1] Constants in decision function. - sample_weight : array-like, shape = [n_samples] - Individual weights for each sample - Examples -------- >>> from sklearn.svm import SVR From f71de6fd264ba350e69737973e4eadebbe900469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BCdiger=20Busche?= Date: Wed, 12 Sep 2018 06:48:11 +0200 Subject: [PATCH 035/163] MNT Unify and refactor strategy error (#12050) --- sklearn/dummy.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index f2c866413183b..ade45a1735879 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -105,9 +105,11 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - if self.strategy not in ("most_frequent", "stratified", "uniform", - "constant", "prior"): - raise ValueError("Unknown strategy type.") + allowed_strategies = ("most_frequent", "stratified", "uniform", + "constant", "prior") + if self.strategy not in allowed_strategies: + raise ValueError("Unknown strategy type: %s, expected one of %s." + % (self.strategy, allowed_strategies)) if self.strategy == "uniform" and sp.issparse(y): y = y.toarray() @@ -386,10 +388,10 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - if self.strategy not in ("mean", "median", "quantile", "constant"): - raise ValueError("Unknown strategy type: %s, expected " - "'mean', 'median', 'quantile' or 'constant'" - % self.strategy) + allowed_strategies = ("mean", "median", "quantile", "constant") + if self.strategy not in allowed_strategies: + raise ValueError("Unknown strategy type: %s, expected one of %s." + % (self.strategy, allowed_strategies)) y = check_array(y, ensure_2d=False) if len(y) == 0: From 2ed18e00f77c8cd5b99f52ab1623ecaa8794b399 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Wed, 12 Sep 2018 09:47:42 +0200 Subject: [PATCH 036/163] [MRG] DOC Examples added to the rest of linear models (#11975) --- sklearn/linear_model/base.py | 17 ++++++++++++++ sklearn/linear_model/coordinate_descent.py | 24 ++++++++++++++++++++ sklearn/linear_model/huber.py | 23 +++++++++++++++++++ sklearn/linear_model/least_angle.py | 26 ++++++++++++++++++++++ sklearn/linear_model/omp.py | 25 +++++++++++++++++++++ sklearn/linear_model/ransac.py | 12 ++++++++++ sklearn/linear_model/theil_sen.py | 12 ++++++++++ 7 files changed, 139 insertions(+) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index 30a28cd507f67..29734a2135d8f 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -399,6 +399,23 @@ class LinearRegression(LinearModel, RegressorMixin): intercept_ : array Independent term in the linear model. 
+ Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LinearRegression + >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) + >>> # y = 1 * x_0 + 2 * x_1 + 3 + >>> y = np.dot(X, np.array([1, 2])) + 3 + >>> reg = LinearRegression().fit(X, y) + >>> reg.score(X, y) + 1.0 + >>> reg.coef_ + array([1., 2.]) + >>> reg.intercept_ # doctest: +ELLIPSIS + 3.0000... + >>> reg.predict(np.array([[3, 5]])) + array([16.]) + Notes ----- From the implementation point of view, this is just plain Ordinary diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index 6fa71f2dddcf4..2d0723944be4e 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -1368,6 +1368,17 @@ class LassoCV(LinearModelCV, RegressorMixin): number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. + Examples + -------- + >>> from sklearn.linear_model import LassoCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(noise=4, random_state=0) + >>> reg = LassoCV(cv=5, random_state=0).fit(X, y) + >>> reg.score(X, y) # doctest: +ELLIPSIS + 0.9993... + >>> reg.predict(X[:1,]) + array([-78.4951...]) + Notes ----- For an example, see @@ -2235,6 +2246,19 @@ class MultiTaskLassoCV(LinearModelCV, RegressorMixin): number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. + Examples + -------- + >>> from sklearn.linear_model import MultiTaskLassoCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_targets=2, noise=4, random_state=0) + >>> reg = MultiTaskLassoCV(cv=5, random_state=0).fit(X, y) + >>> reg.score(X, y) # doctest: +ELLIPSIS + 0.9994... + >>> reg.alpha_ + 0.5713... + >>> reg.predict(X[:1,]) + array([[153.7971..., 94.9015...]]) + See also -------- MultiTaskElasticNet diff --git a/sklearn/linear_model/huber.py b/sklearn/linear_model/huber.py index b6f4658ea573d..3270b5d221a51 100644 --- a/sklearn/linear_model/huber.py +++ b/sklearn/linear_model/huber.py @@ -192,6 +192,29 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): A boolean mask which is set to True where the samples are identified as outliers. + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import HuberRegressor, LinearRegression + >>> from sklearn.datasets import make_regression + >>> np.random.seed(0) + >>> X, y, coef = make_regression( + ... n_samples=200, n_features=2, noise=4.0, coef=True, random_state=0) + >>> X[:4] = np.random.uniform(10, 20, (4, 2)) + >>> y[:4] = np.random.uniform(10, 20, 4) + >>> huber = HuberRegressor().fit(X, y) + >>> huber.score(X, y) # doctest: +ELLIPSIS + -7.284608623514573 + >>> huber.predict(X[:1,]) + array([806.7200...]) + >>> linear = LinearRegression().fit(X, y) + >>> print("True coefficients:", coef) + True coefficients: [20.4923... 34.1698...] + >>> print("Huber coefficients:", huber.coef_) + Huber coefficients: [17.7906... 31.0106...] + >>> print("Linear Regression coefficients:", linear.coef_) + Linear Regression coefficients: [-1.9221... 7.0226...] + References ---------- .. [1] Peter J. Huber, Elvezio M. 
Ronchetti, Robust Statistics diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index d139560260a87..ce13b99b6aae5 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -1070,6 +1070,19 @@ class LarsCV(Lars): n_iter_ : array-like or int the number of iterations run by Lars with the optimal alpha. + Examples + -------- + >>> from sklearn.linear_model import LarsCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_samples=200, noise=4.0, random_state=0) + >>> reg = LarsCV(cv=5).fit(X, y) + >>> reg.score(X, y) # doctest: +ELLIPSIS + 0.9996... + >>> reg.alpha_ + 0.0254... + >>> reg.predict(X[:1,]) + array([154.0842...]) + See also -------- lars_path, LassoLars, LassoLarsCV @@ -1290,6 +1303,19 @@ class LassoLarsCV(LarsCV): n_iter_ : array-like or int the number of iterations run by Lars with the optimal alpha. + Examples + -------- + >>> from sklearn.linear_model import LassoLarsCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(noise=4.0, random_state=0) + >>> reg = LassoLarsCV(cv=5).fit(X, y) + >>> reg.score(X, y) # doctest: +ELLIPSIS + 0.9992... + >>> reg.alpha_ + 0.0484... + >>> reg.predict(X[:1,]) + array([-77.8723...]) + Notes ----- diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index a0f6d49490948..c304c0f341821 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -583,6 +583,17 @@ class OrthogonalMatchingPursuit(LinearModel, RegressorMixin): n_iter_ : int or array-like Number of active features across every target. + Examples + -------- + >>> from sklearn.linear_model import OrthogonalMatchingPursuit + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(noise=4, random_state=0) + >>> reg = OrthogonalMatchingPursuit().fit(X, y) + >>> reg.score(X, y) # doctest: +ELLIPSIS + 0.9991... + >>> reg.predict(X[:1,]) + array([-78.3854...]) + Notes ----- Orthogonal matching pursuit was introduced in G. Mallat, Z. Zhang, @@ -814,6 +825,20 @@ class OrthogonalMatchingPursuitCV(LinearModel, RegressorMixin): Number of active features across every target for the model refit with the best hyperparameters got by cross-validating across all folds. + Examples + -------- + >>> from sklearn.linear_model import OrthogonalMatchingPursuitCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=100, n_informative=10, + ... noise=4, random_state=0) + >>> reg = OrthogonalMatchingPursuitCV(cv=5).fit(X, y) + >>> reg.score(X, y) # doctest: +ELLIPSIS + 0.9991... + >>> reg.n_nonzero_coefs_ + 10 + >>> reg.predict(X[:1,]) + array([-78.3854...]) + See also -------- orthogonal_mp diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index 9dcd044d1f3ea..f929533e871a8 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -186,6 +186,18 @@ class RANSACRegressor(BaseEstimator, MetaEstimatorMixin, RegressorMixin): .. versionadded:: 0.19 + Examples + -------- + >>> from sklearn.linear_model import RANSACRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression( + ... n_samples=200, n_features=2, noise=4.0, random_state=0) + >>> reg = RANSACRegressor(random_state=0).fit(X, y) + >>> reg.score(X, y) # doctest: +ELLIPSIS + 0.9885... + >>> reg.predict(X[:1,]) + array([-31.9417...]) + References ---------- .. 
[1] https://en.wikipedia.org/wiki/RANSAC diff --git a/sklearn/linear_model/theil_sen.py b/sklearn/linear_model/theil_sen.py index 0f3b19164b146..00ad26d41b031 100644 --- a/sklearn/linear_model/theil_sen.py +++ b/sklearn/linear_model/theil_sen.py @@ -276,6 +276,18 @@ class TheilSenRegressor(LinearModel, RegressorMixin): Number of combinations taken into account from 'n choose k', where n is the number of samples and k is the number of subsamples. + Examples + -------- + >>> from sklearn.linear_model import TheilSenRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression( + ... n_samples=200, n_features=2, noise=4.0, random_state=0) + >>> reg = TheilSenRegressor(random_state=0).fit(X, y) + >>> reg.score(X, y) # doctest: +ELLIPSIS + 0.9884... + >>> reg.predict(X[:1,]) + array([-31.5871...]) + References ---------- - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009 From 1906c959a9350ba388bcd349969ad1555fa4e2f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Wed, 12 Sep 2018 17:47:52 +0200 Subject: [PATCH 037/163] DOC Generated author list from github (#11708) --- AUTHORS.rst | 75 --------- build_tools/Makefile | 4 + build_tools/generate_authors_table.py | 117 ++++++++++++++ doc/about.rst | 26 ++- doc/authors.rst | 220 ++++++++++++++++++++++++++ doc/developers/maintainer.rst | 11 +- 6 files changed, 376 insertions(+), 77 deletions(-) delete mode 100644 AUTHORS.rst create mode 100644 build_tools/Makefile create mode 100644 build_tools/generate_authors_table.py create mode 100644 doc/authors.rst diff --git a/AUTHORS.rst b/AUTHORS.rst deleted file mode 100644 index 48427fc0a2b3a..0000000000000 --- a/AUTHORS.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. -*- mode: rst -*- - - -This is a community effort, and as such many people have contributed -to it over the years. - -History -------- - -This project was started in 2007 as a Google Summer of Code project by -David Cournapeau. Later that year, Matthieu Brucher started work on -this project as part of his thesis. - -In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent -Michel of INRIA took leadership of the project and made the first public -release, February the 1st 2010. Since then, several releases have appeared -following a ~3 month cycle, and a thriving international community has -been leading the development. - -People ------- - -The following people have been core contributors to scikit-learn's development and maintenance: - -.. 
hlist:: - - * `Mathieu Blondel `_ - * `Matthieu Brucher `_ - * Lars Buitinck - * David Cournapeau - * `Noel Dawe `_ - * Vincent Dubourg - * Edouard Duchesnay - * `Tom Dupré la Tour `_ - * Alexander Fabisch - * `Virgile Fritsch `_ - * `Satra Ghosh `_ - * `Angel Soler Gollonet `_ - * Chris Filo Gorgolewski - * `Alexandre Gramfort `_ - * `Olivier Grisel `_ - * `Jaques Grobler `_ - * `Yaroslav Halchenko `_ - * `Brian Holt `_ - * `Arnaud Joly `_ - * Thouis (Ray) Jones - * `Kyle Kastner `_ - * `Manoj Kumar `_ - * Robert Layton - * `Guillaume Lemaitre `_ - * `Wei Li `_ - * Paolo Losi - * `Gilles Louppe `_ - * `Jan Hendrik Metzen `_ - * Vincent Michel - * Jarrod Millman - * `Andreas Müller `_ (release manager) - * `Vlad Niculae `_ - * `Joel Nothman `_ - * `Alexandre Passos `_ - * `Fabian Pedregosa `_ - * `Peter Prettenhofer `_ - * `Hanmin Qin `_ - * Bertrand Thirion - * `Joris Van den Bossche `_ - * `Jake VanderPlas `_ - * Nelle Varoquaux - * `Gael Varoquaux `_ - * Ron Weiss - * `Roman Yurchak `_ - -Please do not email the authors directly to ask for assistance or report issues. -Instead, please see `What's the best way to ask questions about scikit-learn -`_ -in the FAQ. diff --git a/build_tools/Makefile b/build_tools/Makefile new file mode 100644 index 0000000000000..68162733b4b11 --- /dev/null +++ b/build_tools/Makefile @@ -0,0 +1,4 @@ +# Makefile for maintenance tools + +authors: + python generate_authors_table.py > ../doc/authors.rst diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py new file mode 100644 index 0000000000000..ea3796473396d --- /dev/null +++ b/build_tools/generate_authors_table.py @@ -0,0 +1,117 @@ +""" +This script generates an html table of contributors, with names and avatars. +The list is generated from scikit-learn's teams on GitHub, plus a small number +of hard-coded contributors. + +The table should be updated for each new inclusion in the teams. +Generating the table requires admin rights. +""" +from __future__ import print_function + +import sys +import requests +import getpass + +try: + # With authentication: up to 5000 requests per hour. + print("user:", file=sys.stderr) + user = input() + passwd = getpass.getpass() + auth = (user, passwd) +except IndexError: + # Without authentication: up to 60 requests per hour. + auth = None + +ROW_SIZE = 7 +LOGO_URL = 'https://avatars2.githubusercontent.com/u/365630?v=4' + + +def group_iterable(iterable, size): + """Group iterable into lines""" + group = [] + for element in iterable: + group.append(element) + if len(group) == size: + yield group + group = [] + if len(group) != 0: + yield group + + +def get_contributors(): + """Get the list of contributor profiles. 
Require admin rights."""
+    # get members of scikit-learn teams on GitHub
+    members = []
+    for team in [11523, 33471]:
+        for page in [1, 2]:  # 30 per page
+            members.extend(requests.get(
+                "https://api.github.com/teams/%d/members?page=%d"
+                % (team, page), auth=auth).json())
+
+    # keep only the logins
+    logins = [c['login'] for c in members]
+    # add missing contributors with GitHub accounts
+    logins.extend(['dubourg', 'jarrodmillman', 'mbrucher', 'thouis'])
+    # add missing contributors without GitHub accounts
+    logins.extend(['Angel Soler Gollonet'])
+    # remove duplicate
+    logins = set(logins)
+    # remove CI
+    logins.remove('sklearn-ci')
+
+    # get profiles from GitHub
+    profiles = [get_profile(login) for login in logins]
+    # sort by last name
+    profiles = sorted(profiles, key=key)
+
+    return profiles
+
+
+def get_profile(login):
+    """Get the GitHub profile from login"""
+    profile = requests.get("https://api.github.com/users/%s" % login,
+                           auth=auth).json()
+    if 'name' not in profile:
+        # default profile if the login does not exist
+        return dict(name=login, avatar_url=LOGO_URL, html_url="")
+    else:
+        if profile["name"] is None:
+            profile["name"] = profile["login"]
+
+        # fix missing names
+        missing_names = {'bthirion': 'Bertrand Thirion',
+                         'dubourg': 'Vincent Dubourg',
+                         'Duchesnay': 'Edouard Duchesnay',
+                         'Lars': 'Lars Buitinck',
+                         'MechCoder': 'Manoj Kumar'}
+        if profile["name"] in missing_names:
+            profile["name"] = missing_names[profile["name"]]
+    return profile
+
+
+def key(profile):
+    """Get the last name in lower case"""
+    return profile["name"].split(' ')[-1].lower()
+
+
+contributors = get_contributors()
+
+print(".. raw :: html\n")
+[the remaining print statements emit the contributor-table markup — a <table> with <col> widths computed from ROW_SIZE, then, per row from group_iterable(contributors, size=ROW_SIZE), one <td> cell per contributor holding a linked avatar image (formatted from contributor["html_url"] and contributor["avatar_url"]) and the contributor's name; the HTML string literals themselves were stripped in extraction]
diff --git a/doc/about.rst b/doc/about.rst
index 90295b96fb6ff..218b0ad897fe4 100644
--- a/doc/about.rst
+++ b/doc/about.rst
@@ -1,7 +1,31 @@
 About us
 ========
 
-.. include:: ../AUTHORS.rst
+History
+-------
+
+This project was started in 2007 as a Google Summer of Code project by
+David Cournapeau. Later that year, Matthieu Brucher started work on
+this project as part of his thesis.
+
+In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent
+Michel of INRIA took leadership of the project and made the first public
+release, February the 1st 2010. Since then, several releases have appeared
+following a ~3 month cycle, and a thriving international community has
+been leading the development.
+
+Authors
+-------
+
+The following people have been core contributors to scikit-learn's development
+and maintenance:
+
+.. include:: authors.rst
+
+Please do not email the authors directly to ask for assistance or report issues.
+Instead, please see `What's the best way to ask questions about scikit-learn
+`_
+in the FAQ.
 
 .. seealso::
diff --git a/doc/authors.rst b/doc/authors.rst
new file mode 100644
index 0000000000000..0210dff4bef6e
--- /dev/null
+++ b/doc/authors.rst
@@ -0,0 +1,220 @@
+.. raw :: html
+
+[generated HTML contributor table, markup stripped in extraction — one cell per contributor with a linked avatar image and name, in this order: Mathieu Blondel, Joris Van den Bossche, Matthieu Brucher, Lars Buitinck, David Cournapeau, Noel Dawe, Shiqiao Du, Vincent Dubourg, Edouard Duchesnay, Loïc Estève, Alexander Fabisch, Virgile Fritsch, Satrajit Ghosh, Angel Soler Gollonet, Chris Filo Gorgolewski, Alexandre Gramfort, Olivier Grisel, Jaques Grobler, Yaroslav Halchenko, Brian Holt, Arnaud Joly, Thouis (Ray) Jones, Kyle Kastner, Manoj Kumar, Robert Layton, Guillaume Lemaitre, Wei Li, Paolo Losi, Gilles Louppe, Jan Hendrik Metzen, Vincent Michel, Jarrod Millman, Andreas Mueller, Vlad Niculae, Joel Nothman, Alexandre Passos, Fabian Pedregosa, Peter Prettenhofer, Hanmin Qin, (Venkat) Raghav Rajagopalan, Jacob Schreiber, Bertrand Thirion, Tom Dupré la Tour, Jake Vanderplas, Nelle Varoquaux, Gael Varoquaux, David Warde-Farley, Ron Weiss, Roman Yurchak]
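For reference, the group_iterable helper in the script above fills each table row with ROW_SIZE contributors and yields a shorter final row for the remainder — an illustrative session:

    >>> list(group_iterable(range(10), size=4))
    [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]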
diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index d0d0db8a041bb..a3309abcfbf10 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -1,8 +1,17 @@ Maintainer / core-developer information ======================================== +Before a release +---------------- + +1. Update authors table:: + + $ cd build_tools; make authors; cd .. + + and commit. + Making a release ------------------- +---------------- For more information see https://github.com/scikit-learn/scikit-learn/wiki/How-to-make-a-release From 3ee1cfc873270fdf075fad2a03a2695388fd5ea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BCdiger=20Busche?= Date: Thu, 13 Sep 2018 10:03:59 +0200 Subject: [PATCH 038/163] ENH Allow scoring of dummies without testsamples (#11957) --- doc/whats_new/v0.20.rst | 4 ++ sklearn/dummy.py | 69 +++++++++++++++++++++++++++++++++ sklearn/tests/test_dummy.py | 77 +++++++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 0fe95de46eb42..8cbb8074ed735 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -297,6 +297,10 @@ Support for Python 3.3 has been officially dropped. only require X to be an object with finite length or shape. :issue:`9832` by :user:`Vrishank Bhardwaj `. +- |Feature| :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor` + can now be scored without supplying test samples. + :issue:`11951` by :user:`Rüdiger Busche `. + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/dummy.py b/sklearn/dummy.py index ade45a1735879..2fac84fd7bea4 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -320,6 +320,37 @@ def predict_log_proba(self, X): else: return [np.log(p) for p in proba] + def score(self, X, y, sample_weight=None): + """Returns the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : {array-like, None} + Test samples with shape = (n_samples, n_features) or + None. Passing None as test samples gives the same result + as passing real test samples, since DummyClassifier + operates independently of the sampled observations. + + y : array-like, shape = (n_samples) or (n_samples, n_outputs) + True labels for X. + + sample_weight : array-like, shape = [n_samples], optional + Sample weights. + + Returns + ------- + score : float + Mean accuracy of self.predict(X) wrt. y. + + """ + if X is None: + X = np.zeros(shape=(len(y), 1)) + return super(DummyClassifier, self).score(X, y, sample_weight) + class DummyRegressor(BaseEstimator, RegressorMixin): """ @@ -480,3 +511,41 @@ def predict(self, X, return_std=False): y_std = np.ravel(y_std) return (y, y_std) if return_std else y + + def score(self, X, y, sample_weight=None): + """Returns the coefficient of determination R^2 of the prediction. + + The coefficient R^2 is defined as (1 - u/v), where u is the residual + sum of squares ((y_true - y_pred) ** 2).sum() and v is the total + sum of squares ((y_true - y_true.mean()) ** 2).sum(). + The best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always + predicts the expected value of y, disregarding the input features, + would get a R^2 score of 0.0. 
+ + Parameters + ---------- + X : {array-like, None} + Test samples with shape = (n_samples, n_features) or None. + For some estimators this may be a + precomputed kernel matrix instead, shape = (n_samples, + n_samples_fitted], where n_samples_fitted is the number of + samples used in the fitting for the estimator. + Passing None as test samples gives the same result + as passing real test samples, since DummyRegressor + operates independently of the sampled observations. + + y : array-like, shape = (n_samples) or (n_samples, n_outputs) + True values for X. + + sample_weight : array-like, shape = [n_samples], optional + Sample weights. + + Returns + ------- + score : float + R^2 of self.predict(X) wrt. y. + """ + if X is None: + X = np.zeros(shape=(len(y), 1)) + return super(DummyRegressor, self).score(X, y, sample_weight) diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index 5d955f51017a1..805c90a7e018e 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -1,5 +1,7 @@ from __future__ import division +import pytest + import numpy as np import scipy.sparse as sp @@ -200,6 +202,45 @@ def test_string_labels(): assert_array_equal(clf.predict(X), ["paris"] * 5) +@pytest.mark.parametrize("y,y_test", [ + ([2, 1, 1, 1], [2, 2, 1, 1]), + (np.array([[2, 2], + [1, 1], + [1, 1], + [1, 1]]), + np.array([[2, 2], + [2, 2], + [1, 1], + [1, 1]])) +]) +def test_classifier_score_with_None(y, y_test): + clf = DummyClassifier(strategy="most_frequent") + clf.fit(None, y) + assert_equal(clf.score(None, y_test), 0.5) + + +@pytest.mark.parametrize("strategy", [ + "stratified", + "most_frequent", + "prior", + "uniform", + "constant" +]) +def test_classifier_prediction_independent_of_X(strategy): + y = [0, 2, 1, 1] + X1 = [[0]] * 4 + clf1 = DummyClassifier(strategy=strategy, random_state=0, constant=0) + clf1.fit(X1, y) + predictions1 = clf1.predict(X1) + + X2 = [[1]] * 4 + clf2 = DummyClassifier(strategy=strategy, random_state=0, constant=0) + clf2.fit(X2, y) + predictions2 = clf2.predict(X2) + + assert_array_equal(predictions1, predictions2) + + def test_classifier_exceptions(): clf = DummyClassifier(strategy="unknown") assert_raises(ValueError, clf.fit, [], []) @@ -633,3 +674,39 @@ def test_dummy_regressor_return_std(): assert_equal(len(y_pred_list), 2) # the second element should be all zeros assert_array_equal(y_pred_list[1], y_std_expected) + + +@pytest.mark.parametrize("y,y_test", [ + ([1, 1, 1, 2], [1.25] * 4), + (np.array([[2, 2], + [1, 1], + [1, 1], + [1, 1]]), + [[1.25, 1.25]] * 4) + +]) +def test_regressor_score_with_None(y, y_test): + reg = DummyRegressor() + reg.fit(None, y) + assert_equal(reg.score(None, y_test), 1.0) + + +@pytest.mark.parametrize("strategy", [ + "mean", + "median", + "quantile", + "constant" +]) +def test_regressor_prediction_independent_of_X(strategy): + y = [0, 2, 1, 1] + X1 = [[0]] * 4 + reg1 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7) + reg1.fit(X1, y) + predictions1 = reg1.predict(X1) + + X2 = [[1]] * 4 + reg2 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7) + reg2.fit(X2, y) + predictions2 = reg2.predict(X2) + + assert_array_equal(predictions1, predictions2) From dad5c36c5eda5e677dabaac4be330fb7517ca4d6 Mon Sep 17 00:00:00 2001 From: "Zijie (ZJ) Poh" <8103276+zjpoh@users.noreply.github.com> Date: Thu, 13 Sep 2018 02:01:43 -0700 Subject: [PATCH 039/163] DOC Fix docstring inconsistency in nmf.py (#12063) --- sklearn/decomposition/nmf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 990d31bf2ccc0..0617a1797fcdc 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -880,7 +880,7 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom' Method used to initialize the procedure. - Default: 'nndsvd' if n_components < n_features, otherwise random. + Default: 'random'. Valid options: - 'random': non-negative random matrices, scaled with: From 36536c6f46ac060d4b9c9e48d79d42fafa3fb344 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Thu, 13 Sep 2018 11:08:23 +0200 Subject: [PATCH 040/163] MAINT Fix invalid escape sequence (#12064) --- sklearn/cluster/tests/test_k_means.py | 4 ++-- sklearn/datasets/mlcomp.py | 2 +- sklearn/externals/_arff.py | 2 +- sklearn/model_selection/tests/test_search.py | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 7935e7134d242..5994c770db9c9 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -885,7 +885,7 @@ def test_sparse_validate_centers(): # Test that a ValueError is raised for validate_center_shape classifier = KMeans(n_clusters=3, init=centers, n_init=1) - msg = "The shape of the initial centers \(\(4L?, 4L?\)\) " \ + msg = r"The shape of the initial centers \(\(4L?, 4L?\)\) " \ "does not match the number of clusters 3" assert_raises_regex(ValueError, msg, classifier.fit, X) @@ -969,7 +969,7 @@ def test_sample_weight_length(): # check that an error is raised when passing sample weights # with an incompatible shape km = KMeans(n_clusters=n_clusters, random_state=42) - assert_raises_regex(ValueError, 'len\(sample_weight\)', km.fit, X, + assert_raises_regex(ValueError, r'len\(sample_weight\)', km.fit, X, sample_weight=np.ones(2)) diff --git a/sklearn/datasets/mlcomp.py b/sklearn/datasets/mlcomp.py index 169df6e55151a..9adb7bbc1c06e 100644 --- a/sklearn/datasets/mlcomp.py +++ b/sklearn/datasets/mlcomp.py @@ -24,7 +24,7 @@ def _load_document_classification(dataset_path, metadata, set_=None, **kwargs): "in March 2017, the load_mlcomp function was deprecated " "in version 0.19 and will be removed in 0.21.") def load_mlcomp(name_or_id, set_="raw", mlcomp_root=None, **kwargs): - """Load a datasets as downloaded from http://mlcomp.org + r"""Load a datasets as downloaded from http://mlcomp.org Read more in the :ref:`User Guide `. diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 7fb445ef9d5a5..eaec6083d0ae4 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -641,7 +641,7 @@ def _decode_comment(self, s): :param s: a normalized string. :return: a string with the decoded comment. 
''' - res = re.sub('^\%( )?', '', s) + res = re.sub(r'^\%( )?', '', s) return res def _decode_relation(self, s): diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 969b6288a71e8..916804b384c7b 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -133,10 +133,10 @@ def assert_grid_iter_equals_getitem(grid): @pytest.mark.parametrize( "input, error_type, error_message", - [(0, TypeError, 'Parameter grid is not a dict or a list \(0\)'), - ([{'foo': [0]}, 0], TypeError, 'Parameter grid is not a dict \(0\)'), + [(0, TypeError, r'Parameter grid is not a dict or a list \(0\)'), + ([{'foo': [0]}, 0], TypeError, r'Parameter grid is not a dict \(0\)'), ({'foo': 0}, TypeError, "Parameter grid value is not iterable " - "\(key='foo', value=0\)")] + r"\(key='foo', value=0\)")] ) def test_validate_parameter_grid_input(input, error_type, error_message): with pytest.raises(error_type, match=error_message): From 17c6c908738bedda834d49a94a8804c179e9cbfb Mon Sep 17 00:00:00 2001 From: Zach Griffith Date: Thu, 13 Sep 2018 04:18:59 -0500 Subject: [PATCH 041/163] DOC fix typos in documentation. (#12059) --- doc/modules/lda_qda.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst index 3d45dd78f3179..e1dfb0c03ea4b 100644 --- a/doc/modules/lda_qda.rst +++ b/doc/modules/lda_qda.rst @@ -15,7 +15,7 @@ surface, respectively. These classifiers are attractive because they have closed-form solutions that can be easily computed, are inherently multiclass, have proven to work well in -practice and have no hyperparameters to tune. +practice, and have no hyperparameters to tune. .. |ldaqda| image:: ../auto_examples/classification/images/sphx_glr_plot_lda_qda_001.png :target: ../auto_examples/classification/plot_lda_qda.html @@ -43,7 +43,7 @@ linear subspace consisting of the directions which maximize the separation between classes (in a precise sense discussed in the mathematics section below). The dimension of the output is necessarily less than the number of classes, so this is, in general, a rather strong dimensionality reduction, and -only makes senses in a multiclass setting. +only makes sense in a multiclass setting. This is implemented in :func:`discriminant_analysis.LinearDiscriminantAnalysis.transform`. The desired @@ -70,10 +70,10 @@ the class conditional distribution of the data :math:`P(X|y=k)` for each class and we select the class :math:`k` which maximizes this conditional probability. More specifically, for linear and quadratic discriminant analysis, -:math:`P(X|y)` is modelled as a multivariate Gaussian distribution with +:math:`P(X|y)` is modeled as a multivariate Gaussian distribution with density: -.. math:: P(X | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right) +.. math:: P(X | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right) where :math:`d` is the number of features. @@ -85,7 +85,7 @@ matrices, or by a regularized estimator: see the section on shrinkage below). In the case of LDA, the Gaussians for each class are assumed to share the same covariance matrix: :math:`\Sigma_k = \Sigma` for all :math:`k`. 
This leads to -linear decision surfaces between, as can be seen by comparing the +linear decision surfaces, which can be seen by comparing the log-probability ratios :math:`\log[P(y=k | X) / P(y=l | X)]`: .. math:: @@ -127,7 +127,7 @@ classifier, there is a dimensionality reduction by linear projection onto a :math:`K-1` dimensional space. We can reduce the dimension even more, to a chosen :math:`L`, by projecting -onto the linear subspace :math:`H_L` which maximize the variance of the +onto the linear subspace :math:`H_L` which maximizes the variance of the :math:`\mu^*_k` after projection (in effect, we are doing a form of PCA for the transformed class means :math:`\mu^*_k`). This :math:`L` corresponds to the ``n_components`` parameter used in the From 06b4307fbca82b7ff73b1319cd67a4fab34d7c11 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Thu, 13 Sep 2018 18:17:31 +0800 Subject: [PATCH 042/163] DOC Include fetch_openml doc in user guide (#12065) --- doc/datasets/index.rst | 148 +++++++++++++++++++++++++++++++++++++ doc/datasets/openml.rst | 148 ------------------------------------- sklearn/datasets/openml.py | 2 + 3 files changed, 150 insertions(+), 148 deletions(-) delete mode 100644 doc/datasets/openml.rst diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index 947e55f0c4c37..e0640916fbb64 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -351,6 +351,154 @@ features:: _`Faster API-compatible implementation`: https://github.com/mblondel/svmlight-loader +.. + For doctests: + + >>> import numpy as np + >>> import os + +.. _openml: + +Downloading datasets from the openml.org repository +--------------------------------------------------- + +`openml.org `_ is a public repository for machine learning +data and experiments, that allows everybody to upload open datasets. + +The ``sklearn.datasets`` package is able to download datasets +from the repository using the function +:func:`sklearn.datasets.fetch_openml`. + +For example, to download a dataset of gene expressions in mice brains:: + + >>> from sklearn.datasets import fetch_openml + >>> mice = fetch_openml(name='miceprotein', version=4) + +To fully specify a dataset, you need to provide a name and a version, though +the version is optional, see :ref:`openml_versions` below. +The dataset contains a total of 1080 examples belonging to 8 different +classes:: + + >>> mice.data.shape + (1080, 77) + >>> mice.target.shape + (1080,) + >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE + array(['c-CS-m', 'c-CS-s', 'c-SC-m', 'c-SC-s', 't-CS-m', 't-CS-s', 't-SC-m', 't-SC-s'], dtype=object) + +You can get more information on the dataset by looking at the ``DESCR`` +and ``details`` attributes:: + + >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios + **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 + **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing + Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down + Syndrome. PLoS ONE 10(6): e0129126... 
+ + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF', + 'upload_date': '2017-11-08T16:00:15', 'licence': 'Public', + 'url': 'https://www.openml.org/data/v1/download/17928620/MiceProtein.arff', + 'file_id': '17928620', 'default_target_attribute': 'class', + 'row_id_attribute': 'MouseID', + 'ignore_attribute': ['Genotype', 'Treatment', 'Behavior'], + 'tag': ['OpenML-CC18', 'study_135', 'study_98', 'study_99'], + 'visibility': 'public', 'status': 'active', + 'md5_checksum': '3c479a6885bfa0438971388283a1ce32'} + + +The ``DESCR`` contains a free-text description of the data, while ``details`` +contains a dictionary of meta-data stored by openml, like the dataset id. +For more details, see the `OpenML documentation +`_ The ``data_id`` of the mice protein dataset +is 40966, and you can use this (or the name) to get more information on the +dataset on the openml website:: + + >>> mice.url + 'https://www.openml.org/d/40966' + +The ``data_id`` also uniquely identifies a dataset from OpenML:: + + >>> mice = fetch_openml(data_id=40966) + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', + 'creator': ..., + 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': + 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': + '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, + Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins + Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): + e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', + 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': + '3c479a6885bfa0438971388283a1ce32'} + +.. _openml_versions: + +Dataset Versions +~~~~~~~~~~~~~~~~ + +A dataset is uniquely specified by its ``data_id``, but not necessarily by its +name. Several different "versions" of a dataset with the same name can exist +which can contain entirely different datasets. +If a particular version of a dataset has been found to contain significant +issues, it might be deactivated. Using a name to specify a dataset will yield +the earliest version of a dataset that is still active. That means that +``fetch_openml(name="miceprotein")`` can yield different results at different +times if earlier versions become inactive. +You can see that the dataset with ``data_id`` 40966 that we fetched above is +the version 1 of the "miceprotein" dataset:: + + >>> mice.details['version'] #doctest: +SKIP + '1' + +In fact, this dataset only has one version. The iris dataset on the other hand +has multiple versions:: + + >>> iris = fetch_openml(name="iris") + >>> iris.details['version'] #doctest: +SKIP + '1' + >>> iris.details['id'] #doctest: +SKIP + '61' + + >>> iris_61 = fetch_openml(data_id=61) + >>> iris_61.details['version'] + '1' + >>> iris_61.details['id'] + '61' + + >>> iris_969 = fetch_openml(data_id=969) + >>> iris_969.details['version'] + '3' + >>> iris_969.details['id'] + '969' + +Specifying the dataset by the name "iris" yields the lowest version, version 1, +with the ``data_id`` 61. To make sure you always get this exact dataset, it is +safest to specify it by the dataset ``data_id``. 
The other dataset, with +``data_id`` 969, is version 3 (version 2 has become inactive), and contains a +binarized version of the data:: + + >>> np.unique(iris_969.target) + array(['N', 'P'], dtype=object) + +You can also specify both the name and the version, which also uniquely +identifies the dataset:: + + >>> iris_version_3 = fetch_openml(name="iris", version=3) + >>> iris_version_3.details['version'] + '3' + >>> iris_version_3.details['id'] + '969' + + +.. topic:: References: + + * Vanschoren, van Rijn, Bischl and Torgo + `"OpenML: networked science in machine learning" + `_, + ACM SIGKDD Explorations Newsletter, 15(2), 49-60, 2014. + .. _external_datasets: Loading from external datasets diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst deleted file mode 100644 index 52dd453919522..0000000000000 --- a/doc/datasets/openml.rst +++ /dev/null @@ -1,148 +0,0 @@ -.. - For doctests: - - >>> import numpy as np - >>> import os - - -.. _openml: - -Downloading datasets from the openml.org repository -=================================================== - -`openml.org `_ is a public repository for machine learning -data and experiments, that allows everybody to upload open datasets. - -The ``sklearn.datasets`` package is able to download datasets -from the repository using the function -:func:`sklearn.datasets.fetch_openml`. - -For example, to download a dataset of gene expressions in mice brains:: - - >>> from sklearn.datasets import fetch_openml - >>> mice = fetch_openml(name='miceprotein', version=4) - -To fully specify a dataset, you need to provide a name and a version, though -the version is optional, see :ref:`openml_versions` below. -The dataset contains a total of 1080 examples belonging to 8 different -classes:: - - >>> mice.data.shape - (1080, 77) - >>> mice.target.shape - (1080,) - >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE - array(['c-CS-m', 'c-CS-s', 'c-SC-m', 'c-SC-s', 't-CS-m', 't-CS-s', 't-SC-m', 't-SC-s'], dtype=object) - -You can get more information on the dataset by looking at the ``DESCR`` -and ``details`` attributes:: - - >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP - **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios - **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 - **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing - Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down - Syndrome. PLoS ONE 10(6): e0129126... - - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP - {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF', - 'upload_date': '2017-11-08T16:00:15', 'licence': 'Public', - 'url': 'https://www.openml.org/data/v1/download/17928620/MiceProtein.arff', - 'file_id': '17928620', 'default_target_attribute': 'class', - 'row_id_attribute': 'MouseID', - 'ignore_attribute': ['Genotype', 'Treatment', 'Behavior'], - 'tag': ['OpenML-CC18', 'study_135', 'study_98', 'study_99'], - 'visibility': 'public', 'status': 'active', - 'md5_checksum': '3c479a6885bfa0438971388283a1ce32'} - - -The ``DESCR`` contains a free-text description of the data, while ``details`` -contains a dictionary of meta-data stored by openml, like the dataset id. 
-For more details, see the `OpenML documentation -`_ The ``data_id`` of the mice protein dataset -is 40966, and you can use this (or the name) to get more information on the -dataset on the openml website:: - - >>> mice.url - 'https://www.openml.org/d/40966' - -The ``data_id`` also uniquely identifies a dataset from OpenML:: - - >>> mice = fetch_openml(data_id=40966) - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP - {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', - 'creator': ..., - 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': - 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': - '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, - Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins - Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): - e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', - 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': - '3c479a6885bfa0438971388283a1ce32'} - -.. _openml_versions: - -Dataset Versions ----------------- - -A dataset is uniquely specified by its ``data_id``, but not necessarily by its -name. Several different "versions" of a dataset with the same name can exist -which can contain entirely different datasets. -If a particular version of a dataset has been found to contain significant -issues, it might be deactivated. Using a name to specify a dataset will yield -the earliest version of a dataset that is still active. That means that -``fetch_openml(name="miceprotein")`` can yield different results at different -times if earlier versions become inactive. -You can see that the dataset with ``data_id`` 40966 that we fetched above is -the version 1 of the "miceprotein" dataset:: - - >>> mice.details['version'] #doctest: +SKIP - '1' - -In fact, this dataset only has one version. The iris dataset on the other hand -has multiple versions:: - - >>> iris = fetch_openml(name="iris") - >>> iris.details['version'] #doctest: +SKIP - '1' - >>> iris.details['id'] #doctest: +SKIP - '61' - - >>> iris_61 = fetch_openml(data_id=61) - >>> iris_61.details['version'] - '1' - >>> iris_61.details['id'] - '61' - - >>> iris_969 = fetch_openml(data_id=969) - >>> iris_969.details['version'] - '3' - >>> iris_969.details['id'] - '969' - -Specifying the dataset by the name "iris" yields the lowest version, version 1, -with the ``data_id`` 61. To make sure you always get this exact dataset, it is -safest to specify it by the dataset ``data_id``. The other dataset, with -``data_id`` 969, is version 3 (version 2 has become inactive), and contains a -binarized version of the data:: - - >>> np.unique(iris_969.target) - array(['N', 'P'], dtype=object) - -You can also specify both the name and the version, which also uniquely -identifies the dataset:: - - >>> iris_version_3 = fetch_openml(name="iris", version=3) - >>> iris_version_3.details['version'] - '3' - >>> iris_version_3.details['id'] - '969' - - -.. topic:: References: - - * Vanschoren, van Rijn, Bischl and Torgo - `"OpenML: networked science in machine learning" - `_, - ACM SIGKDD Explorations Newsletter, 15(2), 49-60, 2014. diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index a58aa7482cda3..d667cb3699b28 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -367,6 +367,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, (not both). 
In case a name is given, a version can also be provided. + Read more in the :ref:`User Guide `. + .. note:: EXPERIMENTAL The API is experimental in version 0.20 (particularly the return value From e36254c98ed10dca70be3997765387655478b44f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 13 Sep 2018 13:56:19 +0200 Subject: [PATCH 043/163] MNT: Anonimize IP for Google Analytics (#12038) --- doc/themes/scikit-learn/layout.html | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/doc/themes/scikit-learn/layout.html b/doc/themes/scikit-learn/layout.html index 79ddd08093012..21136856aa6d2 100644 --- a/doc/themes/scikit-learn/layout.html +++ b/doc/themes/scikit-learn/layout.html @@ -340,17 +340,13 @@
Machine Learning in Python
{% if theme_google_analytics|tobool %}
[hunk, script markup stripped in extraction: the old ten-line Google Analytics tracking script is removed and a six-line replacement is added that anonymizes visitor IP addresses, per the commit message and the diffstat above]
{% endif %}