Backport NumPy 1.13.0 fixes to 0.18.X by jakirkham · Pull Request #9137 · scikit-learn/scikit-learn · GitHub

Backport NumPy 1.13.0 fixes to 0.18.X #9137


Merged: 4 commits, merged Jun 19, 2017
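Most of the changes below replace plain / division, float-valued sizes and float slice bounds with integer expressions: recent NumPy releases reject floats wherever an index, slice bound or sample count is expected, after a deprecation period in earlier 1.x versions. A minimal illustration of the pattern, independent of any file in this diff:

import numpy as np

x = np.arange(10)
n = x.shape[0]

# x[:n / 2] used to work because old NumPy truncated the float bound;
# recent releases raise an error instead.
print(x[:n // 2])        # floor division keeps the bound an integer: [0 1 2 3 4]
print(x[:int(0.3 * n)])  # explicit cast when the factor is fractional: [0 1 2]
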
72 changes: 51 additions & 21 deletions examples/applications/plot_stock_market.py
@@ -64,27 +64,59 @@
# Author: Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD 3 clause

import datetime
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
try:
from matplotlib.finance import quotes_historical_yahoo_ochl
except ImportError:
# quotes_historical_yahoo_ochl was named quotes_historical_yahoo before matplotlib 1.4
from matplotlib.finance import quotes_historical_yahoo as quotes_historical_yahoo_ochl
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection
from six.moves.urllib.request import urlopen
from six.moves.urllib.parse import urlencode
from sklearn import cluster, covariance, manifold

###############################################################################
# Retrieve the data from Internet

def quotes_historical_google(symbol, date1, date2):
"""Get the historical data from Google finance.

Parameters
----------
symbol : str
Ticker symbol to query for, for example ``"DELL"``.
date1 : datetime.datetime
Start date.
date2 : datetime.datetime
End date.

Returns
-------
X : array
The columns are ``date`` -- datetime, ``open``, ``high``,
``low``, ``close`` and ``volume`` of type float.
"""
params = urlencode({
'q': symbol,
'startdate': date1.strftime('%b %d, %Y'),
'enddate': date2.strftime('%b %d, %Y'),
'output': 'csv'
})
url = 'http://www.google.com/finance/historical?' + params
with urlopen(url) as response:
dtype = {
'names': ['date', 'open', 'high', 'low', 'close', 'volume'],
'formats': ['object', 'f4', 'f4', 'f4', 'f4', 'f4']
}
converters = {0: lambda s: datetime.strptime(s.decode(), '%d-%b-%y')}
return np.genfromtxt(response, delimiter=',', skip_header=1,
dtype=dtype, converters=converters,
missing_values='-', filling_values=-1)


# Choose a time period reasonably calm (not too long ago so that we get
# high-tech firms, and before the 2008 crash)
d1 = datetime.datetime(2003, 1, 1)
d2 = datetime.datetime(2008, 1, 1)
d1 = datetime(2003, 1, 1)
d2 = datetime(2008, 1, 1)

# kraft symbol has now changed from KFT to MDLZ in yahoo
symbol_dict = {
'TOT': 'Total',
'XOM': 'Exxon',
@@ -102,7 +134,6 @@
'AMZN': 'Amazon',
'TM': 'Toyota',
'CAJ': 'Canon',
'MTU': 'Mitsubishi',
'SNE': 'Sony',
'F': 'Ford',
'HMC': 'Honda',
@@ -111,9 +142,8 @@
'BA': 'Boeing',
'KO': 'Coca Cola',
'MMM': '3M',
'MCD': 'Mc Donalds',
'MCD': 'McDonald\'s',
'PEP': 'Pepsi',
'MDLZ': 'Kraft Foods',
'K': 'Kellogg',
'UN': 'Unilever',
'MAR': 'Marriott',
@@ -129,11 +159,9 @@
'AAPL': 'Apple',
'SAP': 'SAP',
'CSCO': 'Cisco',
'TXN': 'Texas instruments',
'TXN': 'Texas Instruments',
'XRX': 'Xerox',
'LMT': 'Lookheed Martin',
'WMT': 'Wal-Mart',
'WBA': 'Walgreen',
'HD': 'Home Depot',
'GSK': 'GlaxoSmithKline',
'PFE': 'Pfizer',
@@ -149,14 +177,16 @@

symbols, names = np.array(list(symbol_dict.items())).T

quotes = [quotes_historical_yahoo_ochl(symbol, d1, d2, asobject=True)
for symbol in symbols]
quotes = [
quotes_historical_google(symbol, d1, d2) for symbol in symbols
]

open = np.array([q.open for q in quotes]).astype(np.float)
close = np.array([q.close for q in quotes]).astype(np.float)
close_prices = np.stack([q['close'] for q in quotes])
open_prices = np.stack([q['open'] for q in quotes])

# The daily variations of the quotes are what carry most information
variation = close - open
variation = close_prices - open_prices


###############################################################################
# Learn a graphical structure from the correlations
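For context, a self-contained sketch (not part of the diff) of the np.genfromtxt pattern that the new quotes_historical_google helper relies on, run against an in-memory CSV instead of the Google Finance endpoint; the two data rows are invented for illustration:

import io
from datetime import datetime

import numpy as np

csv = io.BytesIO(b"Date,Open,High,Low,Close,Volume\n"
                 b"2-Jan-03,10.0,10.5,9.8,10.2,1000\n"
                 b"3-Jan-03,10.2,10.6,10.0,10.4,-\n")

dtype = {'names': ['date', 'open', 'high', 'low', 'close', 'volume'],
         'formats': ['object', 'f4', 'f4', 'f4', 'f4', 'f4']}
converters = {0: lambda s: datetime.strptime(s.decode(), '%d-%b-%y')}

q = np.genfromtxt(csv, delimiter=',', skip_header=1, dtype=dtype,
                  converters=converters, missing_values='-',
                  filling_values=-1)

print(q['date'][0])      # 2003-01-02 00:00:00, parsed by the converter
print(q['close'])        # [10.2 10.4], fields of the structured array by name
print(q['volume'])       # [1000.   -1.], the missing '-' filled with -1

# np.stack then assembles the per-symbol series into a 2-d matrix, as done
# for close_prices / open_prices above.
print(np.stack([q['close'], q['open']]).shape)   # (2, 2)
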
2 changes: 1 addition & 1 deletion examples/applications/plot_tomography_l1_reconstruction.py
@@ -99,7 +99,7 @@ def build_projection_operator(l_x, n_dir):
def generate_synthetic_data():
""" Synthetic binary data """
rs = np.random.RandomState(0)
n_pts = 36.
n_pts = 36
x, y = np.ogrid[0:l, 0:l]
mask_outer = (x - l / 2) ** 2 + (y - l / 2) ** 2 < (l / 2) ** 2
mask = np.zeros((l, l))
8 changes: 4 additions & 4 deletions examples/classification/plot_digits_classification.py
@@ -46,17 +46,17 @@
classifier = svm.SVC(gamma=0.001)

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples / 2], digits.target[:n_samples / 2])
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples / 2:]
predicted = classifier.predict(data[n_samples / 2:])
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier %s:\n%s\n"
% (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

images_and_predictions = list(zip(digits.images[n_samples / 2:], predicted))
images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
plt.subplot(2, 4, index + 5)
plt.axis('off')
14 changes: 7 additions & 7 deletions examples/covariance/plot_robust_vs_empirical_covariance.py
@@ -67,7 +67,7 @@

range_n_outliers = np.concatenate(
(np.linspace(0, n_samples / 8, 5),
np.linspace(n_samples / 8, n_samples / 2, 5)[1:-1]))
np.linspace(n_samples / 8, n_samples / 2, 5)[1:-1])).astype(np.int)

# definition of arrays to store results
err_loc_mcd = np.zeros((range_n_outliers.size, repeat))
@@ -135,13 +135,13 @@
plt.errorbar(range_n_outliers, err_cov_mcd.mean(1),
yerr=err_cov_mcd.std(1),
label="Robust covariance (mcd)", color='m')
plt.errorbar(range_n_outliers[:(x_size / 5 + 1)],
err_cov_emp_full.mean(1)[:(x_size / 5 + 1)],
yerr=err_cov_emp_full.std(1)[:(x_size / 5 + 1)],
plt.errorbar(range_n_outliers[:(x_size // 5 + 1)],
err_cov_emp_full.mean(1)[:(x_size // 5 + 1)],
yerr=err_cov_emp_full.std(1)[:(x_size // 5 + 1)],
label="Full data set empirical covariance", color='green')
plt.plot(range_n_outliers[(x_size / 5):(x_size / 2 - 1)],
err_cov_emp_full.mean(1)[(x_size / 5):(x_size / 2 - 1)], color='green',
ls='--')
plt.plot(range_n_outliers[(x_size // 5):(x_size // 2 - 1)],
err_cov_emp_full.mean(1)[(x_size // 5):(x_size // 2 - 1)],
color='green', ls='--')
plt.errorbar(range_n_outliers, err_cov_emp_pure.mean(1),
yerr=err_cov_emp_pure.std(1),
label="Pure data set empirical covariance", color='black')
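The astype call added above is needed because np.linspace always returns floats, and these outlier counts are later used as integer counts and slice bounds; a quick standalone check (n_samples = 80 chosen here only for illustration):

import numpy as np

n_samples = 80
range_n_outliers = np.concatenate(
    (np.linspace(0, n_samples / 8, 5),
     np.linspace(n_samples / 8, n_samples / 2, 5)[1:-1]))
print(range_n_outliers.dtype)        # float64, linspace never yields ints
print(range_n_outliers.astype(int))  # [ 0  2  5  7 10 17 25 32]
# np.int in the diff is simply an alias for the builtin int.
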
@@ -36,10 +36,10 @@
X = latents + np.random.normal(size=4 * n).reshape((n, 4))
Y = latents + np.random.normal(size=4 * n).reshape((n, 4))

X_train = X[:n / 2]
Y_train = Y[:n / 2]
X_test = X[n / 2:]
Y_test = Y[n / 2:]
X_train = X[:n // 2]
Y_train = Y[:n // 2]
X_test = X[n // 2:]
Y_test = Y[n // 2:]

print("Corr(X)")
print(np.round(np.corrcoef(X.T), 2))
4 changes: 2 additions & 2 deletions examples/decomposition/plot_sparse_coding.py
@@ -44,13 +44,13 @@ def ricker_matrix(width, resolution, n_components):
resolution = 1024
subsampling = 3 # subsampling factor
width = 100
n_components = resolution / subsampling
n_components = resolution // subsampling

# Compute a wavelet dictionary
D_fixed = ricker_matrix(width=width, resolution=resolution,
n_components=n_components)
D_multi = np.r_[tuple(ricker_matrix(width=w, resolution=resolution,
n_components=np.floor(n_components / 5))
n_components=n_components // 5)
for w in (10, 50, 100, 500, 1000))]

# Generate a signal
12 changes: 6 additions & 6 deletions examples/exercises/plot_iris_exercise.py
@@ -29,10 +29,10 @@
X = X[order]
y = y[order].astype(np.float)

X_train = X[:.9 * n_sample]
y_train = y[:.9 * n_sample]
X_test = X[.9 * n_sample:]
y_test = y[.9 * n_sample:]
X_train = X[:int(.9 * n_sample)]
y_train = y[:int(.9 * n_sample)]
X_test = X[int(.9 * n_sample):]
y_test = y[int(.9 * n_sample):]

# fit the model
for fig_num, kernel in enumerate(('linear', 'rbf', 'poly')):
@@ -58,8 +58,8 @@
# Put the result into a color plot
Z = Z.reshape(XX.shape)
plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
levels=[-.5, 0, .5])
plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
linestyles=['--', '-', '--'], levels=[-.5, 0, .5])

plt.title(kernel)
plt.show()
4 changes: 2 additions & 2 deletions examples/linear_model/plot_lasso_and_elasticnet.py
@@ -32,8 +32,8 @@

# Split data in train set and test set
n_samples = X.shape[0]
X_train, y_train = X[:n_samples / 2], y[:n_samples / 2]
X_test, y_test = X[n_samples / 2:], y[n_samples / 2:]
X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]

###############################################################################
# Lasso
8 changes: 4 additions & 4 deletions examples/neighbors/plot_kde_1d.py
@@ -38,8 +38,8 @@
# Plot the progression of histograms to kernels
np.random.seed(1)
N = 20
X = np.concatenate((np.random.normal(0, 1, 0.3 * N),
np.random.normal(5, 1, 0.7 * N)))[:, np.newaxis]
X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]
X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
bins = np.linspace(-5, 10, 10)

@@ -116,8 +116,8 @@ def format_func(x, loc):
# Plot a 1D density example
N = 100
np.random.seed(1)
X = np.concatenate((np.random.normal(0, 1, 0.3 * N),
np.random.normal(5, 1, 0.7 * N)))[:, np.newaxis]
X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]

X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]

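The int() casts above address the same NumPy restriction for sample counts: 0.3 * N is a float, which recent NumPy rejects as the size argument of the random samplers. A standalone check:

import numpy as np

np.random.seed(1)
N = 20
X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
                    np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]
print(X.shape)   # (20, 1): 6 samples from the first mode, 14 from the second
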
8 changes: 5 additions & 3 deletions examples/plot_kernel_approximation.py
@@ -68,12 +68,14 @@
data -= data.mean(axis=0)

# We learn the digits on the first half of the digits
data_train, targets_train = data[:n_samples / 2], digits.target[:n_samples / 2]
data_train, targets_train = (data[:n_samples // 2],
digits.target[:n_samples // 2])


# Now predict the value of the digit on the second half:
data_test, targets_test = data[n_samples / 2:], digits.target[n_samples / 2:]
#data_test = scaler.transform(data_test)
data_test, targets_test = (data[n_samples // 2:],
digits.target[n_samples // 2:])
# data_test = scaler.transform(data_test)

# Create a classifier: a support vector classifier
kernel_svm = svm.SVC(gamma=.2)
6 changes: 3 additions & 3 deletions examples/plot_kernel_ridge_regression.py
@@ -54,7 +54,7 @@
y = np.sin(X).ravel()

# Add noise to targets
y[::5] += 3 * (0.5 - rng.rand(X.shape[0]/5))
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

X_plot = np.linspace(0, 5, 100000)[:, None]

@@ -119,8 +119,8 @@
# Generate sample data
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(X.shape[0]/5))
sizes = np.logspace(1, 4, 7)
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))
sizes = np.logspace(1, 4, 7, dtype=np.int)
for name, estimator in {"KRR": KernelRidge(kernel='rbf', alpha=0.1,
gamma=10),
"SVR": SVR(kernel='rbf', C=1e1, gamma=10)}.items():
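np.logspace, like np.linspace, returns floats, so the learning-curve sizes get dtype=int (np.int in the diff, an alias for the builtin int) to be usable directly as sample counts; for illustration:

import numpy as np

print(np.logspace(1, 4, 7).dtype)       # float64
print(np.logspace(1, 4, 7, dtype=int))  # [   10    31   100   316  1000  3162 10000]
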
11 changes: 6 additions & 5 deletions examples/plot_multioutput_face_completion.py
@@ -39,10 +39,12 @@
test = test[face_ids, :]

n_pixels = data.shape[1]
X_train = train[:, :np.ceil(0.5 * n_pixels)] # Upper half of the faces
y_train = train[:, np.floor(0.5 * n_pixels):] # Lower half of the faces
X_test = test[:, :np.ceil(0.5 * n_pixels)]
y_test = test[:, np.floor(0.5 * n_pixels):]
# Upper half of the faces
X_train = train[:, :(n_pixels + 1) // 2]
# Lower half of the faces
y_train = train[:, n_pixels // 2:]
X_test = test[:, :(n_pixels + 1) // 2]
y_test = test[:, n_pixels // 2:]

# Fit estimators
ESTIMATORS = {
@@ -74,7 +76,6 @@
sub = plt.subplot(n_faces, n_cols, i * n_cols + 1,
title="true faces")


sub.axis("off")
sub.imshow(true_face.reshape(image_shape),
cmap=plt.cm.gray,
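The new integer expressions are drop-in replacements for the old np.ceil / np.floor bounds: for any positive integer n, (n + 1) // 2 equals ceil(n / 2) and n // 2 equals floor(n / 2), and both are plain ints, so they remain valid slice bounds. A quick check with an even and an odd pixel count:

import numpy as np

for n_pixels in (4096, 4097):
    assert (n_pixels + 1) // 2 == int(np.ceil(0.5 * n_pixels))
    assert n_pixels // 2 == int(np.floor(0.5 * n_pixels))
    print(n_pixels, (n_pixels + 1) // 2, n_pixels // 2)
# 4096 2048 2048
# 4097 2049 2048
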
4 changes: 2 additions & 2 deletions examples/svm/plot_svm_scale_c.py
@@ -106,8 +106,8 @@

# l2 data: non sparse, but less features
y_2 = np.sign(.5 - rnd.rand(n_samples))
X_2 = rnd.randn(n_samples, n_features / 5) + y_2[:, np.newaxis]
X_2 += 5 * rnd.randn(n_samples, n_features / 5)
X_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]
X_2 += 5 * rnd.randn(n_samples, n_features // 5)

clf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
tol=1e-3),
2 changes: 1 addition & 1 deletion examples/tree/plot_unveil_tree_structure.py
@@ -54,7 +54,7 @@

# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes)
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)] # seed is the root node id and its parent depth
while len(stack) > 0:
2 changes: 1 addition & 1 deletion sklearn/gaussian_process/kernels.py
@@ -1852,7 +1852,7 @@ def diag(self, X):
Diagonal of kernel k(X, X)
"""
# We have to fall back to slow way of computing diagonal
return np.apply_along_axis(self, 1, X)[:, 0]
return np.apply_along_axis(self, 1, X).ravel()

def is_stationary(self):
"""Returns whether the kernel is stationary. """
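Background for the ravel() change, as an illustrative sketch rather than part of the diff: NumPy 1.13 reworked np.apply_along_axis so that the shape of the applied function's return value is preserved instead of being collapsed. A PairwiseKernel called on a single sample returns a 1x1 kernel matrix, so the old [:, 0] indexing no longer produces a 1-d diagonal, while ravel() does on both old and new NumPy. With a stand-in callable:

import numpy as np

def k(row):
    # stand-in for a kernel __call__ on one sample: returns a (1, 1) matrix
    return np.array([[np.dot(row, row)]])

X = np.random.RandomState(0).rand(5, 3)
out = np.apply_along_axis(k, 1, X)
print(out.shape)          # (5, 1, 1) on NumPy >= 1.13; older releases collapsed this to (5, 1)
print(out.ravel().shape)  # (5,) in either case; out[:, 0] would be (5, 1) on NumPy >= 1.13
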
19 changes: 13 additions & 6 deletions sklearn/model_selection/tests/test_split.py
@@ -1028,16 +1028,23 @@ def test_cv_iterable_wrapper():
# Since the wrapped iterable is enlisted and stored,
# split can be called any number of times to produce
# consistent results.
assert_array_equal(list(kf_iter_wrapped.split(X, y)),
list(kf_iter_wrapped.split(X, y)))
np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
list(kf_iter_wrapped.split(X, y)))
# If the splits are randomized, successive calls to split yields different
# results
kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
assert_array_equal(list(kf_randomized_iter_wrapped.split(X, y)),
list(kf_randomized_iter_wrapped.split(X, y)))
assert_true(np.any(np.array(list(kf_iter_wrapped.split(X, y))) !=
np.array(list(kf_randomized_iter_wrapped.split(X, y)))))
np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
list(kf_randomized_iter_wrapped.split(X, y)))

try:
np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
list(kf_randomized_iter_wrapped.split(X, y)))
splits_are_equal = True
except AssertionError:
splits_are_equal = False
assert_false(splits_are_equal, "If the splits are randomized, "
"successive calls to split should yield different results")


def test_group_kfold():
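The move from assert_array_equal to np.testing.assert_equal matters because split() yields tuples of train/test index arrays of different lengths: assert_array_equal first coerces its arguments into single (here ragged, object-dtype) arrays, whose comparison is fragile under NumPy 1.13, whereas assert_equal recurses through lists and tuples and compares the index arrays pairwise. A standalone sketch of the pattern used above:

import numpy as np

splits_a = [(np.array([0, 1, 2]), np.array([3, 4])),
            (np.array([2, 3, 4]), np.array([0, 1]))]
splits_b = [(np.array([0, 1, 2]), np.array([3, 4])),
            (np.array([2, 3, 4]), np.array([0, 1]))]
splits_c = [(np.array([1, 2, 0]), np.array([4, 3])),
            (np.array([2, 3, 4]), np.array([0, 1]))]

# assert_equal recurses through the lists and tuples and compares the
# differently sized index arrays pairwise.
np.testing.assert_equal(splits_a, splits_b)

# Same try/except pattern as in the test above: any mismatch raises AssertionError.
try:
    np.testing.assert_equal(splits_a, splits_c)
    splits_are_equal = True
except AssertionError:
    splits_are_equal = False
print(splits_are_equal)   # False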