[MRG] Joblib 0.9.4 by ogrisel · Pull Request #6179 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG] Joblib 0.9.4 #6179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 20, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,25 @@ API changes summary
- ``residual_metric`` has been deprecated in :class:`linear_model.RANSACRegressor`.
Use ``loss`` instead. By `Manoj Kumar`_.


.. _changes_0_17_1:

Version 0.17.1
==============

Changelog
---------

Bug fixes
.........


- Upgrade vendored joblib to version 0.9.4 that fixes an important bug in
``joblib.Parallel`` that can silently lead to wrong results when working
on datasets larger than 1MB:
https://github.com/joblib/joblib/blob/0.9.4/CHANGES.rst


.. _changes_0_17:

Version 0.17
Expand Down
6 changes: 3 additions & 3 deletions sklearn/externals/joblib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
>>> c = square(a)
>>> # The above call did not trigger an evaluation

2) **Embarrassingly parallel helper:** to make is easy to write readable
2) **Embarrassingly parallel helper:** to make it easy to write readable
parallel code and debug it quickly::

>>> from sklearn.externals.joblib import Parallel, delayed
Expand All @@ -86,7 +86,7 @@
3) **Logging/tracing:** The different functionalities will
progressively acquire better logging mechanism to help track what
has been ran, and capture I/O easily. In addition, Joblib will
provide a few I/O primitives, to easily define define logging and
provide a few I/O primitives, to easily define logging and
display streams, and provide a way of compiling a report.
We want to be able to quickly inspect what has been run.

Expand Down Expand Up @@ -115,7 +115,7 @@
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = '0.9.3'
__version__ = '0.9.4'


from .memory import Memory, MemorizedResult
Expand Down
5 changes: 4 additions & 1 deletion sklearn/externals/joblib/_compat.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
"""
Compatibility layer for Python 3/Python 2 single codebase
"""
import sys

PY3_OR_LATER = sys.version_info[0] >= 3

try:
_basestring = basestring
_bytes_or_unicode = (str, unicode)
except NameError:
_basestring = str
_bytes_or_unicode = (bytes, str)
_bytes_or_unicode = (bytes, str)
27 changes: 11 additions & 16 deletions sklearn/externals/joblib/format_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,14 @@
import time
import tokenize
import traceback
import types

try: # Python 2
generate_tokens = tokenize.generate_tokens
except AttributeError: # Python 3
generate_tokens = tokenize.tokenize

PY3 = (sys.version[0] == '3')
INDENT = ' ' * 8

from ._compat import _basestring

###############################################################################
# some internal-use functions
Expand Down Expand Up @@ -195,14 +193,13 @@ def format_records(records): # , print_globals=False):
# the abspath call will throw an OSError. Just ignore it and
# keep the original file string.
pass

if file.endswith('.pyc'):
file = file[:-4] + '.py'

link = file
try:
args, varargs, varkw, locals = inspect.getargvalues(frame)
except:
# This can happen due to a bug in python2.3. We should be
# able to remove this try/except when 2.4 becomes a
# requirement. Bug details at http://python.org/sf/1005466
print("\nJoblib's exception reporting continues...\n")

args, varargs, varkw, locals = inspect.getargvalues(frame)

if func == '?':
call = ''
Expand Down Expand Up @@ -350,13 +347,11 @@ def format_exc(etype, evalue, etb, context=5, tb_offset=0):
date = time.ctime(time.time())
pid = 'PID: %i' % os.getpid()

head = '%s%s%s\n%s%s%s' % (etype, ' ' * (75 - len(str(etype)) - len(date)),
date, pid, ' ' * (75 - len(str(pid)) - len(pyver)),
pyver)
head = '%s%s%s\n%s%s%s' % (
etype, ' ' * (75 - len(str(etype)) - len(date)),
date, pid, ' ' * (75 - len(str(pid)) - len(pyver)),
pyver)

# Flush cache before calling inspect. This helps alleviate some of the
# problems with python 2.3's inspect.py.
linecache.checkcache()
# Drop topmost frames if requested
try:
records = _fixed_getframes(etb, context, tb_offset)
Expand Down
116 changes: 84 additions & 32 deletions sklearn/externals/joblib/func_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from ._compat import _basestring
from .logger import pformat
from ._memory_helpers import open_py_source
from ._compat import PY3_OR_LATER


def get_func_code(func):
""" Attempts to retrieve a reliable function code hash.
Expand Down Expand Up @@ -156,6 +158,53 @@ def get_func_name(func, resolv_alias=True, win_characters=True):
return module, name


def getfullargspec(func):
    """Compatibility function to provide inspect.getfullargspec in Python 2

    This should be rewritten using a backport of Python 3 signature
    once we drop support for Python 2.6. We went for a simpler
    approach at the time of writing because signature uses OrderedDict
    which is not available in Python 2.6.
    """
    try:
        # Python 3: delegate directly to the standard library.
        return inspect.getfullargspec(func)
    except AttributeError:
        pass
    # Python 2 fallback: emulate the FullArgSpec named tuple on top of
    # the result of the legacy inspect.getargspec.
    import collections
    FullArgSpec = collections.namedtuple(
        'FullArgSpec',
        'args varargs varkw defaults kwonlyargs kwonlydefaults annotations')
    legacy_spec = inspect.getargspec(func)
    return FullArgSpec(args=legacy_spec.args,
                       varargs=legacy_spec.varargs,
                       varkw=legacy_spec.keywords,
                       defaults=legacy_spec.defaults,
                       kwonlyargs=[],
                       kwonlydefaults=None,
                       annotations={})


def _signature_str(function_name, arg_spec):
    """Return ``function_name`` followed by its formatted signature.

    ``arg_spec`` is a FullArgSpec-like named tuple (see ``getfullargspec``).
    The signature string is built by hand because ``inspect.formatargspec``
    was deprecated in Python 3.5 and removed in Python 3.11, so calling it
    crashes on modern interpreters.  The output mirrors the historical
    ``formatargspec`` rendering, e.g. ``f(a, b=2, *args, c, **kwargs)``.
    """
    annotations = getattr(arg_spec, 'annotations', None) or {}

    def _fmt_annotation(ann):
        # Mirror inspect.formatannotation: class objects render by name,
        # everything else by repr.
        return ann.__qualname__ if isinstance(ann, type) else repr(ann)

    def _fmt_arg(name):
        # Annotated parameters render as 'name: annotation'.
        if name in annotations:
            return '%s: %s' % (name, _fmt_annotation(annotations[name]))
        return name

    defaults = arg_spec.defaults or ()
    # Defaults align with the tail of the positional parameter list.
    first_default = len(arg_spec.args) - len(defaults)
    specs = []
    for i, arg in enumerate(arg_spec.args):
        spec = _fmt_arg(arg)
        if i >= first_default:
            spec += '=%r' % (defaults[i - first_default],)
        specs.append(spec)

    kwonlyargs = getattr(arg_spec, 'kwonlyargs', None) or []
    if arg_spec.varargs is not None:
        specs.append('*' + _fmt_arg(arg_spec.varargs))
    elif kwonlyargs:
        # Keyword-only parameters without *args need a bare '*' separator.
        specs.append('*')
    kwonlydefaults = getattr(arg_spec, 'kwonlydefaults', None) or {}
    for arg in kwonlyargs:
        spec = _fmt_arg(arg)
        if arg in kwonlydefaults:
            spec += '=%r' % (kwonlydefaults[arg],)
        specs.append(spec)

    # Support both FullArgSpec ('varkw') and legacy ArgSpec ('keywords').
    varkw = getattr(arg_spec, 'varkw', None)
    if varkw is None:
        varkw = getattr(arg_spec, 'keywords', None)
    if varkw is not None:
        specs.append('**' + _fmt_arg(varkw))

    result = '{0}({1})'.format(function_name, ', '.join(specs))
    if 'return' in annotations:
        result += ' -> %s' % _fmt_annotation(annotations['return'])
    return result


def _function_called_str(function_name, args, kwargs):
    """Helper function to output a function call"""
    # Render positional arguments by stripping the parentheses from the
    # tuple repr (note: a 1-tuple keeps its trailing comma, e.g. '1,').
    formatted_args = repr(args)[1:-1]
    formatted_kwargs = ', '.join('%s=%s' % item
                                 for item in kwargs.items())
    return '{0}({1}, {2})'.format(function_name,
                                  formatted_args,
                                  formatted_kwargs)


def filter_args(func, ignore_lst, args=(), kwargs=dict()):
""" Filters the given args and kwargs using a list of arguments to
ignore, and a function specification.
Expand All @@ -180,24 +229,23 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()):
args = list(args)
if isinstance(ignore_lst, _basestring):
# Catch a common mistake
raise ValueError('ignore_lst must be a list of parameters to ignore '
raise ValueError(
'ignore_lst must be a list of parameters to ignore '
'%s (type %s) was given' % (ignore_lst, type(ignore_lst)))
# Special case for functools.partial objects
if (not inspect.ismethod(func) and not inspect.isfunction(func)):
if ignore_lst:
warnings.warn('Cannot inspect object %s, ignore list will '
'not work.' % func, stacklevel=2)
'not work.' % func, stacklevel=2)
return {'*': args, '**': kwargs}
arg_spec = inspect.getargspec(func)
# We need to if/them to account for different versions of Python
if hasattr(arg_spec, 'args'):
arg_names = arg_spec.args
arg_defaults = arg_spec.defaults
arg_keywords = arg_spec.keywords
arg_varargs = arg_spec.varargs
else:
arg_names, arg_varargs, arg_keywords, arg_defaults = arg_spec
arg_defaults = arg_defaults or {}
arg_spec = getfullargspec(func)
arg_names = arg_spec.args + arg_spec.kwonlyargs
arg_defaults = arg_spec.defaults or ()
arg_defaults = arg_defaults + tuple(arg_spec.kwonlydefaults[k]
for k in arg_spec.kwonlyargs)
arg_varargs = arg_spec.varargs
arg_varkw = arg_spec.varkw

if inspect.ismethod(func):
# First argument is 'self', it has been removed by Python
# we need to add it back:
Expand All @@ -211,7 +259,18 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()):
for arg_position, arg_name in enumerate(arg_names):
if arg_position < len(args):
# Positional argument or keyword argument given as positional
arg_dict[arg_name] = args[arg_position]
if arg_name not in arg_spec.kwonlyargs:
arg_dict[arg_name] = args[arg_position]
else:
raise ValueError(
"Keyword-only parameter '%s' was passed as "
'positional parameter for %s:\n'
' %s was called.'
% (arg_name,
_signature_str(name, arg_spec),
_function_called_str(name, args, kwargs))
)

else:
position = arg_position - len(arg_names)
if arg_name in kwargs:
Expand All @@ -221,28 +280,24 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()):
arg_dict[arg_name] = arg_defaults[position]
except (IndexError, KeyError):
# Missing argument
raise ValueError('Wrong number of arguments for %s%s:\n'
' %s(%s, %s) was called.'
% (name,
inspect.formatargspec(*inspect.getargspec(func)),
name,
repr(args)[1:-1],
', '.join('%s=%s' % (k, v)
for k, v in kwargs.items())
)
)
raise ValueError(
'Wrong number of arguments for %s:\n'
' %s was called.'
% (_signature_str(name, arg_spec),
_function_called_str(name, args, kwargs))
)

varkwargs = dict()
for arg_name, arg_value in sorted(kwargs.items()):
if arg_name in arg_dict:
arg_dict[arg_name] = arg_value
elif arg_keywords is not None:
elif arg_varkw is not None:
varkwargs[arg_name] = arg_value
else:
raise TypeError("Ignore list for %s() contains an unexpected "
"keyword argument '%s'" % (name, arg_name))

if arg_keywords is not None:
if arg_varkw is not None:
arg_dict['**'] = varkwargs
if arg_varargs is not None:
varargs = args[arg_position + 1:]
Expand All @@ -254,13 +309,10 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()):
arg_dict.pop(item)
else:
raise ValueError("Ignore list: argument '%s' is not defined for "
"function %s%s" %
(item, name,
inspect.formatargspec(arg_names,
arg_varargs,
arg_keywords,
arg_defaults,
)))
"function %s"
% (item,
_signature_str(name, arg_spec))
)
# XXX: Return a sorted list of pairs?
return arg_dict

Expand Down
41 changes: 30 additions & 11 deletions sklearn/externals/joblib/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,17 @@
# Copyright (c) 2009 Gael Varoquaux
# License: BSD Style, 3 clauses.

import warnings
import pickle
import hashlib
import sys
import types
import struct
from ._compat import _bytes_or_unicode

import io

PY3 = sys.version[0] == '3'
from ._compat import _bytes_or_unicode, PY3_OR_LATER


if PY3:
if PY3_OR_LATER:
Pickler = pickle._Pickler
else:
Pickler = pickle.Pickler
class _ConsistentSet(object):
    """ Class used to ensure the hash of Sets is preserved
        whatever the order of its items.
    """
    def __init__(self, set_sequence):
        # Forces order of elements in set to ensure consistent hash.
        try:
            # Trying first to order the set assuming the type of elements is
            # consistent and orderable.
            # This fails on python 3 when elements are unorderable
            # but we keep it in a try as it's faster.
            self._sequence = sorted(set_sequence)
        except TypeError:
            # If elements are unorderable, sorting them using their hash.
            # This is slower but works in any case.
            self._sequence = sorted(hash(e) for e in set_sequence)


class _MyHash(object):
Expand All @@ -49,7 +57,7 @@ def __init__(self, hash_name='md5'):
self.stream = io.BytesIO()
# By default we want a pickle protocol that only changes with
# the major python version and not the minor one
protocol = (pickle.DEFAULT_PROTOCOL if PY3
protocol = (pickle.DEFAULT_PROTOCOL if PY3_OR_LATER
else pickle.HIGHEST_PROTOCOL)
Pickler.__init__(self, self.stream, protocol=protocol)
# Initialise the hash obj
Expand All @@ -59,7 +67,8 @@ def hash(self, obj, return_digest=True):
try:
self.dump(obj)
except pickle.PicklingError as e:
warnings.warn('PicklingError while hashing %r: %r' % (obj, e))
e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
raise
dumps = self.stream.getvalue()
self._hash.update(dumps)
if return_digest:
Expand Down Expand Up @@ -128,8 +137,18 @@ def save_global(self, obj, name=None, pack=struct.pack):
dispatch[type(pickle.dump)] = save_global

def _batch_setitems(self, items):
    # Forces order of keys in dict to ensure consistent hash.
    try:
        # Trying first to compare dict assuming the type of keys is
        # consistent and orderable.
        # This fails on python 3 when keys are unorderable
        # but we keep it in a try as it's faster.
        Pickler._batch_setitems(self, iter(sorted(items)))
    except TypeError:
        # If keys are unorderable, sorting them using their hash. This is
        # slower but works in any case.
        Pickler._batch_setitems(self, iter(sorted((hash(k), v)
                                                  for k, v in items)))

def save_set(self, set_items):
# forces order of items in Set to ensure consistent hash
Expand Down Expand Up @@ -182,7 +201,7 @@ def save(self, obj):
# the array is Fortran rather than C contiguous
except (ValueError, BufferError):
# Cater for non-single-segment arrays: this creates a
# copy, and thus alleviates this issue.
# copy, and thus aleviates this issue.
# XXX: There might be a more efficient way of doing this
obj_bytes_view = obj.flatten().view(self.np.uint8)
self._hash.update(self._getbuffer(obj_bytes_view))
Expand Down
Loading
0