MTN vendor joblib 0.11.0dev0 · scikit-learn/scikit-learn@923d766 · GitHub

Commit 923d766

MTN vendor joblib 0.11.0dev0
1 parent addf05e commit 923d766

30 files changed · +2981 −1653 lines changed

sklearn/externals/joblib/__init__.py

Lines changed: 21 additions & 28 deletions
@@ -1,27 +1,25 @@
 """Joblib is a set of tools to provide **lightweight pipelining in
-Python**. In particular, joblib offers:
+Python**. In particular:
 
-1. transparent disk-caching of the output values and lazy re-evaluation
+1. transparent disk-caching of functions and lazy re-evaluation
    (memoize pattern)
 
 2. easy simple parallel computing
 
-3. logging and tracing of the execution
-
 Joblib is optimized to be **fast** and **robust** in particular on large
 data and has specific optimizations for `numpy` arrays. It is
 **BSD-licensed**.
 
 
-    ========================= ================================================
-    **User documentation:** http://pythonhosted.org/joblib
+    ==================== ===============================================
+    **Documentation:** http://pythonhosted.org/joblib
 
-    **Download packages:** http://pypi.python.org/pypi/joblib#downloads
+    **Download:** http://pypi.python.org/pypi/joblib#downloads
 
-    **Source code:** http://github.com/joblib/joblib
+    **Source code:** http://github.com/joblib/joblib
 
-    **Report issues:** http://github.com/joblib/joblib/issues
-    ========================= ================================================
+    **Report issues:** http://github.com/joblib/joblib/issues
+    ==================== ===============================================
 
 
 Vision
@@ -43,9 +41,8 @@
 good for resuming an application status or computational job, eg
 after a crash.
 
-Joblib strives to address these problems while **leaving your code and
-your flow control as unmodified as possible** (no framework, no new
-paradigms).
+Joblib addresses these problems while **leaving your code and your flow
+control as unmodified as possible** (no framework, no new paradigms).
 
 Main features
 ------------------
@@ -59,16 +56,17 @@
 computation to disk and rerun it only if necessary::
 
       >>> from sklearn.externals.joblib import Memory
-      >>> mem = Memory(cachedir='/tmp/joblib')
+      >>> cachedir = 'your_cache_dir_goes_here'
+      >>> mem = Memory(cachedir)
       >>> import numpy as np
       >>> a = np.vander(np.arange(3)).astype(np.float)
       >>> square = mem.cache(np.square)
       >>> b = square(a)  # doctest: +ELLIPSIS
       ________________________________________________________________________________
       [Memory] Calling square...
-      square(array([[ 0.,  0.,  1.],
-             [ 1.,  1.,  1.],
-             [ 4.,  2.,  1.]]))
+      square(array([[0., 0., 1.],
+             [1., 1., 1.],
+             [4., 2., 1.]]))
       ___________________________________________________________square - 0...s, 0.0min
 
       >>> c = square(a)
@@ -83,19 +81,12 @@
 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
 
 
-3) **Logging/tracing:** The different functionalities will
-   progressively acquire better logging mechanism to help track what
-   has been ran, and capture I/O easily. In addition, Joblib will
-   provide a few I/O primitives, to easily define logging and
-   display streams, and provide a way of compiling a report.
-   We want to be able to quickly inspect what has been run.
-
-4) **Fast compressed Persistence**: a replacement for pickle to work
+3) **Fast compressed Persistence**: a replacement for pickle to work
    efficiently on Python objects containing large data (
    *joblib.dump* & *joblib.load* ).
 
 ..
-    >>> import shutil ; shutil.rmtree('/tmp/joblib/')
+    >>> import shutil ; shutil.rmtree(cachedir)
 
 """
 

@@ -118,12 +109,13 @@
 __version__ = '0.11.1.dev0'
 
 
-from .memory import Memory, MemorizedResult
+from .memory import Memory, MemorizedResult, register_store_backend
 from .logger import PrintTime
 from .logger import Logger
 from .hashing import hash
 from .numpy_pickle import dump
 from .numpy_pickle import load
+from .compressor import register_compressor
 from .parallel import Parallel
 from .parallel import delayed
 from .parallel import cpu_count
@@ -134,4 +126,5 @@
 
 __all__ = ['Memory', 'MemorizedResult', 'PrintTime', 'Logger', 'hash', 'dump',
            'load', 'Parallel', 'delayed', 'cpu_count', 'effective_n_jobs',
-           'register_parallel_backend', 'parallel_backend']
+           'register_parallel_backend', 'parallel_backend',
+           'register_store_backend', 'register_compressor']
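
Aside on the two new exports: `register_store_backend` and `register_compressor` are extension hooks that this vendored joblib now re-exports. Below is a minimal sketch of the compressor hook, modelled on joblib's documented `CompressorWrapper` pattern; it assumes the third-party `lz4` package is installed and that the vendored module paths match this checkout, so treat the exact signatures as assumptions for this dev version rather than a guaranteed API.

    # Hedged sketch: registering a custom compressor via the new export.
    import lz4.frame  # third-party package, assumed installed
    from sklearn.externals.joblib import register_compressor, dump
    from sklearn.externals.joblib.compressor import CompressorWrapper

    class LZ4FrameCompressorWrapper(CompressorWrapper):
        prefix = b'\x04"M\x18'  # LZ4 frame magic bytes, used for detection
        extension = '.lz4'

        def __init__(self):
            CompressorWrapper.__init__(self, obj=lz4.frame.LZ4FrameFile,
                                       prefix=self.prefix,
                                       extension=self.extension)

    register_compressor('lz4-frame', LZ4FrameCompressorWrapper())
    # The registered name can then be passed to joblib.dump:
    dump({'some': 'object'}, '/tmp/obj.pkl.lz4', compress='lz4-frame')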

sklearn/externals/joblib/_memmapping_reducer.py

Lines changed: 84 additions & 13 deletions
@@ -13,6 +13,8 @@
 import atexit
 import tempfile
 import warnings
+import weakref
+from uuid import uuid4
 
 try:
     WindowsError
@@ -28,7 +30,7 @@
 from pickle import loads
 from pickle import dumps
 
-from pickle import HIGHEST_PROTOCOL
+from pickle import HIGHEST_PROTOCOL, PicklingError
 
 try:
     import numpy as np
@@ -38,20 +40,64 @@
 
 from .numpy_pickle import load
 from .numpy_pickle import dump
-from .hashing import hash
 from .backports import make_memmap
 from .disk import delete_folder
 
 # Some system have a ramdisk mounted by default, we can use it instead of /tmp
-# as the default folder to dump big arrays to share with subprocesses
+# as the default folder to dump big arrays to share with subprocesses.
 SYSTEM_SHARED_MEM_FS = '/dev/shm'
 
+# Minimal number of bytes available on SYSTEM_SHARED_MEM_FS to consider using
+# it as the default folder to dump big arrays to share with subprocesses.
+SYSTEM_SHARED_MEM_FS_MIN_SIZE = int(2e9)
+
 # Folder and file permissions to chmod temporary files generated by the
 # memmapping pool. Only the owner of the Python process can access the
 # temporary files and folder.
 FOLDER_PERMISSIONS = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
 FILE_PERMISSIONS = stat.S_IRUSR | stat.S_IWUSR
 
+
+class _WeakArrayKeyMap:
+    """A variant of weakref.WeakKeyDictionary for unhashable numpy arrays.
+
+    This datastructure will be used with numpy arrays as obj keys, therefore we
+    do not use the __get__ / __set__ methods to avoid any conflict with the
+    numpy fancy indexing syntax.
+    """
+
+    def __init__(self):
+        self._data = {}
+
+    def get(self, obj):
+        ref, val = self._data[id(obj)]
+        if ref() is not obj:
+            # In case of race condition with on_destroy: could never be
+            # triggered by the joblib tests with CPython.
+            raise KeyError(obj)
+        return val
+
+    def set(self, obj, value):
+        key = id(obj)
+        try:
+            ref, _ = self._data[key]
+            if ref() is not obj:
+                # In case of race condition with on_destroy: could never be
+                # triggered by the joblib tests with CPython.
+                raise KeyError(obj)
+        except KeyError:
+            # Insert the new entry in the mapping along with a weakref
+            # callback to automatically delete the entry from the mapping
+            # as soon as the object used as key is garbage collected.
+            def on_destroy(_):
+                del self._data[key]
+            ref = weakref.ref(obj, on_destroy)
+        self._data[key] = ref, value
+
+    def __getstate__(self):
+        raise PicklingError("_WeakArrayKeyMap is not pickleable")
+
+
 ###############################################################################
 # Support for efficient transient pickling of numpy data structures
 
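
Reviewer note on the class above: `_WeakArrayKeyMap` keys entries by object identity (`id(obj)`) because numpy arrays are unhashable and hashing their contents would be costly; the weakref callback keeps the mapping from leaking once an array is garbage collected. A standalone illustration of that behaviour (the import path is assumed from this checkout):

    import gc
    import numpy as np
    from sklearn.externals.joblib._memmapping_reducer import _WeakArrayKeyMap

    m = _WeakArrayKeyMap()
    a = np.arange(10)
    m.set(a, 'basename-for-a.pkl')
    assert m.get(a) == 'basename-for-a.pkl'  # lookup is by identity...

    b = a.copy()
    try:
        m.get(b)  # ...so an equal-valued copy is a cache miss
    except KeyError:
        pass

    del a
    gc.collect()  # the weakref callback purges the stale entry
    assert len(m._data) == 0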

@@ -109,12 +155,18 @@ def _get_temp_dir(pool_folder_name, temp_folder=None):
     if temp_folder is None:
         if os.path.exists(SYSTEM_SHARED_MEM_FS):
             try:
-                temp_folder = SYSTEM_SHARED_MEM_FS
-                pool_folder = os.path.join(temp_folder, pool_folder_name)
-                if not os.path.exists(pool_folder):
-                    os.makedirs(pool_folder)
-                use_shared_mem = True
-            except IOError:
+                shm_stats = os.statvfs(SYSTEM_SHARED_MEM_FS)
+                available_nbytes = shm_stats.f_bsize * shm_stats.f_bavail
+                if available_nbytes > SYSTEM_SHARED_MEM_FS_MIN_SIZE:
+                    # Try to see if we have write access to the shared mem
+                    # folder only if it is reasonably large (that is 2GB or
+                    # more).
+                    temp_folder = SYSTEM_SHARED_MEM_FS
+                    pool_folder = os.path.join(temp_folder, pool_folder_name)
+                    if not os.path.exists(pool_folder):
+                        os.makedirs(pool_folder)
+                    use_shared_mem = True
+            except (IOError, OSError):
                 # Missing rights in the the /dev/shm partition,
                 # fallback to regular temp folder.
                 temp_folder = None
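
The new free-space guard is easy to reproduce by hand: `os.statvfs` reports the filesystem block size (`f_bsize`) and the number of blocks available to unprivileged processes (`f_bavail`), whose product is the usable byte count. A POSIX-only sketch:

    import os

    stats = os.statvfs('/dev/shm')
    available_nbytes = stats.f_bsize * stats.f_bavail
    print('free bytes on /dev/shm:', available_nbytes)
    # /dev/shm is only considered when at least ~2 GB are available:
    print('eligible:', available_nbytes > int(2e9))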
@@ -231,6 +283,19 @@ def __init__(self, max_nbytes, temp_folder, mmap_mode, verbose=0,
         self._mmap_mode = mmap_mode
         self.verbose = int(verbose)
         self._prewarm = prewarm
+        self._memmaped_arrays = _WeakArrayKeyMap()
+
+    def __reduce__(self):
+        # The ArrayMemmapReducer is passed to the children processes: it needs
+        # to be pickled but the _WeakArrayKeyMap need to be skipped as it's
+        # only guaranteed to be consistent with the parent process memory
+        # garbage collection.
+        args = (self._max_nbytes, self._temp_folder, self._mmap_mode)
+        kwargs = {
+            'verbose': self.verbose,
+            'prewarm': self._prewarm,
+        }
+        return ArrayMemmapReducer, args, kwargs
 
     def __call__(self, a):
         m = _get_backing_memmap(a)
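
A note on the `__reduce__` above: returning a `(callable, args, state)` triple lets pickle rebuild the reducer in each worker while the `_WeakArrayKeyMap` (made unpicklable by `__getstate__`) is simply left out, so every process starts with a fresh map. A toy equivalent of the pattern, not joblib code:

    import pickle

    class Toy:
        def __init__(self, max_nbytes, verbose=0):
            self.max_nbytes = max_nbytes
            self.verbose = verbose
            self.cache = {}  # stand-in for the unpicklable _WeakArrayKeyMap

        def __reduce__(self):
            # (callable, args, state): `cache` appears in none of the three,
            # so the unpickled copy starts with the fresh {} from __init__.
            return Toy, (self.max_nbytes,), {'verbose': self.verbose}

    t = Toy(int(1e6), verbose=1)
    t.cache['k'] = 'v'
    clone = pickle.loads(pickle.dumps(t))
    assert clone.verbose == 1 and clone.cache == {}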
@@ -249,10 +314,16 @@ def __call__(self, a):
                 if e.errno != errno.EEXIST:
                     raise e
 
-            # Find a unique, concurrent safe filename for writing the
-            # content of this array only once.
-            basename = "{}-{}-{}.pkl".format(
-                os.getpid(), id(threading.current_thread()), hash(a))
+            try:
+                basename = self._memmaped_arrays.get(a)
+            except KeyError:
+                # Generate a new unique random filename. The process and thread
+                # ids are only useful for debugging purpose and to make it
+                # easier to cleanup orphaned files in case of hard process
+                # kill (e.g. by "kill -9" or segfault).
+                basename = "{}-{}-{}.pkl".format(
+                    os.getpid(), id(threading.current_thread()), uuid4().hex)
+                self._memmaped_arrays.set(a, basename)
             filename = os.path.join(self._temp_folder, basename)
 
             # In case the same array with the same content is passed several
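
Net effect of this last hunk: the basename for a given array object is generated once, memoized in the identity-keyed map, and the content hash (`hash(a)`, whose import was dropped above) is replaced by a cheap `uuid4` suffix. A standalone sketch of the naming scheme, using a plain dict as a hypothetical stand-in for `_WeakArrayKeyMap`:

    import os
    import threading
    from uuid import uuid4

    import numpy as np

    _basenames = {}  # hypothetical stand-in for the weakref-based map

    def basename_for(a):
        try:
            return _basenames[id(a)]
        except KeyError:
            # pid and thread id only help spot orphaned files after a hard
            # kill; uuid4 alone guarantees uniqueness without hashing data.
            name = '{}-{}-{}.pkl'.format(
                os.getpid(), id(threading.current_thread()), uuid4().hex)
            _basenames[id(a)] = name
            return name

    a = np.zeros(10)
    assert basename_for(a) == basename_for(a)  # stable per array object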
