diff --git a/docs/_static/donotdelete b/docs/_static/donotdelete new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/release.rst b/docs/release.rst index 9dccc9bdb6..d68a4614c9 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -1,6 +1,14 @@ Release notes ============= +* Added ``overwrite`` keyword argument to array and group creation methods + on the :class:`zarr.hierarchy.Group` class + (`#71 `_). +* Added ``cache_metadata`` keyword argument to array creation methods. +* The functions :func:`zarr.creation.open_array` and + :func:`zarr.hierarchy.open_group` now accept any store as first argument + (`#56 `_). + .. _release_2.0.1: 2.0.1 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 7946c7a71c..0db9c9a992 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -230,7 +230,7 @@ the delta filter:: ... chunks=(1000, 1000), compressor=compressor) >>> z Array((10000, 10000), int32, chunks=(1000, 1000), order=C) - nbytes: 381.5M; nbytes_stored: 248.9K; ratio: 1569.6; initialized: 100/100 + nbytes: 381.5M; nbytes_stored: 248.9K; ratio: 1569.7; initialized: 100/100 compressor: LZMA(format=1, check=-1, preset=None, filters=[{'dist': 4, 'id': 3}, {'preset': 1, 'id': 33}]) store: dict @@ -327,7 +327,7 @@ provided that all processes have access to a shared file system. E.g.:: ... synchronizer=synchronizer) >>> z Array((10000, 10000), int32, chunks=(1000, 1000), order=C) - nbytes: 381.5M; nbytes_stored: 326; ratio: 1226993.9; initialized: 0/100 + nbytes: 381.5M; nbytes_stored: 323; ratio: 1238390.1; initialized: 0/100 compressor: Blosc(cname='lz4', clevel=5, shuffle=1) store: DirectoryStore; synchronizer: ProcessSynchronizer diff --git a/zarr/core.py b/zarr/core.py index db81688913..b42a3e7c96 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -13,7 +13,7 @@ from zarr.storage import array_meta_key, attrs_key, listdir, getsize from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.attrs import Attributes -from zarr.errors import PermissionError +from zarr.errors import PermissionError, err_read_only, err_array_not_found from zarr.compat import reduce from zarr.codecs import get_codec @@ -34,6 +34,11 @@ class Array(object): for storage of both chunks and metadata. synchronizer : object, optional Array synchronizer. + cache_metadata : bool, optional + If True, array configuration metadata will be cached for the + lifetime of the object. If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). 
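For example, a minimal sketch of the difference, assuming a writable directory ``data/example.zarr`` (hypothetical path; note that only data access and modification operations trigger the reload, not plain property access)::

    >>> import zarr
    >>> z1 = zarr.open_array('data/example.zarr', mode='w', shape=(100,),
    ...                      chunks=(10,), dtype='i4')
    >>> z2 = zarr.open_array('data/example.zarr', mode='r+',
    ...                      cache_metadata=False)
    >>> z1.resize(200)
    >>> z2[150]  # data access reloads metadata, so the resize is visible
    0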
Attributes ---------- @@ -56,8 +61,9 @@ class Array(object): itemsize nbytes nbytes_stored - initialized cdata_shape + nchunks + nchunks_initialized is_view Methods @@ -71,7 +77,7 @@ class Array(object): """ # flake8: noqa def __init__(self, store, path=None, read_only=False, chunk_store=None, - synchronizer=None): + synchronizer=None, cache_metadata=True): # N.B., expect at this point store is fully initialized with all # configuration metadata fully specified and normalized @@ -87,13 +93,23 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, else: self._chunk_store = chunk_store self._synchronizer = synchronizer + self._cache_metadata = cache_metadata + self._is_view = False # initialize metadata + self._load_metadata() + + # initialize attributes + akey = self._key_prefix + attrs_key + self._attrs = Attributes(store, key=akey, read_only=read_only, + synchronizer=synchronizer) + + def _load_metadata(self): try: mkey = self._key_prefix + array_meta_key - meta_bytes = store[mkey] + meta_bytes = self._store[mkey] except KeyError: - raise ValueError('store has no metadata') + err_array_not_found(self._path) else: # decode and store metadata @@ -104,7 +120,6 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, self._dtype = meta['dtype'] self._fill_value = meta['fill_value'] self._order = meta['order'] - self._is_view = False # setup compressor config = meta['compressor'] @@ -119,14 +134,10 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, filters = [get_codec(f) for f in filters] self._filters = filters - # initialize attributes - akey = self._key_prefix + attrs_key - self._attrs = Attributes(store, key=akey, read_only=read_only, - synchronizer=synchronizer) - def _flush_metadata(self): if self._is_view: - raise PermissionError('operation not permitted for views') + raise PermissionError('not permitted for views') + if self._compressor: compressor_config = self._compressor.get_config() else: @@ -253,12 +264,6 @@ def nbytes_stored(self): else: return m + n - @property - def initialized(self): - """The number of chunks that have been initialized with some data.""" - return sum(1 for k in listdir(self._chunk_store, self._path) - if k not in [array_meta_key, attrs_key]) - @property def cdata_shape(self): """A tuple of integers describing the number of chunks along each @@ -267,6 +272,20 @@ def cdata_shape(self): int(np.ceil(s / c)) for s, c in zip(self._shape, self._chunks) ) + @property + def nchunks(self): + """Total number of chunks.""" + return reduce(operator.mul, self.cdata_shape) + + @property + def nchunks_initialized(self): + """The number of chunks that have been initialized with some data.""" + return sum(1 for k in listdir(self._chunk_store, self._path) + if k not in [array_meta_key, attrs_key]) + + # backwards compatibility + initialized = nchunks_initialized + @property def is_view(self): """A boolean, True if this array is a view on another array.""" @@ -366,6 +385,10 @@ def __getitem__(self, item): """ # flake8: noqa + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + # normalize selection selection = normalize_array_selection(item, self._shape) @@ -482,7 +505,11 @@ def __setitem__(self, key, value): # guard conditions if self._read_only: - raise PermissionError('array is read-only') + err_read_only() + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() # normalize selection selection = normalize_array_selection(key, self._shape) @@ -717,6 +744,10 @@ def 
_encode_chunk(self, chunk): def __repr__(self): + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + # main line r = '%s(' % type(self).__name__ if self.name: @@ -733,8 +764,8 @@ def __repr__(self): r += '; nbytes_stored: %s' % human_readable_size( self.nbytes_stored) r += '; ratio: %.1f' % (self.nbytes / self.nbytes_stored) - n_chunks = reduce(operator.mul, self.cdata_shape) - r += '; initialized: %s/%s' % (self.initialized, n_chunks) + r += '; initialized: %s/%s' % (self.nchunks_initialized, + self.nchunks) # filters if self.filters: @@ -768,15 +799,28 @@ def _write_op(self, f, *args, **kwargs): # guard condition if self._read_only: - raise PermissionError('array is read-only') + err_read_only() # synchronization if self._synchronizer is None: + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + return f(*args, **kwargs) + else: + # synchronize on the array mkey = self._key_prefix + array_meta_key + with self._synchronizer[mkey]: + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + return f(*args, **kwargs) def resize(self, *args): @@ -1022,7 +1066,7 @@ def view(self, shape=None, chunks=None, dtype=None, ... v.resize(20000) ... except PermissionError as e: ... print(e) - operation not permitted for views + not permitted for views """ # flake8: noqa @@ -1034,7 +1078,8 @@ def view(self, shape=None, chunks=None, dtype=None, if synchronizer is None: synchronizer = self._synchronizer a = Array(store=store, path=path, chunk_store=chunk_store, - read_only=read_only, synchronizer=synchronizer) + read_only=read_only, synchronizer=synchronizer, + cache_metadata=True) a._is_view = True # allow override of some properties diff --git a/zarr/creation.py b/zarr/creation.py index c12281cec7..81589e414a 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -8,14 +8,16 @@ from zarr.core import Array from zarr.storage import DirectoryStore, init_array, contains_array, \ - contains_group, default_compressor + contains_group, default_compressor, normalize_storage_path from zarr.codecs import codec_registry +from zarr.errors import err_contains_array, err_contains_group, \ + err_array_not_found def create(shape, chunks=None, dtype=None, compressor='default', - fill_value=None, order='C', store=None, synchronizer=None, + fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, path=None, chunk_store=None, filters=None, - **kwargs): + cache_metadata=True, **kwargs): """Create an array. Parameters @@ -32,14 +34,13 @@ def create(shape, chunks=None, dtype=None, compressor='default', Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. - store : MutableMapping, optional - Array storage. If not provided, a Python dict will be used, meaning - array data will be stored in memory. + store : MutableMapping or string + Store or path to directory in file system. synchronizer : object, optional Array synchronizer. overwrite : bool, optional - If True, delete all pre-existing data in `store` before creating the - array. + If True, delete all pre-existing data in `store` at `path` before + creating the array. path : string, optional Path under which array is stored. chunk_store : MutableMapping, optional @@ -47,6 +48,11 @@ def create(shape, chunks=None, dtype=None, compressor='default', for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. 
+ cache_metadata : bool, optional + If True, array configuration metadata will be cached for the + lifetime of the object. If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). Returns ------- @@ -61,15 +67,14 @@ def create(shape, chunks=None, dtype=None, compressor='default', >>> z = zarr.create((10000, 10000), chunks=(1000, 1000)) >>> z Array((10000, 10000), float64, chunks=(1000, 1000), order=C) - nbytes: 762.9M; nbytes_stored: 326; ratio: 2453987.7; initialized: 0/100 + nbytes: 762.9M; nbytes_stored: 323; ratio: 2476780.2; initialized: 0/100 compressor: Blosc(cname='lz4', clevel=5, shuffle=1) store: dict """ # flake8: noqa - # initialize store - if store is None: - store = dict() + # handle polymorphic store arg + store = _handle_store_arg(store) # compatibility compressor, fill_value = _handle_kwargs(compressor, fill_value, kwargs) @@ -82,17 +87,33 @@ def create(shape, chunks=None, dtype=None, compressor='default', # instantiate array z = Array(store, path=path, chunk_store=chunk_store, - synchronizer=synchronizer) + synchronizer=synchronizer, cache_metadata=cache_metadata) return z +def _handle_store_arg(store): + if store is None: + return dict() + elif isinstance(store, str): + return DirectoryStore(store) + else: + return store + + def _handle_kwargs(compressor, fill_value, kwargs): # to be compatible with h5py, as well as backwards-compatible with Zarr # 1.x, accept 'compression' and 'compression_opts' keyword arguments - if 'compression' in kwargs: + if compressor != 'default': + # 'compressor' overrides 'compression' + if 'compression' in kwargs: + warn("'compression' keyword argument overridden by 'compressor'") + if 'compression_opts' in kwargs: + warn("ignoring keyword argument 'compression_opts'") + + elif 'compression' in kwargs: compression = kwargs.pop('compression') compression_opts = kwargs.pop('compression_opts', None) @@ -131,7 +152,7 @@ def _handle_kwargs(compressor, fill_value, kwargs): # ignore other keyword arguments for k in kwargs: - warn('ignoring keyword argument: %r' % k) + warn('ignoring keyword argument %r' % k) return compressor, fill_value @@ -223,6 +244,26 @@ def full(shape, fill_value, **kwargs): return create(shape=shape, fill_value=fill_value, **kwargs) +def _get_shape_chunks(a): + shape = None + chunks = None + + if hasattr(a, 'shape') and \ + isinstance(a.shape, tuple): + shape = a.shape + + if hasattr(a, 'chunks') and \ + isinstance(a.chunks, tuple) and \ + (len(a.chunks) == len(a.shape)): + chunks = a.chunks + + elif hasattr(a, 'chunklen'): + # bcolz carray + chunks = (a.chunklen,) + a.shape[1:] + + return shape, chunks + + def array(data, **kwargs): """Create an array filled with `data`. 
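The helper is duck-typed rather than tied to any particular library. A rough illustration, using a hypothetical bcolz-style stand-in class::

    >>> import numpy as np
    >>> from zarr.creation import _get_shape_chunks
    >>> class ChunkedLike(object):
    ...     def __init__(self, data, chunklen):
    ...         self.shape = data.shape
    ...         self.chunklen = chunklen  # chunk length along first axis
    >>> a = ChunkedLike(np.zeros((100, 10)), 10)
    >>> _get_shape_chunks(a)
    ((100, 10), (10, 10))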
@@ -258,13 +299,7 @@ def array(data, **kwargs): # setup chunks chunks = kwargs.pop('chunks', None) if chunks is None: - # try to use same chunks as data - if hasattr(data, 'chunklen'): - # bcolz carray - chunks = (data.chunklen,) + shape[1:] - elif hasattr(data, 'chunks') and len(data.chunks) == len(data.shape): - # h5py dataset or zarr array - chunks = data.chunks + _, chunks = _get_shape_chunks(data) # instantiate array z = create(shape=shape, chunks=chunks, dtype=dtype, **kwargs) @@ -275,16 +310,16 @@ def array(data, **kwargs): return z -def open_array(path, mode='a', shape=None, chunks=None, dtype=None, - compressor='default', fill_value=None, order='C', - synchronizer=None, filters=None, **kwargs): - """Convenience function to instantiate an array stored in a - directory on the file system. +def open_array(store=None, mode='a', shape=None, chunks=None, dtype=None, + compressor='default', fill_value=0, order='C', + synchronizer=None, filters=None, cache_metadata=True, + path=None, **kwargs): + """Open array using mode-like semantics. Parameters ---------- - path : string - Path to directory in file system in which to store the array. + store : MutableMapping or string + Store or path to directory in file system. mode : {'r', 'r+', 'a', 'w', 'w-'} Persistence mode: 'r' means read only (must exist); 'r+' means read/write (must exist); 'a' means read/write (create if doesn't @@ -306,6 +341,13 @@ def open_array(path, mode='a', shape=None, chunks=None, dtype=None, Array synchronizer. filters : sequence, optional Sequence of filters to use to encode chunk data prior to compression. + cache_metadata : bool, optional + If True, array configuration metadata will be cached for the + lifetime of the object. If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). + path : string, optional + Array path. 
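With the polymorphic ``store`` argument, a path string and an explicit store object are interchangeable; a sketch, assuming the directory ``data/example.zarr`` is writable::

    >>> import zarr
    >>> from zarr.storage import DirectoryStore
    >>> z1 = zarr.open_array('data/example.zarr', mode='w', shape=(100,),
    ...                      chunks=(10,), path='foo/bar')
    >>> z2 = zarr.open_array(DirectoryStore('data/example.zarr'), mode='r',
    ...                      path='foo/bar')
    >>> z2.path
    'foo/bar'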
Returns ------- @@ -339,16 +381,16 @@ def open_array(path, mode='a', shape=None, chunks=None, dtype=None, """ # flake8: noqa - # use same mode semantics as h5py, although N.B., here `path` is a - # directory: + # use same mode semantics as h5py # r : read only, must exist # r+ : read/write, must exist # w : create, delete if exists # w- or x : create, fail if exists # a : read/write if exists, create otherwise (default) - # setup store - store = DirectoryStore(path) + # handle polymorphic store arg + store = _handle_store_arg(store) + path = normalize_storage_path(path) # compatibility compressor, fill_value = _handle_kwargs(compressor, fill_value, kwargs) @@ -356,39 +398,40 @@ def open_array(path, mode='a', shape=None, chunks=None, dtype=None, # ensure store is initialized if mode in ['r', 'r+']: - if contains_group(store): - raise ValueError('store contains group') - elif not contains_array(store): - raise ValueError('array does not exist') + if contains_group(store, path=path): + err_contains_group(path) + elif not contains_array(store, path=path): + err_array_not_found(path) elif mode == 'w': init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, - order=order, filters=filters, overwrite=True) + order=order, filters=filters, overwrite=True, path=path) elif mode == 'a': - if contains_group(store): - raise ValueError('store contains group') - elif not contains_array(store): + if contains_group(store, path=path): + err_contains_group(path) + elif not contains_array(store, path=path): init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, - order=order, filters=filters) + order=order, filters=filters, path=path) elif mode in ['w-', 'x']: - if contains_group(store): - raise ValueError('store contains group') - elif contains_array(store): - raise ValueError('store contains array') + if contains_group(store, path=path): + err_contains_group(path) + elif contains_array(store, path=path): + err_contains_array(path) else: init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, - order=order, filters=filters) + order=order, filters=filters, path=path) # determine read only status read_only = mode == 'r' # instantiate array - z = Array(store, read_only=read_only, synchronizer=synchronizer) + z = Array(store, read_only=read_only, synchronizer=synchronizer, + cache_metadata=cache_metadata, path=path) return z @@ -399,11 +442,11 @@ def open_array(path, mode='a', shape=None, chunks=None, dtype=None, def _like_args(a, kwargs): - if hasattr(a, 'shape'): - kwargs.setdefault('shape', a.shape) - - if hasattr(a, 'chunks'): - kwargs.setdefault('chunks', a.chunks) + shape, chunks = _get_shape_chunks(a) + if shape is not None: + kwargs.setdefault('shape', shape) + if chunks is not None: + kwargs.setdefault('chunks', chunks) if hasattr(a, 'dtype'): kwargs.setdefault('dtype', a.dtype) diff --git a/zarr/errors.py b/zarr/errors.py index 6f4e5a40f7..f1baf429e6 100644 --- a/zarr/errors.py +++ b/zarr/errors.py @@ -17,3 +17,36 @@ class PermissionError(Exception): class MetadataError(Exception): pass + + +def err_contains_group(path): + raise KeyError('path %r contains a group' % path) + + +def err_contains_array(path): + raise KeyError('path %r contains an array' % path) + + +def err_array_not_found(path): + raise KeyError('array not found at path %r' % path) + + +def err_group_not_found(path): + raise KeyError('group not found at path %r' % path) + + +def 
err_path_not_found(path): + raise KeyError('path %r not found' % path) + + +def err_bad_compressor(compressor): + raise ValueError('bad compressor; expected Codec object, found %r' % + compressor) + + +def err_fspath_exists_notdir(fspath): + raise ValueError('path exists but is not a directory: %r' % fspath) + + +def err_read_only(): + raise PermissionError('object is read-only') diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 78efbe4087..7ce2c65e92 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -13,7 +13,8 @@ from zarr.creation import array, create, empty, zeros, ones, full, \ empty_like, zeros_like, ones_like, full_like from zarr.util import normalize_storage_path, normalize_shape -from zarr.errors import PermissionError +from zarr.errors import PermissionError, err_contains_array, \ + err_contains_group, err_group_not_found, err_read_only from zarr.meta import decode_group_metadata @@ -22,10 +23,10 @@ class Group(Mapping): Parameters ---------- - store : HierarchicalStore + store : MutableMapping Group store, already initialized. path : string, optional - Storage path. + Group path. read_only : bool, optional True if group should be protected against modification. chunk_store : MutableMapping, optional @@ -91,14 +92,14 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, # guard conditions if contains_array(store, path=self._path): - raise ValueError('store contains an array') + err_contains_array(path) # initialize metadata try: mkey = self._key_prefix + group_meta_key meta_bytes = store[mkey] except KeyError: - raise ValueError('store has no metadata') + err_group_not_found(path) else: meta = decode_group_metadata(meta_bytes) self._meta = meta @@ -285,7 +286,7 @@ def __getitem__(self, item): store: DictStore >>> g1['foo/bar/baz'] Array(/foo/bar/baz, (100,), float64, chunks=(10,), order=C) - nbytes: 800; nbytes_stored: 293; ratio: 2.7; initialized: 0/10 + nbytes: 800; nbytes_stored: 290; ratio: 2.8; initialized: 0/10 compressor: Blosc(cname='lz4', clevel=5, shuffle=1) store: DictStore @@ -396,7 +397,7 @@ def _write_op(self, f, *args, **kwargs): # guard condition if self._read_only: - raise PermissionError('group is read-only') + err_read_only() # synchronization if self._synchronizer is None: @@ -406,13 +407,15 @@ def _write_op(self, f, *args, **kwargs): with self._synchronizer[group_meta_key]: return f(*args, **kwargs) - def create_group(self, name): + def create_group(self, name, overwrite=False): """Create a sub-group. Parameters ---------- name : string Group name. + overwrite : bool, optional + If True, overwrite any existing array with the given name. 
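E.g., a sketch of replacing an existing array member with a group of the same name, which previously raised an error::

    >>> import zarr
    >>> g = zarr.group()
    >>> d = g.create_dataset('foo', shape=(100,), chunks=(10,))
    >>> g2 = g.create_group('foo', overwrite=True)  # existing array is deleted
    >>> g2.path
    'foo'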
Returns ------- @@ -428,43 +431,33 @@ def create_group(self, name): """ - return self._write_op(self._create_group_nosync, name) - - def _create_group_nosync(self, name): + return self._write_op(self._create_group_nosync, name, + overwrite=overwrite) + def _create_group_nosync(self, name, overwrite=False): path = self._item_path(name) - # require intermediate groups - segments = path.split('/') - for i in range(len(segments)): - p = '/'.join(segments[:i]) - if contains_array(self._store, p): - raise KeyError(name) - elif not contains_group(self._store, p): - init_group(self._store, path=p, chunk_store=self._chunk_store) - # create terminal group - if contains_array(self._store, path): - raise KeyError(name) - if contains_group(self._store, path): - raise KeyError(name) - else: - init_group(self._store, path=path, chunk_store=self._chunk_store) - return Group(self._store, path=path, read_only=self._read_only, - chunk_store=self._chunk_store, - synchronizer=self._synchronizer) + init_group(self._store, path=path, chunk_store=self._chunk_store, + overwrite=overwrite) - def create_groups(self, *names): + return Group(self._store, path=path, read_only=self._read_only, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) + + def create_groups(self, *names, **kwargs): """Convenience method to create multiple groups in a single call.""" - return tuple(self.create_group(name) for name in names) + return tuple(self.create_group(name, **kwargs) for name in names) - def require_group(self, name): + def require_group(self, name, overwrite=False): """Obtain a sub-group, creating one if it doesn't exist. Parameters ---------- name : string Group name. + overwrite : bool, optional + Overwrite any existing array with given `name` if present. Returns ------- @@ -481,20 +474,17 @@ def require_group(self, name): """ - return self._write_op(self._require_group_nosync, name) - - def _require_group_nosync(self, name): + return self._write_op(self._require_group_nosync, name, + overwrite=overwrite) + def _require_group_nosync(self, name, overwrite=False): path = self._item_path(name) - # require all intermediate groups - segments = path.split('/') - for i in range(len(segments) + 1): - p = '/'.join(segments[:i]) - if contains_array(self._store, p): - raise KeyError(name) - elif not contains_group(self._store, p): - init_group(self._store, path=p, chunk_store=self._chunk_store) + # create terminal group if necessary + if not contains_group(self._store, path): + init_group(store=self._store, path=path, + chunk_store=self._chunk_store, + overwrite=overwrite) return Group(self._store, path=path, read_only=self._read_only, chunk_store=self._chunk_store, @@ -504,18 +494,10 @@ def require_groups(self, *names): """Convenience method to require multiple groups in a single call.""" return tuple(self.require_group(name) for name in names) - def _require_parent_group(self, path): - segments = path.split('/') - for i in range(len(segments)): - p = '/'.join(segments[:i]) - if contains_array(self._store, p): - raise KeyError(path) - elif not contains_group(self._store, p): - init_group(self._store, path=p, chunk_store=self._chunk_store) - def create_dataset(self, name, data=None, shape=None, chunks=None, - dtype=None, compressor='default', fill_value=None, - order='C', synchronizer=None, filters=None, **kwargs): + dtype=None, compressor='default', fill_value=0, + order='C', synchronizer=None, filters=None, + overwrite=False, cache_metadata=True, **kwargs): """Create an array. 
Parameters @@ -540,7 +522,15 @@ def create_dataset(self, name, data=None, shape=None, chunks=None, synchronizer : zarr.sync.ArraySynchronizer, optional Array synchronizer. filters : sequence of Codecs, optional - Sequence of filters to use to encode chunk data prior to compression. + Sequence of filters to use to encode chunk data prior to + compression. + overwrite : bool, optional + If True, replace any existing array or group with the given name. + cache_metadata : bool, optional + If True, array configuration metadata will be cached for the + lifetime of the object. If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). Returns ------- @@ -554,7 +544,7 @@ def create_dataset(self, name, data=None, shape=None, chunks=None, ... chunks=(1000, 1000)) >>> d1 Array(/foo, (10000, 10000), float64, chunks=(1000, 1000), order=C) - nbytes: 762.9M; nbytes_stored: 326; ratio: 2453987.7; initialized: 0/100 + nbytes: 762.9M; nbytes_stored: 323; ratio: 2476780.2; initialized: 0/100 compressor: Blosc(cname='lz4', clevel=5, shuffle=1) store: DictStore @@ -564,21 +554,16 @@ def create_dataset(self, name, data=None, shape=None, chunks=None, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, synchronizer=synchronizer, - filters=filters, **kwargs) + filters=filters, overwrite=overwrite, + cache_metadata=cache_metadata, **kwargs) def _create_dataset_nosync(self, name, data=None, shape=None, chunks=None, dtype=None, compressor='default', - fill_value=None, order='C', synchronizer=None, - filters=None, **kwargs): + fill_value=0, order='C', synchronizer=None, + filters=None, overwrite=False, + cache_metadata=True, **kwargs): path = self._item_path(name) - self._require_parent_group(path) - - # guard conditions - if contains_array(self._store, path): - raise KeyError(name) - if contains_group(self._store, path): - raise KeyError(name) # determine synchronizer if synchronizer is None: @@ -590,7 +575,9 @@ def _create_dataset_nosync(self, name, data=None, shape=None, chunks=None, compressor=compressor, fill_value=fill_value, order=order, synchronizer=synchronizer, store=self._store, path=path, - chunk_store=self._chunk_store, filters=filters, **kwargs) + chunk_store=self._chunk_store, filters=filters, + overwrite=overwrite, cache_metadata=cache_metadata, + **kwargs) else: a = create(shape=shape, chunks=chunks, dtype=dtype, @@ -598,6 +585,7 @@ def _create_dataset_nosync(self, name, data=None, shape=None, chunks=None, order=order, synchronizer=synchronizer, store=self._store, path=path, chunk_store=self._chunk_store, filters=filters, + overwrite=overwrite, cache_metadata=cache_metadata, **kwargs) return a @@ -630,8 +618,10 @@ def _require_dataset_nosync(self, name, shape, dtype=None, exact=False, if contains_array(self._store, path): synchronizer = kwargs.get('synchronizer', self._synchronizer) + cache_metadata = kwargs.get('cache_metadata', True) a = Array(self._store, path=path, read_only=self._read_only, - chunk_store=self._chunk_store, synchronizer=synchronizer) + chunk_store=self._chunk_store, + synchronizer=synchronizer, cache_metadata=cache_metadata) shape = normalize_shape(shape) if shape != a.shape: raise TypeError('shapes do not match') @@ -655,7 +645,6 @@ def create(self, name, **kwargs): def _create_nosync(self, name, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) 
return create(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) @@ -667,7 +656,6 @@ def empty(self, name, **kwargs): def _empty_nosync(self, name, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) return empty(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) @@ -679,7 +667,6 @@ def zeros(self, name, **kwargs): def _zeros_nosync(self, name, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) return zeros(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) @@ -691,7 +678,6 @@ def ones(self, name, **kwargs): def _ones_nosync(self, name, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) return ones(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) @@ -703,7 +689,6 @@ def full(self, name, fill_value, **kwargs): def _full_nosync(self, name, fill_value, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) return full(store=self._store, path=path, chunk_store=self._chunk_store, @@ -716,7 +701,6 @@ def array(self, name, data, **kwargs): def _array_nosync(self, name, data, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) return array(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) @@ -728,7 +712,6 @@ def empty_like(self, name, data, **kwargs): def _empty_like_nosync(self, name, data, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) return empty_like(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) @@ -740,7 +723,6 @@ def zeros_like(self, name, data, **kwargs): def _zeros_like_nosync(self, name, data, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) return zeros_like(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) @@ -752,7 +734,6 @@ def ones_like(self, name, data, **kwargs): def _ones_like_nosync(self, name, data, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) return ones_like(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) @@ -764,20 +745,28 @@ def full_like(self, name, data, **kwargs): def _full_like_nosync(self, name, data, **kwargs): path = self._item_path(name) - self._require_parent_group(path) kwargs.setdefault('synchronizer', self._synchronizer) return full_like(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) -def group(store=None, overwrite=False, chunk_store=None, synchronizer=None): +def _handle_store_arg(store): + if store is None: + return DictStore() + elif isinstance(store, str): + return DirectoryStore(store) + else: + return store + + +def group(store=None, overwrite=False, chunk_store=None, synchronizer=None, + path=None): """Create a group. Parameters ---------- - store : MutableMapping, optional - Group storage. If not provided, a DictStore will be used, meaning - that data will be stored in memory. + store : MutableMapping or string + Store or path to directory in file system. 
overwrite : bool, optional If True, delete any pre-existing data in `store` at `path` before creating the group. @@ -786,6 +775,8 @@ def group(store=None, overwrite=False, chunk_store=None, synchronizer=None): for storage of both chunks and metadata. synchronizer : object, optional Array synchronizer. + path : string, optional + Group path. Returns ------- @@ -812,30 +803,26 @@ def group(store=None, overwrite=False, chunk_store=None, synchronizer=None): """ - # ensure store - if store is None: - store = DictStore() + # handle polymorphic store arg + store = _handle_store_arg(store) + path = normalize_storage_path(path) # require group - if overwrite: - init_group(store, overwrite=True, chunk_store=chunk_store) - elif contains_array(store): - raise ValueError('store contains an array') - elif not contains_group(store): - init_group(store, chunk_store=chunk_store) + if overwrite or not contains_group(store): + init_group(store, overwrite=overwrite, chunk_store=chunk_store, + path=path) return Group(store, read_only=False, chunk_store=chunk_store, - synchronizer=synchronizer) + synchronizer=synchronizer, path=path) -def open_group(path, mode='a', synchronizer=None): - """Convenience function to instantiate a group stored in a directory on - the file system. +def open_group(store=None, mode='a', synchronizer=None, path=None): + """Open a group using mode-like semantics. Parameters ---------- - path : string - Path to directory in file system in which to store the group. + store : MutableMapping or string + Store or path to directory in file system. mode : {'r', 'r+', 'a', 'w', 'w-'} Persistence mode: 'r' means read only (must exist); 'r+' means read/write (must exist); 'a' means read/write (create if doesn't @@ -843,6 +830,8 @@ def open_group(path, mode='a', synchronizer=None): (fail if exists). synchronizer : object, optional Array synchronizer. + path : string, optional + Group path. 
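E.g., a sketch of opening a sub-group directly via ``path``, creating it and any parent groups on demand (``data/example.zarr`` is an assumed writable directory)::

    >>> import zarr
    >>> g = zarr.open_group('data/example.zarr', mode='a', path='foo/bar')
    >>> g.path
    'foo/bar'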
Returns ------- @@ -868,35 +857,37 @@ def open_group(path, mode='a', synchronizer=None): """ - # setup store - store = DirectoryStore(path) + # handle polymorphic store arg + store = _handle_store_arg(store) + path = normalize_storage_path(path) # ensure store is initialized if mode in ['r', 'r+']: - if contains_array(store): - raise ValueError('store contains array') - elif not contains_group(store): - raise ValueError('group does not exist') + if contains_array(store, path=path): + err_contains_array(path) + elif not contains_group(store, path=path): + err_group_not_found(path) elif mode == 'w': - init_group(store, overwrite=True) + init_group(store, overwrite=True, path=path) elif mode == 'a': - if contains_array(store): - raise ValueError('store contains array') - elif not contains_group(store): - init_group(store) + if contains_array(store, path=path): + err_contains_array(path) + if not contains_group(store, path=path): + init_group(store, path=path) elif mode in ['w-', 'x']: - if contains_array(store): - raise ValueError('store contains array') - elif contains_group(store): - raise ValueError('store contains group') + if contains_array(store, path=path): + err_contains_array(path) + elif contains_group(store, path=path): + err_contains_group(path) else: - init_group(store) + init_group(store, path=path) # determine read only status read_only = mode == 'r' - return Group(store, read_only=read_only, synchronizer=synchronizer) + return Group(store, read_only=read_only, synchronizer=synchronizer, + path=path) diff --git a/zarr/storage.py b/zarr/storage.py index bab70e7046..c71ea8b655 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -16,7 +16,9 @@ from zarr.meta import encode_array_metadata, encode_group_metadata from zarr.compat import PY2, binary_type from zarr.codecs import codec_registry -from zarr.errors import PermissionError +from zarr.errors import PermissionError, err_contains_group, \ + err_contains_array, err_path_not_found, err_bad_compressor, \ + err_fspath_exists_notdir, err_read_only array_meta_key = '.zarray' @@ -122,6 +124,19 @@ def getsize(store, path=None): return -1 +def _require_parent_group(path, store, chunk_store, overwrite): + # assume path is normalized + if path: + segments = path.split('/') + for i in range(len(segments)): + p = '/'.join(segments[:i]) + if contains_array(store, p): + _init_group_metadata(store, path=p, chunk_store=chunk_store, + overwrite=overwrite) + elif not contains_group(store, p): + _init_group_metadata(store, path=p, chunk_store=chunk_store) + + def init_array(store, shape, chunks, dtype=None, compressor='default', fill_value=None, order='C', overwrite=False, path=None, chunk_store=None, filters=None): @@ -195,11 +210,12 @@ def init_array(store, shape, chunks, dtype=None, compressor='default', Initialize an array using a storage path:: + >>> store = dict() >>> init_array(store, shape=100000000, chunks=1000000, dtype='i1', - ... path='foo/bar') + ... 
path='foo') >>> sorted(store.keys()) - ['.zarray', '.zattrs', 'foo/bar/.zarray', 'foo/bar/.zattrs'] - >>> print(str(store['foo/bar/.zarray'], 'ascii')) + ['.zattrs', '.zgroup', 'foo/.zarray', 'foo/.zattrs'] + >>> print(str(store['foo/.zarray'], 'ascii')) { "chunks": [ 1000000 @@ -231,6 +247,21 @@ def init_array(store, shape, chunks, dtype=None, compressor='default', # normalize path path = normalize_storage_path(path) + # ensure parent group initialized + _require_parent_group(path, store=store, chunk_store=chunk_store, + overwrite=overwrite) + + _init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype, + compressor=compressor, fill_value=fill_value, + order=order, overwrite=overwrite, path=path, + chunk_store=chunk_store, filters=filters) + + +def _init_array_metadata(store, shape, chunks, dtype=None, + compressor='default', + fill_value=None, order='C', overwrite=False, + path=None, chunk_store=None, filters=None): + # guard conditions if overwrite: # attempt to delete any pre-existing items in store @@ -238,9 +269,9 @@ def init_array(store, shape, chunks, dtype=None, compressor='default', if chunk_store is not None and chunk_store != store: rmdir(chunk_store, path) elif contains_array(store, path): - raise ValueError('store contains an array') + err_contains_array(path) elif contains_group(store, path): - raise ValueError('store contains a group') + err_contains_group(path) # normalize metadata shape = normalize_shape(shape) @@ -258,8 +289,7 @@ def init_array(store, shape, chunks, dtype=None, compressor='default', try: compressor_config = compressor.get_config() except AttributeError: - raise ValueError('bad compressor argument; expected Codec object, ' - 'found %r' % compressor) + err_bad_compressor(compressor) else: compressor_config = None @@ -304,7 +334,18 @@ def init_group(store, overwrite=False, path=None, chunk_store=None): # normalize path path = normalize_storage_path(path) - + + # ensure parent group initialized + _require_parent_group(path, store=store, chunk_store=chunk_store, + overwrite=overwrite) + + # initialise metadata + _init_group_metadata(store=store, overwrite=overwrite, path=path, + chunk_store=chunk_store) + + +def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None): + # guard conditions if overwrite: # attempt to delete any pre-existing items in store @@ -312,9 +353,9 @@ def init_group(store, overwrite=False, path=None, chunk_store=None): if chunk_store is not None and chunk_store != store: rmdir(chunk_store, path) elif contains_array(store, path): - raise ValueError('store contains an array') + err_contains_array(path) elif contains_group(store, path): - raise ValueError('store contains a group') + err_contains_group(path) # initialize metadata # N.B., currently no metadata properties are needed, however there may @@ -492,7 +533,7 @@ def getsize(self, path=None): parent, key = self._get_parent(path) value = parent[key] except KeyError: - raise ValueError('path not found: %r' % path) + err_path_not_found(path) else: value = self.root @@ -557,7 +598,7 @@ def __init__(self, path): # guard conditions path = os.path.abspath(path) if os.path.exists(path) and not os.path.isdir(path): - raise ValueError('path exists but is not a directory') + err_fspath_exists_notdir(path) self.path = path @@ -673,7 +714,7 @@ def getsize(self, path=None): size += os.path.getsize(child_fs_path) return size else: - raise ValueError('path not found: %r' % path) + err_path_not_found(path) # noinspection PyPep8Naming @@ -736,7 +777,7 @@ def __getitem__(self, 
key): def __setitem__(self, key, value): if self.mode == 'r': - raise PermissionError('mapping is read-only') + err_read_only() value = ensure_bytes(value) with zipfile.ZipFile(self.path, mode='a', compression=self.compression, @@ -806,7 +847,7 @@ def getsize(self, path=None): info = zf.getinfo(path) return info.compress_size except KeyError: - raise ValueError('path not found: %r' % path) + err_path_not_found(path) else: return 0 diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index b39803ac17..b047bd6b91 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -5,7 +5,7 @@ import atexit import shutil import pickle -import os +from collections import MutableMapping import numpy as np @@ -51,95 +51,35 @@ def test_array_init(self): # store not initialized store = dict() - with assert_raises(ValueError): + with assert_raises(KeyError): Array(store) # group is in the way store = dict() init_group(store, path='baz') - with assert_raises(ValueError): + with assert_raises(KeyError): Array(store, path='baz') - def create_array(self, store=None, path=None, read_only=False, - chunk_store=None, **kwargs): - if store is None: - store = dict() + def create_array(self, read_only=False, **kwargs): + store = dict() kwargs.setdefault('compressor', Zlib(level=1)) - init_array(store, path=path, chunk_store=chunk_store, **kwargs) - return Array(store, path=path, read_only=read_only, - chunk_store=chunk_store) + init_array(store, **kwargs) + return Array(store, read_only=read_only) def test_nbytes_stored(self): - # custom store, does not implement getsize() - class CustomMapping(object): - def __init__(self): - self.inner = dict() - - def __getitem__(self, item): - return self.inner[item] - - def __setitem__(self, item, value): - self.inner[item] = value - - def __contains__(self, item): - return item in self.inner - - store = CustomMapping() - z = self.create_array(store=store, shape=1000, chunks=100) - eq(-1, z.nbytes_stored) - z[:] = 42 - eq(-1, z.nbytes_stored) - - store = dict() - chunk_store = CustomMapping() - z = self.create_array(store=store, chunk_store=chunk_store, - shape=1000, chunks=100) - eq(-1, z.nbytes_stored) - z[:] = 42 - eq(-1, z.nbytes_stored) - # dict as store - store = dict() - z = self.create_array(store=store, shape=1000, chunks=100) + z = self.create_array(shape=1000, chunks=100) expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - if z.store != z.chunk_store: - expect_nbytes_stored += sum(buffer_size(v) for v in - z.chunk_store.values()) eq(expect_nbytes_stored, z.nbytes_stored) z[:] = 42 expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - if z.store != z.chunk_store: - expect_nbytes_stored += sum(buffer_size(v) for v in - z.chunk_store.values()) eq(expect_nbytes_stored, z.nbytes_stored) + # mess with store - store[z._key_prefix + 'foo'] = list(range(10)) + z.store[z._key_prefix + 'foo'] = list(range(10)) eq(-1, z.nbytes_stored) - # for comparison - z = self.create_array(store=dict(), shape=1000, chunks=100, - compressor=Zlib(1)) - z[:] = 42 - - # DirectoryStore - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = DirectoryStore(path) - zz = self.create_array(store=store, shape=1000, chunks=100, - compressor=Zlib(1)) - zz[:] = 42 - eq(z.nbytes_stored, zz.nbytes_stored) - - # ZipStore - if os.path.exists('test.zip'): - os.remove('test.zip') - store = ZipStore('test.zip') - zz = self.create_array(store=store, shape=1000, chunks=100, - compressor=Zlib(1)) - zz[:] = 42 - eq(z.nbytes_stored, zz.nbytes_stored) 
- def test_array_1d(self): a = np.arange(1050) z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) @@ -150,8 +90,8 @@ def test_array_1d(self): eq(a.dtype, z.dtype) eq((100,), z.chunks) eq(a.nbytes, z.nbytes) - eq(sum(len(v) for v in z.store.values()), z.nbytes_stored) - eq(0, z.initialized) + eq(11, z.nchunks) + eq(0, z.nchunks_initialized) eq((11,), z.cdata_shape) # check empty @@ -169,12 +109,8 @@ def test_array_1d(self): # check properties eq(a.nbytes, z.nbytes) - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - if z.store != z.chunk_store: - expect_nbytes_stored += sum(buffer_size(v) for v in - z.chunk_store.values()) - eq(expect_nbytes_stored, z.nbytes_stored) - eq(11, z.initialized) + eq(11, z.nchunks) + eq(11, z.nchunks_initialized) # check slicing assert_array_equal(a, np.array(z)) @@ -239,8 +175,7 @@ def test_array_2d(self): eq(a.shape, z.shape) eq(a.dtype, z.dtype) eq((100, 2), z.chunks) - eq(sum(len(v) for v in z.store.values()), z.nbytes_stored) - eq(0, z.initialized) + eq(0, z.nchunks_initialized) eq((10, 5), z.cdata_shape) # set data @@ -248,11 +183,7 @@ def test_array_2d(self): # check properties eq(a.nbytes, z.nbytes) - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - if z.store != z.chunk_store: - expect_nbytes_stored += sum(buffer_size(v) for v in - z.chunk_store.values()) - eq(50, z.initialized) + eq(50, z.nchunks_initialized) # check slicing assert_array_equal(a, np.array(z)) @@ -546,12 +477,28 @@ def test_repr(self): class TestArrayWithPath(TestArray): @staticmethod - def create_array(store=None, read_only=False, chunk_store=None, **kwargs): - if store is None: - store = dict() - init_array(store, path='foo/bar', chunk_store=chunk_store, **kwargs) - return Array(store, path='foo/bar', read_only=read_only, - chunk_store=chunk_store) + def create_array(read_only=False, **kwargs): + store = dict() + init_array(store, path='foo/bar', **kwargs) + return Array(store, path='foo/bar', read_only=read_only) + + def test_nbytes_stored(self): + + # dict as store + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) + for k, v in z.store.items() + if k.startswith('foo/bar/')) + eq(expect_nbytes_stored, z.nbytes_stored) + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) + for k, v in z.store.items() + if k.startswith('foo/bar/')) + eq(expect_nbytes_stored, z.nbytes_stored) + + # mess with store + z.store[z._key_prefix + 'foo'] = list(range(10)) + eq(-1, z.nbytes_stored) def test_repr(self): if not PY2: @@ -570,21 +517,35 @@ def test_repr(self): class TestArrayWithChunkStore(TestArray): @staticmethod - def create_array(store=None, read_only=False, chunk_store=None, **kwargs): - if store is None: - store = dict() - if chunk_store is None: - # separate chunk store - chunk_store = dict() - init_array(store, path='foo/bar', chunk_store=chunk_store, **kwargs) - return Array(store, path='foo/bar', read_only=read_only, - chunk_store=chunk_store) + def create_array(read_only=False, **kwargs): + store = dict() + # separate chunk store + chunk_store = dict() + init_array(store, chunk_store=chunk_store, **kwargs) + return Array(store, read_only=read_only, chunk_store=chunk_store) + + def test_nbytes_stored(self): + + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored += sum(buffer_size(v) + for v in z.chunk_store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + z[:] = 42 + expect_nbytes_stored = 
sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored += sum(buffer_size(v) + for v in z.chunk_store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + + # mess with store + z.chunk_store[z._key_prefix + 'foo'] = list(range(10)) + eq(-1, z.nbytes_stored) def test_repr(self): if not PY2: z = self.create_array(shape=100, chunks=10, dtype='f4') # flake8: noqa - expect = """Array(/foo/bar, (100,), float32, chunks=(10,), order=C) + expect = """Array((100,), float32, chunks=(10,), order=C) nbytes: 400; nbytes_stored: 293; ratio: 1.4; initialized: 0/10 compressor: Blosc(cname='lz4', clevel=5, shuffle=1) store: dict; chunk_store: dict @@ -597,22 +558,29 @@ def test_repr(self): class TestArrayWithDirectoryStore(TestArray): @staticmethod - def create_array(store=None, read_only=False, chunk_store=None, **kwargs): - if store is None: - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = DirectoryStore(path) - chunk_store = store + def create_array(read_only=False, **kwargs): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + store = DirectoryStore(path) kwargs.setdefault('compressor', Zlib(1)) - init_array(store, path='foo/bar', chunk_store=chunk_store, **kwargs) - return Array(store, path='foo/bar', read_only=read_only, - chunk_store=chunk_store) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_nbytes_stored(self): + + # dict as store + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) def test_repr(self): if not PY2: z = self.create_array(shape=100, chunks=10, dtype='f4') # flake8: noqa - expect = """Array(/foo/bar, (100,), float32, chunks=(10,), order=C) + expect = """Array((100,), float32, chunks=(10,), order=C) nbytes: 400; nbytes_stored: 245; ratio: 1.6; initialized: 0/10 compressor: Zlib(level=1) store: DirectoryStore @@ -624,14 +592,11 @@ def test_repr(self): class TestArrayWithNoCompressor(TestArray): - def create_array(self, store=None, path=None, read_only=False, - chunk_store=None, **kwargs): - if store is None: - store = dict() + def create_array(self, read_only=False, **kwargs): + store = dict() kwargs.setdefault('compressor', None) - init_array(store, path=path, chunk_store=chunk_store, **kwargs) - return Array(store, path=path, read_only=read_only, - chunk_store=chunk_store) + init_array(store, **kwargs) + return Array(store, read_only=read_only) def test_repr(self): if not PY2: @@ -647,15 +612,12 @@ def test_repr(self): class TestArrayWithBZ2Compressor(TestArray): - def create_array(self, store=None, path=None, read_only=False, - chunk_store=None, **kwargs): - if store is None: - store = dict() + def create_array(self, read_only=False, **kwargs): + store = dict() compressor = BZ2(level=1) kwargs.setdefault('compressor', compressor) - init_array(store, path=path, chunk_store=chunk_store, **kwargs) - return Array(store, path=path, read_only=read_only, - chunk_store=chunk_store) + init_array(store, **kwargs) + return Array(store, read_only=read_only) def test_repr(self): if not PY2: @@ -672,15 +634,12 @@ def test_repr(self): class TestArrayWithBloscCompressor(TestArray): - def create_array(self, store=None, path=None, read_only=False, - chunk_store=None, **kwargs): - if store is None: - store = dict() + def create_array(self, read_only=False, **kwargs): + store = dict() 
compressor = Blosc(cname='zstd', clevel=1, shuffle=1) kwargs.setdefault('compressor', compressor) - init_array(store, path=path, chunk_store=chunk_store, **kwargs) - return Array(store, path=path, read_only=read_only, - chunk_store=chunk_store) + init_array(store, **kwargs) + return Array(store, read_only=read_only) def test_repr(self): if not PY2: @@ -701,15 +660,12 @@ def test_repr(self): class TestArrayWithLZMACompressor(TestArray): - def create_array(self, store=None, path=None, read_only=False, - chunk_store=None, **kwargs): - if store is None: - store = dict() + def create_array(self, read_only=False, **kwargs): + store = dict() compressor = LZMA(preset=1) kwargs.setdefault('compressor', compressor) - init_array(store, path=path, chunk_store=chunk_store, **kwargs) - return Array(store, path=path, read_only=read_only, - chunk_store=chunk_store) + init_array(store, **kwargs) + return Array(store, read_only=read_only) def test_repr(self): z = self.create_array(shape=100, chunks=10, dtype='f4') @@ -726,11 +682,8 @@ def test_repr(self): class TestArrayWithFilters(TestArray): @staticmethod - def create_array(store=None, read_only=False, chunk_store=None, **kwargs): - if store is None: - store = dict() - if chunk_store is None: - chunk_store = store + def create_array(read_only=False, **kwargs): + store = dict() dtype = kwargs.get('dtype', None) filters = [ Delta(dtype=dtype), @@ -739,9 +692,8 @@ def create_array(store=None, read_only=False, chunk_store=None, **kwargs): kwargs.setdefault('filters', filters) compressor = Zlib(1) kwargs.setdefault('compressor', compressor) - init_array(store, chunk_store=chunk_store, **kwargs) - return Array(store, read_only=read_only, - chunk_store=chunk_store) + init_array(store, **kwargs) + return Array(store, read_only=read_only) def test_repr(self): if not PY2: @@ -757,3 +709,64 @@ def test_repr(self): actual = repr(z) for l1, l2 in zip(expect.split('\n'), actual.split('\n')): eq(l1, l2) + + +# custom store, does not support getsize() +class CustomMapping(object): + + def __init__(self): + self.inner = dict() + + def keys(self): + return self.inner.keys() + + def __getitem__(self, item): + return self.inner[item] + + def __setitem__(self, item, value): + self.inner[item] = value + + def __delitem__(self, key): + del self.inner[key] + + def __contains__(self, item): + return item in self.inner + + +class TestArrayWithCustomMapping(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + store = CustomMapping() + kwargs.setdefault('compressor', Zlib(1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_nbytes_stored(self): + z = self.create_array(shape=1000, chunks=100) + eq(-1, z.nbytes_stored) + z[:] = 42 + eq(-1, z.nbytes_stored) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + # flake8: noqa + expect = """Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; initialized: 0/10 + compressor: Zlib(level=1) + store: CustomMapping +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayNoCacheMetadata(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + store = dict() + kwargs.setdefault('compressor', Zlib(level=1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=False) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 1269ca0ae5..39d47a360c 100644 --- a/zarr/tests/test_creation.py +++ 
b/zarr/tests/test_creation.py @@ -21,6 +21,34 @@ from zarr.codecs import Zlib +# something bcolz-like +class MockBcolzArray(object): + + def __init__(self, data, chunklen): + self.data = data + self.chunklen = chunklen + + def __getattr__(self, item): + return getattr(self.data, item) + + def __getitem__(self, item): + return self.data[item] + + +# something h5py-like +class MockH5pyDataset(object): + + def __init__(self, data, chunks): + self.data = data + self.chunks = chunks + + def __getattr__(self, item): + return getattr(self.data, item) + + def __getitem__(self, item): + return self.data[item] + + def test_array(): # with numpy array @@ -44,25 +72,23 @@ def test_array(): eq(z.dtype, z2.dtype) assert_array_equal(z[:], z2[:]) - # with something bcolz-like - class MockBcolzArray(object): - - def __init__(self, data, chunklen): - self.data = data - self.chunklen = chunklen - - def __getattr__(self, item): - return getattr(self.data, item) - - def __getitem__(self, item): - return self.data[item] - b = np.arange(1000).reshape(100, 10) c = MockBcolzArray(b, 10) z3 = array(c) eq(c.shape, z3.shape) eq((10, 10), z3.chunks) + b = np.arange(1000).reshape(100, 10) + c = MockH5pyDataset(b, chunks=(10, 2)) + z4 = array(c) + eq(c.shape, z4.shape) + eq((10, 2), z4.chunks) + + c = MockH5pyDataset(b, chunks=None) + z5 = array(c) + eq(c.shape, z5.shape) + assert_is_instance(z5.chunks, tuple) + def test_empty(): z = empty(100, chunks=10) @@ -101,10 +127,10 @@ def test_full(): def test_open_array(): - path = 'example' + store = 'example' # mode == 'w' - z = open_array(path, mode='w', shape=100, chunks=10) + z = open_array(store, mode='w', shape=100, chunks=10) z[:] = 42 assert_is_instance(z, Array) assert_is_instance(z.store, DirectoryStore) @@ -115,11 +141,11 @@ def test_open_array(): # mode in 'r', 'r+' open_group('example_group', mode='w') for mode in 'r', 'r+': - with assert_raises(ValueError): + with assert_raises(KeyError): open_array('doesnotexist', mode=mode) - with assert_raises(ValueError): + with assert_raises(KeyError): open_array('example_group', mode=mode) - z = open_array(path, mode='r') + z = open_array(store, mode='r') assert_is_instance(z, Array) assert_is_instance(z.store, DirectoryStore) eq((100,), z.shape) @@ -127,7 +153,7 @@ def test_open_array(): assert_array_equal(np.full(100, fill_value=42), z[:]) with assert_raises(PermissionError): z[:] = 43 - z = open_array(path, mode='r+') + z = open_array(store, mode='r+') assert_is_instance(z, Array) assert_is_instance(z.store, DirectoryStore) eq((100,), z.shape) @@ -137,38 +163,44 @@ def test_open_array(): assert_array_equal(np.full(100, fill_value=43), z[:]) # mode == 'a' - shutil.rmtree(path) - z = open_array(path, mode='a', shape=100, chunks=10) + shutil.rmtree(store) + z = open_array(store, mode='a', shape=100, chunks=10) z[:] = 42 assert_is_instance(z, Array) assert_is_instance(z.store, DirectoryStore) eq((100,), z.shape) eq((10,), z.chunks) assert_array_equal(np.full(100, fill_value=42), z[:]) - with assert_raises(ValueError): + with assert_raises(KeyError): open_array('example_group', mode='a') # mode in 'w-', 'x' for mode in 'w-', 'x': - shutil.rmtree(path) - z = open_array(path, mode=mode, shape=100, chunks=10) + shutil.rmtree(store) + z = open_array(store, mode=mode, shape=100, chunks=10) z[:] = 42 assert_is_instance(z, Array) assert_is_instance(z.store, DirectoryStore) eq((100,), z.shape) eq((10,), z.chunks) assert_array_equal(np.full(100, fill_value=42), z[:]) - with assert_raises(ValueError): - open_array(path, mode=mode) - 
+        with assert_raises(KeyError):
+            open_array(store, mode=mode)
+        with assert_raises(KeyError):
             open_array('example_group', mode=mode)

     # with synchronizer
-    z = open_array(path, synchronizer=ThreadSynchronizer())
+    z = open_array(store, synchronizer=ThreadSynchronizer())
+    assert_is_instance(z, Array)
+
+    # with path
+    z = open_array(store, shape=100, path='foo/bar', mode='w')
     assert_is_instance(z, Array)
+    eq('foo/bar', z.path)


 def test_empty_like():
+    # zarr array
     z = empty(100, chunks=10, dtype='f4', compressor=Zlib(5),
               order='F')
@@ -179,6 +211,7 @@
     eq(z.compressor.get_config(), z2.compressor.get_config())
     eq(z.fill_value, z2.fill_value)
     eq(z.order, z2.order)
+
     # numpy array
     a = np.empty(100, dtype='f4')
     z3 = empty_like(a)
@@ -186,11 +219,27 @@
     eq((100,), z3.chunks)
     eq(a.dtype, z3.dtype)
     assert_is_none(z3.fill_value)
+
     # something slightly silly
     a = [0] * 100
     z3 = empty_like(a, shape=200)
     eq((200,), z3.shape)

+    # other array-likes
+    b = np.arange(1000).reshape(100, 10)
+    c = MockBcolzArray(b, 10)
+    z = empty_like(c)
+    eq(b.shape, z.shape)
+    eq((10, 10), z.chunks)
+    c = MockH5pyDataset(b, chunks=(10, 2))
+    z = empty_like(c)
+    eq(b.shape, z.shape)
+    eq((10, 2), z.chunks)
+    c = MockH5pyDataset(b, chunks=None)
+    z = empty_like(c)
+    eq(b.shape, z.shape)
+    assert_is_instance(z.chunks, tuple)
+

 def test_zeros_like():
     # zarr array
@@ -275,7 +324,7 @@ def test_open_like():
     eq(a.shape, z3.shape)
     eq((10,), z3.chunks)
     eq(a.dtype, z3.dtype)
-    assert_is_none(z3.fill_value)
+    eq(0, z3.fill_value)


 def test_create():
@@ -287,7 +336,7 @@
     eq((100,), z.chunks)  # auto-chunks
     eq(np.dtype(None), z.dtype)
     eq('blosc', z.compressor.codec_id)
-    assert_is_none(z.fill_value)
+    eq(0, z.fill_value)

     # all specified
     z = create(100, chunks=10, dtype='i4', compressor=Zlib(1),
@@ -325,3 +374,23 @@
     # errors
     with assert_raises(ValueError):
         create(100, compression=1)
+
+
+def test_compression_args():
+
+    z = create(100, compression='zlib', compression_opts=9)
+    assert_is_instance(z, Array)
+    eq('zlib', z.compressor.codec_id)
+    eq(9, z.compressor.level)
+
+    # 'compressor' overrides 'compression'
+    z = create(100, compressor=Zlib(9), compression='bz2', compression_opts=1)
+    assert_is_instance(z, Array)
+    eq('zlib', z.compressor.codec_id)
+    eq(9, z.compressor.level)
+
+    # 'compressor' ignores 'compression_opts'
+    z = create(100, compressor=Zlib(9), compression_opts=1)
+    assert_is_instance(z, Array)
+    eq('zlib', z.compressor.codec_id)
+    eq(9, z.compressor.level)
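Besides renaming ``path`` to ``store`` and switching the expected exception to ``KeyError``, the edits above record that ``create()`` and ``open_like()`` now default ``fill_value`` to 0 rather than None. The new ``test_compression_args()`` pins down how the legacy h5py-style keywords interact with ``compressor``; condensed into a sketch (same calls as the test)::

    from zarr import create
    from zarr.codecs import Zlib

    # the legacy h5py-style keywords still work...
    z = create(100, compression='zlib', compression_opts=9)
    assert ('zlib', 9) == (z.compressor.codec_id, z.compressor.level)

    # ...but an explicit 'compressor' takes precedence over both
    z = create(100, compressor=Zlib(9), compression='bz2', compression_opts=1)
    assert ('zlib', 9) == (z.compressor.codec_id, z.compressor.level)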
diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py
index 9ddc2f53f1..a17372620d 100644
--- a/zarr/tests/test_hierarchy.py
+++ b/zarr/tests/test_hierarchy.py
@@ -66,14 +66,14 @@ def test_group_init_2(self):
     def test_group_init_errors_1(self):
         store, chunk_store = self.create_store()
         # group metadata not initialized
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             Group(store, chunk_store=chunk_store)

     def test_group_init_errors_2(self):
         store, chunk_store = self.create_store()
         init_array(store, shape=1000, chunks=100, chunk_store=chunk_store)
         # array blocks group
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             Group(store, chunk_store=chunk_store)

     def test_create_group(self):
@@ -342,6 +342,31 @@ def test_create_errors(self):
         with assert_raises(PermissionError):
             g.require_dataset('zzz', shape=100, chunks=10)

+    def test_create_overwrite(self):
+        try:
+            for method_name in 'create_dataset', 'create', 'empty', 'zeros', \
+                    'ones':
+                g = self.create_group()
+                getattr(g, method_name)('foo', shape=100, chunks=10)
+
+                # overwrite array with array
+                d = getattr(g, method_name)('foo', shape=200, chunks=20,
+                                            overwrite=True)
+                eq((200,), d.shape)
+                # overwrite array with group
+                g2 = g.create_group('foo', overwrite=True)
+                eq(0, len(g2))
+                # overwrite group with array
+                d = getattr(g, method_name)('foo', shape=300, chunks=30,
+                                            overwrite=True)
+                eq((300,), d.shape)
+                # overwrite array with an array nested under a new group
+                d = getattr(g, method_name)('foo/bar', shape=400, chunks=40,
+                                            overwrite=True)
+                assert_is_instance(g['foo'], Group)
+        except NotImplementedError:
+            pass
+
     def test_getitem_contains_iterators(self):
         # setup
         g1 = self.create_group()
@@ -760,7 +785,7 @@ def test_group():
     # overwrite behaviour
     store = dict()
     init_array(store, shape=100, chunks=10)
-    with assert_raises(ValueError):
+    with assert_raises(KeyError):
         group(store)
     g = group(store, overwrite=True)
     assert_is_instance(g, Group)
@@ -770,10 +795,10 @@

 def test_open_group():
     # test the open_group() convenience function

-    path = 'example'
+    store = 'example'

     # mode == 'w'
-    g = open_group(path, mode='w')
+    g = open_group(store, mode='w')
     assert_is_instance(g, Group)
     assert_is_instance(g.store, DirectoryStore)
     eq(0, len(g))
@@ -783,42 +808,47 @@

     # mode in 'r', 'r+'
     open_array('example_array', shape=100, chunks=10, mode='w')
     for mode in 'r', 'r+':
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             open_group('doesnotexist', mode=mode)
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             open_group('example_array', mode=mode)
-    g = open_group(path, mode='r')
+    g = open_group(store, mode='r')
     assert_is_instance(g, Group)
     eq(2, len(g))
     with assert_raises(PermissionError):
         g.create_group('baz')
-    g = open_group(path, mode='r+')
+    g = open_group(store, mode='r+')
     assert_is_instance(g, Group)
     eq(2, len(g))
     g.create_groups('baz', 'quux')
     eq(4, len(g))

     # mode == 'a'
-    shutil.rmtree(path)
-    g = open_group(path, mode='a')
+    shutil.rmtree(store)
+    g = open_group(store, mode='a')
     assert_is_instance(g, Group)
     assert_is_instance(g.store, DirectoryStore)
     eq(0, len(g))
     g.create_groups('foo', 'bar')
     eq(2, len(g))
-    with assert_raises(ValueError):
+    with assert_raises(KeyError):
         open_group('example_array', mode='a')

     # mode in 'w-', 'x'
     for mode in 'w-', 'x':
-        shutil.rmtree(path)
-        g = open_group(path, mode=mode)
+        shutil.rmtree(store)
+        g = open_group(store, mode=mode)
         assert_is_instance(g, Group)
         assert_is_instance(g.store, DirectoryStore)
         eq(0, len(g))
         g.create_groups('foo', 'bar')
         eq(2, len(g))
-        with assert_raises(ValueError):
-            open_group(path, mode=mode)
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
+            open_group(store, mode=mode)
+        with assert_raises(KeyError):
             open_group('example_array', mode=mode)
+
+    # open with path
+    g = open_group(store, path='foo/bar')
+    assert_is_instance(g, Group)
+    eq('foo/bar', g.path)
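A quick sketch of the ``overwrite`` keyword that ``test_create_overwrite()`` drives through every array creation method (hypothetical in-memory group; the same keyword applies to ``create_dataset``, ``create``, ``empty``, ``zeros`` and ``ones``)::

    import zarr

    g = zarr.group()                     # group backed by an in-memory store
    g.zeros('foo', shape=100, chunks=10)
    d = g.zeros('foo', shape=200, chunks=20, overwrite=True)  # array over array
    assert (200,) == d.shape
    g2 = g.create_group('foo', overwrite=True)                # group over array
    assert 0 == len(g2)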
diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py
index 089e3e32b9..1622c8f049 100644
--- a/zarr/tests/test_storage.py
+++ b/zarr/tests/test_storage.py
@@ -168,17 +168,17 @@ def test_hierarchy(self):
         eq(6, store.getsize('c/e'))
         eq(3, store.getsize('c/e/f'))
         eq(3, store.getsize('c/e/g'))
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             store.getsize('x')
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             store.getsize('a/x')
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             store.getsize('c/x')
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             store.getsize('c/x/y')
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             store.getsize('c/d/y')
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             store.getsize('c/d/y/z')

         # test listdir (optional)
@@ -253,7 +253,7 @@ def test_init_array_overwrite(self):
         )

         # don't overwrite (default)
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             init_array(store, shape=1000, chunks=100)

         # do overwrite
@@ -306,7 +306,7 @@ def test_init_array_overwrite_path(self):
         store[path + '/' + array_meta_key] = encode_array_metadata(meta)

         # don't overwrite
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             init_array(store, shape=1000, chunks=100, path=path)

         # do overwrite
@@ -316,7 +316,8 @@
         except NotImplementedError:
             pass
         else:
-            assert array_meta_key in store
+            assert group_meta_key in store
+            assert array_meta_key not in store
             assert (path + '/' + array_meta_key) in store
             # should have been overwritten
             meta = decode_array_metadata(store[path + '/' + array_meta_key])
@@ -324,12 +325,6 @@
             eq((1000,), meta['shape'])
             eq((100,), meta['chunks'])
             eq(np.dtype('i4'), meta['dtype'])
-            # should have been left untouched
-            meta = decode_array_metadata(store[array_meta_key])
-            eq(ZARR_FORMAT, meta['zarr_format'])
-            eq((2000,), meta['shape'])
-            eq((200,), meta['chunks'])
-            eq(np.dtype('u1'), meta['dtype'])

     def test_init_array_overwrite_group(self):
         # setup
@@ -338,7 +333,7 @@
         store[path + '/' + group_meta_key] = encode_group_metadata()

         # don't overwrite
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             init_array(store, shape=1000, chunks=100, path=path)

         # do overwrite
@@ -373,7 +368,7 @@ def test_init_array_overwrite_chunk_store(self):
         chunk_store['1'] = b'bbb'

         # don't overwrite (default)
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             init_array(store, shape=1000, chunks=100,
                        chunk_store=chunk_store)

         # do overwrite
@@ -425,7 +420,7 @@ def test_init_group_overwrite(self):
         )

         # don't overwrite array (default)
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             init_group(store)

         # do overwrite
@@ -440,7 +435,7 @@
         eq(ZARR_FORMAT, meta['zarr_format'])

         # don't overwrite group
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             init_group(store)

     def test_init_group_overwrite_path(self):
@@ -458,7 +453,7 @@
         store[path + '/' + array_meta_key] = encode_array_metadata(meta)

         # don't overwrite
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             init_group(store, path=path)

         # do overwrite
@@ -467,18 +462,13 @@
         except NotImplementedError:
             pass
         else:
-            assert array_meta_key in store
+            assert array_meta_key not in store
+            assert group_meta_key in store
             assert (path + '/' + array_meta_key) not in store
             assert (path + '/' + group_meta_key) in store
             # should have been overwritten
             meta = decode_group_metadata(store[path + '/' + group_meta_key])
             eq(ZARR_FORMAT, meta['zarr_format'])
-            # should have been left untouched
-            meta = decode_array_metadata(store[array_meta_key])
-            eq(ZARR_FORMAT, meta['zarr_format'])
-            eq((2000,), meta['shape'])
-            eq((200,), meta['chunks'])
-            eq(np.dtype('u1'), meta['dtype'])

     def test_init_group_overwrite_chunk_store(self):
         # setup
@@ -497,7 +487,7 @@ def test_init_group_overwrite_chunk_store(self):
         chunk_store['baz'] = b'quux'

         # don't overwrite array (default)
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             init_group(store, chunk_store=chunk_store)

         # do overwrite
@@ -514,7 +504,7 @@ def test_init_group_overwrite_chunk_store(self):
         assert 'baz' not in chunk_store

         # don't overwrite group
-        with assert_raises(ValueError):
+        with assert_raises(KeyError):
             init_group(store)
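The same ``ValueError`` to ``KeyError`` change covers ``init_array()`` and ``init_group()`` when the store already holds metadata at the target path, as the overwrite tests above assert. Sketch::

    from zarr.storage import init_array

    store = dict()
    init_array(store, shape=1000, chunks=100)
    try:
        init_array(store, shape=1000, chunks=100)  # store already contains an array
    except KeyError:
        pass  # pass overwrite=True to replace the existing array instead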
diff --git a/zarr/tests/test_sync.py b/zarr/tests/test_sync.py
index bed27f7d9e..4e2bfcd3b9 100644
--- a/zarr/tests/test_sync.py
+++ b/zarr/tests/test_sync.py
@@ -78,7 +78,8 @@ def create_array(self, store=None, path=None, read_only=False,
         atexit.register(shutil.rmtree, sync_path)
         synchronizer = ProcessSynchronizer(sync_path)
         return Array(store, path=path, synchronizer=synchronizer,
-                     read_only=read_only, chunk_store=chunk_store)
+                     read_only=read_only, chunk_store=chunk_store,
+                     cache_metadata=False)

     def test_repr(self):
         if not PY2:
diff --git a/zarr/util.py b/zarr/util.py
index 525e537dad..97da2f8e51 100644
--- a/zarr/util.py
+++ b/zarr/util.py
@@ -12,6 +12,9 @@
 def normalize_shape(shape):
     """Convenience function to normalize the `shape` argument."""

+    if shape is None:
+        raise TypeError('shape is None')
+
     # handle 1D convenience form
     if isinstance(shape, integer_types):
         shape = (int(shape),)
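The new guard in ``normalize_shape()`` turns a missing ``shape`` into an immediate, clearly-worded ``TypeError``. For example::

    from zarr.util import normalize_shape

    assert (100,) == normalize_shape(100)        # 1D convenience form
    assert (10, 20) == normalize_shape((10, 20))
    normalize_shape(None)                        # raises TypeError: shape is None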