From 45186538b0525a5c3bf5a4420ae0f27adf37c050 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Mon, 21 Apr 2025 18:18:54 +0100 Subject: [PATCH 01/25] More consistent store docstrings (#2976) Co-authored-by: Davis Bennett --- src/zarr/storage/_fsspec.py | 2 +- src/zarr/storage/_local.py | 2 +- src/zarr/storage/_logging.py | 2 +- src/zarr/storage/_memory.py | 8 +++++--- src/zarr/storage/_obstore.py | 3 ++- src/zarr/storage/_wrapper.py | 3 ++- src/zarr/storage/_zip.py | 2 +- 7 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index a4730a93d9..40f1b2fbc0 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -32,7 +32,7 @@ class FsspecStore(Store): """ - A remote Store based on FSSpec + Store for remote data based on FSSpec. Parameters ---------- diff --git a/src/zarr/storage/_local.py b/src/zarr/storage/_local.py index bd5bfc1da2..85d244f17b 100644 --- a/src/zarr/storage/_local.py +++ b/src/zarr/storage/_local.py @@ -67,7 +67,7 @@ def _put( class LocalStore(Store): """ - Local file system store. + Store for the local file system. Parameters ---------- diff --git a/src/zarr/storage/_logging.py b/src/zarr/storage/_logging.py index 5f1a97acd9..a2164a418f 100644 --- a/src/zarr/storage/_logging.py +++ b/src/zarr/storage/_logging.py @@ -24,7 +24,7 @@ class LoggingStore(WrapperStore[T_Store]): """ - Store wrapper that logs all calls to the wrapped store. + Store that logs all calls to another wrapped store. Parameters ---------- diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index b37fc8d5c9..ea25f82a3b 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -19,7 +19,7 @@ class MemoryStore(Store): """ - In-memory store. + Store for local memory. 
Parameters ---------- @@ -173,8 +173,10 @@ async def list_dir(self, prefix: str) -> AsyncIterator[str]: class GpuMemoryStore(MemoryStore): - """A GPU only memory store that stores every chunk in GPU memory irrespective - of the original location. + """ + Store for GPU memory. + + Stores every chunk in GPU memory irrespective of the original location. The dictionary of buffers to initialize this memory store with *must* be GPU Buffers. diff --git a/src/zarr/storage/_obstore.py b/src/zarr/storage/_obstore.py index 4381acb2ae..8c2469747d 100644 --- a/src/zarr/storage/_obstore.py +++ b/src/zarr/storage/_obstore.py @@ -37,7 +37,8 @@ class ObjectStore(Store): - """A Zarr store that uses obstore for fast read/write from AWS, GCP, Azure. + """ + Store that uses obstore for fast read/write from AWS, GCP, Azure. Parameters ---------- diff --git a/src/zarr/storage/_wrapper.py b/src/zarr/storage/_wrapper.py index 349048e495..f21d378191 100644 --- a/src/zarr/storage/_wrapper.py +++ b/src/zarr/storage/_wrapper.py @@ -18,7 +18,8 @@ class WrapperStore(Store, Generic[T_Store]): """ - A store class that wraps an existing ``Store`` instance. + Store that wraps an existing Store. + By default all of the store methods are delegated to the wrapped store instance, which is accessible via the ``._store`` attribute of this class. diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index bbfe6c67aa..f9eb8d8808 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -24,7 +24,7 @@ class ZipStore(Store): """ - Storage class using a ZIP file. + Store using a ZIP file. 
Parameters ---------- From cf879eb78cfaebe85c5cc42fd2bd0daab898b023 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Tue, 22 Apr 2025 23:18:26 +0100 Subject: [PATCH 02/25] Improve array and group docstringspc (#2975) --- src/zarr/core/array.py | 4 +++- src/zarr/core/group.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f2c88c508b..62efe44e4c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1711,7 +1711,9 @@ def _info( # TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed @dataclass(frozen=False) class Array: - """Instantiate an array from an initialized store.""" + """ + A Zarr array. + """ _async_array: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 925252ccf0..3f8dad1740 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1744,6 +1744,10 @@ async def move(self, source: str, dest: str) -> None: @dataclass(frozen=True) class Group(SyncMixin): + """ + A Zarr group. 
+ """ + _async_group: AsyncGroup @classmethod From 630897b7ba55a0c6d0aa376b86399a55ef8088b5 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 23 Apr 2025 16:51:30 +0200 Subject: [PATCH 03/25] 3.0.7 release notes (#3008) * create 3.0.7 release notes * describe misc changes * add pr link for 2997 --- changes/1661.feature.rst | 1 - changes/2622.feature.rst | 1 - changes/2714.misc.rst | 1 - changes/2718.bugfix.rst | 3 --- changes/2802.fix.rst | 1 - changes/2924.chore.rst | 2 -- changes/2944.misc.rst | 1 - changes/2991.doc.rst | 1 - changes/2996.bugfix.rst | 4 ---- docs/release-notes.rst | 36 ++++++++++++++++++++++++++++++++++++ 10 files changed, 36 insertions(+), 15 deletions(-) delete mode 100644 changes/1661.feature.rst delete mode 100644 changes/2622.feature.rst delete mode 100644 changes/2714.misc.rst delete mode 100644 changes/2718.bugfix.rst delete mode 100644 changes/2802.fix.rst delete mode 100644 changes/2924.chore.rst delete mode 100644 changes/2944.misc.rst delete mode 100644 changes/2991.doc.rst delete mode 100644 changes/2996.bugfix.rst diff --git a/changes/1661.feature.rst b/changes/1661.feature.rst deleted file mode 100644 index 38d60b23c1..0000000000 --- a/changes/1661.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add experimental ObjectStore storage class based on obstore. \ No newline at end of file diff --git a/changes/2622.feature.rst b/changes/2622.feature.rst deleted file mode 100644 index f5c7cbe192..0000000000 --- a/changes/2622.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``zarr.from_array`` using concurrent streaming of source data \ No newline at end of file diff --git a/changes/2714.misc.rst b/changes/2714.misc.rst deleted file mode 100644 index 9ab55089d2..0000000000 --- a/changes/2714.misc.rst +++ /dev/null @@ -1 +0,0 @@ -Make warning filters in the tests more specific, so warnings emitted by tests added in the future are more likely to be caught instead of ignored. 
diff --git a/changes/2718.bugfix.rst b/changes/2718.bugfix.rst deleted file mode 100644 index 48ddf8b5a8..0000000000 --- a/changes/2718.bugfix.rst +++ /dev/null @@ -1,3 +0,0 @@ -0-dimensional arrays are now returning a scalar. Therefore, the return type of ``__getitem__`` changed -to NDArrayLikeOrScalar. This change is to make the behavior of 0-dimensional arrays consistent with -``numpy`` scalars. \ No newline at end of file diff --git a/changes/2802.fix.rst b/changes/2802.fix.rst deleted file mode 100644 index 471ddf66f4..0000000000 --- a/changes/2802.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization \ No newline at end of file diff --git a/changes/2924.chore.rst b/changes/2924.chore.rst deleted file mode 100644 index 7bfbb2e1c7..0000000000 --- a/changes/2924.chore.rst +++ /dev/null @@ -1,2 +0,0 @@ -Define a new versioning policy based on Effective Effort Versioning. This replaces the old -Semantic Versioning-based policy. \ No newline at end of file diff --git a/changes/2944.misc.rst b/changes/2944.misc.rst deleted file mode 100644 index 48356a1fef..0000000000 --- a/changes/2944.misc.rst +++ /dev/null @@ -1 +0,0 @@ -Avoid an unnecessary memory copy when writing Zarr to a local file diff --git a/changes/2991.doc.rst b/changes/2991.doc.rst deleted file mode 100644 index 828cfcdb2f..0000000000 --- a/changes/2991.doc.rst +++ /dev/null @@ -1 +0,0 @@ -Updated the 3.0 migration guide to include the removal of "." syntax for getting group members. diff --git a/changes/2996.bugfix.rst b/changes/2996.bugfix.rst deleted file mode 100644 index 977dc79d0b..0000000000 --- a/changes/2996.bugfix.rst +++ /dev/null @@ -1,4 +0,0 @@ -Fixes `ConsolidatedMetadata` serialization of `nan`, `inf`, and `-inf` to be -consistent with the behavior of `ArrayMetadata`. 
- - diff --git a/docs/release-notes.rst b/docs/release-notes.rst index c585e4f0d3..341a32c364 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -3,6 +3,42 @@ Release notes .. towncrier release notes start +3.0.7 (2025-04-22) +------------------ + +Features +~~~~~~~~ + +- Add experimental ObjectStore storage class based on obstore. (:issue:`1661`) +- Add ``zarr.from_array`` using concurrent streaming of source data (:issue:`2622`) + + +Bugfixes +~~~~~~~~ + +- 0-dimensional arrays are now returning a scalar. Therefore, the return type of ``__getitem__`` changed + to NDArrayLikeOrScalar. This change is to make the behavior of 0-dimensional arrays consistent with + ``numpy`` scalars. (:issue:`2718`) +- Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization (:issue:`2802`) +- Fixes `ConsolidatedMetadata` serialization of `nan`, `inf`, and `-inf` to be + consistent with the behavior of `ArrayMetadata`. (:issue:`2996`) + + +Improved Documentation +~~~~~~~~~~~~~~~~~~~~~~ + +- Updated the 3.0 migration guide to include the removal of "." syntax for getting group members. (:issue:`2991`, :issue:`2997`) + + +Misc +~~~~ +- Define a new versioning policy based on Effective Effort Versioning. This replaces the old Semantic + Versioning-based policy. (:issue:`2924`, :issue:`2910`) +- Make warning filters in the tests more specific, so warnings emitted by tests added in the future + are more likely to be caught instead of ignored. 
(:issue:`2714`) +- Avoid an unnecessary memory copy when writing Zarr to a local file (:issue:`2944`) + + 3.0.6 (2025-03-20) ------------------ From 5f4aeb457072a503d92e6c63c6b66f920cb91611 Mon Sep 17 00:00:00 2001 From: Altay Sansal Date: Thu, 24 Apr 2025 11:26:31 -0500 Subject: [PATCH 04/25] remove debug print statement (#3007) --- src/zarr/core/metadata/v2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 11f14b37aa..d19193963f 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -294,7 +294,6 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: """Handle structured dtype/fill value pairs""" - print("FILL VALUE", fill_value, "DT", dtype) try: if isinstance(fill_value, list): return np.array([tuple(fill_value)], dtype=dtype)[0] From 0351c4e524e60b849dbaca2c00fabee20e882f46 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 25 Apr 2025 08:27:45 -0600 Subject: [PATCH 05/25] hypothesis: Don't generate node name: 'zarr.json' (#3020) --- src/zarr/testing/strategies.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index f2dc38483a..663d46034d 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -96,11 +96,15 @@ def clear_store(x: Store) -> Store: zarr_key_chars = st.sampled_from( ".-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz" ) -node_names = st.text(zarr_key_chars, min_size=1).filter( - lambda t: t not in (".", "..") and not t.startswith("__") +node_names = ( + st.text(zarr_key_chars, min_size=1) + .filter(lambda t: t not in (".", "..") and not t.startswith("__")) + .filter(lambda name: name.lower() != "zarr.json") ) -short_node_names = st.text(zarr_key_chars, max_size=3, min_size=1).filter( - lambda t: t not in (".", "..") and not 
t.startswith("__") +short_node_names = ( + st.text(zarr_key_chars, max_size=3, min_size=1) + .filter(lambda t: t not in (".", "..") and not t.startswith("__")) + .filter(lambda name: name.lower() != "zarr.json") ) array_names = node_names attrs = st.none() | st.dictionaries(_attr_keys, _attr_values) From 0c7677890d497918ae11d7a633c5fac93781f211 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Wed, 30 Apr 2025 18:10:38 +0200 Subject: [PATCH 06/25] (fix): structured dtype fill value consolidated metadata (#3015) * (fix): structured dtype consolidated metadata fill value * (chore): relnote * (chore): test * (fix): more robust testing --------- Co-authored-by: Davis Bennett --- changes/2998.bugfix.md | 1 + src/zarr/core/group.py | 9 ++++++++- tests/test_metadata/test_v2.py | 25 +++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 changes/2998.bugfix.md diff --git a/changes/2998.bugfix.md b/changes/2998.bugfix.md new file mode 100644 index 0000000000..7b94223122 --- /dev/null +++ b/changes/2998.bugfix.md @@ -0,0 +1 @@ +Fix structured `dtype` fill value serialization for consolidated metadata \ No newline at end of file diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 3f8dad1740..3f4f15b9e9 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import base64 import itertools import json import logging @@ -358,7 +359,13 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: d[f"{k}/{ZATTRS_JSON}"] = _replace_special_floats(attrs) if "shape" in v: # it's an array - d[f"{k}/{ZARRAY_JSON}"] = _replace_special_floats(v) + if isinstance(v.get("fill_value", None), np.void): + v["fill_value"] = base64.standard_b64encode( + cast(bytes, v["fill_value"]) + ).decode("ascii") + else: + v = _replace_special_floats(v) + d[f"{k}/{ZARRAY_JSON}"] = v else: d[f"{k}/{ZGROUP_JSON}"] = { "zarr_format": self.zarr_format, diff --git 
a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 4600a977d4..08b9cb2507 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -316,3 +316,28 @@ def test_zstd_checksum() -> None: arr.metadata.to_buffer_dict(default_buffer_prototype())[".zarray"].to_bytes() ) assert "checksum" not in metadata["compressor"] + + +@pytest.mark.parametrize( + "fill_value", [None, np.void((0, 0), np.dtype([("foo", "i4"), ("bar", "i4")]))] +) +def test_structured_dtype_fill_value_serialization(tmp_path, fill_value): + group_path = tmp_path / "test.zarr" + root_group = zarr.open_group(group_path, mode="w", zarr_format=2) + dtype = np.dtype([("foo", "i4"), ("bar", "i4")]) + root_group.create_array( + name="structured_dtype", + shape=(100, 100), + chunks=(100, 100), + dtype=dtype, + fill_value=fill_value, + ) + + zarr.consolidate_metadata(root_group.store, zarr_format=2) + root_group = zarr.open_group(group_path, mode="r") + assert ( + root_group.metadata.consolidated_metadata.to_dict()["metadata"]["structured_dtype"][ + "fill_value" + ] + == fill_value + ) From 36a1bac850c50fbaa1708f8986edd5766f5fd67b Mon Sep 17 00:00:00 2001 From: David Stansby Date: Wed, 30 Apr 2025 17:55:27 +0100 Subject: [PATCH 07/25] Fix specifying memory order in v2 arrays (#2951) * Fix specifying memory order in v2 arrays * Re-work test_order * Fix getting array order in v3 * Fix order of arrays for v2 * Fix order with V3 arrays * Fix mypy * Remove errant print() * Fix order with v3 arrays * Fix v2 test * Add numpy order parametrization * Add changelog entry --- changes/2950.bufgix.rst | 1 + src/zarr/api/asynchronous.py | 17 +++++++-------- src/zarr/core/array.py | 15 ++++++++++--- tests/test_api.py | 7 +++--- tests/test_array.py | 42 ++++++++++++++++++++---------------- tests/test_v2.py | 41 ++++++++++++++++++----------------- 6 files changed, 68 insertions(+), 55 deletions(-) create mode 100644 changes/2950.bufgix.rst diff --git a/changes/2950.bufgix.rst 
b/changes/2950.bufgix.rst new file mode 100644 index 0000000000..67cd61f377 --- /dev/null +++ b/changes/2950.bufgix.rst @@ -0,0 +1 @@ +Specifying the memory order of Zarr format 2 arrays using the ``order`` keyword argument has been fixed. diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 285d777258..9b8b43a517 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1040,15 +1040,13 @@ async def create( ) warnings.warn(UserWarning(msg), stacklevel=1) config_dict["write_empty_chunks"] = write_empty_chunks - if order is not None: - if config is not None: - msg = ( - "Both order and config keyword arguments are set. " - "This is redundant. When both are set, order will be ignored and " - "config will be used." - ) - warnings.warn(UserWarning(msg), stacklevel=1) - config_dict["order"] = order + if order is not None and config is not None: + msg = ( + "Both order and config keyword arguments are set. " + "This is redundant. When both are set, order will be ignored and " + "config will be used." 
+ ) + warnings.warn(UserWarning(msg), stacklevel=1) config_parsed = ArrayConfig.from_dict(config_dict) @@ -1062,6 +1060,7 @@ async def create( overwrite=overwrite, filters=filters, dimension_separator=dimension_separator, + order=order, zarr_format=zarr_format, chunk_shape=chunk_shape, chunk_key_encoding=chunk_key_encoding, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 62efe44e4c..b0e8b03cd7 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -609,6 +609,7 @@ async def _create( if order is not None: _warn_order_kwarg() + config_parsed = replace(config_parsed, order=order) result = await cls._create_v3( store_path, @@ -1044,7 +1045,10 @@ def order(self) -> MemoryOrder: bool Memory order of the array """ - return self._config.order + if self.metadata.zarr_format == 2: + return self.metadata.order + else: + return self._config.order @property def attrs(self) -> dict[str, JSON]: @@ -1276,14 +1280,14 @@ async def _get_selection( out_buffer = prototype.nd_buffer.create( shape=indexer.shape, dtype=out_dtype, - order=self._config.order, + order=self.order, fill_value=self.metadata.fill_value, ) if product(indexer.shape) > 0: # need to use the order from the metadata for v2 _config = self._config if self.metadata.zarr_format == 2: - _config = replace(_config, order=self.metadata.order) + _config = replace(_config, order=self.order) # reading chunks and decoding them await self.codec_pipeline.read( @@ -4256,6 +4260,11 @@ async def init_array( chunks_out = chunk_shape_parsed codecs_out = sub_codecs + if config is None: + config = {} + if order is not None and isinstance(config, dict): + config["order"] = config.get("order", order) + meta = AsyncArray._create_metadata_v3( shape=shape_parsed, dtype=dtype_parsed, diff --git a/tests/test_api.py b/tests/test_api.py index f03fd53f7a..9f03a1067a 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -326,13 +326,12 @@ def test_array_order(zarr_format: ZarrFormat) -> None: def 
test_array_order_warns(order: MemoryOrder | None, zarr_format: ZarrFormat) -> None: with pytest.warns(RuntimeWarning, match="The `order` keyword argument .*"): arr = zarr.ones(shape=(2, 2), order=order, zarr_format=zarr_format) - expected = order or zarr.config.get("array.order") - assert arr.order == expected + assert arr.order == order vals = np.asarray(arr) - if expected == "C": + if order == "C": assert vals.flags.c_contiguous - elif expected == "F": + elif order == "F": assert vals.flags.f_contiguous else: raise AssertionError diff --git a/tests/test_array.py b/tests/test_array.py index 5c3c556dfb..4be9bbde43 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1262,9 +1262,11 @@ async def test_data_ignored_params(store: Store) -> None: await create_array(store, data=data, shape=None, dtype=data.dtype, overwrite=True) @staticmethod - @pytest.mark.parametrize("order_config", ["C", "F", None]) + @pytest.mark.parametrize("order", ["C", "F", None]) + @pytest.mark.parametrize("with_config", [True, False]) def test_order( - order_config: MemoryOrder | None, + order: MemoryOrder | None, + with_config: bool, zarr_format: ZarrFormat, store: MemoryStore, ) -> None: @@ -1272,29 +1274,31 @@ def test_order( Test that the arrays generated by array indexing have a memory order defined by the config order value, and that for zarr v2 arrays, the ``order`` field in the array metadata is set correctly. 
""" - config: ArrayConfigLike = {} - if order_config is None: + config: ArrayConfigLike | None = {} + if order is None: config = {} expected = zarr.config.get("array.order") else: - config = {"order": order_config} - expected = order_config + config = {"order": order} + expected = order + + if not with_config: + # Test without passing config parameter + config = None + + arr = zarr.create_array( + store=store, + shape=(2, 2), + zarr_format=zarr_format, + dtype="i4", + order=order, + config=config, + ) + assert arr.order == expected if zarr_format == 2: - arr = zarr.create_array( - store=store, - shape=(2, 2), - zarr_format=zarr_format, - dtype="i4", - order=expected, - config=config, - ) - # guard for type checking assert arr.metadata.zarr_format == 2 assert arr.metadata.order == expected - else: - arr = zarr.create_array( - store=store, shape=(2, 2), zarr_format=zarr_format, dtype="i4", config=config - ) + vals = np.asarray(arr) if expected == "C": assert vals.flags.c_contiguous diff --git a/tests/test_v2.py b/tests/test_v2.py index 3a36bc01fd..8f0e1b2d29 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -195,12 +195,13 @@ def test_create_array_defaults(store: Store): ) -@pytest.mark.parametrize("array_order", ["C", "F"]) -@pytest.mark.parametrize("data_order", ["C", "F"]) -@pytest.mark.parametrize("memory_order", ["C", "F"]) -def test_v2_non_contiguous( - array_order: Literal["C", "F"], data_order: Literal["C", "F"], memory_order: Literal["C", "F"] -) -> None: +@pytest.mark.parametrize("numpy_order", ["C", "F"]) +@pytest.mark.parametrize("zarr_order", ["C", "F"]) +def test_v2_non_contiguous(numpy_order: Literal["C", "F"], zarr_order: Literal["C", "F"]) -> None: + """ + Make sure zarr v2 arrays save data using the memory order given to the zarr array, + not the memory order of the original numpy array. 
+ """ store = MemoryStore() arr = zarr.create_array( store, @@ -212,12 +213,11 @@ def test_v2_non_contiguous( filters=None, compressors=None, overwrite=True, - order=array_order, - config={"order": memory_order}, + order=zarr_order, ) - # Non-contiguous write - a = np.arange(arr.shape[0] * arr.shape[1]).reshape(arr.shape, order=data_order) + # Non-contiguous write, using numpy memory order + a = np.arange(arr.shape[0] * arr.shape[1]).reshape(arr.shape, order=numpy_order) arr[6:9, 3:6] = a[6:9, 3:6] # The slice on the RHS is important np.testing.assert_array_equal(arr[6:9, 3:6], a[6:9, 3:6]) @@ -225,13 +225,15 @@ def test_v2_non_contiguous( a[6:9, 3:6], np.frombuffer( sync(store.get("2.1", default_buffer_prototype())).to_bytes(), dtype="float64" - ).reshape((3, 3), order=array_order), + ).reshape((3, 3), order=zarr_order), ) - if memory_order == "F": + # After writing and reading from zarr array, order should be same as zarr order + if zarr_order == "F": assert (arr[6:9, 3:6]).flags.f_contiguous else: assert (arr[6:9, 3:6]).flags.c_contiguous + # Contiguous write store = MemoryStore() arr = zarr.create_array( store, @@ -243,18 +245,17 @@ def test_v2_non_contiguous( compressors=None, filters=None, overwrite=True, - order=array_order, - config={"order": memory_order}, + order=zarr_order, ) - # Contiguous write - a = np.arange(9).reshape((3, 3), order=data_order) - if data_order == "F": - assert a.flags.f_contiguous - else: - assert a.flags.c_contiguous + a = np.arange(9).reshape((3, 3), order=numpy_order) arr[6:9, 3:6] = a np.testing.assert_array_equal(arr[6:9, 3:6], a) + # After writing and reading from zarr array, order should be same as zarr order + if zarr_order == "F": + assert (arr[6:9, 3:6]).flags.f_contiguous + else: + assert (arr[6:9, 3:6]).flags.c_contiguous def test_default_compressor_deprecation_warning(): From 0b97e784788c7b4386fd295b4574bb5794dc0e37 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Fri, 2 May 2025 16:08:45 +0200 Subject: [PATCH 08/25] 
simplify NDBuffer.as_scalar (#3027) * index with an empty tuple to get scalar * changelog * add as_scalar test --- changes/3027.misc.rst | 1 + src/zarr/core/buffer/core.py | 11 +---------- tests/test_buffer.py | 6 ++++++ 3 files changed, 8 insertions(+), 10 deletions(-) create mode 100644 changes/3027.misc.rst diff --git a/changes/3027.misc.rst b/changes/3027.misc.rst new file mode 100644 index 0000000000..ffbfe9b808 --- /dev/null +++ b/changes/3027.misc.rst @@ -0,0 +1 @@ +Simplified scalar indexing of size-1 arrays. \ No newline at end of file diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 1318f868a0..cfcd7e6633 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -427,16 +427,7 @@ def as_scalar(self) -> ScalarType: """Returns the buffer as a scalar value""" if self._data.size != 1: raise ValueError("Buffer does not contain a single scalar value") - item = self.as_numpy_array().item() - scalar: ScalarType - - if np.issubdtype(self.dtype, np.datetime64): - unit: str = np.datetime_data(self.dtype)[0] # Extract the unit (e.g., 'Y', 'D', etc.) 
- scalar = np.datetime64(item, unit) - else: - scalar = self.dtype.type(item) # Regular conversion for non-datetime types - - return scalar + return cast(ScalarType, self.as_numpy_array()[()]) @property def dtype(self) -> np.dtype[Any]: diff --git a/tests/test_buffer.py b/tests/test_buffer.py index 33ac0266eb..73b3a16677 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -155,3 +155,9 @@ def test_numpy_buffer_prototype() -> None: assert isinstance(ndbuffer.as_ndarray_like(), np.ndarray) with pytest.raises(ValueError, match="Buffer does not contain a single scalar value"): ndbuffer.as_scalar() + + +# TODO: the same test for other buffer classes +def test_cpu_buffer_as_scalar() -> None: + buf = cpu.buffer_prototype.nd_buffer.create(shape=(), dtype="int64") + assert buf.as_scalar() == buf.as_ndarray_like()[()] # type: ignore[index] From 213863ba0548647cba1ab1d53f8eaa3631f4e0d0 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Sat, 3 May 2025 23:15:26 +0200 Subject: [PATCH 09/25] Use a dictionary comprehension instead (#3029) --- tests/test_group.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/test_group.py b/tests/test_group.py index 1e4f31b5d6..b4dace2568 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -1583,14 +1583,12 @@ async def test_create_hierarchy( sync_group.create_hierarchy(store=store, nodes=hierarchy_spec, overwrite=overwrite) ) elif impl == "async": - created = dict( - [ - a - async for a in create_hierarchy( - store=store, nodes=hierarchy_spec, overwrite=overwrite - ) - ] - ) + created = { + k: v + async for k, v in create_hierarchy( + store=store, nodes=hierarchy_spec, overwrite=overwrite + ) + } else: raise ValueError(f"Invalid impl: {impl}") if not overwrite: From c51150cba4666952978b563279503a9119bd1bc2 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Wed, 7 May 2025 19:11:19 +0100 Subject: [PATCH 10/25] Pin 
minimum s3fs (#3041) --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0b351c3b27..09615b6b22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ test = [ remote_tests = [ 'zarr[remote]', "botocore", - "s3fs", + "s3fs>=2023.10.0", "moto[s3,server]", "requests", ] @@ -104,7 +104,7 @@ docs = [ # Optional dependencies to run examples 'numcodecs[msgpack]', 'rich', - 's3fs', + 's3fs>=2023.10.0', 'astroid<4' ] From 693324c19d00dd013aa614643c6513ea20740950 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Wed, 7 May 2025 19:37:47 +0100 Subject: [PATCH 11/25] Fix some mypy errors (#3044) * Clean mypy config * Fix typing errors in entrypoint test package * Fix test_transpose typing errors * Fix typing errors in test_config --------- Co-authored-by: Davis Bennett --- pyproject.toml | 14 +++++++------- tests/package_with_entrypoint/__init__.py | 23 ++++++++++++----------- tests/test_codecs/test_transpose.py | 3 ++- tests/test_config.py | 14 +++++++------- 4 files changed, 28 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 09615b6b22..9244a9ec0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -348,27 +348,27 @@ python_version = "3.11" ignore_missing_imports = true namespace_packages = false - strict = true warn_unreachable = true - enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] + [[tool.mypy.overrides]] module = [ - "zarr.v2.*", + "tests.package_with_entrypoint.*", + "tests.test_codecs.test_transpose", + "tests.test_config" ] -ignore_errors = true +strict = false +# TODO: Move the next modules up to the strict = false section +# and fix the errors [[tool.mypy.overrides]] module = [ "zarr.testing.stateful", # lots of hypothesis decorator errors - "tests.package_with_entrypoint.*", "tests.test_codecs.test_codecs", - "tests.test_codecs.test_transpose", "tests.test_metadata.*", "tests.test_store.*", - "tests.test_config", 
"tests.test_group", "tests.test_indexing", "tests.test_properties", diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index b818adf8ea..cfbd4f23a9 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -1,13 +1,14 @@ from collections.abc import Iterable +from typing import Any -from numpy import ndarray +import numpy as np +import numpy.typing as npt import zarr.core.buffer -from zarr.abc.codec import ArrayBytesCodec, CodecInput, CodecOutput, CodecPipeline +from zarr.abc.codec import ArrayBytesCodec, CodecInput, CodecPipeline from zarr.codecs import BytesCodec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import BytesLike class TestEntrypointCodec(ArrayBytesCodec): @@ -16,14 +17,14 @@ class TestEntrypointCodec(ArrayBytesCodec): async def encode( self, chunks_and_specs: Iterable[tuple[CodecInput | None, ArraySpec]], - ) -> Iterable[CodecOutput | None]: - pass + ) -> Iterable[Buffer | None]: + return [None] async def decode( self, chunks_and_specs: Iterable[tuple[CodecInput | None, ArraySpec]], - ) -> ndarray: - pass + ) -> npt.NDArray[Any]: + return np.array(1) def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: return input_byte_length @@ -35,13 +36,13 @@ def __init__(self, batch_size: int = 1) -> None: async def encode( self, chunks_and_specs: Iterable[tuple[CodecInput | None, ArraySpec]] - ) -> BytesLike: - pass + ) -> Iterable[Buffer | None]: + return [None] async def decode( self, chunks_and_specs: Iterable[tuple[CodecInput | None, ArraySpec]] - ) -> ndarray: - pass + ) -> Iterable[NDBuffer | None]: + return np.array(1) class TestEntrypointBuffer(Buffer): diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 18ea8e65d0..06ec668ad3 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ 
-48,6 +48,7 @@ async def test_transpose( read_data = await _AsyncArrayProxy(a)[:, :].get() assert np.array_equal(data, read_data) + assert isinstance(read_data, np.ndarray) if runtime_read_order == "F": assert read_data.flags["F_CONTIGUOUS"] assert not read_data.flags["C_CONTIGUOUS"] @@ -90,5 +91,5 @@ def test_transpose_invalid( dtype=data.dtype, fill_value=0, chunk_key_encoding={"name": "v2", "separator": "."}, - filters=[TransposeCodec(order=order)], + filters=[TransposeCodec(order=order)], # type: ignore[arg-type] ) diff --git a/tests/test_config.py b/tests/test_config.py index 1a2453d646..2cbf172752 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -10,7 +10,7 @@ import zarr import zarr.api from zarr import zeros -from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline +from zarr.abc.codec import CodecPipeline from zarr.abc.store import ByteSetter, Store from zarr.codecs import ( BloscCodec, @@ -21,6 +21,7 @@ ) from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer +from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config from zarr.core.indexing import SelectorTuple @@ -144,7 +145,7 @@ def test_config_codec_pipeline_class(store: Store) -> None: class MockCodecPipeline(BatchedCodecPipeline): async def write( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], value: NDBuffer, drop_axes: tuple[int, ...] 
= (), ) -> None: @@ -174,7 +175,7 @@ async def write( class MockEnvCodecPipeline(CodecPipeline): pass - register_pipeline(MockEnvCodecPipeline) + register_pipeline(MockEnvCodecPipeline) # type: ignore[type-abstract] with mock.patch.dict( os.environ, {"ZARR_CODEC_PIPELINE__PATH": fully_qualified_name(MockEnvCodecPipeline)} @@ -191,10 +192,9 @@ def test_config_codec_implementation(store: Store) -> None: _mock = Mock() class MockBloscCodec(BloscCodec): - async def _encode_single( - self, chunk_data: CodecInput, chunk_spec: ArraySpec - ) -> CodecOutput | None: + async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: _mock.call() + return None register_codec("blosc", MockBloscCodec) with config.set({"codecs.blosc": fully_qualified_name(MockBloscCodec)}): @@ -245,7 +245,7 @@ def test_config_buffer_implementation() -> None: # has default value assert fully_qualified_name(get_buffer_class()) == config.defaults[0]["buffer"] - arr = zeros(shape=(100), store=StoreExpectingTestBuffer()) + arr = zeros(shape=(100,), store=StoreExpectingTestBuffer()) # AssertionError of StoreExpectingTestBuffer when not using my buffer with pytest.raises(AssertionError): From 260974832d3d2771ce8cf5b1ae67b95ab028c28d Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 7 May 2025 20:37:38 -0400 Subject: [PATCH 12/25] Fix and test sharding with GPU buffers (#2978) --- changes/2978.bugfix.rst | 1 + src/zarr/codecs/sharding.py | 2 +- src/zarr/testing/utils.py | 2 +- tests/test_buffer.py | 38 +++++++++++++++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 changes/2978.bugfix.rst diff --git a/changes/2978.bugfix.rst b/changes/2978.bugfix.rst new file mode 100644 index 0000000000..fe9f3d3f64 --- /dev/null +++ b/changes/2978.bugfix.rst @@ -0,0 +1 @@ +Fixed sharding with GPU buffers. 
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 42b1313fac..bee36b3160 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -683,7 +683,7 @@ def _get_index_chunk_spec(self, chunks_per_shard: ChunkCoords) -> ArraySpec: config=ArrayConfig( order="C", write_empty_chunks=False ), # Note: this is hard-coded for simplicity -- it is not surfaced into user code, - prototype=numpy_buffer_prototype(), + prototype=default_buffer_prototype(), ) def _get_chunk_spec(self, shard_spec: ArraySpec) -> ArraySpec: diff --git a/src/zarr/testing/utils.py b/src/zarr/testing/utils.py index 0a93b93fdb..28d6774286 100644 --- a/src/zarr/testing/utils.py +++ b/src/zarr/testing/utils.py @@ -44,7 +44,7 @@ def has_cupy() -> bool: # Decorator for GPU tests def gpu_test(func: T_Callable) -> T_Callable: return cast( - T_Callable, + "T_Callable", pytest.mark.gpu( pytest.mark.skipif(not has_cupy(), reason="CuPy not installed or no GPU available")( func diff --git a/tests/test_buffer.py b/tests/test_buffer.py index 73b3a16677..11ff7cd96c 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -148,6 +148,34 @@ async def test_codecs_use_of_gpu_prototype() -> None: assert cp.array_equal(expect, got) +@gpu_test +@pytest.mark.asyncio +async def test_sharding_use_of_gpu_prototype() -> None: + with zarr.config.enable_gpu(): + expect = cp.zeros((10, 10), dtype="uint16", order="F") + + a = await zarr.api.asynchronous.create_array( + StorePath(MemoryStore()) / "test_codecs_use_of_gpu_prototype", + shape=expect.shape, + chunks=(5, 5), + shards=(10, 10), + dtype=expect.dtype, + fill_value=0, + ) + expect[:] = cp.arange(100).reshape(10, 10) + + await a.setitem( + selection=(slice(0, 10), slice(0, 10)), + value=expect[:], + prototype=gpu.buffer_prototype, + ) + got = await a.getitem( + selection=(slice(0, 10), slice(0, 10)), prototype=gpu.buffer_prototype + ) + assert isinstance(got, cp.ndarray) + assert cp.array_equal(expect, got) + + def 
test_numpy_buffer_prototype() -> None: buffer = cpu.buffer_prototype.buffer.create_zero_length() ndbuffer = cpu.buffer_prototype.nd_buffer.create(shape=(1, 2), dtype=np.dtype("int64")) @@ -157,6 +185,16 @@ def test_numpy_buffer_prototype() -> None: ndbuffer.as_scalar() +@gpu_test +def test_gpu_buffer_prototype() -> None: + buffer = gpu.buffer_prototype.buffer.create_zero_length() + ndbuffer = gpu.buffer_prototype.nd_buffer.create(shape=(1, 2), dtype=cp.dtype("int64")) + assert isinstance(buffer.as_array_like(), cp.ndarray) + assert isinstance(ndbuffer.as_ndarray_like(), cp.ndarray) + with pytest.raises(ValueError, match="Buffer does not contain a single scalar value"): + ndbuffer.as_scalar() + + # TODO: the same test for other buffer classes def test_cpu_buffer_as_scalar() -> None: buf = cpu.buffer_prototype.nd_buffer.create(shape=(), dtype="int64") From 0465c2b37d937c905c6adb6b6734422fb4f99866 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Thu, 8 May 2025 15:15:44 +0100 Subject: [PATCH 13/25] Fix typing errors in testing.stateful (#3045) * Fix typing errors in testing.stateful * Add a dimensionnames type * Add bugfix entry * Fix properties test --- .pre-commit-config.yaml | 1 + changes/3045.bugfix.rst | 1 + pyproject.toml | 2 +- src/zarr/api/asynchronous.py | 3 +- src/zarr/api/synchronous.py | 7 +-- src/zarr/core/array.py | 35 +++++++------- src/zarr/core/common.py | 1 + src/zarr/core/group.py | 7 +-- src/zarr/core/metadata/v3.py | 5 +- src/zarr/testing/stateful.py | 4 +- src/zarr/testing/strategies.py | 86 +++++++++++++++++++++------------- tests/conftest.py | 8 ++-- 12 files changed, 94 insertions(+), 66 deletions(-) create mode 100644 changes/3045.bugfix.rst diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 75ef0face8..474d109c80 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,6 +37,7 @@ repos: - obstore>=0.5.1 # Tests - pytest + - hypothesis - repo: https://github.com/scientific-python/cookie rev: 2025.01.22 
hooks: diff --git a/changes/3045.bugfix.rst b/changes/3045.bugfix.rst new file mode 100644 index 0000000000..a3886717a7 --- /dev/null +++ b/changes/3045.bugfix.rst @@ -0,0 +1 @@ +Fixed the typing of ``dimension_names`` arguments throughout so that it now accepts iterables that contain `None` alongside `str`. diff --git a/pyproject.toml b/pyproject.toml index 9244a9ec0b..1c534f7927 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -356,6 +356,7 @@ enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] [[tool.mypy.overrides]] module = [ "tests.package_with_entrypoint.*", + "zarr.testing.stateful", "tests.test_codecs.test_transpose", "tests.test_config" ] @@ -365,7 +366,6 @@ strict = false # and fix the errors [[tool.mypy.overrides]] module = [ - "zarr.testing.stateful", # lots of hypothesis decorator errors "tests.test_codecs.test_codecs", "tests.test_metadata.*", "tests.test_store.*", diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 9b8b43a517..ac143f6dea 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -16,6 +16,7 @@ JSON, AccessModeLiteral, ChunkCoords, + DimensionNames, MemoryOrder, ZarrFormat, _default_zarr_format, @@ -865,7 +866,7 @@ async def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, storage_options: dict[str, Any] | None = None, config: ArrayConfigLike | None = None, **kwargs: Any, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 4c577936cd..5662f5c247 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -33,6 +33,7 @@ JSON, AccessModeLiteral, ChunkCoords, + DimensionNames, MemoryOrder, ShapeLike, ZarrFormat, @@ -626,7 +627,7 @@ def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = 
None, storage_options: dict[str, Any] | None = None, config: ArrayConfigLike | None = None, **kwargs: Any, @@ -761,7 +762,7 @@ def create_array( zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -926,7 +927,7 @@ def from_array( zarr_format: ZarrFormat | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index b0e8b03cd7..c6217a3d93 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -54,6 +54,7 @@ ZARRAY_JSON, ZATTRS_JSON, ChunkCoords, + DimensionNames, MemoryOrder, ShapeLike, ZarrFormat, @@ -330,7 +331,7 @@ async def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -358,7 +359,7 @@ async def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -386,7 +387,7 @@ async def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, # v2 only chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -421,7 +422,7 @@ async def 
create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, # v2 only chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -473,7 +474,7 @@ async def create( These defaults can be changed by modifying the value of ``array.v3_default_filters``, ``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`. - dimension_names : Iterable[str], optional + dimension_names : Iterable[str | None], optional The names of the dimensions (default is None). Zarr format 3 only. Zarr format 2 arrays should not use this parameter. chunks : ShapeLike, optional @@ -562,7 +563,7 @@ async def _create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, # v2 only chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -672,7 +673,7 @@ def _create_metadata_v3( fill_value: Any | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV3Metadata: """ @@ -723,7 +724,7 @@ async def _create_v3( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV3Metadata]: @@ -1743,7 +1744,7 @@ def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, # v2 only chunks: ChunkCoords | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -1788,7 
+1789,7 @@ def create( These defaults can be changed by modifying the value of ``array.v3_default_filters``, ``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`. - dimension_names : Iterable[str], optional + dimension_names : Iterable[str | None], optional The names of the dimensions (default is None). Zarr format 3 only. Zarr format 2 arrays should not use this parameter. chunks : ChunkCoords, optional @@ -1872,7 +1873,7 @@ def _create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, # v2 only chunks: ChunkCoords | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -3821,7 +3822,7 @@ async def from_array( zarr_format: ZarrFormat | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, @@ -3929,7 +3930,7 @@ async def from_array( For Zarr format 2, the default is ``{"name": "v2", "separator": "."}}``. If not specified and the data array has the same zarr format as the target array, the chunk key encoding of the data array is used. - dimension_names : Iterable[str], optional + dimension_names : Iterable[str | None], optional The names of the dimensions (default is None). Zarr format 3 only. Zarr format 2 arrays should not use this parameter. If not specified, defaults to the dimension names of the data array. 
@@ -4083,7 +4084,7 @@ async def init_array( zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, overwrite: bool = False, config: ArrayConfigLike | None, ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: @@ -4298,7 +4299,7 @@ async def create_array( zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -4477,7 +4478,7 @@ def _parse_keep_array_attr( order: MemoryOrder | None, zarr_format: ZarrFormat | None, chunk_key_encoding: ChunkKeyEncodingLike | None, - dimension_names: Iterable[str] | None, + dimension_names: DimensionNames, ) -> tuple[ ChunkCoords | Literal["auto"], ShardsLike | None, @@ -4488,7 +4489,7 @@ def _parse_keep_array_attr( MemoryOrder | None, ZarrFormat, ChunkKeyEncodingLike | None, - Iterable[str] | None, + DimensionNames, ]: if isinstance(data, Array): if chunks == "keep": diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 3308ca3247..a670834206 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -40,6 +40,7 @@ JSON = str | int | float | Mapping[str, "JSON"] | Sequence["JSON"] | None MemoryOrder = Literal["C", "F"] AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"] +DimensionNames = Iterable[str | None] | None def product(tup: ChunkCoords) -> int: diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 3f4f15b9e9..5c470e29ca 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -43,6 +43,7 @@ ZGROUP_JSON, ZMETADATA_V2_JSON, ChunkCoords, + DimensionNames, NodeType, ShapeLike, ZarrFormat, @@ -1006,7 +1007,7 @@ async def 
create_array( order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, @@ -2381,7 +2382,7 @@ def create_array( order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, @@ -2775,7 +2776,7 @@ def array( order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 9154762648..63f6515e44 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -32,6 +32,7 @@ JSON, ZARR_JSON, ChunkCoords, + DimensionNames, parse_named_configuration, parse_shapelike, ) @@ -242,7 +243,7 @@ class ArrayV3Metadata(Metadata): fill_value: Any codecs: tuple[Codec, ...] attributes: dict[str, Any] = field(default_factory=dict) - dimension_names: tuple[str, ...] | None = None + dimension_names: tuple[str | None, ...] | None = None zarr_format: Literal[3] = field(default=3, init=False) node_type: Literal["array"] = field(default="array", init=False) storage_transformers: tuple[dict[str, JSON], ...] 
@@ -257,7 +258,7 @@ def __init__( fill_value: Any, codecs: Iterable[Codec | dict[str, JSON]], attributes: dict[str, JSON] | None, - dimension_names: Iterable[str] | None, + dimension_names: DimensionNames, storage_transformers: Iterable[dict[str, JSON]] | None = None, ) -> None: """ diff --git a/src/zarr/testing/stateful.py b/src/zarr/testing/stateful.py index ede83201ae..acc5f63f19 100644 --- a/src/zarr/testing/stateful.py +++ b/src/zarr/testing/stateful.py @@ -326,8 +326,8 @@ def init_store(self) -> None: self.store.clear() @rule(key=zarr_keys(), data=st.binary(min_size=0, max_size=MAX_BINARY_SIZE)) - def set(self, key: str, data: DataObject) -> None: - note(f"(set) Setting {key!r} with {data}") + def set(self, key: str, data: bytes) -> None: + note(f"(set) Setting {key!r} with {data!r}") assert not self.store.read_only data_buf = cpu.Buffer.from_bytes(data) self.store.set(key, data_buf) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 663d46034d..3b10592ec0 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -1,10 +1,12 @@ import math import sys +from collections.abc import Callable, Mapping from typing import Any, Literal import hypothesis.extra.numpy as npst import hypothesis.strategies as st import numpy as np +import numpy.typing as npt from hypothesis import event from hypothesis.strategies import SearchStrategy @@ -14,7 +16,7 @@ from zarr.core.array import Array from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding -from zarr.core.common import ZarrFormat +from zarr.core.common import JSON, ZarrFormat from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike @@ -30,17 +32,17 @@ ) -@st.composite # type: ignore[misc] -def keys(draw: st.DrawFn, *, max_num_nodes: int | None = None) -> Any: +@st.composite +def keys(draw: st.DrawFn, *, max_num_nodes: 
int | None = None) -> str: return draw(st.lists(node_names, min_size=1, max_size=max_num_nodes).map("/".join)) -@st.composite # type: ignore[misc] -def paths(draw: st.DrawFn, *, max_num_nodes: int | None = None) -> Any: +@st.composite +def paths(draw: st.DrawFn, *, max_num_nodes: int | None = None) -> str: return draw(st.just("/") | keys(max_num_nodes=max_num_nodes)) -def v3_dtypes() -> st.SearchStrategy[np.dtype]: +def v3_dtypes() -> st.SearchStrategy[np.dtype[Any]]: return ( npst.boolean_dtypes() | npst.integer_dtypes(endianness="=") @@ -54,7 +56,7 @@ def v3_dtypes() -> st.SearchStrategy[np.dtype]: ) -def v2_dtypes() -> st.SearchStrategy[np.dtype]: +def v2_dtypes() -> st.SearchStrategy[np.dtype[Any]]: return ( npst.boolean_dtypes() | npst.integer_dtypes(endianness="=") @@ -107,7 +109,9 @@ def clear_store(x: Store) -> Store: .filter(lambda name: name.lower() != "zarr.json") ) array_names = node_names -attrs = st.none() | st.dictionaries(_attr_keys, _attr_values) +attrs: st.SearchStrategy[Mapping[str, JSON] | None] = st.none() | st.dictionaries( + _attr_keys, _attr_values +) # st.builds will only call a new store constructor for different keyword arguments # i.e. stores.examples() will always return the same object per Store class. # So we map a clear to reset the store. 
@@ -118,19 +122,19 @@ def clear_store(x: Store) -> Store: array_shapes = npst.array_shapes(max_dims=4, min_side=3) | npst.array_shapes(max_dims=4, min_side=0) -@st.composite # type: ignore[misc] +@st.composite def dimension_names(draw: st.DrawFn, *, ndim: int | None = None) -> list[None | str] | None: simple_text = st.text(zarr_key_chars, min_size=0) - return draw(st.none() | st.lists(st.none() | simple_text, min_size=ndim, max_size=ndim)) # type: ignore[no-any-return] + return draw(st.none() | st.lists(st.none() | simple_text, min_size=ndim, max_size=ndim)) # type: ignore[arg-type] -@st.composite # type: ignore[misc] +@st.composite def array_metadata( draw: st.DrawFn, *, - array_shapes: st.SearchStrategy[tuple[int, ...]] = npst.array_shapes, + array_shapes: Callable[..., st.SearchStrategy[tuple[int, ...]]] = npst.array_shapes, zarr_formats: st.SearchStrategy[Literal[2, 3]] = zarr_formats, - attributes: st.SearchStrategy[dict[str, Any]] = attrs, + attributes: SearchStrategy[Mapping[str, JSON] | None] = attrs, ) -> ArrayV2Metadata | ArrayV3Metadata: zarr_format = draw(zarr_formats) # separator = draw(st.sampled_from(['/', '\\'])) @@ -146,7 +150,7 @@ def array_metadata( dtype=dtype, fill_value=fill_value, order=draw(st.sampled_from(["C", "F"])), - attributes=draw(attributes), + attributes=draw(attributes), # type: ignore[arg-type] dimension_separator=draw(st.sampled_from([".", "/"])), filters=None, compressor=None, @@ -157,7 +161,7 @@ def array_metadata( data_type=dtype, chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), fill_value=fill_value, - attributes=draw(attributes), + attributes=draw(attributes), # type: ignore[arg-type] dimension_names=draw(dimension_names(ndim=ndim)), chunk_key_encoding=DefaultChunkKeyEncoding(separator="/"), # FIXME codecs=[BytesCodec()], @@ -165,14 +169,14 @@ def array_metadata( ) -@st.composite # type: ignore[misc] +@st.composite def numpy_arrays( draw: st.DrawFn, *, shapes: st.SearchStrategy[tuple[int, ...]] = array_shapes, dtype: 
np.dtype[Any] | None = None, - zarr_formats: st.SearchStrategy[ZarrFormat] | None = zarr_formats, -) -> Any: + zarr_formats: st.SearchStrategy[ZarrFormat] = zarr_formats, +) -> npt.NDArray[Any]: """ Generate numpy arrays that can be saved in the provided Zarr format. """ @@ -186,7 +190,7 @@ def numpy_arrays( return draw(npst.arrays(dtype=dtype, shape=shapes)) -@st.composite # type: ignore[misc] +@st.composite def chunk_shapes(draw: st.DrawFn, *, shape: tuple[int, ...]) -> tuple[int, ...]: # We want this strategy to shrink towards arrays with smaller number of chunks # 1. st.integers() shrinks towards smaller values. So we use that to generate number of chunks @@ -208,7 +212,7 @@ def chunk_shapes(draw: st.DrawFn, *, shape: tuple[int, ...]) -> tuple[int, ...]: return chunks -@st.composite # type: ignore[misc] +@st.composite def shard_shapes( draw: st.DrawFn, *, shape: tuple[int, ...], chunk_shape: tuple[int, ...] ) -> tuple[int, ...]: @@ -220,9 +224,11 @@ def shard_shapes( return tuple(m * c for m, c in zip(multiples, chunk_shape, strict=True)) -@st.composite # type: ignore[misc] +@st.composite def np_array_and_chunks( - draw: st.DrawFn, *, arrays: st.SearchStrategy[np.ndarray] = numpy_arrays + draw: st.DrawFn, + *, + arrays: st.SearchStrategy[npt.NDArray[Any]] = numpy_arrays(), # noqa: B008 ) -> tuple[np.ndarray, tuple[int, ...]]: # type: ignore[type-arg] """A hypothesis strategy to generate small sized random arrays. 
@@ -232,14 +238,14 @@ def np_array_and_chunks( return (array, draw(chunk_shapes(shape=array.shape))) -@st.composite # type: ignore[misc] +@st.composite def arrays( draw: st.DrawFn, *, shapes: st.SearchStrategy[tuple[int, ...]] = array_shapes, compressors: st.SearchStrategy = compressors, stores: st.SearchStrategy[StoreLike] = stores, - paths: st.SearchStrategy[str | None] = paths(), # noqa: B008 + paths: st.SearchStrategy[str] = paths(), # noqa: B008 array_names: st.SearchStrategy = array_names, arrays: st.SearchStrategy | None = None, attrs: st.SearchStrategy = attrs, @@ -296,7 +302,7 @@ def arrays( return a -@st.composite # type: ignore[misc] +@st.composite def simple_arrays( draw: st.DrawFn, *, @@ -317,7 +323,7 @@ def is_negative_slice(idx: Any) -> bool: return isinstance(idx, slice) and idx.step is not None and idx.step < 0 -@st.composite # type: ignore[misc] +@st.composite def end_slices(draw: st.DrawFn, *, shape: tuple[int]) -> Any: """ A strategy that slices ranges that include the last chunk. 
@@ -332,14 +338,28 @@ def end_slices(draw: st.DrawFn, *, shape: tuple[int]) -> Any: return tuple(slicers) -@st.composite # type: ignore[misc] -def basic_indices(draw: st.DrawFn, *, shape: tuple[int], **kwargs: Any) -> Any: +@st.composite +def basic_indices( + draw: st.DrawFn, + *, + shape: tuple[int], + min_dims: int = 0, + max_dims: int | None = None, + allow_newaxis: bool = False, + allow_ellipsis: bool = True, +) -> Any: """Basic indices without unsupported negative slices.""" - strategy = npst.basic_indices(shape=shape, **kwargs).filter( + strategy = npst.basic_indices( + shape=shape, + min_dims=min_dims, + max_dims=max_dims, + allow_newaxis=allow_newaxis, + allow_ellipsis=allow_ellipsis, + ).filter( lambda idxr: ( not ( is_negative_slice(idxr) - or (isinstance(idxr, tuple) and any(is_negative_slice(idx) for idx in idxr)) + or (isinstance(idxr, tuple) and any(is_negative_slice(idx) for idx in idxr)) # type: ignore[redundant-expr] ) ) ) @@ -348,7 +368,7 @@ def basic_indices(draw: st.DrawFn, *, shape: tuple[int], **kwargs: Any) -> Any: return draw(strategy) -@st.composite # type: ignore[misc] +@st.composite def orthogonal_indices( draw: st.DrawFn, *, shape: tuple[int] ) -> tuple[tuple[np.ndarray[Any, Any], ...], tuple[np.ndarray[Any, Any], ...]]: @@ -386,8 +406,8 @@ def orthogonal_indices( def key_ranges( - keys: SearchStrategy = node_names, max_size: int = sys.maxsize -) -> SearchStrategy[list[int]]: + keys: SearchStrategy[str] = node_names, max_size: int = sys.maxsize +) -> SearchStrategy[list[tuple[str, RangeByteRequest]]]: """ Function to generate key_ranges strategy for get_partial_values() returns list strategy w/ form:: diff --git a/tests/conftest.py b/tests/conftest.py index 74a140c5c7..948d3cd055 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,7 +18,7 @@ _parse_chunk_key_encoding, ) from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition -from zarr.core.common import JSON, parse_dtype, parse_shapelike +from zarr.core.common 
import JSON, DimensionNames, parse_dtype, parse_shapelike from zarr.core.config import config as zarr_config from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata @@ -26,7 +26,7 @@ from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore if TYPE_CHECKING: - from collections.abc import Generator, Iterable + from collections.abc import Generator from typing import Any, Literal from _pytest.compat import LEGACY_PATH @@ -255,7 +255,7 @@ def create_array_metadata( zarr_format: ZarrFormat, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, ) -> ArrayV2Metadata | ArrayV3Metadata: """ Create array metadata @@ -388,7 +388,7 @@ def meta_from_array( zarr_format: ZarrFormat = 3, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, - dimension_names: Iterable[str] | None = None, + dimension_names: DimensionNames = None, ) -> ArrayV3Metadata | ArrayV2Metadata: """ Create array metadata from an array From 5ff3fbe5fe1488310301e9d2ae56a9880d1ddfb2 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Thu, 8 May 2025 16:45:30 +0200 Subject: [PATCH 14/25] (fix): use `typesize` on `Blosc` codec (#2962) * (fix): use `typesize` on `Blosc` codec * (chore): relnote * (fix): intersphinx * (fix): look at that compression ratio! * (fix): add test * (fix): min version * (fix): parenthesis? * (fix): try assertion error * (fix): windows size * (fix): add bytes print * (fix): aghh windows latest is correct, error for non latest * (fix): conditions for sizes * (fix): try clearer data * (fix): awesome! 
* (fix): pre-commit --------- Co-authored-by: David Stansby --- changes/2962.fix.rst | 1 + docs/user-guide/arrays.rst | 4 ++-- src/zarr/codecs/blosc.py | 4 ++++ tests/test_codecs/test_blosc.py | 19 +++++++++++++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 changes/2962.fix.rst diff --git a/changes/2962.fix.rst b/changes/2962.fix.rst new file mode 100644 index 0000000000..83d24b72ce --- /dev/null +++ b/changes/2962.fix.rst @@ -0,0 +1 @@ +Internally use `typesize` constructor parameter for :class:`numcodecs.blosc.Blosc` to improve compression ratios back to the v2-package levels. \ No newline at end of file diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index a62b2ea0fa..e6d1bcdc54 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -209,8 +209,8 @@ prints additional diagnostics, e.g.:: Serializer : BytesCodec(endian=) Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) - No. bytes stored : 9696520 - Storage ratio : 41.3 + No. bytes stored : 3558573 + Storage ratio : 112.4 Chunks Initialized : 100 .. 
note:: diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 2fcc041a6b..9a999e10d7 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -8,6 +8,7 @@ import numcodecs from numcodecs.blosc import Blosc +from packaging.version import Version from zarr.abc.codec import BytesBytesCodec from zarr.core.buffer.cpu import as_numpy_array_wrapper @@ -163,6 +164,9 @@ def _blosc_codec(self) -> Blosc: "shuffle": map_shuffle_str_to_int[self.shuffle], "blocksize": self.blocksize, } + # See https://github.com/zarr-developers/numcodecs/pull/713 + if Version(numcodecs.__version__) >= Version("0.16.0"): + config_dict["typesize"] = self.typesize return Blosc.from_config(config_dict) async def _decode_single( diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index c1c5c92329..6e6e9df383 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -1,7 +1,9 @@ import json +import numcodecs import numpy as np import pytest +from packaging.version import Version import zarr from zarr.abc.store import Store @@ -54,3 +56,20 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None: assert blosc_configuration_json["shuffle"] == "bitshuffle" else: assert blosc_configuration_json["shuffle"] == "shuffle" + + +async def test_typesize() -> None: + a = np.arange(1000000, dtype=np.uint64) + codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] + z = zarr.array(a, chunks=(10000), codecs=codecs) + data = await z.store.get("c/0", prototype=default_buffer_prototype()) + assert data is not None + bytes = data.to_bytes() + size = len(bytes) + msg = f"Blosc size mismatch. 
First 10 bytes: {bytes[:20]!r} and last 10 bytes: {bytes[-20:]!r}" + if Version(numcodecs.__version__) >= Version("0.16.0"): + expected_size = 402 + assert size == expected_size, msg + else: + expected_size = 10216 + assert size == expected_size, msg From 520bc1f6c2af511832c3bab6da1eb5e2e2c38900 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 12 May 2025 16:32:54 +0200 Subject: [PATCH 15/25] fix/unbreak chunks initialized (#2862) * unbreak chunks initialized * Update src/zarr/storage/_utils.py Co-authored-by: Tom Augspurger * update docstring * make relativize_paths kw-only, and add tests --------- Co-authored-by: Tom Augspurger --- changes/2862.bugfix.rst | 1 + docs/user-guide/groups.rst | 2 +- src/zarr/core/array.py | 8 +++++- src/zarr/storage/_utils.py | 53 ++++++++++++++++++++++++++++++++++- tests/test_array.py | 5 ++-- tests/test_store/test_core.py | 32 ++++++++++++++++++++- 6 files changed, 95 insertions(+), 6 deletions(-) create mode 100644 changes/2862.bugfix.rst diff --git a/changes/2862.bugfix.rst b/changes/2862.bugfix.rst new file mode 100644 index 0000000000..bbe6f0746e --- /dev/null +++ b/changes/2862.bugfix.rst @@ -0,0 +1 @@ +Fix a bug that prevented the number of initialized chunks being counted properly. \ No newline at end of file diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index 4268004f70..d5a0a7ccee 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -140,7 +140,7 @@ property. E.g.:: No. bytes : 8000000 (7.6M) No. 
bytes stored : 1614 Storage ratio : 4956.6 - Chunks Initialized : 0 + Chunks Initialized : 10 >>> baz.info Type : Array Zarr format : 3 diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index c6217a3d93..9852bf8d5f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -117,6 +117,7 @@ get_pipeline_class, ) from zarr.storage._common import StorePath, ensure_no_existing_node, make_store_path +from zarr.storage._utils import _relativize_path if TYPE_CHECKING: from collections.abc import Iterator, Sequence @@ -3737,7 +3738,12 @@ async def chunks_initialized( store_contents = [ x async for x in array.store_path.store.list_prefix(prefix=array.store_path.path) ] - return tuple(chunk_key for chunk_key in array._iter_chunk_keys() if chunk_key in store_contents) + store_contents_relative = [ + _relativize_path(path=key, prefix=array.store_path.path) for key in store_contents + ] + return tuple( + chunk_key for chunk_key in array._iter_chunk_keys() if chunk_key in store_contents_relative + ) def _build_parents( diff --git a/src/zarr/storage/_utils.py b/src/zarr/storage/_utils.py index eda4342f47..145790278c 100644 --- a/src/zarr/storage/_utils.py +++ b/src/zarr/storage/_utils.py @@ -74,11 +74,62 @@ def _join_paths(paths: Iterable[str]) -> str: """ Filter out instances of '' and join the remaining strings with '/'. - Because the root node of a zarr hierarchy is represented by an empty string, + Parameters + ---------- + paths : Iterable[str] + + Returns + ------- + str + + Examples + -------- + >>> _join_paths(["", "a", "b"]) + 'a/b' + >>> _join_paths(["a", "b", "c"]) + 'a/b/c' """ return "/".join(filter(lambda v: v != "", paths)) +def _relativize_path(*, path: str, prefix: str) -> str: + """ + Make a "/"-delimited path relative to some prefix. If the prefix is '', then the path is + returned as-is. Otherwise, the prefix is removed from the path as well as the separator + string "/". 
+ + If ``prefix`` is not the empty string and ``path`` does not start with ``prefix`` + followed by a "/" character, then an error is raised. + + This function assumes that the prefix does not end with "/". + + Parameters + ---------- + path : str + The path to make relative to the prefix. + prefix : str + The prefix to make the path relative to. + + Returns + ------- + str + + Examples + -------- + >>> _relativize_path(path="a/b", prefix="") + 'a/b' + >>> _relativize_path(path="a/b/c", prefix="a/b") + 'c' + """ + if prefix == "": + return path + else: + _prefix = prefix + "/" + if not path.startswith(_prefix): + raise ValueError(f"The first component of {path} does not start with {prefix}.") + return path.removeprefix(f"{prefix}/") + + def _normalize_paths(paths: Iterable[str]) -> tuple[str, ...]: """ Normalize the input paths according to the normalization scheme used for zarr node paths. diff --git a/tests/test_array.py b/tests/test_array.py index 4be9bbde43..989fe30592 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -387,12 +387,13 @@ async def test_nchunks_initialized(test_cls: type[Array] | type[AsyncArray[Any]] assert observed == expected -async def test_chunks_initialized() -> None: +@pytest.mark.parametrize("path", ["", "foo"]) +async def test_chunks_initialized(path: str) -> None: """ Test that chunks_initialized accurately returns the keys of stored chunks.
""" store = MemoryStore() - arr = zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create_array(store, name=path, shape=(100,), chunks=(10,), dtype="i4") chunks_accumulated = tuple( accumulate(tuple(tuple(v.split(" ")) for v in arr._iter_chunk_keys())) diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py index 87d0e6e40d..1ac410954b 100644 --- a/tests/test_store/test_core.py +++ b/tests/test_store/test_core.py @@ -8,7 +8,13 @@ from zarr.core.common import AccessModeLiteral, ZarrFormat from zarr.storage import FsspecStore, LocalStore, MemoryStore, StoreLike, StorePath from zarr.storage._common import contains_array, contains_group, make_store_path -from zarr.storage._utils import _join_paths, _normalize_path_keys, _normalize_paths, normalize_path +from zarr.storage._utils import ( + _join_paths, + _normalize_path_keys, + _normalize_paths, + _relativize_path, + normalize_path, +) @pytest.mark.parametrize("path", ["foo", "foo/bar"]) @@ -221,3 +227,27 @@ def test_normalize_path_keys(): """ data = {"a": 10, "//b": 10} assert _normalize_path_keys(data) == {normalize_path(k): v for k, v in data.items()} + + +@pytest.mark.parametrize( + ("path", "prefix", "expected"), + [ + ("a", "", "a"), + ("a/b/c", "a/b", "c"), + ("a/b/c", "a", "b/c"), + ], +) +def test_relativize_path_valid(path: str, prefix: str, expected: str) -> None: + """ + Test the normal behavior of the _relativize_path function. Prefixes should be removed from the + path argument. + """ + assert _relativize_path(path=path, prefix=prefix) == expected + + +def test_relativize_path_invalid() -> None: + path = "a/b/c" + prefix = "b" + msg = f"The first component of {path} does not start with {prefix}." 
+ with pytest.raises(ValueError, match=msg): + _relativize_path(path="a/b/c", prefix="b") From 7584b96f22b5d517cf61f13b415961d7be99b428 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Tue, 13 May 2025 11:48:46 +0100 Subject: [PATCH 16/25] Fix typing in a bunch of store tests (#3052) * Ignore explicit test store files * Fix test_zip * Fix test_local * Fix test_fsspec * Fix typing in test_memory * Remove walrus Co-authored-by: Davis Bennett --------- Co-authored-by: Davis Bennett --- .pre-commit-config.yaml | 1 + pyproject.toml | 12 ++++++++-- src/zarr/testing/store.py | 2 +- src/zarr/testing/utils.py | 9 ++++---- tests/test_store/test_fsspec.py | 38 +++++++++++++++++++----------- tests/test_store/test_local.py | 10 ++++---- tests/test_store/test_memory.py | 41 ++++++++++++++++++--------------- tests/test_store/test_zip.py | 19 +++++++++------ 8 files changed, 80 insertions(+), 52 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 474d109c80..80743a5dec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,6 +38,7 @@ repos: # Tests - pytest - hypothesis + - s3fs - repo: https://github.com/scientific-python/cookie rev: 2025.01.22 hooks: diff --git a/pyproject.toml b/pyproject.toml index 1c534f7927..033c9dc114 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -358,7 +358,11 @@ module = [ "tests.package_with_entrypoint.*", "zarr.testing.stateful", "tests.test_codecs.test_transpose", - "tests.test_config" + "tests.test_config", + "tests.test_store.test_zip", + "tests.test_store.test_local", + "tests.test_store.test_fsspec", + "tests.test_store.test_memory", ] strict = false @@ -368,7 +372,11 @@ strict = false module = [ "tests.test_codecs.test_codecs", "tests.test_metadata.*", - "tests.test_store.*", + "tests.test_store.test_core", + "tests.test_store.test_logging", + "tests.test_store.test_object", + "tests.test_store.test_stateful", + "tests.test_store.test_wrapper", "tests.test_group", "tests.test_indexing", 
"tests.test_properties", diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index 867df2121f..0e73599791 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -58,7 +58,7 @@ async def get(self, store: S, key: str) -> Buffer: @abstractmethod @pytest.fixture - def store_kwargs(self) -> dict[str, Any]: + def store_kwargs(self, *args: Any, **kwargs: Any) -> dict[str, Any]: """Kwargs for instantiating a store""" ... diff --git a/src/zarr/testing/utils.py b/src/zarr/testing/utils.py index 28d6774286..7cf57ab9d6 100644 --- a/src/zarr/testing/utils.py +++ b/src/zarr/testing/utils.py @@ -1,7 +1,6 @@ from __future__ import annotations -from collections.abc import Callable, Coroutine -from typing import TYPE_CHECKING, Any, TypeVar, cast +from typing import TYPE_CHECKING, TypeVar, cast import pytest @@ -38,13 +37,13 @@ def has_cupy() -> bool: return False -T_Callable = TypeVar("T_Callable", bound=Callable[..., Coroutine[Any, Any, None] | None]) +T = TypeVar("T") # Decorator for GPU tests -def gpu_test(func: T_Callable) -> T_Callable: +def gpu_test(func: T) -> T: return cast( - "T_Callable", + "T", pytest.mark.gpu( pytest.mark.skipif(not has_cupy(), reason="CuPy not installed or no GPU available")( func diff --git a/tests/test_store/test_fsspec.py b/tests/test_store/test_fsspec.py index 08cf2f286d..c10471809c 100644 --- a/tests/test_store/test_fsspec.py +++ b/tests/test_store/test_fsspec.py @@ -3,7 +3,7 @@ import json import os import re -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import pytest from packaging.version import parse as parse_version @@ -17,8 +17,13 @@ if TYPE_CHECKING: from collections.abc import Generator + from pathlib import Path import botocore.client + import s3fs + + from zarr.core.common import JSON + # Warning filter due to https://github.com/boto/boto3/issues/3889 pytestmark = [ @@ -109,10 +114,13 @@ async def test_basic() -> None: data = b"hello" await store.set("foo", 
cpu.Buffer.from_bytes(data)) assert await store.exists("foo") - assert (await store.get("foo", prototype=default_buffer_prototype())).to_bytes() == data + buf = await store.get("foo", prototype=default_buffer_prototype()) + assert buf is not None + assert buf.to_bytes() == data out = await store.get_partial_values( prototype=default_buffer_prototype(), key_ranges=[("foo", OffsetByteRequest(1))] ) + assert out[0] is not None assert out[0].to_bytes() == data[1:] @@ -121,7 +129,7 @@ class TestFsspecStoreS3(StoreTests[FsspecStore, cpu.Buffer]): buffer_cls = cpu.Buffer @pytest.fixture - def store_kwargs(self, request) -> dict[str, str | bool]: + def store_kwargs(self) -> dict[str, str | bool]: try: from fsspec import url_to_fs except ImportError: @@ -133,7 +141,7 @@ def store_kwargs(self, request) -> dict[str, str | bool]: return {"fs": fs, "path": path} @pytest.fixture - def store(self, store_kwargs: dict[str, str | bool]) -> FsspecStore: + async def store(self, store_kwargs: dict[str, Any]) -> FsspecStore: return self.store_cls(**store_kwargs) async def get(self, store: FsspecStore, key: str) -> Buffer: @@ -168,7 +176,11 @@ async def test_fsspec_store_from_uri(self, store: FsspecStore) -> None: "anon": False, } - meta = {"attributes": {"key": "value"}, "zarr_format": 3, "node_type": "group"} + meta: dict[str, JSON] = { + "attributes": {"key": "value"}, + "zarr_format": 3, + "node_type": "group", + } await store.set( "zarr.json", @@ -179,7 +191,7 @@ async def test_fsspec_store_from_uri(self, store: FsspecStore) -> None: ) assert dict(group.attrs) == {"key": "value"} - meta["attributes"]["key"] = "value-2" + meta["attributes"]["key"] = "value-2" # type: ignore[index] await store.set( "directory-2/zarr.json", self.buffer_cls.from_bytes(json.dumps(meta).encode()), @@ -189,7 +201,7 @@ async def test_fsspec_store_from_uri(self, store: FsspecStore) -> None: ) assert dict(group.attrs) == {"key": "value-2"} - meta["attributes"]["key"] = "value-3" + meta["attributes"]["key"] = 
"value-3" # type: ignore[index] await store.set( "directory-3/zarr.json", self.buffer_cls.from_bytes(json.dumps(meta).encode()), @@ -216,7 +228,7 @@ def test_from_upath(self) -> None: assert result.fs.asynchronous assert result.path == f"{test_bucket_name}/foo/bar" - def test_init_raises_if_path_has_scheme(self, store_kwargs) -> None: + def test_init_raises_if_path_has_scheme(self, store_kwargs: dict[str, Any]) -> None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2342 store_kwargs["path"] = "s3://" + store_kwargs["path"] with pytest.raises( @@ -237,7 +249,7 @@ def test_init_warns_if_fs_asynchronous_is_false(self) -> None: with pytest.warns(UserWarning, match=r".* was not created with `asynchronous=True`.*"): self.store_cls(**store_kwargs) - async def test_empty_nonexistent_path(self, store_kwargs) -> None: + async def test_empty_nonexistent_path(self, store_kwargs: dict[str, Any]) -> None: # regression test for https://github.com/zarr-developers/zarr-python/pull/2343 store_kwargs["path"] += "/abc" store = await self.store_cls.open(**store_kwargs) @@ -256,7 +268,7 @@ async def test_delete_dir_unsupported_deletes(self, store: FsspecStore) -> None: parse_version(fsspec.__version__) < parse_version("2024.12.0"), reason="No AsyncFileSystemWrapper", ) -def test_wrap_sync_filesystem(): +def test_wrap_sync_filesystem() -> None: """The local fs is not async so we should expect it to be wrapped automatically""" from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper @@ -270,7 +282,7 @@ def test_wrap_sync_filesystem(): parse_version(fsspec.__version__) < parse_version("2024.12.0"), reason="No AsyncFileSystemWrapper", ) -def test_no_wrap_async_filesystem(): +def test_no_wrap_async_filesystem() -> None: """An async fs should not be wrapped automatically; fsspec's https filesystem is such an fs""" from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper @@ -284,12 +296,12 @@ def test_no_wrap_async_filesystem(): 
parse_version(fsspec.__version__) < parse_version("2024.12.0"), reason="No AsyncFileSystemWrapper", ) -async def test_delete_dir_wrapped_filesystem(tmpdir) -> None: +async def test_delete_dir_wrapped_filesystem(tmp_path: Path) -> None: from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper from fsspec.implementations.local import LocalFileSystem wrapped_fs = AsyncFileSystemWrapper(LocalFileSystem(auto_mkdir=True)) - store = FsspecStore(wrapped_fs, read_only=False, path=f"{tmpdir}/test/path") + store = FsspecStore(wrapped_fs, read_only=False, path=f"{tmp_path}/test/path") assert isinstance(store.fs, AsyncFileSystemWrapper) assert store.fs.asynchronous diff --git a/tests/test_store/test_local.py b/tests/test_store/test_local.py index d9d941c6f0..8699a85082 100644 --- a/tests/test_store/test_local.py +++ b/tests/test_store/test_local.py @@ -28,7 +28,7 @@ async def set(self, store: LocalStore, key: str, value: Buffer) -> None: (store.root / key).write_bytes(value.to_bytes()) @pytest.fixture - def store_kwargs(self, tmpdir) -> dict[str, str]: + def store_kwargs(self, tmpdir: str) -> dict[str, str]: return {"root": str(tmpdir)} def test_store_repr(self, store: LocalStore) -> None: @@ -48,14 +48,14 @@ async def test_empty_with_empty_subdir(self, store: LocalStore) -> None: (store.root / "foo/bar").mkdir(parents=True) assert await store.is_empty("") - def test_creates_new_directory(self, tmp_path: pathlib.Path): + def test_creates_new_directory(self, tmp_path: pathlib.Path) -> None: target = tmp_path.joinpath("a", "b", "c") assert not target.exists() store = self.store_cls(root=target) zarr.group(store=store) - def test_invalid_root_raises(self): + def test_invalid_root_raises(self) -> None: """ Test that a TypeError is raised when a non-str/Path type is used for the `root` argument """ @@ -63,9 +63,9 @@ def test_invalid_root_raises(self): TypeError, match=r"'root' must be a string or Path instance. 
Got an instance of instead.", ): - LocalStore(root=0) + LocalStore(root=0) # type: ignore[arg-type] - async def test_get_with_prototype_default(self, store: LocalStore): + async def test_get_with_prototype_default(self, store: LocalStore) -> None: """ Ensure that data can be read via ``store.get`` if the prototype keyword argument is unspecified, i.e. set to ``None``. """ diff --git a/tests/test_store/test_memory.py b/tests/test_store/test_memory.py index e520c7d054..a090f56951 100644 --- a/tests/test_store/test_memory.py +++ b/tests/test_store/test_memory.py @@ -1,12 +1,15 @@ from __future__ import annotations import re -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import numpy as np +import numpy.typing as npt import pytest import zarr +import zarr.core +import zarr.core.array from zarr.core.buffer import Buffer, cpu, gpu from zarr.storage import GpuMemoryStore, MemoryStore from zarr.testing.store import StoreTests @@ -31,16 +34,16 @@ async def get(self, store: MemoryStore, key: str) -> Buffer: return store._store_dict[key] @pytest.fixture(params=[None, True]) - def store_kwargs( - self, request: pytest.FixtureRequest - ) -> dict[str, str | dict[str, Buffer] | None]: - kwargs = {"store_dict": None} + def store_kwargs(self, request: pytest.FixtureRequest) -> dict[str, Any]: + kwargs: dict[str, Any] if request.param is True: - kwargs["store_dict"] = {} + kwargs = {"store_dict": {}} + else: + kwargs = {"store_dict": None} return kwargs @pytest.fixture - def store(self, store_kwargs: str | dict[str, Buffer] | None) -> MemoryStore: + async def store(self, store_kwargs: dict[str, Any]) -> MemoryStore: return self.store_cls(**store_kwargs) def test_store_repr(self, store: MemoryStore) -> None: @@ -55,13 +58,13 @@ def test_store_supports_listing(self, store: MemoryStore) -> None: def test_store_supports_partial_writes(self, store: MemoryStore) -> None: assert store.supports_partial_writes - def test_list_prefix(self, store: MemoryStore) -> 
None: + async def test_list_prefix(self, store: MemoryStore) -> None: assert True @pytest.mark.parametrize("dtype", ["uint8", "float32", "int64"]) @pytest.mark.parametrize("zarr_format", [2, 3]) async def test_deterministic_size( - self, store: MemoryStore, dtype, zarr_format: ZarrFormat + self, store: MemoryStore, dtype: npt.DTypeLike, zarr_format: ZarrFormat ) -> None: a = zarr.empty( store=store, @@ -85,23 +88,23 @@ class TestGpuMemoryStore(StoreTests[GpuMemoryStore, gpu.Buffer]): store_cls = GpuMemoryStore buffer_cls = gpu.Buffer - async def set(self, store: GpuMemoryStore, key: str, value: Buffer) -> None: + async def set(self, store: GpuMemoryStore, key: str, value: gpu.Buffer) -> None: # type: ignore[override] store._store_dict[key] = value async def get(self, store: MemoryStore, key: str) -> Buffer: return store._store_dict[key] @pytest.fixture(params=[None, True]) - def store_kwargs( - self, request: pytest.FixtureRequest - ) -> dict[str, str | dict[str, Buffer] | None]: - kwargs = {"store_dict": None} + def store_kwargs(self, request: pytest.FixtureRequest) -> dict[str, Any]: + kwargs: dict[str, Any] if request.param is True: - kwargs["store_dict"] = {} + kwargs = {"store_dict": {}} + else: + kwargs = {"store_dict": None} return kwargs @pytest.fixture - def store(self, store_kwargs: str | dict[str, gpu.Buffer] | None) -> GpuMemoryStore: + async def store(self, store_kwargs: dict[str, Any]) -> GpuMemoryStore: return self.store_cls(**store_kwargs) def test_store_repr(self, store: GpuMemoryStore) -> None: @@ -116,15 +119,15 @@ def test_store_supports_listing(self, store: GpuMemoryStore) -> None: def test_store_supports_partial_writes(self, store: GpuMemoryStore) -> None: assert store.supports_partial_writes - def test_list_prefix(self, store: GpuMemoryStore) -> None: + async def test_list_prefix(self, store: GpuMemoryStore) -> None: assert True def test_dict_reference(self, store: GpuMemoryStore) -> None: - store_dict = {} + store_dict: dict[str, Any] = {} 
result = GpuMemoryStore(store_dict=store_dict) assert result._store_dict is store_dict - def test_from_dict(self): + def test_from_dict(self) -> None: d = { "a": gpu.Buffer.from_bytes(b"aaaa"), "b": cpu.Buffer.from_bytes(b"bbbb"), diff --git a/tests/test_store/test_zip.py b/tests/test_store/test_zip.py index 0237258ab1..fa99ca61bd 100644 --- a/tests/test_store/test_zip.py +++ b/tests/test_store/test_zip.py @@ -11,6 +11,7 @@ import zarr from zarr.core.buffer import Buffer, cpu, default_buffer_prototype +from zarr.core.group import Group from zarr.storage import ZipStore from zarr.testing.store import StoreTests @@ -32,7 +33,7 @@ class TestZipStore(StoreTests[ZipStore, cpu.Buffer]): buffer_cls = cpu.Buffer @pytest.fixture - def store_kwargs(self, request) -> dict[str, str | bool]: + def store_kwargs(self) -> dict[str, str | bool]: fd, temp_path = tempfile.mkstemp() os.close(fd) os.unlink(temp_path) @@ -40,12 +41,14 @@ def store_kwargs(self, request) -> dict[str, str | bool]: return {"path": temp_path, "mode": "w", "read_only": False} async def get(self, store: ZipStore, key: str) -> Buffer: - return store._get(key, prototype=default_buffer_prototype()) + buf = store._get(key, prototype=default_buffer_prototype()) + assert buf is not None + return buf async def set(self, store: ZipStore, key: str, value: Buffer) -> None: return store._set(key, value) - def test_store_read_only(self, store: ZipStore, store_kwargs: dict[str, Any]) -> None: + def test_store_read_only(self, store: ZipStore) -> None: assert not store.read_only async def test_read_only_store_raises(self, store_kwargs: dict[str, Any]) -> None: @@ -109,7 +112,7 @@ def test_api_integration(self, store: ZipStore) -> None: async def test_store_open_read_only( self, store_kwargs: dict[str, Any], read_only: bool ) -> None: - if read_only == "r": + if read_only: # create an empty zipfile with zipfile.ZipFile(store_kwargs["path"], mode="w"): pass @@ -129,9 +132,11 @@ def test_externally_zipped_store(self, tmp_path: 
Path) -> None: zarr_path = tmp_path / "foo.zarr" root = zarr.open_group(store=zarr_path, mode="w") root.require_group("foo") - root["foo"]["bar"] = np.array([1]) - shutil.make_archive(zarr_path, "zip", zarr_path) + assert isinstance(foo := root["foo"], Group) # noqa: RUF018 + foo["bar"] = np.array([1]) + shutil.make_archive(str(zarr_path), "zip", zarr_path) zip_path = tmp_path / "foo.zarr.zip" zipped = zarr.open_group(ZipStore(zip_path, mode="r"), mode="r") assert list(zipped.keys()) == list(root.keys()) - assert list(zipped["foo"].keys()) == list(root["foo"].keys()) + assert isinstance(group := zipped["foo"], Group) + assert list(group.keys()) == list(group.keys()) From 629b4e5094d566cdc96aa187d0faccee9b28861f Mon Sep 17 00:00:00 2001 From: David Stansby Date: Wed, 14 May 2025 09:35:21 +0100 Subject: [PATCH 17/25] Allow no compressor for v2 arrays (#3039) * Allow no compressor for v2 arrays * Use typing aliases for compressors * Test v2 array w/ v3 codec errors * Add changelog entry * Update type comment * fix test names Co-authored-by: Davis Bennett --------- Co-authored-by: Davis Bennett --- changes/3039.bugfix.rst | 5 +++++ src/zarr/api/asynchronous.py | 13 +++++++++--- src/zarr/api/synchronous.py | 4 ++-- src/zarr/core/array.py | 38 +++++++++++++++++++++++++----------- src/zarr/core/metadata/v2.py | 10 +++++++--- tests/test_api.py | 19 ++++++++++++++++++ 6 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 changes/3039.bugfix.rst diff --git a/changes/3039.bugfix.rst b/changes/3039.bugfix.rst new file mode 100644 index 0000000000..be2b424cf5 --- /dev/null +++ b/changes/3039.bugfix.rst @@ -0,0 +1,5 @@ +It is now possible to specify no compressor when creating a zarr format 2 array. +This can be done by passing ``compressor=None`` to the various array creation routines. + +The default behaviour of automatically choosing a suitable default compressor remains if the compressor argument is not given. 
+To reproduce the behaviour in previous zarr-python versions when ``compressor=None`` was passed, pass ``compressor='auto'`` instead. diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index ac143f6dea..59261cca8a 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,7 +9,14 @@ import numpy.typing as npt from typing_extensions import deprecated -from zarr.core.array import Array, AsyncArray, create_array, from_array, get_array_metadata +from zarr.core.array import ( + Array, + AsyncArray, + CompressorLike, + create_array, + from_array, + get_array_metadata, +) from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArrayConfigParams from zarr.core.buffer import NDArrayLike from zarr.core.common import ( @@ -838,7 +845,7 @@ async def create( *, # Note: this is a change from v2 chunks: ChunkCoords | int | None = None, # TODO: v2 allowed chunks=True dtype: npt.DTypeLike | None = None, - compressor: dict[str, JSON] | None = None, # TODO: default and type change + compressor: CompressorLike = "auto", fill_value: Any | None = 0, # TODO: need type order: MemoryOrder | None = None, store: str | StoreLike | None = None, @@ -991,7 +998,7 @@ async def create( dtype = parse_dtype(dtype, zarr_format) if not filters: filters = _default_filters(dtype) - if not compressor: + if compressor == "auto": compressor = _default_compressor(dtype) elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 5662f5c247..24ab937db5 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -7,7 +7,7 @@ import zarr.api.asynchronous as async_api import zarr.core.array from zarr._compat import _deprecate_positional_args -from zarr.core.array import Array, AsyncArray +from zarr.core.array import Array, AsyncArray, CompressorLike from zarr.core.group import Group from zarr.core.sync import sync from 
zarr.core.sync_group import create_hierarchy @@ -599,7 +599,7 @@ def create( *, # Note: this is a change from v2 chunks: ChunkCoords | int | bool | None = None, dtype: npt.DTypeLike | None = None, - compressor: dict[str, JSON] | None = None, # TODO: default and type change + compressor: CompressorLike = "auto", fill_value: Any | None = 0, # TODO: need type order: MemoryOrder | None = None, store: str | StoreLike | None = None, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 9852bf8d5f..cf4c36cc22 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -102,6 +102,7 @@ T_ArrayMetadata, ) from zarr.core.metadata.v2 import ( + CompressorLikev2, _default_compressor, _default_filters, parse_compressor, @@ -303,7 +304,7 @@ async def create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, + compressor: CompressorLikev2 | Literal["auto"] = "auto", # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -394,7 +395,7 @@ async def create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, + compressor: CompressorLike = "auto", # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -429,7 +430,7 @@ async def create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, + compressor: CompressorLike = "auto", # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -570,7 +571,7 @@ async def _create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, + compressor: CompressorLike = "auto", # 
runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -604,7 +605,7 @@ async def _create( raise ValueError( "filters cannot be used for arrays with zarr_format 3. Use array-to-array codecs instead." ) - if compressor is not None: + if compressor != "auto": raise ValueError( "compressor cannot be used for arrays with zarr_format 3. Use bytes-to-bytes codecs instead." ) @@ -768,7 +769,7 @@ def _create_metadata_v2( dimension_separator: Literal[".", "/"] | None = None, fill_value: float | None = None, filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, - compressor: dict[str, JSON] | numcodecs.abc.Codec | None = None, + compressor: CompressorLikev2 = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: if dimension_separator is None: @@ -809,7 +810,7 @@ async def _create_v2( dimension_separator: Literal[".", "/"] | None = None, fill_value: float | None = None, filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, - compressor: dict[str, JSON] | numcodecs.abc.Codec | None = None, + compressor: CompressorLike = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata]: @@ -821,6 +822,17 @@ async def _create_v2( else: await ensure_no_existing_node(store_path, zarr_format=2) + compressor_parsed: CompressorLikev2 + if compressor == "auto": + compressor_parsed = _default_compressor(dtype) + elif isinstance(compressor, BytesBytesCodec): + raise ValueError( + "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. " + "Use a numcodecs codec directly instead." 
+ ) + else: + compressor_parsed = compressor + metadata = cls._create_metadata_v2( shape=shape, dtype=dtype, @@ -829,7 +841,7 @@ async def _create_v2( dimension_separator=dimension_separator, fill_value=fill_value, filters=filters, - compressor=compressor, + compressor=compressor_parsed, attributes=attributes, ) @@ -1751,7 +1763,7 @@ def create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, + compressor: CompressorLike = "auto", # runtime overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -1880,7 +1892,7 @@ def _create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, + compressor: CompressorLike = "auto", # runtime overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -3792,7 +3804,11 @@ def _get_default_codecs( | Literal["auto"] | None ) -CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | None +# Union of acceptable types for users to pass in for both v2 and v3 compressors +CompressorLike: TypeAlias = ( + dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] | None +) + CompressorsLike: TypeAlias = ( Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] | dict[str, JSON] diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index d19193963f..029a3e09a7 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -5,7 +5,7 @@ from collections.abc import Iterable, Sequence from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING, Any, TypedDict, cast +from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast import numcodecs.abc @@ -43,6 +43,10 @@ class ArrayV2MetadataDict(TypedDict): attributes: dict[str, JSON] +# Union of 
acceptable types for v2 compressors +CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None + + @dataclass(frozen=True, kw_only=True) class ArrayV2Metadata(Metadata): shape: ChunkCoords @@ -52,7 +56,7 @@ class ArrayV2Metadata(Metadata): order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: numcodecs.abc.Codec | None = None + compressor: CompressorLikev2 attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -65,7 +69,7 @@ def __init__( fill_value: Any, order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", - compressor: numcodecs.abc.Codec | dict[str, JSON] | None = None, + compressor: CompressorLikev2 = None, filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: diff --git a/tests/test_api.py b/tests/test_api.py index 9f03a1067a..d1912f7238 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING +import zarr.codecs + if TYPE_CHECKING: import pathlib @@ -1190,3 +1192,20 @@ def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None: # assert_array_equal doesn't check the type assert isinstance(result, type(src)) cp.testing.assert_array_equal(result, src[:10, :10]) + + +def test_v2_without_compressor() -> None: + # Make sure it's possible to set no compressor for v2 arrays + arr = zarr.create(store={}, shape=(1), dtype="uint8", zarr_format=2, compressor=None) + assert arr.compressors == () + + +def test_v2_with_v3_compressor() -> None: + # Check trying to create a v2 array with a v3 compressor fails + with pytest.raises( + ValueError, + match="Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. 
Use a numcodecs codec directly instead.", + ): + zarr.create( + store={}, shape=(1), dtype="uint8", zarr_format=2, compressor=zarr.codecs.BloscCodec() + ) From d615783f1588da8f555476e006622833743f15dc Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 14 May 2025 13:10:37 +0100 Subject: [PATCH 18/25] Avoid memory copy in obstore write (#2972) * Avoid memory copy in obstore write * Add as_bytes_like method to Buffer * Add changelog entry * No need to take unsigned bytes view following #2738 * Change method name to `as_buffer_like` --------- Co-authored-by: jakirkham Co-authored-by: Davis Bennett --- changes/2972.misc.rst | 1 + src/zarr/core/buffer/core.py | 13 +++++++++++++ src/zarr/storage/_local.py | 4 ++-- src/zarr/storage/_obstore.py | 4 ++-- 4 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 changes/2972.misc.rst diff --git a/changes/2972.misc.rst b/changes/2972.misc.rst new file mode 100644 index 0000000000..f0258c1d05 --- /dev/null +++ b/changes/2972.misc.rst @@ -0,0 +1 @@ +Avoid an unnecessary memory copy when writing Zarr with obstore diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index cfcd7e6633..94cd91f026 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -255,6 +255,19 @@ def as_numpy_array(self) -> npt.NDArray[Any]: """ ... + def as_buffer_like(self) -> BytesLike: + """Returns the buffer as an object that implements the Python buffer protocol. + + Notes + ----- + Might have to copy data, since the implementation uses `.as_numpy_array()`. + + Returns + ------- + An object that implements the Python buffer protocol + """ + return memoryview(self.as_numpy_array()) # type: ignore[arg-type] + def to_bytes(self) -> bytes: """Returns the buffer as `bytes` (host memory). 
diff --git a/src/zarr/storage/_local.py b/src/zarr/storage/_local.py index 85d244f17b..f2af75f43e 100644 --- a/src/zarr/storage/_local.py +++ b/src/zarr/storage/_local.py @@ -52,10 +52,10 @@ def _put( with path.open("r+b") as f: f.seek(start) # write takes any object supporting the buffer protocol - f.write(value.as_numpy_array()) # type: ignore[arg-type] + f.write(value.as_buffer_like()) return None else: - view = memoryview(value.as_numpy_array()) # type: ignore[arg-type] + view = value.as_buffer_like() if exclusive: mode = "xb" else: diff --git a/src/zarr/storage/_obstore.py b/src/zarr/storage/_obstore.py index 8c2469747d..738754a8b9 100644 --- a/src/zarr/storage/_obstore.py +++ b/src/zarr/storage/_obstore.py @@ -161,7 +161,7 @@ async def set(self, key: str, value: Buffer) -> None: self._check_writable() - buf = value.to_bytes() + buf = value.as_buffer_like() await obs.put_async(self.store, key, buf) async def set_if_not_exists(self, key: str, value: Buffer) -> None: @@ -169,7 +169,7 @@ async def set_if_not_exists(self, key: str, value: Buffer) -> None: import obstore as obs self._check_writable() - buf = value.to_bytes() + buf = value.as_buffer_like() with contextlib.suppress(obs.exceptions.AlreadyExistsError): await obs.put_async(self.store, key, buf, mode="create") From aa3341573b8397ce655058ec80335931131badbf Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Wed, 14 May 2025 20:26:43 +0200 Subject: [PATCH 19/25] Replace redundant list comprehension with generator (#3040) * Replace redundant list comprehension with generator * Partially revert There is no such thing as a "tuple comprehension": https://stackoverflow.com/questions/52285419/aggregating-an-async-generator-to-a-tuple#52285420 Fixes CI error: FAILED tests/test_group.py::test_create_hierarchy_existing_nodes[zarr2-async-array-memory] - TypeError: 'async_generator' object is not iterable --- tests/test_store/test_core.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py index 1ac410954b..4b1858afb5 100644 --- a/tests/test_store/test_core.py +++ b/tests/test_store/test_core.py @@ -203,7 +203,7 @@ def test_valid() -> None: Test that path normalization works as expected """ paths = ["a", "b", "c", "d", "", "//a///b//"] - assert _normalize_paths(paths) == tuple([normalize_path(p) for p in paths]) + assert _normalize_paths(paths) == tuple(normalize_path(p) for p in paths) @staticmethod @pytest.mark.parametrize("paths", [("", "/"), ("///a", "a")]) From 882641519d8b79c32aa803b99e9cc70eccf6e066 Mon Sep 17 00:00:00 2001 From: Hannes Spitz <44113112+brokkoli71@users.noreply.github.com> Date: Thu, 15 May 2025 11:58:41 +0200 Subject: [PATCH 20/25] Additional testing for `AsyncArray`, `Array` (#3049) * remove duplicate metadata parsing * add test cases * add test cases * tests for different zarr_formats in test_storage_transformers * tests for different zarr_formats in test_storage_transformers * ignore mypy arg-type error for deprecation test * fix typing in tests * test_chunk_key_encoding * test_invalid_v2_arguments * test_array_repr * type annotation for parse_array_metadata * test_v2_and_v3_exist_at_same_path * remove duplicate check for dimension_separator in v3 * tests for invalid arguments in creation * format * revert typing * document changes --- changes/3049.misc.rst | 1 + src/zarr/api/asynchronous.py | 5 - src/zarr/core/array.py | 23 ++--- tests/test_api.py | 65 +++++++++---- tests/test_array.py | 178 +++++++++++++++++++++++++++++++---- 5 files changed, 213 insertions(+), 59 deletions(-) create mode 100644 changes/3049.misc.rst diff --git a/changes/3049.misc.rst b/changes/3049.misc.rst new file mode 100644 index 0000000000..79ecd6ed95 --- /dev/null +++ b/changes/3049.misc.rst @@ -0,0 +1 @@ +Added tests for ``AsyncArray``, ``Array`` and removed duplicate argument parsing. 
\ No newline at end of file diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 59261cca8a..cdedd5b033 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1019,11 +1019,6 @@ async def create( warnings.warn("object_codec is not yet implemented", RuntimeWarning, stacklevel=2) if read_only is not None: warnings.warn("read_only is not yet implemented", RuntimeWarning, stacklevel=2) - if dimension_separator is not None and zarr_format == 3: - raise ValueError( - "dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead" - ) - if order is not None: _warn_order_kwarg() if write_empty_chunks is not None: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index cf4c36cc22..78b5e92ed6 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -140,7 +140,8 @@ def parse_array_metadata(data: Any) -> ArrayMetadata: if isinstance(data, ArrayMetadata): return data elif isinstance(data, dict): - if data["zarr_format"] == 3: + zarr_format = data.get("zarr_format") + if zarr_format == 3: meta_out = ArrayV3Metadata.from_dict(data) if len(meta_out.storage_transformers) > 0: msg = ( @@ -149,9 +150,11 @@ def parse_array_metadata(data: Any) -> ArrayMetadata: ) raise ValueError(msg) return meta_out - elif data["zarr_format"] == 2: + elif zarr_format == 2: return ArrayV2Metadata.from_dict(data) - raise TypeError + else: + raise ValueError(f"Invalid zarr_format: {zarr_format}. 
Expected 2 or 3") + raise TypeError # pragma: no cover def create_codec_pipeline(metadata: ArrayMetadata) -> CodecPipeline: @@ -160,8 +163,7 @@ def create_codec_pipeline(metadata: ArrayMetadata) -> CodecPipeline: elif isinstance(metadata, ArrayV2Metadata): v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) return get_pipeline_class().from_codecs([v2_codec]) - else: - raise TypeError + raise TypeError # pragma: no cover async def get_array_metadata( @@ -268,17 +270,6 @@ def __init__( store_path: StorePath, config: ArrayConfigLike | None = None, ) -> None: - if isinstance(metadata, dict): - zarr_format = metadata["zarr_format"] - # TODO: remove this when we extensively type the dict representation of metadata - _metadata = cast(dict[str, JSON], metadata) - if zarr_format == 2: - metadata = ArrayV2Metadata.from_dict(_metadata) - elif zarr_format == 3: - metadata = ArrayV3Metadata.from_dict(_metadata) - else: - raise ValueError(f"Invalid zarr_format: {zarr_format}. 
Expected 2 or 3") - metadata_parsed = parse_array_metadata(metadata) config_parsed = parse_array_config(config) diff --git a/tests/test_api.py b/tests/test_api.py index d1912f7238..6904f91fe7 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import TYPE_CHECKING import zarr.codecs @@ -72,13 +73,19 @@ def test_create(memory_store: Store) -> None: # TODO: parametrize over everything this function takes @pytest.mark.parametrize("store", ["memory"], indirect=True) -def test_create_array(store: Store) -> None: +def test_create_array(store: Store, zarr_format: ZarrFormat) -> None: attrs: dict[str, JSON] = {"foo": 100} # explicit type annotation to avoid mypy error shape = (10, 10) path = "foo" data_val = 1 array_w = create_array( - store, name=path, shape=shape, attributes=attrs, chunks=shape, dtype="uint8" + store, + name=path, + shape=shape, + attributes=attrs, + chunks=shape, + dtype="uint8", + zarr_format=zarr_format, ) array_w[:] = data_val assert array_w.shape == shape @@ -87,18 +94,27 @@ def test_create_array(store: Store) -> None: @pytest.mark.parametrize("write_empty_chunks", [True, False]) -def test_write_empty_chunks_warns(write_empty_chunks: bool) -> None: +def test_write_empty_chunks_warns(write_empty_chunks: bool, zarr_format: ZarrFormat) -> None: """ Test that using the `write_empty_chunks` kwarg on array access will raise a warning. 
""" match = "The `write_empty_chunks` keyword argument .*" with pytest.warns(RuntimeWarning, match=match): _ = zarr.array( - data=np.arange(10), shape=(10,), dtype="uint8", write_empty_chunks=write_empty_chunks + data=np.arange(10), + shape=(10,), + dtype="uint8", + write_empty_chunks=write_empty_chunks, + zarr_format=zarr_format, ) with pytest.warns(RuntimeWarning, match=match): - _ = zarr.create(shape=(10,), dtype="uint8", write_empty_chunks=write_empty_chunks) + _ = zarr.create( + shape=(10,), + dtype="uint8", + write_empty_chunks=write_empty_chunks, + zarr_format=zarr_format, + ) @pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"]) @@ -115,18 +131,18 @@ def test_open_normalized_path( assert node.path == normalize_path(path) -async def test_open_array(memory_store: MemoryStore) -> None: +async def test_open_array(memory_store: MemoryStore, zarr_format: ZarrFormat) -> None: store = memory_store # open array, create if doesn't exist - z = open(store=store, shape=100) + z = open(store=store, shape=100, zarr_format=zarr_format) assert isinstance(z, Array) assert z.shape == (100,) # open array, overwrite # store._store_dict = {} store = MemoryStore() - z = open(store=store, shape=200) + z = open(store=store, shape=200, zarr_format=zarr_format) assert isinstance(z, Array) assert z.shape == (200,) @@ -140,7 +156,16 @@ async def test_open_array(memory_store: MemoryStore) -> None: # path not found with pytest.raises(FileNotFoundError): - open(store="doesnotexist", mode="r") + open(store="doesnotexist", mode="r", zarr_format=zarr_format) + + +@pytest.mark.parametrize("store", ["memory", "local", "zip"], indirect=True) +def test_v2_and_v3_exist_at_same_path(store: Store) -> None: + zarr.create_array(store, shape=(10,), dtype="uint8", zarr_format=3) + zarr.create_array(store, shape=(10,), dtype="uint8", zarr_format=2) + msg = f"Both zarr.json (Zarr format 3) and .zarray (Zarr format 2) metadata objects exist at {store}. Zarr v3 will be used." 
+ with pytest.warns(UserWarning, match=re.escape(msg)): + zarr.open(store=store, mode="r") @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -163,9 +188,9 @@ async def test_open_group(memory_store: MemoryStore) -> None: assert "foo" in g # open group, overwrite - # g = open_group(store=store) - # assert isinstance(g, Group) - # assert "foo" not in g + g = open_group(store=store, mode="w") + assert isinstance(g, Group) + assert "foo" not in g # open group, read-only store_cls = type(store) @@ -308,7 +333,6 @@ def test_open_with_mode_w_minus(tmp_path: pathlib.Path) -> None: zarr.open(store=tmp_path, mode="w-") -@pytest.mark.parametrize("zarr_format", [2, 3]) def test_array_order(zarr_format: ZarrFormat) -> None: arr = zarr.ones(shape=(2, 2), order=None, zarr_format=zarr_format) expected = zarr.config.get("array.order") @@ -324,7 +348,6 @@ def test_array_order(zarr_format: ZarrFormat) -> None: @pytest.mark.parametrize("order", ["C", "F"]) -@pytest.mark.parametrize("zarr_format", [2, 3]) def test_array_order_warns(order: MemoryOrder | None, zarr_format: ZarrFormat) -> None: with pytest.warns(RuntimeWarning, match="The `order` keyword argument .*"): arr = zarr.ones(shape=(2, 2), order=order, zarr_format=zarr_format) @@ -1095,13 +1118,16 @@ def test_open_falls_back_to_open_group() -> None: assert group.attrs == {"key": "value"} -async def test_open_falls_back_to_open_group_async() -> None: +async def test_open_falls_back_to_open_group_async(zarr_format: ZarrFormat) -> None: # https://github.com/zarr-developers/zarr-python/issues/2309 store = MemoryStore() - await zarr.api.asynchronous.open_group(store, attributes={"key": "value"}) + await zarr.api.asynchronous.open_group( + store, attributes={"key": "value"}, zarr_format=zarr_format + ) group = await zarr.api.asynchronous.open(store=store) assert isinstance(group, zarr.core.group.AsyncGroup) + assert group.metadata.zarr_format == zarr_format assert group.attrs == {"key": "value"} @@ -1137,13 +1163,14 @@ 
async def test_metadata_validation_error() -> None: ["local", "memory", "zip"], indirect=True, ) -def test_open_array_with_mode_r_plus(store: Store) -> None: +def test_open_array_with_mode_r_plus(store: Store, zarr_format: ZarrFormat) -> None: # 'r+' means read/write (must exist) with pytest.raises(FileNotFoundError): - zarr.open_array(store=store, mode="r+") - zarr.ones(store=store, shape=(3, 3)) + zarr.open_array(store=store, mode="r+", zarr_format=zarr_format) + zarr.ones(store=store, shape=(3, 3), zarr_format=zarr_format) z2 = zarr.open_array(store=store, mode="r+") assert isinstance(z2, Array) + assert z2.metadata.zarr_format == zarr_format result = z2[:] assert isinstance(result, NDArrayLike) assert (result == 1).all() diff --git a/tests/test_array.py b/tests/test_array.py index 989fe30592..eb19f0e7f3 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -41,6 +41,7 @@ from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer from zarr.core.chunk_grids import _auto_partition +from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv @@ -51,7 +52,7 @@ if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike - from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.core.metadata.v2 import ArrayV2Metadata @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @@ -227,10 +228,13 @@ def test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str assert arr.fill_value.dtype == arr.dtype -def test_create_positional_args_deprecated() -> None: - store = MemoryStore() - with pytest.warns(FutureWarning, match="Pass"): - zarr.Array.create(store, (2, 2), dtype="f8") +async def test_create_deprecated() -> None: + with pytest.warns(DeprecationWarning): + with 
pytest.warns(FutureWarning, match=re.escape("Pass shape=(2, 2) as keyword args")): + await zarr.AsyncArray.create(MemoryStore(), (2, 2), dtype="f8") # type: ignore[call-overload] + with pytest.warns(DeprecationWarning): + with pytest.warns(FutureWarning, match=re.escape("Pass shape=(2, 2) as keyword args")): + zarr.Array.create(MemoryStore(), (2, 2), dtype="f8") def test_selection_positional_args_deprecated() -> None: @@ -321,24 +325,47 @@ def test_serializable_sync_array(store: LocalStore, zarr_format: ZarrFormat) -> @pytest.mark.parametrize("store", ["memory"], indirect=True) -def test_storage_transformers(store: MemoryStore) -> None: +@pytest.mark.parametrize("zarr_format", [2, 3, "invalid"]) +def test_storage_transformers(store: MemoryStore, zarr_format: ZarrFormat | str) -> None: """ Test that providing an actual storage transformer produces a warning and otherwise passes through """ - metadata_dict: dict[str, JSON] = { - "zarr_format": 3, - "node_type": "array", - "shape": (10,), - "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, - "data_type": "uint8", - "chunk_key_encoding": {"name": "v2", "configuration": {"separator": "/"}}, - "codecs": (BytesCodec().to_dict(),), - "fill_value": 0, - "storage_transformers": ({"test": "should_raise"}), - } - match = "Arrays with storage transformers are not supported in zarr-python at this time." 
- with pytest.raises(ValueError, match=match): + metadata_dict: dict[str, JSON] + if zarr_format == 3: + metadata_dict = { + "zarr_format": 3, + "node_type": "array", + "shape": (10,), + "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, + "data_type": "uint8", + "chunk_key_encoding": {"name": "v2", "configuration": {"separator": "/"}}, + "codecs": (BytesCodec().to_dict(),), + "fill_value": 0, + "storage_transformers": ({"test": "should_raise"}), + } + else: + metadata_dict = { + "zarr_format": zarr_format, + "shape": (10,), + "chunks": (1,), + "dtype": "uint8", + "dimension_separator": ".", + "codecs": (BytesCodec().to_dict(),), + "fill_value": 0, + "order": "C", + "storage_transformers": ({"test": "should_raise"}), + } + if zarr_format == 3: + match = "Arrays with storage transformers are not supported in zarr-python at this time." + with pytest.raises(ValueError, match=match): + Array.from_dict(StorePath(store), data=metadata_dict) + elif zarr_format == 2: + # no warning Array.from_dict(StorePath(store), data=metadata_dict) + else: + match = f"Invalid zarr_format: {zarr_format}. Expected 2 or 3" + with pytest.raises(ValueError, match=match): + Array.from_dict(StorePath(store), data=metadata_dict) @pytest.mark.parametrize("test_cls", [Array, AsyncArray[Any]]) @@ -1106,6 +1133,111 @@ async def test_v3_chunk_encoding( assert arr.filters == filters_expected assert arr.compressors == compressors_expected + @staticmethod + @pytest.mark.parametrize("name", ["v2", "default", "invalid"]) + @pytest.mark.parametrize("separator", [".", "/"]) + async def test_chunk_key_encoding( + name: str, separator: Literal[".", "/"], zarr_format: ZarrFormat, store: MemoryStore + ) -> None: + chunk_key_encoding = ChunkKeyEncodingParams(name=name, separator=separator) # type: ignore[typeddict-item] + error_msg = "" + if name == "invalid": + error_msg = "Unknown chunk key encoding." 
+ if zarr_format == 2 and name == "default": + error_msg = "Invalid chunk key encoding. For Zarr format 2 arrays, the `name` field of the chunk key encoding must be 'v2'." + if error_msg: + with pytest.raises(ValueError, match=re.escape(error_msg)): + arr = await create_array( + store=store, + dtype="uint8", + shape=(10,), + chunks=(1,), + zarr_format=zarr_format, + chunk_key_encoding=chunk_key_encoding, + ) + else: + arr = await create_array( + store=store, + dtype="uint8", + shape=(10,), + chunks=(1,), + zarr_format=zarr_format, + chunk_key_encoding=chunk_key_encoding, + ) + if isinstance(arr.metadata, ArrayV2Metadata): + assert arr.metadata.dimension_separator == separator + + @staticmethod + @pytest.mark.parametrize( + ("kwargs", "error_msg"), + [ + ({"serializer": "bytes"}, "Zarr format 2 arrays do not support `serializer`."), + ({"dimension_names": ["test"]}, "Zarr format 2 arrays do not support dimension names."), + ], + ) + async def test_create_array_invalid_v2_arguments( + kwargs: dict[str, Any], error_msg: str, store: MemoryStore + ) -> None: + with pytest.raises(ValueError, match=re.escape(error_msg)): + await zarr.api.asynchronous.create_array( + store=store, dtype="uint8", shape=(10,), chunks=(1,), zarr_format=2, **kwargs + ) + + @staticmethod + @pytest.mark.parametrize( + ("kwargs", "error_msg"), + [ + ( + {"dimension_names": ["test"]}, + "dimension_names cannot be used for arrays with zarr_format 2.", + ), + ( + {"chunk_key_encoding": {"name": "default", "separator": "/"}}, + "chunk_key_encoding cannot be used for arrays with zarr_format 2. Use dimension_separator instead.", + ), + ( + {"codecs": "bytes"}, + "codecs cannot be used for arrays with zarr_format 2. 
Use filters and compressor instead.", ), ], ) + async def test_create_invalid_v2_arguments( + kwargs: dict[str, Any], error_msg: str, store: MemoryStore + ) -> None: + with pytest.raises(ValueError, match=re.escape(error_msg)): + await zarr.api.asynchronous.create( + store=store, dtype="uint8", shape=(10,), chunks=(1,), zarr_format=2, **kwargs + ) + + @staticmethod + @pytest.mark.parametrize( + ("kwargs", "error_msg"), + [ + ( + {"chunk_shape": (1,), "chunks": (2,)}, + "Only one of chunk_shape or chunks can be provided.", + ), + ( + {"dimension_separator": "/"}, + "dimension_separator cannot be used for arrays with zarr_format 3. Use chunk_key_encoding instead.", + ), + ( + {"filters": []}, + "filters cannot be used for arrays with zarr_format 3. Use array-to-array codecs instead", + ), + ( + {"compressor": "blosc"}, + "compressor cannot be used for arrays with zarr_format 3. Use bytes-to-bytes codecs instead", + ), + ], + ) + async def test_invalid_v3_arguments( + kwargs: dict[str, Any], error_msg: str, store: MemoryStore + ) -> None: + kwargs.setdefault("chunks", (1,)) + with pytest.raises(ValueError, match=re.escape(error_msg)): + zarr.create(store=store, dtype="uint8", shape=(10,), zarr_format=3, **kwargs) + @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) @pytest.mark.parametrize( @@ -1585,3 +1717,11 @@ async def test_sharding_coordinate_selection() -> None: result = arr[1, [0, 1]] # type: ignore[index] assert isinstance(result, NDArrayLike) assert (result == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() + + +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +def test_array_repr(store: Store) -> None: + shape = (2, 3, 4) + dtype = "uint8" + arr = zarr.create_array(store, shape=shape, dtype=dtype) + assert str(arr) == f"<Array {store} shape={shape} dtype={dtype}>" From ee1d70f9b036e3827dd6287a294e73d86bfed86a Mon Sep 17 00:00:00 2001 From: David Stansby Date: Fri, 16 May 2025 10:24:06 +0100 Subject: [PATCH 21/25] Update pre-commit
hooks (#3058) * Update pre-commit hooks * Update type checking code --- .pre-commit-config.yaml | 4 ++-- pyproject.toml | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 80743a5dec..fd50366a1c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ ci: default_stages: [pre-commit, pre-push] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.9 + rev: v0.11.9 hooks: - id: ruff args: ["--fix", "--show-fixes"] @@ -40,7 +40,7 @@ repos: - hypothesis - s3fs - repo: https://github.com/scientific-python/cookie - rev: 2025.01.22 + rev: 2025.05.02 hooks: - id: sp-repo-review - repo: https://github.com/pre-commit/pygrep-hooks diff --git a/pyproject.toml b/pyproject.toml index 033c9dc114..f1c290e1b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -310,7 +310,7 @@ extend-select = [ "RUF", "SIM", # flake8-simplify "SLOT", # flake8-slots - "TCH", # flake8-type-checking + "TC", # flake8-type-checking "TRY", # tryceratops "UP", # pyupgrade "W", # pycodestyle warnings @@ -338,6 +338,7 @@ ignore = [ "Q003", "COM812", "COM819", + "TC006", ] [tool.ruff.lint.extend-per-file-ignores] From d57fbf78a83689d7886676a2f52a3d7a52462942 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Fri, 16 May 2025 17:35:16 +0100 Subject: [PATCH 22/25] Don't compress data in hypothesis store testing (#3063) * Don't compress data in hypothesis testing * Add comment --- src/zarr/testing/stateful.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/zarr/testing/stateful.py b/src/zarr/testing/stateful.py index acc5f63f19..f4f7b33318 100644 --- a/src/zarr/testing/stateful.py +++ b/src/zarr/testing/stateful.py @@ -17,6 +17,7 @@ import zarr from zarr import Array from zarr.abc.store import Store +from zarr.codecs.bytes import BytesCodec from zarr.core.buffer import Buffer, BufferPrototype, cpu, default_buffer_prototype from zarr.core.sync import SyncMixin 
from zarr.storage import LocalStore, MemoryStore @@ -108,7 +109,15 @@ def add_array( assume(self.can_add(path)) note(f"Adding array: path='{path}' shape={array.shape} chunks={chunks}") for store in [self.store, self.model]: - zarr.array(array, chunks=chunks, path=path, store=store, fill_value=fill_value) + zarr.array( + array, + chunks=chunks, + path=path, + store=store, + fill_value=fill_value, + # Chose bytes codec to avoid wasting time compressing the data being written + codecs=[BytesCodec()], + ) self.all_arrays.add(path) # @precondition(lambda self: bool(self.all_groups)) From a941224ab51dfb8ca98f9a53afb91f3b6d2c6955 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 16 May 2025 13:23:27 -0400 Subject: [PATCH 23/25] feat: add `print_debug_info` function (#2913) * feat: add function * doc: add docstring * doc: add change log for print_debug_info * Update .github/ISSUE_TEMPLATE/bug_report.yml * feat: expand debug printout + better test * feat: debug - also print out rich * test: print debug test for upstream * feat: better formatting for print_debug_info * restore original issue template --------- Co-authored-by: Davis Bennett Co-authored-by: David Stansby --- changes/2913.feature.rst | 1 + src/zarr/__init__.py | 49 ++++++++++++++++++++++++++++++++++++++++ tests/test_zarr.py | 18 +++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 changes/2913.feature.rst diff --git a/changes/2913.feature.rst b/changes/2913.feature.rst new file mode 100644 index 0000000000..e0bfcba791 --- /dev/null +++ b/changes/2913.feature.rst @@ -0,0 +1 @@ +Added a `print_debug_info` function for bug reports. diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 31796601b3..0d58ecf8e8 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -37,6 +37,54 @@ # in case setuptools scm screw up and find version to be 0.0.0 assert not __version__.startswith("0.0.0") + +def print_debug_info() -> None: + """ + Print version info for use in bug reports. 
+ """ + import platform + from importlib.metadata import version + + def print_packages(packages: list[str]) -> None: + not_installed = [] + for package in packages: + try: + print(f"{package}: {version(package)}") + except ModuleNotFoundError: + not_installed.append(package) + if not_installed: + print("\n**Not Installed:**") + for package in not_installed: + print(package) + + required = [ + "packaging", + "numpy", + "numcodecs", + "typing_extensions", + "donfig", + ] + optional = [ + "botocore", + "cupy-cuda12x", + "fsspec", + "numcodecs", + "s3fs", + "gcsfs", + "universal-pathlib", + "rich", + "obstore", + ] + + print(f"platform: {platform.platform()}") + print(f"python: {platform.python_version()}") + print(f"zarr: {__version__}\n") + print("**Required dependencies:**") + print_packages(required) + print("\n**Optional dependencies:**") + print_packages(optional) + + __all__ = [ "Array", "AsyncArray", @@ -67,6 +115,7 @@ "open_consolidated", "open_group", "open_like", + "print_debug_info", "save", "save_array", "save_group", diff --git a/tests/test_zarr.py b/tests/test_zarr.py index 2aa62e4231..f49873132e 100644 --- a/tests/test_zarr.py +++ b/tests/test_zarr.py @@ -1,3 +1,5 @@ +import pytest + import zarr @@ -9,3 +11,19 @@ def test_exports() -> None: for export in __all__: getattr(zarr, export) + + +def test_print_debug_info(capsys: pytest.CaptureFixture[str]) -> None: + """ + Ensure that print_debug_info does not raise an error + """ + from importlib.metadata import version + + from zarr import __version__, print_debug_info + + print_debug_info() + captured = capsys.readouterr() + # test that at least some of what we expect is + # printed out + assert f"zarr: {__version__}" in captured.out + assert f"numpy: {version('numpy')}" in captured.out From dd161df93ec908b5f3ecad03b0f18916885c1f90 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Mon, 19 May 2025 11:27:02 +0100 Subject: [PATCH 24/25] Fix overwrite modes (#3062) * Fix overwrite modes * Add many more 
tests that data doesn't disappear * Add bugfix entry. * Fix function name --- changes/3062.bugfix.rst | 1 + src/zarr/api/asynchronous.py | 3 +- src/zarr/api/synchronous.py | 1 + tests/test_api.py | 81 ++++++++++++++++++++++++++++++++---- 4 files changed, 76 insertions(+), 10 deletions(-) create mode 100644 changes/3062.bugfix.rst diff --git a/changes/3062.bugfix.rst b/changes/3062.bugfix.rst new file mode 100644 index 0000000000..9e1e52ddb7 --- /dev/null +++ b/changes/3062.bugfix.rst @@ -0,0 +1 @@ +Using various functions to open data with ``mode='a'`` no longer deletes existing data in the store. diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index cdedd5b033..4f3c9c3f8f 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -88,7 +88,7 @@ _READ_MODES: tuple[AccessModeLiteral, ...] = ("r", "r+", "a") _CREATE_MODES: tuple[AccessModeLiteral, ...] = ("a", "w", "w-") -_OVERWRITE_MODES: tuple[AccessModeLiteral, ...] = ("a", "r+", "w") +_OVERWRITE_MODES: tuple[AccessModeLiteral, ...] = ("w",) def _infer_overwrite(mode: AccessModeLiteral) -> bool: @@ -817,7 +817,6 @@ async def open_group( warnings.warn("chunk_store is not yet implemented", RuntimeWarning, stacklevel=2) store_path = await make_store_path(store, mode=mode, storage_options=storage_options, path=path) - if attributes is None: attributes = {} diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 24ab937db5..d4b652ad6e 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -858,6 +858,7 @@ def create_array( Ignored otherwise. overwrite : bool, default False Whether to overwrite an array with the same name in the store, if one exists. + If `True`, all existing paths in the store will be deleted. config : ArrayConfigLike, optional Runtime configuration for the array. 
write_data : bool diff --git a/tests/test_api.py b/tests/test_api.py index 6904f91fe7..ae112756c5 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING import zarr.codecs +import zarr.storage if TYPE_CHECKING: import pathlib @@ -11,6 +12,7 @@ from zarr.abc.store import Store from zarr.core.common import JSON, MemoryOrder, ZarrFormat +import contextlib import warnings from typing import Literal @@ -27,9 +29,9 @@ create, create_array, create_group, + from_array, group, load, - open, open_group, save, save_array, @@ -41,6 +43,10 @@ from zarr.storage._utils import normalize_path from zarr.testing.utils import gpu_test +if TYPE_CHECKING: + from collections.abc import Callable + from pathlib import Path + def test_create(memory_store: Store) -> None: store = memory_store @@ -135,28 +141,28 @@ async def test_open_array(memory_store: MemoryStore, zarr_format: ZarrFormat) -> store = memory_store # open array, create if doesn't exist - z = open(store=store, shape=100, zarr_format=zarr_format) + z = zarr.api.synchronous.open(store=store, shape=100, zarr_format=zarr_format) assert isinstance(z, Array) assert z.shape == (100,) # open array, overwrite # store._store_dict = {} store = MemoryStore() - z = open(store=store, shape=200, zarr_format=zarr_format) + z = zarr.api.synchronous.open(store=store, shape=200, zarr_format=zarr_format) assert isinstance(z, Array) assert z.shape == (200,) # open array, read-only store_cls = type(store) ro_store = await store_cls.open(store_dict=store._store_dict, read_only=True) - z = open(store=ro_store, mode="r") + z = zarr.api.synchronous.open(store=ro_store, mode="r") assert isinstance(z, Array) assert z.shape == (200,) assert z.read_only # path not found with pytest.raises(FileNotFoundError): - open(store="doesnotexist", mode="r", zarr_format=zarr_format) + zarr.api.synchronous.open(store="doesnotexist", mode="r", zarr_format=zarr_format) @pytest.mark.parametrize("store", ["memory", "local", 
"zip"], indirect=True) @@ -233,12 +239,12 @@ def test_save(store: Store, n_args: int, n_kwargs: int) -> None: save(store) elif n_args == 1 and n_kwargs == 0: save(store, *args) - array = open(store) + array = zarr.api.synchronous.open(store) assert isinstance(array, Array) assert_array_equal(array[:], data) else: save(store, *args, **kwargs) # type: ignore [arg-type] - group = open(store) + group = zarr.api.synchronous.open(store) assert isinstance(group, Group) for array in group.array_values(): assert_array_equal(array[:], data) @@ -1077,7 +1083,7 @@ def test_tree() -> None: def test_open_positional_args_deprecated() -> None: store = MemoryStore() with pytest.warns(FutureWarning, match="pass"): - open(store, "w", shape=(1,)) + zarr.api.synchronous.open(store, "w", shape=(1,)) def test_save_array_positional_args_deprecated() -> None: @@ -1236,3 +1242,62 @@ def test_v2_with_v3_compressor() -> None: zarr.create( store={}, shape=(1), dtype="uint8", zarr_format=2, compressor=zarr.codecs.BloscCodec() ) + + +def add_empty_file(path: Path) -> Path: + fpath = path / "a.txt" + fpath.touch() + return fpath + + +@pytest.mark.parametrize("create_function", [create_array, from_array]) +@pytest.mark.parametrize("overwrite", [True, False]) +def test_no_overwrite_array(tmp_path: Path, create_function: Callable, overwrite: bool) -> None: # type:ignore[type-arg] + store = zarr.storage.LocalStore(tmp_path) + existing_fpath = add_empty_file(tmp_path) + + assert existing_fpath.exists() + create_function(store=store, data=np.ones(shape=(1,)), overwrite=overwrite) + if overwrite: + assert not existing_fpath.exists() + else: + assert existing_fpath.exists() + + +@pytest.mark.parametrize("create_function", [create_group, group]) +@pytest.mark.parametrize("overwrite", [True, False]) +def test_no_overwrite_group(tmp_path: Path, create_function: Callable, overwrite: bool) -> None: # type:ignore[type-arg] + store = zarr.storage.LocalStore(tmp_path) + existing_fpath = add_empty_file(tmp_path) 
+ + assert existing_fpath.exists() + create_function(store=store, overwrite=overwrite) + if overwrite: + assert not existing_fpath.exists() + else: + assert existing_fpath.exists() + + +@pytest.mark.parametrize("open_func", [zarr.open, open_group]) +@pytest.mark.parametrize("mode", ["r", "r+", "a", "w", "w-"]) +def test_no_overwrite_open(tmp_path: Path, open_func: Callable, mode: str) -> None: # type:ignore[type-arg] + store = zarr.storage.LocalStore(tmp_path) + existing_fpath = add_empty_file(tmp_path) + + assert existing_fpath.exists() + with contextlib.suppress(FileExistsError, FileNotFoundError): + open_func(store=store, mode=mode) + if mode == "w": + assert not existing_fpath.exists() + else: + assert existing_fpath.exists() + + +def test_no_overwrite_load(tmp_path: Path) -> None: + store = zarr.storage.LocalStore(tmp_path) + existing_fpath = add_empty_file(tmp_path) + + assert existing_fpath.exists() + with contextlib.suppress(NotImplementedError): + zarr.load(store) + assert existing_fpath.exists() From 57107260291fc9c6f32c95346c321b8a28a6b6e8 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Mon, 19 May 2025 15:13:07 +0100 Subject: [PATCH 25/25] Changelog for 3.0.8 (#3071) * Changelog for 3.0.8 * Add warning to top of release notes * fix warning --- changes/2862.bugfix.rst | 1 - changes/2913.feature.rst | 1 - changes/2972.misc.rst | 1 - changes/2978.bugfix.rst | 1 - changes/2998.bugfix.md | 1 - changes/3027.misc.rst | 1 - changes/3039.bugfix.rst | 5 ----- changes/3045.bugfix.rst | 1 - changes/3049.misc.rst | 1 - changes/3062.bugfix.rst | 1 - docs/release-notes.rst | 36 ++++++++++++++++++++++++++++++++++++ 11 files changed, 36 insertions(+), 14 deletions(-) delete mode 100644 changes/2862.bugfix.rst delete mode 100644 changes/2913.feature.rst delete mode 100644 changes/2972.misc.rst delete mode 100644 changes/2978.bugfix.rst delete mode 100644 changes/2998.bugfix.md delete mode 100644 changes/3027.misc.rst delete mode 100644 changes/3039.bugfix.rst delete 
mode 100644 changes/3045.bugfix.rst delete mode 100644 changes/3049.misc.rst delete mode 100644 changes/3062.bugfix.rst diff --git a/changes/2862.bugfix.rst b/changes/2862.bugfix.rst deleted file mode 100644 index bbe6f0746e..0000000000 --- a/changes/2862.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix a bug that prevented the number of initialized chunks being counted properly. \ No newline at end of file diff --git a/changes/2913.feature.rst b/changes/2913.feature.rst deleted file mode 100644 index e0bfcba791..0000000000 --- a/changes/2913.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Added a `print_debug_info` function for bug reports. diff --git a/changes/2972.misc.rst b/changes/2972.misc.rst deleted file mode 100644 index f0258c1d05..0000000000 --- a/changes/2972.misc.rst +++ /dev/null @@ -1 +0,0 @@ -Avoid an unnecessary memory copy when writing Zarr with obstore diff --git a/changes/2978.bugfix.rst b/changes/2978.bugfix.rst deleted file mode 100644 index fe9f3d3f64..0000000000 --- a/changes/2978.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fixed sharding with GPU buffers. diff --git a/changes/2998.bugfix.md b/changes/2998.bugfix.md deleted file mode 100644 index 7b94223122..0000000000 --- a/changes/2998.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Fix structured `dtype` fill value serialization for consolidated metadata \ No newline at end of file diff --git a/changes/3027.misc.rst b/changes/3027.misc.rst deleted file mode 100644 index ffbfe9b808..0000000000 --- a/changes/3027.misc.rst +++ /dev/null @@ -1 +0,0 @@ -Simplified scalar indexing of size-1 arrays. \ No newline at end of file diff --git a/changes/3039.bugfix.rst b/changes/3039.bugfix.rst deleted file mode 100644 index be2b424cf5..0000000000 --- a/changes/3039.bugfix.rst +++ /dev/null @@ -1,5 +0,0 @@ -It is now possible to specify no compressor when creating a zarr format 2 array. -This can be done by passing ``compressor=None`` to the various array creation routines. 
- -The default behaviour of automatically choosing a suitable default compressor remains if the compressor argument is not given. -To reproduce the behaviour in previous zarr-python versions when ``compressor=None`` was passed, pass ``compressor='auto'`` instead. diff --git a/changes/3045.bugfix.rst b/changes/3045.bugfix.rst deleted file mode 100644 index a3886717a7..0000000000 --- a/changes/3045.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fixed the typing of ``dimension_names`` arguments throughout so that it now accepts iterables that contain `None` alongside `str`. diff --git a/changes/3049.misc.rst b/changes/3049.misc.rst deleted file mode 100644 index 79ecd6ed95..0000000000 --- a/changes/3049.misc.rst +++ /dev/null @@ -1 +0,0 @@ -Added tests for ``AsyncArray``, ``Array`` and removed duplicate argument parsing. \ No newline at end of file diff --git a/changes/3062.bugfix.rst b/changes/3062.bugfix.rst deleted file mode 100644 index 9e1e52ddb7..0000000000 --- a/changes/3062.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Using various functions to open data with ``mode='a'`` no longer deletes existing data in the store. diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 341a32c364..f8b00f83e7 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -3,6 +3,42 @@ Release notes .. towncrier release notes start +3.0.8 (2025-05-19) +------------------ + +.. warning:: + + In versions 3.0.0 to 3.0.7, opening arrays or groups with ``mode='a'`` (the default for many built-in functions) + would cause any existing paths in the store to be deleted. This is fixed in 3.0.8, and + we recommend all users upgrade to avoid this bug that could cause unintentional data loss. + +Features +~~~~~~~~ + +- Added a ``print_debug_info`` function for bug reports. (:issue:`2913`) + + +Bugfixes +~~~~~~~~ + +- Fix a bug that prevented the number of initialized chunks being counted properly. (:issue:`2862`) +- Fixed sharding with GPU buffers.
(:issue:`2978`) +- Fix structured ``dtype`` fill value serialization for consolidated metadata. (:issue:`2998`) +- It is now possible to specify no compressor when creating a zarr format 2 array. + This can be done by passing ``compressor=None`` to the various array creation routines. + + The default behaviour of automatically choosing a suitable default compressor remains if the compressor argument is not given. + To reproduce the behaviour in previous zarr-python versions when ``compressor=None`` was passed, pass ``compressor='auto'`` instead. (:issue:`3039`) +- Fixed the typing of ``dimension_names`` arguments throughout so that it now accepts iterables that contain ``None`` alongside ``str``. (:issue:`3045`) +- Using various functions to open data with ``mode='a'`` no longer deletes existing data in the store. (:issue:`3062`) + + +Misc +~~~~ + +- :issue:`2972`, :issue:`3027`, :issue:`3049` + + 3.0.7 (2025-04-22) ------------------