diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 723c995cef..9b64c97d0a 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -4,6 +4,6 @@ TODO: * [ ] Add unit tests and/or doctests in docstrings * [ ] Add docstrings and API docs for any new/modified user-facing classes and functions * [ ] New/modified features documented in `docs/user-guide/*.rst` -* [ ] Changes documented in `docs/release-notes.rst` +* [ ] Changes documented as a new file in `changes/` * [ ] GitHub Actions have all passed * [ ] Test coverage is 100% (Codecov passes) diff --git a/.github/labeler.yml b/.github/labeler.yml index f186216099..f2529dfef5 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,2 +1,4 @@ needs release notes: -- all: ['!docs/release-notes.rst'] + - all: + - changed-files: + - any-glob-to-any-file: '!changes/*.rst' diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml index b13da7d36f..c7056a2c4b 100644 --- a/.github/workflows/gpu_test.yml +++ b/.github/workflows/gpu_test.yml @@ -64,3 +64,9 @@ jobs: - name: Run Tests run: | hatch env run --env gputest.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} run-coverage + + - name: Upload coverage + uses: codecov/codecov-action@13ce06bfc6bbe3ecf90edbbf1bc32fe5978ca1d3 # v5.3.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true # optional (default = false) diff --git a/.github/workflows/needs_release_notes.yml b/.github/workflows/needs_release_notes.yml index f37c6349d4..7a6c5462b4 100644 --- a/.github/workflows/needs_release_notes.yml +++ b/.github/workflows/needs_release_notes.yml @@ -4,8 +4,11 @@ on: - pull_request_target jobs: - triage: + labeler: if: ${{ github.event.pull_request.user.login != 'dependabot[bot]' }} && ${{ github.event.pull_request.user.login != 'pre-commit-ci[bot]' }} + permissions: + contents: read + pull-requests: write runs-on: ubuntu-latest steps: - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0 diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index 1b23260c2e..c8903aa779 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -55,7 +55,7 @@ jobs: with: name: releases path: dist - - uses: pypa/gh-action-pypi-publish@v1.12.3 + - uses: pypa/gh-action-pypi-publish@v1.12.4 with: user: __token__ password: ${{ secrets.pypi_password }} diff --git a/.gitignore b/.gitignore index 5663f62d04..1b2b63e651 100644 --- a/.gitignore +++ b/.gitignore @@ -83,6 +83,7 @@ src/zarr/_version.py data/* src/fixture/ fixture/ +junit.xml .DS_Store tests/.hypothesis diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 28d1673652..908b0d5c28 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,3 +49,7 @@ repos: rev: v1.8.0 hooks: - id: numpydoc-validation + - repo: https://github.com/twisted/towncrier + rev: 23.11.0 + hooks: + - id: towncrier-check diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 32a3f0e4e1..6253a7196f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -4,6 +4,13 @@ build: os: ubuntu-22.04 tools: python: "3.12" + jobs: + pre_build: + - | + if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; + then + towncrier build --version Unreleased --yes; + fi sphinx: configuration: docs/conf.py diff --git a/changes/.gitignore b/changes/.gitignore new file mode 100644 index 0000000000..f935021a8f --- /dev/null +++ b/changes/.gitignore @@ -0,0 +1 @@ +!.gitignore diff --git 
a/changes/README.md b/changes/README.md new file mode 100644 index 0000000000..74ed9f94a9 --- /dev/null +++ b/changes/README.md @@ -0,0 +1,14 @@ +Writing a changelog entry +------------------------- + +Please put a new file in this directory named `xxxx.<type>.rst`, where + +- `xxxx` is the pull request number associated with this entry +- `<type>` is one of: + - feature + - bugfix + - doc + - removal + - misc + +Inside the file, please write a short description of what you have changed, and how it impacts users of `zarr-python`. diff --git a/docs/developers/contributing.rst b/docs/developers/contributing.rst index 71294826cb..220e24eced 100644 --- a/docs/developers/contributing.rst +++ b/docs/developers/contributing.rst @@ -98,7 +98,7 @@ you can do something like the following:: To verify that your development environment is working, you can run the unit tests for one of the test environments, e.g.:: - $ hatch env run --env test.py3.12-2.1-optional run + $ hatch env run --env test.py3.12-2.1-optional run-pytest Creating a branch ~~~~~~~~~~~~~~~~~ @@ -140,7 +140,7 @@ Zarr includes a suite of unit tests. The simplest way to run the unit tests is to activate your development environment (see `creating a development environment`_ above) and invoke:: - $ hatch env run --env test.py3.12-2.1-optional run + $ hatch env run --env test.py3.12-2.1-optional run-pytest All tests are automatically run via GitHub Actions for every pull request and must pass before code can be accepted. Test coverage is @@ -190,9 +190,13 @@ Both unit tests and docstring doctests are included when computing coverage. Run $ hatch env run --env test.py3.12-2.1-optional run-coverage -will automatically run the test suite with coverage and produce a coverage report. +will automatically run the test suite with coverage and produce an XML coverage report. This should be 100% before code can be accepted into the main code base. +You can also generate an HTML coverage report by running:: + + $ hatch env run --env test.py3.12-2.1-optional run-coverage-html + When submitting a pull request, coverage will also be collected across all supported Python versions via the Codecov service, and will be reported back within the pull request. Codecov coverage must also be 100% before code can be accepted. @@ -212,8 +216,8 @@ The documentation consists both of prose and API documentation. All user-facing and functions are included in the API documentation, under the ``docs/api`` folder using the `autodoc <https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html>`_ extension to sphinx. Any new features or important usage information should be included in the -user-guide (``docs/user-guide``). Any changes should also be included in the release -notes (``docs/release-notes.rst``). +user-guide (``docs/user-guide``). Any changes should also be included as a new file in the +:file:`changes` directory. The documentation can be built locally by running:: @@ -331,11 +335,9 @@ Release procedure Pre-release """"""""""" -1. Make sure that all pull requests which will be - included in the release have been properly documented in - :file:`docs/release-notes.rst`. -2. Rename the "Unreleased" section heading in :file:`docs/release-notes.rst` - to the version you are about to release. +1. Make sure that all pull requests which will be included in the release + have been properly documented as changelog files in :file:`changes`. +2. Run ``towncrier build --version x.y.z`` to create the changelog. Releasing """"""""" @@ -348,7 +350,7 @@ appropriate suffix (e.g. `v0.0.0a1` or `v0.0.0rc2`). 
Set the description of the release to:: - See release notes https://zarr.readthedocs.io/en/stable/release.html#release-0-0-0 + See release notes https://zarr.readthedocs.io/en/stable/release-notes.html#release-0-0-0 replacing the correct version numbers. For pre-release versions, the URL should omit the pre-release suffix, e.g. "a1" or "rc1". diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 2c4b658d7b..08c64eb899 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -1,6 +1,63 @@ Release notes ============= +.. towncrier release notes start + +3.0.2 (2025-01-31) +------------------ + +Features +~~~~~~~~ + +- Test ``getsize()`` and ``getsize_prefix()`` in ``StoreTests``. (:issue:`2693`) +- Test that a ``ValueError`` is raised for invalid byte range syntax in ``StoreTests``. (:issue:`2693`) +- Separate instantiating and opening a store in ``StoreTests``. (:issue:`2693`) +- Add a test for using Stores as context managers in ``StoreTests``. (:issue:`2693`) +- Implemented ``LoggingStore.open()``. (:issue:`2693`) +- ``LoggingStore`` is now a generic class. (:issue:`2693`) +- Change StoreTest's ``test_store_repr``, ``test_store_supports_writes``, + ``test_store_supports_partial_writes``, and ``test_store_supports_listing`` + to be implemented using ``@abstractmethod``, rather than raising ``NotImplementedError``. (:issue:`2693`) +- Test the error raised for invalid buffer arguments in ``StoreTests``. (:issue:`2693`) +- Test that data can be written to a store that's not yet open using the store.set method in ``StoreTests``. (:issue:`2693`) +- Adds a new function ``init_array`` for initializing an array in storage, and refactors ``create_array`` + to use ``init_array``. ``create_array`` takes two new parameters: ``data``, an optional array-like object, and ``write_data``, a bool which defaults to ``True``. + If ``data`` is given to ``create_array``, then the ``dtype`` and ``shape`` attributes of ``data`` are used to define the + corresponding attributes of the resulting Zarr array. Additionally, if ``data`` is given and ``write_data`` is ``True``, + then the values in ``data`` will be written to the newly created array. (:issue:`2761`) + + +Bugfixes +~~~~~~~~ + +- Wrap sync fsspec filesystems with ``AsyncFileSystemWrapper``. (:issue:`2533`) +- Added backwards compatibility for Zarr format 2 structured arrays. (:issue:`2681`) +- Update equality for ``LoggingStore`` and ``WrapperStore`` such that 'other' must also be a ``LoggingStore`` or ``WrapperStore`` respectively, rather than only checking the types of the stores they wrap. (:issue:`2693`) +- Ensure that ``ZipStore`` is open before getting or setting any values. (:issue:`2693`) +- Use stdout rather than stderr as the default stream for ``LoggingStore``. (:issue:`2693`) +- Match the errors raised by read only stores in ``StoreTests``. (:issue:`2693`) +- Fixed ``ZipStore`` to make sure the correct attributes are saved when instances are pickled. + This fixes a previous bug that prevented using ``ZipStore`` with a ``ProcessPoolExecutor``. (:issue:`2762`) +- Updated the optional test dependencies to include ``botocore`` and ``fsspec``. (:issue:`2768`) +- Fixed the fsspec tests to skip if ``botocore`` is not installed. + Previously they would have failed with an import error. (:issue:`2768`) +- Optimize full chunk writes. (:issue:`2782`) + + +Improved Documentation +~~~~~~~~~~~~~~~~~~~~~~ + +- Changed the machinery for creating changelog entries. 
+ Now individual entries should be added as files to the `changes` directory in the `zarr-python` repository, instead of directly to the changelog file. (:issue:`2736`) + +Other +~~~~~ + +- Created a type alias ``ChunkKeyEncodingLike`` to model the union of ``ChunkKeyEncoding`` instances and the dict form of the + parameters of those instances. ``ChunkKeyEncodingLike`` should be used by high-level functions to provide a convenient + way for creating ``ChunkKeyEncoding`` objects. (:issue:`2763`) + + 3.0.1 (Jan. 17, 2025) --------------------- diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 871291b72b..3662f75dff 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -53,6 +53,7 @@ This is the current default configuration:: 'level': 0}}, 'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}], 'numeric': None, + 'raw': None, 'string': [{'id': 'vlen-utf8'}]}, 'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False, 'level': 0}, diff --git a/pyproject.toml b/pyproject.toml index c49778f285..8d73485dac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,10 +73,12 @@ test = [ "coverage", "pytest", "pytest-cov", + 'zarr[remote]', + "botocore", "s3fs", + "moto[s3,server]", "pytest-asyncio", "pytest-accept", - "moto[s3,server]", "requests", "rich", "mypy", @@ -85,6 +87,7 @@ test = [ ] optional = ["rich", "universal-pathlib"] docs = [ + # Doc building 'sphinx==8.1.3', 'sphinx-autobuild>=2021.3.14', 'sphinx-autoapi==3.4.0', @@ -94,6 +97,9 @@ docs = [ 'sphinx-reredirects', 'pydata-sphinx-theme', 'numpydoc', + # Changelog generation + 'towncrier', + # Optional dependencies to run examples 'numcodecs[msgpack]', 'rich', 's3fs', @@ -149,7 +155,9 @@ features = ["gpu"] [tool.hatch.envs.test.scripts] run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy" run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy" +run-coverage-html = "pytest --cov-config=pyproject.toml --cov=pkg --cov-report html --cov=src" run = "run-coverage --no-cov" +run-pytest = "run" run-verbose = "run-coverage --verbose" run-mypy = "mypy src" run-hypothesis = "pytest --hypothesis-profile ci tests/test_properties.py tests/test_store/test_stateful*" @@ -413,3 +421,9 @@ checks = [ "PR05", "PR06", ] + +[tool.towncrier] +directory = 'changes' +filename = "docs/release-notes.rst" +underlines = ["-", "~", "^"] +issue_format = ":issue:`{issue}`" diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index e6a5518a4b..96165f8ba0 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -176,10 +176,10 @@ async def get( Parameters ---------- key : str + prototype : BufferPrototype + The prototype of the output buffer. Stores may support a default buffer prototype. byte_range : ByteRequest, optional - ByteRequest may be one of the following. If not provided, all data associated with the key is retrieved. - - RangeByteRequest(int, int): Request a specific range of bytes in the form (start, end). The end is exclusive. If the given range is zero-length or starts after the end of the object, an error will be returned. Additionally, if the range ends after the end of the object, the entire remainder of the object will be returned. Otherwise, the exact requested range will be returned. - OffsetByteRequest(int): Request all bytes starting from a given byte offset. 
This is equivalent to bytes={int}- as an HTTP header. - SuffixByteRequest(int): Request the last int bytes. Note that here, int is the size of the request, not the byte offset. This is equivalent to bytes=-{int} as an HTTP header. @@ -200,6 +200,8 @@ async def get_partial_values( Parameters ---------- + prototype : BufferPrototype + The prototype of the output buffer. Stores may support a default buffer prototype. key_ranges : Iterable[tuple[str, tuple[int | None, int | None]]] Ordered set of key, range pairs, a key may occur multiple times with different ranges diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index f8bee9fcef..305446ec97 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -14,6 +14,7 @@ if TYPE_CHECKING: from collections.abc import Iterable + import numpy as np import numpy.typing as npt from zarr.abc.codec import Codec @@ -744,8 +745,9 @@ def create_array( store: str | StoreLike, *, name: str | None = None, - shape: ShapeLike, - dtype: npt.DTypeLike, + shape: ShapeLike | None = None, + dtype: npt.DTypeLike | None = None, + data: np.ndarray[Any, np.dtype[Any]] | None = None, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", @@ -772,10 +774,14 @@ def create_array( name : str or None, optional The name of the array within the store. If ``name`` is ``None``, the array will be located at the root of the store. - shape : ChunkCoords - Shape of the array. - dtype : npt.DTypeLike - Data type of the array. + shape : ChunkCoords, optional + Shape of the array. Can be ``None`` if ``data`` is provided. + dtype : npt.DTypeLike, optional + Data type of the array. Can be ``None`` if ``data`` is provided. + data : np.ndarray, optional + Array-like data to use for initializing the array. If this parameter is provided, the + ``shape`` and ``dtype`` parameters must be identical to ``data.shape`` and ``data.dtype``, + or ``None``. chunks : ChunkCoords, optional Chunk shape of the array. If not specified, default are guessed based on the shape and dtype. @@ -874,6 +880,7 @@ def create_array( name=name, shape=shape, dtype=dtype, + data=data, chunks=chunks, shards=shards, filters=filters, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 632e8221b4..4c444a81fa 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -412,7 +412,7 @@ async def create( # v3 only chunk_shape: ShapeLike | None = None, chunk_key_encoding: ( - ChunkKeyEncoding + ChunkKeyEncodingLike | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None @@ -453,7 +453,7 @@ async def create( The shape of the array's chunks Zarr format 3 only. Zarr format 2 arrays should use `chunks` instead. If not specified, default are guessed based on the shape and dtype. - chunk_key_encoding : ChunkKeyEncoding, optional + chunk_key_encoding : ChunkKeyEncodingLike, optional A specification of how the chunk keys are represented in storage. Zarr format 3 only. Zarr format 2 arrays should use `dimension_separator` instead. Default is ``("default", "/")``. 
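For reference, the ``ChunkKeyEncodingLike`` alias used in these signatures (defined in ``src/zarr/core/chunk_key_encodings.py`` later in this diff) accepts either a ``ChunkKeyEncoding`` instance or its dict form. A minimal sketch of the two equivalent spellings, assuming an in-memory store::

    import zarr
    from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding

    store = zarr.storage.MemoryStore()
    # dict form (ChunkKeyEncodingParams); "separator" may be omitted (NotRequired)
    a = zarr.create_array(
        store, name="a", shape=(10,), dtype="i4",
        chunk_key_encoding={"name": "default", "separator": "/"},
    )
    # instance form, equivalent to the dict form above
    b = zarr.create_array(
        store, name="b", shape=(10,), dtype="i4",
        chunk_key_encoding=DefaultChunkKeyEncoding(separator="/"),
    )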
@@ -553,7 +553,7 @@ async def _create( # v3 only chunk_shape: ShapeLike | None = None, chunk_key_encoding: ( - ChunkKeyEncoding + ChunkKeyEncodingLike | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None @@ -660,6 +660,48 @@ async def _create( return result + @staticmethod + def _create_metadata_v3( + shape: ShapeLike, + dtype: np.dtype[Any], + chunk_shape: ChunkCoords, + fill_value: Any | None = None, + chunk_key_encoding: ChunkKeyEncodingLike | None = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + attributes: dict[str, JSON] | None = None, + ) -> ArrayV3Metadata: + """ + Create an instance of ArrayV3Metadata. + """ + + shape = parse_shapelike(shape) + codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype)) + chunk_key_encoding_parsed: ChunkKeyEncodingLike + if chunk_key_encoding is None: + chunk_key_encoding_parsed = {"name": "default", "separator": "/"} + else: + chunk_key_encoding_parsed = chunk_key_encoding + + if dtype.kind in "UTS": + warn( + f"The dtype `{dtype}` is currently not part of the Zarr format 3 specification. It " + "may not be supported by other zarr implementations and may change in the future.", + category=UserWarning, + stacklevel=2, + ) + chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape) + return ArrayV3Metadata( + shape=shape, + data_type=dtype, + chunk_grid=chunk_grid_parsed, + chunk_key_encoding=chunk_key_encoding_parsed, + fill_value=fill_value, + codecs=codecs, + dimension_names=tuple(dimension_names) if dimension_names else None, + attributes=attributes or {}, + ) + @classmethod async def _create_v3( cls, @@ -671,7 +713,7 @@ async def _create_v3( config: ArrayConfig, fill_value: Any | None = None, chunk_key_encoding: ( - ChunkKeyEncoding + ChunkKeyEncodingLike | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None @@ -689,13 +731,6 @@ async def _create_v3( else: await ensure_no_existing_node(store_path, zarr_format=3) - shape = parse_shapelike(shape) - codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype)) - - if chunk_key_encoding is None: - chunk_key_encoding = ("default", "/") - assert chunk_key_encoding is not None - if isinstance(chunk_key_encoding, tuple): chunk_key_encoding = ( V2ChunkKeyEncoding(separator=chunk_key_encoding[1]) @@ -703,29 +738,58 @@ async def _create_v3( else DefaultChunkKeyEncoding(separator=chunk_key_encoding[1]) ) - if dtype.kind in "UTS": - warn( - f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. 
It " - "may not be supported by other zarr implementations and may change in the future.", - category=UserWarning, - stacklevel=2, - ) - - metadata = ArrayV3Metadata( + metadata = cls._create_metadata_v3( shape=shape, - data_type=dtype, - chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), - chunk_key_encoding=chunk_key_encoding, + dtype=dtype, + chunk_shape=chunk_shape, fill_value=fill_value, + chunk_key_encoding=chunk_key_encoding, codecs=codecs, - dimension_names=tuple(dimension_names) if dimension_names else None, - attributes=attributes or {}, + dimension_names=dimension_names, + attributes=attributes, ) array = cls(metadata=metadata, store_path=store_path, config=config) await array._save_metadata(metadata, ensure_parents=True) return array + @staticmethod + def _create_metadata_v2( + shape: ChunkCoords, + dtype: np.dtype[Any], + chunks: ChunkCoords, + order: MemoryOrder, + dimension_separator: Literal[".", "/"] | None = None, + fill_value: float | None = None, + filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + compressor: dict[str, JSON] | numcodecs.abc.Codec | None = None, + attributes: dict[str, JSON] | None = None, + ) -> ArrayV2Metadata: + if dimension_separator is None: + dimension_separator = "." + + dtype = parse_dtype(dtype, zarr_format=2) + + # inject VLenUTF8 for str dtype if not already present + if np.issubdtype(dtype, np.str_): + filters = filters or [] + from numcodecs.vlen import VLenUTF8 + + if not any(isinstance(x, VLenUTF8) or x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [VLenUTF8()] + + return ArrayV2Metadata( + shape=shape, + dtype=np.dtype(dtype), + chunks=chunks, + order=order, + dimension_separator=dimension_separator, + fill_value=fill_value, + compressor=compressor, + filters=filters, + attributes=attributes, + ) + @classmethod async def _create_v2( cls, @@ -751,30 +815,18 @@ async def _create_v2( else: await ensure_no_existing_node(store_path, zarr_format=2) - if dimension_separator is None: - dimension_separator = "." - - dtype = parse_dtype(dtype, zarr_format=2) - - # inject VLenUTF8 for str dtype if not already present - if np.issubdtype(dtype, np.str_): - filters = filters or [] - from numcodecs.vlen import VLenUTF8 - - if not any(isinstance(x, VLenUTF8) or x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [VLenUTF8()] - - metadata = ArrayV2Metadata( + metadata = cls._create_metadata_v2( shape=shape, - dtype=np.dtype(dtype), + dtype=dtype, chunks=chunks, order=order, dimension_separator=dimension_separator, fill_value=fill_value, - compressor=compressor, filters=filters, + compressor=compressor, attributes=attributes, ) + array = cls(metadata=metadata, store_path=store_path, config=config) await array._save_metadata(metadata, ensure_parents=True) return array @@ -1708,7 +1760,7 @@ def create( The shape of the Array's chunks. Zarr format 3 only. Zarr format 2 arrays should use `chunks` instead. If not specified, default are guessed based on the shape and dtype. - chunk_key_encoding : ChunkKeyEncoding, optional + chunk_key_encoding : ChunkKeyEncodingLike, optional A specification of how the chunk keys are represented in storage. Zarr format 3 only. Zarr format 2 arrays should use `dimension_separator` instead. Default is ``("default", "/")``. 
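The ``data`` and ``write_data`` parameters introduced in the hunks below let a single call create an array and write its initial contents. A short usage sketch, assuming an in-memory store and that the synchronous wrapper forwards ``write_data`` as the release notes describe::

    import numpy as np
    import zarr

    store = zarr.storage.MemoryStore()
    data = np.arange(10, dtype="i4")
    # shape and dtype are inferred from ``data``
    arr = zarr.create_array(store, name="x", data=data)
    assert arr[:].tolist() == data.tolist()
    # write_data=False persists only the metadata; no chunk data is written
    empty = zarr.create_array(store, name="y", data=data, write_data=False)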
@@ -3741,10 +3793,9 @@ class ShardsConfigParam(TypedDict): ShardsLike: TypeAlias = ChunkCoords | ShardsConfigParam | Literal["auto"] -async def create_array( - store: str | StoreLike, +async def init_array( *, - name: str | None = None, + store_path: StorePath, shape: ShapeLike, dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", @@ -3756,21 +3807,16 @@ async def create_array( order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, + chunk_key_encoding: ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, - storage_options: dict[str, Any] | None = None, overwrite: bool = False, - config: ArrayConfig | ArrayConfigLike | None = None, -) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """Create an array. +) -> ArrayV3Metadata | ArrayV2Metadata: + """Create and persist an array metadata document. Parameters ---------- - store : str or Store - Store or path to directory in file system or name of zip file. - name : str or None, optional - The name of the array within the store. If ``name`` is ``None``, the array will be located - at the root of the store. + store_path : StorePath + StorePath instance. The path attribute is the name of the array to initialize. shape : ChunkCoords Shape of the array. dtype : npt.DTypeLike @@ -3834,37 +3880,20 @@ async def create_array( The zarr format to use when saving. attributes : dict, optional Attributes for the array. - chunk_key_encoding : ChunkKeyEncoding, optional + chunk_key_encoding : ChunkKeyEncodingLike, optional A specification of how the chunk keys are represented in storage. For Zarr format 3, the default is ``{"name": "default", "separator": "/"}}``. For Zarr format 2, the default is ``{"name": "v2", "separator": "."}}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). Zarr format 3 only. Zarr format 2 arrays should not use this parameter. - storage_options : dict, optional - If using an fsspec URL to create the store, these will be passed to the backend implementation. - Ignored otherwise. overwrite : bool, default False Whether to overwrite an array with the same name in the store, if one exists. - config : ArrayConfig or ArrayConfigLike, optional - Runtime configuration for the array. Returns ------- - AsyncArray - The array. - - Examples - -------- - >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') - >>> async_arr = await zarr.api.asynchronous.create_array( - >>> store=store, - >>> shape=(100,100), - >>> chunks=(10,10), - >>> dtype='i4', - >>> fill_value=0) - + ArrayV3Metadata | ArrayV2Metadata + The array metadata document. 
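+
+    Examples
+    --------
+    A minimal sketch, assuming an in-memory store::
+
+        >>> from zarr.storage import MemoryStore, StorePath
+        >>> meta = await init_array(
+        >>>     store_path=StorePath(MemoryStore()),
+        >>>     shape=(100,),
+        >>>     dtype='uint8')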
""" if zarr_format is None: @@ -3872,20 +3901,25 @@ async def create_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - mode: Literal["a"] = "a" dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) - config_parsed = parse_array_config(config) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format ) - store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options) + + if overwrite: + if store_path.store.supports_deletes: + await store_path.delete_dir() + else: + await ensure_no_existing_node(store_path, zarr_format=zarr_format) + else: + await ensure_no_existing_node(store_path, zarr_format=zarr_format) + shard_shape_parsed, chunk_shape_parsed = _auto_partition( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, dtype=dtype_parsed ) chunks_out: tuple[int, ...] - result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] - + meta: ArrayV2Metadata | ArrayV3Metadata if zarr_format == 2: if shard_shape_parsed is not None: msg = ( @@ -3908,8 +3942,7 @@ async def create_array( else: order_parsed = order - result = await AsyncArray._create_v2( - store_path=store_path, + meta = AsyncArray._create_metadata_v2( shape=shape_parsed, dtype=dtype_parsed, chunks=chunk_shape_parsed, @@ -3919,8 +3952,6 @@ async def create_array( filters=filters_parsed, compressor=compressor_parsed, attributes=attributes, - overwrite=overwrite, - config=config_parsed, ) else: array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3( @@ -3951,25 +3982,199 @@ async def create_array( chunks_out = chunk_shape_parsed codecs_out = sub_codecs - result = await AsyncArray._create_v3( - store_path=store_path, + meta = AsyncArray._create_metadata_v3( shape=shape_parsed, dtype=dtype_parsed, fill_value=fill_value, - attributes=attributes, chunk_shape=chunks_out, chunk_key_encoding=chunk_key_encoding_parsed, codecs=codecs_out, dimension_names=dimension_names, - overwrite=overwrite, - config=config_parsed, + attributes=attributes, ) + # save the metadata to disk + # TODO: make this easier -- it should be a simple function call that takes a {key: buffer} + coros = ( + (store_path / key).set(value) + for key, value in meta.to_buffer_dict(default_buffer_prototype()).items() + ) + await gather(*coros) + return meta + + +async def create_array( + store: str | StoreLike, + *, + name: str | None = None, + shape: ShapeLike | None = None, + dtype: npt.DTypeLike | None = None, + data: np.ndarray[Any, np.dtype[Any]] | None = None, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ShardsLike | None = None, + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", + serializer: SerializerLike = "auto", + fill_value: Any | None = None, + order: MemoryOrder | None = None, + zarr_format: ZarrFormat | None = 3, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncodingLike | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | ArrayConfigLike | None = None, + write_data: bool = True, +) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Create an array. + + Parameters + ---------- + store : str or Store + Store or path to directory in file system or name of zip file. + name : str or None, optional + The name of the array within the store. 
If ``name`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords, optional + Shape of the array. Can be ``None`` if ``data`` is provided. + dtype : npt.DTypeLike | None + Data type of the array. Can be ``None`` if ``data`` is provided. + data : np.ndarray, optional + Array-like data to use for initializing the array. If this parameter is provided, the + ``shape`` and ``dtype`` parameters must be identical to ``data.shape`` and ``data.dtype``, + or ``None``. + chunks : ChunkCoords, optional + Chunk shape of the array. + If not specified, default are guessed based on the shape and dtype. + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v3_default_filters`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that + the order of your filters is consistent with the behavior of each filter. + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. + If no ``compressors`` are provided, a default set of compressors will be used. + These defaults can be changed by modifying the value of ``array.v3_default_compressors`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default compressors. + + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. + If no ``compressor`` is provided, a default compressor will be used. + This default can be changed by modifying the value of ``array.v2_default_compressor`` + in :mod:`zarr.core.config`. + Use ``None`` to omit the default compressor. + serializer : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. + If no ``serializer`` is provided, a default serializer will be used. + These defaults can be changed by modifying the value of ``array.v3_default_serializer`` + in :mod:`zarr.core.config`. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + The memory order of the array (default is "C"). + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. + If no ``order`` is provided, a default order will be used. 
+ This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncodingLike, optional + A specification of how the chunk keys are represented in storage. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. + write_data : bool + If a pre-existing array-like object was provided to this function via the ``data`` parameter + then ``write_data`` determines whether the values in that array-like object should be + written to the Zarr array created by this function. If ``write_data`` is ``False``, then the + array will be left empty. + + Returns + ------- + AsyncArray + The array. + + Examples + -------- + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> async_arr = await zarr.api.asynchronous.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='i4', + >>> fill_value=0) + + """ + mode: Literal["a"] = "a" + config_parsed = parse_array_config(config) + store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options) + + data_parsed, shape_parsed, dtype_parsed = _parse_data_params( + data=data, shape=shape, dtype=dtype + ) + meta = await init_array( + store_path=store_path, + shape=shape_parsed, + dtype=dtype_parsed, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + overwrite=overwrite, + ) + + result = AsyncArray(metadata=meta, store_path=store_path, config=config_parsed) + if write_data is True and data_parsed is not None: + await result._set_selection( + BasicIndexer(..., shape=result.shape, chunk_grid=result.metadata.chunk_grid), + data_parsed, + prototype=default_buffer_prototype(), + ) + return result def _parse_chunk_key_encoding( - data: ChunkKeyEncoding | ChunkKeyEncodingLike | None, zarr_format: ZarrFormat + data: ChunkKeyEncodingLike | None, zarr_format: ZarrFormat ) -> ChunkKeyEncoding: """ Take an implicit specification of a chunk key encoding and parse it into a ChunkKeyEncoding object. @@ -4149,3 +4354,48 @@ def _parse_deprecated_compressor( elif zarr_format == 2 and compressor == compressors == "auto": compressors = ({"id": "blosc"},) return compressors + + +def _parse_data_params( + *, + data: np.ndarray[Any, np.dtype[Any]] | None, + shape: ShapeLike | None, + dtype: npt.DTypeLike | None, +) -> tuple[np.ndarray[Any, np.dtype[Any]] | None, ShapeLike, npt.DTypeLike]: + """ + Ensure an array-like ``data`` parameter is consistent with the ``dtype`` and ``shape`` + parameters. 
+ """ + if data is None: + if shape is None: + msg = ( + "The data parameter was set to None, but shape was not specified. " + "Either provide a value for data, or specify shape." + ) + raise ValueError(msg) + shape_out = shape + if dtype is None: + msg = ( + "The data parameter was set to None, but dtype was not specified." + "Either provide an array-like value for data, or specify dtype." + ) + raise ValueError(msg) + dtype_out = dtype + else: + if shape is not None: + msg = ( + "The data parameter was used, but the shape parameter was also " + "used. This is an error. Either use the data parameter, or the shape parameter, " + "but not both." + ) + raise ValueError(msg) + shape_out = data.shape + if dtype is not None: + msg = ( + "The data parameter was used, but the dtype parameter was also " + "used. This is an error. Either use the data parameter, or the dtype parameter, " + "but not both." + ) + raise ValueError(msg) + dtype_out = data.dtype + return data, shape_out, dtype_out diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 85a7351fc7..ccab103e0f 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -470,7 +470,9 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool: # every single time we have to write data? _data, other = np.broadcast_arrays(self._data, other) return np.array_equal( - self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False + self._data, + other, + equal_nan=equal_nan if self._data.dtype.kind not in "USTOV" else False, ) def fill(self, value: Any) -> None: diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 95ce9108f3..103472c3b4 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -2,7 +2,10 @@ from abc import abstractmethod from dataclasses import dataclass -from typing import Literal, TypedDict, cast +from typing import TYPE_CHECKING, Literal, TypeAlias, TypedDict, cast + +if TYPE_CHECKING: + from typing import NotRequired from zarr.abc.metadata import Metadata from zarr.core.common import ( @@ -20,9 +23,9 @@ def parse_separator(data: JSON) -> SeparatorLiteral: return cast(SeparatorLiteral, data) -class ChunkKeyEncodingLike(TypedDict): +class ChunkKeyEncodingParams(TypedDict): name: Literal["v2", "default"] - separator: SeparatorLiteral + separator: NotRequired[SeparatorLiteral] @dataclass(frozen=True) @@ -36,9 +39,7 @@ def __init__(self, *, separator: SeparatorLiteral) -> None: object.__setattr__(self, "separator", separator_parsed) @classmethod - def from_dict( - cls, data: dict[str, JSON] | ChunkKeyEncoding | ChunkKeyEncodingLike - ) -> ChunkKeyEncoding: + def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncodingLike) -> ChunkKeyEncoding: if isinstance(data, ChunkKeyEncoding): return data @@ -46,6 +47,9 @@ def from_dict( if "name" in data and "separator" in data: data = {"name": data["name"], "configuration": {"separator": data["separator"]}} + # TODO: remove this cast when we are statically typing the JSON metadata completely. 
+ data = cast(dict[str, JSON], data) + # configuration is optional for chunk key encodings name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False) if name_parsed == "default": @@ -73,6 +77,9 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: pass +ChunkKeyEncodingLike: TypeAlias = ChunkKeyEncodingParams | ChunkKeyEncoding + + @dataclass(frozen=True) class DefaultChunkKeyEncoding(ChunkKeyEncoding): name: Literal["default"] = "default" diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 7920d220a4..051e8c68e1 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -75,6 +75,7 @@ def reset(self) -> None: "numeric": None, "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], + "raw": None, }, "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, "v3_default_serializer": { diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index f1226821ba..733b2464ac 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1373,10 +1373,13 @@ def is_total_slice(item: Selection, shape: ChunkCoords) -> bool: item = (item,) if isinstance(item, tuple): return all( - isinstance(dim_sel, slice) - and ( - (dim_sel == slice(None)) - or ((dim_sel.stop - dim_sel.start == dim_len) and (dim_sel.step in [1, None])) + (isinstance(dim_sel, int) and dim_len == 1) + or ( + isinstance(dim_sel, slice) + and ( + (dim_sel == slice(None)) + or ((dim_sel.stop - dim_sel.start == dim_len) and (dim_sel.step in [1, None])) + ) ) for dim_sel, dim_len in zip(item, shape, strict=False) ) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 29cf15a119..192db5b203 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -193,7 +193,14 @@ def to_dict(self) -> dict[str, JSON]: zarray_dict["fill_value"] = fill_value _ = zarray_dict.pop("dtype") - zarray_dict["dtype"] = self.dtype.str + dtype_json: JSON + # In Zarr format 2 metadata, a plain void dtype (e.g. '|V16') is stored as a string, while structured dtypes are stored as their field descriptors + dtype_descr = self.dtype.descr + if self.dtype.kind == "V" and len(dtype_descr) != 0 and dtype_descr[0][0] != "": + dtype_json = tuple(self.dtype.descr) + else: + dtype_json = self.dtype.str + zarray_dict["dtype"] = dtype_json return zarray_dict @@ -220,6 +227,8 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: + if isinstance(data, list): # a list of fields is a valid _VoidDTypeLike; coerce entries to tuples for np.dtype + data = [tuple(d) for d in data] return np.dtype(data) @@ -376,8 +385,10 @@ def _default_filters( dtype_key = "numeric" elif dtype.kind in "U": dtype_key = "string" - elif dtype.kind in "OSV": + elif dtype.kind in "OS": dtype_key = "bytes" + elif dtype.kind == "V": + dtype_key = "raw" else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 087dbd8bfc..9154762648 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -27,7 +27,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid -from zarr.core.chunk_key_encodings import ChunkKeyEncoding +from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike from zarr.core.common import ( JSON, ZARR_JSON, @@ -253,7 +253,7 @@ def __init__( self, shape: Iterable[int], data_type: npt.DTypeLike | DataType, chunk_grid: 
dict[str, JSON] | ChunkGrid, - chunk_key_encoding: dict[str, JSON] | ChunkKeyEncoding, + chunk_key_encoding: ChunkKeyEncodingLike, fill_value: Any, codecs: Iterable[Codec | dict[str, JSON]], attributes: dict[str, JSON] | None, diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index 99c8c778e7..c30c9b601b 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -10,6 +10,7 @@ Store, SuffixByteRequest, ) +from zarr.core.buffer import Buffer from zarr.storage._common import _dereference_path if TYPE_CHECKING: @@ -17,7 +18,7 @@ from fsspec.asyn import AsyncFileSystem - from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.buffer import BufferPrototype from zarr.core.common import BytesLike @@ -172,6 +173,17 @@ def from_url( opts = {"asynchronous": True, **opts} fs, path = url_to_fs(url, **opts) + if not fs.async_impl: + try: + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + fs = AsyncFileSystemWrapper(fs) + except ImportError as e: + raise ImportError( + f"The filesystem for URL '{url}' is synchronous, and the required " + "AsyncFileSystemWrapper is not available. Upgrade fsspec to version " + "2024.12.0 or later to enable this functionality." + ) from e # fsspec is not consistent about removing the scheme from the path, so check and strip it here # https://github.com/fsspec/filesystem_spec/issues/1722 @@ -253,6 +265,10 @@ async def set( if not self._is_open: await self._open() self._check_writable() + if not isinstance(value, Buffer): + raise TypeError( + f"FsspecStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." + ) path = _dereference_path(self.path, key) # write data if byte_range: diff --git a/src/zarr/storage/_local.py b/src/zarr/storage/_local.py index 5eaa85c592..1defea26b4 100644 --- a/src/zarr/storage/_local.py +++ b/src/zarr/storage/_local.py @@ -96,7 +96,7 @@ def __init__(self, root: Path | str, *, read_only: bool = False) -> None: root = Path(root) if not isinstance(root, Path): raise TypeError( - f'"root" must be a string or Path instance. Got an object with type {type(root)} instead.' + f"'root' must be a string or Path instance. Got an instance of {type(root)} instead." ) self.root = root @@ -169,7 +169,9 @@ async def _set(self, key: str, value: Buffer, exclusive: bool = False) -> None: self._check_writable() assert isinstance(key, str) if not isinstance(value, Buffer): - raise TypeError("LocalStore.set(): `value` must a Buffer instance") + raise TypeError( + f"LocalStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." + ) path = self.root / key await asyncio.to_thread(_put, path, value, start=None, exclusive=exclusive) diff --git a/src/zarr/storage/_logging.py b/src/zarr/storage/_logging.py index 5ca716df2c..e9d6211588 100644 --- a/src/zarr/storage/_logging.py +++ b/src/zarr/storage/_logging.py @@ -2,10 +2,11 @@ import inspect import logging +import sys import time from collections import defaultdict from contextlib import contextmanager -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Self, TypeVar from zarr.abc.store import Store from zarr.storage._wrapper import WrapperStore @@ -18,8 +19,10 @@ counter: defaultdict[str, int] +T_Store = TypeVar("T_Store", bound=Store) -class LoggingStore(WrapperStore[Store]): + +class LoggingStore(WrapperStore[T_Store]): """ Store wrapper that logs all calls to the wrapped store. 
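Because ``LoggingStore`` is now generic over the wrapped store type, static checkers keep the inner store's concrete type. A brief sketch, assuming the synchronous ``zarr.storage`` exports::

    from zarr.storage import LoggingStore, MemoryStore

    inner = MemoryStore()
    # the type parameter records the wrapped store's concrete type
    wrapped: LoggingStore[MemoryStore] = LoggingStore(inner, log_level="INFO")
    # the new classmethod constructs and opens the wrapped store in one step:
    #     store = await LoggingStore.open(MemoryStore, log_level="INFO")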
@@ -42,7 +45,7 @@ class LoggingStore(WrapperStore[Store]): def __init__( self, - store: Store, + store: T_Store, log_level: str = "DEBUG", log_handler: logging.Handler | None = None, ) -> None: @@ -67,7 +70,7 @@ def _configure_logger( def _default_handler(self) -> logging.Handler: """Define a default log handler""" - handler = logging.StreamHandler() + handler = logging.StreamHandler(stream=sys.stdout) handler.setLevel(self.log_level) handler.setFormatter( logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") @@ -94,6 +97,14 @@ def log(self, hint: Any = "") -> Generator[None, None, None]: end_time = time.time() self.logger.info("Finished %s [%.2f s]", op, end_time - start_time) + @classmethod + async def open(cls: type[Self], store_cls: type[T_Store], *args: Any, **kwargs: Any) -> Self: + log_level = kwargs.pop("log_level", "DEBUG") + log_handler = kwargs.pop("log_handler", None) + store = store_cls(*args, **kwargs) + await store._open() + return cls(store=store, log_level=log_level, log_handler=log_handler) + @property def supports_writes(self) -> bool: with self.log(): @@ -126,8 +137,7 @@ def _is_open(self) -> bool: @_is_open.setter def _is_open(self, value: bool) -> None: - with self.log(value): - self._store._is_open = value + raise NotImplementedError("LoggingStore must be opened via the `_open` method") async def _open(self) -> None: with self.log(): @@ -151,11 +161,11 @@ def __str__(self) -> str: return f"logging-{self._store}" def __repr__(self) -> str: - return f"LoggingStore({repr(self._store)!r})" + return f"LoggingStore({self._store.__class__.__name__}, '{self._store}')" def __eq__(self, other: object) -> bool: with self.log(other): - return self._store == other + return type(self) is type(other) and self._store.__eq__(other._store) # type: ignore[attr-defined] async def get( self, diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index d35ecbe33d..b37fc8d5c9 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -111,7 +111,9 @@ async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None await self._ensure_open() assert isinstance(key, str) if not isinstance(value, Buffer): - raise TypeError(f"Expected Buffer. Got {type(value)}.") + raise TypeError( + f"MemoryStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." + ) if byte_range is not None: buf = self._store_dict[key] @@ -231,8 +233,9 @@ async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None self._check_writable() assert isinstance(key, str) if not isinstance(value, Buffer): - raise TypeError(f"Expected Buffer. Got {type(value)}.") - + raise TypeError( + f"GpuMemoryStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." 
+ ) # Convert to gpu.Buffer gpu_value = value if isinstance(value, gpu.Buffer) else gpu.Buffer.from_buffer(value) await super().set(key, gpu_value, byte_range=byte_range) diff --git a/src/zarr/storage/_wrapper.py b/src/zarr/storage/_wrapper.py index 255e965439..349048e495 100644 --- a/src/zarr/storage/_wrapper.py +++ b/src/zarr/storage/_wrapper.py @@ -56,6 +56,14 @@ async def _ensure_open(self) -> None: async def is_empty(self, prefix: str) -> bool: return await self._store.is_empty(prefix) + @property + def _is_open(self) -> bool: + return self._store._is_open + + @_is_open.setter + def _is_open(self, value: bool) -> None: + raise NotImplementedError("WrapperStore must be opened via the `_open` method") + async def clear(self) -> None: return await self._store.clear() @@ -67,7 +75,13 @@ def _check_writable(self) -> None: return self._store._check_writable() def __eq__(self, value: object) -> bool: - return type(self) is type(value) and self._store.__eq__(value) + return type(self) is type(value) and self._store.__eq__(value._store) # type: ignore[attr-defined] + + def __str__(self) -> str: + return f"wrapping-{self._store}" + + def __repr__(self) -> str: + return f"WrapperStore({self._store.__class__.__name__}, '{self._store}')" async def get( self, key: str, prototype: BufferPrototype, byte_range: ByteRequest | None = None diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index e808b80e4e..bf8f9900b9 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -107,11 +107,14 @@ def _sync_open(self) -> None: async def _open(self) -> None: self._sync_open() - def __getstate__(self) -> tuple[Path, ZipStoreAccessModeLiteral, int, bool]: - return self.path, self._zmode, self.compression, self.allowZip64 - - def __setstate__(self, state: Any) -> None: - self.path, self._zmode, self.compression, self.allowZip64 = state + def __getstate__(self) -> dict[str, Any]: + state = self.__dict__.copy() + for attr in ["_zf", "_lock"]: + state.pop(attr, None) + return state + + def __setstate__(self, state: dict[str, Any]) -> None: + self.__dict__ = state self._is_open = False self._sync_open() @@ -146,6 +149,8 @@ def _get( prototype: BufferPrototype, byte_range: ByteRequest | None = None, ) -> Buffer | None: + if not self._is_open: + self._sync_open() # docstring inherited try: with self._zf.open(key) as f: # will raise KeyError @@ -190,6 +195,8 @@ async def get_partial_values( return out def _set(self, key: str, value: Buffer) -> None: + if not self._is_open: + self._sync_open() # generally, this should be called inside a lock keyinfo = zipfile.ZipInfo(filename=key, date_time=time.localtime(time.time())[:6]) keyinfo.compress_type = self.compression @@ -203,9 +210,13 @@ def _set(self, key: str, value: Buffer) -> None: async def set(self, key: str, value: Buffer) -> None: # docstring inherited self._check_writable() + if not self._is_open: + self._sync_open() assert isinstance(key, str) if not isinstance(value, Buffer): - raise TypeError("ZipStore.set(): `value` must a Buffer instance") + raise TypeError( + f"ZipStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." 
+ ) with self._lock: self._set(key, value) diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index 602d001693..1fe544d292 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -2,6 +2,7 @@ import asyncio import pickle +from abc import abstractmethod from typing import TYPE_CHECKING, Generic, TypeVar from zarr.storage import WrapperStore @@ -37,30 +38,53 @@ class StoreTests(Generic[S, B]): store_cls: type[S] buffer_cls: type[B] + @abstractmethod async def set(self, store: S, key: str, value: Buffer) -> None: """ Insert a value into a storage backend, with a specific key. This should not not use any store methods. Bypassing the store methods allows them to be tested. """ - raise NotImplementedError + ... + @abstractmethod async def get(self, store: S, key: str) -> Buffer: """ Retrieve a value from a storage backend, by key. This should not not use any store methods. Bypassing the store methods allows them to be tested. """ + ... - raise NotImplementedError - + @abstractmethod @pytest.fixture def store_kwargs(self) -> dict[str, Any]: - return {"read_only": False} + """Kwargs for instantiating a store""" + ... + + @abstractmethod + def test_store_repr(self, store: S) -> None: ... + + @abstractmethod + def test_store_supports_writes(self, store: S) -> None: ... + + @abstractmethod + def test_store_supports_partial_writes(self, store: S) -> None: ... + + @abstractmethod + def test_store_supports_listing(self, store: S) -> None: ... @pytest.fixture - async def store(self, store_kwargs: dict[str, Any]) -> Store: - return await self.store_cls.open(**store_kwargs) + def open_kwargs(self, store_kwargs: dict[str, Any]) -> dict[str, Any]: + return store_kwargs + + @pytest.fixture + async def store(self, open_kwargs: dict[str, Any]) -> Store: + return await self.store_cls.open(**open_kwargs) + + @pytest.fixture + async def store_not_open(self, store_kwargs: dict[str, Any]) -> Store: + return self.store_cls(**store_kwargs) def test_store_type(self, store: S) -> None: assert isinstance(store, Store) @@ -76,8 +100,9 @@ def test_store_eq(self, store: S, store_kwargs: dict[str, Any]) -> None: assert store == store2 def test_serializable_store(self, store: S) -> None: - foo = pickle.dumps(store) - assert pickle.loads(foo) == store + new_store: S = pickle.loads(pickle.dumps(store)) + assert new_store == store + assert new_store.read_only == store.read_only def test_store_read_only(self, store: S) -> None: assert not store.read_only @@ -86,39 +111,38 @@ def test_store_read_only(self, store: S) -> None: store.read_only = False # type: ignore[misc] @pytest.mark.parametrize("read_only", [True, False]) - async def test_store_open_read_only( - self, store_kwargs: dict[str, Any], read_only: bool - ) -> None: - store_kwargs["read_only"] = read_only - store = await self.store_cls.open(**store_kwargs) + async def test_store_open_read_only(self, open_kwargs: dict[str, Any], read_only: bool) -> None: + open_kwargs["read_only"] = read_only + store = await self.store_cls.open(**open_kwargs) assert store._is_open assert store.read_only == read_only - async def test_read_only_store_raises(self, store_kwargs: dict[str, Any]) -> None: - kwargs = {**store_kwargs, "read_only": True} + async def test_store_context_manager(self, open_kwargs: dict[str, Any]) -> None: + # Test that the context manager closes the store + with await self.store_cls.open(**open_kwargs) as store: + assert store._is_open + # Test trying to open an already open store + with pytest.raises(ValueError, match="store is 
already open"): + await store._open() + assert not store._is_open + + async def test_read_only_store_raises(self, open_kwargs: dict[str, Any]) -> None: + kwargs = {**open_kwargs, "read_only": True} store = await self.store_cls.open(**kwargs) assert store.read_only # set - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match="store was opened in read-only mode and does not support writing" + ): await store.set("foo", self.buffer_cls.from_bytes(b"bar")) # delete - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match="store was opened in read-only mode and does not support writing" + ): await store.delete("foo") - def test_store_repr(self, store: S) -> None: - raise NotImplementedError - - def test_store_supports_writes(self, store: S) -> None: - raise NotImplementedError - - def test_store_supports_partial_writes(self, store: S) -> None: - raise NotImplementedError - - def test_store_supports_listing(self, store: S) -> None: - raise NotImplementedError - @pytest.mark.parametrize("key", ["c/0", "foo/c/0.0", "foo/0/0"]) @pytest.mark.parametrize("data", [b"\x01\x02\x03\x04", b""]) @pytest.mark.parametrize( @@ -135,6 +159,26 @@ async def test_get(self, store: S, key: str, data: bytes, byte_range: ByteReques expected = data_buf[start:stop] assert_bytes_equal(observed, expected) + async def test_get_not_open(self, store_not_open: S) -> None: + """ + Ensure that data can be read from a store that isn't yet open using the store.get method. + """ + assert not store_not_open._is_open + data_buf = self.buffer_cls.from_bytes(b"\x01\x02\x03\x04") + key = "c/0" + await self.set(store_not_open, key, data_buf) + observed = await store_not_open.get(key, prototype=default_buffer_prototype()) + assert_bytes_equal(observed, data_buf) + + async def test_get_raises(self, store: S) -> None: + """ + Ensure that a ValueError is raised for invalid byte range syntax + """ + data_buf = self.buffer_cls.from_bytes(b"\x01\x02\x03\x04") + await self.set(store, "c/0", data_buf) + with pytest.raises((ValueError, TypeError), match=r"Unexpected byte_range, got.*"): + await store.get("c/0", prototype=default_buffer_prototype(), byte_range=(0, 2)) # type: ignore[arg-type] + async def test_get_many(self, store: S) -> None: """ Ensure that multiple keys can be retrieved at once with the _get_many method. @@ -157,6 +201,37 @@ async def test_get_many(self, store: S) -> None: expected_kvs = sorted(((k, b) for k, b in zip(keys, values, strict=False))) assert observed_kvs == expected_kvs + @pytest.mark.parametrize("key", ["c/0", "foo/c/0.0", "foo/0/0"]) + @pytest.mark.parametrize("data", [b"\x01\x02\x03\x04", b""]) + async def test_getsize(self, store: S, key: str, data: bytes) -> None: + """ + Test the result of store.getsize(). + """ + data_buf = self.buffer_cls.from_bytes(data) + expected = len(data_buf) + await self.set(store, key, data_buf) + observed = await store.getsize(key) + assert observed == expected + + async def test_getsize_prefix(self, store: S) -> None: + """ + Test the result of store.getsize_prefix(). + """ + data_buf = self.buffer_cls.from_bytes(b"\x01\x02\x03\x04") + keys = ["c/0/0", "c/0/1", "c/1/0", "c/1/1"] + keys_values = [(k, data_buf) for k in keys] + await store._set_many(keys_values) + expected = len(data_buf) * len(keys) + observed = await store.getsize_prefix("c") + assert observed == expected + + async def test_getsize_raises(self, store: S) -> None: + """ + Test that getsize() raises a FileNotFoundError if the key doesn't exist. 
+ """ + with pytest.raises(FileNotFoundError): + await store.getsize("c/1000") + @pytest.mark.parametrize("key", ["zarr.json", "c/0", "foo/c/0.0", "foo/0/0"]) @pytest.mark.parametrize("data", [b"\x01\x02\x03\x04", b""]) async def test_set(self, store: S, key: str, data: bytes) -> None: @@ -169,6 +244,17 @@ async def test_set(self, store: S, key: str, data: bytes) -> None: observed = await self.get(store, key) assert_bytes_equal(observed, data_buf) + async def test_set_not_open(self, store_not_open: S) -> None: + """ + Ensure that data can be written to the store that's not yet open using the store.set method. + """ + assert not store_not_open._is_open + data_buf = self.buffer_cls.from_bytes(b"\x01\x02\x03\x04") + key = "c/0" + await store_not_open.set(key, data_buf) + observed = await self.get(store_not_open, key) + assert_bytes_equal(observed, data_buf) + async def test_set_many(self, store: S) -> None: """ Test that a dict of key : value pairs can be inserted into the store via the @@ -181,6 +267,16 @@ async def test_set_many(self, store: S) -> None: for k, v in store_dict.items(): assert (await self.get(store, k)).to_bytes() == v.to_bytes() + async def test_set_invalid_buffer(self, store: S) -> None: + """ + Ensure that set raises a Type or Value Error for invalid buffer arguments. + """ + with pytest.raises( + (ValueError, TypeError), + match=r"\S+\.set\(\): `value` must be a Buffer instance. Got an instance of instead.", + ): + await store.set("c/0", 0) # type: ignore[arg-type] + @pytest.mark.parametrize( "key_ranges", [ diff --git a/tests/test_array.py b/tests/test_array.py index 6600424147..80ff8444fc 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -5,13 +5,16 @@ import re from itertools import accumulate from typing import TYPE_CHECKING, Any, Literal +from unittest import mock import numcodecs import numpy as np import pytest import zarr.api.asynchronous +import zarr.api.synchronous as sync_api from zarr import Array, AsyncArray, Group +from zarr.abc.store import Store from zarr.codecs import ( BytesCodec, GzipCodec, @@ -36,14 +39,15 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup -from zarr.core.indexing import ceildiv -from zarr.core.metadata.v3 import DataType +from zarr.core.indexing import BasicIndexer, ceildiv +from zarr.core.metadata.v3 import ArrayV3Metadata, DataType from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike + from zarr.core.metadata.v2 import ArrayV2Metadata @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @@ -1257,8 +1261,79 @@ async def test_create_array_v2_no_shards(store: MemoryStore) -> None: ) +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("impl", ["sync", "async"]) +async def test_create_array_data(impl: Literal["sync", "async"], store: Store) -> None: + """ + Test that we can invoke ``create_array`` with a ``data`` parameter. 
+ """ + data = np.arange(10) + name = "foo" + arr: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | Array + if impl == "sync": + arr = sync_api.create_array(store, name=name, data=data) + stored = arr[:] + elif impl == "async": + arr = await create_array(store, name=name, data=data, zarr_format=3) + stored = await arr._get_selection( + BasicIndexer(..., shape=arr.shape, chunk_grid=arr.metadata.chunk_grid), + prototype=default_buffer_prototype(), + ) + else: + raise ValueError(f"Invalid impl: {impl}") + + assert np.array_equal(stored, data) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_array_data_invalid_params(store: Store) -> None: + """ + Test that failing to specify data AND shape / dtype results in a ValueError + """ + with pytest.raises(ValueError, match="shape was not specified"): + await create_array(store, data=None, shape=None, dtype=None) + + # we catch shape=None first, so specifying a dtype should raise the same exception as before + with pytest.raises(ValueError, match="shape was not specified"): + await create_array(store, data=None, shape=None, dtype="uint8") + + with pytest.raises(ValueError, match="dtype was not specified"): + await create_array(store, data=None, shape=(10, 10)) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_array_data_ignored_params(store: Store) -> None: + """ + Test that specify data AND shape AND dtype results in a warning + """ + data = np.arange(10) + with pytest.raises( + ValueError, match="The data parameter was used, but the shape parameter was also used." + ): + await create_array(store, data=data, shape=data.shape, dtype=None, overwrite=True) + + # we catch shape first, so specifying a dtype should raise the same warning as before + with pytest.raises( + ValueError, match="The data parameter was used, but the shape parameter was also used." + ): + await create_array(store, data=data, shape=data.shape, dtype=data.dtype, overwrite=True) + + with pytest.raises( + ValueError, match="The data parameter was used, but the dtype parameter was also used." + ): + await create_array(store, data=data, shape=None, dtype=data.dtype, overwrite=True) + + async def test_scalar_array() -> None: arr = zarr.array(1.5) assert arr[...] 
== 1.5 assert arr[()] == 1.5 assert arr.shape == () + + +async def test_orthogonal_set_total_slice() -> None: + """Ensure that a whole chunk overwrite does not read chunks""" + store = MemoryStore() + array = zarr.create_array(store, shape=(20, 20), chunks=(1, 2), dtype=int, fill_value=-1) + with mock.patch("zarr.storage.MemoryStore.get", side_effect=ValueError): + array[0, slice(4, 10)] = np.arange(6) diff --git a/tests/test_config.py b/tests/test_config.py index c552ace840..1a2453d646 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -61,6 +61,7 @@ def test_config_defaults_set() -> None: "numeric": None, "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], + "raw": None, }, "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, "v3_default_serializer": { diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 30d0d75f22..932c32f1ae 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -19,6 +19,7 @@ OrthogonalSelection, Selection, _iter_grid, + is_total_slice, make_slice_selection, normalize_integer_selection, oindex, @@ -1953,3 +1954,8 @@ def test_vectorized_indexing_incompatible_shape(store) -> None: ) with pytest.raises(ValueError, match="Attempting to set"): arr[np.array([1, 2]), np.array([1, 2])] = np.array([[-1, -2], [-3, -4]]) + + +def test_is_total_slice(): + assert is_total_slice((0, slice(4, 6)), (1, 2)) + assert is_total_slice((slice(0, 1, None), slice(4, 6)), (1, 2)) diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py index 7806f3ecef..726da06a52 100644 --- a/tests/test_store/test_core.py +++ b/tests/test_store/test_core.py @@ -4,12 +4,55 @@ import pytest from _pytest.compat import LEGACY_PATH -from zarr.core.common import AccessModeLiteral +from zarr import Group +from zarr.core.common import AccessModeLiteral, ZarrFormat from zarr.storage import FsspecStore, LocalStore, MemoryStore, StoreLike, StorePath -from zarr.storage._common import make_store_path +from zarr.storage._common import contains_array, contains_group, make_store_path from zarr.storage._utils import normalize_path +@pytest.mark.parametrize("path", ["foo", "foo/bar"]) +@pytest.mark.parametrize("write_group", [True, False]) +@pytest.mark.parametrize("zarr_format", [2, 3]) +async def test_contains_group( + local_store, path: str, write_group: bool, zarr_format: ZarrFormat +) -> None: + """ + Test that the contains_group method correctly reports the existence of a group. + """ + root = Group.from_store(store=local_store, zarr_format=zarr_format) + if write_group: + root.create_group(path) + store_path = StorePath(local_store, path=path) + assert await contains_group(store_path, zarr_format=zarr_format) == write_group + + +@pytest.mark.parametrize("path", ["foo", "foo/bar"]) +@pytest.mark.parametrize("write_array", [True, False]) +@pytest.mark.parametrize("zarr_format", [2, 3]) +async def test_contains_array( + local_store, path: str, write_array: bool, zarr_format: ZarrFormat +) -> None: + """ + Test that the contains array method correctly reports the existence of an array. 
+ """ + root = Group.from_store(store=local_store, zarr_format=zarr_format) + if write_array: + root.create_array(path, shape=(100,), chunks=(10,), dtype="i4") + store_path = StorePath(local_store, path=path) + assert await contains_array(store_path, zarr_format=zarr_format) == write_array + + +@pytest.mark.parametrize("func", [contains_array, contains_group]) +async def test_contains_invalid_format_raises(local_store, func: callable) -> None: + """ + Test contains_group and contains_array raise errors for invalid zarr_formats + """ + store_path = StorePath(local_store) + with pytest.raises(ValueError): + assert await func(store_path, zarr_format="3.0") + + @pytest.mark.parametrize("path", [None, "", "bar"]) async def test_make_store_path_none(path: str) -> None: """ @@ -56,10 +99,18 @@ async def test_make_store_path_store_path( assert Path(store_path.store.root) == Path(tmpdir) path_normalized = normalize_path(path) assert store_path.path == (store_like / path_normalized).path - assert store_path.read_only == ro +@pytest.mark.parametrize("modes", [(True, "w"), (False, "x")]) +async def test_store_path_invalid_mode_raises(tmpdir: LEGACY_PATH, modes: tuple) -> None: + """ + Test that ValueErrors are raise for invalid mode. + """ + with pytest.raises(ValueError): + await StorePath.open(LocalStore(str(tmpdir), read_only=modes[0]), path=None, mode=modes[1]) + + async def test_make_store_path_invalid() -> None: """ Test that invalid types raise TypeError diff --git a/tests/test_store/test_fsspec.py b/tests/test_store/test_fsspec.py index 2713a2969d..929de37869 100644 --- a/tests/test_store/test_fsspec.py +++ b/tests/test_store/test_fsspec.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING import pytest -from botocore.session import Session +from packaging.version import parse as parse_version import zarr.api.asynchronous from zarr.abc.store import OffsetByteRequest @@ -25,6 +25,7 @@ requests = pytest.importorskip("requests") moto_server = pytest.importorskip("moto.moto_server.threaded_moto_server") moto = pytest.importorskip("moto") +botocore = pytest.importorskip("botocore") # ### amended from s3fs ### # test_bucket_name = "test" @@ -51,7 +52,7 @@ def s3_base() -> Generator[None, None, None]: def get_boto3_client() -> botocore.client.BaseClient: # NB: we use the sync botocore client for setup - session = Session() + session = botocore.session.Session() return session.create_client("s3", endpoint_url=endpoint_url) @@ -215,3 +216,31 @@ async def test_empty_nonexistent_path(self, store_kwargs) -> None: store_kwargs["path"] += "/abc" store = await self.store_cls.open(**store_kwargs) assert await store.is_empty("") + + +@pytest.mark.skipif( + parse_version(fsspec.__version__) < parse_version("2024.12.0"), + reason="No AsyncFileSystemWrapper", +) +def test_wrap_sync_filesystem(): + """The local fs is not async so we should expect it to be wrapped automatically""" + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + store = FsspecStore.from_url("local://test/path") + + assert isinstance(store.fs, AsyncFileSystemWrapper) + assert store.fs.async_impl + + +@pytest.mark.skipif( + parse_version(fsspec.__version__) < parse_version("2024.12.0"), + reason="No AsyncFileSystemWrapper", +) +def test_no_wrap_async_filesystem(): + """An async fs should not be wrapped automatically; fsspec's https filesystem is such an fs""" + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + store = FsspecStore.from_url("https://test/path") + + assert not isinstance(store.fs, 
AsyncFileSystemWrapper) + assert store.fs.async_impl diff --git a/tests/test_store/test_local.py b/tests/test_store/test_local.py index 22597a2c3f..d9d941c6f0 100644 --- a/tests/test_store/test_local.py +++ b/tests/test_store/test_local.py @@ -8,6 +8,7 @@ from zarr.core.buffer import Buffer, cpu from zarr.storage import LocalStore from zarr.testing.store import StoreTests +from zarr.testing.utils import assert_bytes_equal if TYPE_CHECKING: import pathlib @@ -53,3 +54,23 @@ def test_creates_new_directory(self, tmp_path: pathlib.Path): store = self.store_cls(root=target) zarr.group(store=store) + + def test_invalid_root_raises(self): + """ + Test that a TypeError is raised when a non-str/Path type is used for the `root` argument + """ + with pytest.raises( + TypeError, + match=r"'root' must be a string or Path instance. Got an instance of instead.", + ): + LocalStore(root=0) + + async def test_get_with_prototype_default(self, store: LocalStore): + """ + Ensure that data can be read via ``store.get`` if the prototype keyword argument is unspecified, i.e. set to ``None``. + """ + data_buf = self.buffer_cls.from_bytes(b"\x01\x02\x03\x04") + key = "c/0" + await self.set(store, key, data_buf) + observed = await store.get(key, prototype=None) + assert_bytes_equal(observed, data_buf) diff --git a/tests/test_store/test_logging.py b/tests/test_store/test_logging.py index b32a214db5..1a89dca874 100644 --- a/tests/test_store/test_logging.py +++ b/tests/test_store/test_logging.py @@ -1,17 +1,87 @@ from __future__ import annotations +import logging from typing import TYPE_CHECKING import pytest import zarr -from zarr.core.buffer import default_buffer_prototype -from zarr.storage import LoggingStore +from zarr.core.buffer import Buffer, cpu, default_buffer_prototype +from zarr.storage import LocalStore, LoggingStore +from zarr.testing.store import StoreTests if TYPE_CHECKING: + from _pytest.compat import LEGACY_PATH + from zarr.abc.store import Store +class TestLoggingStore(StoreTests[LoggingStore, cpu.Buffer]): + store_cls = LoggingStore + buffer_cls = cpu.Buffer + + async def get(self, store: LoggingStore, key: str) -> Buffer: + return self.buffer_cls.from_bytes((store._store.root / key).read_bytes()) + + async def set(self, store: LoggingStore, key: str, value: Buffer) -> None: + parent = (store._store.root / key).parent + if not parent.exists(): + parent.mkdir(parents=True) + (store._store.root / key).write_bytes(value.to_bytes()) + + @pytest.fixture + def store_kwargs(self, tmpdir: LEGACY_PATH) -> dict[str, str]: + return {"store": LocalStore(str(tmpdir)), "log_level": "DEBUG"} + + @pytest.fixture + def open_kwargs(self, tmpdir) -> dict[str, str]: + return {"store_cls": LocalStore, "root": str(tmpdir), "log_level": "DEBUG"} + + @pytest.fixture + def store(self, store_kwargs: str | dict[str, Buffer] | None) -> LoggingStore: + return self.store_cls(**store_kwargs) + + def test_store_supports_writes(self, store: LoggingStore) -> None: + assert store.supports_writes + + def test_store_supports_partial_writes(self, store: LoggingStore) -> None: + assert store.supports_partial_writes + + def test_store_supports_listing(self, store: LoggingStore) -> None: + assert store.supports_listing + + def test_store_repr(self, store: LoggingStore) -> None: + assert f"{store!r}" == f"LoggingStore(LocalStore, 'file://{store._store.root.as_posix()}')" + + def test_store_str(self, store: LoggingStore) -> None: + assert str(store) == f"logging-file://{store._store.root.as_posix()}" + + async def test_default_handler(self, 
local_store, capsys) -> None: + # Store and then remove existing handlers to enter default handler code path + handlers = logging.getLogger().handlers[:] + for h in handlers: + logging.getLogger().removeHandler(h) + # Test logs are sent to stdout + wrapped = LoggingStore(store=local_store) + buffer = default_buffer_prototype().buffer + res = await wrapped.set("foo/bar/c/0", buffer.from_bytes(b"\x01\x02\x03\x04")) + assert res is None + captured = capsys.readouterr() + assert len(captured) == 2 + assert "Calling LocalStore.set" in captured.out + assert "Finished LocalStore.set" in captured.out + # Restore handlers + for h in handlers: + logging.getLogger().addHandler(h) + + def test_is_open_setter_raises(self, store: LoggingStore) -> None: + "Test that a user cannot change `_is_open` without opening the underlying store." + with pytest.raises( + NotImplementedError, match="LoggingStore must be opened via the `_open` method" + ): + store._is_open = True + + @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) async def test_logging_store(store: Store, caplog) -> None: wrapped = LoggingStore(store=store, log_level="DEBUG") diff --git a/tests/test_store/test_wrapper.py b/tests/test_store/test_wrapper.py index 489bcd5a7a..7e933548b3 100644 --- a/tests/test_store/test_wrapper.py +++ b/tests/test_store/test_wrapper.py @@ -5,13 +5,73 @@ import pytest from zarr.core.buffer.cpu import Buffer, buffer_prototype -from zarr.storage import WrapperStore +from zarr.storage import LocalStore, WrapperStore +from zarr.testing.store import StoreTests if TYPE_CHECKING: + from _pytest.compat import LEGACY_PATH + from zarr.abc.store import Store from zarr.core.buffer.core import BufferPrototype +class TestWrapperStore(StoreTests[WrapperStore, Buffer]): + store_cls = WrapperStore + buffer_cls = Buffer + + async def get(self, store: WrapperStore, key: str) -> Buffer: + return self.buffer_cls.from_bytes((store._store.root / key).read_bytes()) + + async def set(self, store: WrapperStore, key: str, value: Buffer) -> None: + parent = (store._store.root / key).parent + if not parent.exists(): + parent.mkdir(parents=True) + (store._store.root / key).write_bytes(value.to_bytes()) + + @pytest.fixture + def store_kwargs(self, tmpdir: LEGACY_PATH) -> dict[str, str]: + return {"store": LocalStore(str(tmpdir))} + + @pytest.fixture + def open_kwargs(self, tmpdir) -> dict[str, str]: + return {"store_cls": LocalStore, "root": str(tmpdir)} + + def test_store_supports_writes(self, store: WrapperStore) -> None: + assert store.supports_writes + + def test_store_supports_partial_writes(self, store: WrapperStore) -> None: + assert store.supports_partial_writes + + def test_store_supports_listing(self, store: WrapperStore) -> None: + assert store.supports_listing + + def test_store_repr(self, store: WrapperStore) -> None: + assert f"{store!r}" == f"WrapperStore(LocalStore, 'file://{store._store.root.as_posix()}')" + + def test_store_str(self, store: WrapperStore) -> None: + assert str(store) == f"wrapping-file://{store._store.root.as_posix()}" + + def test_check_writeable(self, store: WrapperStore) -> None: + """ + Test _check_writeable() runs without errors. + """ + store._check_writable() + + def test_close(self, store: WrapperStore) -> None: + "Test store can be closed" + store.close() + assert not store._is_open + + def test_is_open_setter_raises(self, store: WrapperStore) -> None: + """ + Test that a user cannot change `_is_open` without opening the underlying store. 
+ """ + with pytest.raises( + NotImplementedError, match="WrapperStore must be opened via the `_open` method" + ): + store._is_open = True + + @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=True) async def test_wrapped_set(store: Store, capsys: pytest.CaptureFixture[str]) -> None: # define a class that prints when it sets diff --git a/tests/test_v2.py b/tests/test_v2.py index b657af9c47..4c689c8e64 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -84,8 +84,15 @@ def test_codec_pipeline() -> None: np.testing.assert_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["|S", "|V"]) -async def test_v2_encode_decode(dtype): +@pytest.mark.parametrize( + ("dtype", "expected_dtype", "fill_value", "fill_value_encoding"), + [ + ("|S", "|S0", b"X", "WA=="), + ("|V", "|V0", b"X", "WA=="), + ("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="), + ], +) +async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_encoding) -> None: with config.set( { "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], @@ -95,7 +102,7 @@ async def test_v2_encode_decode(dtype): store = zarr.storage.MemoryStore() g = zarr.group(store=store, zarr_format=2) g.create_array( - name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=b"X", compressor=None + name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None ) result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) @@ -105,9 +112,9 @@ async def test_v2_encode_decode(dtype): expected = { "chunks": [3], "compressor": None, - "dtype": f"{dtype}0", - "fill_value": "WA==", - "filters": [{"id": "vlen-bytes"}], + "dtype": expected_dtype, + "fill_value": fill_value_encoding, + "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None, "order": "C", "shape": [3], "zarr_format": 2, @@ -284,3 +291,25 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None: assert arr.metadata.compressor.codec_id == expected_compressor if expected_filter is not None: assert arr.metadata.filters[0].codec_id == expected_filter + + +@pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"]) +def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None: + a = np.array( + [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], + dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], + ) + array_path = tmp_path / "data.zarr" + za = zarr.create( + shape=(3,), + store=array_path, + chunks=(2,), + fill_value=fill_value, + zarr_format=2, + dtype=a.dtype, + ) + if fill_value is not None: + assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all() + za[...] = a + za = zarr.open_array(store=array_path) + assert (a == za[:]).all()