pydata/xarray · commit dce4e7c

Workaround problem where dask arrays aren't returned when chunks is None
Reverts 827e546 and works around the issue so that dask arrays are returned, by fixing some if-then logic in the code used when `engine="zarr"` is involved. Things work fine when `chunks="auto"` is used, perhaps because the `import dask.array` inside the try block is what triggers loading into dask arrays? Also removes `chunks="auto"` from some Zarr tests to simplify them.
1 parent e2e1c81 commit dce4e7c
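
To illustrate the behaviour the commit message describes, here is a minimal, hypothetical sketch (the store path and variable name are made up, and it assumes an xarray version where open_dataset accepts engine="zarr"): opening with chunks="auto" yields dask-backed variables, while the default chunks=None with the zarr engine can still return plain in-memory arrays, which is the gap being worked around.

    import numpy as np
    import xarray as xr

    # Hypothetical store and variable names, purely for illustration.
    xr.Dataset({"foo": ("x", np.arange(10))}).to_zarr("example.zarr", mode="w")

    # chunks="auto" goes through the dask-backed branch, so the variable data
    # comes back as a dask array.
    ds_auto = xr.open_dataset("example.zarr", engine="zarr", chunks="auto")
    print(type(ds_auto["foo"].data))

    # The default chunks=None returns the dataset unchunked, so the data may be
    # a plain in-memory array -- the gap this commit works around in the tests
    # by passing chunks="auto" when the engine is zarr.
    ds_default = xr.open_dataset("example.zarr", engine="zarr")
    print(type(ds_default["foo"].data))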

File tree

2 files changed: +65 −79 lines changed


xarray/backends/api.py

Lines changed: 58 additions & 62 deletions
@@ -463,70 +463,66 @@ def maybe_decode_store(store, chunks, lock=False):
 
         _protect_dataset_variables_inplace(ds, cache)
 
-        if chunks is not None:
-            if engine != "zarr":
-                from dask.base import tokenize
-
-                # if passed an actual file path, augment the token with
-                # the file modification time
-                if isinstance(filename_or_obj, str) and not is_remote_uri(
-                    filename_or_obj
-                ):
-                    mtime = os.path.getmtime(filename_or_obj)
-                else:
-                    mtime = None
-                token = tokenize(
-                    filename_or_obj,
-                    mtime,
-                    group,
-                    decode_cf,
-                    mask_and_scale,
-                    decode_times,
-                    concat_characters,
-                    decode_coords,
-                    engine,
-                    chunks,
-                    drop_variables,
-                    use_cftime,
-                    decode_timedelta,
-                )
-                name_prefix = "open_dataset-%s" % token
-                ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
-
-            else:  # if engine=="zarr":
-                # adapted from Dataset.Chunk() and taken from open_zarr
-                if not isinstance(chunks, (int, dict)):
-                    if chunks != "auto":
-                        raise ValueError(
-                            "chunks must be an int, dict, 'auto', or None. "
-                            "Instead found %s. " % chunks
-                        )
-
-                if chunks == "auto":
-                    try:
-                        import dask.array  # noqa
-                    except ImportError:
-                        chunks = None
-
-                # auto chunking needs to be here and not in ZarrStore because
-                # the variable chunks does not survive decode_cf
-                # return trivial case
-                if not chunks:  # e.g. chunks is 0, None or {}
-                    return ds
-
-                if isinstance(chunks, int):
-                    chunks = dict.fromkeys(ds.dims, chunks)
-
-                variables = {
-                    k: store.maybe_chunk(k, v, chunks, overwrite_encoded_chunks)
-                    for k, v in ds.variables.items()
-                }
-                ds2 = ds._replace(variables)
-
-            ds2._file_obj = ds._file_obj
+        if chunks is not None and engine != "zarr":
+            from dask.base import tokenize
+
+            # if passed an actual file path, augment the token with
+            # the file modification time
+            if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj):
+                mtime = os.path.getmtime(filename_or_obj)
+            else:
+                mtime = None
+            token = tokenize(
+                filename_or_obj,
+                mtime,
+                group,
+                decode_cf,
+                mask_and_scale,
+                decode_times,
+                concat_characters,
+                decode_coords,
+                engine,
+                chunks,
+                drop_variables,
+                use_cftime,
+                decode_timedelta,
+            )
+            name_prefix = "open_dataset-%s" % token
+            ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
+
+        elif engine == "zarr":
+            # adapted from Dataset.Chunk() and taken from open_zarr
+            if not (isinstance(chunks, (int, dict)) or chunks is None):
+                if chunks != "auto":
+                    raise ValueError(
+                        "chunks must be an int, dict, 'auto', or None. "
+                        "Instead found %s. " % chunks
+                    )
+
+            if chunks == "auto":
+                try:
+                    import dask.array  # noqa
+                except ImportError:
+                    chunks = None
+
+            # auto chunking needs to be here and not in ZarrStore because
+            # the variable chunks does not survive decode_cf
+            # return trivial case
+            if not chunks:  # e.g. chunks is 0, None or {}
+                return ds
+
+            if isinstance(chunks, int):
+                chunks = dict.fromkeys(ds.dims, chunks)
+
+            variables = {
+                k: store.maybe_chunk(k, v, chunks, overwrite_encoded_chunks)
+                for k, v in ds.variables.items()
+            }
+            ds2 = ds._replace(variables)
+
         else:
             ds2 = ds
-
+        ds2._file_obj = ds._file_obj
         return ds2
 
     if isinstance(filename_or_obj, Path):
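
As a side note on the non-zarr branch kept above: the dask graph name is derived with dask.base.tokenize from the file path, its modification time, and the decoding options, so reopening the same file with the same options produces the same keys. A small illustrative sketch (the argument values are placeholders, not the real call):

    from dask.base import tokenize

    # Placeholder values standing in for filename_or_obj, the file mtime, and a
    # couple of the decoding options that get hashed in the branch above.
    token = tokenize("data.nc", 1_700_000_000.0, None, True, True)
    name_prefix = "open_dataset-%s" % token
    print(name_prefix)

And for the zarr branch, a sketch of the chunks values it accepts after this change, assuming a hypothetical local store named example.zarr; anything that is not an int, dict, "auto", or None raises the ValueError shown in the diff:

    import xarray as xr

    xr.open_dataset("example.zarr", engine="zarr", chunks="auto")    # dask arrays, sizes chosen automatically
    xr.open_dataset("example.zarr", engine="zarr", chunks=5)         # expanded to dict.fromkeys(ds.dims, 5)
    xr.open_dataset("example.zarr", engine="zarr", chunks={"x": 5})  # per-dimension chunk sizes
    xr.open_dataset("example.zarr", engine="zarr", chunks=None)      # trivial case: returned without chunking

    try:
        xr.open_dataset("example.zarr", engine="zarr", chunks=[5])   # neither int, dict, "auto", nor None
    except ValueError as err:
        print(err)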

xarray/tests/test_backends.py

Lines changed: 7 additions & 17 deletions
@@ -1823,9 +1823,7 @@ def test_write_persistence_modes(self, group):
         ds.to_zarr(store_target, mode="w", group=group)
         ds_to_append.to_zarr(store_target, append_dim="time", group=group)
         original = xr.concat([ds, ds_to_append], dim="time")
-        actual = xr.open_dataset(
-            store_target, group=group, engine="zarr", chunks="auto"
-        )
+        actual = xr.open_dataset(store_target, group=group, engine="zarr")
         assert_identical(original, actual)
 
     def test_compressor_encoding(self):
@@ -1916,11 +1914,11 @@ def test_check_encoding_is_consistent_after_append(self):
         encoding = {"da": {"compressor": compressor}}
         ds.to_zarr(store_target, mode="w", encoding=encoding)
         ds_to_append.to_zarr(store_target, append_dim="time")
-        actual_ds = xr.open_dataset(store_target, engine="zarr", chunks="auto")
+        actual_ds = xr.open_dataset(store_target, engine="zarr")
         actual_encoding = actual_ds["da"].encoding["compressor"]
         assert actual_encoding.get_config() == compressor.get_config()
         assert_identical(
-            xr.open_dataset(store_target, engine="zarr", chunks="auto").compute(),
+            xr.open_dataset(store_target, engine="zarr").compute(),
             xr.concat([ds, ds_to_append], dim="time"),
         )
 
@@ -1935,9 +1933,7 @@ def test_append_with_new_variable(self):
         ds_with_new_var.to_zarr(store_target, mode="a")
         combined = xr.concat([ds, ds_to_append], dim="time")
         combined["new_var"] = ds_with_new_var["new_var"]
-        assert_identical(
-            combined, xr.open_dataset(store_target, engine="zarr", chunks="auto")
-        )
+        assert_identical(combined, xr.open_dataset(store_target, engine="zarr"))
 
     @requires_dask
     def test_to_zarr_compute_false_roundtrip(self):
@@ -2579,17 +2575,11 @@ def test_open_mfdataset_manyfiles(
         concat_dim="x",
         engine=readengine,
         parallel=parallel,
-        chunks=chunks,
+        chunks=chunks if (not chunks and readengine != "zarr") else "auto",
     ) as actual:
 
-        try:
-            # check that using open_mfdataset returns dask arrays for variables
-            assert isinstance(actual["foo"].data, dask_array_type)
-        except AssertionError:
-            # A numpy array is returned instead of a dask array
-            # when reading with Zarr and chunks is None
-            if readengine == "zarr" and chunks is None:
-                assert isinstance(actual["foo"].data, np.ndarray)
+        # check that using open_mfdataset returns dask arrays for variables
+        assert isinstance(actual["foo"].data, dask_array_type)
 
         assert_identical(original, actual)
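
The reworked chunks=... argument in the last hunk is dense, so here is a small sketch of how it resolves; the concrete parameter values are assumptions, since the parametrization itself is not part of this diff:

    def resolve_chunks(chunks, readengine):
        # Mirrors the expression passed to open_mfdataset in the hunk above.
        return chunks if (not chunks and readengine != "zarr") else "auto"

    # Non-zarr engine with no chunks requested: the value is passed through.
    assert resolve_chunks(None, "netcdf4") is None
    # Zarr engine with no chunks requested: force "auto" so the opened variables
    # come back as dask arrays and the assertion in the test holds.
    assert resolve_chunks(None, "zarr") == "auto"
    # Any truthy chunks value resolves to "auto", whatever the engine.
    assert resolve_chunks(10, "netcdf4") == "auto"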

0 commit comments