Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
79560d5
fix merge
brokkoli71 Feb 15, 2025
7619990
add str arguments for filters, serializer, compressors
brokkoli71 Feb 15, 2025
daff61a
remove duplicate type check
brokkoli71 Feb 15, 2025
1a3a502
fix ruff
brokkoli71 Feb 15, 2025
68ac329
update docstrings
brokkoli71 Feb 15, 2025
0e227e0
document changes
brokkoli71 Feb 15, 2025
de83f92
test_bad_chunk_encoding
brokkoli71 Feb 15, 2025
73b32ac
remove unused "type: ignore" comment
brokkoli71 Feb 15, 2025
3588a65
remove comment
brokkoli71 Feb 15, 2025
74b45bb
update test_v3_chunk_encoding
brokkoli71 Feb 18, 2025
1937ee5
update test_invalid_chunk_encoding
brokkoli71 Feb 18, 2025
f3bb890
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 Feb 18, 2025
ef10fe2
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 Feb 28, 2025
ad96bfe
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 Apr 2, 2025
28e566f
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 Apr 7, 2025
d953a14
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 Apr 10, 2025
363484a
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 Apr 10, 2025
6399f13
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 Apr 11, 2025
55f975c
test for codec with mandatory config
brokkoli71 Apr 11, 2025
a55abad
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 May 8, 2025
817617a
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 May 13, 2025
82459e3
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 May 15, 2025
4519547
better error msg if codec requires config
brokkoli71 May 16, 2025
5f94313
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 May 16, 2025
e2a18c3
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 May 20, 2025
61c32f8
typing
brokkoli71 May 20, 2025
41000fd
Merge remote-tracking branch 'origin/string-arguments-for-codecs' int…
brokkoli71 May 20, 2025
9e35f7c
typing
brokkoli71 May 20, 2025
a3366aa
typing in tests
brokkoli71 May 20, 2025
7a5bc66
typing in tests
brokkoli71 May 20, 2025
bac3a10
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 May 27, 2025
0abc569
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 Jun 10, 2025
a4d8013
Merge branch 'main' into string-arguments-for-codecs
brokkoli71 Jun 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/2839.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Array creation now accepts string representations of codecs for the ``filters``, ``serializer``, and ``compressors`` arguments.
2 changes: 1 addition & 1 deletion src/zarr/api/synchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ def create_array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down
27 changes: 10 additions & 17 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3769,23 +3769,23 @@ def _get_default_codecs(


FiltersLike: TypeAlias = (
Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec]
Iterable[dict[str, JSON] | str | ArrayArrayCodec | numcodecs.abc.Codec]
| ArrayArrayCodec
| Iterable[numcodecs.abc.Codec]
| numcodecs.abc.Codec
| Literal["auto"]
| str
| None
)
CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | None
CompressorsLike: TypeAlias = (
Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec]
Iterable[dict[str, JSON] | str | BytesBytesCodec | numcodecs.abc.Codec]
| dict[str, JSON]
| BytesBytesCodec
| numcodecs.abc.Codec
| Literal["auto"]
| str
| None
)
SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"]
SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | str


class ShardsConfigParam(TypedDict):
Expand Down Expand Up @@ -4053,7 +4053,7 @@ async def create_array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down Expand Up @@ -4264,24 +4264,13 @@ def _parse_chunk_encoding_v2(
elif isinstance(compressor, tuple | list) and len(compressor) == 1:
_compressor = parse_compressor(compressor[0])
else:
if isinstance(compressor, Iterable) and not isinstance(compressor, dict):
msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead."
raise TypeError(msg)
_compressor = parse_compressor(compressor)

if filters is None:
_filters = None
elif filters == "auto":
_filters = default_filters
else:
if isinstance(filters, Iterable):
for idx, f in enumerate(filters):
if not isinstance(f, numcodecs.abc.Codec):
msg = (
"For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. "
f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec."
)
raise TypeError(msg)
_filters = parse_filters(filters)

return _filters, _compressor
Expand All @@ -4305,6 +4294,8 @@ def _parse_chunk_encoding_v3(
out_array_array: tuple[ArrayArrayCodec, ...] = ()
elif filters == "auto":
out_array_array = default_array_array
elif isinstance(filters, str):
out_array_array = (_parse_array_array_codec(filters),)
else:
maybe_array_array: Iterable[Codec | dict[str, JSON]]
if isinstance(filters, dict | Codec):
Expand All @@ -4322,6 +4313,8 @@ def _parse_chunk_encoding_v3(
out_bytes_bytes: tuple[BytesBytesCodec, ...] = ()
elif compressors == "auto":
out_bytes_bytes = default_bytes_bytes
elif isinstance(compressors, str):
out_bytes_bytes = (_parse_bytes_bytes_codec(compressors),)
else:
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
if isinstance(compressors, dict | Codec):
Expand Down
6 changes: 3 additions & 3 deletions src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,7 +1045,7 @@ async def create_array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down Expand Up @@ -2280,7 +2280,7 @@ def create_array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down Expand Up @@ -2678,7 +2678,7 @@ def array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down
12 changes: 9 additions & 3 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,20 +246,24 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None:

if data is None:
return data
if isinstance(data, str):
return (numcodecs.get_codec({"id": data}),)
if isinstance(data, Iterable):
for idx, val in enumerate(data):
if isinstance(val, numcodecs.abc.Codec):
out.append(val)
elif isinstance(val, dict):
out.append(numcodecs.get_codec(val))
elif isinstance(val, str):
out.append(numcodecs.get_codec({"id": val}))
else:
msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead."
msg = f"For Zarr format 2 arrays, all elements of `filters` must be a numcodecs.abc.Codec or a dict or str representation of numcodecs.abc.Codec. Got {type(val)} at index {idx} instead."
raise TypeError(msg)
return tuple(out)
# take a single codec instance and wrap it in a tuple
if isinstance(data, numcodecs.abc.Codec):
return (data,)
msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead."
msg = f"For Zarr format 2 arrays, all elements of `filters` must be None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead."
raise TypeError(msg)


Expand All @@ -271,7 +275,9 @@ def parse_compressor(data: object) -> numcodecs.abc.Codec | None:
return data
if isinstance(data, dict):
return numcodecs.get_codec(data)
msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead."
if isinstance(data, str):
return numcodecs.get_codec({"id": data})
msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(data)} instead."
raise ValueError(msg)


Expand Down
12 changes: 9 additions & 3 deletions src/zarr/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,16 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec:
return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type]


def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec:
def _parse_bytes_bytes_codec(data: dict[str, JSON] | str | Codec) -> BytesBytesCodec:
"""
Normalize the input to a ``BytesBytesCodec`` instance.
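If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a string, it
is interpreted as the codec name with an empty configuration and then handled like a dict.
If the input is a dict, it is converted to a ``BytesBytesCodec`` instance via the
``_resolve_codec`` function.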
"""
from zarr.abc.codec import BytesBytesCodec

if isinstance(data, str):
data = {"name": data, "configuration": {}}
if isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, BytesBytesCodec):
Expand All @@ -186,14 +188,16 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec:
return result


def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec:
def _parse_array_bytes_codec(data: dict[str, JSON] | str | Codec) -> ArrayBytesCodec:
"""
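Normalize the input to an ``ArrayBytesCodec`` instance.
If the input is already an ``ArrayBytesCodec``, it is returned as is. If the input is a string, it
is interpreted as the codec name with an empty configuration and then handled like a dict.
If the input is a dict, it is converted to an ``ArrayBytesCodec`` instance via the
``_resolve_codec`` function.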
"""
from zarr.abc.codec import ArrayBytesCodec

if isinstance(data, str):
data = {"name": data, "configuration": {}}
if isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, ArrayBytesCodec):
Expand All @@ -206,14 +210,16 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec:
return result


def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec:
def _parse_array_array_codec(data: dict[str, JSON] | str | Codec) -> ArrayArrayCodec:
"""
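Normalize the input to an ``ArrayArrayCodec`` instance.
If the input is already an ``ArrayArrayCodec``, it is returned as is. If the input is a string, it
is interpreted as the codec name with an empty configuration and then handled like a dict.
If the input is a dict, it is converted to an ``ArrayArrayCodec`` instance via the
``_resolve_codec`` function.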
"""
from zarr.abc.codec import ArrayArrayCodec

if isinstance(data, str):
data = {"name": data, "configuration": {}}
if isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, ArrayArrayCodec):
Expand Down
56 changes: 52 additions & 4 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from zarr.core.array import (
CompressorsLike,
FiltersLike,
SerializerLike,
_get_default_chunk_encoding_v2,
_get_default_chunk_encoding_v3,
_parse_chunk_encoding_v2,
Expand Down Expand Up @@ -1025,6 +1026,15 @@ async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_valu
ZstdCodec(level=3),
{"name": "zstd", "configuration": {"level": 3}},
({"name": "zstd", "configuration": {"level": 3}},),
"zstd",
("crc32c", "zstd"),
],
)
@pytest.mark.parametrize(
"serializer",
[
"auto",
"bytes",
],
)
@pytest.mark.parametrize(
Expand Down Expand Up @@ -1065,6 +1075,7 @@ async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_valu
async def test_v3_chunk_encoding(
store: MemoryStore,
compressors: CompressorsLike,
serializer: SerializerLike,
filters: FiltersLike,
dtype: str,
chunks: tuple[int, ...],
Expand All @@ -1073,6 +1084,9 @@ async def test_v3_chunk_encoding(
"""
Test various possibilities for the compressors, serializer, and filters parameters of create_array
"""
if serializer == "bytes" and dtype == "str":
serializer = "vlen-utf8"

arr = await create_array(
store=store,
dtype=dtype,
Expand All @@ -1081,10 +1095,11 @@ async def test_v3_chunk_encoding(
shards=shards,
zarr_format=3,
filters=filters,
serializer=serializer,
compressors=compressors,
)
filters_expected, _, compressors_expected = _parse_chunk_encoding_v3(
filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype)
filters_expected, serializer_expected, compressors_expected = _parse_chunk_encoding_v3(
filters=filters, compressors=compressors, serializer=serializer, dtype=np.dtype(dtype)
)
assert arr.filters == filters_expected
assert arr.compressors == compressors_expected
Expand All @@ -1098,11 +1113,20 @@ async def test_v3_chunk_encoding(
None,
numcodecs.Zstd(level=3),
(),
(numcodecs.Zstd(level=3),),
(numcodecs.Zstd(level=2),),
"zstd",
],
)
@pytest.mark.parametrize(
"filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)]
"filters",
[
"auto",
None,
numcodecs.GZip(level=1),
(numcodecs.GZip(level=2)),
"gzip",
("gzip", "zstd"),
],
)
async def test_v2_chunk_encoding(
store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str
Expand All @@ -1129,6 +1153,30 @@ async def test_v2_chunk_encoding(
assert arr.compressors == compressor_expected
assert arr.filters == filters_expected

@staticmethod
async def test_bad_chunk_encoding(store: MemoryStore) -> None:
    """
    Test that passing an invalid compressor or filter to create_array raises an error.

    Two cases are exercised:

    - Zarr format 2 with ``compressors=2`` (an ``int``): a ``ValueError`` carrying the
      full "must be a single codec" message is expected.
    - Zarr format 3 with ``filters="bad_filter"``: a ``KeyError`` is expected.
    """
    import re

    bad_compressor = 2
    msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(bad_compressor)} instead."
    # ``match`` is treated as a regular expression; escape the message so that
    # characters such as "." are compared literally instead of acting as wildcards.
    with pytest.raises(ValueError, match=re.escape(msg)):
        await create_array(
            store=store,
            dtype="uint8",
            shape=(10,),
            zarr_format=2,
            compressors=bad_compressor,
        )
    with pytest.raises(KeyError):
        await create_array(
            store=store,
            dtype="uint8",
            shape=(10,),
            zarr_format=3,
            filters="bad_filter",
        )

@staticmethod
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
async def test_default_filters_compressors(
Expand Down