Skip to content

What should happen in these Edge Cases in reading from Zarr #10806

@ianhi

Description

@ianhi

What is your issue?

I was playing around with manually creating zarr stores and feeding them into xarray. I noticed some weird edge cases when the name of a coordinate variable and the name of its dimension are not the same. I'm not sure that either of these cases is necessarily a bug, but the resulting behavior doesn't "feel right".

cc @TomNicholas

# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "xarray[complete]@git+https://github.com/pydata/xarray.git@main",
#   "zarr",
#   "numpy",
# ]
# ///
#

import xarray as xr
import zarr
import numpy as np

# Case 1: two arrays, "blah" and "data", are both declared along a dimension
# named "coord_1". Nothing written below creates an array that is itself
# named "coord_1".
fname = "mismatch_name_1.zarr"
z = zarr.open(fname)
z.create_array(
    "blah",
    data=np.arange(10, dtype=int),
    dimension_names=["coord_1"],
    overwrite=True,
)
z.create_array(
    "data", data=np.arange(100, step=10), dimension_names=["coord_1"], overwrite=True
)
# NOTE(review): consolidated=False is presumably passed because no consolidated
# metadata is written above -- confirm.
ds = xr.open_zarr(fname, consolidated=False)
print(ds)

# Output recorded by the reporter: "blah" and "data" both appear as data
# variables, and "coord_1" is listed as a dimension without coordinates.
# <xarray.Dataset> Size: 160B
# Dimensions:  (coord_1: 10)
# Dimensions without coordinates: coord_1
# Data variables:
#     blah     (coord_1) int64 80B ...
#     data     (coord_1) int64 80B ...



#############################


# Case 2: an array named "coord_1" is declared along a dimension named "blah",
# while "data" is declared along a dimension named "coord_1". The array name
# "coord_1" therefore matches a dimension name, but not its own dimension.
fname = "mismatch_name_2.zarr"
z = zarr.open(fname)
z.create_array(
    "coord_1",
    data=np.arange(10, dtype=int),
    dimension_names=["blah"],
    overwrite=True,
)
z.create_array(
    "data", data=np.arange(100, step=10), dimension_names=["coord_1"], overwrite=True
)
# NOTE(review): no engine is passed to load_dataset; presumably xarray infers
# the zarr backend from the path -- confirm.
ds = xr.load_dataset(fname, consolidated=False)
print(ds)

# Output recorded by the reporter: "coord_1" is shown as a coordinate lying
# along "blah", and the dataset has two separate length-10 dimensions.
# <xarray.Dataset> Size: 160B
# Dimensions:  (coord_1: 10, blah: 10)
# Coordinates:
#     coord_1  (blah) int64 80B 0 1 2 3 4 5 6 7 8 9
# Dimensions without coordinates: blah
# Data variables:
#     data     (coord_1) int64 80B 0 10 20 30 40 50 60 70 80 90


# doesn't fail but gives wrong selection
# (in the recorded output below, the "coord_1" coordinate becomes the scalar 4
# while "data" still holds all 10 values along the "coord_1" dimension)
print(ds.sel(blah=4))

# <xarray.Dataset> Size: 88B
# Dimensions:  (coord_1: 10)
# Coordinates:
#     coord_1  int64 8B 4
# Data variables:
#     data     (coord_1) int64 80B 0 10 20 30 40 50 60 70 80 90

# fails with error
# KeyError: "no index found for coordinate 'coord_1'"
# This call is the last statement because, per the report, it raises.
ds.sel(coord_1=4)

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions