Changes from all commits (38 commits)
0e345a6
update PydapArrayWrapper to support backend batching
Mikejmnez Aug 12, 2025
4dbcd62
update PydapDataStore to use backend logic in dap4 to batch variables…
Mikejmnez Aug 12, 2025
007dac2
pydap-server is not necessary
Mikejmnez Aug 12, 2025
76faff6
set `batch=False` as default
Mikejmnez Aug 12, 2025
16a9341
set `batch=False` as default in datatree
Mikejmnez Aug 12, 2025
6f8afb0
set `batch=False` as default in open groups as dict
Mikejmnez Aug 12, 2025
70f500f
for flaky, install pydap from repo for now
Mikejmnez Aug 12, 2025
1ac0ab4
initial tests - quantify cached url
Mikejmnez Aug 13, 2025
3a79592
adds tests to datatree backend to assert multiple dimensions download…
Mikejmnez Aug 13, 2025
a8fe8fe
update testing to show number of download urls
Mikejmnez Aug 13, 2025
1f65ef6
simplified logic
Mikejmnez Aug 13, 2025
0d22358
specify cached session debug name to actually cache urls
Mikejmnez Aug 13, 2025
3205515
fix for mypy
Mikejmnez Aug 13, 2025
cb33c28
user visible changes on `whats-new.rst`
Mikejmnez Aug 13, 2025
f85a0b9
impose sorted to `get_dimensions` method
Mikejmnez Aug 13, 2025
263592d
reformat `whats-new.rst`
Mikejmnez Aug 13, 2025
9e5c785
revert to install pydap from conda and not from repo
Mikejmnez Aug 13, 2025
73aa5a1
expose checksum as user kwarg
Mikejmnez Aug 13, 2025
f59b57d
include `checksums` optional argument in `whats-new`
Mikejmnez Aug 13, 2025
6c354ca
update to newest release of pydap via pip until conda install is avai…
Mikejmnez Aug 13, 2025
eb3fca5
use requests_cache session with retry-params when 500 errors occur
Mikejmnez Aug 13, 2025
16394aa
update env yml file to use new pydap release via conda
Mikejmnez Aug 14, 2025
1b76e98
let `pydap` handle exceptions/warning
Mikejmnez Aug 25, 2025
2ca8a4d
process dims at once, one per group
Mikejmnez Sep 5, 2025
652e5d6
debug
Mikejmnez Sep 9, 2025
8406494
revert what's new from previous commit
Mikejmnez Sep 18, 2025
a5c6ba2
enable data checker for batched deserialized data
Mikejmnez Sep 18, 2025
a111b0c
temporarily install from source for testing - will revert to conda in…
Mikejmnez Sep 18, 2025
fd84f63
update `whats new`
Mikejmnez Sep 18, 2025
5da338c
update tests
Mikejmnez Sep 18, 2025
0c55e52
set `batch=None` as default
Mikejmnez Sep 18, 2025
36ea456
improve handling of dims vs dimensions deprecation warning
Mikejmnez Sep 19, 2025
863ea6d
update to use latest version of pydap
Mikejmnez Sep 26, 2025
fc72d32
update import
Mikejmnez Sep 26, 2025
049ff2e
update `whats-new` docs
Mikejmnez Sep 26, 2025
4f5a715
move cache session to `tmpdir`
Mikejmnez Sep 26, 2025
47b7e73
remove added functionality from whats new from newly released version
Mikejmnez Sep 30, 2025
4b516b4
add to `whats-new` for next release
Mikejmnez Sep 30, 2025
1 change: 0 additions & 1 deletion ci/requirements/environment.yml
@@ -37,7 +37,6 @@ dependencies:
- pre-commit
- pyarrow # pandas raises a deprecation warning without this, breaking doctests
- pydap
- pydap-server
- pytest
- pytest-asyncio
- pytest-cov
4 changes: 4 additions & 0 deletions doc/whats-new.rst
@@ -13,6 +13,10 @@ v2025.10.0 (unreleased)
New Features
~~~~~~~~~~~~

- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset` or :py:func:`open_datatree` to download DAP4 (OPeNDAP) data (:issue:`10628`, :pull:`10629`).
  ``batch=True|False`` is a new ``backend_kwarg`` that enables downloading multiple arrays in a single response. In addition, ``checksums`` is added as an optional argument passed to the ``pydap`` backend.
By `Miguel Jimenez-Urias <https://github.com/Mikejmnez>`_.
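A minimal usage sketch of the new kwargs (the ``dap4://`` URL is the OPeNDAP test dataset exercised in the tests below; ``batch`` and ``checksums`` are the arguments this PR adds):

```python
import xarray as xr

# batch=True asks the pydap backend (pydap > 3.5.5) to fetch multiple
# arrays per DAP4 response; checksums=True (the default) is forwarded
# to pydap to verify each deserialized chunk.
ds = xr.open_dataset(
    "dap4://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc",
    engine="pydap",
    batch=True,
    checksums=True,
    decode_times=False,
)
ds.load()  # non-dimension variables arrive in a single request
```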


Breaking changes
~~~~~~~~~~~~~~~~
110 changes: 90 additions & 20 deletions xarray/backends/pydap_.py
@@ -35,8 +35,10 @@


class PydapArrayWrapper(BackendArray):
def __init__(self, array):
def __init__(self, array, batch=None, checksums=True):
self.array = array
self._batch = batch
self._checksums = checksums

@property
def shape(self) -> tuple[int, ...]:
@@ -52,13 +54,19 @@ def __getitem__(self, key):
)

def _getitem(self, key):
result = robust_getitem(self.array, key, catch=ValueError)
# in some cases, pydap doesn't squeeze axes automatically like numpy
result = np.asarray(result)
axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
if result.ndim + len(axis) != self.array.ndim and axis:
result = np.squeeze(result, axis)
if self._batch and hasattr(self.array, "dataset"):
# True only for pydap>3.5.5
from pydap.client import data_check, get_batch_data

dataset = self.array.dataset
get_batch_data(self.array, checksums=self._checksums, key=key)
result = data_check(np.asarray(dataset[self.array.id].data), key)
else:
result = robust_getitem(self.array, key, catch=ValueError)
result = np.asarray(result.data)
axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
if result.ndim + len(axis) != self.array.ndim and axis:
result = np.squeeze(result, axis)
return result


@@ -81,7 +89,15 @@ class PydapDataStore(AbstractDataStore):
be useful if the netCDF4 library is not available.
"""

def __init__(self, dataset, group=None):
def __init__(
self,
dataset,
group=None,
session=None,
batch=None,
protocol=None,
checksums=True,
):
"""
Parameters
----------
@@ -91,6 +107,9 @@ def __init__(self, dataset, group=None):
"""
self.dataset = dataset
self.group = group
self._batch = batch
self._protocol = protocol
self._checksums = checksums # true by default

@classmethod
def open(
@@ -103,6 +122,8 @@ def open(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
):
from pydap.client import open_url
from pydap.net import DEFAULT_TIMEOUT
@@ -117,6 +138,7 @@
DeprecationWarning,
)
output_grid = False # new default behavior

kwargs = {
"url": url,
"application": application,
@@ -132,22 +154,45 @@ def open(
elif hasattr(url, "ds"):
# pydap dataset
dataset = url.ds
args = {"dataset": dataset}
args = {"dataset": dataset, "checksums": checksums}
if group:
# only then, change the default
args["group"] = group
if url.startswith(("http", "dap2")):
args["protocol"] = "dap2"
elif url.startswith("dap4"):
args["protocol"] = "dap4"
if batch:
args["batch"] = batch
return cls(**args)

def open_store_variable(self, var):
data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
try:
if hasattr(var, "dims"):
dimensions = [
dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims
]
except AttributeError:
else:
# GridType does not have a dims attribute - instead get `dimensions`
# see https://github.com/pydap/pydap/issues/485
dimensions = var.dimensions
if (
self._protocol == "dap4"
and var.name in dimensions
and hasattr(var, "dataset") # only True for pydap>3.5.5
):
if not var.dataset._batch_mode:
# for dap4, always batch all dimensions at once
var.dataset.enable_batch_mode()
data_array = self._get_data_array(var)
data = indexing.LazilyIndexedArray(data_array)
if not self._batch and var.dataset._batch_mode:
# if `batch=False`, restore it for all other variables
var.dataset.disable_batch_mode()
else:
# all non-dimension variables
data = indexing.LazilyIndexedArray(
PydapArrayWrapper(var, self._batch, self._checksums)
)

return Variable(dimensions, data, var.attributes)

def get_variables(self):
@@ -165,6 +210,7 @@ def get_variables(self):
# check the key is not a BaseType or GridType
if not isinstance(self.ds[var], GroupType)
]

return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars)

def get_attrs(self):
@@ -176,18 +222,30 @@ def get_attrs(self):
"libdap",
"invocation",
"dimensions",
"path",
"Maps",
)
attrs = self.ds.attributes
list(map(attrs.pop, opendap_attrs, [None] * 6))
attrs = dict(self.ds.attributes)
list(map(attrs.pop, opendap_attrs, [None] * 8))
return Frozen(attrs)

def get_dimensions(self):
return Frozen(self.ds.dimensions)
return Frozen(sorted(self.ds.dimensions))
Contributor Author (Mikejmnez): To potentially address the issues with dimensions in DataTree, and the lat/lon dimensions being inconsistently ordered, I added this `sorted` to the dimensions list that the backend gets from the pydap dataset directly. Hopefully this little fix will make the issue go away, but I will continue checking it locally and after merging main into this PR (it has not failed once yet! knocks on wood).

Member: This is only dataset-level dimensions, not variable-level dimensions. At the dataset level, dimension order doesn't really matter, so I doubt this is going to fix the issue, unfortunately.

@property
def ds(self):
return get_group(self.dataset, self.group)

def _get_data_array(self, var):
"""gets dimension data all at once"""
from pydap.client import get_batch_data

if not var._is_data_loaded():
# data has not been deserialized yet
# runs only once per store/hierarchy
get_batch_data(var, checksums=self._checksums)
return self.dataset[var.id].data


class PydapBackendEntrypoint(BackendEntrypoint):
"""
@@ -231,6 +289,8 @@ def open_dataset(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
) -> Dataset:
store = PydapDataStore.open(
url=filename_or_obj,
@@ -241,6 +301,8 @@
timeout=timeout,
verify=verify,
user_charset=user_charset,
batch=batch,
checksums=checksums,
)
store_entrypoint = StoreBackendEntrypoint()
with close_on_error(store):
@@ -273,6 +335,8 @@ def open_datatree(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
) -> DataTree:
groups_dict = self.open_groups_as_dict(
filename_or_obj,
@@ -285,10 +349,12 @@ def open_datatree(
decode_timedelta=decode_timedelta,
group=group,
application=None,
session=None,
timeout=None,
verify=None,
user_charset=None,
session=session,
timeout=timeout,
verify=verify,
user_charset=user_charset,
batch=batch,
checksums=checksums,
)

return datatree_from_dict_with_io_cleanup(groups_dict)
@@ -310,6 +376,8 @@ def open_groups_as_dict(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
) -> dict[str, Dataset]:
from xarray.core.treenode import NodePath

@@ -321,6 +389,8 @@
timeout=timeout,
verify=verify,
user_charset=user_charset,
batch=batch,
checksums=checksums,
)

# Check for a group and make it a parent if it exists
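For orientation, a condensed sketch of the batched flow the store implements above. It assumes the pydap > 3.5.5 helpers the diff relies on (`get_batch_data`, `enable_batch_mode`, `_is_data_loaded`), and the variable name `TIME` is a hypothetical pick from the COADS test file:

```python
import numpy as np
from pydap.client import get_batch_data, open_url

# Sketch only, not the backend itself: mirror what open_store_variable
# and _get_data_array do for dimension arrays over dap4.
pyds = open_url("dap4://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc")
pyds.enable_batch_mode()  # one request can now serve many arrays

var = pyds["TIME"]  # hypothetical dimension variable in the test file
if not var._is_data_loaded():
    # a single DAP response deserializes all registered dimensions at once
    get_batch_data(var, checksums=True)
data = np.asarray(pyds[var.id].data)
```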
52 changes: 52 additions & 0 deletions xarray/tests/test_backends.py
@@ -6465,6 +6465,58 @@ def test_session(self) -> None:
)


@requires_pydap
@network
@pytest.mark.parametrize("protocol", ["dap2", "dap4"])
@pytest.mark.parametrize("batch", [False, True])
def test_batchdap4_downloads(tmpdir, protocol, batch) -> None:
"""Test that in dap4, all dimensions are downloaded at once"""
import pydap
from pydap.net import create_session

_version_ = Version(pydap.__version__)
# Create a session with pre-set params in the pydap backend, to cache urls
cache_name = tmpdir / "debug"
session = create_session(use_cache=True, cache_kwargs={"cache_name": cache_name})
session.cache.clear()
url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc"

if protocol == "dap4":
ds = open_dataset(
url.replace("https", protocol),
engine="pydap",
session=session,
decode_times=False,
batch=batch,
)
if _version_ > Version("3.5.5"):
# total downloads are:
# 1 dmr + 1 dap (dimensions)
assert len(session.cache.urls()) == 2
# now load the rest of the variables
ds.load()
if batch:
# all non-dimension arrays are downloaded in a single https request
assert len(session.cache.urls()) == 2 + 1
if not batch:
# each non-dimension array is downloaded with an individual
# https request
assert len(session.cache.urls()) == 2 + 4
else:
assert len(session.cache.urls()) == 4
ds.load()
assert len(session.cache.urls()) == 4 + 4
elif protocol == "dap2":
ds = open_dataset(
url.replace("https", protocol),
engine="pydap",
session=session,
decode_times=False,
)
# das + dds + 3 dods urls
assert len(session.cache.urls()) == 5

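The request counts asserted above can be reproduced interactively. A small sketch, assuming pydap > 3.5.5 and the `requests_cache`-backed session returned by `pydap.net.create_session` (its `cache.urls()` records one entry per downloaded URL; the `/tmp/debug` cache path is illustrative):

```python
import xarray as xr
from pydap.net import create_session

# Cache responses so every URL the backend hits is recorded once.
session = create_session(use_cache=True, cache_kwargs={"cache_name": "/tmp/debug"})
session.cache.clear()

ds = xr.open_dataset(
    "dap4://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc",
    engine="pydap",
    session=session,
    decode_times=False,
    batch=True,
)
print(len(session.cache.urls()))  # 2: one dmr + one dap for all dimensions
ds.load()
print(len(session.cache.urls()))  # 3: one more dap for all remaining arrays
```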

class TestEncodingInvalid:
def test_extract_nc4_variable_encoding(self) -> None:
var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"})
Expand Down
24 changes: 22 additions & 2 deletions xarray/tests/test_backends_datatree.py
@@ -9,6 +9,7 @@

import numpy as np
import pytest
from packaging.version import Version

import xarray as xr
from xarray import DataTree, load_datatree, open_datatree, open_groups
@@ -325,7 +326,7 @@

filepath = tmpdir + "/phony_dims.nc"
original_dt = simple_datatree
original_dt.to_netcdf(filepath) # should not raise

Check failure on line 329 in xarray/tests/test_backends_datatree.py (GitHub Actions / ubuntu-latest py3.13 all-but-numba): TestH5NetCDFDatatreeIO.test_default_write_engine ModuleNotFoundError: No module named 'h5netcdf.legacyapi'

@requires_dask
def test_open_datatree_chunks(self, tmpdir) -> None:
@@ -613,7 +614,7 @@
) as expected:
assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected)

def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
def test_inherited_coords(self, tmpdir, url=simplegroup_datatree_url) -> None:
"""Test that `open_datatree` inherits coordinates from root tree.

This particular h5 file is a test file that inherits the time coordinate from the root
@@ -639,7 +640,19 @@
│ Temperature (time, Z, Y, X) float32 ...
| Salinity (time, Z, Y, X) float32 ...
"""
tree = open_datatree(url, engine=self.engine)
import pydap
from pydap.net import create_session

# Create a session with pre-set retry params in the pydap backend, to cache urls
cache_name = tmpdir / "debug"
session = create_session(
use_cache=True, cache_kwargs={"cache_name": cache_name}
)
session.cache.clear()

_version_ = Version(pydap.__version__)

tree = open_datatree(url, engine=self.engine, session=session)
assert set(tree.dims) == {"time", "Z", "nv"}
assert tree["/SimpleGroup"].coords["time"].dims == ("time",)
assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",)
@@ -650,6 +663,13 @@
list(expected.dims) + ["Z", "nv"]
)

if _version_ > Version("3.5.5"):
# Total downloads: 1 dmr + 1 dap url per group (all dimensions in a group are fetched at once)
assert len(session.cache.urls()) == 3
else:
# 1 dmr + 1 dap url per dimension (there are 4 dimension arrays in total)
assert len(session.cache.urls()) == 5

def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None:
aligned_dict_of_datasets = open_groups(url, engine=self.engine)
aligned_dt = DataTree.from_dict(aligned_dict_of_datasets)