From 0e345a67269c14b444a50b3ceb6dec7b2f89da5e Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:00:37 -0700 Subject: [PATCH 01/38] update PydapArrayWrapper to support backend batching --- xarray/backends/pydap_.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4fbfe8ee210..f143959d010 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -35,8 +35,10 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array): + def __init__(self, array, batch=False, cache=None): self.array = array + self._batch = batch + self._cache = cache @property def shape(self) -> tuple[int, ...]: @@ -52,13 +54,29 @@ def __getitem__(self, key): ) def _getitem(self, key): - result = robust_getitem(self.array, key, catch=ValueError) - # in some cases, pydap doesn't squeeze axes automatically like numpy - result = np.asarray(result) + if self.array.id in self._cache.keys(): + # safely avoid re-downloading some coordinates + result = self._cache[self.array.id] + elif self._batch and hasattr(self.array, "dataset"): + # this are both True only for pydap>3.5.5 + from pydap.lib import resolve_batch_for_all_variables + + parent = self.array.parent # could be root ds | group + variables = list(parent.variables()) + resolve_batch_for_all_variables(parent, variables, key) + + result = np.asarray( + parent.dataset._current_batch_promise.wait_for_result(self.array.id) + ) + else: + result = robust_getitem(self.array, key, catch=ValueError) + try: + result = np.asarray(result.data) + except AttributeError: + result = np.asarray(result) axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) if result.ndim + len(axis) != self.array.ndim and axis: result = np.squeeze(result, axis) - return result From 4dbcd623b05178480550998c7632f646ab85746a Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:07:21 -0700 Subject: [PATCH 
02/38] update PydapDataStore to use backend logic in dap4 to batch variables all together in single dap url --- xarray/backends/pydap_.py | 67 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index f143959d010..26957c0175c 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -99,7 +100,7 @@ class PydapDataStore(AbstractDataStore): be useful if the netCDF4 library is not available. """ - def __init__(self, dataset, group=None): + def __init__(self, dataset, group=None, session=None, batch=False, protocol=None): """ Parameters ---------- @@ -109,6 +110,11 @@ def __init__(self, dataset, group=None): """ self.dataset = dataset self.group = group + self.session = session + self._batch = batch + self._batch_done = False + self._array_cache = {} # holds 1D dimension data + self._protocol = protocol @classmethod def open( @@ -121,6 +127,7 @@ def open( timeout=None, verify=None, user_charset=None, + batch=False, ): from pydap.client import open_url from pydap.net import DEFAULT_TIMEOUT @@ -135,6 +142,7 @@ def open( DeprecationWarning, ) output_grid = False # new default behavior + kwargs = { "url": url, "application": application, @@ -152,12 +160,26 @@ def open( dataset = url.ds args = {"dataset": dataset} if group: - # only then, change the default args["group"] = group + if url.startswith(("https", "dap2")): + args["protocol"] = "dap2" + else: + args["protocol"] = "dap4" + if batch: + if args["protocol"] == "dap2": + warnings.warn( + f"`batch={batch}` is currently only compatible with the `DAP4` " + "protocol. Make sue the OPeNDAP server implements the `DAP4` " + "protocol and then replace the scheme of the url with `dap4` " + "to make use of it. 
Setting `batch=False`.", + stacklevel=2, + ) + else: + # only update if dap4 + args["batch"] = batch return cls(**args) def open_store_variable(self, var): - data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) try: dimensions = [ dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims @@ -166,6 +188,25 @@ def open_store_variable(self, var): # GridType does not have a dims attribute - instead get `dimensions` # see https://github.com/pydap/pydap/issues/485 dimensions = var.dimensions + if ( + self._protocol == "dap4" + and var.name in dimensions + and hasattr(var, "dataset") # only True for pydap>3.5.5 + ): + if not var.dataset._batch_mode: + # for dap4, always batch all dimensions at once + var.dataset.enable_batch_mode() + data_array = self._get_data_array(var) + data = indexing.LazilyIndexedArray(data_array) + if not self._batch and var.dataset._batch_mode: + # if `batch=False``, restore it for all other variables + var.dataset.disable_batch_mode() + else: + # all non-dimension variables + data = indexing.LazilyIndexedArray( + PydapArrayWrapper(var, self._batch, self._array_cache) + ) + return Variable(dimensions, data, var.attributes) def get_variables(self): @@ -183,6 +224,7 @@ def get_variables(self): # check the key is not a BaseType or GridType if not isinstance(self.ds[var], GroupType) ] + return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars) def get_attrs(self): @@ -194,9 +236,11 @@ def get_attrs(self): "libdap", "invocation", "dimensions", + "path", + "Maps", ) - attrs = self.ds.attributes - list(map(attrs.pop, opendap_attrs, [None] * 6)) + attrs = dict(self.ds.attributes) + list(map(attrs.pop, opendap_attrs, [None] * 8)) return Frozen(attrs) def get_dimensions(self): @@ -206,6 +250,19 @@ def get_dimensions(self): def ds(self): return get_group(self.dataset, self.group) + def _get_data_array(self, var): + """gets dimension data all at once, storing the numpy + arrays within a cached dictionary + """ + from 
pydap.lib import get_batch_data + + if not self._batch_done or var.id not in self._array_cache: + # store all dim data into a dict for reuse + self._array_cache = get_batch_data(var.parent, self._array_cache) + self._batch_done = True + + return self._array_cache[var.id] + class PydapBackendEntrypoint(BackendEntrypoint): """ From 007dac2fbf23e7d2b2994bd998437c0e55cbcdcd Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:17:51 -0700 Subject: [PATCH 03/38] pydap-server it not necessary --- ci/requirements/environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index f56b2bc1d1c..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -37,7 +37,6 @@ dependencies: - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - pydap - - pydap-server - pytest - pytest-asyncio - pytest-cov From 76faff6edabd3c9526cca4b82c0f801fd6d6899b Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:35:52 -0700 Subject: [PATCH 04/38] set `batch=False` as default --- xarray/backends/pydap_.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 26957c0175c..f8e55034146 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -306,6 +306,7 @@ def open_dataset( timeout=None, verify=None, user_charset=None, + batch=False, ) -> Dataset: store = PydapDataStore.open( url=filename_or_obj, @@ -316,6 +317,7 @@ def open_dataset( timeout=timeout, verify=verify, user_charset=user_charset, + batch=batch, ) store_entrypoint = StoreBackendEntrypoint() with close_on_error(store): From 16a934106f8dd5a499f4c27dd6fff4140b486abc Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:36:40 -0700 Subject: [PATCH 05/38] set `batch=False` as default in datatree --- xarray/backends/pydap_.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 
deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index f8e55034146..084260be423 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -350,6 +350,7 @@ def open_datatree( timeout=None, verify=None, user_charset=None, + batch=False, ) -> DataTree: groups_dict = self.open_groups_as_dict( filename_or_obj, @@ -362,10 +363,11 @@ def open_datatree( decode_timedelta=decode_timedelta, group=group, application=None, - session=None, - timeout=None, - verify=None, - user_charset=None, + session=session, + timeout=timeout, + verify=application, + user_charset=user_charset, + batch=batch, ) return datatree_from_dict_with_io_cleanup(groups_dict) From 6f8afb0d4ba352f0295e4606524bdb07eef9451b Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:37:42 -0700 Subject: [PATCH 06/38] set `batch=False` as default in open groups as dict --- xarray/backends/pydap_.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 084260be423..acbc385a10e 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -389,6 +389,7 @@ def open_groups_as_dict( timeout=None, verify=None, user_charset=None, + batch=False, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath @@ -400,6 +401,7 @@ def open_groups_as_dict( timeout=timeout, verify=verify, user_charset=user_charset, + batch=batch, ) # Check for a group and make it a parent if it exists From 70f500f04ae8217699a86a878a3962a8791b7b51 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 12:16:42 -0700 Subject: [PATCH 07/38] for flaky, install pydap from repo for now --- ci/requirements/environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index eff54fe469e..91f2a70d0d6 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - 
pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - - pydap + # - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,3 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect + - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release From 1ac0ab4a8347e1a70c6a9361c3a2d669f168f405 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 17:38:11 -0700 Subject: [PATCH 08/38] initial tests - quantify cached url --- xarray/backends/pydap_.py | 4 +-- xarray/tests/test_backends.py | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index acbc385a10e..4dfc6bb4015 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -161,9 +161,9 @@ def open( args = {"dataset": dataset} if group: args["group"] = group - if url.startswith(("https", "dap2")): + if url.startswith(("http", "dap2")): args["protocol"] = "dap2" - else: + elif url.startswith("dap4"): args["protocol"] = "dap4" if batch: if args["protocol"] == "dap2": diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7df9596b1ae..54f24144b1f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6465,6 +6465,53 @@ def test_session(self) -> None: ) +@requires_pydap +@network +@pytest.mark.parametrize("protocol", ["dap2", "dap4"]) +def test_batchdap4_downloads(protocol) -> None: + """Test that in dap4, all dimensions are downloaded at once""" + import pydap + from requests_cache import CachedSession + + _version_ = Version(pydap.__version__) + session = CachedSession() + session.cache.clear() + url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" + + open_dataset( + url.replace("https", protocol), + engine="pydap", + session=session, + decode_times=False, + ) + if protocol 
== "dap4": + if _version_ > Version("3.5.5"): + # should download 2 urls only (1 dmr and 1 dap) + assert len(session.cache.urls()) == 2 + else: + assert len(session.cache.urls()) == 4 + # das + dds + 3 dods urls + elif protocol == "dap2": + assert len(session.cache.urls()) == 5 + + +@requires_pydap +@network +def test_batch_warnswithdap2() -> None: + from requests_cache import CachedSession + + session = CachedSession() + session.cache.clear() + url = "dap2://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" + with pytest.warns(UserWarning): + open_dataset( + url, engine="pydap", session=session, batch=True, decode_times=False + ) + + # no batching is supported here + assert len(session.cache.urls()) == 5 + + class TestEncodingInvalid: def test_extract_nc4_variable_encoding(self) -> None: var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"}) From 3a79592154533636a649b56a8639fb0ab17eb1e0 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 21:30:40 -0700 Subject: [PATCH 09/38] adds tests to datatree backend to assert multiple dimensions downloaded at once (per group) --- xarray/tests/test_backends_datatree.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 5b2b93f9ade..f230af6c575 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from packaging.version import Version import xarray as xr from xarray import DataTree, load_datatree, open_datatree, open_groups @@ -639,7 +640,15 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: │ Temperature (time, Z, Y, X) float32 ... | Salinity (time, Z, Y, X) float32 ... 
""" - tree = open_datatree(url, engine=self.engine) + import pydap + from requests_cache import CachedSession + + _version_ = Version(pydap.__version__) + + session = CachedSession() + session.cache.clear() + + tree = open_datatree(url, engine=self.engine, session=session) assert set(tree.dims) == {"time", "Z", "nv"} assert tree["/SimpleGroup"].coords["time"].dims == ("time",) assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",) @@ -650,6 +659,19 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: list(expected.dims) + ["Z", "nv"] ) + # group (including root). So in this case 3. In the future there + # should a only be 2 downloads (all dimensions should be downloaded) + # within single + + if _version_ > Version("3.5.5"): + # Total downloads are: 1 dmr, + 1 dap url per Group | root. + # since there is a group then 2 dap url. In the future there + # should only be 1 dap url downloaded. + assert len(session.cache.urls()) == 3 + else: + # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) + assert len(session.cache.urls()) == 5 + def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None: aligned_dict_of_datasets = open_groups(url, engine=self.engine) aligned_dt = DataTree.from_dict(aligned_dict_of_datasets) From a8fe8fecac1215a141f76af4ef6e5397e723c38a Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 22:47:22 -0700 Subject: [PATCH 10/38] update testing to show number of download urls --- xarray/tests/test_backends.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 54f24144b1f..c42cc24989e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6468,7 +6468,8 @@ def test_session(self) -> None: @requires_pydap @network @pytest.mark.parametrize("protocol", ["dap2", "dap4"]) -def test_batchdap4_downloads(protocol) -> None: 
+@pytest.mark.parametrize("batch", [False, True]) +def test_batchdap4_downloads(protocol, batch) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap from requests_cache import CachedSession @@ -6478,20 +6479,36 @@ def test_batchdap4_downloads(protocol) -> None: session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" - open_dataset( - url.replace("https", protocol), - engine="pydap", - session=session, - decode_times=False, - ) + args = { + "filename_or_obj": url.replace("https", protocol), + "engine": "pydap", + "session": session, + "decode_times": False, + } + if protocol == "dap4": + ds = open_dataset(**args, batch=batch) if _version_ > Version("3.5.5"): - # should download 2 urls only (1 dmr and 1 dap) + # total downloads are: + # 1 dmr + 1 dap (dimensions) assert len(session.cache.urls()) == 2 + # now load the rest of the variables + ds.load() + if batch: + # all non-dimensions are downloaded in a single https requests + assert len(session.cache.urls()) == 2 + 1 + if not batch: + # each non-dimension array is downloaded with an individual + # https requests + assert len(session.cache.urls()) == 2 + 4 else: assert len(session.cache.urls()) == 4 - # das + dds + 3 dods urls + ds.load() + assert len(session.cache.urls()) == 4 + 4 elif protocol == "dap2": + ds = open_dataset(**args) + # das + dds + 3 dods urls + assert len(session.cache.urls()) == 5 From 1f65ef6ceac4c90260d89b6dc0b3215e8e67616d Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 00:01:01 -0700 Subject: [PATCH 11/38] simplified logic --- xarray/backends/pydap_.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4dfc6bb4015..49a9103c697 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -62,12 +62,10 @@ def _getitem(self, key): # this are both True only for pydap>3.5.5 from pydap.lib import 
resolve_batch_for_all_variables - parent = self.array.parent # could be root ds | group - variables = list(parent.variables()) - resolve_batch_for_all_variables(parent, variables, key) - + dataset = self.array.dataset + resolve_batch_for_all_variables(self.array, key) result = np.asarray( - parent.dataset._current_batch_promise.wait_for_result(self.array.id) + dataset._current_batch_promise.wait_for_result(self.array.id) ) else: result = robust_getitem(self.array, key, catch=ValueError) From 0d22358bb223206bbf5d1ef9bef1e4293d9eda8e Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 00:01:24 -0700 Subject: [PATCH 12/38] specify cached session debug name to actually cache urls --- xarray/tests/test_backends.py | 2 +- xarray/tests/test_backends_datatree.py | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c42cc24989e..1118e434e02 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6475,7 +6475,7 @@ def test_batchdap4_downloads(protocol, batch) -> None: from requests_cache import CachedSession _version_ = Version(pydap.__version__) - session = CachedSession() + session = CachedSession(cache_name="debug") # so that urls are cached session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index f230af6c575..062fdb88fdd 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -645,7 +645,7 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: _version_ = Version(pydap.__version__) - session = CachedSession() + session = CachedSession(cache_name="debug") # so that urls are cached session.cache.clear() tree = open_datatree(url, engine=self.engine, session=session) @@ -659,15 +659,9 @@ def test_inherited_coords(self, 
url=simplegroup_datatree_url) -> None: list(expected.dims) + ["Z", "nv"] ) - # group (including root). So in this case 3. In the future there - # should a only be 2 downloads (all dimensions should be downloaded) - # within single - if _version_ > Version("3.5.5"): - # Total downloads are: 1 dmr, + 1 dap url per Group | root. - # since there is a group then 2 dap url. In the future there - # should only be 1 dap url downloaded. - assert len(session.cache.urls()) == 3 + # Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups + assert len(session.cache.urls()) == 2 else: # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) assert len(session.cache.urls()) == 5 From 3205515c5eb99715e69cb8af70d11497ae38ed92 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 00:24:30 -0700 Subject: [PATCH 13/38] fix for mypy --- xarray/tests/test_backends.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1118e434e02..b4a2cafa757 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6479,15 +6479,14 @@ def test_batchdap4_downloads(protocol, batch) -> None: session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" - args = { - "filename_or_obj": url.replace("https", protocol), - "engine": "pydap", - "session": session, - "decode_times": False, - } - if protocol == "dap4": - ds = open_dataset(**args, batch=batch) + ds = open_dataset( + url.replace("https", protocol), + engine="pydap", + session=session, + decode_times=False, + batch=batch, + ) if _version_ > Version("3.5.5"): # total downloads are: # 1 dmr + 1 dap (dimensions) @@ -6506,9 +6505,13 @@ def test_batchdap4_downloads(protocol, batch) -> None: ds.load() assert len(session.cache.urls()) == 4 + 4 elif protocol == "dap2": - ds = open_dataset(**args) + ds = open_dataset( + url.replace("https", protocol), + 
engine="pydap", + session=session, + decode_times=False, + ) # das + dds + 3 dods urls - assert len(session.cache.urls()) == 5 From cb33c2897385e2e52801767bfd7b0f60cf570269 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 09:21:26 -0700 Subject: [PATCH 14/38] user visible changes on `whats-new.rst` --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b8ffab2889f..ed9dfb06677 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -239,6 +239,9 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading + dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables + downloading multiple arrays in single response. Breaking changes ~~~~~~~~~~~~~~~~ From f85a0b997a9d8468640e325b14c0ab4562091c46 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 09:50:21 -0700 Subject: [PATCH 15/38] impose sorted to `get_dimensions` method --- xarray/backends/pydap_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 49a9103c697..72027a4cefa 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -242,7 +242,7 @@ def get_attrs(self): return Frozen(attrs) def get_dimensions(self): - return Frozen(self.ds.dimensions) + return Frozen(sorted(self.ds.dimensions)) @property def ds(self): From 263592d7813dfdf041e08f44cbb6d479916eaa16 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 09:52:45 -0700 Subject: [PATCH 16/38] reformat `whats-new.rst` --- doc/whats-new.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ed9dfb06677..1d63a9b8561 
100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -239,9 +239,8 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading - dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables - downloading multiple arrays in single response. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. + By `Miguel Jimenez-Urias `_. Breaking changes ~~~~~~~~~~~~~~~~ From 9e5c7859d05b1922b697b77d942601685d36f03e Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 10:12:24 -0700 Subject: [PATCH 17/38] revert to install pydap from conda and not from repo --- ci/requirements/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 91f2a70d0d6..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - # - pydap + - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,4 +65,3 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release From 73aa5a120fea0e57629988600ea18e511cbe1da9 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 14:34:26 -0700 Subject: [PATCH 18/38] expose checksum as user kwarg --- 
ci/requirements/environment.yml | 3 ++- xarray/backends/pydap_.py | 30 +++++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index eff54fe469e..91f2a70d0d6 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - - pydap + # - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,3 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect + - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 72027a4cefa..4d86a55dbab 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -36,10 +36,11 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array, batch=False, cache=None): + def __init__(self, array, batch=False, cache=None, checksums=True): self.array = array self._batch = batch self._cache = cache + self._checksums = checksums @property def shape(self) -> tuple[int, ...]: @@ -63,7 +64,7 @@ def _getitem(self, key): from pydap.lib import resolve_batch_for_all_variables dataset = self.array.dataset - resolve_batch_for_all_variables(self.array, key) + resolve_batch_for_all_variables(self.array, key, checksums=self._checksums) result = np.asarray( dataset._current_batch_promise.wait_for_result(self.array.id) ) @@ -98,7 +99,15 @@ class PydapDataStore(AbstractDataStore): be useful if the netCDF4 library is not available. 
""" - def __init__(self, dataset, group=None, session=None, batch=False, protocol=None): + def __init__( + self, + dataset, + group=None, + session=None, + batch=False, + protocol=None, + checksums=True, + ): """ Parameters ---------- @@ -113,6 +122,7 @@ def __init__(self, dataset, group=None, session=None, batch=False, protocol=None self._batch_done = False self._array_cache = {} # holds 1D dimension data self._protocol = protocol + self._checksums = checksums # true by default @classmethod def open( @@ -126,6 +136,7 @@ def open( verify=None, user_charset=None, batch=False, + checksums=True, ): from pydap.client import open_url from pydap.net import DEFAULT_TIMEOUT @@ -157,6 +168,7 @@ def open( # pydap dataset dataset = url.ds args = {"dataset": dataset} + args["checksums"] = checksums if group: args["group"] = group if url.startswith(("http", "dap2")): @@ -202,7 +214,7 @@ def open_store_variable(self, var): else: # all non-dimension variables data = indexing.LazilyIndexedArray( - PydapArrayWrapper(var, self._batch, self._array_cache) + PydapArrayWrapper(var, self._batch, self._array_cache, self._checksums) ) return Variable(dimensions, data, var.attributes) @@ -256,7 +268,9 @@ def _get_data_array(self, var): if not self._batch_done or var.id not in self._array_cache: # store all dim data into a dict for reuse - self._array_cache = get_batch_data(var.parent, self._array_cache) + self._array_cache = get_batch_data( + var.parent, self._array_cache, self._checksums + ) self._batch_done = True return self._array_cache[var.id] @@ -305,6 +319,7 @@ def open_dataset( verify=None, user_charset=None, batch=False, + checksums=True, ) -> Dataset: store = PydapDataStore.open( url=filename_or_obj, @@ -316,6 +331,7 @@ def open_dataset( verify=verify, user_charset=user_charset, batch=batch, + checksums=checksums, ) store_entrypoint = StoreBackendEntrypoint() with close_on_error(store): @@ -349,6 +365,7 @@ def open_datatree( verify=None, user_charset=None, batch=False, + 
checksums=True, ) -> DataTree: groups_dict = self.open_groups_as_dict( filename_or_obj, @@ -366,6 +383,7 @@ def open_datatree( verify=application, user_charset=user_charset, batch=batch, + checksums=checksums, ) return datatree_from_dict_with_io_cleanup(groups_dict) @@ -388,6 +406,7 @@ def open_groups_as_dict( verify=None, user_charset=None, batch=False, + checksums=True, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath @@ -400,6 +419,7 @@ def open_groups_as_dict( verify=verify, user_charset=user_charset, batch=batch, + checksums=checksums, ) # Check for a group and make it a parent if it exists From f59b57dc9f6e51f78af87e38af729a86bd885e06 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 14:47:17 -0700 Subject: [PATCH 19/38] include `checksums` optional argument in `whats-new` --- doc/whats-new.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1d63a9b8561..4046c802f72 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -239,7 +239,8 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. By `Miguel Jimenez-Urias `_. 
Breaking changes From 6c354ca662362d14ad26eb8b209bd11d5ecd09ca Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 15:06:08 -0700 Subject: [PATCH 20/38] update to newest release of pydap via pip until conda install is available --- ci/requirements/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 91f2a70d0d6..b012fe82e56 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -65,4 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release + - pydap==3.5.6 # just for now - will restore to conda after new release From eb3fca5d1d14f91fc0295d09507f06365c1d9ad4 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 16:12:24 -0700 Subject: [PATCH 21/38] use requests_cache session with retry-params when 500 errors occur --- xarray/backends/pydap_.py | 1 - xarray/tests/test_backends.py | 11 +++++++---- xarray/tests/test_backends_datatree.py | 9 +++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4d86a55dbab..fd66bce2643 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -117,7 +117,6 @@ def __init__( """ self.dataset = dataset self.group = group - self.session = session self._batch = batch self._batch_done = False self._array_cache = {} # holds 1D dimension data diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b4a2cafa757..f4dd0c62ba3 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6472,10 +6472,11 @@ def test_session(self) -> None: def test_batchdap4_downloads(protocol, batch) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap - from requests_cache import 
CachedSession + from pydap.net import create_session _version_ = Version(pydap.__version__) - session = CachedSession(cache_name="debug") # so that urls are cached + # Create a session with pre-set params in pydap backend, to cache urls + session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" @@ -6518,10 +6519,12 @@ def test_batchdap4_downloads(protocol, batch) -> None: @requires_pydap @network def test_batch_warnswithdap2() -> None: - from requests_cache import CachedSession + from pydap.net import create_session - session = CachedSession() + # Create a session with pre-set retry params in pydap backend, to cache urls + session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) session.cache.clear() + url = "dap2://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" with pytest.warns(UserWarning): open_dataset( diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 062fdb88fdd..cda353b43a4 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -641,13 +641,14 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: | Salinity (time, Z, Y, X) float32 ... 
""" import pydap - from requests_cache import CachedSession + from pydap.net import create_session - _version_ = Version(pydap.__version__) - - session = CachedSession(cache_name="debug") # so that urls are cached + # Create a session with pre-set retry params in pydap backend, to cache urls + session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) session.cache.clear() + _version_ = Version(pydap.__version__) + tree = open_datatree(url, engine=self.engine, session=session) assert set(tree.dims) == {"time", "Z", "nv"} assert tree["/SimpleGroup"].coords["time"].dims == ("time",) From 16394aaa98f1fa35c5e80372cf7488ad599bbd0b Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Thu, 14 Aug 2025 09:45:07 -0700 Subject: [PATCH 22/38] update env yml file to use new pydap release via conda --- ci/requirements/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index b012fe82e56..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - # - pydap + - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,4 +65,3 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - pydap==3.5.6 # just for now - will restore to conda after new release From 1b76e98867748742c7f01608d38ba7ce983a5821 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Mon, 25 Aug 2025 10:19:05 -0700 Subject: [PATCH 23/38] let `pydap` handle exceptions/warning --- xarray/backends/pydap_.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index fd66bce2643..cf36a5ecb08 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -1,6 +1,5 @@ from __future__ 
import annotations -import warnings from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -166,8 +165,7 @@ def open( elif hasattr(url, "ds"): # pydap dataset dataset = url.ds - args = {"dataset": dataset} - args["checksums"] = checksums + args = {"dataset": dataset, "checksums": checksums} if group: args["group"] = group if url.startswith(("http", "dap2")): @@ -175,17 +173,7 @@ def open( elif url.startswith("dap4"): args["protocol"] = "dap4" if batch: - if args["protocol"] == "dap2": - warnings.warn( - f"`batch={batch}` is currently only compatible with the `DAP4` " - "protocol. Make sue the OPeNDAP server implements the `DAP4` " - "protocol and then replace the scheme of the url with `dap4` " - "to make use of it. Setting `batch=False`.", - stacklevel=2, - ) - else: - # only update if dap4 - args["batch"] = batch + args["batch"] = batch return cls(**args) def open_store_variable(self, var): From 2ca8a4d5fc078c4aefbb515af27bfea3393adb85 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 5 Sep 2025 16:20:39 -0700 Subject: [PATCH 24/38] process dims at once, one per group --- xarray/backends/pydap_.py | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index cf36a5ecb08..b381efd0275 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -35,10 +35,9 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array, batch=False, cache=None, checksums=True): + def __init__(self, array, batch=False, checksums=True): self.array = array self._batch = batch - self._cache = cache self._checksums = checksums @property @@ -55,10 +54,7 @@ def __getitem__(self, key): ) def _getitem(self, key): - if self.array.id in self._cache.keys(): - # safely avoid re-downloading some coordinates - result = self._cache[self.array.id] - elif self._batch and hasattr(self.array, "dataset"): + if self._batch and hasattr(self.array, 
"dataset"): # is self.array not loaded? # this are both True only for pydap>3.5.5 from pydap.lib import resolve_batch_for_all_variables @@ -69,10 +65,10 @@ def _getitem(self, key): ) else: result = robust_getitem(self.array, key, catch=ValueError) - try: - result = np.asarray(result.data) - except AttributeError: - result = np.asarray(result) + # try: + result = np.asarray(result.data) + # except AttributeError: + # result = np.asarray(result) axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) if result.ndim + len(axis) != self.array.ndim and axis: result = np.squeeze(result, axis) @@ -117,8 +113,6 @@ def __init__( self.dataset = dataset self.group = group self._batch = batch - self._batch_done = False - self._array_cache = {} # holds 1D dimension data self._protocol = protocol self._checksums = checksums # true by default @@ -201,7 +195,7 @@ def open_store_variable(self, var): else: # all non-dimension variables data = indexing.LazilyIndexedArray( - PydapArrayWrapper(var, self._batch, self._array_cache, self._checksums) + PydapArrayWrapper(var, self._batch, self._checksums) ) return Variable(dimensions, data, var.attributes) @@ -248,19 +242,14 @@ def ds(self): return get_group(self.dataset, self.group) def _get_data_array(self, var): - """gets dimension data all at once, storing the numpy - arrays within a cached dictionary - """ + """gets dimension data all at once""" from pydap.lib import get_batch_data - if not self._batch_done or var.id not in self._array_cache: - # store all dim data into a dict for reuse - self._array_cache = get_batch_data( - var.parent, self._array_cache, self._checksums - ) - self._batch_done = True - - return self._array_cache[var.id] + if not var._is_data_loaded(): + # this implies dat has not been deserialized yet + # runs only once per store/hierarchy + get_batch_data(var.parent, checksums=self._checksums) + return self.dataset[var.id].data class PydapBackendEntrypoint(BackendEntrypoint): From 
652e5d68191e77d7156b6108cf8437eda54b1cc9 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Tue, 9 Sep 2025 13:18:18 -0700 Subject: [PATCH 25/38] debug --- xarray/backends/pydap_.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index b381efd0275..cd7979bc015 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -54,24 +54,34 @@ def __getitem__(self, key): ) def _getitem(self, key): - if self._batch and hasattr(self.array, "dataset"): # is self.array not loaded? + if self._batch and hasattr(self.array, "dataset"): # this are both True only for pydap>3.5.5 - from pydap.lib import resolve_batch_for_all_variables + # from pydap.lib import resolve_batch_for_all_variables + from pydap.lib import get_batch_data dataset = self.array.dataset - resolve_batch_for_all_variables(self.array, key, checksums=self._checksums) - result = np.asarray( - dataset._current_batch_promise.wait_for_result(self.array.id) - ) + print("[batching]", self.array.id) + if not dataset[self.array.id]._is_data_loaded(): + print("data not loaded", self.array.id) + # data has not been deserialized yet + # runs only once per store/hierarchy + get_batch_data(self.array, checksums=self._checksums, key=key) + result = np.asarray(dataset[self.array.id].data) + result = robust_getitem(result, key, catch=ValueError) else: + print("[non-batching]", self.array.id) result = robust_getitem(self.array, key, catch=ValueError) - # try: result = np.asarray(result.data) - # except AttributeError: - # result = np.asarray(result) axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) + print(key) + print("axis:", axis) + # print("ndim", result.ndim) + # print("array.ndim", self.array.ndim) if result.ndim + len(axis) != self.array.ndim and axis: + # print('here????') + # print("squeezed result", np.shape(result)) result = np.squeeze(result, axis) + # print("squeezed 
result", np.shape(result)) return result @@ -246,9 +256,9 @@ def _get_data_array(self, var): from pydap.lib import get_batch_data if not var._is_data_loaded(): - # this implies dat has not been deserialized yet + # data has not been deserialized yet # runs only once per store/hierarchy - get_batch_data(var.parent, checksums=self._checksums) + get_batch_data(var, checksums=self._checksums) return self.dataset[var.id].data From 84064949db3ea4cab9fd2cdf2b83a352db141d79 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 10:20:41 -0700 Subject: [PATCH 26/38] revert what`s new from previous commit --- doc/whats-new.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4046c802f72..b8ffab2889f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -239,9 +239,6 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). - ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. - By `Miguel Jimenez-Urias `_. 
Breaking changes ~~~~~~~~~~~~~~~~ From a5c6ba24fd9ee2698a33d097fc2a8ec38f8b95da Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 10:21:57 -0700 Subject: [PATCH 27/38] enable data checker for batched deserialized data --- xarray/backends/pydap_.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index cd7979bc015..acd2af44428 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -57,31 +57,17 @@ def _getitem(self, key): if self._batch and hasattr(self.array, "dataset"): # this are both True only for pydap>3.5.5 # from pydap.lib import resolve_batch_for_all_variables - from pydap.lib import get_batch_data + from pydap.lib import data_check, get_batch_data dataset = self.array.dataset - print("[batching]", self.array.id) - if not dataset[self.array.id]._is_data_loaded(): - print("data not loaded", self.array.id) - # data has not been deserialized yet - # runs only once per store/hierarchy - get_batch_data(self.array, checksums=self._checksums, key=key) - result = np.asarray(dataset[self.array.id].data) - result = robust_getitem(result, key, catch=ValueError) + get_batch_data(self.array, checksums=self._checksums, key=key) + result = data_check(np.asarray(dataset[self.array.id].data), key) else: - print("[non-batching]", self.array.id) result = robust_getitem(self.array, key, catch=ValueError) result = np.asarray(result.data) - axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) - print(key) - print("axis:", axis) - # print("ndim", result.ndim) - # print("array.ndim", self.array.ndim) - if result.ndim + len(axis) != self.array.ndim and axis: - # print('here????') - # print("squeezed result", np.shape(result)) - result = np.squeeze(result, axis) - # print("squeezed result", np.shape(result)) + axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) + if result.ndim + len(axis) != 
self.array.ndim and axis: + result = np.squeeze(result, axis) return result From a111b0c18725b30cbc2a16ba06899f387598dc1d Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 10:56:19 -0700 Subject: [PATCH 28/38] temporarily install from source for testing - will revert to conda install after new release if no further change to backend --- ci/requirements/environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index eff54fe469e..9dd53b0ec48 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - - pydap + # - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,3 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect + - git+https://github.com/pydap/pydap.git From fd84f638fadabb1d3cde5f2e729b8ebfb412fb94 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 13:04:41 -0700 Subject: [PATCH 29/38] update `whats new` --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b8ffab2889f..afa99a8df8f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -229,6 +229,9 @@ Alfonso Ladino, Brigitta Sipőcz, Claude, Deepak Cherian, Dimitri Papadopoulos O New Features ~~~~~~~~~~~~ +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. + By `Miguel Jimenez-Urias `_. 
- Added :py:meth:`DataTree.prune` method to remove empty nodes while preserving tree structure. Useful for cleaning up DataTree after time-based filtering operations (:issue:`10590`, :pull:`10598`). By `Alfonso Ladino `_. From 5da338c529fdbec215287c55e2e189724f7ce32d Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 13:32:06 -0700 Subject: [PATCH 30/38] update tests --- xarray/tests/test_backends.py | 19 ------------------- xarray/tests/test_backends_datatree.py | 4 ++-- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f4dd0c62ba3..042dd29dc8d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6516,25 +6516,6 @@ def test_batchdap4_downloads(protocol, batch) -> None: assert len(session.cache.urls()) == 5 -@requires_pydap -@network -def test_batch_warnswithdap2() -> None: - from pydap.net import create_session - - # Create a session with pre-set retry params in pydap backend, to cache urls - session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) - session.cache.clear() - - url = "dap2://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" - with pytest.warns(UserWarning): - open_dataset( - url, engine="pydap", session=session, batch=True, decode_times=False - ) - - # no batching is supported here - assert len(session.cache.urls()) == 5 - - class TestEncodingInvalid: def test_extract_nc4_variable_encoding(self) -> None: var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"}) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index cda353b43a4..3edb959d65e 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -661,8 +661,8 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: ) if _version_ > Version("3.5.5"): - # Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups - assert 
len(session.cache.urls()) == 2 + # Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups per group + assert len(session.cache.urls()) == 3 else: # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) assert len(session.cache.urls()) == 5 From 0c55e528a4c891e6ec0cadab9c884d80aeeafb0f Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 14:14:10 -0700 Subject: [PATCH 31/38] set `batch=None` as default --- xarray/backends/pydap_.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index acd2af44428..50add6dc073 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -95,7 +95,7 @@ def __init__( dataset, group=None, session=None, - batch=False, + batch=None, protocol=None, checksums=True, ): @@ -123,7 +123,7 @@ def open( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ): from pydap.client import open_url @@ -290,7 +290,7 @@ def open_dataset( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ) -> Dataset: store = PydapDataStore.open( @@ -336,7 +336,7 @@ def open_datatree( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ) -> DataTree: groups_dict = self.open_groups_as_dict( @@ -377,7 +377,7 @@ def open_groups_as_dict( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath From 36ea4566914000f690ef2d7cc9f83d6ddbac897e Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 19 Sep 2025 09:06:42 -0700 Subject: [PATCH 32/38] improve handling of dims vs dimensions deprecation warning --- xarray/backends/pydap_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 50add6dc073..5824bdc6e8b 100644 --- 
a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -35,7 +35,7 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array, batch=False, checksums=True): + def __init__(self, array, batch=None, checksums=True): self.array = array self._batch = batch self._checksums = checksums @@ -167,11 +167,11 @@ def open( return cls(**args) def open_store_variable(self, var): - try: + if hasattr(var, "dims"): dimensions = [ dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims ] - except AttributeError: + else: # GridType does not have a dims attribute - instead get `dimensions` # see https://github.com/pydap/pydap/issues/485 dimensions = var.dimensions From 863ea6d4a3602958b6c9df78672168ac933a07f0 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 26 Sep 2025 08:25:12 -0700 Subject: [PATCH 33/38] update to use latest version of pydap --- ci/requirements/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 9dd53b0ec48..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - # - pydap + - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,4 +65,3 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - git+https://github.com/pydap/pydap.git From fc72d327124c81492bdacbb1e4038d8f59d0cd91 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 26 Sep 2025 08:25:23 -0700 Subject: [PATCH 34/38] update import --- xarray/backends/pydap_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 5824bdc6e8b..89eafcb0092 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -57,7 +57,7 @@ def 
_getitem(self, key): if self._batch and hasattr(self.array, "dataset"): # this are both True only for pydap>3.5.5 - # from pydap.lib import resolve_batch_for_all_variables - from pydap.lib import data_check, get_batch_data + from pydap.client import data_check, get_batch_data dataset = self.array.dataset get_batch_data(self.array, checksums=self._checksums, key=key) @@ -239,7 +239,7 @@ def ds(self): def _get_data_array(self, var): """gets dimension data all at once""" - from pydap.lib import get_batch_data + from pydap.client import get_batch_data if not var._is_data_loaded(): # data has not been deserialized yet From 049ff2ef2f5d4db0e768883b23831aa2e483aa1e Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 26 Sep 2025 08:33:40 -0700 Subject: [PATCH 35/38] update `whats new` docs --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index afa99a8df8f..4db8954e898 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -49,6 +49,9 @@ Claude, Deepak Cherian, Dimitri Papadopoulos Orfanos, Dylan H. Morris, Emmanuel New Features ~~~~~~~~~~~~ +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. + By `Miguel Jimenez-Urias `_. - :py:func:`DataTree.from_dict` now supports passing in ``DataArray`` and nested dictionary values, and has a ``coords`` argument for specifying coordinates as ``DataArray`` objects (:pull:`10658`). 
@@ -229,9 +232,6 @@ Alfonso Ladino, Brigitta Sipőcz, Claude, Deepak Cherian, Dimitri Papadopoulos O New Features ~~~~~~~~~~~~ -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). - ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. - By `Miguel Jimenez-Urias `_. - Added :py:meth:`DataTree.prune` method to remove empty nodes while preserving tree structure. Useful for cleaning up DataTree after time-based filtering operations (:issue:`10590`, :pull:`10598`). By `Alfonso Ladino `_. From 4f5a715a2360d28a76bb9e130f9967d8dd36a645 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 26 Sep 2025 10:14:57 -0700 Subject: [PATCH 36/38] move cache session to `tmpdir` --- xarray/tests/test_backends.py | 5 +++-- xarray/tests/test_backends_datatree.py | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 042dd29dc8d..42f9850b8e4 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6469,14 +6469,15 @@ def test_session(self) -> None: @network @pytest.mark.parametrize("protocol", ["dap2", "dap4"]) @pytest.mark.parametrize("batch", [False, True]) -def test_batchdap4_downloads(protocol, batch) -> None: +def test_batchdap4_downloads(tmpdir, protocol, batch) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap from pydap.net import create_session _version_ = Version(pydap.__version__) # Create a session with pre-set params in pydap backend, to cache urls - session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) + cache_name = tmpdir / "debug" + session = create_session(use_cache=True, cache_kwargs={"cache_name": 
cache_name}) session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 3edb959d65e..18f8efd6b51 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -614,7 +614,7 @@ def test_open_groups(self, url=unaligned_datatree_url) -> None: ) as expected: assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected) - def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: + def test_inherited_coords(self, tmpdir, url=simplegroup_datatree_url) -> None: """Test that `open_datatree` inherits coordinates from root tree. This particular h5 file is a test file that inherits the time coordinate from the root @@ -644,7 +644,10 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: from pydap.net import create_session # Create a session with pre-set retry params in pydap backend, to cache urls - session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) + cache_name = tmpdir / "debug" + session = create_session( + use_cache=True, cache_kwargs={"cache_name": cache_name} + ) session.cache.clear() _version_ = Version(pydap.__version__) From 47b7e735a5d851d739fd7ec242da364908c5704d Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Tue, 30 Sep 2025 13:34:02 -0700 Subject: [PATCH 37/38] remove added functionality from whats new from newly released version --- doc/whats-new.rst | 3 --- xarray/backends/pydap_.py | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4db8954e898..b8ffab2889f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -49,9 +49,6 @@ Claude, Deepak Cherian, Dimitri Papadopoulos Orfanos, Dylan H. 
Morris, Emmanuel New Features ~~~~~~~~~~~~ -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). - ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. - By `Miguel Jimenez-Urias `_. - :py:func:`DataTree.from_dict` now supports passing in ``DataArray`` and nested dictionary values, and has a ``coords`` argument for specifying coordinates as ``DataArray`` objects (:pull:`10658`). diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 89eafcb0092..b29b92abdeb 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -55,8 +55,7 @@ def __getitem__(self, key): def _getitem(self, key): if self._batch and hasattr(self.array, "dataset"): - # this are both True only for pydap>3.5.5 - # from pydap.lib import resolve_batch_for_all_variables + # True only for pydap>3.5.5 from pydap.client import data_check, get_batch_data dataset = self.array.dataset From 4b516b496bb084dffea0da1b3f0b0546434e3fda Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Tue, 30 Sep 2025 13:36:45 -0700 Subject: [PATCH 38/38] add to `whats-new` for next release --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b8ffab2889f..6f4a1c00101 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,6 +13,10 @@ v2025.10.0 (unreleased) New Features ~~~~~~~~~~~~ +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. 
In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. + By `Miguel Jimenez-Urias `_. + Breaking changes ~~~~~~~~~~~~~~~~