Changes from all commits (38 commits)
0e345a6
update PydapArrayWrapper to support backend batching
Mikejmnez Aug 12, 2025
4dbcd62
update PydapDataStore to use backend logic in dap4 to batch variables…
Mikejmnez Aug 12, 2025
007dac2
pydap-server is not necessary
Mikejmnez Aug 12, 2025
76faff6
set `batch=False` as default
Mikejmnez Aug 12, 2025
16a9341
set `batch=False` as default in datatree
Mikejmnez Aug 12, 2025
6f8afb0
set `batch=False` as default in open groups as dict
Mikejmnez Aug 12, 2025
70f500f
for flaky, install pydap from repo for now
Mikejmnez Aug 12, 2025
1ac0ab4
initial tests - quantify cached url
Mikejmnez Aug 13, 2025
3a79592
adds tests to datatree backend to assert multiple dimensions download…
Mikejmnez Aug 13, 2025
a8fe8fe
update testing to show number of download urls
Mikejmnez Aug 13, 2025
1f65ef6
simplified logic
Mikejmnez Aug 13, 2025
0d22358
specify cached session debug name to actually cache urls
Mikejmnez Aug 13, 2025
3205515
fix for mypy
Mikejmnez Aug 13, 2025
cb33c28
user visible changes on `whats-new.rst`
Mikejmnez Aug 13, 2025
f85a0b9
impose sorted to `get_dimensions` method
Mikejmnez Aug 13, 2025
263592d
reformat `whats-new.rst`
Mikejmnez Aug 13, 2025
9e5c785
revert to install pydap from conda and not from repo
Mikejmnez Aug 13, 2025
73aa5a1
expose checksum as user kwarg
Mikejmnez Aug 13, 2025
f59b57d
include `checksums` optional argument in `whats-new`
Mikejmnez Aug 13, 2025
6c354ca
update to newest release of pydap via pip until conda install is avai…
Mikejmnez Aug 13, 2025
eb3fca5
use requests_cache session with retry-params when 500 errors occur
Mikejmnez Aug 13, 2025
16394aa
update env yml file to use new pydap release via conda
Mikejmnez Aug 14, 2025
1b76e98
let `pydap` handle exceptions/warning
Mikejmnez Aug 25, 2025
2ca8a4d
process dims at once, one per group
Mikejmnez Sep 5, 2025
652e5d6
debug
Mikejmnez Sep 9, 2025
8406494
revert what's new from previous commit
Mikejmnez Sep 18, 2025
a5c6ba2
enable data checker for batched deserialized data
Mikejmnez Sep 18, 2025
a111b0c
temporarily install from source for testing - will revert to conda in…
Mikejmnez Sep 18, 2025
fd84f63
update `whats new`
Mikejmnez Sep 18, 2025
5da338c
update tests
Mikejmnez Sep 18, 2025
0c55e52
set `batch=None` as default
Mikejmnez Sep 18, 2025
36ea456
improve handling of dims vs dimensions deprecation warning
Mikejmnez Sep 19, 2025
863ea6d
update to use latest version of pydap
Mikejmnez Sep 26, 2025
fc72d32
update import
Mikejmnez Sep 26, 2025
049ff2e
update `whats-new` docs
Mikejmnez Sep 26, 2025
4f5a715
move cache session to `tmpdir`
Mikejmnez Sep 26, 2025
47b7e73
remove added functionality from whats new from newly released version
Mikejmnez Sep 30, 2025
4b516b4
add to `whats-new` for next release
Mikejmnez Sep 30, 2025
1 change: 0 additions & 1 deletion ci/requirements/environment.yml
@@ -37,7 +37,6 @@ dependencies:
- pre-commit
- pyarrow # pandas raises a deprecation warning without this, breaking doctests
- pydap
- pydap-server
- pytest
- pytest-asyncio
- pytest-cov
4 changes: 4 additions & 0 deletions doc/whats-new.rst
@@ -13,6 +13,10 @@ v2025.10.0 (unreleased)
New Features
~~~~~~~~~~~~

- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset` or :py:func:`open_datatree` to download DAP4 (OPeNDAP) data (:issue:`10628`, :pull:`10629`).
  ``batch=True|False`` is a new ``backend_kwarg`` that enables downloading multiple arrays in a single response. In addition, ``checksums`` is added as an optional argument passed to the ``pydap`` backend.
By `Miguel Jimenez-Urias <https://github.com/Mikejmnez>`_.
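A minimal usage sketch of the new kwargs (the ``dap4://`` URL is the OPeNDAP test dataset exercised in the tests below; ``batch`` and ``checksums`` are the arguments this PR adds):

```python
import xarray as xr

# batch=True asks the pydap backend (pydap > 3.5.5) to fetch multiple
# arrays per DAP4 response; checksums=True (the default) is forwarded
# to pydap to verify each deserialized chunk.
ds = xr.open_dataset(
    "dap4://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc",
    engine="pydap",
    batch=True,
    checksums=True,
    decode_times=False,
)
ds.load()  # non-dimension variables arrive in a single request
```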


Breaking changes
~~~~~~~~~~~~~~~~
110 changes: 90 additions & 20 deletions xarray/backends/pydap_.py
@@ -35,8 +35,10 @@


class PydapArrayWrapper(BackendArray):
def __init__(self, array):
def __init__(self, array, batch=None, checksums=True):
self.array = array
self._batch = batch
self._checksums = checksums

@property
def shape(self) -> tuple[int, ...]:
@@ -52,13 +54,19 @@ def __getitem__(self, key):
)

def _getitem(self, key):
result = robust_getitem(self.array, key, catch=ValueError)
# in some cases, pydap doesn't squeeze axes automatically like numpy
result = np.asarray(result)
axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
if result.ndim + len(axis) != self.array.ndim and axis:
result = np.squeeze(result, axis)
if self._batch and hasattr(self.array, "dataset"):
# True only for pydap>3.5.5
from pydap.client import data_check, get_batch_data

dataset = self.array.dataset
get_batch_data(self.array, checksums=self._checksums, key=key)
result = data_check(np.asarray(dataset[self.array.id].data), key)
else:
result = robust_getitem(self.array, key, catch=ValueError)
result = np.asarray(result.data)
axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
if result.ndim + len(axis) != self.array.ndim and axis:
result = np.squeeze(result, axis)
return result


@@ -81,7 +89,15 @@ class PydapDataStore(AbstractDataStore):
be useful if the netCDF4 library is not available.
"""

def __init__(self, dataset, group=None):
def __init__(
self,
dataset,
group=None,
session=None,
batch=None,
protocol=None,
checksums=True,
):
"""
Parameters
----------
@@ -91,6 +107,9 @@ def __init__(self, dataset, group=None):
"""
self.dataset = dataset
self.group = group
self._batch = batch
self._protocol = protocol
self._checksums = checksums # true by default

@classmethod
def open(
@@ -103,6 +122,8 @@ def open(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
):
from pydap.client import open_url
from pydap.net import DEFAULT_TIMEOUT
@@ -117,6 +138,7 @@
DeprecationWarning,
)
output_grid = False # new default behavior

kwargs = {
"url": url,
"application": application,
@@ -132,22 +154,45 @@ def open(
elif hasattr(url, "ds"):
# pydap dataset
dataset = url.ds
args = {"dataset": dataset}
args = {"dataset": dataset, "checksums": checksums}
if group:
# only then, change the default
args["group"] = group
if url.startswith(("http", "dap2")):
args["protocol"] = "dap2"
elif url.startswith("dap4"):
args["protocol"] = "dap4"
if batch:
args["batch"] = batch
return cls(**args)

def open_store_variable(self, var):
data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
try:
if hasattr(var, "dims"):
dimensions = [
dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims
]
except AttributeError:
else:
# GridType does not have a dims attribute - instead get `dimensions`
# see https://github.com/pydap/pydap/issues/485
dimensions = var.dimensions
if (
self._protocol == "dap4"
and var.name in dimensions
and hasattr(var, "dataset") # only True for pydap>3.5.5
):
if not var.dataset._batch_mode:
# for dap4, always batch all dimensions at once
var.dataset.enable_batch_mode()
data_array = self._get_data_array(var)
data = indexing.LazilyIndexedArray(data_array)
if not self._batch and var.dataset._batch_mode:
# if `batch=False`, restore it for all other variables
var.dataset.disable_batch_mode()
else:
# all non-dimension variables
data = indexing.LazilyIndexedArray(
PydapArrayWrapper(var, self._batch, self._checksums)
)

return Variable(dimensions, data, var.attributes)

def get_variables(self):
@@ -165,6 +210,7 @@ def get_variables(self):
# check the key is not a BaseType or GridType
if not isinstance(self.ds[var], GroupType)
]

return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars)

def get_attrs(self):
@@ -176,18 +222,30 @@ def get_attrs(self):
"libdap",
"invocation",
"dimensions",
"path",
"Maps",
)
attrs = self.ds.attributes
list(map(attrs.pop, opendap_attrs, [None] * 6))
attrs = dict(self.ds.attributes)
list(map(attrs.pop, opendap_attrs, [None] * 8))
return Frozen(attrs)

def get_dimensions(self):
return Frozen(self.ds.dimensions)
return Frozen(sorted(self.ds.dimensions))
Contributor Author (Mikejmnez): To potentially address the issues with dimensions in DataTree, and the lat/lon dimensions being inconsistently ordered, I added this `sorted` to the dimensions list that the backend gets from the pydap dataset directly. Hopefully this little fix will make the issue go away, but I will continue checking it locally and after merging main into this PR (it has not failed once yet! knocks on wood).

Member: This is only dataset-level dimensions, not variable-level dimensions. At the dataset level, dimension order doesn't really matter, so I doubt this is going to fix the issue, unfortunately.

@property
def ds(self):
return get_group(self.dataset, self.group)

def _get_data_array(self, var):
"""gets dimension data all at once"""
from pydap.client import get_batch_data

if not var._is_data_loaded():
# data has not been deserialized yet
# runs only once per store/hierarchy
get_batch_data(var, checksums=self._checksums)
return self.dataset[var.id].data


class PydapBackendEntrypoint(BackendEntrypoint):
"""
@@ -231,6 +289,8 @@ def open_dataset(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
) -> Dataset:
store = PydapDataStore.open(
url=filename_or_obj,
@@ -241,6 +301,8 @@
timeout=timeout,
verify=verify,
user_charset=user_charset,
batch=batch,
checksums=checksums,
)
store_entrypoint = StoreBackendEntrypoint()
with close_on_error(store):
@@ -273,6 +335,8 @@ def open_datatree(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
) -> DataTree:
groups_dict = self.open_groups_as_dict(
filename_or_obj,
@@ -285,10 +349,12 @@ def open_datatree(
decode_timedelta=decode_timedelta,
group=group,
application=None,
session=None,
timeout=None,
verify=None,
user_charset=None,
session=session,
timeout=timeout,
verify=verify,
user_charset=user_charset,
batch=batch,
checksums=checksums,
)

return datatree_from_dict_with_io_cleanup(groups_dict)
@@ -310,6 +376,8 @@ def open_groups_as_dict(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
) -> dict[str, Dataset]:
from xarray.core.treenode import NodePath

@@ -321,6 +389,8 @@
timeout=timeout,
verify=verify,
user_charset=user_charset,
batch=batch,
checksums=checksums,
)

# Check for a group and make it a parent if it exists
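For orientation, a condensed sketch of the batched flow the store implements above. It assumes the pydap > 3.5.5 helpers the diff relies on (`get_batch_data`, `enable_batch_mode`, `_is_data_loaded`), and the variable name `TIME` is a hypothetical pick from the COADS test file:

```python
import numpy as np
from pydap.client import get_batch_data, open_url

# Sketch only, not the backend itself: mirror what open_store_variable
# and _get_data_array do for dimension arrays over dap4.
pyds = open_url("dap4://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc")
pyds.enable_batch_mode()  # one request can now serve many arrays

var = pyds["TIME"]  # hypothetical dimension variable in the test file
if not var._is_data_loaded():
    # a single DAP response deserializes all registered dimensions at once
    get_batch_data(var, checksums=True)
data = np.asarray(pyds[var.id].data)
```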
52 changes: 52 additions & 0 deletions xarray/tests/test_backends.py
@@ -6465,6 +6465,58 @@ def test_session(self) -> None:
)


@requires_pydap
@network
@pytest.mark.parametrize("protocol", ["dap2", "dap4"])
@pytest.mark.parametrize("batch", [False, True])
def test_batchdap4_downloads(tmpdir, protocol, batch) -> None:
"""Test that in dap4, all dimensions are downloaded at once"""
import pydap
from pydap.net import create_session

_version_ = Version(pydap.__version__)
# Create a session with pre-set params in the pydap backend, to cache urls
cache_name = tmpdir / "debug"
session = create_session(use_cache=True, cache_kwargs={"cache_name": cache_name})
session.cache.clear()
url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc"

if protocol == "dap4":
ds = open_dataset(
url.replace("https", protocol),
engine="pydap",
session=session,
decode_times=False,
batch=batch,
)
if _version_ > Version("3.5.5"):
# total downloads are:
# 1 dmr + 1 dap (dimensions)
assert len(session.cache.urls()) == 2
# now load the rest of the variables
ds.load()
if batch:
# all non-dimension arrays are downloaded in a single https request
assert len(session.cache.urls()) == 2 + 1
if not batch:
# each non-dimension array is downloaded with an individual
# https request
assert len(session.cache.urls()) == 2 + 4
else:
assert len(session.cache.urls()) == 4
ds.load()
assert len(session.cache.urls()) == 4 + 4
elif protocol == "dap2":
ds = open_dataset(
url.replace("https", protocol),
engine="pydap",
session=session,
decode_times=False,
)
# das + dds + 3 dods urls
assert len(session.cache.urls()) == 5

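The request counts asserted above can be reproduced interactively. A small sketch, assuming pydap > 3.5.5 and the `requests_cache`-backed session returned by `pydap.net.create_session` (its `cache.urls()` records one entry per downloaded URL; the `/tmp/debug` cache path is illustrative):

```python
import xarray as xr
from pydap.net import create_session

# Cache responses so every URL the backend hits is recorded once.
session = create_session(use_cache=True, cache_kwargs={"cache_name": "/tmp/debug"})
session.cache.clear()

ds = xr.open_dataset(
    "dap4://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc",
    engine="pydap",
    session=session,
    decode_times=False,
    batch=True,
)
print(len(session.cache.urls()))  # 2: one dmr + one dap for all dimensions
ds.load()
print(len(session.cache.urls()))  # 3: one more dap for all remaining arrays
```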

class TestEncodingInvalid:
def test_extract_nc4_variable_encoding(self) -> None:
var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"})
Expand Down
24 changes: 22 additions & 2 deletions xarray/tests/test_backends_datatree.py
@@ -9,6 +9,7 @@

import numpy as np
import pytest
from packaging.version import Version

import xarray as xr
from xarray import DataTree, load_datatree, open_datatree, open_groups
@@ -325,7 +326,7 @@

filepath = tmpdir + "/phony_dims.nc"
original_dt = simple_datatree
original_dt.to_netcdf(filepath) # should not raise

Check failure on line 329 in xarray/tests/test_backends_datatree.py (GitHub Actions / ubuntu-latest py3.13 all-but-numba): TestH5NetCDFDatatreeIO.test_default_write_engine ModuleNotFoundError: No module named 'h5netcdf.legacyapi'

@requires_dask
def test_open_datatree_chunks(self, tmpdir) -> None:
@@ -613,7 +614,7 @@
) as expected:
assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected)

def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
def test_inherited_coords(self, tmpdir, url=simplegroup_datatree_url) -> None:
"""Test that `open_datatree` inherits coordinates from root tree.

This particular h5 file is a test file that inherits the time coordinate from the root
@@ -639,7 +640,19 @@
│ Temperature (time, Z, Y, X) float32 ...
| Salinity (time, Z, Y, X) float32 ...
"""
tree = open_datatree(url, engine=self.engine)
import pydap
from pydap.net import create_session

# Create a session with pre-set retry params in the pydap backend, to cache urls
cache_name = tmpdir / "debug"
session = create_session(
use_cache=True, cache_kwargs={"cache_name": cache_name}
)
session.cache.clear()

_version_ = Version(pydap.__version__)

tree = open_datatree(url, engine=self.engine, session=session)
assert set(tree.dims) == {"time", "Z", "nv"}
assert tree["/SimpleGroup"].coords["time"].dims == ("time",)
assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",)
@@ -650,6 +663,13 @@
list(expected.dims) + ["Z", "nv"]
)

if _version_ > Version("3.5.5"):
# Total downloads: 1 dmr + 1 dap url per group (all dimensions in a group are fetched at once)
assert len(session.cache.urls()) == 3
else:
# 1 dmr + 1 dap url per dimension (there are 4 dimension arrays in total)
assert len(session.cache.urls()) == 5

def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None:
aligned_dict_of_datasets = open_groups(url, engine=self.engine)
aligned_dt = DataTree.from_dict(aligned_dict_of_datasets)