fix: test failure fixes for v0.15.1 (#1358)

LennartPurucker · web-flow · commit d3bb7755938d · 2024-10-14T18:56:51.000+02:00
* fix: make FakeObject be the correct standard and robustify usage

* fix: get none valued study object from server

* add: test for minio download failures

* fix: skip test for WSL as it is not supported

* maint: rework if/else case workflow

* maint: ruff fix

* add/fix: log messages for no premission

* fix: make flow name unique and enable testing of avoiding duplicates
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
@@ -208,6 +208,8 @@ def _download_minio_bucket(source: str, destination: str | Path) -> None:
     for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
         if file_object.object_name is None:
             raise ValueError(f"Object name is None for object {file_object!r}")
+        if file_object.etag is None:
+            raise ValueError(f"Object etag is None for object {file_object!r}")
 
         marker = destination / file_object.etag
         if marker.exists():
diff --git a/openml/config.py b/openml/config.py
@@ -345,7 +345,10 @@ def _setup(config: _Config | None = None) -> None:
         if not config_dir.exists():
             config_dir.mkdir(exist_ok=True, parents=True)
     except PermissionError:
-        pass
+        openml_logger.warning(
+            f"No permission to create OpenML directory at {config_dir}!"
+            " This can result in OpenML-Python not working properly."
+        )
 
     if config is None:
         config = _parse_config(config_file)
@@ -367,27 +370,16 @@ def _setup(config: _Config | None = None) -> None:
 
     try:
         cache_exists = _root_cache_directory.exists()
-    except PermissionError:
-        cache_exists = False
-
-    # create the cache subdirectory
-    try:
-        if not _root_cache_directory.exists():
+        # create the cache subdirectory
+        if not cache_exists:
             _root_cache_directory.mkdir(exist_ok=True, parents=True)
+        _create_log_handlers()
     except PermissionError:
         openml_logger.warning(
-            f"No permission to create openml cache directory at {_root_cache_directory}!"
-            " This can result in OpenML-Python not working properly.",
+            f"No permission to create OpenML directory at {_root_cache_directory}!"
+            " This can result in OpenML-Python not working properly."
         )
-
-    if cache_exists:
-        _create_log_handlers()
-    else:
         _create_log_handlers(create_file_handler=False)
-        openml_logger.warning(
-            f"No permission to create OpenML directory at {config_dir}! This can result in "
-            " OpenML-Python not working properly.",
-        )
 
 
 def set_field_in_config_file(field: str, value: Any) -> None:
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -206,9 +206,6 @@ def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
     avoid_duplicate_runs : bool, optional (default=True)
         If True, the run will throw an error if the setup/task combination is already present on
         the server. This feature requires an internet connection.
-    avoid_duplicate_runs : bool, optional (default=True)
-        If True, the run will throw an error if the setup/task combination is already present on
-        the server. This feature requires an internet connection.
     flow_tags : List[str], optional (default=None)
         A list of tags that the flow should have at creation.
     seed: int, optional (default=None)
diff --git a/openml/study/functions.py b/openml/study/functions.py
@@ -78,7 +78,7 @@ def get_study(
     return study
 
 
-def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
+def _get_study(id_: int | str, entity_type: str) -> BaseStudy:  # noqa: C901
     xml_string = openml._api_calls._perform_api_call(f"study/{id_}", "get")
     force_list_tags = (
         "oml:data_id",
@@ -93,6 +93,12 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
     alias = result_dict.get("oml:alias", None)
     main_entity_type = result_dict["oml:main_entity_type"]
 
+    # Parses edge cases where the server returns a string with a newline character for empty values.
+    none_value_indicator = "\n      "
+    for key in result_dict:
+        if result_dict[key] == none_value_indicator:
+            result_dict[key] = None
+
     if entity_type != main_entity_type:
         raise ValueError(
             f"Unexpected entity type '{main_entity_type}' reported by the server"
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
@@ -36,8 +36,12 @@ def test_retry_on_database_error(self, Session_class_mock, _):
 
         assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
 
+
 class FakeObject(NamedTuple):
     object_name: str
+    etag: str
+    """We use the etag of a Minio object as the name of a marker if we already downloaded it."""
+
 
 class FakeMinio:
     def __init__(self, objects: Iterable[FakeObject] | None = None):
@@ -60,7 +64,7 @@ def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None:
     some_url = f"https://not.real.com/bucket/{some_object_path}"
     mock_minio.return_value = FakeMinio(
         objects=[
-            FakeObject(some_object_path),
+            FakeObject(object_name=some_object_path, etag=str(hash(some_object_path))),
         ],
     )
 
@@ -71,3 +75,27 @@ def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None:
     time_modified = (tmp_path / some_filename).stat().st_mtime
 
     assert time_created == time_modified
+
+
+@mock.patch.object(minio, "Minio")
+def test_download_minio_failure(mock_minio, tmp_path: Path) -> None:
+    some_prefix, some_filename = "some/prefix", "dataset.arff"
+    some_object_path = f"{some_prefix}/{some_filename}"
+    some_url = f"https://not.real.com/bucket/{some_object_path}"
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name=None, etag="tmp"),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
+
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name="tmp", etag=None),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py
@@ -8,6 +8,7 @@
 from copy import copy
 from typing import Any, Iterator
 from pathlib import Path
+import platform
 
 import pytest
 
@@ -37,14 +38,18 @@ class TestConfig(openml.testing.TestBase):
     @unittest.mock.patch("openml.config.openml_logger.warning")
     @unittest.mock.patch("openml.config._create_log_handlers")
     @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033")
+    @unittest.skipIf(
+        platform.uname().release.endswith(("-Microsoft", "microsoft-standard-WSL2")),
+        "WSL does nto support chmod as we would need here, see https://github.com/microsoft/WSL/issues/81",
+    )
     def test_non_writable_home(self, log_handler_mock, warnings_mock):
         with tempfile.TemporaryDirectory(dir=self.workdir) as td:
             os.chmod(td, 0o444)
             _dd = copy(openml.config._defaults)
             _dd["cachedir"] = Path(td) / "something-else"
             openml.config._setup(_dd)
 
-        assert warnings_mock.call_count == 2
+        assert warnings_mock.call_count == 1
         assert log_handler_mock.call_count == 1
         assert not log_handler_mock.call_args_list[0][1]["create_file_handler"]
         assert openml.config._root_cache_directory == Path(td) / "something-else"
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -119,7 +119,6 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
         # time.time() works in seconds
         start_time = time.time()
         while time.time() - start_time < max_waiting_time_seconds:
-
             try:
                 openml.runs.get_run_trace(run_id)
             except openml.exceptions.OpenMLServerException:
@@ -131,7 +130,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
                 time.sleep(10)
                 continue
 
-            assert len(run.evaluations) > 0, "Expect not-None evaluations to always contain elements."
+            assert (
+                len(run.evaluations) > 0
+            ), "Expect not-None evaluations to always contain elements."
             return
 
         raise RuntimeError(
@@ -557,7 +558,7 @@ def determine_grid_size(param_grid):
             fold_evaluations=run.fold_evaluations,
             num_repeats=1,
             num_folds=num_folds,
-            task_type=task_type
+            task_type=task_type,
         )
 
         # Check if run string and print representation do not run into an error
@@ -796,7 +797,9 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
 
     @pytest.mark.sklearn()
     def test_run_and_upload_gridsearch(self):
-        estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        estimator_name = (
+            "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        )
         gridsearch = GridSearchCV(
             BaggingClassifier(**{estimator_name: SVC()}),
             {f"{estimator_name}__C": [0.01, 0.1, 10], f"{estimator_name}__gamma": [0.01, 0.1, 10]},
@@ -1826,7 +1829,9 @@ def test_joblib_backends(self, parallel_mock):
         num_instances = x.shape[0]
         line_length = 6 + len(task.class_labels)
 
-        backend_choice = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+        backend_choice = (
+            "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+        )
         for n_jobs, backend, call_count in [
             (1, backend_choice, 10),
             (2, backend_choice, 10),
@@ -1877,14 +1882,23 @@ def test_joblib_backends(self, parallel_mock):
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
     )
     def test_delete_run(self):
-        rs = 1
+        rs = np.random.randint(1, 2**32 - 1)
         clf = sklearn.pipeline.Pipeline(
-            steps=[("imputer", SimpleImputer()), ("estimator", DecisionTreeClassifier())],
+            steps=[
+                (f"test_server_imputer_{rs}", SimpleImputer()),
+                ("estimator", DecisionTreeClassifier()),
+            ],
         )
         task = openml.tasks.get_task(32)  # diabetes; crossvalidation
 
-        run = openml.runs.run_model_on_task(model=clf, task=task, seed=rs)
+        run = openml.runs.run_model_on_task(
+            model=clf, task=task, seed=rs, avoid_duplicate_runs=False
+        )
         run.publish()
+
+        with pytest.raises(openml.exceptions.OpenMLRunsExistError):
+            openml.runs.run_model_on_task(model=clf, task=task, seed=rs, avoid_duplicate_runs=True)
+
         TestBase._mark_entity_for_removal("run", run.run_id)
         TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")