
Commit b7c5aa6

Merge pull request #913 from alan-turing-institute/882-custom-metrics
Enable custom metrics in AutoEmulate compare loop (#882)
2 parents da6eaf3 + 2b52df3 commit b7c5aa6

8 files changed: +222 additions, -81 deletions


autoemulate/core/compare.py

Lines changed: 4 additions & 6 deletions
@@ -15,7 +15,7 @@
 from autoemulate.core.logging_config import get_configured_logger
 from autoemulate.core.metrics import (
     R2,
-    TorchMetrics,
+    Metric,
     get_metric,
     get_metrics,
 )
@@ -74,8 +74,8 @@ def __init__(
         device: DeviceLike | None = None,
         random_seed: int | None = None,
         log_level: str = "progress_bar",
-        tuning_metric: str | TorchMetrics = "r2",
-        evaluation_metrics: list[str | TorchMetrics] | None = None,
+        tuning_metric: str | Metric = "r2",
+        evaluation_metrics: list[str | Metric] | None = None,
     ):
         """
         Initialize the AutoEmulate class.
@@ -542,9 +542,7 @@ def compare(self):
         # Get the best result and log the comparison
         # Use the first evaluation metric to determine the best result
         first_metric = self.evaluation_metrics[0]
-        best_result = self.best_result(
-            metric_name=first_metric.name,
-        )
+        best_result = self.best_result(first_metric)
         self.log_compare(
             best_model_name=best_result.model_name,
             x_transforms=best_result.x_transforms,
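With this change, AutoEmulate accepts either metric names or Metric objects for tuning_metric and evaluation_metrics, and compare() passes the first evaluation metric straight to best_result. The sketch below is a rough, hypothetical illustration of how "str | Metric" inputs might be normalised; the real conversion is presumably handled by get_metric/get_metrics from autoemulate.core.metrics, and SimpleMetric, METRIC_REGISTRY, and resolve are stand-ins rather than the library's API.

# Hypothetical sketch: normalising "str | Metric" inputs to Metric objects.
# SimpleMetric, METRIC_REGISTRY, and resolve() are stand-ins, not autoemulate's API.
from dataclasses import dataclass


@dataclass(frozen=True)
class SimpleMetric:
    name: str
    maximize: bool


METRIC_REGISTRY = {
    "r2": SimpleMetric("r2", maximize=True),
    "rmse": SimpleMetric("rmse", maximize=False),
}


def resolve(metric: "str | SimpleMetric") -> SimpleMetric:
    """Accept either a registered name or a Metric-like object."""
    if isinstance(metric, SimpleMetric):
        return metric
    return METRIC_REGISTRY[metric]


# Mixed string/object input, as the updated signature now allows.
evaluation_metrics = [resolve(m) for m in ["r2", SimpleMetric("mae", maximize=False)]]
print(evaluation_metrics[0])  # SimpleMetric(name='r2', maximize=True)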

autoemulate/core/metrics.py

Lines changed: 21 additions & 2 deletions
@@ -4,7 +4,7 @@

 from abc import abstractmethod
 from collections.abc import Sequence
-from functools import partial
+from functools import partial, total_ordering

 import torchmetrics
 from einops import rearrange
@@ -18,6 +18,7 @@
 )


+@total_ordering
 class Metric:
     """Configuration for a single metric.

@@ -33,9 +34,27 @@ class Metric:
     maximize: bool

     def __repr__(self) -> str:
-        """Return the string representation of the Metric."""
+        """Representation of the Metric."""
         return f"Metric(name={self.name}, maximize={self.maximize})"

+    def __str__(self):
+        """Metric when formatted as a string."""
+        return self.name
+
+    def __eq__(self, other: object) -> bool:
+        """Check equality based on metric name."""
+        if not isinstance(other, Metric):
+            return NotImplemented
+        return self.name == other.name
+
+    def __hash__(self) -> int:
+        """Return hash based on metric name."""
+        return hash(self.name)
+
+    def __lt__(self, other: Metric) -> bool:
+        """Compare metrics based on their str name."""
+        return self.name < other.name
+
     @abstractmethod
     def __call__(
         self, y_pred: OutputLike, y_true: TensorLike, n_samples: int = 1000
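These added dunder methods are what let a Metric replace plain strings as a dictionary key and be sorted: equality and hashing are based on name, and @total_ordering derives the remaining comparisons from __lt__. Below is a minimal, self-contained stand-in that demonstrates the behaviour; DemoMetric is not the real Metric base class (which is abstract and takes prediction tensors in __call__).

# Self-contained sketch of the name-based ordering/equality added to Metric.
# DemoMetric is a stand-in; the real class lives in autoemulate.core.metrics.
from functools import total_ordering


@total_ordering
class DemoMetric:
    def __init__(self, name: str, maximize: bool):
        self.name = name
        self.maximize = maximize

    def __str__(self) -> str:
        return self.name

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DemoMetric):
            return NotImplemented
        return self.name == other.name

    def __hash__(self) -> int:
        return hash(self.name)

    def __lt__(self, other: "DemoMetric") -> bool:
        return self.name < other.name


r2, rmse = DemoMetric("r2", True), DemoMetric("rmse", False)

# Usable as dict keys, and deduplicated in sets by name.
scores = {r2: (0.93, 0.01), rmse: (0.12, 0.02)}
assert DemoMetric("r2", True) in scores
assert len({r2, DemoMetric("r2", True)}) == 1

# Sortable, so metric columns can be ordered deterministically.
print([str(m) for m in sorted(scores)])  # ['r2', 'rmse']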

autoemulate/core/model_selection.py

Lines changed: 2 additions & 2 deletions
@@ -159,7 +159,7 @@ def bootstrap(
     n_samples: int = 1000,
     device: str | torch.device = "cpu",
     metrics: list[Metric] | None = None,
-) -> dict[str, tuple[float, float]]:
+) -> dict[Metric, tuple[float, float]]:
     """
     Get bootstrap estimates of metrics.

@@ -228,7 +228,7 @@ def bootstrap(

     # Return mean and std for each metric
     return {
-        metric.name: (
+        metric: (
             metric_scores[metric.name].mean().item(),
             metric_scores[metric.name].std().item(),
         )
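With bootstrap now returning dict[Metric, tuple[float, float]], callers that used to look results up by name can look them up by an equal Metric (equality is name-based in the diff above) or print them directly, since __str__ yields the bare name. The sketch below shows consuming such a dictionary; Stat is a stand-in for a Metric and the scores are made up.

# Sketch of consuming the new dict[Metric, (mean, std)] return shape of bootstrap().
# Stat mimics a Metric: it hashes/compares by name only, and str() gives the name.
from dataclasses import dataclass, field


@dataclass(frozen=True)
class Stat:
    name: str
    maximize: bool = field(default=True, compare=False)

    def __str__(self) -> str:
        return self.name


bootstrap_scores = {Stat("r2"): (0.94, 0.012), Stat("rmse", False): (0.11, 0.02)}

# Keys are Metric-like objects, but name-based equality keeps string-style
# lookups one construction away:
mean, std = bootstrap_scores[Stat("r2")]
print(f"r2: {mean:.3f} ± {std:.3f}")

# And str(metric) reproduces the old name-based labels:
for metric, (m, s) in bootstrap_scores.items():
    print(f"{metric}: {m:.3f} ± {s:.3f}")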

autoemulate/core/results.py

Lines changed: 31 additions & 38 deletions
@@ -2,7 +2,7 @@

 import pandas as pd

-from autoemulate.core.metrics import AVAILABLE_METRICS
+from autoemulate.core.metrics import Metric, get_metric
 from autoemulate.core.types import ModelParams
 from autoemulate.emulators.transformed.base import TransformedEmulator

@@ -18,8 +18,8 @@ def __init__(
         model_name: str,
         model: TransformedEmulator,
         params: ModelParams,
-        test_metrics: dict[str, tuple[float, float]],
-        train_metrics: dict[str, tuple[float, float]],
+        test_metrics: dict[Metric, tuple[float, float]],
+        train_metrics: dict[Metric, tuple[float, float]],
     ):
         """Initialize a Result object.

@@ -141,32 +141,32 @@ def summarize(self) -> pd.DataFrame:
             "params": [result.params for result in self.results],
         }

-        # Collect all unique metric names from all results
+        # Collect all unique metrics from all results
         all_test_metrics = set()
         all_train_metrics = set()
         for result in self.results:
             all_test_metrics.update(result.test_metrics.keys())
             all_train_metrics.update(result.train_metrics.keys())

         # Add test metrics columns
-        for metric_name in sorted(all_test_metrics):
-            data[f"{metric_name}_test"] = [
-                result.test_metrics.get(metric_name, (float("nan"), float("nan")))[0]
+        for metric in sorted(all_test_metrics):
+            data[f"{metric}_test"] = [
+                result.test_metrics.get(metric, (float("nan"), float("nan")))[0]
                 for result in self.results
             ]
-            data[f"{metric_name}_test_std"] = [
-                result.test_metrics.get(metric_name, (float("nan"), float("nan")))[1]
+            data[f"{metric}_test_std"] = [
+                result.test_metrics.get(metric, (float("nan"), float("nan")))[1]
                 for result in self.results
             ]

         # Add train metrics columns
-        for metric_name in sorted(all_train_metrics):
-            data[f"{metric_name}_train"] = [
-                result.train_metrics.get(metric_name, (float("nan"), float("nan")))[0]
+        for metric in sorted(all_train_metrics):
+            data[f"{metric}_train"] = [
+                result.train_metrics.get(metric, (float("nan"), float("nan")))[0]
                 for result in self.results
             ]
-            data[f"{metric_name}_train_std"] = [
-                result.train_metrics.get(metric_name, (float("nan"), float("nan")))[1]
+            data[f"{metric}_train_std"] = [
+                result.train_metrics.get(metric, (float("nan"), float("nan")))[1]
                 for result in self.results
             ]

@@ -177,13 +177,13 @@ def summarize(self) -> pd.DataFrame:

     summarise = summarize

-    def best_result(self, metric_name: str | None = None) -> Result:
+    def best_result(self, metric: str | Metric | None = None) -> Result:
        """
        Get the model with the best result based on the given metric.

        Parameters
        ----------
-        metric_name: str | None
+        metric: str | Metric | None
            The name of the metric to use for comparison. If None, uses the first
            available metric found in the results. The metric should exist in the
            test_metrics of the results.
@@ -202,51 +202,44 @@ def best_result(self, metric_name: str | None = None) -> Result:
            raise ValueError(msg)

        # If metric_name is None, use the first available metric
-        if metric_name is None:
+        if metric is None:
            # Collect all available metrics
-            available_metrics = set()
-            for result in self.results:
-                available_metrics.update(result.test_metrics.keys())
+            available_metrics = [
+                m for result in self.results for m in result.test_metrics
+            ]

            if not available_metrics:
                msg = "No metrics available in results."
                raise ValueError(msg)

            # Use the first metric
-            metric_name = next(iter(available_metrics))
-            logger.info("Using metric '%s' to determine best result.", metric_name)
+            metric_selected = available_metrics[0]
+            logger.info("Using metric '%s' to determine best result.", metric_selected)
        else:
            # Check if the specified metric exists in at least one result
-            if not any(metric_name in result.test_metrics for result in self.results):
+            if not any(metric in result.test_metrics for result in self.results):
                available_metrics = set()
                for result in self.results:
                    available_metrics.update(result.test_metrics.keys())
                msg = (
-                    f"Metric '{metric_name}' not found in any results. "
+                    f"Metric '{metric}' not found in any results. "
                    f"Available metrics: {sorted(available_metrics)}"
                )
                raise ValueError(msg)
-
-            logger.info("Using metric '%s' to determine best result.", metric_name)
-
-            # Determine if we are maximizing or minimizing the metric
-            # from the metric name
-            assert metric_name is not None  # for pyright
-            metric_config = AVAILABLE_METRICS.get(metric_name)
-            if metric_config is None:
-                msg = f"Metric '{metric_name}' not found in AVAILABLE_METRICS."
-                raise ValueError(msg)
-            metric_maximize = metric_config.maximize
+            metric_selected = get_metric(metric)
+            logger.info("Using metric '%s' to determine best result.", metric_selected)

        # Select best result based on whether we're maximizing or minimizing
-        if metric_maximize:
+        if metric_selected.maximize:
            return max(
                self.results,
-                key=lambda r: r.test_metrics.get(metric_name, (float("-inf"), 0))[0],
+                key=lambda r: r.test_metrics.get(metric_selected, (float("-inf"), 0))[
+                    0
+                ],
            )
        return min(
            self.results,
-            key=lambda r: r.test_metrics.get(metric_name, (float("inf"), 0))[0],
+            key=lambda r: r.test_metrics.get(metric_selected, (float("inf"), 0))[0],
        )

    def get_result(self, result_id: int) -> Result:
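best_result now accepts str | Metric | None, resolves either form through get_metric, and reads the maximize flag off the resolved metric instead of consulting AVAILABLE_METRICS, so results.best_result("rmse") and results.best_result(some_rmse_metric) should presumably both work. Because Metric.__str__ returns the bare name, the f"{metric}_test" column names produced by summarize() come out unchanged. A small stand-in sketch of that column-naming step (FakeMetric is hypothetical, not the real class):

# Sketch: summarize() column names are unchanged because str(metric) == metric.name.
class FakeMetric:
    def __init__(self, name: str):
        self.name = name

    def __str__(self) -> str:
        return self.name

    def __lt__(self, other: "FakeMetric") -> bool:
        return self.name < other.name


all_test_metrics = {FakeMetric("rmse"), FakeMetric("r2")}
columns = [f"{metric}_test" for metric in sorted(all_test_metrics)]
print(columns)  # ['r2_test', 'rmse_test']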

autoemulate/core/save.py

Lines changed: 7 additions & 2 deletions
@@ -4,6 +4,7 @@
 import joblib
 import pandas as pd

+from autoemulate.core.metrics import get_metric
 from autoemulate.core.results import Result  # , Results
 from autoemulate.emulators.base import Emulator

@@ -150,13 +151,17 @@ def _load_result(self, path: str | Path) -> Result | Emulator:
                 metric_name = col[:-5]  # Remove "_test" suffix
                 mean = row[col]
                 std = row.get(f"{metric_name}_test_std", float("nan"))
-                test_metrics[metric_name] = (mean, std)
+                # Convert metric name string back to Metric object
+                metric = get_metric(metric_name)
+                test_metrics[metric] = (mean, std)
             elif col.endswith("_train") and not col.endswith("_train_std"):
                 # Extract metric name (e.g., "r2" from "r2_train")
                 metric_name = col[:-6]  # Remove "_train" suffix
                 mean = row[col]
                 std = row.get(f"{metric_name}_train_std", float("nan"))
-                train_metrics[metric_name] = (mean, std)
+                # Convert metric name string back to Metric object
+                metric = get_metric(metric_name)
+                train_metrics[metric] = (mean, std)

         return Result(
             id=row["id"],
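Loading reverses the naming convention used by summarize(): a column such as "r2_test" is stripped of its suffix and the remaining name is converted back into a Metric via get_metric, so the deserialised Result is keyed by metric objects again. The sketch below isolates that suffix-parsing round trip; the row values are made up, and the lookup dict stands in for get_metric (here plain tuples play the role of Metric keys).

# Sketch of the suffix parsing used when loading saved results.
# lookup mimics get_metric(name); tuples stand in for Metric objects.
row = {"r2_test": 0.93, "r2_test_std": 0.01, "rmse_train": 0.12, "rmse_train_std": 0.02}
lookup = {"r2": ("r2", True), "rmse": ("rmse", False)}  # name -> (name, maximize)

test_metrics, train_metrics = {}, {}
for col, mean in row.items():
    if col.endswith("_test") and not col.endswith("_test_std"):
        name = col[:-5]  # strip "_test"
        std = row.get(f"{name}_test_std", float("nan"))
        test_metrics[lookup[name]] = (mean, std)
    elif col.endswith("_train") and not col.endswith("_train_std"):
        name = col[:-6]  # strip "_train"
        std = row.get(f"{name}_train_std", float("nan"))
        train_metrics[lookup[name]] = (mean, std)

print(test_metrics)   # {('r2', True): (0.93, 0.01)}
print(train_metrics)  # {('rmse', False): (0.12, 0.02)}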
