Commit 067c3b4

Merge pull request #883 from alan-turing-institute/517-add-option-to-choose-metrics
adding general torchmetrics support
2 parents 0f25b70 + 5f71801 commit 067c3b4

File tree: 12 files changed (+1142, -227 lines)

autoemulate/core/compare.py

Lines changed: 66 additions & 39 deletions
@@ -12,7 +12,12 @@
 
 from autoemulate.core.device import TorchDeviceMixin
 from autoemulate.core.logging_config import get_configured_logger
-from autoemulate.core.model_selection import bootstrap, evaluate, r2_metric
+from autoemulate.core.metrics import (
+    TorchMetrics,
+    get_metric_config,
+    get_metric_configs,
+)
+from autoemulate.core.model_selection import bootstrap, evaluate
 from autoemulate.core.plotting import (
     calculate_subplot_layout,
     create_and_plot_slice,
@@ -72,6 +77,8 @@ def __init__(
         device: DeviceLike | None = None,
         random_seed: int | None = None,
         log_level: str = "progress_bar",
+        tuning_metric: str | TorchMetrics = "r2",
+        evaluation_metrics: list[str | TorchMetrics] | None = None,
     ):
         """
         Initialize the AutoEmulate class.
@@ -122,13 +129,27 @@ def __init__(
             it will show a progress bar during model comparison. It will set the
             logging level to "error" to avoid cluttering the output
             with debug/info logs.
+        tuning_metric: str | TorchMetrics
+            Metric to use for hyperparameter tuning. Can be a string shortcut
+            ("r2", "rmse", "mse", "mae") or a MetricConfig object. Defaults to "r2".
+        evaluation_metrics: list[str | TorchMetrics] | None
+            Metrics to compute during evaluation.
+            If None, then defaults to ["r2", "rmse"].
+            Each entry can be a string shortcut or a MetricConfig object.
+            IMPORTANT: The first metric in the list is used to
+            determine the best model.
         """
         Results.__init__(self)
         self.random_seed = random_seed
         TorchDeviceMixin.__init__(self, device=device)
         x, y = self._convert_to_tensors(x, y)
         x, y = self._move_tensors_to_device(x, y)
 
+        # Setup metrics. If evaluation_metrics is None, default to ["r2", "rmse"]
+        evaluation_metrics = evaluation_metrics or ["r2", "rmse"]
+        self.evaluation_metrics = get_metric_configs(evaluation_metrics)
+        self.tuning_metric = get_metric_config(tuning_metric)
+
         # Transforms to search over
         self.x_transforms_list = [
             self.get_transforms(transforms)
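As a quick orientation for reviewers, the new constructor arguments can be exercised roughly as follows. This is a minimal sketch, not part of the commit: it assumes AutoEmulate is importable from autoemulate.core.compare, that the remaining constructor arguments keep their defaults, and that the tensors are random placeholder data.

import torch

from autoemulate.core.compare import AutoEmulate

# Placeholder simulator data: 100 samples, 2 inputs, 1 output (illustrative only).
x = torch.rand(100, 2)
y = torch.rand(100, 1)

ae = AutoEmulate(
    x,
    y,
    tuning_metric="rmse",               # minimised during hyperparameter tuning
    evaluation_metrics=["r2", "rmse"],  # the first entry determines the best model
)
ae.compare()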
@@ -323,18 +344,19 @@ def log_compare(
         x_transforms,
         y_transforms,
         best_params_for_this_model,
-        r2_score,
-        rmse_score,
+        test_metrics,
     ):
         """Log the comparison results."""
+        metrics_str = ", ".join(
+            f"{metric}: {mean:.3f}" for metric, (mean, _std) in test_metrics.items()
+        )
         msg = (
             "Comparison results:\n"
             f"Best Model: {best_model_name}, "
             f"x transforms: {x_transforms}, "
-            f"y transforms: {y_transforms}",
+            f"y transforms: {y_transforms}, "
             f"Best params: {best_params_for_this_model}, "
-            f"R2 score: {r2_score:.3f}, "
-            f"RMSE score: {rmse_score:.3f}",
+            f"Metrics: {metrics_str}"
         )
         self.logger.debug(msg)
 
@@ -351,7 +373,13 @@ def compare(self):
         - Log the results.
         - Save the best model and its parameters.
         """
-        tuner = Tuner(self.train_val, y=None, n_iter=self.n_iter, device=self.device)
+        tuner = Tuner(
+            self.train_val,
+            y=None,
+            n_iter=self.n_iter,
+            device=self.device,
+            tuning_metric=self.tuning_metric,
+        )
         self.logger.info(
             "Comparing %s", [model_cls.__name__ for model_cls in self.models]
         )
@@ -393,7 +421,11 @@ def compare(self):
            mean_scores = [
                np.mean(score).item() for score in scores
            ]
-           best_score_idx = np.argmax(mean_scores)
+           # Select best whether we're maximizing or minimizing
+           if self.tuning_metric.maximize:
+               best_score_idx = np.argmax(mean_scores)
+           else:
+               best_score_idx = np.argmin(mean_scores)
            best_params_for_this_model = params_list[best_score_idx]
            self.logger.debug(
                'Tuner found best params for model "%s": '
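The direction-aware selection above can be tried in isolation. A small sketch with made-up candidate scores, using the maximize flag on the metric configs defined in autoemulate/core/metrics.py below; the scores are purely illustrative:

import numpy as np

from autoemulate.core.metrics import R2, RMSE

mean_scores = [0.71, 0.85, 0.79]  # hypothetical mean score per hyperparameter candidate

for metric in (R2, RMSE):
    # argmax when higher is better (r2), argmin when lower is better (rmse)
    best_idx = np.argmax(mean_scores) if metric.maximize else np.argmin(mean_scores)
    print(f"{metric.name}: best candidate index {int(best_idx)}")
# r2: best candidate index 1
# rmse: best candidate index 0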
@@ -445,35 +477,33 @@ def compare(self):
                # This can fail for some model params
                transformed_emulator.fit(train_val_x, train_val_y)
 
-               (
-                   (r2_train_val, r2_train_val_std),
-                   (rmse_train_val, rmse_train_val_std),
-               ) = bootstrap(
+               train_metrics = bootstrap(
                    transformed_emulator,
                    train_val_x,
                    train_val_y,
                    n_bootstraps=self.n_bootstraps,
                    device=self.device,
+                   metrics=self.evaluation_metrics,
                )
-               (r2_test, r2_test_std), (rmse_test, rmse_test_std) = (
-                   bootstrap(
-                       transformed_emulator,
-                       test_x,
-                       test_y,
-                       n_bootstraps=self.n_bootstraps,
-                       device=self.device,
-                   )
+               test_metrics = bootstrap(
+                   transformed_emulator,
+                   test_x,
+                   test_y,
+                   n_bootstraps=self.n_bootstraps,
+                   device=self.device,
+                   metrics=self.evaluation_metrics,
                )
 
+               # Log all test metrics from test_metrics dictionary
+               test_metrics_str = ", ".join(
+                   f"{metric}: {mean:.3f} (std: {std:.3f})"
+                   for metric, (mean, std) in test_metrics.items()
+               )
                self.logger.debug(
-                   'Cross-validation for model "%s"'
-                   " completed with test mean (std) R2 score: %.3f (%.3f),"
-                   " mean (std) RMSE score: %.3f (%.3f)",
+                   'Cross-validation for model "%s" '
+                   "completed with test metrics: %s",
                    model_cls.__name__,
-                   r2_test,
-                   r2_test_std,
-                   rmse_test,
-                   rmse_test_std,
+                   test_metrics_str,
                )
                self.logger.info(
                    "Finished running Model: %s\n", model_cls.__name__
@@ -483,14 +513,8 @@ def compare(self):
                    model_name=transformed_emulator.untransformed_model_name,
                    model=transformed_emulator,
                    params=best_params_for_this_model,
-                   r2_test=r2_test,
-                   rmse_test=rmse_test,
-                   r2_test_std=r2_test_std,
-                   rmse_test_std=rmse_test_std,
-                   r2_train=r2_train_val,
-                   rmse_train=rmse_train_val,
-                   r2_train_std=r2_train_val_std,
-                   rmse_train_std=rmse_train_val_std,
+                   test_metrics=test_metrics,
+                   train_metrics=train_metrics,
                )
                self.add_result(result)
                # if successful, break out of the retry loop
@@ -511,14 +535,17 @@ def compare(self):
         )
 
         # Get the best result and log the comparison
-        best_result = self.best_result()
+        # Use the first evaluation metric to determine the best result
+        first_metric = self.evaluation_metrics[0]
+        best_result = self.best_result(
+            metric_name=first_metric.name,
+        )
         self.log_compare(
             best_model_name=best_result.model_name,
             x_transforms=best_result.x_transforms,
             y_transforms=best_result.y_transforms,
             best_params_for_this_model=best_result.params,
-            r2_score=best_result.r2_test,
-            rmse_score=best_result.rmse_test,
+            test_metrics=best_result.test_metrics,
         )
 
     def fit_from_reinitialized(
@@ -642,7 +669,7 @@ def plot( # noqa: PLR0912, PLR0915
 
         # Re-run prediction with just this model to get the predictions
         y_pred, y_variance = model.predict_mean_and_variance(test_x)
-        r2_score = evaluate(y_pred, test_y, r2_metric())
+        r2_score = evaluate(y_pred, test_y)
 
         # Handle ranges
         input_ranges = input_ranges or {}

autoemulate/core/metrics.py

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
+"""Metrics configuration and utilities for model evaluation and tuning."""
+
+from __future__ import annotations
+
+from abc import abstractmethod
+from collections.abc import Sequence
+from functools import partial
+
+import torchmetrics
+
+from autoemulate.core.types import OutputLike, TensorLike, TorchMetricsLike
+
+
+class Metric:
+    """Configuration for a single metric.
+
+    Parameters
+    ----------
+    name : str
+        Display name for the metric.
+    maximize : bool
+        Whether higher values are better. Defaults to True.
+    """
+
+    name: str
+    maximize: bool
+
+    def __repr__(self) -> str:
+        """Return the string representation of the MetricConfig."""
+        return f"MetricConfig(name={self.name}, maximize={self.maximize})"
+
+    @abstractmethod
+    def __call__(self, y_pred: OutputLike, y_true: TensorLike) -> TensorLike:
+        """Calculate metric."""
+
+
+class TorchMetrics(Metric):
+    """Configuration for a single torchmetrics metric.
+
+    Parameters
+    ----------
+    metric : MetricLike
+        The torchmetrics metric class or partial.
+    name : str
+        Display name for the metric. If None, uses the class name of the metric.
+    maximize : bool
+        Whether higher values are better.
+    """
+
+    def __init__(
+        self,
+        metric: TorchMetricsLike,
+        name: str,
+        maximize: bool,
+    ):
+        self.metric = metric
+        self.name = name
+        self.maximize = maximize
+
+    def __call__(self, y_pred: OutputLike, y_true: TensorLike) -> TensorLike:
+        """Calculate metric."""
+        if not isinstance(y_pred, TensorLike):
+            raise ValueError(f"Metric not implemented for y_pred ({type(y_pred)})")
+        if not isinstance(y_true, TensorLike):
+            raise ValueError(f"Metric not implemented for y_true ({type(y_true)})")
+
+        metric = self.metric()
+        metric.to(y_pred.device)
+        # Assume first dim is a batch dim, flatten others for metric calculation
+        metric.update(y_pred.flatten(start_dim=1), y_true.flatten(start_dim=1))
+        return metric.compute()
+
+
+R2 = TorchMetrics(
+    metric=torchmetrics.R2Score,
+    name="r2",
+    maximize=True,
+)
+
+RMSE = TorchMetrics(
+    metric=partial(torchmetrics.MeanSquaredError, squared=False),
+    name="rmse",
+    maximize=False,
+)
+
+MSE = TorchMetrics(
+    metric=torchmetrics.MeanSquaredError,
+    name="mse",
+    maximize=False,
+)
+
+MAE = TorchMetrics(
+    metric=torchmetrics.MeanAbsoluteError,
+    name="mae",
+    maximize=False,
+)
+
+AVAILABLE_METRICS = {
+    "r2": R2,
+    "rmse": RMSE,
+    "mse": MSE,
+    "mae": MAE,
+}
+
+
+def get_metric_config(
+    metric: str | TorchMetrics,
+) -> TorchMetrics:
+    """Convert various metric specifications to MetricConfig.
+
+    Parameters
+    ----------
+    metric : str | type[torchmetrics.Metric] | partial[torchmetrics.Metric] | Metric
+        The metric specification. Can be:
+        - A string shortcut like "r2", "rmse", "mse", "mae"
+        - A Metric instance (returned as-is)
+
+    Returns
+    -------
+    TorchMetrics
+        The metric configuration.
+
+    Raises
+    ------
+    ValueError
+        If the metric specification is invalid or name is not provided when required.
+
+
+    """
+    # If already a TorchMetric, return as-is
+    if isinstance(metric, TorchMetrics):
+        return metric
+
+    if isinstance(metric, str):
+        if metric.lower() in AVAILABLE_METRICS:
+            return AVAILABLE_METRICS[metric.lower()]
+        raise ValueError(
+            f"Unknown metric shortcut '{metric}'. "
+            f"Available options: {list(AVAILABLE_METRICS.keys())}"
+        )
+    # Handle unsupported types
+    raise ValueError(
+        f"Unsupported metric type: {type(metric).__name__}. "
+        "Metric must be a string shortcut or a MetricConfig instance."
+    )
+
+
+def get_metric_configs(
+    metrics: Sequence[str | TorchMetrics],
+) -> list[TorchMetrics]:
+    """Convert a list of metric specifications to MetricConfig objects.
+
+    Parameters
+    ----------
+    metrics : Sequence[str | TorchMetrics]
+        Sequence of metric specifications.
+
+    Returns
+    -------
+    list[TorchMetrics]
+        List of metric configurations.
+    """
+    result_metrics = []
+
+    for m in metrics:
+        config = get_metric_config(m) if isinstance(m, (str | TorchMetrics)) else m
+        result_metrics.append(config)
+
+    return result_metrics
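To show how the pieces of the new module fit together, here is a short sketch of resolving the string shortcuts and wrapping an additional torchmetrics metric. It is illustrative only; the MAPE config below is an example, not something this commit registers in AVAILABLE_METRICS:

import torch
import torchmetrics

from autoemulate.core.metrics import TorchMetrics, get_metric_config, get_metric_configs

# String shortcuts resolve to the predefined configs.
r2 = get_metric_config("r2")                  # R2, maximize=True
configs = get_metric_configs(["r2", "rmse"])  # [R2, RMSE]

# Any torchmetrics metric class (or a partial fixing constructor kwargs, as the
# built-in RMSE config does) can be wrapped in a TorchMetrics config.
mape = TorchMetrics(
    metric=torchmetrics.MeanAbsolutePercentageError,
    name="mape",
    maximize=False,
)

# Metric configs are callables: (y_pred, y_true) -> scalar tensor.
y_pred = torch.rand(32, 1)
y_true = torch.rand(32, 1)
print(r2(y_pred, y_true), mape(y_pred, y_true))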
