
Commit fb2cd26

Merge pull request #924 from alan-turing-institute/add_msll
Add MSLL metric
2 parents: c7ea198 + cb35677

5 files changed: 381 additions, 33 deletions

autoemulate/core/compare.py

Lines changed: 3 additions & 6 deletions

@@ -13,12 +13,7 @@

 from autoemulate.core.device import TorchDeviceMixin
 from autoemulate.core.logging_config import get_configured_logger
-from autoemulate.core.metrics import (
-    R2,
-    Metric,
-    get_metric,
-    get_metrics,
-)
+from autoemulate.core.metrics import R2, Metric, MetricParams, get_metric, get_metrics
 from autoemulate.core.model_selection import bootstrap, evaluate
 from autoemulate.core.plotting import (
     calculate_subplot_layout,
@@ -489,6 +484,7 @@ def compare(self):
             n_bootstraps=self.n_bootstraps,
             device=self.device,
             metrics=self.evaluation_metrics,
+            metric_params=MetricParams(y_train=train_val_y),
         )
         test_metrics = bootstrap(
             transformed_emulator,
@@ -497,6 +493,7 @@ def compare(self):
             n_bootstraps=self.n_bootstraps,
             device=self.device,
             metrics=self.evaluation_metrics,
+            metric_params=MetricParams(y_train=train_val_y),
         )

         # Log all test metrics from test_metrics dictionary
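
The change to compare() above threads the training/validation targets into the bootstrap evaluation via MetricParams: MSLL needs y_train to parameterize its trivial baseline model, while the existing metrics simply ignore that field. A minimal sketch of the calling convention (score_all is a hypothetical helper for illustration, not part of autoemulate, and y_pred is assumed to be a predictive distribution):

from autoemulate.core.metrics import AVAILABLE_METRICS, MetricParams

def score_all(y_pred, y_true, y_train):
    # One MetricParams object is shared across metrics; only MSLL reads y_train,
    # while the TorchMetrics-based metrics and CRPS only use n_samples.
    params = MetricParams(y_train=y_train)
    return {
        name: metric(y_pred, y_true, params)
        for name, metric in AVAILABLE_METRICS.items()
    }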

autoemulate/core/metrics.py

Lines changed: 187 additions & 9 deletions

@@ -4,11 +4,14 @@

 from abc import abstractmethod
 from collections.abc import Sequence
+from dataclasses import dataclass
 from functools import partial, total_ordering
+from typing import Literal

 import torch
 import torchmetrics
 from einops import rearrange
+from torch.distributions import Independent
 from torchmetrics.regression.crps import ContinuousRankedProbabilityScore

 from autoemulate.core.types import (
@@ -19,6 +22,31 @@
 )


+@dataclass
+class MetricParams:
+    """
+    Parameters for metric calculations.
+
+    Attributes
+    ----------
+    n_samples: int
+        Number of samples to draw from the predicted distribution if `y_pred` is a
+        distribution. Defaults to 1000.
+    y_train: TensorLike | None
+        Training target values. In MSLL, used to parameterize the trivial model for
+        standardization. If None, mean log loss is computed without standardization.
+        Defaults to None.
+    reduction: Literal["mean", "none"]
+        Reduction method to apply to the final metric scores computed per task.
+        Options are 'mean' or 'none'. Defaults to 'mean'.
+    """
+
+    n_samples: int = 1000
+    y_train: TensorLike | None = None
+    reduction: Literal["mean", "none"] = "mean"
+    metric_kwargs: dict | None = None  # supports subclasses with arbitrary new kwargs
+
+
 @total_ordering
 class Metric:
     """Configuration for a single metric.
@@ -58,7 +86,10 @@ def __lt__(self, other: Metric) -> bool:

     @abstractmethod
     def __call__(
-        self, y_pred: OutputLike, y_true: TensorLike, n_samples: int = 1000
+        self,
+        y_pred: OutputLike,
+        y_true: TensorLike,
+        metric_params: MetricParams | None = None,
     ) -> TensorLike:
         """Calculate metric."""

@@ -87,20 +118,28 @@ def __init__(
         self.maximize = maximize

     def __call__(
-        self, y_pred: OutputLike, y_true: TensorLike, n_samples: int = 1000
+        self,
+        y_pred: OutputLike,
+        y_true: TensorLike,
+        metric_params: MetricParams | None = None,
     ) -> TensorLike:
         """Calculate metric."""
         if not isinstance(y_pred, OutputLike):
             raise ValueError(f"Metric not implemented for y_pred ({type(y_pred)})")
         if not isinstance(y_true, TensorLike):
             raise ValueError(f"Metric not implemented for y_true ({type(y_true)})")

+        if metric_params is None:
+            metric_params = MetricParams()
+
         # Handle probabilistic predictions
         if isinstance(y_pred, DistributionLike):
             try:
                 y_pred = y_pred.mean
             except Exception:
-                y_pred = y_pred.rsample((n_samples,)).mean(dim=0)
+                y_pred = y_pred.rsample(torch.Size([metric_params.n_samples])).mean(
+                    dim=0
+                )
         metric = self.metric()
         metric.to(y_pred.device)

@@ -117,7 +156,10 @@ class ProbabilisticMetric(Metric):

     @abstractmethod
     def __call__(
-        self, y_pred: OutputLike, y_true: TensorLike, n_samples: int = 1000
+        self,
+        y_pred: OutputLike,
+        y_true: TensorLike,
+        metric_params: MetricParams | None = None,
     ) -> TensorLike:
         """Calculate metric."""

@@ -145,7 +187,10 @@ class CRPSMetric(ProbabilisticMetric):
     maximize: bool = False

     def __call__(
-        self, y_pred: OutputLike, y_true: TensorLike, n_samples: int = 1000
+        self,
+        y_pred: OutputLike,
+        y_true: TensorLike,
+        metric_params: MetricParams | None = None,
     ) -> TensorLike:
         """Calculate CRPS metric.

@@ -167,9 +212,8 @@ def __call__(
             - If distribution: `n_samples` are drawn to estimate CRPS.
         y_true: TensorLike
             True target values of shape `(batch_size, *target_shape)`.
-        n_samples: int
-            Number of samples to draw from the predicted distribution if `y_pred` is a
-            distribution. Defaults to 1000.
+        metric_params: MetricParams
+            Metric parameters including: n_samples.

         Returns
         -------
@@ -184,6 +228,9 @@ def __call__(
         if not isinstance(y_true, TensorLike):
             raise ValueError(f"y_true must be a tensor, got {type(y_true)}")

+        if metric_params is None:
+            metric_params = MetricParams()
+
         # Ensure 2D y_true for consistent handling
         y_true = y_true.unsqueeze(-1) if y_true.ndim == 1 else y_true

@@ -195,7 +242,7 @@ def __call__(
         if isinstance(y_pred, DistributionLike):
             # Distribution case: sample from it
             samples = rearrange(
-                y_pred.sample(torch.Size((n_samples,))),
+                y_pred.sample(torch.Size((metric_params.n_samples,))),
                 "s b ... -> b ... s",
             )
             if samples.shape[:-1] != y_true.shape:
@@ -236,6 +283,134 @@ def __call__(
         return crps_metric(samples_flat, y_true_flat)


+class MSLLMetric(ProbabilisticMetric):
+    """Mean Standardized Log Loss (MSLL) metric.
+
+    MSLL evaluates the quality of probabilistic predictions by measuring the
+    log-likelihood of the true values under the predictive distribution,
+    standardized by the log-likelihood under the trivial model (i.e., a predictive
+    normal distribution parameterized with the training data mean and variance).
+
+    If no training data is supplied, the mean log loss is computed.
+
+    Lower MSLL values indicate better predictive performance.
+
+    Note: This metric requires probabilistic predictions. Standardization
+    assumes that the predictive distribution is Gaussian.
+
+    Attributes
+    ----------
+    name: str
+        Display name for the metric.
+    maximize: bool
+        Whether higher values are better. False for MSLL (lower is better).
+    """
+
+    name: str = "msll"
+    maximize: bool = False
+
+    def __call__(
+        self,
+        y_pred: OutputLike,
+        y_true: TensorLike,
+        metric_params: MetricParams | None = None,
+    ) -> TensorLike:
+        """Calculate MSLL metric.
+
+        If no training data is provided in `metric_params.y_train`, the mean log loss
+        is computed without standardization.
+
+        Parameters
+        ----------
+        y_pred: OutputLike
+            Predicted outputs. Must be a distribution.
+        y_true: TensorLike
+            True target values.
+        metric_params: MetricParams
+            Metric parameters including: y_train and reduction.
+
+        Returns
+        -------
+        TensorLike
+            Mean Standardized Log Loss (MSLL) score.
+
+        Raises
+        ------
+        ValueError
+            If y_pred is not a distribution.
+        """
+        if not isinstance(y_pred, DistributionLike):
+            raise ValueError(
+                f"MSLL metric requires probabilistic predictions, got {type(y_pred)}."
+            )
+
+        if not isinstance(y_true, TensorLike):
+            raise ValueError(f"y_true must be a tensor, got {type(y_true)}")
+
+        if metric_params is None:
+            metric_params = MetricParams()
+
+        # Ensure 2D y_true for consistent handling
+        y_true = y_true.unsqueeze(-1) if y_true.ndim == 1 else y_true
+
+        # Compute mean negative log likelihood (also per output dimension for an
+        # Independent distribution, to support 'none' reduction)
+        if isinstance(y_pred, Independent):
+            model_nll_output = -y_pred.base_dist.log_prob(y_true).mean(dim=0)
+            model_nll_total = model_nll_output.mean()
+        else:
+            model_nll_output = None
+            model_nll_total = -y_pred.log_prob(y_true).mean()
+
+        # If no training data, return mean log loss
+        if metric_params.y_train is None:
+            if metric_params.reduction == "mean":
+                return model_nll_total
+            if metric_params.reduction == "none":
+                if model_nll_output is None:
+                    msg = (
+                        "Per-output MLL not available for non-Independent "
+                        "distributions."
+                    )
+                    raise ValueError(msg)
+                return model_nll_output.reshape(*y_true.shape[1:])
+            msg = (
+                f"Unknown reduction method: {metric_params.reduction}. "
+                "Expected 'mean' or 'none'."
+            )
+            raise ValueError(msg)
+
+        # Keep original shape for y_train_mean to match y_true shape
+        y_train_mean = metric_params.y_train.mean(dim=0, keepdim=True)
+
+        # Following the GPyTorch implementation, use the global variance rather than
+        # the per-task variance:
+        # https://github.com/cornellius-gp/gpytorch/blob/c0fb6c64311fdbef2862fd3ba2bd613fbd081e79/gpytorch/metrics/metrics.py#L60
+        y_train_var = metric_params.y_train.var()
+
+        # Avoid numerical issues
+        y_train_var = torch.clamp(y_train_var, min=1e-6)
+
+        # Compute mean negative log likelihood under trivial Gaussian model
+        trivial_nll_output = 0.5 * (
+            torch.log(2 * torch.pi * y_train_var)
+            + torch.square(y_true - y_train_mean) / (2 * y_train_var)
+        ).mean(dim=0)
+
+        # Return mean standardized log loss
+        if metric_params.reduction == "mean":
+            return model_nll_total - trivial_nll_output.mean()
+        if metric_params.reduction == "none":
+            if model_nll_output is None:
+                msg = "Per-output MLL not available for non-Independent distributions."
+                raise ValueError(msg)
+            return (model_nll_output - trivial_nll_output).reshape(*y_true.shape[1:])
+        msg = (
+            f"Unknown reduction method: {metric_params.reduction}. "
+            "Expected 'mean' or 'none'."
+        )
+        raise ValueError(msg)
+
+
 R2 = TorchMetrics(
     metric=torchmetrics.R2Score,
     name="r2",
@@ -262,12 +437,15 @@ def __call__(

 CRPS = CRPSMetric()

+MSLL = MSLLMetric()
+
 AVAILABLE_METRICS = {
     "r2": R2,
     "rmse": RMSE,
     "mse": MSE,
     "mae": MAE,
     "crps": CRPS,
+    "msll": MSLL,
 }

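As described in the MSLLMetric docstring above, the metric standardizes the model's mean negative log-likelihood against a trivial Gaussian fit to the training targets. Schematically (notation added here for illustration, not taken from the source):

\mathrm{MSLL} = \frac{1}{N}\sum_{i=1}^{N}\bigl[-\log p(y_i)\bigr] - \frac{1}{N}\sum_{i=1}^{N}\bigl[-\log \mathcal{N}\bigl(y_i;\ \bar{y}_{\mathrm{train}},\ \sigma^{2}_{\mathrm{train}}\bigr)\bigr]

When y_train is omitted, the unstandardized mean log loss is returned instead.

A short usage sketch of the new metric (illustrative tensors and shapes only; it assumes DistributionLike covers torch.distributions objects such as Independent):

import torch
from torch.distributions import Independent, Normal

from autoemulate.core.metrics import MSLL, MetricParams

# Illustrative data: 200 test points, 2 output tasks, 500 training targets.
y_true = torch.randn(200, 2)
y_train = torch.randn(500, 2)

# A factorized Gaussian predictive distribution over the two outputs.
y_pred = Independent(Normal(torch.zeros(200, 2), torch.ones(200, 2)), 1)

# Standardized score (lower is better); y_train parameterizes the trivial model.
score = MSLL(y_pred, y_true, MetricParams(y_train=y_train))

# Per-output scores require an Independent distribution and reduction="none".
per_output = MSLL(y_pred, y_true, MetricParams(y_train=y_train, reduction="none"))

# Without y_train, the unstandardized mean log loss is returned instead.
mll = MSLL(y_pred, y_true)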