Merged
18 commits
fec7616
added multiple validation dataloaders and log metrics per val data. (…
XuesongYang Dec 16, 2025
0d7723a
removed dataclass for media data.
XuesongYang Feb 28, 2026
9ed637b
add moe expert usage monitoring during training
XuesongYang Feb 28, 2026
eaecadb
unify single/multi validation dataloader handling via local property …
XuesongYang Mar 1, 2026
15d4c3d
update moe yaml config.
XuesongYang Mar 1, 2026
daa251e
fix DDP crash: keep MoE expert usage stats on GPU for sync_dist
XuesongYang Mar 1, 2026
f4123c3
make media artifact names using two digits for examples.
XuesongYang Mar 2, 2026
8f08024
replace wandb.Table MoE monitoring with per-expert scalars and layer-…
XuesongYang Mar 2, 2026
5294495
fix MoE heatmap rendering and clean up MoE metric logging
XuesongYang Mar 2, 2026
d3136c1
update layer index with two digits
XuesongYang Mar 2, 2026
85822f9
fix WandB step alignment for validation media and heatmaps and remove…
XuesongYang Mar 3, 2026
8a3b2dc
Apply isort and black reformatting
XuesongYang Mar 3, 2026
9508546
refactor validation optional metrics into loop-driven collection for …
XuesongYang Mar 3, 2026
d193417
unify non-lhotse config key from `dataset` to `datasets` (dict) for train/val/test
XuesongYang Mar 3, 2026
b0d908e
fix docs and readmes.
XuesongYang Mar 4, 2026
8606555
bugfix: fix docstring reST formatting and update stale config key in …
XuesongYang Mar 4, 2026
b44a526
fix PO models to use list-of-lists validation_step_outputs
XuesongYang Mar 4, 2026
7b2d1af
fixed longform docs bug.
XuesongYang Mar 7, 2026
16 changes: 8 additions & 8 deletions docs/source/tts/magpietts-longform.rst
@@ -68,7 +68,7 @@ The input text is split into individual sentences using punctuation markers (``.
Step 2: State Initialization
----------------------------

A ``LongformChunkState`` object is created to track information across sentence chunks:
A ``ChunkState`` object is created to track information across sentence chunks:

- **History text tokens**: Text from previous chunks for context
- **History encoder context**: Encoder outputs that provide continuity
@@ -112,7 +112,7 @@ Key Components

1. **Sentence Splitting** (``split_by_sentence``): Intelligently splits text on sentence boundaries while handling abbreviations (e.g., "Dr.", "Mr.").

2. **Chunk State** (``LongformChunkState``): Maintains context across chunks:
2. **Chunk State** (``ChunkState``): Maintains context across chunks:

- ``history_text``: Text tokens from previous chunks
- ``history_context_tensor``: Encoder outputs for continuity
@@ -211,24 +211,24 @@ Configuration Dataclasses
#########################


``LongformConfig``
------------------
``ChunkedInferenceConfig``
--------------------------

Immutable tuning parameters (set in model):

.. literalinclude:: ../../../nemo/collections/tts/models/magpietts.py
:language: python
:pyobject: LongformConfig
:pyobject: ChunkedInferenceConfig


``LongformChunkState``
----------------------
``ChunkState``
--------------

Mutable state passed between chunk iterations:

.. literalinclude:: ../../../nemo/collections/tts/models/magpietts.py
:language: python
:pyobject: LongformChunkState
:pyobject: ChunkState


Best Practices
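The chunk-state mechanism this file documents can be sketched in plain Python. This is an illustrative approximation, not the actual ``ChunkState`` from ``magpietts.py``; the real class stores encoder outputs as tensors, and the field and method names below are assumptions:

```python
from dataclasses import dataclass, field
from typing import Any, List, Optional


@dataclass
class ChunkStateSketch:
    """Mutable state carried across sentence chunks (illustrative only)."""

    history_text: List[int] = field(default_factory=list)  # text tokens from previous chunks
    history_context: Optional[Any] = None  # stands in for the encoder-output tensor

    def update(self, new_tokens: List[int], encoder_out: Any) -> None:
        # After synthesizing a chunk, extend the text history and replace
        # the encoder context used to condition the next chunk.
        self.history_text.extend(new_tokens)
        self.history_context = encoder_out


state = ChunkStateSketch()
state.update([5, 9, 12], "enc_out_chunk_00")
state.update([7, 3], "enc_out_chunk_01")
print(state.history_text)  # [5, 9, 12, 7, 3]
```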
4 changes: 2 additions & 2 deletions docs/source/tts/magpietts-po.rst
@@ -96,8 +96,8 @@ The final step is fine-tuning the base model on the preference pairs using the D
max_epochs=10 \
exp_manager.exp_dir=/path/to/dpo_experiment \
exp_manager.checkpoint_callback_params.always_save_nemo=false \
model.train_ds.dataset._target_="nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDatasetDPO" \
model.validation_ds.dataset._target_="nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDatasetDPO" \
model.train_ds.datasets._target_="nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDatasetDPO" \
model.validation_ds.datasets._target_="nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDatasetDPO" \
+train_ds_meta.dpopreftrain.manifest_path="/path/to/manifests/" \
+train_ds_meta.dpopreftrain.audio_dir="/" \
+train_ds_meta.dpopreftrain.feature_dir="/" \
7 changes: 5 additions & 2 deletions examples/tts/conf/magpietts/magpietts.yaml
@@ -80,7 +80,7 @@ model:
# pretrained_model: "google/byt5-small"

train_ds:
dataset:
datasets:
_target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
dataset_meta: ${train_ds_meta}
weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch}
@@ -93,8 +93,11 @@ model:
drop_last: true
pin_memory: true

# Non-lhotse validation uses a single dataloader. All dataset_meta entries are mixed
# together, so validation metrics are logged jointly. For per-dataset validation
# metrics, use the lhotse config (magpietts_lhotse.yaml) with separate datasets entries.
validation_ds:
dataset:
datasets:
_target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
dataset_meta: ${val_ds_meta}
min_duration: 0.2
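The behavior described in the comment above — joint metrics with a single mixed dataloader, per-dataset metrics with the lhotse config — can be illustrated with a small naming helper. `metric_name` and the dataset names here are hypothetical, not NeMo APIs:

```python
from typing import List


def metric_name(base: str, dataset_names: List[str], dataloader_idx: int) -> str:
    # With one dataloader all validation data is mixed together, so a single
    # joint metric is logged; with several dataloaders each metric is suffixed
    # with its dataset name so every dataset gets its own curve.
    if len(dataset_names) <= 1:
        return base
    return f"{base}_{dataset_names[dataloader_idx]}"


print(metric_name("val_loss", ["val_set_0"], 0))               # val_loss
print(metric_name("val_loss", ["val_set_0", "val_set_1"], 1))  # val_loss_val_set_1
```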
105 changes: 54 additions & 51 deletions examples/tts/conf/magpietts/magpietts_lhotse.yaml
@@ -1,6 +1,7 @@
name: Magpie-TTS

quadratic_duration: 20 # training and validation datasets can use the same quadratic_duration.

model:
use_lhotse: true
model_type: "decoder_ce" # decoder_context_tts or decoder_ce
@@ -16,7 +17,7 @@ model:
alignment_loss_scale: 0.002
embedding_dim: 768
codecmodel_path: ???
cfg_unconditional_prob: 0.1
cfg_unconditional_prob: 0.1 # enable classifier-free guidance during training by dropping out conditionals with this probability

# Alignment encoder parameters, to binarize the prior
# This is used for attention-constrained training and inference
@@ -70,57 +71,60 @@ model:
train_ds:
use_lhotse: ${model.use_lhotse}
volume_norm: true

dataset:
min_duration: 0.2
min_context_speaker_similarity: 0.6
max_cer: 0.03
batch_duration : ??? # in seconds. Adjust based on your GPU memory.
quadratic_duration: ${quadratic_duration}
use_bucketing: true
num_buckets: 20
bucket_buffer_size: 20_000
shuffle_buffer_size: 20_000
num_cuts_for_bins_estimate: 20_000
shard_seed: "trng"
drop_last: true
shuffle: true
num_workers: 6
pin_memory: true

input_cfg:
- type: lhotse_shar
shar_path: ???
weight: 1.0
tags:
tokenizer_names: ["english_phoneme"]
min_duration: 0.2
min_context_speaker_similarity: 0.6
max_cer: 0.03
batch_duration: ??? # in seconds. Adjust based on your GPU memory.
quadratic_duration: ${quadratic_duration}
use_bucketing: true
num_buckets: 20
bucket_buffer_size: 20_000
shuffle_buffer_size: 20_000
num_cuts_for_bins_estimate: 20_000
shard_seed: "trng"
drop_last: true
shuffle: true
num_workers: 6
pin_memory: true

input_cfg:
- type: lhotse_shar
shar_path: ???
weight: 1.0
tags:
tokenizer_names: ["english_phoneme"]

validation_ds:
# the entries under 'datasets' are a list of separate dataloaders.
# The structure is:
# - name: '<dataset-name>'
# <dataloader-dict-config>
# They inherit all settings from validation_ds, but can individually override them.
use_lhotse: ${model.use_lhotse}
volume_norm: true

dataset:
min_duration: 0.2
min_context_speaker_similarity: 0.6
max_cer: 0.03
batch_duration: ??? # recommend to use smaller batch_duration for validation dataset than training dataset.
quadratic_duration: ${quadratic_duration}
use_bucketing: false
force_finite: true
force_map_dataset: true
seed: 42
shard_seed: "randomized"
drop_last: false
shuffle: false
num_workers: 2
pin_memory: true

input_cfg:
- type: lhotse_shar
shar_path: ???
weight: 1.0
tags:
tokenizer_names: ["english_phoneme"]
min_duration: 0.2
min_context_speaker_similarity: 0.6
max_cer: 0.03
batch_duration: ??? # a smaller batch_duration is recommended for validation than for training.
quadratic_duration: ${quadratic_duration}
use_bucketing: false
force_finite: true
force_map_dataset: true
seed: 42
shard_seed: "randomized"
drop_last: false
shuffle: false
num_workers: 2
pin_memory: true

datasets:
- name: "val_set_0" # rename to your dataset name, add more as needed
input_cfg:
- type: lhotse_shar
shar_path: ???
weight: 1.0
tags:
tokenizer_names: ["english_phoneme"]

encoder:
n_layers: 6
@@ -185,10 +189,9 @@ trainer:
precision: 32
max_steps: ???
accumulate_grad_batches: 1
enable_checkpointing: False # Provided by exp_manager
logger: false # Provided by exp_manager
enable_checkpointing: false # Provided by exp_manager
logger: false # Provided by exp_manager
log_every_n_steps: 100
check_val_every_n_epoch: 1
limit_train_batches: 1_000
val_check_interval: 1_000
num_sanity_val_steps: 0
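The inheritance rule stated in the `validation_ds` comment — each `datasets` entry inherits the parent settings but can individually override them — amounts to a shallow dict merge. The sketch below uses hypothetical names and is not the actual NeMo resolution code:

```python
def resolve_entry_cfg(parent_cfg: dict, entry: dict) -> dict:
    # Start from the shared validation_ds settings, then let the per-dataset
    # entry override any of them; 'name' labels the dataloader and is not
    # itself a dataloader setting.
    overrides = {k: v for k, v in entry.items() if k != "name"}
    return {**parent_cfg, **overrides}


parent = {"num_workers": 2, "shuffle": False, "batch_duration": 100}
entry = {"name": "val_set_0", "batch_duration": 50}
print(resolve_entry_cfg(parent, entry))
# {'num_workers': 2, 'shuffle': False, 'batch_duration': 50}
```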
97 changes: 50 additions & 47 deletions examples/tts/conf/magpietts/magpietts_lhotse_moe.yaml
@@ -74,57 +74,60 @@ model:
train_ds:
use_lhotse: ${model.use_lhotse}
volume_norm: true

dataset:
min_duration: 0.2
min_context_speaker_similarity: 0.6
max_cer: 0.03
batch_duration : ??? # in seconds. Adjust based on your GPU memory.
quadratic_duration: ${quadratic_duration}
use_bucketing: true
num_buckets: 20
bucket_buffer_size: 20_000
shuffle_buffer_size: 20_000
num_cuts_for_bins_estimate: 20_000
shard_seed: "trng"
drop_last: true
shuffle: true
num_workers: 6
pin_memory: true

input_cfg:
- type: lhotse_shar
shar_path: ???
weight: 1.0
tags:
tokenizer_names: ["english_phoneme"]
min_duration: 0.2
min_context_speaker_similarity: 0.6
max_cer: 0.03
batch_duration: ??? # in seconds. Adjust based on your GPU memory.
quadratic_duration: ${quadratic_duration}
use_bucketing: true
num_buckets: 20
bucket_buffer_size: 20_000
shuffle_buffer_size: 20_000
num_cuts_for_bins_estimate: 20_000
shard_seed: "trng"
drop_last: true
shuffle: true
num_workers: 6
pin_memory: true

input_cfg:
- type: lhotse_shar
shar_path: ???
weight: 1.0
tags:
tokenizer_names: ["english_phoneme"]

validation_ds:
# the entries under 'datasets' are a list of separate dataloaders.
# The structure is:
# - name: '<dataset-name>'
# <dataloader-dict-config>
# They inherit all settings from validation_ds, but can individually override them.
use_lhotse: ${model.use_lhotse}
volume_norm: true

dataset:
min_duration: 0.2
min_context_speaker_similarity: 0.6
max_cer: 0.03
batch_duration: ??? # recommend to use smaller batch_duration for validation dataset than training dataset.
quadratic_duration: ${quadratic_duration}
use_bucketing: false
force_finite: true
force_map_dataset: true
seed: 42
shard_seed: "randomized"
drop_last: false
shuffle: false
num_workers: 2
pin_memory: true

input_cfg:
- type: lhotse_shar
shar_path: ???
weight: 1.0
tags:
tokenizer_names: ["english_phoneme"]
min_duration: 0.2
min_context_speaker_similarity: 0.6
max_cer: 0.03
batch_duration: ??? # a smaller batch_duration is recommended for validation than for training.
quadratic_duration: ${quadratic_duration}
use_bucketing: false
force_finite: true
force_map_dataset: true
seed: 42
shard_seed: "randomized"
drop_last: false
shuffle: false
num_workers: 2
pin_memory: true

datasets:
- name: "val_set_0" # rename to your dataset name, add more as needed
input_cfg:
- type: lhotse_shar
shar_path: ???
weight: 1.0
tags:
tokenizer_names: ["english_phoneme"]

encoder:
n_layers: 6
2 changes: 1 addition & 1 deletion examples/tts/conf/magpietts/magpietts_po_inference.yaml
@@ -88,7 +88,7 @@ model:
# pretrained_model: "google/byt5-small"

test_ds:
dataset:
datasets:
_target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
dataset_meta: ${test_ds_meta}
min_duration: 0.2
6 changes: 3 additions & 3 deletions nemo/collections/tts/losses/moe_loss.py
@@ -16,7 +16,7 @@
import torch.nn.functional as F

from nemo.core.classes import Loss, typecheck
from nemo.core.neural_types.elements import LossType, ProbsType
from nemo.core.neural_types.elements import LogitsType, LossType, ProbsType
from nemo.core.neural_types.neural_type import NeuralType


@@ -122,7 +122,7 @@ def __init__(self, loss_scale: float = 0.001):
@property
def input_types(self):
return {
"router_logits": NeuralType(('B', 'T', 'D'), ProbsType()), # D = num_experts
"router_logits": NeuralType(('B', 'T', 'D'), LogitsType()), # D = num_experts
"x_mask": NeuralType(('B', 'T'), ProbsType(), optional=True),
}

@@ -194,7 +194,7 @@ def __init__(
@property
def input_types(self):
return {
"router_logits": NeuralType(('B', 'T', 'D'), ProbsType()),
"router_logits": NeuralType(('B', 'T', 'D'), LogitsType()),
"router_probs": NeuralType(('B', 'T', 'D'), ProbsType()),
"x_mask": NeuralType(('B', 'T'), ProbsType(), optional=True),
}
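The losses touched in this diff consume router logits. A router load-balancing penalty of the general kind used in MoE training can be sketched in pure Python; this is an illustrative stand-in, not the implementation in `moe_loss.py`:

```python
import math
from typing import List


def softmax(logits: List[float]) -> List[float]:
    m = max(logits)
    exps = [math.exp(x - m) for x in logits]
    s = sum(exps)
    return [e / s for e in exps]


def load_balance_penalty(router_logits: List[List[float]], loss_scale: float = 0.001) -> float:
    # Convert per-token logits to routing probabilities, average the
    # probability mass each expert receives, and penalize deviation from
    # uniform usage so no expert is starved or overloaded.
    probs = [softmax(row) for row in router_logits]
    num_tokens, num_experts = len(probs), len(probs[0])
    usage = [sum(row[e] for row in probs) / num_tokens for e in range(num_experts)]
    uniform = 1.0 / num_experts
    return loss_scale * sum((u - uniform) ** 2 for u in usage)


print(load_balance_penalty([[0.0, 0.0], [0.0, 0.0]]))  # 0.0 (perfectly balanced routing)
```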