|
22 | 22 |
|
23 | 23 | import numpy as np |
24 | 24 | import torch |
25 | | -import torch.nn as nn |
26 | 25 | from datasets import load_dataset |
27 | 26 | from transformers import AutoModelForCausalLM, AutoTokenizer |
28 | 27 |
|
| 28 | +import modelopt.torch.opt as mto |
29 | 29 | import modelopt.torch.sparsity.attention_sparsity as mtsa |
30 | 30 | from modelopt.torch.export import export_hf_checkpoint |
31 | 31 | from modelopt.torch.sparsity.attention_sparsity import SparseAttentionConfig |
32 | | -from modelopt.torch.sparsity.attention_sparsity.config import ( |
33 | | - SKIP_SOFTMAX_CALIB, |
34 | | - SKIP_SOFTMAX_DEFAULT, |
35 | | -) |
36 | | -from modelopt.torch.sparsity.attention_sparsity.nn.sparse_attention import SparseAttentionModule |
| 32 | +from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT |
| 33 | +from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule |
37 | 34 | from modelopt.torch.utils.memory_monitor import launch_memory_monitor |
38 | 35 |
|
39 | 36 | RAND_SEED = 1234 |
40 | 37 |
|
| 38 | +# Enable HuggingFace checkpointing support |
| 39 | +mto.enable_huggingface_checkpointing() |
| 40 | + |
41 | 41 | # You can define custom configurations or use the default |
42 | 42 | SPARSE_ATTN_CFG_CHOICES = { |
43 | 43 | "skip_softmax": SKIP_SOFTMAX_DEFAULT, |
44 | | - "skip_softmax_calib": SKIP_SOFTMAX_CALIB, |
45 | 44 | } |
46 | 45 |
|
47 | 46 |
|
48 | | -def print_sparsity_stats(model: nn.Module): |
49 | | - """Print sparsity statistics if available.""" |
50 | | - module_stats = [] |
51 | | - for name, module in model.named_modules(): |
52 | | - if hasattr(module, "get_stats"): |
53 | | - stats = module.get_stats() |
54 | | - if stats and "average_sparsity" in stats: |
55 | | - module_stats.append((name, stats["average_sparsity"])) |
56 | | - |
57 | | - if not module_stats: |
58 | | - print("No sparsity statistics available") |
59 | | - return |
60 | | - |
61 | | - # Check if all modules have the same sparsity |
62 | | - sparsities = [s for _, s in module_stats] |
63 | | - if len(set(sparsities)) == 1: |
64 | | - # All identical - show summary |
65 | | - print(f"Average sparsity across all {len(module_stats)} modules: {sparsities[0]:.2%}") |
66 | | - else: |
67 | | - # Different sparsities - show individual values |
68 | | - avg_sparsity = sum(sparsities) / len(sparsities) |
69 | | - print(f"Average sparsity: {avg_sparsity:.2%}") |
70 | | - print("Per-module breakdown:") |
71 | | - for name, sparsity in module_stats: |
72 | | - print(f" {name}: {sparsity:.2%} sparse") |
73 | | - |
74 | | - |
75 | 47 | def get_narrativeqa_samples(num_samples=3): |
76 | 48 | """Load samples from NarrativeQA dataset for testing. |
77 | 49 |
|
78 | 50 | Args: |
79 | 51 | num_samples: Number of samples to generate |
| 52 | +
|
| 53 | + Raises: |
| 54 | + RuntimeError: If dataset loading fails |
| 55 | + ValueError: If no valid samples could be loaded |
80 | 56 | """ |
81 | | - # Load NarrativeQA dataset |
82 | | - dataset = load_dataset("narrativeqa", split="test", streaming=True) |
| 57 | +    # Load NarrativeQA dataset, surfacing a clear error if loading fails |
| 58 | + try: |
| 59 | + dataset = load_dataset("narrativeqa", split="test", streaming=True) |
| 60 | + except Exception as e: |
| 61 | +        raise RuntimeError(f"Failed to load NarrativeQA dataset: {e}") from e |
83 | 62 |
|
84 | 63 | samples = [] |
85 | 64 | for i, item in enumerate(dataset): |
@@ -120,8 +99,10 @@ def truncate_text(text: str, tokenizer, max_length: int): |
120 | 99 | return text |
121 | 100 |
|
122 | 101 | # Need to truncate - preserve beginning and end |
123 | | - # Reserve some tokens for special tokens |
124 | | - available_tokens = max_length - 2 # Account for special tokens |
| 102 | +    # Count the special tokens this tokenizer actually adds |
| 103 | + dummy_tokens = tokenizer.encode("", add_special_tokens=True) |
| 104 | + special_token_count = len(dummy_tokens) |
| 105 | + available_tokens = max_length - special_token_count |
125 | 106 |
|
126 | 107 | # Split tokens roughly in half for beginning and end |
127 | 108 | begin_tokens = available_tokens // 2 |
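
A brief aside on why the hunk above counts special tokens instead of hard-coding 2: different tokenizers add different numbers of special tokens, as the short, hedged check below illustrates (the model names are common examples, not taken from this script).

from transformers import AutoTokenizer

# Encoding an empty string with add_special_tokens=True yields exactly the
# tokens the tokenizer adds on its own, so its length is the right reserve.
for name in ("bert-base-uncased", "gpt2"):
    tok = AutoTokenizer.from_pretrained(name)
    print(name, len(tok.encode("", add_special_tokens=True)))
    # bert-base-uncased -> 2 ([CLS] and [SEP]); gpt2 -> 0
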
@@ -173,9 +154,7 @@ def verify_outputs(model, tokenizer, args): |
173 | 154 | print("BASELINE vs SPARSE ATTENTION COMPARISON") |
174 | 155 | print("=" * 60) |
175 | 156 | print(f"\nTest prompt: {display_prompt}") |
176 | | - print(f"Input tokens: {inputs['input_ids'].shape[1]} (max: {args.seq_len})") |
177 | | - if "[...]" in truncated_prompt: |
178 | | - print("Note: Text was middle-truncated to fit token limit") |
| 157 | + print(f"Input tokens: {inputs['input_ids'].shape[1]}") |
179 | 158 |
|
180 | 159 | # Helper function to generate text |
181 | 160 | def generate_text(model, inputs, args, tokenizer): |
@@ -235,23 +214,13 @@ def sparsify_model(model, args): |
235 | 214 | modified_sparse_cfg[pattern] = modified_cfg |
236 | 215 |
|
237 | 216 | # Create new config with modified settings |
238 | | - sparse_config = SparseAttentionConfig( |
239 | | - method=base_config["method"], |
240 | | - sparse_cfg=modified_sparse_cfg, |
241 | | - collect_stats=True, # Enable stats collection for monitoring |
242 | | - ) |
| 217 | + sparse_config = SparseAttentionConfig(sparse_cfg=modified_sparse_cfg) |
243 | 218 |
|
244 | | - # Sparsify with optional calibration - framework handles calibration automatically |
| 219 | + # Sparsify the model |
245 | 220 | model = mtsa.sparsify(model, config=sparse_config) |
246 | 221 |
|
247 | 222 | print("Sparse attention applied successfully!") |
248 | 223 |
|
249 | | - # Show sparsity statistics |
250 | | - print("\n" + "=" * 60) |
251 | | - print("Sparsity Statistics") |
252 | | - print("=" * 60) |
253 | | - print_sparsity_stats(model) |
254 | | - |
255 | 224 | return model |
256 | 225 |
|
257 | 226 |
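
For orientation, here is a minimal sketch of how the imports touched by this diff fit together end to end. It is not the example script itself: the model id and export directory are placeholders, passing SKIP_SOFTMAX_DEFAULT straight to sparsify is an assumption for the out-of-the-box case (the script derives a SparseAttentionConfig from it first), and the export_hf_checkpoint keyword argument is assumed from typical ModelOpt usage.

import modelopt.torch.opt as mto
import modelopt.torch.sparsity.attention_sparsity as mtsa
from modelopt.torch.export import export_hf_checkpoint
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT
from transformers import AutoModelForCausalLM

# Register ModelOpt state with HuggingFace save/load, as the diff now does at import time.
mto.enable_huggingface_checkpointing()

model_id = "<hf-model-id>"  # placeholder; the script reads this from its CLI arguments
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

# Apply sparse attention with the default skip-softmax settings. The script builds a
# SparseAttentionConfig(sparse_cfg=...) from this dict; passing the default directly
# is an assumption, not confirmed by the diff.
model = mtsa.sparsify(model, config=SKIP_SOFTMAX_DEFAULT)

# Export a HuggingFace-format checkpoint (keyword name may vary across ModelOpt versions).
export_hf_checkpoint(model, export_dir="sparse_attn_ckpt")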
|
|