|
22 | 22 |
|
23 | 23 | import numpy as np |
24 | 24 | import torch |
25 | | -import torch.nn as nn |
26 | 25 | from datasets import load_dataset |
27 | 26 | from transformers import AutoModelForCausalLM, AutoTokenizer |
28 | 27 |
|
| 28 | +import modelopt.torch.opt as mto |
29 | 29 | import modelopt.torch.sparsity.attention_sparsity as mtsa |
30 | 30 | from modelopt.torch.export import export_hf_checkpoint |
31 | 31 | from modelopt.torch.sparsity.attention_sparsity import SparseAttentionConfig |
32 | | -from modelopt.torch.sparsity.attention_sparsity.config import ( |
33 | | - SKIP_SOFTMAX_CALIB, |
34 | | - SKIP_SOFTMAX_DEFAULT, |
35 | | -) |
36 | | -from modelopt.torch.sparsity.attention_sparsity.nn.sparse_attention import SparseAttentionModule |
| 32 | +from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT |
| 33 | +from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule |
37 | 34 | from modelopt.torch.utils.memory_monitor import launch_memory_monitor |
38 | 35 |
|
39 | 36 | RAND_SEED = 1234 |
40 | 37 |
|
| 38 | +# Enable HuggingFace checkpointing support |
| 39 | +mto.enable_huggingface_checkpointing() |
| 40 | + |
41 | 41 | # You can define custom configurations or use the default |
42 | 42 | SPARSE_ATTN_CFG_CHOICES = { |
43 | 43 | "skip_softmax": SKIP_SOFTMAX_DEFAULT, |
44 | | - "skip_softmax_calib": SKIP_SOFTMAX_CALIB, |
45 | 44 | } |
46 | 45 |
|
47 | 46 |
|
48 | | -def print_sparsity_stats(model: nn.Module): |
49 | | - """Print sparsity statistics if available.""" |
50 | | - module_stats = [] |
51 | | - for name, module in model.named_modules(): |
52 | | - if hasattr(module, "get_stats"): |
53 | | - stats = module.get_stats() |
54 | | - if stats and "average_sparsity" in stats: |
55 | | - module_stats.append((name, stats["average_sparsity"])) |
56 | | - |
57 | | - if not module_stats: |
58 | | - print("No sparsity statistics available") |
59 | | - return |
60 | | - |
61 | | - # Check if all modules have the same sparsity |
62 | | - sparsities = [s for _, s in module_stats] |
63 | | - if len(set(sparsities)) == 1: |
64 | | - # All identical - show summary |
65 | | - print(f"Average sparsity across all {len(module_stats)} modules: {sparsities[0]:.2%}") |
66 | | - else: |
67 | | - # Different sparsities - show individual values |
68 | | - avg_sparsity = sum(sparsities) / len(sparsities) |
69 | | - print(f"Average sparsity: {avg_sparsity:.2%}") |
70 | | - print("Per-module breakdown:") |
71 | | - for name, sparsity in module_stats: |
72 | | - print(f" {name}: {sparsity:.2%} sparse") |
73 | | - |
74 | | - |
75 | 47 | def get_narrativeqa_samples(num_samples=3): |
76 | 48 | """Load samples from NarrativeQA dataset for testing. |
77 | 49 |
|
78 | 50 | Args: |
79 | 51 | num_samples: Number of samples to generate |
| 52 | +
|
| 53 | + Raises: |
| 54 | + RuntimeError: If dataset loading fails |
| 55 | + ValueError: If no valid samples could be loaded |
80 | 56 | """ |
81 | | - # Load NarrativeQA dataset |
82 | | - dataset = load_dataset("narrativeqa", split="test", streaming=True) |
| 57 | +    # Load NarrativeQA dataset, surfacing a clear error if loading fails |
| 58 | + try: |
| 59 | + dataset = load_dataset("narrativeqa", split="test", streaming=True) |
| 60 | + except Exception as e: |
| 61 | +        raise RuntimeError(f"Failed to load NarrativeQA dataset: {e}") from e |
83 | 62 |
|
84 | 63 | samples = [] |
85 | 64 | for i, item in enumerate(dataset): |
@@ -120,8 +99,10 @@ def truncate_text(text: str, tokenizer, max_length: int): |
120 | 99 | return text |
121 | 100 |
|
122 | 101 | # Need to truncate - preserve beginning and end |
123 | | - # Reserve some tokens for special tokens |
124 | | - available_tokens = max_length - 2 # Account for special tokens |
| 102 | +    # Count the special tokens this tokenizer actually adds |
| 103 | + dummy_tokens = tokenizer.encode("", add_special_tokens=True) |
| 104 | + special_token_count = len(dummy_tokens) |
| 105 | + available_tokens = max_length - special_token_count |
125 | 106 |
|
126 | 107 | # Split tokens roughly in half for beginning and end |
127 | 108 | begin_tokens = available_tokens // 2 |
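
A brief aside on why the hunk above counts special tokens instead of hard-coding 2: different tokenizers add different numbers of special tokens, as the short, hedged check below illustrates (the model names are common examples, not taken from this script).

from transformers import AutoTokenizer

# Encoding an empty string with add_special_tokens=True yields exactly the
# tokens the tokenizer adds on its own, so its length is the right reserve.
for name in ("bert-base-uncased", "gpt2"):
    tok = AutoTokenizer.from_pretrained(name)
    print(name, len(tok.encode("", add_special_tokens=True)))
    # bert-base-uncased -> 2 ([CLS] and [SEP]); gpt2 -> 0
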
@@ -173,9 +154,7 @@ def verify_outputs(model, tokenizer, args): |
173 | 154 | print("BASELINE vs SPARSE ATTENTION COMPARISON") |
174 | 155 | print("=" * 60) |
175 | 156 | print(f"\nTest prompt: {display_prompt}") |
176 | | - print(f"Input tokens: {inputs['input_ids'].shape[1]} (max: {args.seq_len})") |
177 | | - if "[...]" in truncated_prompt: |
178 | | - print("Note: Text was middle-truncated to fit token limit") |
| 157 | + print(f"Input tokens: {inputs['input_ids'].shape[1]}") |
179 | 158 |
|
180 | 159 | # Helper function to generate text |
181 | 160 | def generate_text(model, inputs, args, tokenizer): |
@@ -235,23 +214,13 @@ def sparsify_model(model, args): |
235 | 214 | modified_sparse_cfg[pattern] = modified_cfg |
236 | 215 |
|
237 | 216 | # Create new config with modified settings |
238 | | - sparse_config = SparseAttentionConfig( |
239 | | - method=base_config["method"], |
240 | | - sparse_cfg=modified_sparse_cfg, |
241 | | - collect_stats=True, # Enable stats collection for monitoring |
242 | | - ) |
| 217 | + sparse_config = SparseAttentionConfig(sparse_cfg=modified_sparse_cfg) |
243 | 218 |
|
244 | | - # Sparsify with optional calibration - framework handles calibration automatically |
| 219 | + # Sparsify the model |
245 | 220 | model = mtsa.sparsify(model, config=sparse_config) |
246 | 221 |
|
247 | 222 | print("Sparse attention applied successfully!") |
248 | 223 |
|
249 | | - # Show sparsity statistics |
250 | | - print("\n" + "=" * 60) |
251 | | - print("Sparsity Statistics") |
252 | | - print("=" * 60) |
253 | | - print_sparsity_stats(model) |
254 | | - |
255 | 224 | return model |
256 | 225 |
|
257 | 226 |
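
For orientation, here is a minimal sketch of how the imports touched by this diff fit together end to end. It is not the example script itself: the model id and export directory are placeholders, passing SKIP_SOFTMAX_DEFAULT straight to sparsify is an assumption for the out-of-the-box case (the script derives a SparseAttentionConfig from it first), and the export_hf_checkpoint keyword argument is assumed from typical ModelOpt usage.

import modelopt.torch.opt as mto
import modelopt.torch.sparsity.attention_sparsity as mtsa
from modelopt.torch.export import export_hf_checkpoint
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT
from transformers import AutoModelForCausalLM

# Register ModelOpt state with HuggingFace save/load, as the diff now does at import time.
mto.enable_huggingface_checkpointing()

model_id = "<hf-model-id>"  # placeholder; the script reads this from its CLI arguments
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

# Apply sparse attention with the default skip-softmax settings. The script builds a
# SparseAttentionConfig(sparse_cfg=...) from this dict; passing the default directly
# is an assumption, not confirmed by the diff.
model = mtsa.sparsify(model, config=SKIP_SOFTMAX_DEFAULT)

# Export a HuggingFace-format checkpoint (keyword name may vary across ModelOpt versions).
export_hf_checkpoint(model, export_dir="sparse_attn_ckpt")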
|
|