Commits (34)
3691629  Manage supported model configurations (ckadner, Sep 5, 2025)
1b80333  Reorganize import statements (ckadner, Sep 5, 2025)
3d8ad48  Lint docs (ckadner, Sep 5, 2025)
c8ce7da  use 'x86_64' instead of 'amd64' (ckadner, Sep 5, 2025)
4decb48  typecheck updates (ckadner, Sep 5, 2025)
09bde9e  more typecheck updates (ckadner, Sep 5, 2025)
564d9b0  run isort with suggested changes (ckadner, Sep 5, 2025)
4223ddf  reorganize imports as isort wants them (ckadner, Sep 5, 2025)
dd6bd49  CI: isort show suggested import changes (ckadner, Sep 5, 2025)
ca46ba8  update comments in config YAML (ckadner, Sep 5, 2025)
214a5ce  yapf (ckadner, Sep 5, 2025)
d67d7b1  run type-check with Python 3.10 by default (ckadner, Sep 5, 2025)
03c9c76  revert unrelated changes (ckadner, Sep 5, 2025)
5a67e9a  Merge branch 'main' into model_configs (ckadner, Sep 5, 2025)
6995cad  Merge branch 'main' into model_configs (ckadner, Sep 8, 2025)
12bb213  address review comments, add tests (ckadner, Sep 22, 2025)
654f480  Merge branch 'main' into model_configs (ckadner, Sep 24, 2025)
566ac37  lint (ckadner, Sep 24, 2025)
790de2f  remove f-strings from logging statements (ckadner, Sep 24, 2025)
de4544e  yapf is ruff (ckadner, Sep 24, 2025)
dca59ba  type-check (ckadner, Sep 24, 2025)
8a1205b  update supported configs (ckadner, Sep 25, 2025)
763a112  update supported parameters (ckadner, Sep 25, 2025)
b2f8649  Merge branch 'main' into model_configs (ckadner, Sep 29, 2025)
cbb7a1b  assert c.warmup_shapes is None if use_cb (ckadner, Sep 30, 2025)
cc0a393  update list of supported models (ckadner, Sep 30, 2025)
3f48a91  requested config `<=` supported config (ckadner, Sep 30, 2025)
c65aa9e  Validate that prompt + new_tokens <= max_model_len (ckadner, Sep 30, 2025)
437290a  type-check (ckadner, Sep 30, 2025)
4f7a804  remove option to error out on unsupported/unknown configuration (ckadner, Oct 1, 2025)
3970d84  remove configurations that are within the upper bound of another config (ckadner, Oct 4, 2025)
05405f4  verify config parameters adhere to restrictions (ckadner, Oct 4, 2025)
82ccf41  Merge branch 'main' into model_configs (ckadner, Oct 8, 2025)
1caae76  determine model from HF-config (config.json) (ckadner, Oct 11, 2025)
11 changes: 11 additions & 0 deletions docs/user_guide/supported_models.md
@@ -42,3 +42,14 @@ configurations.
[Granite-Embedding-278m (Multilingual)]: https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual
[BAAI/BGE-Reranker (v2-m3)]: https://huggingface.co/BAAI/bge-reranker-v2-m3
[BAAI/BGE-Reranker (Large)]: https://huggingface.co/BAAI/bge-reranker-large

## Runtime Validation

At runtime, the Spyre engine validates the requested model and its configuration against the
list of supported models and configurations defined in
<gh-file:vllm_spyre/config/supported_configs.yaml>. If the requested model or configuration
is not found in that list, a warning is logged.

```yaml
--8<-- "vllm_spyre/config/supported_configs.yaml:supported-model-runtime-configurations"
```
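
For illustration, a minimal sketch of listing the supported models from Python. It relies only on `get_supported_models_list`, which the new `tests/download_model_configs.py` below imports from `vllm_spyre.config.runtime_config_validator`; no other validator API is assumed here.

```python
# Minimal sketch: print the model IDs known to the runtime validator.
# `get_supported_models_list` is the helper added by this PR; the loop
# and print are plain Python.
from vllm_spyre.config.runtime_config_validator import (
    get_supported_models_list)

for model_id in get_supported_models_list():
    print(model_id)
```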
67 changes: 67 additions & 0 deletions tests/download_model_configs.py
@@ -0,0 +1,67 @@
import os
from pathlib import Path
from urllib.request import urlretrieve

from transformers import AutoConfig, PretrainedConfig

from vllm_spyre.config.runtime_config_validator import (
    get_supported_models_list)

_configs_path = Path(__file__).parent / "fixtures" / "model_configs"


def download_hf_model_config(hf_model_id: str,
                             revision: str = "main") -> PretrainedConfig:
    """
    Use CONFIG_MAPPING to match known patterns to the requested model ID. Does
    not work as reliably as a direct download from HF, though (e.g. the
    `transformers_version` field is filled in from the local installation).
    """
    model_config = AutoConfig.from_pretrained(hf_model_id, revision=revision)
    config_path = _configs_path / hf_model_id
    if revision != "main":
        config_path /= revision
    model_config.save_pretrained(config_path, safe_serialization=True)
    return model_config


def download_model_config_from_hf(hf_model_id: str, revision: str = "main"):
    """
    Download the model config.json directly from HuggingFace.
    """
    config_url = f"https://huggingface.co/{hf_model_id}/resolve/{revision}/config.json"
    config_path = _configs_path / hf_model_id
    if revision != "main":
        config_path /= revision
    os.makedirs(config_path, exist_ok=True)
    urlretrieve(config_url, config_path / "hf_config.json")


if __name__ == '__main__':
    model_ids = get_supported_models_list()
    for model_id in model_ids:
        # TODO: get the actual FP8 model config
        if "-FP8" in model_id:
            continue
        config = download_hf_model_config(model_id)
        # download_model_config_from_hf(model_id)
        print(f"model_id: {model_id}")
        print(os.linesep.join(str(config).split(os.linesep)[:4]))

    # model_id = "RedHatAI/granite-3.1-8b-instruct-FP8-dynamic"
    # revisions = ["main", "2f1a9431020bea1db9719c6c447a2267412b569a"]
    # # model_id = "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
    # # revisions = ["2714578f54cfb744ece40df9326ee0b47e879e03",
    # #              "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"]
    # model_id = "ibm-granite/granite-3.3-8b-instruct"
    # revisions = ["3efd179a48ad7cb28ccf46568985af8cf38cbba9",
    #              "de4d3920884ea7b8f8f276d8aa286f8d82afbb83"]
    # for revision in revisions:
    #     config = download_hf_model_config(model_id, revision)
    #     download_model_config_from_hf(model_id, revision)

    # TODO: make it a CLI script with parameters:
    #   --hf-model-id
    #   --revision
    #   --all-supported-models
    #   --output-dir [default: fixtures/model_configs/<hf-model-id>/config.json]
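
Usage note: given the `if __name__ == '__main__':` guard above, the script can be run directly, e.g. `python tests/download_model_configs.py` (running from the repository root is an assumption), which fetches a config for every non-FP8 model returned by `get_supported_models_list()` and stores it under `tests/fixtures/model_configs/<hf-model-id>/`.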
@@ -0,0 +1,33 @@
{
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
@@ -0,0 +1,33 @@
{
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8194,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
@@ -26,7 +26,7 @@
   "rope_scaling": null,
   "rope_theta": 10000000.0,
   "tie_word_embeddings": false,
-  "transformers_version": "4.56.1",
+  "transformers_version": "4.53.3",
   "use_cache": true,
   "vocab_size": 49159
 }
@@ -26,7 +26,7 @@
   "rope_theta": 10000000.0,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.49.0",
+  "transformers_version": "4.53.3",
   "use_cache": true,
   "vocab_size": 49159
 }
@@ -0,0 +1,27 @@
{
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}
@@ -0,0 +1,26 @@
{
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
@@ -0,0 +1,26 @@
{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}
29 changes: 18 additions & 11 deletions tests/models/test_granite.py
@@ -1,4 +1,5 @@
"""Tests for model-specific overrides for granite"""

import os
from pathlib import Path
from unittest import mock
@@ -8,7 +9,7 @@
 
 from vllm_spyre.platform import SpyrePlatform
 
-FIXTURES_PATH = Path(__file__).parent.parent / "fixtures" / "models"
+FIXTURES_PATH = Path(__file__).parent.parent / "fixtures" / "model_configs"
 
 NO_SWAP_CONFIG = CacheConfig(swap_space=0.001)
 
@@ -17,13 +18,17 @@
 def test_granite_3_8b_detection():
     """Check that we can detect the model config for granite 3 8b"""
 
-    granite_3_8b_config = VllmConfig(model_config=ModelConfig(
-        model=str(FIXTURES_PATH / "granite-3.3-8b-instruct-config-only")),
-                                     cache_config=NO_SWAP_CONFIG)
+    granite_3_8b_config = VllmConfig(
+        model_config=ModelConfig(model=str(FIXTURES_PATH / "ibm-granite" /
+                                           "granite-3.3-8b-instruct")),
+        cache_config=NO_SWAP_CONFIG,
+    )
 
-    granite_micro_config = VllmConfig(model_config=ModelConfig(
-        model=str(FIXTURES_PATH / "granite-3.3-micro-config-only")),
-                                      cache_config=NO_SWAP_CONFIG)
+    granite_micro_config = VllmConfig(
+        model_config=ModelConfig(model=str(FIXTURES_PATH / "ibm-ai-platform" /
+                                           "micro-g3.3-8b-instruct-1b")),
+        cache_config=NO_SWAP_CONFIG,
+    )
 
     assert SpyrePlatform.is_granite_3_8b(granite_3_8b_config.model_config)
 
@@ -38,10 +43,12 @@ def test_granite_3_8b_overrides():
     with mock.patch.dict(os.environ, clear=True):
         tp4_config = ParallelConfig(tensor_parallel_size=4)
 
-        granite_3_8b_config = VllmConfig(model_config=ModelConfig(
-            model=str(FIXTURES_PATH / "granite-3.3-8b-instruct-config-only")),
-                                         parallel_config=tp4_config,
-                                         cache_config=NO_SWAP_CONFIG)
+        granite_3_8b_config = VllmConfig(
+            model_config=ModelConfig(model=str(FIXTURES_PATH / "ibm-granite" /
+                                               "granite-3.3-8b-instruct")),
+            parallel_config=tp4_config,
+            cache_config=NO_SWAP_CONFIG,
+        )
 
         assert granite_3_8b_config.cache_config.num_gpu_blocks_override == 2080
 