Commits (34)
3691629  Manage supported model configurations (ckadner, Sep 5, 2025)
1b80333  Reorganize import statements (ckadner, Sep 5, 2025)
3d8ad48  Lint docs (ckadner, Sep 5, 2025)
c8ce7da  use 'x86_64' instead of 'amd64' (ckadner, Sep 5, 2025)
4decb48  typecheck updates (ckadner, Sep 5, 2025)
09bde9e  more typecheck updates (ckadner, Sep 5, 2025)
564d9b0  run isort with suggested changes (ckadner, Sep 5, 2025)
4223ddf  reorganize imports as isort wants them (ckadner, Sep 5, 2025)
dd6bd49  CI: isort show suggested import changes (ckadner, Sep 5, 2025)
ca46ba8  update comments in config YAML (ckadner, Sep 5, 2025)
214a5ce  yapf (ckadner, Sep 5, 2025)
d67d7b1  run type-check with Python 3.10 by default (ckadner, Sep 5, 2025)
03c9c76  revert unrelated changes (ckadner, Sep 5, 2025)
5a67e9a  Merge branch 'main' into model_configs (ckadner, Sep 5, 2025)
6995cad  Merge branch 'main' into model_configs (ckadner, Sep 8, 2025)
12bb213  address review comments, add tests (ckadner, Sep 22, 2025)
654f480  Merge branch 'main' into model_configs (ckadner, Sep 24, 2025)
566ac37  lint (ckadner, Sep 24, 2025)
790de2f  remove f-strings from logging statements (ckadner, Sep 24, 2025)
de4544e  yapf is ruff (ckadner, Sep 24, 2025)
dca59ba  type-check (ckadner, Sep 24, 2025)
8a1205b  update supported configs (ckadner, Sep 25, 2025)
763a112  update supported parameters (ckadner, Sep 25, 2025)
b2f8649  Merge branch 'main' into model_configs (ckadner, Sep 29, 2025)
cbb7a1b  assert c.warmup_shapes is None if use_cb (ckadner, Sep 30, 2025)
cc0a393  update list of supported models (ckadner, Sep 30, 2025)
3f48a91  requested config `<=` supported config (ckadner, Sep 30, 2025)
c65aa9e  Validate that prompt + new_tokens <= max_model_len (ckadner, Sep 30, 2025)
437290a  type-check (ckadner, Sep 30, 2025)
4f7a804  remove option to error out on unsupported/unknown configuration (ckadner, Oct 1, 2025)
3970d84  remove configurations that are within the upper bound of another config (ckadner, Oct 4, 2025)
05405f4  verify config parameters adhere to restrictions (ckadner, Oct 4, 2025)
82ccf41  Merge branch 'main' into model_configs (ckadner, Oct 8, 2025)
1caae76  determine model from HF-config (config.json) (ckadner, Oct 11, 2025)
11 changes: 11 additions & 0 deletions docs/user_guide/supported_models.md
@@ -42,3 +42,14 @@ configurations.
[Granite-Embedding-278m (Multilingual)]: https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual
[BAAI/BGE-Reranker (v2-m3)]: https://huggingface.co/BAAI/bge-reranker-v2-m3
[BAAI/BGE-Reranker (Large)]: https://huggingface.co/BAAI/bge-reranker-large

## Runtime Validation

At runtime, the Spyre engine validates the requested model and its configuration against the
list of supported models and configurations defined in
<gh-file:vllm_spyre/config/supported_configs.yaml>. If the requested model or configuration
is not found in that list, a warning is logged.

```yaml
--8<-- "vllm_spyre/config/supported_configs.yaml:supported-model-runtime-configurations"
```
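
For illustration, a minimal sketch of listing the supported models from Python. It relies only on `get_supported_models_list`, which the new `tests/download_model_configs.py` below imports from `vllm_spyre.config.runtime_config_validator`; no other validator API is assumed here.

```python
# Minimal sketch: print the model IDs known to the runtime validator.
# `get_supported_models_list` is the helper added by this PR; the loop
# and print are plain Python.
from vllm_spyre.config.runtime_config_validator import (
    get_supported_models_list)

for model_id in get_supported_models_list():
    print(model_id)
```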
67 changes: 67 additions & 0 deletions tests/download_model_configs.py
@@ -0,0 +1,67 @@
import os
from pathlib import Path
from urllib.request import urlretrieve

from transformers import AutoConfig, PretrainedConfig

from vllm_spyre.config.runtime_config_validator import (
    get_supported_models_list)

_configs_path = Path(__file__).parent / "fixtures" / "model_configs"


def download_hf_model_config(hf_model_id: str,
                             revision: str = "main") -> PretrainedConfig:
    """
    Use CONFIG_MAPPING to match known patterns to the requested model ID. Does
    not work as reliably as a direct download from HF, though (e.g. the
    `transformers_version` field is filled in from the local installation).
    """
    model_config = AutoConfig.from_pretrained(hf_model_id, revision=revision)
    config_path = _configs_path / hf_model_id
    if revision != "main":
        config_path /= revision
    model_config.save_pretrained(config_path, safe_serialization=True)
    return model_config


def download_model_config_from_hf(hf_model_id: str, revision: str = "main"):
    """
    Download the model config.json directly from HuggingFace.
    """
    config_url = f"https://huggingface.co/{hf_model_id}/resolve/{revision}/config.json"
    config_path = _configs_path / hf_model_id
    if revision != "main":
        config_path /= revision
    os.makedirs(config_path, exist_ok=True)
    urlretrieve(config_url, config_path / "hf_config.json")


if __name__ == '__main__':
    model_ids = get_supported_models_list()
    for model_id in model_ids:
        # TODO: get the actual FP8 model config
        if "-FP8" in model_id:
            continue
        config = download_hf_model_config(model_id)
        # download_model_config_from_hf(model_id)
        print(f"model_id: {model_id}")
        print(os.linesep.join(str(config).split(os.linesep)[:4]))

    # model_id = "RedHatAI/granite-3.1-8b-instruct-FP8-dynamic"
    # revisions = ["main", "2f1a9431020bea1db9719c6c447a2267412b569a"]
    # # model_id = "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
    # # revisions = ["2714578f54cfb744ece40df9326ee0b47e879e03",
    # #              "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"]
    # model_id = "ibm-granite/granite-3.3-8b-instruct"
    # revisions = ["3efd179a48ad7cb28ccf46568985af8cf38cbba9",
    #              "de4d3920884ea7b8f8f276d8aa286f8d82afbb83"]
    # for revision in revisions:
    #     config = download_hf_model_config(model_id, revision)
    #     download_model_config_from_hf(model_id, revision)

    # TODO: make it a CLI script with parameters:
    #   --hf-model-id
    #   --revision
    #   --all-supported-models
    #   --output-dir [default: fixtures/model_configs/<hf-model-id>/config.json]
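
Usage note: given the `if __name__ == '__main__':` guard above, the script can be run directly, e.g. `python tests/download_model_configs.py` (running from the repository root is an assumption), which fetches a config for every non-FP8 model returned by `get_supported_models_list()` and stores it under `tests/fixtures/model_configs/<hf-model-id>/`.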
@@ -0,0 +1,33 @@
{
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
@@ -0,0 +1,33 @@
{
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8194,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
@@ -26,7 +26,7 @@
   "rope_scaling": null,
   "rope_theta": 10000000.0,
   "tie_word_embeddings": false,
-  "transformers_version": "4.56.1",
+  "transformers_version": "4.53.3",
   "use_cache": true,
   "vocab_size": 49159
 }
@@ -26,7 +26,7 @@
   "rope_theta": 10000000.0,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.49.0",
+  "transformers_version": "4.53.3",
   "use_cache": true,
   "vocab_size": 49159
 }
@@ -0,0 +1,27 @@
{
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}
@@ -0,0 +1,26 @@
{
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
@@ -0,0 +1,26 @@
{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}
29 changes: 18 additions & 11 deletions tests/models/test_granite.py
@@ -1,4 +1,5 @@
"""Tests for model-specific overrides for granite"""

import os
from pathlib import Path
from unittest import mock
@@ -8,7 +9,7 @@
 
 from vllm_spyre.platform import SpyrePlatform
 
-FIXTURES_PATH = Path(__file__).parent.parent / "fixtures" / "models"
+FIXTURES_PATH = Path(__file__).parent.parent / "fixtures" / "model_configs"
 
 NO_SWAP_CONFIG = CacheConfig(swap_space=0.001)
 
@@ -17,13 +18,17 @@
 def test_granite_3_8b_detection():
     """Check that we can detect the model config for granite 3 8b"""
 
-    granite_3_8b_config = VllmConfig(model_config=ModelConfig(
-        model=str(FIXTURES_PATH / "granite-3.3-8b-instruct-config-only")),
-                                     cache_config=NO_SWAP_CONFIG)
+    granite_3_8b_config = VllmConfig(
+        model_config=ModelConfig(model=str(FIXTURES_PATH / "ibm-granite" /
+                                           "granite-3.3-8b-instruct")),
+        cache_config=NO_SWAP_CONFIG,
+    )
 
-    granite_micro_config = VllmConfig(model_config=ModelConfig(
-        model=str(FIXTURES_PATH / "granite-3.3-micro-config-only")),
-                                      cache_config=NO_SWAP_CONFIG)
+    granite_micro_config = VllmConfig(
+        model_config=ModelConfig(model=str(FIXTURES_PATH / "ibm-ai-platform" /
+                                           "micro-g3.3-8b-instruct-1b")),
+        cache_config=NO_SWAP_CONFIG,
+    )
 
     assert SpyrePlatform.is_granite_3_8b(granite_3_8b_config.model_config)
 
@@ -38,10 +43,12 @@ def test_granite_3_8b_overrides():
     with mock.patch.dict(os.environ, clear=True):
         tp4_config = ParallelConfig(tensor_parallel_size=4)
 
-        granite_3_8b_config = VllmConfig(model_config=ModelConfig(
-            model=str(FIXTURES_PATH / "granite-3.3-8b-instruct-config-only")),
-                                         parallel_config=tp4_config,
-                                         cache_config=NO_SWAP_CONFIG)
+        granite_3_8b_config = VllmConfig(
+            model_config=ModelConfig(model=str(FIXTURES_PATH / "ibm-granite" /
+                                               "granite-3.3-8b-instruct")),
+            parallel_config=tp4_config,
+            cache_config=NO_SWAP_CONFIG,
+        )
 
         assert granite_3_8b_config.cache_config.num_gpu_blocks_override == 2080
 