fix: Extend add_tensor_model so that ModelDeploymentCard can be correctly picked up #4169
Merged: rmccorm4 merged 14 commits into main from zhongdaor/dis-982-support-modelconfig-with-kserve-grpc-python-bindings on Nov 10, 2025.
Commits (14 total; changes shown from 12 commits):
- cbbf7f2 feat: expose python api to save/remove/list ModelDeploymentCard (zhongdaor-nv)
- f3f6bc3 use runtime_config rather than ModelDeploymentCard (zhongdaor-nv)
- a572af9 revert model_card.rs to main (zhongdaor-nv)
- afd4c43 add tests (zhongdaor-nv)
- dcc356a make test looks better (zhongdaor-nv)
- 8c3f15a pre-commit (zhongdaor-nv)
- a60d41f Merge branch 'main' into zhongdaor/dis-982-support-modelconfig-with-k… (zhongdaor-nv)
- c46c335 pre-commit (zhongdaor-nv)
- e58832d Merge branch 'main' into zhongdaor/dis-982-support-modelconfig-with-k… (rmccorm4)
- 1120e80 Merge branch 'main' into zhongdaor/dis-982-support-modelconfig-with-k… (rmccorm4)
- ae5a99e Merge branch 'main' into zhongdaor/dis-982-support-modelconfig-with-k… (zhongdaor-nv)
- 192afe8 Merge branch 'main' into zhongdaor/dis-982-support-modelconfig-with-k… (rmccorm4)
- 44348b7 import triton gRPC client in tests hope to prevent CI error due to gR… (zhongdaor-nv)
- b4538a6 pre-commit (zhongdaor-nv)
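The commit sequence shows the design settling on passing a `ModelRuntimeConfig` into `add_tensor_model`, rather than handling a `ModelDeploymentCard` directly. Below is a minimal sketch of the resulting registration flow, assuming only the `dynamo.llm` API surface exercised by the test file further down; the model name, port, and checksum values are illustrative:

```python
import asyncio

from dynamo.llm import KserveGrpcService, ModelRuntimeConfig, PythonAsyncEngine


def echo(request, context=None):
    # Toy engine: stream the request tensors back in a single response.
    async def _gen():
        yield {"tensors": request.get("tensors", [])}

    return _gen()


async def main():
    # Describe the model's I/O so the KServe gRPC ModelConfig endpoint
    # has something to report back to clients.
    runtime_config = ModelRuntimeConfig()
    runtime_config.set_tensor_model_config(
        {
            "name": "my-tensor-model",  # illustrative name
            "inputs": [{"name": "input_text", "data_type": "Bytes", "shape": [-1]}],
            "outputs": [{"name": "results", "data_type": "Bytes", "shape": [-1]}],
        }
    )

    engine = PythonAsyncEngine(echo, asyncio.get_running_loop())
    service = KserveGrpcService(port=8787, host="127.0.0.1")
    # The extended call: the runtime_config passed here is what the
    # GetModelConfig endpoint picks up for this model.
    service.add_tensor_model(
        "my-tensor-model", "dummy-mdcsum", engine, runtime_config=runtime_config
    )
    # Actually serving requires a runtime cancellation token, as in the
    # test fixture below: await service.run(cancel_token)


asyncio.run(main())
```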
The diff adds a single new 145-line test file:

```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import asyncio
import contextlib
from contextlib import asynccontextmanager
from typing import Any, AsyncIterator, Optional, Tuple

import pytest
import tritonclient.grpc as grpcclient
import tritonclient.grpc.model_config_pb2 as mc
from tritonclient.utils import InferenceServerException

from dynamo.llm import KserveGrpcService, ModelRuntimeConfig, PythonAsyncEngine

pytestmark = pytest.mark.pre_merge


async def _fetch_model_config(
    client: grpcclient.InferenceServerClient,
    model_name: str,
    retries: int = 30,
) -> Any:
    last_error: Optional[Exception] = None
    for _ in range(retries):
        try:
            return await asyncio.to_thread(client.get_model_config, model_name)
        except InferenceServerException as err:
            last_error = err
            await asyncio.sleep(0.1)
    raise AssertionError(
        f"Unable to fetch model config for '{model_name}': {last_error}"
    )


class EchoTensorEngine:
    """Minimal tensor engine stub for registering tensor models."""

    def __init__(self, model_name: str):
        self._model_name = model_name

    def generate(self, request, context=None):
        async def _generator():
            yield {
                "model": self._model_name,
                "tensors": request.get("tensors", []),
                "parameters": request.get("parameters", {}),
            }

        return _generator()


@pytest.fixture
def tensor_service(runtime):
    @asynccontextmanager
    async def _start(
        model_name: str,
        *,
        runtime_config: Optional[ModelRuntimeConfig] = None,
        checksum: str = "dummy-mdcsum",
    ) -> AsyncIterator[Tuple[str, int]]:
        host = "127.0.0.1"
        port = 8787
        loop = asyncio.get_running_loop()
        engine = PythonAsyncEngine(EchoTensorEngine(model_name).generate, loop)
        tensor_model_service = KserveGrpcService(port=port, host=host)

        tensor_model_service.add_tensor_model(
            model_name, checksum, engine, runtime_config=runtime_config
        )

        cancel_token = runtime.child_token()

        async def _serve():
            await tensor_model_service.run(cancel_token)

        server_task = asyncio.create_task(_serve())
        try:
            await asyncio.sleep(1)  # wait for the service to start
            yield host, port
        finally:
            cancel_token.cancel()
            with contextlib.suppress(asyncio.TimeoutError, asyncio.CancelledError):
                await asyncio.wait_for(server_task, timeout=5)

    return _start


@pytest.mark.asyncio
async def test_model_config_uses_runtime_config(tensor_service):
    """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
    model_name = "tensor-config-model"
    tensor_config = {
        "name": model_name,
        "inputs": [
            {"name": "input_text", "data_type": "Bytes", "shape": [-1]},
            {"name": "control_flag", "data_type": "Bool", "shape": [1]},
        ],
        "outputs": [
            {"name": "results", "data_type": "Bytes", "shape": [-1]},
        ],
    }
    runtime_config = ModelRuntimeConfig()
    runtime_config.set_tensor_model_config(tensor_config)

    async with tensor_service(model_name, runtime_config=runtime_config) as (
        host,
        port,
    ):
        client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
        try:
            response = await _fetch_model_config(client, model_name)
        finally:
            client.close()

    model_config = response.config
    assert model_config.name == model_name
    assert model_config.platform == "dynamo"
    assert model_config.backend == "dynamo"

    inputs = {spec.name: spec for spec in model_config.input}
    assert list(inputs["input_text"].dims) == [-1]
    assert inputs["input_text"].data_type == mc.TYPE_STRING
    assert list(inputs["control_flag"].dims) == [1]
    assert inputs["control_flag"].data_type == mc.TYPE_BOOL

    outputs = {spec.name: spec for spec in model_config.output}
    assert list(outputs["results"].dims) == [-1]
    assert outputs["results"].data_type == mc.TYPE_STRING


@pytest.mark.asyncio
async def test_model_config_missing_runtime_config_errors(tensor_service):
    """ModelConfig should return NOT_FOUND when no tensor runtime_config is saved."""
    model_name = "tensor-config-missing"

    async with tensor_service(model_name, runtime_config=None) as (host, port):
        client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
        try:
            with pytest.raises(InferenceServerException) as excinfo:
                await asyncio.to_thread(client.get_model_config, model_name)
        finally:
            client.close()

    assert "not found" in str(excinfo.value).lower()
```