Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
754cb0d
Vllm deploy on runpod
PatrickCmd Nov 14, 2025
c22048e
fix(config): update allowed cuda versions in hub and tests config (#236)
TimPietruskyRunPod Nov 14, 2025
67de76c
fix: remove space from gpuIds (#234)
TimPietruskyRunPod Nov 14, 2025
a88d20e
feat: bump transformers to allow Qwen3-VL (#225)
wwydmanski Nov 14, 2025
ee8eae2
add ENABLE_EXPERT_PARALLEL engine arg for MoE models (#239)
velaraptor-runpod Nov 17, 2025
458a269
chore(deps): update runpod to latest version (#242)
TimPietruskyRunPod Nov 24, 2025
1506060
sunflower-ultravox-vllm inference testing
PatrickCmd Nov 25, 2025
8a155ab
API testing
PatrickCmd Dec 1, 2025
3b2ea3c
Sunflower Ultravox deployment
PatrickCmd Dec 2, 2025
aa10d4a
init
huwenjie333 Dec 3, 2025
beb694b
Qwen3-8B-FP8
huwenjie333 Dec 3, 2025
95272fc
Sunflower-14B-FP8
huwenjie333 Dec 3, 2025
6362b00
default client script
huwenjie333 Dec 5, 2025
9700a58
sunflower client updates
huwenjie333 Dec 5, 2025
daacae9
deployed ultravox
huwenjie333 Dec 5, 2025
78f3b5a
readme
huwenjie333 Dec 5, 2025
4acbdd8
update temperature
huwenjie333 Dec 8, 2025
7952177
reorganize files
huwenjie333 Dec 15, 2025
a1616dc
Merge pull request #1 from SunbirdAI/modal-deploy
huwenjie333 Dec 15, 2025
672d266
init
huwenjie333 Dec 15, 2025
dfbd0de
update to deploy spark-tts-salt
huwenjie333 Dec 15, 2025
71ac7c9
getting 404 errors
huwenjie333 Dec 17, 2025
f28285b
time_to_first_token_seconds
huwenjie333 Dec 17, 2025
56a8135
fix errors
huwenjie333 Jan 5, 2026
5c65e62
update comments
huwenjie333 Jan 5, 2026
acd57b6
enforce_eager=False; latency loggings
huwenjie333 Jan 5, 2026
586d1eb
minor comments updates
huwenjie333 Jan 6, 2026
dc9ce99
Merge pull request #2 from SunbirdAI/spark-tts-salt
PatrickCmd Jan 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/CI-runpod_dep.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
types: [python-package-release]

push:
branches: ["main"]
branches: ["main", "runpod-deploy"]

workflow_dispatch:

Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ runpod.toml
.env
test/*
vllm-base/vllm-*
.DS_Store
.DS_Store
.ipynb_checkpoints
24 changes: 12 additions & 12 deletions .runpod/hub.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,9 @@
"config": {
"runsOn": "GPU",
"containerDiskInGb": 150,
"gpuIds": "ADA_80_PRO, AMPERE_80",
"gpuIds": "ADA_80_PRO,AMPERE_80",
"gpuCount": 1,
"allowedCudaVersions": [
"12.9",
"12.8",
"12.7",
"12.6",
"12.5",
"12.4",
"12.3",
"12.2",
"12.1"
],
"allowedCudaVersions": ["12.9", "12.8", "12.7", "12.6", "12.5", "12.4"],
"presets": [
{
"name": "deepseek-ai/deepseek-r1-distill-llama-8b",
Expand Down Expand Up @@ -939,6 +929,16 @@
"advanced": true
}
},
{
"key": "ENABLE_EXPERT_PARALLEL",
"input": {
"name": "Enable Expert Parallel",
"type": "boolean",
"description": "Enable Expert Parallel for MoE models",
"default": false,
"advanced": true
}
},
{
"key": "MODEL_REVISION",
"input": {
Expand Down
12 changes: 1 addition & 11 deletions .runpod/tests.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,6 @@
"value": "HuggingFaceTB/SmolLM2-135M-Instruct"
}
],
"allowedCudaVersions": [
"12.9",
"12.8",
"12.7",
"12.6",
"12.5",
"12.4",
"12.3",
"12.2",
"12.1"
]
"allowedCudaVersions": ["12.9", "12.8", "12.7", "12.6", "12.5"]
}
}
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade -r /requirements.txt

# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer
RUN python3 -m pip install vllm==0.11.0 && \
RUN python3 -m pip install vllm[audio]==0.11.0 && \
python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

# Setup for Option 2: Building the Image with the Model included
Expand Down
8 changes: 8 additions & 0 deletions bin/build
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

set -e

export $(grep -v '^#' .env | xargs)
export DOCKER_BUILDKIT=1

docker build -t sunbirddocker/sunbirdai-ultravox:v1.0.0 --secret id=HF_TOKEN --build-arg MODEL_NAME="jq/sunflower-ultravox-251111" .
5 changes: 5 additions & 0 deletions bin/push
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

set -e

docker push sunbirddocker/sunbirdai-ultravox:v1.0.0
4 changes: 2 additions & 2 deletions builder/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
ray
pandas
pyarrow
runpod~=1.7.7
runpod>=1.8,<2.0
huggingface-hub
packaging
typing-extensions>=4.8.0
pydantic
pydantic-settings
hf-transfer
transformers>=4.55.0
transformers>=4.57.0
bitsandbytes>=0.45.0
kernels
torch==2.6.0
1 change: 1 addition & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Complete guide to all environment variables and configuration options for worker
| `ENFORCE_EAGER` | False | `bool` | Always use eager-mode PyTorch. If False(`0`), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility. |
| `MAX_SEQ_LEN_TO_CAPTURE` | `8192` | `int` | Maximum context length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. |
| `DISABLE_CUSTOM_ALL_REDUCE` | `0` | `int` | Enables or disables custom all reduce. |
| `ENABLE_EXPERT_PARALLEL` | `False` | `bool` | Enable Expert Parallel for MoE models |

## Tokenizer Settings

Expand Down
41 changes: 41 additions & 0 deletions modal-deploy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Serverless Deployment with Modal
This README describes the steps for serverless model deployment on the Modal platform with the vLLM framework. The detailed documentation is available [here](https://modal.com/docs/examples/vllm_inference#build-a-vllm-engine-and-serve-it).

Comparison with our current deployment platform, RunPod:
| Feature | **RunPod (current)** | **Modal (new)** |
| :--- | :--- | :--- |
| **Support audio vLLM (e.g. Ultravox)** | No | **Yes** |
| **Costs (A100-80GB)** | $0.00076 / s | **$0.00069 / s** |
| **GPU availability** | low when using network volumes | **high** |
| **Deployment methods** | Docker container | **single python script** |
| **serverless cold start time** | 2-3 mins | 2-3 mins |

## Deployment steps
1. Register an account at https://modal.com/

2. add your HuggingFace secret in https://modal.com/secrets

3. install the Modal Python package, and create an API token.
```
pip install modal
modal setup
```

4. `vllm_inference.py` contains all the configuration for a deployment. Here are some important values that you should consider to modify:
- `uv_pip_install`: Python packages required
- `MODEL_NAME`: model name in HuggingFace
- `app = modal.App`: deployed model name in Modal platform
- `gpu=f"A100-80GB:{N_GPU}"`: the GPU type and number for deployment
- `scaledown_window`: how long should the instance stay up with no requests?
- `modal.Secret.from_name`: update your HuggingFace secret name if it is different.
- `def serve()`: update the vLLM commands if necessary

5. run `modal deploy vllm_inference.py` to deploy the model to Modal platform. You can view the deployment on https://modal.com/apps

6. test the deployed model with the client script, for example:
```
python client.py \
--app-name Sunflower32b-Ultravox \
--prompt "Translate to English: " \
--audio_file "../sunflower-ultravox-vllm/audios/context_eng_1.wav"
```
216 changes: 216 additions & 0 deletions modal-deploy/Sunflower32b-Ultravox/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
"""This simple script shows how to interact with an OpenAI-compatible server from a client."""

import argparse

import modal
from openai import OpenAI
import base64


class Colors:
    """ANSI terminal escape codes used to colorize CLI output."""

    GREEN = "\033[0;32m"  # echo of the user's prompt
    RED = "\033[0;31m"  # error messages
    BLUE = "\033[0;34m"  # model responses
    GRAY = "\033[0;90m"  # dimmed text (currently unused)
    BOLD = "\033[1m"  # emphasis / status lines
    END = "\033[0m"  # reset all attributes


def get_completion(client, model_id, messages, args):
    """Send a chat-completion request and return the API response.

    Sampling parameters left unset on the command line (i.e. ``None``)
    are omitted from the request so the server-side defaults apply.
    Returns ``None`` (after printing the error) if the call fails.
    """
    request = dict(
        model=model_id,
        messages=messages,
        frequency_penalty=args.frequency_penalty,
        max_tokens=args.max_tokens,
        n=args.n,
        presence_penalty=args.presence_penalty,
        seed=args.seed,
        stop=args.stop,
        stream=args.stream,
        temperature=args.temperature,
        top_p=args.top_p,
    )

    # Strip parameters the user did not provide.
    request = {key: value for key, value in request.items() if value is not None}

    try:
        return client.chat.completions.create(**request)
    except Exception as e:
        print(Colors.RED, f"Error during API call: {e}", Colors.END, sep="")
        return None


def main():
    """CLI entry point.

    Builds the Modal base URL for an OpenAI-compatible vLLM server, picks a
    model (explicit ``--model`` or the first one the server advertises),
    optionally attaches a base64-encoded audio file to the user message, and
    prints the (streamed or complete) chat completion.
    """
    parser = argparse.ArgumentParser(description="OpenAI Client CLI")

    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="The model to use for completion, defaults to the first available model",
    )
    parser.add_argument(
        "--workspace",
        type=str,
        default=None,
        help="The workspace where the LLM server app is hosted, defaults to your current Modal workspace",
    )
    parser.add_argument(
        "--environment",
        type=str,
        default=None,
        help="The environment in your Modal workspace where the LLM server app is hosted, defaults to your current environment",
    )
    parser.add_argument(
        "--app-name",
        type=str,
        default="Sunflower32b-Ultravox",
        help="A Modal App serving an OpenAI-compatible API",
    )
    parser.add_argument(
        "--function-name",
        type=str,
        default="serve",
        help="A Modal Function serving an OpenAI-compatible API. Append `-dev` to use a `modal serve`d Function.",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default="super-secret-key",
        help="The API key to use for authentication, set in your api.py",
    )

    # Completion parameters
    parser.add_argument("--max-tokens", type=int, default=None)
    parser.add_argument("--temperature", type=float, default=0.6)
    parser.add_argument("--top-p", type=float, default=0.9)
    # NOTE(review): --top-k is accepted for CLI compatibility but is not part
    # of the OpenAI chat.completions API and is never forwarded to the server.
    parser.add_argument("--top-k", type=int, default=0)
    parser.add_argument("--frequency-penalty", type=float, default=0)
    parser.add_argument("--presence-penalty", type=float, default=0)
    parser.add_argument(
        "--n",
        type=int,
        default=1,
        help="Number of completions to generate. Streaming and chat mode only support n=1.",
    )
    parser.add_argument("--stop", type=str, default=None)
    parser.add_argument("--seed", type=int, default=None)

    # Prompting
    parser.add_argument(
        "--prompt",
        type=str,
        default="Translate to English: ",
        help="The user prompt for the chat completion",
    )
    parser.add_argument(
        "--system-prompt",
        type=str,
        default="You are Sunflower, a helpful assistant made by Sunbird AI who understands all Ugandan languages. You specialise in accurate translations, explanations, summaries and other language tasks.",
        help="The system prompt for the chat completion",
    )
    parser.add_argument(
        "--audio_file",
        type=str,
        default="../sunflower-ultravox-vllm/audios/kibuuka_eng.mp3",
        help="input audio file for the model.",
    )

    # UI options
    parser.add_argument(
        "--no-stream",
        dest="stream",
        action="store_false",
        help="Disable streaming of response chunks",
    )

    args = parser.parse_args()

    client = OpenAI(api_key=args.api_key)

    # NOTE(review): modal.config._profile is a private attribute of the Modal
    # SDK -- confirm whether a public accessor exists before depending on it.
    workspace = args.workspace or modal.config._profile

    environment = args.environment or modal.config.config["environment"]

    prefix = workspace + (f"-{environment}" if environment else "")

    # Modal exposes deployed web endpoints at
    # https://<workspace>[-<environment>]--<app>-<function>.modal.run
    client.base_url = (
        f"https://{prefix}--{args.app_name}-{args.function_name}.modal.run/v1"
    )

    if args.model:
        model_id = args.model
        print(
            Colors.BOLD,
            f"🧠: Using model {model_id}. This may trigger a model load on first call!",
            Colors.END,
            sep="",
        )
    else:
        print(
            Colors.BOLD,
            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a model load!",
            Colors.END,
            sep="",
        )
        model = client.models.list().data[0]
        model_id = model.id
        print(
            Colors.BOLD,
            f"🧠: Using {model_id}",
            Colors.END,
            sep="",
        )

    messages = [
        {
            "role": "system",
            "content": args.system_prompt,
        }
    ]

    print(Colors.BOLD + "🧠: Using system prompt: " + args.system_prompt + Colors.END)

    if args.audio_file:
        with open(args.audio_file, "rb") as f:
            audio_bytes = f.read()
        audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
        # BUG FIX: the format was previously hardcoded to "wav" even though
        # the default --audio_file is an .mp3; derive it from the extension so
        # the server decodes the audio with the right codec. Unknown
        # extensions fall back to "wav" (the previous behavior).
        ext = args.audio_file.rpartition(".")[2].lower()
        audio_format = ext if ext in {"wav", "mp3", "flac", "ogg"} else "wav"
        content = [
            {
                "type": "text",
                "text": args.prompt,
            },
            {
                "type": "input_audio",
                "input_audio": {"data": audio_b64, "format": audio_format},
            },
        ]
    else:
        content = args.prompt

    messages.append({"role": "user", "content": content})
    print(Colors.GREEN + f"\nYou: {args.prompt}" + Colors.END)
    response = get_completion(client, model_id, messages, args)
    if response:
        if args.stream:
            print(Colors.BLUE + "\n🤖:", end="")
            for chunk in response:
                if chunk.choices[0].delta.content:
                    print(chunk.choices[0].delta.content, end="")
            print(Colors.END)
        else:
            # Non-streaming is the only case where multiple completions (n > 1)
            # are returned. Use a distinct name so the outer `response` is not
            # shadowed inside the loop.
            for i, choice in enumerate(response.choices):
                print(
                    Colors.BLUE
                    + f"\n🤖 Choice {i + 1}:{choice.message.content}"
                    + Colors.END,
                    sep="",
                )


# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()
Loading
Loading