Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
754cb0d
Vllm deploy on runpod
PatrickCmd Nov 14, 2025
c22048e
fix(config): update allowed cuda versions in hub and tests config (#236)
TimPietruskyRunPod Nov 14, 2025
67de76c
fix: remove space from gpuIds (#234)
TimPietruskyRunPod Nov 14, 2025
a88d20e
feat: bump transformers to allow Qwen3-VL (#225)
wwydmanski Nov 14, 2025
ee8eae2
add ENABLE_EXPERT_PARALLEL engine arg for MoE models (#239)
velaraptor-runpod Nov 17, 2025
458a269
chore(deps): update runpod to latest version (#242)
TimPietruskyRunPod Nov 24, 2025
1506060
sunflower-ultravox-vllm inference testing
PatrickCmd Nov 25, 2025
8a155ab
API testing
PatrickCmd Dec 1, 2025
3b2ea3c
Sunflower Ultravox deployment
PatrickCmd Dec 2, 2025
aa10d4a
init
huwenjie333 Dec 3, 2025
beb694b
Qwen3-8B-FP8
huwenjie333 Dec 3, 2025
95272fc
Sunflower-14B-FP8
huwenjie333 Dec 3, 2025
6362b00
default client script
huwenjie333 Dec 5, 2025
9700a58
sunflower client updates
huwenjie333 Dec 5, 2025
daacae9
deployed ultravox
huwenjie333 Dec 5, 2025
78f3b5a
readme
huwenjie333 Dec 5, 2025
4acbdd8
update temperature
huwenjie333 Dec 8, 2025
7952177
reorganize files
huwenjie333 Dec 15, 2025
a1616dc
Merge pull request #1 from SunbirdAI/modal-deploy
huwenjie333 Dec 15, 2025
672d266
init
huwenjie333 Dec 15, 2025
dfbd0de
update to deploy spark-tts-salt
huwenjie333 Dec 15, 2025
71ac7c9
getting 404 errors
huwenjie333 Dec 17, 2025
f28285b
time_to_first_token_seconds
huwenjie333 Dec 17, 2025
56a8135
fix errors
huwenjie333 Jan 5, 2026
5c65e62
update comments
huwenjie333 Jan 5, 2026
acd57b6
enforce_eager=False; latency loggings
huwenjie333 Jan 5, 2026
586d1eb
minor comments updates
huwenjie333 Jan 6, 2026
dc9ce99
Merge pull request #2 from SunbirdAI/spark-tts-salt
PatrickCmd Jan 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/CI-runpod_dep.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
types: [python-package-release]

push:
branches: ["main"]
branches: ["main", "runpod-deploy"]

workflow_dispatch:

Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ runpod.toml
.env
test/*
vllm-base/vllm-*
.DS_Store
.DS_Store
.ipynb_checkpoints
24 changes: 12 additions & 12 deletions .runpod/hub.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,9 @@
"config": {
"runsOn": "GPU",
"containerDiskInGb": 150,
"gpuIds": "ADA_80_PRO, AMPERE_80",
"gpuIds": "ADA_80_PRO,AMPERE_80",
"gpuCount": 1,
"allowedCudaVersions": [
"12.9",
"12.8",
"12.7",
"12.6",
"12.5",
"12.4",
"12.3",
"12.2",
"12.1"
],
"allowedCudaVersions": ["12.9", "12.8", "12.7", "12.6", "12.5", "12.4"],
"presets": [
{
"name": "deepseek-ai/deepseek-r1-distill-llama-8b",
Expand Down Expand Up @@ -939,6 +929,16 @@
"advanced": true
}
},
{
"key": "ENABLE_EXPERT_PARALLEL",
"input": {
"name": "Enable Expert Parallel",
"type": "boolean",
"description": "Enable Expert Parallel for MoE models",
"default": false,
"advanced": true
}
},
{
"key": "MODEL_REVISION",
"input": {
Expand Down
12 changes: 1 addition & 11 deletions .runpod/tests.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,6 @@
"value": "HuggingFaceTB/SmolLM2-135M-Instruct"
}
],
"allowedCudaVersions": [
"12.9",
"12.8",
"12.7",
"12.6",
"12.5",
"12.4",
"12.3",
"12.2",
"12.1"
]
"allowedCudaVersions": ["12.9", "12.8", "12.7", "12.6", "12.5"]
}
}
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade -r /requirements.txt

# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer
RUN python3 -m pip install vllm==0.11.0 && \
RUN python3 -m pip install vllm[audio]==0.11.0 && \
python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

# Setup for Option 2: Building the Image with the Model included
Expand Down
8 changes: 8 additions & 0 deletions bin/build
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

set -e

export $(grep -v '^#' .env | xargs)
export DOCKER_BUILDKIT=1

docker build -t sunbirddocker/sunbirdai-ultravox:v1.0.0 --secret id=HF_TOKEN --build-arg MODEL_NAME="jq/sunflower-ultravox-251111" .
5 changes: 5 additions & 0 deletions bin/push
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

set -e

docker push sunbirddocker/sunbirdai-ultravox:v1.0.0
4 changes: 2 additions & 2 deletions builder/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
ray
pandas
pyarrow
runpod~=1.7.7
runpod>=1.8,<2.0
huggingface-hub
packaging
typing-extensions>=4.8.0
pydantic
pydantic-settings
hf-transfer
transformers>=4.55.0
transformers>=4.57.0
bitsandbytes>=0.45.0
kernels
torch==2.6.0
1 change: 1 addition & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Complete guide to all environment variables and configuration options for worker
| `ENFORCE_EAGER` | False | `bool` | Always use eager-mode PyTorch. If False(`0`), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility. |
| `MAX_SEQ_LEN_TO_CAPTURE` | `8192` | `int` | Maximum context length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. |
| `DISABLE_CUSTOM_ALL_REDUCE` | `0` | `int` | Enables or disables custom all reduce. |
| `ENABLE_EXPERT_PARALLEL` | `False` | `bool` | Enable Expert Parallel for MoE models |

## Tokenizer Settings

Expand Down
41 changes: 41 additions & 0 deletions modal-deploy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Serverless Deployment with Modal
This README describes the steps for serverless model deployment on the Modal platform with the vLLM framework. The detailed documentation is available [here](https://modal.com/docs/examples/vllm_inference#build-a-vllm-engine-and-serve-it).

Comparison with our current deployment platform, RunPod:
| Feature | **RunPod (current)** | **Modal (new)** |
| :--- | :--- | :--- |
| **Support audio vLLM (e.g. Ultravox)** | No | **Yes** |
| **Costs (A100-80GB)** | $0.00076 / s | **$0.00069 / s** |
| **GPU availability** | low when using network volumes | **high** |
| **Deployment methods** | Docker container | **single python script** |
| **serverless cold start time** | 2-3 mins | 2-3 mins |

## Deployment steps
1. Register an account at https://modal.com/

2. add your HuggingFace secret in https://modal.com/secrets

3. install the Modal Python package, and create an API token.
```
pip install modal
modal setup
```

4. `vllm_inference.py` contains all the configuration for a deployment. Here are some important values that you should consider to modify:
- `uv_pip_install`: Python packages required
- `MODEL_NAME`: model name in HuggingFace
- `app = modal.App`: deployed model name in Modal platform
- `gpu=f"A100-80GB:{N_GPU}"`: the GPU type and number for deployment
- `scaledown_window`: how long should the instance stay up with no requests?
- `modal.Secret.from_name`: update your HuggingFace secret name if it is different.
- `def serve()`: update the vLLM commands if necessary

5. run `modal deploy vllm_inference.py` to deploy the model to Modal platform. You can view the deployment on https://modal.com/apps

6. test the deployed model with the client script, for example:
```
python client.py \
--app-name Sunflower32b-Ultravox \
--prompt "Translate to English: " \
--audio_file "../sunflower-ultravox-vllm/audios/context_eng_1.wav"
```
216 changes: 216 additions & 0 deletions modal-deploy/Sunflower32b-Ultravox/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
"""This simple script shows how to interact with an OpenAI-compatible server from a client."""

import argparse

import modal
from openai import OpenAI
import base64


class Colors:
    """ANSI terminal escape codes used to colorize CLI output."""

    GREEN = "\033[0;32m"  # echo of the user's prompt
    RED = "\033[0;31m"  # error messages
    BLUE = "\033[0;34m"  # model responses
    GRAY = "\033[0;90m"  # dimmed text (currently unused)
    BOLD = "\033[1m"  # emphasis / status lines
    END = "\033[0m"  # reset all attributes


def get_completion(client, model_id, messages, args):
    """Send a chat-completion request and return the API response.

    Sampling parameters left unset on the command line (i.e. ``None``)
    are omitted from the request so the server-side defaults apply.
    Returns ``None`` (after printing the error) if the call fails.
    """
    request = dict(
        model=model_id,
        messages=messages,
        frequency_penalty=args.frequency_penalty,
        max_tokens=args.max_tokens,
        n=args.n,
        presence_penalty=args.presence_penalty,
        seed=args.seed,
        stop=args.stop,
        stream=args.stream,
        temperature=args.temperature,
        top_p=args.top_p,
    )

    # Strip parameters the user did not provide.
    request = {key: value for key, value in request.items() if value is not None}

    try:
        return client.chat.completions.create(**request)
    except Exception as e:
        print(Colors.RED, f"Error during API call: {e}", Colors.END, sep="")
        return None


def main():
    """CLI entry point.

    Builds the Modal base URL for an OpenAI-compatible vLLM server, picks a
    model (explicit ``--model`` or the first one the server advertises),
    optionally attaches a base64-encoded audio file to the user message, and
    prints the (streamed or complete) chat completion.
    """
    parser = argparse.ArgumentParser(description="OpenAI Client CLI")

    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="The model to use for completion, defaults to the first available model",
    )
    parser.add_argument(
        "--workspace",
        type=str,
        default=None,
        help="The workspace where the LLM server app is hosted, defaults to your current Modal workspace",
    )
    parser.add_argument(
        "--environment",
        type=str,
        default=None,
        help="The environment in your Modal workspace where the LLM server app is hosted, defaults to your current environment",
    )
    parser.add_argument(
        "--app-name",
        type=str,
        default="Sunflower32b-Ultravox",
        help="A Modal App serving an OpenAI-compatible API",
    )
    parser.add_argument(
        "--function-name",
        type=str,
        default="serve",
        help="A Modal Function serving an OpenAI-compatible API. Append `-dev` to use a `modal serve`d Function.",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default="super-secret-key",
        help="The API key to use for authentication, set in your api.py",
    )

    # Completion parameters
    parser.add_argument("--max-tokens", type=int, default=None)
    parser.add_argument("--temperature", type=float, default=0.6)
    parser.add_argument("--top-p", type=float, default=0.9)
    # NOTE(review): --top-k is accepted for CLI compatibility but is not part
    # of the OpenAI chat.completions API and is never forwarded to the server.
    parser.add_argument("--top-k", type=int, default=0)
    parser.add_argument("--frequency-penalty", type=float, default=0)
    parser.add_argument("--presence-penalty", type=float, default=0)
    parser.add_argument(
        "--n",
        type=int,
        default=1,
        help="Number of completions to generate. Streaming and chat mode only support n=1.",
    )
    parser.add_argument("--stop", type=str, default=None)
    parser.add_argument("--seed", type=int, default=None)

    # Prompting
    parser.add_argument(
        "--prompt",
        type=str,
        default="Translate to English: ",
        help="The user prompt for the chat completion",
    )
    parser.add_argument(
        "--system-prompt",
        type=str,
        default="You are Sunflower, a helpful assistant made by Sunbird AI who understands all Ugandan languages. You specialise in accurate translations, explanations, summaries and other language tasks.",
        help="The system prompt for the chat completion",
    )
    parser.add_argument(
        "--audio_file",
        type=str,
        default="../sunflower-ultravox-vllm/audios/kibuuka_eng.mp3",
        help="input audio file for the model.",
    )

    # UI options
    parser.add_argument(
        "--no-stream",
        dest="stream",
        action="store_false",
        help="Disable streaming of response chunks",
    )

    args = parser.parse_args()

    client = OpenAI(api_key=args.api_key)

    # NOTE(review): modal.config._profile is a private attribute of the Modal
    # SDK -- confirm whether a public accessor exists before depending on it.
    workspace = args.workspace or modal.config._profile

    environment = args.environment or modal.config.config["environment"]

    prefix = workspace + (f"-{environment}" if environment else "")

    # Modal exposes deployed web endpoints at
    # https://<workspace>[-<environment>]--<app>-<function>.modal.run
    client.base_url = (
        f"https://{prefix}--{args.app_name}-{args.function_name}.modal.run/v1"
    )

    if args.model:
        model_id = args.model
        print(
            Colors.BOLD,
            f"🧠: Using model {model_id}. This may trigger a model load on first call!",
            Colors.END,
            sep="",
        )
    else:
        print(
            Colors.BOLD,
            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a model load!",
            Colors.END,
            sep="",
        )
        model = client.models.list().data[0]
        model_id = model.id
        print(
            Colors.BOLD,
            f"🧠: Using {model_id}",
            Colors.END,
            sep="",
        )

    messages = [
        {
            "role": "system",
            "content": args.system_prompt,
        }
    ]

    print(Colors.BOLD + "🧠: Using system prompt: " + args.system_prompt + Colors.END)

    if args.audio_file:
        with open(args.audio_file, "rb") as f:
            audio_bytes = f.read()
        audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
        # BUG FIX: the format was previously hardcoded to "wav" even though
        # the default --audio_file is an .mp3; derive it from the extension so
        # the server decodes the audio with the right codec. Unknown
        # extensions fall back to "wav" (the previous behavior).
        ext = args.audio_file.rpartition(".")[2].lower()
        audio_format = ext if ext in {"wav", "mp3", "flac", "ogg"} else "wav"
        content = [
            {
                "type": "text",
                "text": args.prompt,
            },
            {
                "type": "input_audio",
                "input_audio": {"data": audio_b64, "format": audio_format},
            },
        ]
    else:
        content = args.prompt

    messages.append({"role": "user", "content": content})
    print(Colors.GREEN + f"\nYou: {args.prompt}" + Colors.END)
    response = get_completion(client, model_id, messages, args)
    if response:
        if args.stream:
            print(Colors.BLUE + "\n🤖:", end="")
            for chunk in response:
                if chunk.choices[0].delta.content:
                    print(chunk.choices[0].delta.content, end="")
            print(Colors.END)
        else:
            # Non-streaming is the only case where multiple completions (n > 1)
            # are returned. Use a distinct name so the outer `response` is not
            # shadowed inside the loop.
            for i, choice in enumerate(response.choices):
                print(
                    Colors.BLUE
                    + f"\n🤖 Choice {i + 1}:{choice.message.content}"
                    + Colors.END,
                    sep="",
                )


# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()
Loading
Loading