Commit 458f82d

update model to qwen-8b
Signed-off-by: junq <[email protected]>
1 parent 71250c5 commit 458f82d

1 file changed, 5 insertions(+), 10 deletions(-)


examples/llm-api/llm_kv_cache_offloading.py

@@ -5,16 +5,11 @@
 
 
 def main(args):
-
     prompt_a = (
-        "the following question and four candidate answers (A, B, C and D), choose the best answer."
-        "The following excerpt is from a pamphlet. You will do me the justice to remember, "
-    )
-
-    prompt_b = (
-        "Given the following question and four candidate answers (A, B, C and D), choose the best answer."
-        "The following excerpt is from a pamphlet. You will do me the justice to remember, "
-    )
+        "Returns the per-iterations statistics computed since last call to this method. "
+        "Contains at most iter_stats_max_iterations iterations.")
+    prompt_b = ("Use for skipping decoding step for non generation model, "
+                "and return the batch_output (such as mm_embeddings)")
     max_batch_size = 1
     max_seq_len = 256
 
@@ -24,7 +19,7 @@ def main(args):
 
     sampling_params = SamplingParams(max_tokens=max_seq_len)
 
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    llm = LLM(model="Qwen/Qwen3-8B",
               max_batch_size=max_batch_size,
               max_seq_len=max_seq_len,
               kv_cache_config=KvCacheConfig(enable_block_reuse=True,

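For context, here is a minimal, self-contained sketch of how the changed lines might fit together as a runnable script. It assumes the TensorRT-LLM LLM API (tensorrt_llm.LLM, SamplingParams, and tensorrt_llm.llmapi.KvCacheConfig); since the diff truncates the KvCacheConfig arguments and the argument handling, the host_cache_size value and the simplified main() signature are illustrative assumptions, not part of the commit.

# Minimal sketch of the updated example (assumptions noted above).
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig


def main():
    prompt_a = (
        "Returns the per-iterations statistics computed since last call to this method. "
        "Contains at most iter_stats_max_iterations iterations.")
    prompt_b = ("Use for skipping decoding step for non generation model, "
                "and return the batch_output (such as mm_embeddings)")
    max_batch_size = 1
    max_seq_len = 256

    sampling_params = SamplingParams(max_tokens=max_seq_len)

    # enable_block_reuse comes from the diff; host_cache_size is an assumed
    # value (1 GiB) that lets evicted KV blocks spill to host memory.
    llm = LLM(model="Qwen/Qwen3-8B",
              max_batch_size=max_batch_size,
              max_seq_len=max_seq_len,
              kv_cache_config=KvCacheConfig(enable_block_reuse=True,
                                            host_cache_size=1024**3))

    # With max_batch_size=1 the two prompts run back to back; block reuse
    # plus host offloading lets a repeated prompt hit cached KV blocks
    # instead of recomputing them.
    for output in llm.generate([prompt_a, prompt_b], sampling_params):
        print(output.outputs[0].text)


if __name__ == "__main__":
    main()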