diff --git a/.cd/benchmark/benchmark_defaults.yaml b/.cd/benchmark/benchmark_defaults.yaml
index cc2b65b1..0147e628 100644
--- a/.cd/benchmark/benchmark_defaults.yaml
+++ b/.cd/benchmark/benchmark_defaults.yaml
@@ -29,12 +29,10 @@ model_text:
 
 model_vision:
   MODELS:
-    - meta-llama/Llama-3.2-11B-Vision-Instruct
-    - meta-llama/Llama-3.2-90B-Vision-Instruct
     - Qwen/Qwen2.5-VL-7B-Instruct
   DATASET: lmarena-ai/vision-arena-bench-v0.1
   DATASET_NAME: hf
   BACKEND: openai-chat
   ENDPOINT: /v1/chat/completions
   CONCURRENT_REQ: 64
-  NUM_PROMPTS: 500
\ No newline at end of file
+  NUM_PROMPTS: 500
diff --git a/.cd/benchmark/benchmark_scenarios_vision.yaml b/.cd/benchmark/benchmark_scenarios_vision.yaml
index b9e438cf..8e00db02 100644
--- a/.cd/benchmark/benchmark_scenarios_vision.yaml
+++ b/.cd/benchmark/benchmark_scenarios_vision.yaml
@@ -1,8 +1,2 @@
-llama32-11B-Vision-Instruct:
-  MODEL: meta-llama/Llama-3.2-11B-Vision-Instruct
-
-llama32-90B-Vision-Instruct:
-  MODEL: meta-llama/Llama-3.2-90B-Vision-Instruct
-
 qwen2.5-vl-7b-instruct:
   MODEL: Qwen/Qwen2.5-VL-7B-Instruct
diff --git a/.cd/entrypoints/entrypoint_main.py b/.cd/entrypoints/entrypoint_main.py
index c107414a..babfce32 100644
--- a/.cd/entrypoints/entrypoint_main.py
+++ b/.cd/entrypoints/entrypoint_main.py
@@ -190,6 +190,7 @@ def run(self):
                 output_script_path="vllm_server.sh",
                 variables=variables,
                 log_dir="logs",
+                varlist_conf_path="server/server_output.env",
             ).create_and_run()
         elif self.mode == "benchmark":
             print("[INFO] Starting container in benchmark mode.")
diff --git a/.cd/entrypoints/script_generator.py b/.cd/entrypoints/script_generator.py
index e48062d2..684d1e9c 100644
--- a/.cd/entrypoints/script_generator.py
+++ b/.cd/entrypoints/script_generator.py
@@ -4,8 +4,9 @@
 
 class ScriptGenerator:
 
-    def __init__(self, template_script_path, output_script_path, variables, log_dir="logs"):
+    def __init__(self, template_script_path, output_script_path, variables, log_dir="logs", varlist_conf_path=None):
         self.template_script_path = template_script_path
+        self.varlist_conf_path = varlist_conf_path
         self.output_script_path = output_script_path
         self.variables = variables
         self.log_dir = log_dir
@@ -19,7 +20,16 @@ def generate_script(self, vars_dict):
         """
         with open(self.template_script_path) as f:
             template = f.read()
-        export_lines = "\n".join([f"export {k}={v}" for k, v in vars_dict.items()])
+        # If an allow-list file is configured, export only the variables it names
+        if self.varlist_conf_path:
+            output_dict = {}
+            with open(self.varlist_conf_path) as var_file:
+                for line in var_file:
+                    param = line.strip()
+                    output_dict[param] = vars_dict[param]
+            export_lines = "\n".join([f"export {k}={v}" for k, v in output_dict.items()])
+        else:
+            export_lines = "\n".join([f"export {k}={v}" for k, v in vars_dict.items()])
         script_content = template.replace("#@VARS", export_lines)
         with open(self.output_script_path, 'w') as f:
             f.write(script_content)
diff --git a/.cd/server/server_output.env b/.cd/server/server_output.env
new file mode 100644
index 00000000..dccdef0a
--- /dev/null
+++ b/.cd/server/server_output.env
@@ -0,0 +1,60 @@
+MODEL
+DTYPE
+DEVICE_NAME
+TENSOR_PARALLEL_SIZE
+MAX_MODEL_LEN
+TOTAL_GPU_MEM
+MODEL_DTYPE
+QUANT_DTYPE
+BLOCK_SIZE
+VLLM_PROMPT_BS_BUCKET_MIN
+VLLM_PROMPT_BS_BUCKET_STEP
+VLLM_DECODE_BS_BUCKET_MIN
+VLLM_DECODE_BS_BUCKET_STEP
+VLLM_PROMPT_SEQ_BUCKET_MIN
+VLLM_PROMPT_SEQ_BUCKET_STEP
+VLLM_DECODE_BLOCK_BUCKET_MIN
+VLLM_DECODE_BLOCK_BUCKET_STEP
+MAX_NUM_PREFILL_SEQS
+NUM_HIDDEN_LAYERS
+HIDDEN_SIZE
+NUM_KEY_VALUE_HEADS
+NUM_ATTENTION_HEADS
+CACHE_DTYPE_BYTES
+LIMIT_MODEL_LEN
+PT_HPU_LAZY_MODE
+VLLM_DELAYED_SAMPLING
+VLLM_SKIP_WARMUP
+EXPERIMENTAL_WEIGHT_SHARING
+VLLM_EXPONENTIAL_BUCKETING
+MAX_NUM_BATCHED_TOKENS
+PT_HPU_ENABLE_LAZY_COLLECTIVES
+DEVICE_HPU_MEM
+MODEL_MEM_IN_GB
+USABLE_MEM
+GPU_MEM_UTILIZATION
+KV_CACHE_PER_SEQ
+EST_MAX_NUM_SEQS
+EST_HPU_BLOCKS
+DECODE_BS_RAMP_GRAPHS
+DECODE_BS_STEP_GRAPHS
+DECODE_BLOCK_RAMP_GRAPHS
+DECODE_BLOCK_STEP_GRAPHS
+NUM_DECODE_GRAPHS
+PROMPT_BS_RAMP_GRAPHS
+PROMPT_BS_STEP_GRAPHS
+PROMPT_SEQ_RAMP_GRAPHS
+PROMPT_SEQ_STEP_GRAPHS
+EST_NUM_PROMPT_GRAPHS
+EST_GRAPH_PROMPT_RATIO
+VLLM_GRAPH_PROMPT_RATIO
+DECODE_GRAPH_TARGET_GB
+EST_GRAPH_RESERVE_MEM
+VLLM_GRAPH_RESERVED_MEM
+KV_CACHE_MEM
+MAX_NUM_SEQS
+VLLM_PROMPT_SEQ_BUCKET_MAX
+VLLM_CONTIGUOUS_PA
+VLLM_DEFRAG
+ASYNC_SCHEDULING
+VLLM_WEIGHT_LOAD_FORCE_SYNC
diff --git a/.cd/server/server_user.env b/.cd/server/server_user.env
index dd125729..3dd52ba0 100644
--- a/.cd/server/server_user.env
+++ b/.cd/server/server_user.env
@@ -9,3 +9,5 @@ MAX_MODEL_LEN
 MAX_NUM_SEQS
 TENSOR_PARALLEL_SIZE
 VLLM_EXPONENTIAL_BUCKETING
+GPU_MEM_UTILIZATION
+ASYNC_SCHEDULING
diff --git a/.cd/server/settings_vllm.csv b/.cd/server/settings_vllm.csv
index a5021b97..bb677487 100644
--- a/.cd/server/settings_vllm.csv
+++ b/.cd/server/settings_vllm.csv
@@ -1,21 +1,19 @@
-MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING
-meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,131072,1,TRUE,FALSE,0,TRUE
-meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE
-meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE
-meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE
-meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE
-mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE
-mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE
-mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE
-meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE
-Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE
-deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE
-Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE
-Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE
-Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE
-Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE
-meta-llama/Llama-3.2-11B-Vision-Instruct,1,8448,128,2,21340441670,2,2,19.87483507,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,40,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE
-meta-llama/Llama-3.2-90B-Vision-Instruct,4,8448,512,2,177186710646,2,2,165.0179835,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,100,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE
-ibm-granite/granite-8b-code-instruct-4k,1,2048,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE
-ibm-granite/granite-20b-code-instruct-8k,1,2048,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,40,80,8192,16,80,2,65536,1,TRUE,FALSE,0,FALSE
-Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE
+MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC
+meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
+meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
+meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
+meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
+meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
+mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
+mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
+mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
+meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,1
+Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
+deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
+Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
+Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
+Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
+Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
+ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
+ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
+Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
diff --git a/.cd/server/vllm_autocalc_rules.py b/.cd/server/vllm_autocalc_rules.py
index ef813751..30290edb 100644
--- a/.cd/server/vllm_autocalc_rules.py
+++ b/.cd/server/vllm_autocalc_rules.py
@@ -42,6 +42,9 @@ def calc_GPU_MEMORY_UTIL_TEMP(ctx):
 
 
 def calc_GPU_MEM_UTILIZATION(ctx):
+    # If the user provided a value, use it as-is
+    if ctx.get('GPU_MEM_UTILIZATION') is not None:
+        return ctx['GPU_MEM_UTILIZATION']
     return math.floor(ctx['GPU_MEMORY_UTIL_TEMP'] * 100) / 100
 
 
@@ -78,8 +81,13 @@ def calc_DECODE_BLOCK_STEP_GRAPHS(ctx):
 
 
 def calc_NUM_DECODE_GRAPHS(ctx):
-    return ((ctx['DECODE_BS_RAMP_GRAPHS'] + ctx['DECODE_BS_STEP_GRAPHS']) *
-            (ctx['DECODE_BLOCK_RAMP_GRAPHS'] + ctx['DECODE_BLOCK_STEP_GRAPHS']))
+    # 3D bucketing: halve the decode graph count when contiguous PA is disabled
+    decode_graphs = ((ctx['DECODE_BS_RAMP_GRAPHS'] + ctx['DECODE_BS_STEP_GRAPHS']) *
+                     (ctx['DECODE_BLOCK_RAMP_GRAPHS'] + ctx['DECODE_BLOCK_STEP_GRAPHS']))
+    if ctx['VLLM_CONTIGUOUS_PA']:
+        return decode_graphs
+    else:
+        return decode_graphs / 2
 
 
 def calc_PROMPT_BS_RAMP_GRAPHS(ctx):
@@ -99,12 +107,20 @@ def calc_PROMPT_SEQ_RAMP_GRAPHS(ctx):
 
 
 def calc_PROMPT_SEQ_STEP_GRAPHS(ctx):
-    return int(1 + (ctx['MAX_MODEL_LEN'] - ctx['VLLM_PROMPT_SEQ_BUCKET_STEP']) / ctx['VLLM_PROMPT_SEQ_BUCKET_STEP'])
+    return int(1 + (min(ctx['MAX_NUM_BATCHED_TOKENS'], ctx['MAX_MODEL_LEN']) - ctx['VLLM_PROMPT_SEQ_BUCKET_STEP']) /
+               ctx['VLLM_PROMPT_SEQ_BUCKET_STEP'])
 
 
 def calc_EST_NUM_PROMPT_GRAPHS(ctx):
-    return ((ctx['PROMPT_BS_RAMP_GRAPHS'] + ctx['PROMPT_BS_STEP_GRAPHS']) *
-            (ctx['PROMPT_SEQ_RAMP_GRAPHS'] + ctx['PROMPT_SEQ_STEP_GRAPHS']) / 2)
+    prompt_bs_graphs = ctx['PROMPT_BS_RAMP_GRAPHS'] + ctx['PROMPT_BS_STEP_GRAPHS']
+    prompt_seq_graphs = ctx['PROMPT_SEQ_RAMP_GRAPHS'] + ctx['PROMPT_SEQ_STEP_GRAPHS']
+    graphs_2d = prompt_bs_graphs * prompt_seq_graphs
+    if prompt_bs_graphs > 1:
+        graphs_2d = graphs_2d / 2
+    ctx_block_graphs_max = (ctx['MAX_MODEL_LEN'] - ctx['VLLM_PROMPT_SEQ_BUCKET_MIN']) / ctx['BLOCK_SIZE']
+    ctx_block_graphs_min = max(1, (ctx['MAX_MODEL_LEN'] - ctx['MAX_NUM_BATCHED_TOKENS']) / ctx['BLOCK_SIZE'])
+    graphs_3d = graphs_2d * (ctx_block_graphs_max + ctx_block_graphs_min) / 2
+    return graphs_3d
 
 
 def calc_EST_GRAPH_PROMPT_RATIO(ctx):
diff --git a/.cd/templates/template_vllm_benchmark.sh b/.cd/templates/template_vllm_benchmark.sh
index 3af3e3f3..ad890ef3 100644
--- a/.cd/templates/template_vllm_benchmark.sh
+++ b/.cd/templates/template_vllm_benchmark.sh
@@ -3,7 +3,7 @@
 #@VARS
 
 # Wait for vLLM server to be ready
-until curl -s http://localhost:8000${ENDPOINT} > /dev/null; do
+until curl -s http://localhost:8000/v1/models > /dev/null; do
     echo "Waiting for vLLM server to be ready..."
     sleep 15
 done
@@ -25,6 +25,7 @@ vllm bench serve \
     --model $MODEL \
     --base-url http://localhost:8000 \
     --endpoint $ENDPOINT \
+    --endpoint-type $BACKEND \
     --backend $BACKEND \
     --dataset-name $DATASET_NAME \
     --dataset-path $DATASET\
@@ -35,4 +36,4 @@ vllm bench serve \
     --metric-percentiles 90 \
     --ignore-eos \
     --trust-remote-code \
-2>&1 | tee -a logs/perftest_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log
\ No newline at end of file
+2>&1 | tee -a logs/perftest_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log
diff --git a/.cd/templates/template_vllm_server.sh b/.cd/templates/template_vllm_server.sh
index c28cd3ed..b6db4e8c 100644
--- a/.cd/templates/template_vllm_server.sh
+++ b/.cd/templates/template_vllm_server.sh
@@ -2,6 +2,10 @@
 
 #@VARS
 
+if [ "$ASYNC_SCHEDULING" -gt 0 ]; then  # append the async-scheduling flag when enabled
+    EXTRA_ARGS+=" --async_scheduling"
+fi
+
 ## Start server
 vllm serve $MODEL \
     --block-size $BLOCK_SIZE \
@@ -11,5 +15,7 @@ vllm serve $MODEL \
     --max-model-len $MAX_MODEL_LEN \
     --gpu-memory-utilization $GPU_MEM_UTILIZATION \
     --max-num-seqs $MAX_NUM_SEQS \
-    --disable-log-requests \
+    --generation-config vllm \
+    --max_num_batched_tokens $MAX_NUM_BATCHED_TOKENS \
+    --disable-log-requests ${EXTRA_ARGS} \
 2>&1 | tee -a logs/vllm_server.log
diff --git a/.cd/tests/test_vllm_autocalc_rules.py b/.cd/tests/test_vllm_autocalc_rules.py
index 11512018..17a504e1 100644
--- a/.cd/tests/test_vllm_autocalc_rules.py
+++ b/.cd/tests/test_vllm_autocalc_rules.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+import pytest
 import math
 
 import server.vllm_autocalc_rules as rules
@@ -110,14 +111,16 @@ def test_calc_DECODE_BLOCK_STEP_GRAPHS():
     assert rules.calc_DECODE_BLOCK_STEP_GRAPHS(ctx) == expected
 
 
-def test_calc_NUM_DECODE_GRAPHS():
+@pytest.mark.parametrize("cpa", [True, False])
+def test_calc_NUM_DECODE_GRAPHS(cpa):
     ctx = {
         'DECODE_BS_RAMP_GRAPHS': 2,
         'DECODE_BS_STEP_GRAPHS': 3,
         'DECODE_BLOCK_RAMP_GRAPHS': 4,
-        'DECODE_BLOCK_STEP_GRAPHS': 5
+        'DECODE_BLOCK_STEP_GRAPHS': 5,
+        'VLLM_CONTIGUOUS_PA': cpa
     }
-    expected = (2 + 3) * (4 + 5)
+    expected = (2 + 3) * (4 + 5) if cpa else (2 + 3) * (4 + 5) / 2
     assert rules.calc_NUM_DECODE_GRAPHS(ctx) == expected
 
 
@@ -140,19 +143,23 @@ def test_calc_PROMPT_SEQ_RAMP_GRAPHS():
 
 
 def test_calc_PROMPT_SEQ_STEP_GRAPHS():
-    ctx = {'MAX_MODEL_LEN': 64, 'VLLM_PROMPT_SEQ_BUCKET_STEP': 8}
-    expected = int(1 + (64 - 8) / 8)
+    ctx = {'MAX_NUM_BATCHED_TOKENS': 32, 'MAX_MODEL_LEN': 64, 'VLLM_PROMPT_SEQ_BUCKET_STEP': 8}
+    expected = int(1 + (32 - 8) / 8)
     assert rules.calc_PROMPT_SEQ_STEP_GRAPHS(ctx) == expected
 
 
 def test_calc_EST_NUM_PROMPT_GRAPHS():
     ctx = {
-        'PROMPT_BS_RAMP_GRAPHS': 2,
-        'PROMPT_BS_STEP_GRAPHS': 3,
+        'PROMPT_BS_RAMP_GRAPHS': 1,
+        'PROMPT_BS_STEP_GRAPHS': 0,
         'PROMPT_SEQ_RAMP_GRAPHS': 4,
-        'PROMPT_SEQ_STEP_GRAPHS': 5
+        'PROMPT_SEQ_STEP_GRAPHS': 5,
+        'MAX_NUM_BATCHED_TOKENS': 2048,
+        'MAX_MODEL_LEN': 4352,
+        'VLLM_PROMPT_SEQ_BUCKET_MIN': 128,
+        'BLOCK_SIZE': 128,
     }
-    expected = ((2 + 3) * (4 + 5)) / 2
+    expected = ((1 + 0) * (4 + 5)) * (33 + 18) / 2
     assert rules.calc_EST_NUM_PROMPT_GRAPHS(ctx) == expected
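
For reference, the updated graph-count arithmetic above can be sanity-checked standalone. The following is a minimal Python sketch (not part of the patch) mirroring the new calc_NUM_DECODE_GRAPHS and calc_EST_NUM_PROMPT_GRAPHS rules, with the same ctx values used in test_calc_EST_NUM_PROMPT_GRAPHS:

def num_decode_graphs(ctx):
    # Full (batch-size x block-bucket) grid; halved when contiguous PA is off.
    grid = ((ctx['DECODE_BS_RAMP_GRAPHS'] + ctx['DECODE_BS_STEP_GRAPHS']) *
            (ctx['DECODE_BLOCK_RAMP_GRAPHS'] + ctx['DECODE_BLOCK_STEP_GRAPHS']))
    return grid if ctx['VLLM_CONTIGUOUS_PA'] else grid / 2

def est_num_prompt_graphs(ctx):
    bs_graphs = ctx['PROMPT_BS_RAMP_GRAPHS'] + ctx['PROMPT_BS_STEP_GRAPHS']
    seq_graphs = ctx['PROMPT_SEQ_RAMP_GRAPHS'] + ctx['PROMPT_SEQ_STEP_GRAPHS']
    graphs_2d = bs_graphs * seq_graphs
    if bs_graphs > 1:
        graphs_2d /= 2
    # Third bucketing dimension: context blocks, averaged between the largest
    # and smallest reachable block counts.
    blocks_max = (ctx['MAX_MODEL_LEN'] - ctx['VLLM_PROMPT_SEQ_BUCKET_MIN']) / ctx['BLOCK_SIZE']
    blocks_min = max(1, (ctx['MAX_MODEL_LEN'] - ctx['MAX_NUM_BATCHED_TOKENS']) / ctx['BLOCK_SIZE'])
    return graphs_2d * (blocks_max + blocks_min) / 2

ctx = {'PROMPT_BS_RAMP_GRAPHS': 1, 'PROMPT_BS_STEP_GRAPHS': 0,
       'PROMPT_SEQ_RAMP_GRAPHS': 4, 'PROMPT_SEQ_STEP_GRAPHS': 5,
       'MAX_NUM_BATCHED_TOKENS': 2048, 'MAX_MODEL_LEN': 4352,
       'VLLM_PROMPT_SEQ_BUCKET_MIN': 128, 'BLOCK_SIZE': 128}
# blocks_max = (4352 - 128) / 128 = 33; blocks_min = max(1, 2304 / 128) = 18,
# so (1 * 9) * (33 + 18) / 2 = 229.5, matching the test expectation.
print(est_num_prompt_graphs(ctx))  # 229.5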
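The varlist_conf_path allow-list added to ScriptGenerator can likewise be exercised in isolation. A short standalone sketch (hypothetical file name and values, not the class itself) of how generate_script() narrows vars_dict to the variables named in server/server_output.env:

vars_dict = {'MODEL': 'Qwen/Qwen2.5-VL-7B-Instruct', 'BLOCK_SIZE': 128,
             'INTERNAL_ONLY': 'not exported'}

# Hypothetical allow-list: one variable name per line, like server_output.env.
with open('allowlist.env', 'w') as f:
    f.write('MODEL\nBLOCK_SIZE\n')

with open('allowlist.env') as var_file:
    allowed = [line.strip() for line in var_file if line.strip()]

# Only allow-listed variables end up in the generated #@VARS block.
export_lines = "\n".join(f"export {k}={vars_dict[k]}" for k in allowed)
print(export_lines)
# export MODEL=Qwen/Qwen2.5-VL-7B-Instruct
# export BLOCK_SIZE=128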