4 changes: 1 addition & 3 deletions .cd/benchmark/benchmark_defaults.yaml
@@ -29,12 +29,10 @@ model_text:

model_vision:
MODELS:
-- meta-llama/Llama-3.2-11B-Vision-Instruct
-- meta-llama/Llama-3.2-90B-Vision-Instruct
- Qwen/Qwen2.5-VL-7B-Instruct
DATASET: lmarena-ai/vision-arena-bench-v0.1
DATASET_NAME: hf
BACKEND: openai-chat
ENDPOINT: /v1/chat/completions
CONCURRENT_REQ: 64
-NUM_PROMPTS: 500
+NUM_PROMPTS: 500
6 changes: 0 additions & 6 deletions .cd/benchmark/benchmark_scenarios_vision.yaml
@@ -1,8 +1,2 @@
-llama32-11B-Vision-Instruct:
-MODEL: meta-llama/Llama-3.2-11B-Vision-Instruct
-
-llama32-90B-Vision-Instruct:
-MODEL: meta-llama/Llama-3.2-90B-Vision-Instruct
-
qwen2.5-vl-7b-instruct:
MODEL: Qwen/Qwen2.5-VL-7B-Instruct
1 change: 1 addition & 0 deletions .cd/entrypoints/entrypoint_main.py
@@ -190,6 +190,7 @@ def run(self):
output_script_path="vllm_server.sh",
variables=variables,
log_dir="logs",
+varlist_conf_path="server/server_output.env",
).create_and_run()
elif self.mode == "benchmark":
print("[INFO] Starting container in benchmark mode.")
14 changes: 12 additions & 2 deletions .cd/entrypoints/script_generator.py
@@ -4,8 +4,9 @@

class ScriptGenerator:

-def __init__(self, template_script_path, output_script_path, variables, log_dir="logs"):
+def __init__(self, template_script_path, output_script_path, variables, log_dir="logs", varlist_conf_path=None):
self.template_script_path = template_script_path
+self.varlist_conf_path = varlist_conf_path
self.output_script_path = output_script_path
self.variables = variables
self.log_dir = log_dir
@@ -19,7 +20,16 @@ def generate_script(self, vars_dict):
"""
with open(self.template_script_path) as f:
template = f.read()
-export_lines = "\n".join([f"export {k}={v}" for k, v in vars_dict.items()])
+# Create our output list
+if self.varlist_conf_path:
+output_dict = {}
+with open(self.varlist_conf_path) as var_file:
+for line in var_file:
+param = line.strip()
+output_dict[param] = vars_dict[param]
+export_lines = "\n".join([f"export {k}={v}" for k, v in output_dict.items()])
+else:
+export_lines = "\n".join([f"export {k}={v}" for k, v in vars_dict.items()])
script_content = template.replace("#@VARS", export_lines)
with open(self.output_script_path, 'w') as f:
f.write(script_content)
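Reviewer note: the new `varlist_conf_path` filtering can be exercised on its own. Below is a minimal, self-contained sketch of the same logic (the variable names and conf contents here are hypothetical, not taken from this PR); only names listed in the conf file are exported into the generated script:

```python
import os
import tempfile

# Variables as the autocalc step might produce them (hypothetical values).
vars_dict = {
    "MODEL": "meta-llama/Llama-3.1-8B-Instruct",
    "MAX_NUM_SEQS": "128",
    "INTERNAL_SCRATCH": "debug-only",  # present, but not exported
}

# A varlist conf file naming only the variables to export,
# one name per line, like server/server_output.env.
with tempfile.NamedTemporaryFile("w", suffix=".env", delete=False) as f:
    f.write("MODEL\nMAX_NUM_SEQS\n")
    conf_path = f.name

# The filtering step from ScriptGenerator.generate_script().
output_dict = {}
with open(conf_path) as var_file:
    for line in var_file:
        param = line.strip()
        output_dict[param] = vars_dict[param]

print("\n".join(f"export {k}={v}" for k, v in output_dict.items()))
# export MODEL=meta-llama/Llama-3.1-8B-Instruct
# export MAX_NUM_SEQS=128
os.remove(conf_path)
```

Worth noting: a name listed in the conf file but missing from `vars_dict` raises `KeyError` (as would a stray blank line), so `server/server_output.env` has to stay in sync with the calculated variables.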
60 changes: 60 additions & 0 deletions .cd/server/server_output.env
@@ -0,0 +1,60 @@
+MODEL
+DTYPE
+DEVICE_NAME
+TENSOR_PARALLEL_SIZE
+MAX_MODEL_LEN
+TOTAL_GPU_MEM
+MODEL_DTYPE
+QUANT_DTYPE
+BLOCK_SIZE
+VLLM_PROMPT_BS_BUCKET_MIN
+VLLM_PROMPT_BS_BUCKET_STEP
+VLLM_DECODE_BS_BUCKET_MIN
+VLLM_DECODE_BS_BUCKET_STEP
+VLLM_PROMPT_SEQ_BUCKET_MIN
+VLLM_PROMPT_SEQ_BUCKET_STEP
+VLLM_DECODE_BLOCK_BUCKET_MIN
+VLLM_DECODE_BLOCK_BUCKET_STEP
+MAX_NUM_PREFILL_SEQS
+NUM_HIDDEN_LAYERS
+HIDDEN_SIZE
+NUM_KEY_VALUE_HEADS
+NUM_ATTENTION_HEADS
+CACHE_DTYPE_BYTES
+LIMIT_MODEL_LEN
+PT_HPU_LAZY_MODE
+VLLM_DELAYED_SAMPLING
+VLLM_SKIP_WARMUP
+EXPERIMENTAL_WEIGHT_SHARING
+VLLM_EXPONENTIAL_BUCKETING
+MAX_NUM_BATCHED_TOKENS
+PT_HPU_ENABLE_LAZY_COLLECTIVES
+DEVICE_HPU_MEM
+MODEL_MEM_IN_GB
+USABLE_MEM
+GPU_MEM_UTILIZATION
+KV_CACHE_PER_SEQ
+EST_MAX_NUM_SEQS
+EST_HPU_BLOCKS
+DECODE_BS_RAMP_GRAPHS
+DECODE_BS_STEP_GRAPHS
+DECODE_BLOCK_RAMP_GRAPHS
+DECODE_BLOCK_STEP_GRAPHS
+NUM_DECODE_GRAPHS
+PROMPT_BS_RAMP_GRAPHS
+PROMPT_BS_STEP_GRAPHS
+PROMPT_SEQ_RAMP_GRAPHS
+PROMPT_SEQ_STEP_GRAPHS
+EST_NUM_PROMPT_GRAPHS
+EST_GRAPH_PROMPT_RATIO
+VLLM_GRAPH_PROMPT_RATIO
+DECODE_GRAPH_TARGET_GB
+EST_GRAPH_RESERVE_MEM
+VLLM_GRAPH_RESERVED_MEM
+KV_CACHE_MEM
+MAX_NUM_SEQS
+VLLM_PROMPT_SEQ_BUCKET_MAX
+VLLM_CONTIGUOUS_PA
+VLLM_DEFRAG
+ASYNC_SCHEDULING
+VLLM_WEIGHT_LOAD_FORCE_SYNC
2 changes: 2 additions & 0 deletions .cd/server/server_user.env
@@ -9,3 +9,5 @@ MAX_MODEL_LEN
MAX_NUM_SEQS
TENSOR_PARALLEL_SIZE
VLLM_EXPONENTIAL_BUCKETING
+GPU_MEM_UTILIZATION
+ASYNC_SCHEDULING
40 changes: 19 additions & 21 deletions .cd/server/settings_vllm.csv
@@ -1,21 +1,19 @@
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,131072,1,TRUE,FALSE,0,TRUE
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE
meta-llama/Llama-3.2-11B-Vision-Instruct,1,8448,128,2,21340441670,2,2,19.87483507,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,40,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE
meta-llama/Llama-3.2-90B-Vision-Instruct,4,8448,512,2,177186710646,2,2,165.0179835,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,100,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE
ibm-granite/granite-8b-code-instruct-4k,1,2048,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE
ibm-granite/granite-20b-code-instruct-8k,1,2048,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,40,80,8192,16,80,2,65536,1,TRUE,FALSE,0,FALSE
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,1
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
26 changes: 21 additions & 5 deletions .cd/server/vllm_autocalc_rules.py
@@ -42,6 +42,9 @@ def calc_GPU_MEMORY_UTIL_TEMP(ctx):


def calc_GPU_MEM_UTILIZATION(ctx):
+# If user provided
+if ctx.get('GPU_MEM_UTILIZATION') is not None:
+return ctx['GPU_MEM_UTILIZATION']
return math.floor(ctx['GPU_MEMORY_UTIL_TEMP'] * 100) / 100
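A quick illustration of the new user override (assuming the same `server.vllm_autocalc_rules` import path the unit tests use):

```python
import server.vllm_autocalc_rules as rules

# A user-supplied value is returned verbatim...
ctx = {'GPU_MEM_UTILIZATION': 0.9, 'GPU_MEMORY_UTIL_TEMP': 0.8734}
print(rules.calc_GPU_MEM_UTILIZATION(ctx))  # 0.9

# ...otherwise the derived estimate is floored to two decimals.
ctx = {'GPU_MEM_UTILIZATION': None, 'GPU_MEMORY_UTIL_TEMP': 0.8734}
print(rules.calc_GPU_MEM_UTILIZATION(ctx))  # 0.87
```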


@@ -78,8 +81,13 @@ def calc_DECODE_BLOCK_STEP_GRAPHS(ctx):


def calc_NUM_DECODE_GRAPHS(ctx):
-return ((ctx['DECODE_BS_RAMP_GRAPHS'] + ctx['DECODE_BS_STEP_GRAPHS']) *
-(ctx['DECODE_BLOCK_RAMP_GRAPHS'] + ctx['DECODE_BLOCK_STEP_GRAPHS']))
+# 3d update
+decode_graphs = ((ctx['DECODE_BS_RAMP_GRAPHS'] + ctx['DECODE_BS_STEP_GRAPHS']) *
+(ctx['DECODE_BLOCK_RAMP_GRAPHS'] + ctx['DECODE_BLOCK_STEP_GRAPHS']))
+if ctx['VLLM_CONTIGUOUS_PA']:
+return decode_graphs
+else:
+return decode_graphs / 2
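Reviewer note: the halving only applies when contiguous PA is disabled. `VLLM_CONTIGUOUS_PA` appears in settings_vllm.csv as the literal strings `true`/`false`, both of which are truthy in Python, so the `else` branch can only fire if the flag is converted to a real boolean somewhere upstream. A defensive conversion looks like this (a sketch, not code from this PR; bucket counts taken from the updated test below):

```python
# Both "true" and "false" are truthy strings; normalize before branching.
for raw in ("true", "false"):
    contiguous_pa = raw.strip().lower() == "true"
    decode_graphs = (2 + 3) * (4 + 5)
    print(raw, decode_graphs if contiguous_pa else decode_graphs / 2)
# true 45
# false 22.5
```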


def calc_PROMPT_BS_RAMP_GRAPHS(ctx):
@@ -99,12 +107,20 @@ def calc_PROMPT_SEQ_RAMP_GRAPHS(ctx):


def calc_PROMPT_SEQ_STEP_GRAPHS(ctx):
-return int(1 + (ctx['MAX_MODEL_LEN'] - ctx['VLLM_PROMPT_SEQ_BUCKET_STEP']) / ctx['VLLM_PROMPT_SEQ_BUCKET_STEP'])
+return int(1 + (min(ctx['MAX_NUM_BATCHED_TOKENS'], ctx['MAX_MODEL_LEN']) - ctx['VLLM_PROMPT_SEQ_BUCKET_STEP']) /
+ctx['VLLM_PROMPT_SEQ_BUCKET_STEP'])


def calc_EST_NUM_PROMPT_GRAPHS(ctx):
-return ((ctx['PROMPT_BS_RAMP_GRAPHS'] + ctx['PROMPT_BS_STEP_GRAPHS']) *
-(ctx['PROMPT_SEQ_RAMP_GRAPHS'] + ctx['PROMPT_SEQ_STEP_GRAPHS']) / 2)
+prompt_bs_graphs = ctx['PROMPT_BS_RAMP_GRAPHS'] + ctx['PROMPT_BS_STEP_GRAPHS']
+prompt_seq_graphs = ctx['PROMPT_SEQ_RAMP_GRAPHS'] + ctx['PROMPT_SEQ_STEP_GRAPHS']
+graphs_2d = prompt_bs_graphs * prompt_seq_graphs
+if prompt_bs_graphs > 1:
+graphs_2d = graphs_2d / 2
+ctx_block_graphs_max = (ctx['MAX_MODEL_LEN'] - ctx['VLLM_PROMPT_SEQ_BUCKET_MIN']) / ctx['BLOCK_SIZE']
+ctx_block_graphs_min = max(1, (ctx['MAX_MODEL_LEN'] - ctx['MAX_NUM_BATCHED_TOKENS']) / ctx['BLOCK_SIZE'])
+graphs_3d = graphs_2d * (ctx_block_graphs_max + ctx_block_graphs_min) / 2
+return graphs_3d
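To make the new 3D estimate concrete, here is the arithmetic with the bucket counts and settings used by the updated `test_calc_EST_NUM_PROMPT_GRAPHS` further down (`MAX_MODEL_LEN` 4352, `MAX_NUM_BATCHED_TOKENS` 2048, `VLLM_PROMPT_SEQ_BUCKET_MIN` 128, `BLOCK_SIZE` 128):

```python
prompt_bs_graphs = 1 + 0    # PROMPT_BS_RAMP_GRAPHS + PROMPT_BS_STEP_GRAPHS
prompt_seq_graphs = 4 + 5   # PROMPT_SEQ_RAMP_GRAPHS + PROMPT_SEQ_STEP_GRAPHS
graphs_2d = prompt_bs_graphs * prompt_seq_graphs    # 9; bs graphs == 1, so no halving
ctx_block_graphs_max = (4352 - 128) / 128           # 33.0
ctx_block_graphs_min = max(1, (4352 - 2048) / 128)  # 18.0
print(graphs_2d * (ctx_block_graphs_max + ctx_block_graphs_min) / 2)  # 229.5
```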


def calc_EST_GRAPH_PROMPT_RATIO(ctx):
5 changes: 3 additions & 2 deletions .cd/templates/template_vllm_benchmark.sh
@@ -3,7 +3,7 @@
#@VARS

# Wait for vLLM server to be ready
-until curl -s http://localhost:8000${ENDPOINT} > /dev/null; do
+until curl -s http://localhost:8000/v1/models > /dev/null; do
echo "Waiting for vLLM server to be ready..."
sleep 15
done
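The readiness probe now polls the fixed `/v1/models` route instead of the configured `${ENDPOINT}`. For illustration, an equivalent probe in Python (default port 8000 assumed):

```python
import time
import urllib.request

def wait_for_server(url="http://localhost:8000/v1/models", interval=15):
    """Block until the vLLM server responds on its model-listing route."""
    while True:
        try:
            with urllib.request.urlopen(url, timeout=5):
                return
        except OSError:  # connection refused, timeout, etc.
            print("Waiting for vLLM server to be ready...")
            time.sleep(interval)
```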
@@ -25,6 +25,7 @@ vllm bench serve \
--model $MODEL \
--base-url http://localhost:8000 \
--endpoint $ENDPOINT \
+--endpoint-type $BACKEND \
--backend $BACKEND \
--dataset-name $DATASET_NAME \
--dataset-path $DATASET\
@@ -35,4 +36,4 @@ vllm bench serve \
--metric-percentiles 90 \
--ignore-eos \
--trust-remote-code \
-2>&1 | tee -a logs/perftest_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log
+2>&1 | tee -a logs/perftest_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log
8 changes: 7 additions & 1 deletion .cd/templates/template_vllm_server.sh
@@ -2,6 +2,10 @@

#@VARS

+if [ $ASYNC_SCHEDULING -gt 0 ]; then # Checks if using async scheduling
+EXTRA_ARGS+=" --async_scheduling"
+fi
+
## Start server
vllm serve $MODEL \
--block-size $BLOCK_SIZE \
@@ -11,5 +15,7 @@ vllm serve $MODEL \
--max-model-len $MAX_MODEL_LEN \
--gpu-memory-utilization $GPU_MEM_UTILIZATION \
--max-num-seqs $MAX_NUM_SEQS \
---disable-log-requests \
+--generation-config vllm \
+--max_num_batched_tokens $MAX_NUM_BATCHED_TOKENS \
+--disable-log-requests ${EXTRA_ARGS} \
2>&1 | tee -a logs/vllm_server.log
25 changes: 16 additions & 9 deletions .cd/tests/test_vllm_autocalc_rules.py
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+import pytest
import math

import server.vllm_autocalc_rules as rules
@@ -110,14 +111,16 @@ def test_calc_DECODE_BLOCK_STEP_GRAPHS():
assert rules.calc_DECODE_BLOCK_STEP_GRAPHS(ctx) == expected


-def test_calc_NUM_DECODE_GRAPHS():
+@pytest.mark.parametrize("cpa", ["true", "false"])
+def test_calc_NUM_DECODE_GRAPHS(cpa):
ctx = {
'DECODE_BS_RAMP_GRAPHS': 2,
'DECODE_BS_STEP_GRAPHS': 3,
'DECODE_BLOCK_RAMP_GRAPHS': 4,
-'DECODE_BLOCK_STEP_GRAPHS': 5
+'DECODE_BLOCK_STEP_GRAPHS': 5,
+'VLLM_CONTIGUOUS_PA': cpa
}
-expected = (2 + 3) * (4 + 5)
+expected = (2 + 3) * (4 + 5) if cpa else (2 + 3) * (4 + 5) / 2
assert rules.calc_NUM_DECODE_GRAPHS(ctx) == expected


@@ -140,19 +143,23 @@ def test_calc_PROMPT_SEQ_RAMP_GRAPHS():


def test_calc_PROMPT_SEQ_STEP_GRAPHS():
-ctx = {'MAX_MODEL_LEN': 64, 'VLLM_PROMPT_SEQ_BUCKET_STEP': 8}
-expected = int(1 + (64 - 8) / 8)
+ctx = {'MAX_NUM_BATCHED_TOKENS': 32, 'MAX_MODEL_LEN': 64, 'VLLM_PROMPT_SEQ_BUCKET_STEP': 8}
+expected = int(1 + (32 - 8) / 8)
assert rules.calc_PROMPT_SEQ_STEP_GRAPHS(ctx) == expected


def test_calc_EST_NUM_PROMPT_GRAPHS():
ctx = {
-'PROMPT_BS_RAMP_GRAPHS': 2,
-'PROMPT_BS_STEP_GRAPHS': 3,
+'PROMPT_BS_RAMP_GRAPHS': 1,
+'PROMPT_BS_STEP_GRAPHS': 0,
'PROMPT_SEQ_RAMP_GRAPHS': 4,
-'PROMPT_SEQ_STEP_GRAPHS': 5
+'PROMPT_SEQ_STEP_GRAPHS': 5,
+'MAX_NUM_BATCHED_TOKENS': 2048,
+'MAX_MODEL_LEN': 4352,
+'VLLM_PROMPT_SEQ_BUCKET_MIN': 128,
+'BLOCK_SIZE': 128,
}
-expected = ((2 + 3) * (4 + 5)) / 2
+expected = ((1 + 0) * (4 + 5)) * (33 + 18) / 2
assert rules.calc_EST_NUM_PROMPT_GRAPHS(ctx) == expected

