Question, pod failed to start #1477

@zhouji1026

Description


root@vm-22a-2d71k:/home/k8s/deploy/models# kubectl logs -n default -f pods/vllm-qwen-pod
INFO 11-05 22:11:54 api_server.py:526] vLLM API server version 0.6.1.dev238+ge2c6e0a82
INFO 11-05 22:11:54 api_server.py:527] args: Namespace(host='0.0.0.0', port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_auto_tool_choice=False, tool_call_parser=None, model='./', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=True, download_dir=None, load_format='auto', config_format='auto', dtype='auto', kv_cache_dtype='auto', quantization_param_path=None, max_model_len=None, guided_decoding_backend='outlines', distributed_executor_backend=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=16, enable_prefix_caching=False, disable_sliding_window=False, use_v2_block_manager=False, num_lookahead_slots=0, seed=0, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.6, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_seqs=256, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, enforce_eager=False, max_context_len_to_capture=None, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, enable_lora=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=False, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, speculative_model_quantization=None, num_speculative_tokens=None, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, override_neuron_config=None, disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False)
INFO 11-05 22:11:54 api_server.py:164] Multiprocessing frontend to use ipc:///tmp/54a46ec9-9268-4867-a7b0-8d2e63819530 for IPC Path.
INFO 11-05 22:11:54 api_server.py:177] Started engine process with PID 77
INFO 11-05 22:11:54 config.py:1652] Downcasting torch.float32 to torch.float16.
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py:1617: FutureWarning: clean_up_tokenization_spaces was not set. It will be set to True by default. This behavior will be deprecated in transformers v4.45, and will be then set to False by default. For more details check this issue: huggingface/transformers#31884
warnings.warn(
WARNING 11-05 22:11:54 tokenizer.py:156] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.
INFO 11-05 22:11:57 config.py:1652] Downcasting torch.float32 to torch.float16.
INFO 11-05 22:11:57 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='./', speculative_config=None, tokenizer='./', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./, use_v2_block_manager=False, num_scheduler_steps=1, multi_step_stream_outputs=False, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=True, mm_processor_kwargs=None)
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py:1617: FutureWarning: clean_up_tokenization_spaces was not set. It will be set to True by default. This behavior will be deprecated in transformers v4.45, and will be then set to False by default. For more details check this issue: huggingface/transformers#31884
warnings.warn(
WARNING 11-05 22:11:57 tokenizer.py:156] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.
INFO 11-05 22:11:58 model_runner.py:1014] Starting to load model ./...
[HAMI-core ERROR (pid:77 thread=140077414037120 allocator.c:121)]: cuMemoryAllocate failed res=2
[HAMI-core ERROR (pid:77 thread=140077414037120 allocator.c:121)]: cuMemoryAllocate failed res=2
Process SpawnProcess-1:
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 388, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 138, in from_engine_args
return cls(
^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 78, in init
self.engine = LLMEngine(*args,
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 325, in init
self.model_executor = executor_class(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 47, in init
self._init_executor()
File "/usr/local/lib/python3.12/dist-packages/vllm/executor/gpu_executor.py", line 40, in _init_executor
self.driver_worker.load_model()
File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 183, in load_model
self.model_runner.load_model()
File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1016, in load_model
self.model = get_model(model_config=self.model_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/init.py", line 19, in get_model
return loader.load_model(model_config=model_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 399, in load_model
model = _initialize_model(model_config, self.load_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 176, in _initialize_model
return build_model(
^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 161, in build_model
return model_class(config=hf_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen.py", line 876, in init
self.transformer = QWenModel(config, cache_config, quant_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen.py", line 566, in init
self.start_layer, self.end_layer, self.h = make_layers(
^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 283, in make_layers
maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen.py", line 568, in
lambda prefix: QWenBlock(config, cache_config, quant_config),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen.py", line 518, in init
self.mlp = QWenMLP(config.hidden_size,
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen.py", line 415, in init
self.c_proj = RowParallelLinear(intermediate_size,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py", line 983, in init
self.quant_method.create_weights(
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py", line 122, in create_weights
weight = Parameter(torch.empty(sum(output_partition_sizes),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_device.py", line 79, in torch_function
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 58.59 GiB of which 55.61 GiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 2.96 GiB is allocated by PyTorch, and 19.80 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
File "", line 198, in _run_module_as_main
File "", line 88, in _run_code
File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 571, in
uvloop.run(run_server(args))
File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 109, in run
return __asyncio.run(
^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 194, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1517, in uvloop.loop.Loop.run_until_complete
File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 61, in wrapper
return await main
^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 538, in run_server
async with build_async_engine_client(args) as engine_client:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 210, in aenter
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 105, in build_async_engine_client
async with build_async_engine_client_from_engine_args(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 210, in aenter
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 192, in build_async_engine_client_from_engine_args
raise RuntimeError(
RuntimeError: Engine process failed to start
This is my pod log.
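
For context when reading the log: HAMI-core's "cuMemoryAllocate failed res=2" is CUDA error 2 (CUDA_ERROR_OUT_OF_MEMORY), which suggests the allocation is being rejected by HAMi's vGPU memory limiter rather than by the physical card, since the PyTorch error itself reports 55.61 GiB free. For the same reason the suggested PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True is unlikely to help here: only about 3 GiB had been allocated, so fragmentation is not the issue. One way to confirm what the process is actually allowed to use is to query CUDA from inside the pod. A minimal sketch in Python, assuming torch is available in the container (it is in the vLLM image) and that HAMi intercepts the memory-info query:

# check_vgpu_mem.py: minimal diagnostic sketch; the file name, device
# index 0, and the GiB formatting are illustrative, not part of vLLM or HAMi.
import torch

torch.cuda.init()
# mem_get_info returns (free, total) in bytes for the given device.
# If the HAMi hook is active, "total" should reflect the pod's vGPU
# memory limit rather than the physical 58.59 GiB of the card.
free, total = torch.cuda.mem_get_info(0)
print(f"visible free : {free / 2**30:.2f} GiB")
print(f"visible total: {total / 2**30:.2f} GiB")

If the reported total is far below what the Qwen weights need, the fix is on the HAMi side (raise the pod's GPU memory limit) or on the vLLM side (lower gpu_memory_utilization or max_model_len), not in the PyTorch allocator settings.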
