Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 34 additions & 10 deletions fastdeploy/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,15 +264,27 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
else:
position_ids = None

enable_thinking = request.get("enable_thinking", True)
enable_thinking = enable_thinking if enable_thinking is not None else True
self.share_inputs["enable_thinking"][:] = enable_thinking
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
)

if request.get("enable_thinking", False):
# Enable thinking
req_reasoning_max_tokens = request.get("reasoning_max_tokens")
req_max_tokens = request.get("max_tokens")
final_reasoning_tokens = (
req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens
)

self.share_inputs["enable_thinking"][idx : idx + 1] = True
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1
self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens
else:
# Disable thinking
self.share_inputs["enable_thinking"][idx : idx + 1] = False
self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0

if isinstance(request.prompt_token_ids, np.ndarray):
prompt_token_ids = request.prompt_token_ids.tolist()
else:
Expand Down Expand Up @@ -496,16 +508,28 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests:
self.share_inputs["prompt_lens"][idx : idx + 1] = length

if self.enable_mm:
enable_thinking = request.get("enable_thinking", True)
enable_thinking = enable_thinking if enable_thinking is not None else True
self.share_inputs["enable_thinking"][:] = enable_thinking
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
position_ids, request.get("max_tokens", 2048)
)
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0

if request.get("enable_thinking", False):
# Enable thinking
req_reasoning_max_tokens = request.get("reasoning_max_tokens")
req_max_tokens = request.get("max_tokens")
final_reasoning_tokens = (
req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens
)

self.share_inputs["enable_thinking"][idx : idx + 1] = True
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1
self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens
else:
# Disable thinking
self.share_inputs["enable_thinking"][idx : idx + 1] = False
self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0

def get_attr_from_request(request, attr, default_value=None):
res = request.get(attr, default_value)
if res is not None:
Expand Down
Loading
Loading