Commit a379e6e

[Qwen2_5_vl] - Onboarding Qwen2_5_vl model in QEfficient (quic#560)
Signed-off-by: Mohit Soni <[email protected]>
Co-authored-by: Hem Agnihotri <[email protected]>
Parent: 3dfb840

File tree

7 files changed: +1172 −5 lines changed

7 files changed

+1172
-5
lines changed

QEfficient/transformers/models/modeling_auto.py

Lines changed: 10 additions & 5 deletions
@@ -1313,9 +1313,14 @@ def kv_offload_generate(
         vision_end = perf_counter()
 
         lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs}
-        lang_inputs["position_ids"] = np.where(
-            lang_inputs.pop("attention_mask"), np.arange(padded_len), -1
-        )  # Need to use -1 as position_ids for invalid tokens
+
+        if "position_ids" in inputs:
+            lang_inputs["position_ids"] = inputs["position_ids"]
+            lang_inputs.pop("attention_mask")
+        else:
+            lang_inputs["position_ids"] = np.where(
+                lang_inputs.pop("attention_mask"), np.arange(padded_len), -1
+            )  # Need to use -1 as position_ids for invalid tokens
 
         not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama"
         if not_mllama:
@@ -1336,7 +1341,7 @@ def kv_offload_generate(
         for i in range(num_chunks):
             chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
             chunk_inputs["position_ids"] = lang_inputs["position_ids"][
-                :, i * prefill_seq_len : (i + 1) * prefill_seq_len
+                ..., i * prefill_seq_len : (i + 1) * prefill_seq_len
             ]
             outputs = lang_session.run(chunk_inputs)
             chunk_inputs["image_idx"] = outputs["image_idx_output"]
@@ -1353,7 +1358,7 @@ def kv_offload_generate(
 
         # Get first token
         lang_inputs["input_ids"] = outputs["logits"].argmax(2)
-        lang_inputs["position_ids"] = input_len.numpy()
+        lang_inputs["position_ids"] = np.max(lang_inputs["position_ids"], axis=-1, keepdims=True) + 1
         if "cross_attention_mask" in lang_inputs:
             bs, _, num_images, img_tiles = lang_inputs["cross_attention_mask"].shape
             lang_inputs["cross_attention_mask"] = torch.ones((bs, 1, num_images, img_tiles), dtype=torch.int64).numpy()

QEfficient/transformers/models/pytorch_transforms.py

Lines changed: 30 additions & 0 deletions
@@ -152,6 +152,18 @@
     Qwen2Model,
     Qwen2RMSNorm,
 )
+from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+    Qwen2_5_VisionTransformerPretrainedModel,
+    Qwen2_5_VLAttention,
+    Qwen2_5_VLDecoderLayer,
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2_5_VLModel,
+    Qwen2_5_VLTextModel,
+    Qwen2_5_VLVisionAttention,
+)
+from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+    Qwen2RMSNorm as Qwen2_5RMSNorm,
+)
 from transformers.models.qwen3.modeling_qwen3 import (
     Qwen3Attention,
     Qwen3DecoderLayer,
@@ -356,6 +368,15 @@
     QEffQwen2ForCausalLM,
     QEffQwen2Model,
 )
+from QEfficient.transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+    QEffQwen2_5_VisionTransformerPretrainedModel,
+    QEffQwen2_5_VLAttention,
+    QEffQwen2_5_VLDecoderLayer,
+    QEffQwen2_5_VLModel,
+    QEffQwen2_5_VLTextModel,
+    QEffQwen2_5_VLVisionAttention,
+    QEffQwen_2_5_vl_ForConditionalGeneration,
+)
 from QEfficient.transformers.models.qwen3.modeling_qwen3 import (
     QEffQwen3Attention,
     QEffQwen3DecoderLayer,
@@ -404,6 +425,7 @@ class CustomOpsTransform(ModuleMappingTransform):
         Phi3RMSNorm: CustomRMSNormAIC,
         Qwen2RMSNorm: CustomRMSNormAIC,
         Qwen3RMSNorm: CustomRMSNormAIC,
+        Qwen2_5RMSNorm: CustomRMSNormAIC,
         MllamaTextRMSNorm: CustomRMSNormAIC,
         GraniteRMSNorm: CustomRMSNormAIC,
         PixtralRMSNorm: CustomRMSNormAIC,
@@ -544,6 +566,14 @@ class KVCacheTransform(ModuleMappingTransform):
         Qwen3DecoderLayer: QEffQwen3DecoderLayer,
         Qwen3Model: QEffQwen3Model,
         Qwen3ForCausalLM: QEffQwen3ForCausalLM,
+        # Qwen2.5 VL
+        Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration,
+        Qwen2_5_VLModel: QEffQwen2_5_VLModel,
+        Qwen2_5_VLAttention: QEffQwen2_5_VLAttention,
+        Qwen2_5_VLDecoderLayer: QEffQwen2_5_VLDecoderLayer,
+        Qwen2_5_VisionTransformerPretrainedModel: QEffQwen2_5_VisionTransformerPretrainedModel,
+        Qwen2_5_VLVisionAttention: QEffQwen2_5_VLVisionAttention,
+        Qwen2_5_VLTextModel: QEffQwen2_5_VLTextModel,
         # Starcoder2
         Starcoder2Attention: QEffStarcoder2Attention,
         Starcoder2DecoderLayer: QEFFStarcoder2DecoderLayer,
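
These hunks only register the Qwen2.5-VL classes in the existing mapping dictionaries; the transform machinery that consumes them is unchanged. As a rough, hypothetical sketch of the class-mapping idea behind such transforms (a simplified stand-in, not the actual ModuleMappingTransform code), the pattern amounts to retargeting a module's class in place:

# Simplified, hypothetical illustration of a class-mapping transform.
# OrigNorm/PatchedNorm stand in for pairs like Qwen2RMSNorm -> CustomRMSNormAIC.
import torch.nn as nn

class OrigNorm(nn.Module):
    def forward(self, x):
        return x

class PatchedNorm(OrigNorm):
    def forward(self, x):
        return x * 1.0  # placeholder for an accelerator-friendly kernel

MODULE_MAPPING = {OrigNorm: PatchedNorm}  # analogous to the dicts edited above

def apply_mapping(model: nn.Module) -> nn.Module:
    for module in model.modules():
        patched = MODULE_MAPPING.get(type(module))
        if patched is not None:
            module.__class__ = patched  # swap behavior in place, keep parameters
    return model

model = apply_mapping(nn.Sequential(OrigNorm(), nn.Linear(4, 4)))
print(type(model[0]).__name__)  # PatchedNorm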
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
