
Commit 949ef99

Fixes the bug of updating input_names after the tokenizer call (nod-ai#449)
Fixes the bug introduced in commit "Fixes native inference input size mismatch issue (nod-ai#447)": tokenizer.model_input_names should be updated before the tokenizer call.
1 parent 65aa453 commit 949ef99
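
Why the order matters: Hugging Face tokenizers decide which tensors to emit (for example, whether an encoding contains token_type_ids) by consulting model_input_names at call time, so assigning it after tokenization cannot change an encoding that has already been built. A minimal sketch of the difference, assuming the transformers AutoTokenizer API and a hypothetical checkpoint and prompt:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint

# Buggy order (pre-#449): the encoding is built first, so the later
# assignment cannot remove keys such as "token_type_ids".
tokens = tokenizer("hello world", return_tensors="pt", padding=True, truncation=True)
tokenizer.model_input_names = ["input_ids", "attention_mask"]
print(sorted(tokens.keys()))  # still includes "token_type_ids"

# Fixed order (#449): restrict the input names, then tokenize.
tokenizer.model_input_names = ["input_ids", "attention_mask"]
tokens = tokenizer("hello world", return_tensors="pt", padding=True, truncation=True)
print(sorted(tokens.keys()))  # ['attention_mask', 'input_ids']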

File tree

1 file changed: 19 additions, 22 deletions


alt_e2eshark/onnx_tests/models/hf_models.py

Lines changed: 19 additions & 22 deletions
@@ -171,7 +171,7 @@ def get_tokenizer_from_model_path(model_repo_path: str, cache_dir: str | Path):
     if 'kobert' in name.lower():
         trust_remote_code = True
 
-    return AutoTokenizer.from_pretrained(model_repo_path, cache_dir=cache_dir, trust_remote_code=True)
+    return AutoTokenizer.from_pretrained(model_repo_path, cache_dir=cache_dir, trust_remote_code=trust_remote_code)
 
 
 def build_repo_to_model_map():
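
This hunk also stops hardcoding trust_remote_code=True: the flag computed just above (set only when the model name contains "kobert") is now actually forwarded to AutoTokenizer.from_pretrained. A minimal sketch of the helper after the change; how `name` is derived is not shown in the diff, so the split below is an assumption:

from pathlib import Path
from transformers import AutoTokenizer

def get_tokenizer_from_model_path(model_repo_path: str, cache_dir: str | Path):
    # Assumed: `name` is the trailing component of the repo path (not shown in the diff).
    name = model_repo_path.split("/")[-1]
    trust_remote_code = False
    if "kobert" in name.lower():
        # Only KoBERT needs its remote tokenizer code trusted here.
        trust_remote_code = True
    return AutoTokenizer.from_pretrained(
        model_repo_path, cache_dir=cache_dir, trust_remote_code=trust_remote_code
    )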
@@ -245,32 +245,29 @@ def construct_inputs(self):
 
         tokenizer = get_tokenizer_from_model_path(self.model_repo_path, self.cache_dir)
 
-        tokens = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
-        self.input_name_to_shape_map = {k: v.shape for (k, v) in tokens.items()}
-
         if self.name in models_with_input_names_2:
             # Handles 2 inputs
             tokenizer.model_input_names = ["input_ids", "attention_mask"]
-            inputs = (*list(tokens.values()), )
-        else:
-            self.input_name_to_shape_map["position_ids"] = self.input_name_to_shape_map["input_ids"]
-            zeros = torch.zeros(*(self.input_name_to_shape_map["input_ids"]), dtype=int)
-            if self.name in models_with_input_names_3:
-                # Handles 3 inputs
-                tokenizer.model_input_names = ["input_ids", "attention_mask", "position_ids"]
-            elif self.name in models_with_input_names_4:
-                tokenizer.model_input_names = ["input_ids", "bbox", "attention_mask", "position_ids"]
-
-                # Handles 4 inputs
-                # Tokenizer is returning tokens dict with key "token_type_ids" instead of "bbox".
-                # For now, "token_type_ids" will be reused as bbox in this case
-                # bbox is a bounding box with size [?, ?, 4]
-                # where each 4 numbers represent x_min, y_min, x_max, y_max
-                tokens["token_type_ids"] = tokens["token_type_ids"].unsqueeze(-1).repeat(1, 1, 4)
-            else:
-                raise RuntimeError(f"Model: {self.name} not found in any of the registry lists.")
 
+        tokens = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
+
+        if self.name in models_with_input_names_4:
+            # Handles 4 inputs
+            # Tokenizer is returning tokens dict with key "token_type_ids" instead of "bbox".
+            # For now, "token_type_ids" will be reused as bbox in this case
+            # bbox is a bounding box with size [?, ?, 4]
+            # where each 4 numbers represent x_min, y_min, x_max, y_max
+            print(f'DEBUG: {tokens=}')
+            tokens["token_type_ids"] = tokens["token_type_ids"].unsqueeze(-1).repeat(1, 1, 4)
+
+        self.input_name_to_shape_map = {k: v.shape for (k, v) in tokens.items()}
+        if self.name in models_with_input_names_3 or self.name in models_with_input_names_4:
+            # Handles 3 and 4 inputs
+            self.input_name_to_shape_map["position_ids"] = self.input_name_to_shape_map["input_ids"]
+            zeros = torch.zeros(*(self.input_name_to_shape_map["position_ids"]), dtype=int)
             inputs = (*list(tokens.values()), zeros)
+        else:
+            inputs = (*list(tokens.values()), )
 
         test_tensors = TestTensors(inputs)
         return test_tensors
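
On the four-input branch the tokenizer returns no bbox tensor, so the [batch, seq] token_type_ids is widened into a [batch, seq, 4] stand-in. A standalone PyTorch sketch of that expansion, with hypothetical batch and sequence sizes:

import torch

# Hypothetical tokenizer output of shape [batch=2, seq=5].
token_type_ids = torch.zeros(2, 5, dtype=torch.long)

# unsqueeze(-1): [2, 5] -> [2, 5, 1]; repeat(1, 1, 4): tile the last axis
# to [2, 5, 4], one (x_min, y_min, x_max, y_max) slot per token.
bbox = token_type_ids.unsqueeze(-1).repeat(1, 1, 4)
print(bbox.shape)  # torch.Size([2, 5, 4])

The position_ids placeholder is built analogously: torch.zeros over the recorded input_ids shape, appended as the last element of the inputs tuple.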
