@@ -171,7 +171,7 @@ def get_tokenizer_from_model_path(model_repo_path: str, cache_dir: str | Path):
     if 'kobert' in name.lower():
         trust_remote_code = True
 
-    return AutoTokenizer.from_pretrained(model_repo_path, cache_dir=cache_dir, trust_remote_code=True)
+    return AutoTokenizer.from_pretrained(model_repo_path, cache_dir=cache_dir, trust_remote_code=trust_remote_code)
 
 
 def build_repo_to_model_map():
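
The fix above threads the computed trust_remote_code flag into from_pretrained instead of hardcoding True, so repo-provided code is only trusted for models that opt in (here, anything with 'kobert' in its name). A minimal standalone sketch of the corrected pattern; the condensed get_tokenizer helper and the example repo id are illustrative stand-ins, not names from this patch:

    from pathlib import Path

    from transformers import AutoTokenizer

    def get_tokenizer(model_repo_path: str, cache_dir: str | Path):
        # Opt in to executing code shipped with the model repo only when needed.
        trust_remote_code = 'kobert' in model_repo_path.lower()
        # Pass the computed flag through; hardcoding True would trust every repo.
        return AutoTokenizer.from_pretrained(
            model_repo_path, cache_dir=cache_dir, trust_remote_code=trust_remote_code
        )

    # Hypothetical usage; this repo id is only an example of a KoBERT checkpoint.
    tokenizer = get_tokenizer("skt/kobert-base-v1", cache_dir="./hf_cache")
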
@@ -245,32 +245,28 @@ def construct_inputs(self):
 
         tokenizer = get_tokenizer_from_model_path(self.model_repo_path, self.cache_dir)
 
-        tokens = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
-        self.input_name_to_shape_map = {k: v.shape for (k, v) in tokens.items()}
-
         if self.name in models_with_input_names_2:
             # Handles 2 inputs
             tokenizer.model_input_names = ["input_ids", "attention_mask"]
-            inputs = (*list(tokens.values()),)
-        else:
-            self.input_name_to_shape_map["position_ids"] = self.input_name_to_shape_map["input_ids"]
-            zeros = torch.zeros(*(self.input_name_to_shape_map["input_ids"]), dtype=int)
-            if self.name in models_with_input_names_3:
-                # Handles 3 inputs
-                tokenizer.model_input_names = ["input_ids", "attention_mask", "position_ids"]
-            elif self.name in models_with_input_names_4:
-                tokenizer.model_input_names = ["input_ids", "bbox", "attention_mask", "position_ids"]
-
-                # Handles 4 inputs
-                # The tokenizer returns a tokens dict with the key "token_type_ids" instead of "bbox".
-                # For now, "token_type_ids" will be reused as bbox in this case.
-                # bbox is a bounding box with size [?, ?, 4],
-                # where the 4 numbers represent x_min, y_min, x_max, y_max.
-                tokens["token_type_ids"] = tokens["token_type_ids"].unsqueeze(-1).repeat(1, 1, 4)
-            else:
-                raise RuntimeError(f"Model: {self.name} not found in any of the registry lists.")
 
+        tokens = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
+
+        if self.name in models_with_input_names_4:
+            # Handles 4 inputs
+            # The tokenizer returns a tokens dict with the key "token_type_ids" instead of "bbox".
+            # For now, "token_type_ids" will be reused as bbox in this case.
+            # bbox is a bounding box with size [?, ?, 4],
+            # where the 4 numbers represent x_min, y_min, x_max, y_max.
+            tokens["token_type_ids"] = tokens["token_type_ids"].unsqueeze(-1).repeat(1, 1, 4)
+
+        self.input_name_to_shape_map = {k: v.shape for (k, v) in tokens.items()}
+        if self.name in models_with_input_names_3 or self.name in models_with_input_names_4:
+            # Handles 3 and 4 inputs
+            self.input_name_to_shape_map["position_ids"] = self.input_name_to_shape_map["input_ids"]
+            zeros = torch.zeros(*(self.input_name_to_shape_map["position_ids"]), dtype=int)
             inputs = (*list(tokens.values()), zeros)
+        else:
+            inputs = (*list(tokens.values()),)
 
         test_tensors = TestTensors(inputs)
         return test_tensors
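
Two of the moved lines are worth unpacking. First, the token_type_ids-to-bbox reuse: unsqueeze(-1) appends a trailing dimension and repeat(1, 1, 4) tiles it four times, turning a [batch, seq_len] tensor into the [batch, seq_len, 4] layout expected for bounding boxes. A minimal sketch with illustrative shapes:

    import torch

    # Stand-in for tokens["token_type_ids"] with batch=1, seq_len=3.
    token_type_ids = torch.zeros(1, 3, dtype=torch.long)

    # unsqueeze(-1): [1, 3] -> [1, 3, 1]; repeat(1, 1, 4): [1, 3, 1] -> [1, 3, 4].
    # Each token's value is copied into the four slots standing in for
    # x_min, y_min, x_max, y_max.
    bbox = token_type_ids.unsqueeze(-1).repeat(1, 1, 4)
    assert bbox.shape == (1, 3, 4)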
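
Second, the placeholder position_ids: the recorded input_ids shape is unpacked into torch.zeros, and the resulting tensor is appended to the tokenizer outputs to build the positional input tuple. A minimal sketch, assuming hand-written token tensors in place of real tokenizer output:

    import torch

    tokens = {
        "input_ids": torch.tensor([[101, 7592, 102]]),
        "attention_mask": torch.ones(1, 3, dtype=torch.long),
    }
    input_name_to_shape_map = {k: v.shape for (k, v) in tokens.items()}

    # position_ids mirrors the input_ids shape; unpacking torch.Size([1, 3])
    # gives torch.zeros(1, 3, dtype=int), where Python's int maps to int64.
    input_name_to_shape_map["position_ids"] = input_name_to_shape_map["input_ids"]
    zeros = torch.zeros(*(input_name_to_shape_map["position_ids"]), dtype=int)

    # The tuple is consumed positionally, so dict insertion order must line up
    # with tokenizer.model_input_names; zeros rides along as position_ids.
    inputs = (*list(tokens.values()), zeros)
    assert [t.shape for t in inputs] == [torch.Size([1, 3])] * 3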