Commit ddee22f

wangxiyuan authored and xuebwang-amd committed
[Misc] rename torch_dtype to dtype (vllm-project#26695)
Signed-off-by: wangxiyuan <[email protected]>
Signed-off-by: xuebwang-amd <[email protected]>
1 parent 31f3bb5 commit ddee22f
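
The rename follows the transformers library, which accepts `dtype` in place of the older `torch_dtype` keyword (hence the `transformers >= 4.56.0` floor raised below). A minimal before/after sketch, reusing the Qwen model from the auto_round.md hunk:

```python
from transformers import AutoModelForCausalLM

# Old spelling, removed throughout vLLM by this commit:
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", torch_dtype="auto")

# New spelling, matching recent transformers releases:
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", dtype="auto")
```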

File tree

30 files changed, +52 -55 lines changed


benchmarks/kernels/benchmark_moe.py

Lines changed: 1 addition & 1 deletion
@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
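
The benchmark reads the model dtype from a transformers config object, now under its new attribute name. A minimal sketch of that lookup, with an illustrative model name (on ROCm the benchmark overrides the value with float16, as the hunk above shows):

```python
import torch
from transformers import AutoConfig

config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# Recent transformers expose the checkpoint dtype as `config.dtype`
# (formerly `config.torch_dtype`); it may be None for some configs.
dtype = config.dtype or torch.float16
print(dtype)
```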

benchmarks/kernels/benchmark_moe_permute_unpermute.py

Lines changed: 1 addition & 1 deletion
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
     topk = config.num_experts_per_tok

     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute

docs/features/quantization/auto_round.md

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRound

 model_name = "Qwen/Qwen3-0.6B"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)

 bits, group_size, sym = 4, 128, True

docs/features/quantization/fp8.md

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

docs/features/quantization/int4.md

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

docs/features/quantization/int8.md

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

docs/features/quantization/quantized_kvcache.md

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models

 # Select model and load it
 MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

 # Select calibration dataset

docs/features/quantization/quark.md

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ to fetch model and tokenizer.
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 model.eval()

docs/features/quantization/torchao.md

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
 quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
 quantized_model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    torch_dtype="auto",
+    dtype="auto",
     device_map="auto",
     quantization_config=quantization_config
 )

requirements/common.txt

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.55.2
+transformers >= 4.56.0
 tokenizers >= 0.21.1 # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
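
Raising the transformers floor to 4.56.0 is what lets the codebase drop the old spelling outright. Code that must straddle releases on both sides of the rename could instead guard the attribute access; a hedged sketch, not part of this commit:

```python
def config_dtype(config):
    """Return a config's model dtype under either the new (`dtype`)
    or the old (`torch_dtype`) attribute name."""
    return getattr(config, "dtype", None) or getattr(config, "torch_dtype", None)
```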
