 
 import torch
 
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import (
+    triton_scaled_mm,
+)
+from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.platforms import current_platform
 
-from .cutlass import CutlassScaledMMLinearKernel
-from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
+from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig
 
 
-class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
+class TritonScaledMMLinearKernel(ScaledMMLinearKernel):
     @classmethod
     def get_min_capability(cls) -> int:
         return 75
 
     @classmethod
-    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
-        if current_platform.is_cpu():
-            return (
-                False,
-                "TritonScaledMMLinearKernel requires Triton which is not "
-                + "currently supported on CPU.",
-            )
+    def is_supported(
+        cls, compute_capability: Optional[int] = None
+    ) -> tuple[bool, Optional[str]]:
+        if current_platform.is_rocm() or current_platform.is_cuda():
+            return True, None
+        return False, "Requires ROCm or CUDA."
+
+    @classmethod
+    def can_implement(
+        cls, c: ScaledMMLinearLayerConfig
+    ) -> tuple[bool, Optional[str]]:
         if not c.input_symmetric:
-            return (
-                False,
-                "TritonScaledMMLinearKernel only supports symmetric " + "quantization.",
-            )
+            return False, "Only symmetric input is supported."
         return True, None
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        super().process_weights_after_loading(layer)
+        # INPUT SCALE
+        if self.config.is_static_input_scheme:
+            input_scale = getattr(layer, self.i_s_name)
+            replace_parameter(
+                layer,
+                self.i_s_name,
+                torch.nn.Parameter(input_scale.max(), requires_grad=False),
+            )
+            setattr(layer, self.i_zp_name, None)
+        else:
+            setattr(layer, self.i_s_name, None)
+            setattr(layer, self.i_zp_name, None)
+
+        setattr(layer, self.azp_adj_name, None)
 
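An aside on the static-scale handling above: fused modules can load one input scale per merged partition, so the kernel keeps only a single tensor-wide scale, and max() is the conservative choice because it covers every partition's dynamic range. A minimal sketch of that reasoning with illustrative values (the names and numbers are hypothetical, not from this change):

    import torch

    # Hypothetical per-partition input scales loaded for a fused projection.
    loaded_scales = torch.tensor([0.021, 0.017, 0.025])

    # Taking the max can only widen the representable range; any smaller
    # scale would clip activations that some partition expected to keep.
    merged_scale = loaded_scales.max()
    assert torch.all(merged_scale >= loaded_scales)

The apply path is then rewritten as an explicit quantize-then-matmul:
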
     def apply_weights(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        return super().apply_weights(layer, x, bias)
+        w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
+
+        # Quantize the activations to int8, using the static scale when
+        # the checkpoint provides one (i_s) and dynamic per-token
+        # quantization otherwise.
+        x_q, x_s, x_zp = ops.scaled_int8_quant(
+            x.contiguous(), i_s, i_zp, symmetric=True
+        )
+
+        assert x_zp is None, "Triton kernel only supports symmetric quantization"
+
+        return triton_scaled_mm(
+            x_q, w_q, scale_a=x_s, scale_b=w_s, out_dtype=x.dtype, bias=bias
+        )
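
For orientation on the call above: triton_scaled_mm performs an int8-by-int8 matmul and rescales the accumulator by the activation and weight scales. The following plain-PyTorch sketch shows that math; the shapes, the (M, 1)/(1, N) scale broadcasting, and the scaled_mm_reference name are illustrative assumptions, not the kernel's exact contract.

    from typing import Optional

    import torch

    def scaled_mm_reference(
        x_q: torch.Tensor,       # (M, K) int8 activations
        w_q: torch.Tensor,       # (K, N) int8 weights
        scale_a: torch.Tensor,   # (M, 1) per-row or scalar activation scale
        scale_b: torch.Tensor,   # (1, N) per-column or scalar weight scale
        out_dtype: torch.dtype,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Float32 is used here for simplicity; the Triton kernel
        # accumulates int8 products in a wider integer type before
        # applying the scales.
        acc = x_q.to(torch.float32) @ w_q.to(torch.float32)
        out = acc * scale_a * scale_b  # dequantize with both scales
        if bias is not None:
            out = out + bias
        return out.to(out_dtype)

Because apply_weights asserts that the activation zero point is None, asymmetric inputs never reach this path, which is consistent with process_weights_after_loading clearing the zero-point and azp-adjustment attributes rather than preparing them.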