
import torch

+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import (  # noqa: E501
+    triton_scaled_mm,
+)
+from vllm.model_executor.layers.quantization.utils import replace_parameter
from vllm.platforms import current_platform

-from .cutlass import CutlassScaledMMLinearKernel
-from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
+from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig


-class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
+class TritonScaledMMLinearKernel(ScaledMMLinearKernel):
    @classmethod
    def get_min_capability(cls) -> int:
        return 75

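+    # Triton kernels run on both NVIDIA and AMD GPUs, so availability is
+    # keyed on "CUDA-alike" platforms (CUDA or ROCm) rather than CUDA alone.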
+    @classmethod
+    def is_supported(
+        cls, compute_capability: int | None = None
+    ) -> tuple[bool, str | None]:
+        if current_platform.is_cuda_alike():
+            return True, None
+        return False, "Requires ROCm or CUDA."
+
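+    # Platform gating lives in is_supported() above; this check only
+    # validates the quantization config itself.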
    @classmethod
    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
-        if current_platform.is_cpu():
-            return (
-                False,
-                "TritonScaledMMLinearKernel requires Triton which is not "
-                + "currently supported on CPU.",
-            )
        if not c.input_symmetric:
-            return (
-                False,
-                "TritonScaledMMLinearKernel only supports symmetric " + "quantization.",
-            )
+            return False, "Only symmetric input is supported."
        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        super().process_weights_after_loading(layer)
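+        # WEIGHT
+        # Loaded as (out_features, in_features); transpose so the GEMM in
+        # apply_weights can compute x_q @ w_q directly.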
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(
+            layer,
+            self.w_q_name,
+            torch.nn.Parameter(weight.t().data, requires_grad=False),
+        )
+
+        # INPUT SCALE
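+        # Static scheme: fused modules (e.g. QKV) may load one scale per
+        # shard, so collapse them to a single per-tensor scale with max().
+        # Dynamic scheme: scales are computed from activations at runtime,
+        # so nothing is kept on the layer.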
+        if self.config.is_static_input_scheme:
+            input_scale = getattr(layer, self.i_s_name)
+            replace_parameter(
+                layer,
+                self.i_s_name,
+                torch.nn.Parameter(input_scale.max(), requires_grad=False),
+            )
+            setattr(layer, self.i_zp_name, None)
+        else:
+            setattr(layer, self.i_s_name, None)
+            setattr(layer, self.i_zp_name, None)
+
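+        # Symmetric-only path: no asymmetric zero-point adjustment is kept.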
+        setattr(layer, self.azp_adj_name, None)

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
-        return super().apply_weights(layer, x, bias)
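+        # Quantize activations to int8 on the fly: per-tensor with the static
+        # scale when i_s is set, dynamic per-token scales when i_s is None.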
+        w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
+
+        x_q, x_s, x_zp = ops.scaled_int8_quant(
+            x.contiguous(), i_s, i_zp, symmetric=True
+        )
+
+        assert x_zp is None, "Triton kernel only supports symmetric quantization"
+
+        return triton_scaled_mm(
+            x_q, w_q, scale_a=x_s, scale_b=w_s, out_dtype=x.dtype, bias=bias
+        )