pytorch · namgyu-youn · Sep 21, 2025 · Sep 22, 2025 · Sep 23, 2025 · Sep 24, 2025
diff --git a/docs/source/quantization_overview.rst b/docs/source/quantization_overview.rst
@@ -5,7 +5,7 @@ First we want to lay out the torchao stack::
 
   Quantization Algorithms/Flows: weight only/dynamic/static quantization, hqq, awq, gptq etc.
   ---------------------------------------------------------------------------------------------
-      Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Float8Tensor
+      Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Int8Tensor, Float8Tensor
   ---------------------------------------------------------------------------------------------
     Quantization Primitive Ops/Efficient Kernels: matmul, quantize, dequantize
   ---------------------------------------------------------------------------------------------
@@ -88,6 +88,10 @@ So in general we structure Tensor subclasses by dervied dtpype and packing forma
      - scaled int4
      - preshuffled (special format to optimize for loading)
      - float8 act + int4 weight dynamic quantization and int4 weight only quantization
+   * - Int8Tensor
+     - scaled int8
+     - plain
+     - int8 weight only quantization and int8 act + int8 weight dynamic quantization
 
 .. note::
    We don't have granularity specific tensor subclasses, i.e. no Float8RowwiseTensor or Float8BlockwiseTensor, all granularities are implemented in the same Tensor, we typically use a general `block_size` attribute to distinguish between different granularities, and each Tensor is allowed to support only a subset of all possible granularity options.

diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -0,0 +1,88 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+
+from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
+    Int8Tensor,
+    QuantizeTensorToInt8Kwargs,
+)
+from torchao.quantization.utils import compute_error
+from torchao.testing.utils import TorchAOIntegrationTestCase
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+class TestInt8Tensor(TorchAOIntegrationTestCase):
+    """Unit tests for Int8Tensor construction, linear dispatch and dequantization."""
+
+    def setUp(self):
+        """Build a small seeded weight/input/bias fixture shared by every test."""
+        super().setUp()
+        torch.manual_seed(42)
+        # Keep the randn call order fixed so the seeded values stay the same.
+        self.weight_fp = torch.randn(4, 3, dtype=torch.float32)
+        self.input_fp = torch.randn(2, 3, dtype=torch.float32)
+        self.bias = torch.randn(4)
+        self.block_size = [4, 3]
+
+    def test_creation_and_attributes(self):
+        """Quantized tensor keeps the source shape and stores in-range int8 data"""
+        quantized = Int8Tensor.from_hp(self.weight_fp, self.block_size)
+
+        self.assertEqual(quantized.shape, (4, 3))
+        self.assertEqual(quantized.qdata.dtype, torch.int8)
+        within_int8_range = (quantized.qdata >= -128) & (quantized.qdata <= 127)
+        self.assertTrue(torch.all(within_int8_range))
+
+    def test_linear_operations(self):
+        """fp x int8 and int8 x int8 linear both match the fp reference closely"""
+        weight_q8 = Int8Tensor.from_hp(self.weight_fp, self.block_size)
+        input_q8 = Int8Tensor.from_hp(self.input_fp, self.block_size)
+
+        linear = torch.nn.functional.linear
+        reference = linear(self.input_fp, self.weight_fp, self.bias)
+        outputs = (
+            linear(self.input_fp, weight_q8, self.bias),
+            linear(input_q8, weight_q8, self.bias),
+        )
+
+        for output in outputs:
+            self.assertEqual(output.shape, reference.shape)
+        for output in outputs:
+            self.assertTrue(compute_error(output, reference) > 10)
+
+    def test_dynamic_quantization(self):
+        """A weight carrying act_quant_kwargs yields an output of the reference shape"""
+        dynamic_weight = Int8Tensor.from_hp(
+            self.weight_fp,
+            self.block_size,
+            act_quant_kwargs=QuantizeTensorToInt8Kwargs(),
+        )
+
+        linear = torch.nn.functional.linear
+        reference = linear(self.input_fp, self.weight_fp, self.bias)
+        result_dynamic = linear(self.input_fp, dynamic_weight, self.bias)
+
+        self.assertEqual(result_dynamic.shape, reference.shape)
+
+    def test_error_handling_and_dequant(self):
+        """Invalid inputs are rejected and dequantization stays close to the source"""
+        expected_errors = (AssertionError, ValueError, RuntimeError)
+        invalid_cases = (
+            # 1D tensor
+            (torch.randn(5), [1]),
+            # block_size whose length does not match the tensor rank
+            (self.weight_fp, [1]),
+        )
+        for bad_tensor, bad_block_size in invalid_cases:
+            with self.assertRaises(expected_errors):
+                Int8Tensor.from_hp(bad_tensor, bad_block_size)
+
+        # Round-trip a tensor with exactly representable values
+        source = torch.tensor([[1.0, -1.0]], dtype=torch.float32)
+        quantized = Int8Tensor.from_hp(source, [1, 1])
+
+        restored = torch.ops.aten.dequantize.self(quantized)
+        self.assertEqual(restored.shape, source.shape)
+        max_abs_error = torch.abs(restored - source).max().item()
+        self.assertLess(max_abs_error, 0.1)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
@@ -95,6 +95,7 @@
     Int4PreshuffledTensor,
     Int4Tensor,
     Int4TilePackedTo4dTensor,
+    Int8Tensor,
     IntxOpaqueTensor,
     IntxUnpackedToInt8Tensor,
 )
@@ -168,6 +169,7 @@
     "IntxOpaqueTensor",
     "IntxUnpackedToInt8Tensor",
     "Int4TilePackedTo4dTensor",
+    "Int8Tensor",
     "Float8Tensor",
     "Int4OpaqueTensor",
     # smooth quant - subject to change

diff --git a/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py b/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py
@@ -39,7 +39,9 @@ def _choose_quant_func_and_quantize_tensor(
     """
     from torchao.quantization.quantize_.workflows import (
         Float8Tensor,
+        Int8Tensor,
         QuantizeTensorToFloat8Kwargs,
+        QuantizeTensorToInt8Kwargs,
     )
 
     if isinstance(quant_kwargs, QuantizeTensorToFloat8Kwargs):
@@ -52,5 +54,11 @@ def _choose_quant_func_and_quantize_tensor(
             quant_kwargs.hp_value_ub,
             quant_kwargs.kernel_preference,
         )
+    elif isinstance(quant_kwargs, QuantizeTensorToInt8Kwargs):
+        return Int8Tensor.from_hp(
+            tensor,
+            quant_kwargs.block_size or [1, tensor.shape[-1]],
+            kernel_preference=quant_kwargs.kernel_preference,
+        )
 
     raise NotImplementedError(f"Quant kwargs not supported: {quant_kwargs}")
diff --git a/torchao/quantization/quantize_/workflows/__init__.py b/torchao/quantization/quantize_/workflows/__init__.py
@@ -20,6 +20,10 @@
     Int4Tensor,
 )
 from .int4.int4_tile_packed_to_4d_tensor import Int4TilePackedTo4dTensor
+from .int8.int8_tensor import (
+    Int8Tensor,
+    QuantizeTensorToInt8Kwargs,
+)
 from .intx.intx_opaque_tensor import (
     IntxOpaqueTensor,
 )
@@ -36,6 +40,8 @@
     "Int4MarlinSparseTensor",
     "Int4PlainInt32Tensor",
     "Int4TilePackedTo4dTensor",
+    "Int8Tensor",
+    "QuantizeTensorToInt8Kwargs",
     "Float8Tensor",
     "QuantizeTensorToFloat8Kwargs",
     "Int4OpaqueTensor",

diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
@@ -0,0 +1,193 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+from torchao.quantization.quantize_.common import (
+    KernelPreference,
+    QuantizeTensorKwargs,
+    _choose_quant_func_and_quantize_tensor,
+)
+from torchao.utils import TorchAOBaseTensor
+
+__all__ = ["Int8Tensor", "QuantizeTensorToInt8Kwargs"]
+
+aten = torch.ops.aten
+
+
+@dataclass
+class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs):
+    """Tensor kwargs for creating int8 tensor (either activation or weight)
+
+    Args:
+        kernel_preference (KernelPreference): kernel preference for ops like matmul, grouped matmul etc.
+        block_size (Optional[list[int]]): block size for quantization granularity;
+            when left as None, `_choose_quant_func_and_quantize_tensor` falls back to
+            `[1, tensor.shape[-1]]`, i.e. one block per row of the last dimension
+    """
+
+    kernel_preference: KernelPreference = KernelPreference.AUTO
+    block_size: Optional[list[int]] = None
+
+
+# TODO: Implement block-wise quantization using block_size
+class Int8Tensor(TorchAOBaseTensor):
+    """
+    int8 quantized tensor with plain layout
+
+    `from_hp` quantizes symmetrically with one scale per row of a 2D tensor
+    and an all-zero zero point. `block_size` is stored but does not yet
+    change how the scale is computed.
+
+    Tensor Attributes:
+        qdata: (N, K) int8 quantized weight data
+        scale: scale factors for dequantization ((N,) when built by `from_hp`)
+        zero_point: zero points for dequantization ((N,) int8 zeros when built by `from_hp`)
+
+    Non-Tensor Attributes:
+        block_size: block size for quantization granularity
+        shape: original tensor shape
+        act_quant_kwargs: flags for static/dynamic activation quantization
+        kernel_preference: kernel preference for operations
+        dtype: dtype reported by the wrapper tensor (falls back to the scale dtype)
+    """
+
+    tensor_data_names = ["qdata", "scale", "zero_point"]
+    # Required non-tensor constructor arguments, in `__new__` order.
+    # NOTE(review): assumes TorchAOBaseTensor rebuilds instances positionally from
+    # these lists; `shape` was missing although the constructor requires it
+    # right after `block_size` - confirm against the base class.
+    tensor_attribute_names = ["block_size", "shape"]
+    optional_tensor_attribute_names = [
+        "act_quant_kwargs",
+        "kernel_preference",
+        "dtype",
+    ]
+
+    def __new__(
+        cls,
+        qdata,
+        scale,
+        zero_point,
+        block_size,
+        shape,
+        act_quant_kwargs=None,
+        kernel_preference=KernelPreference.AUTO,
+        dtype=None,
+    ):
+        """Create the wrapper tensor; device comes from `qdata`, dtype from `dtype` or `scale`."""
+        kwargs = {
+            "device": qdata.device,
+            "dtype": dtype or scale.dtype,
+            "requires_grad": False,
+        }
+        return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)
+
+    def __init__(
+        self,
+        qdata,
+        scale,
+        zero_point,
+        block_size,
+        shape,
+        act_quant_kwargs=None,
+        kernel_preference=KernelPreference.AUTO,
+        dtype=None,
+    ):
+        """Store the quantized payload; `shape` and `dtype` were already consumed by `__new__`."""
+        super().__init__()
+        self.qdata = qdata
+        self.scale = scale
+        self.zero_point = zero_point
+        self.block_size = block_size
+        self.act_quant_kwargs = act_quant_kwargs
+        self.kernel_preference = kernel_preference
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}({self.act_quant_kwargs=}, {self.qdata=}, {self.scale=}, "
+            f"{self.zero_point=}, {self.block_size=}, {self.kernel_preference=}, "
+            f"{self.shape=}, {self.device=}, {self.dtype=})"
+        )
+
+    @classmethod
+    def from_hp(
+        cls,
+        w: torch.Tensor,
+        block_size: list[int],
+        act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None,
+        kernel_preference: KernelPreference = KernelPreference.AUTO,
+    ):
+        """Quantize a 2D high precision tensor to int8, one scale per row.
+
+        Args:
+            w: 2D tensor to quantize
+            block_size: length-2 block size; only validated and stored for now,
+                the scale is always computed per row
+            act_quant_kwargs: if set, activations are quantized in linear
+            kernel_preference: kernel preference for operations
+
+        Returns:
+            Int8Tensor with the shape and dtype of `w`
+
+        Raises:
+            ValueError: if `w` is not 2D or `block_size` does not have length 2
+        """
+        if w.dim() != 2 or len(block_size) != 2:
+            raise ValueError("Expected 2D tensor and block_size length 2")
+
+        # Symmetric per-row scale: the row's max magnitude maps to 127.
+        scale = w.abs().max(dim=-1, keepdim=True)[0] / 127.0
+        # Lower bound keeps all-zero rows from dividing by zero.
+        scale = scale.clamp(min=1e-6)
+
+        int_data = torch.round(w / scale).clamp(-128, 127).to(torch.int8)
+
+        # scale and zero_point are stored without the trailing singleton dim;
+        # `dequantize` restores it with unsqueeze(1).
+        return cls(
+            int_data,
+            scale.squeeze(-1),
+            torch.zeros_like(scale.squeeze(-1), dtype=torch.int8),
+            block_size,
+            w.shape,
+            act_quant_kwargs=act_quant_kwargs,
+            kernel_preference=kernel_preference,
+            dtype=w.dtype,
+        )
+
+    def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
+        """Dequantize int8 tensor to floating point
+
+        Computes `(qdata - zero_point) * scale`, broadcasting scale and
+        zero_point along dim 1, in `output_dtype` if given, else the tensor's
+        own dtype, else the scale dtype.
+        """
+        dtype = output_dtype or self.dtype or self.scale.dtype
+        return (
+            self.qdata.to(dtype) - self.zero_point.to(dtype).unsqueeze(1)
+        ) * self.scale.to(dtype).unsqueeze(1)
+
+
+implements = Int8Tensor.implements
+
+
+@implements([aten.dequantize.self])
+def _(func, types, args, kwargs):
+    """aten.dequantize handler: int8 -> float in the tensor's own dtype"""
+    # Int8Tensor.dequantize() with no output_dtype applies the same
+    # `dtype or scale.dtype` fallback and the same (q - zp) * scale formula.
+    return args[0].dequantize()
+
+
+@implements([torch.nn.functional.linear, aten.linear.default])
+def _(func, types, args, kwargs):
+    """linear with an Int8Tensor weight
+
+    Two paths:
+      * int8 x int8: the activation is an Int8Tensor (passed in directly or
+        quantized here when the weight carries `act_quant_kwargs`); the
+        product is accumulated in int32 and rescaled by both scales.
+      * fp x int8 (weight only): the weight is dequantized and a regular
+        floating point linear is run.
+
+    `bias`, if any, is added after the matmul in both paths.
+    """
+    input_tensor, weight_tensor = args[0], args[1]
+    # bias may be positional or a keyword (e.g. F.linear(x, w, bias=b));
+    # reading only `args` would silently drop a keyword bias.
+    bias = args[2] if len(args) > 2 else (kwargs or {}).get("bias")
+
+    assert isinstance(weight_tensor, Int8Tensor), (
+        f"Expected weight to be Int8Tensor, got {type(weight_tensor)}"
+    )
+
+    # Remember the leading dims so the output can be restored after flattening.
+    leading_shape = input_tensor.shape[:-1]
+
+    # Dynamic activation quantization if enabled; an input that is already
+    # an Int8Tensor must not be quantized a second time.
+    if weight_tensor.act_quant_kwargs is not None and not isinstance(
+        input_tensor, Int8Tensor
+    ):
+        # Int8Tensor.from_hp only accepts 2D tensors, so collapse leading dims.
+        input_tensor = _choose_quant_func_and_quantize_tensor(
+            input_tensor.reshape(-1, input_tensor.shape[-1]),
+            weight_tensor.act_quant_kwargs,
+        )
+
+    if isinstance(input_tensor, Int8Tensor):
+        # INT8 x INT8: integer matmul, then rescale by both scales
+        x_int32 = input_tensor.qdata.to(torch.int32)
+        w_int32 = weight_tensor.qdata.to(torch.int32).t()
+
+        result = torch.mm(x_int32.view(-1, x_int32.size(-1)), w_int32)
+        scale = input_tensor.scale.view(-1, 1) * weight_tensor.scale.unsqueeze(0)
+        result = result.to(scale.dtype) * scale
+        result = result.view(*leading_shape, -1)
+    else:
+        # FP x INT8 (weight only): dequantize the weight and run a float linear
+        result = torch.nn.functional.linear(
+            input_tensor, weight_tensor.dequantize(), None
+        )
+
+    return result + bias if bias is not None else result
+
+
+# Report the class under the public package path it is exported from.
+Int8Tensor.__module__ = "torchao.quantization"
+# Register both classes as safe globals for torch.serialization, so checkpoints
+# containing them can be unpickled - presumably for torch.load(weights_only=True); confirm.
+torch.serialization.add_safe_globals([Int8Tensor, QuantizeTensorToInt8Kwargs])