
Commit 92a01f5

Fix Per Row scaling for inference
stack-info: PR: #2253, branch: drisspg/stack/56
1 parent a776b1f commit 92a01f5

File tree

6 files changed: +306 −162 lines

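Before the per-file diffs, a minimal sketch (not the torchao implementation) of what "Per Row" float8 scaling means for a linear weight: each output row gets its own scale, so an (out_features, in_features) weight carries a scale of shape (out_features, 1). The `quantize_fp8_per_row` helper and its amax/fp8-max scale convention below are illustrative assumptions; the (out_features, 1) scale shape is the property the updated test asserts.

```python
import torch

def quantize_fp8_per_row(weight: torch.Tensor, float8_dtype=torch.float8_e4m3fn):
    """Illustrative per-row float8 quantization: one scale per output row."""
    fp8_max = torch.finfo(float8_dtype).max
    # One amax per output row; keepdim so the scale broadcasts over in_features.
    amax = weight.abs().amax(dim=1, keepdim=True).clamp(min=1e-12)
    scale = amax / fp8_max                                  # shape (out_features, 1)
    weight_fp8 = (weight / scale).clamp(-fp8_max, fp8_max).to(float8_dtype)
    return weight_fp8, scale

w = torch.randn(1024, 512)
w_fp8, w_scale = quantize_fp8_per_row(w)
assert w_scale.shape == (1024, 1)
# Dequantize to sanity-check the round trip.
w_dq = w_fp8.to(torch.float32) * w_scale
```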

test/dtypes/test_affine_quantized_float.py

Lines changed: 47 additions & 13 deletions
@@ -297,21 +297,55 @@ def test_fp8_weight_dimension_warning(self):
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
-    def test_mm_float8dq(self):
+    @common_utils.parametrize(
+        "in_features,out_features", [(512, 1024), (256, 768), (1024, 512)]
+    )
+    @common_utils.parametrize(
+        "leading_shape", [(1,), (8,), (16,), (2, 8,), (2, 2, 16,)]
+    )  # fmt: skip
+    @common_utils.parametrize("bias", [True, False])
+    def test_mm_float8dq_per_row(
+        self, in_features, out_features, leading_shape, bias: bool
+    ):
         device = "cuda"
         dtype = torch.bfloat16
-        weight = torch.randn(512, 1024).to(device).to(dtype)
-        weight = weight.t()
-
-        l = torch.nn.Linear(512, 1024).to(device).to(dtype)
-        l.weight = torch.nn.Parameter(weight)
-        quantize_(l, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
-        # weight shape: 1024 x 512
-        weight = l.weight
-
-        input = torch.randn(1, 512, device=device, dtype=dtype)
-        # make sure it runs
-        torch.nn.functional.linear(input, weight)
+        input_shape = leading_shape + (in_features,)
+
+        ref_linear = (
+            torch.nn.Linear(in_features, out_features, bias=bias).to(device).to(dtype)
+        )
+        test_linear = copy.deepcopy(ref_linear)
+        quantize_(
+            test_linear, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+        )
+
+        quant_weight = test_linear.weight
+
+        self.assertTrue(hasattr(quant_weight, "original_weight_tensor"))
+        weight_impl = quant_weight.original_weight_tensor.tensor_impl
+
+        self.assertTrue(hasattr(weight_impl, "float8_data"))
+        self.assertTrue(hasattr(weight_impl, "scale"))
+        self.assertFalse(weight_impl.transposed)
+
+        # Verify scale shape for row-wise quantization
+        expected_scale_shape = (out_features, 1)
+        actual_scale_shape = weight_impl.scale.shape
+        self.assertEqual(actual_scale_shape, expected_scale_shape)
+
+        self.assertEqual(weight_impl.float8_data.shape, (out_features, in_features))
+
+        input_tensor = torch.randn(*input_shape, device=device, dtype=dtype)
+
+        with torch.no_grad():
+            ref_output = ref_linear(input_tensor)
+            quant_output = torch.nn.functional.linear(input_tensor, quant_weight)
+
+        expected_output_shape = input_tensor.shape[:-1] + (out_features,)
+        self.assertEqual(quant_output.shape, expected_output_shape)
+
+        error = compute_error(ref_output, quant_output)
+        assert error > 20, f"Quantization error is too high got a SQNR of {error}"
 
 
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)
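For reference, a hedged usage sketch mirroring the new test: quantize a `Linear` with per-row float8 dynamic quantization and push a 3D-batched input through it. It assumes an FP8-capable GPU (compute capability 8.9+, matching the `skipIf` above); the import paths follow current torchao conventions but may vary between versions.

```python
import copy
import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

linear = torch.nn.Linear(256, 768, bias=True).to("cuda", torch.bfloat16)
ref = copy.deepcopy(linear)
# Swap the weight for a per-row float8 quantized tensor in place.
quantize_(linear, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))

x = torch.randn(2, 8, 256, device="cuda", dtype=torch.bfloat16)
with torch.no_grad():
    out = linear(x)      # goes through the float8 rowwise path
    ref_out = ref(x)     # bf16 reference for comparison
print(out.shape)         # torch.Size([2, 8, 768])
```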

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 3 additions & 4 deletions
@@ -462,10 +462,10 @@ def from_hp_to_floatx(
         if target_dtype in FP8_TYPES:
             original_shape = input_float.shape
             input_float = _layout.pre_process(input_float)
-
-            scale = choose_qparams_affine_float8(input_float, float8_dtype=target_dtype)
+            scale = choose_qparams_affine_float8(
+                input_float, float8_dtype=target_dtype, block_size=block_size
+            )
             data = quantize_affine_float8(input_float, scale, target_dtype)
-
             data, scale, zero_point = _layout.post_process(
                 data, scale, None, block_size
             )
@@ -503,7 +503,6 @@ def from_hp_to_floatx_static(
                 input_float,
                 scale,
                 target_dtype,
-                scale_dtype,
             )
 
             data, scale, zero_point = _layout.post_process(
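The change above threads `block_size` into the float8 qparam computation, so the scale granularity follows the block size. A rough sketch of that relationship using plain torch and a hypothetical `block_amax_scale` helper (not the torchao API): a block equal to the full tensor yields one scale, while a (1, in_features) block yields one scale per output row.

```python
import torch

def block_amax_scale(x: torch.Tensor, block_size, float8_dtype=torch.float8_e4m3fn):
    """Illustrative: one scale per block, reduced as an amax over each block (2D only)."""
    fp8_max = torch.finfo(float8_dtype).max
    scale_shape = []
    for dim, block in zip(x.shape, block_size):
        assert dim % block == 0
        scale_shape.append(dim // block)
    # Split each dimension into (num_blocks, block) and reduce the block axes.
    x_blocked = x.reshape(scale_shape[0], block_size[0], scale_shape[1], block_size[1])
    amax = x_blocked.abs().amax(dim=(1, 3)).clamp(min=1e-12)
    return amax / fp8_max

w = torch.randn(1024, 512)
per_tensor = block_amax_scale(w, block_size=(1024, 512))  # shape (1, 1)
per_row = block_amax_scale(w, block_size=(1, 512))        # shape (1024, 1), the per-row case
```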

torchao/dtypes/floatx/float8_layout.py

Lines changed: 122 additions & 85 deletions
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch.utils._python_dispatch import (
@@ -26,14 +26,25 @@
 from torchao.utils import _is_float8_type, fill_defaults
 
 aten = torch.ops.aten
+FLOAT8_IMPL_OPS_TABLE: Dict[Any, Any] = {}
+
+
+def implements(aten_ops: List[Any]):
+    """Register aten ops to the float8 op table"""
+
+    def decorator(func):
+        for op in aten_ops:
+            FLOAT8_IMPL_OPS_TABLE[op] = func
+        return func
+
+    return decorator
 
 
 def _same_metadata(self: "Float8AQTTensorImpl", src: "Float8AQTTensorImpl") -> bool:
     # Special handling for transposed attribute
     transposed_match = (self.transposed == src.transposed) or (
         self.transposed is False and src.transposed is None
     )
-
     return (
         isinstance(self, Float8AQTTensorImpl)
         and isinstance(src, Float8AQTTensorImpl)
@@ -160,90 +171,23 @@ def __tensor_unflatten__(
     def __torch_dispatch__(cls, func, types, args, kwargs):
         kwargs = {} if kwargs is None else kwargs
 
-        if func is aten.detach.default:
-            return return_and_correct_aliasing(
-                func, args, kwargs, args[0]._apply_fn_to_data(torch.detach)
-            )
-        elif func is aten.clone.default:
-            return return_and_correct_aliasing(
-                func, args, kwargs, args[0]._apply_fn_to_data(torch.clone)
-            )
-        elif func is aten.t.default:
-            """we don't need to repack the weight and just rely on external
-            shape being changed and record the status of transpose/no-transpose
-            """
-            args[0].transposed = not args[0].transposed
-            return return_and_correct_aliasing(func, args, kwargs, args[0])
-        elif func is aten.copy_.default:
-            self = args[0]
-            src = args[1]
-            if _same_metadata(self, src):
-                self_tensors = self.__tensor_flatten__()[0]
-                for tensor_name in self_tensors:
-                    getattr(self, tensor_name).copy_(getattr(src, tensor_name))
-                return
-            raise ValueError(
-                f"Not supported args for copy_ due to metadata mistach: {args[0], args[1]}"
-            )
-        elif func in [aten.select.int, aten.index.Tensor]:
-            return return_and_correct_aliasing(
-                func,
-                args,
-                kwargs,
-                args[0]._apply_fn_to_data(lambda x: func(x, *args[1:], **kwargs)),
-            )
-        elif func is aten.slice.Tensor:
-            self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1])
-            if dim == 0:
-                # TODO: scale replecation should be dependent on block size
-                if self.scale.ndim == 1:
-                    return return_and_correct_aliasing(
-                        func,
-                        args,
-                        kwargs,
-                        args[0]._apply_fn_to_data(
-                            lambda x: aten.slice.Tensor(x, dim, start, end, step)
-                        ),
-                    )
-                elif self.scale.ndim == 0:
-                    return return_and_correct_aliasing(
-                        func,
-                        args,
-                        kwargs,
-                        Float8AQTTensorImpl(
-                            aten.slice.Tensor(self.float8_data, dim, start, end, step),
-                            self.scale,
-                            None,
-                            self._layout,
-                        ),
-                    )
-                else:
-                    raise NotImplementedError(
-                        f"Float8AQTTensorImpl dispatch: attempting to run {func}, with scale ndim={dim}, that is not supported"
-                    )
-            elif dim == 1:
-                return return_and_correct_aliasing(
-                    func,
-                    args,
-                    kwargs,
-                    Float8AQTTensorImpl(
-                        aten.slice.Tensor(
-                            self.float8_data, dim, start, end, step
-                        ).contiguous(),
-                        self.scale,
-                        None,
-                        self._layout,
-                    ),
-                )
-            else:
-                raise NotImplementedError(
-                    f"Float8AQTTensorImpl dispatch: attempting to run {func}, with dim={dim}, that is not supported"
+        def allowed_subclasses(type):
+            return (
+                issubclass(cls, type)
+                or issubclass(torch._subclasses.fake_tensor.FakeTensor, type)
+                or issubclass(
+                    torch._subclasses.functional_tensor.FunctionalTensor, type
                 )
-        else:
-            raise NotImplementedError(
-                f"Float8AQTTensorImpl dispatch: attempting to run {func}, this is not supported"
             )
 
+        if not all(allowed_subclasses(t) for t in types):
+            return NotImplemented
+
+        if func in FLOAT8_IMPL_OPS_TABLE:
+            return FLOAT8_IMPL_OPS_TABLE[func](func, types, args, kwargs)
+
+        raise NotImplementedError(f"attempting to run {func}, this is not supported")
+
     __torch_function__ = torch._C._disabled_torch_function_impl
 
     def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
@@ -281,6 +225,100 @@ def __repr__(self):
         )
 
 
+##########################
+# Regsiter FP8 Ops
+##########################
+
+
+@implements([aten.detach.default, aten.alias.default, aten.clone.default])
+def _(func, types, args, kwargs):
+    return return_and_correct_aliasing(
+        func, args, kwargs, args[0]._apply_fn_to_data(func)
+    )
+
+
+@implements([aten.t.default])
+def _(func, types, args, kwargs):
+    """we don't need to repack the weight and just rely on external
+    shape being changed and record the status of transpose/no-transpose
+    """
+    args[0].transposed = not args[0].transposed
+    return return_and_correct_aliasing(func, args, kwargs, args[0])
+
+
+@implements([aten.copy_.default])
+def _(func, types, args, kwargs):
+    self = args[0]
+    src = args[1]
+    if _same_metadata(self, src):
+        self_tensors = self.__tensor_flatten__()[0]
+        for tensor_name in self_tensors:
+            getattr(self, tensor_name).copy_(getattr(src, tensor_name))
+        return
+    raise ValueError(
+        f"Not supported args for copy_ due to metadata mismatch: {args[0], args[1]}"
+    )
+
+
+@implements([aten.select.int, aten.index.Tensor])
+def _(func, types, args, kwargs):
+    return return_and_correct_aliasing(
+        func,
+        args,
+        kwargs,
+        args[0]._apply_fn_to_data(lambda x: func(x, *args[1:], **kwargs)),
+    )
+
+
+@implements([aten.slice.Tensor])
+def _(func, types, args, kwargs):
+    self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1])
+    if dim == 0:
+        if self.scale.numel() == 1:
+            # Per Tensor
+            return return_and_correct_aliasing(
+                func,
+                args,
+                kwargs,
+                Float8AQTTensorImpl(
+                    aten.slice.Tensor(self.float8_data, dim, start, end, step),
+                    self.scale,
+                    self.transposed,
+                    self._layout,
+                ),
+            )
+        elif self.scale.ndim == 2:
+            # TODO: scale replication should be dependent on block size
+            return return_and_correct_aliasing(
+                func,
+                args,
+                kwargs,
+                args[0]._apply_fn_to_data(
+                    lambda x: aten.slice.Tensor(x, dim, start, end, step)
+                ),
+            )
+        else:
+            raise NotImplementedError(
+                f"Float8AQTTensorImpl dispatch: attempting to run {func}, with scale ndim={dim}, that is not supported"
+            )
+    elif dim == 1:
+        return return_and_correct_aliasing(
+            func,
+            args,
+            kwargs,
+            Float8AQTTensorImpl(
+                aten.slice.Tensor(self.float8_data, dim, start, end, step).contiguous(),
+                self.scale,
+                self.transposed,
+                self._layout,
+            ),
+        )
+    else:
+        raise NotImplementedError(
+            f"Float8AQTTensorImpl dispatch: attempting to run {func}, with dim={dim}, that is not supported"
+        )
+
+
 ##########################
 # Float8 Dispatch Kernels
 ##########################
@@ -333,13 +371,12 @@ def _linear_fp8_act_fp8_weight_impl(
     input_scale = input_tensor.tensor_impl.scale
     # Handle case where input tensor is more than 2D
     inpt_data = inpt_data.reshape(-1, inpt_data.shape[-1])
-
     # Handle rowwise case
     if _is_rowwise_scaled(weight_tensor):
         assert _is_rowwise_scaled(input_tensor), (
            "Input tensor must be rowwise block size"
        )
-        w_scale = w_scale.unsqueeze(-1).T
+        w_scale = w_scale.T
        input_scale = preprocess_scale(input_scale, input_tensor.shape)
 
     # Preprocess data
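The final hunk is the inference fix named in the commit title: the per-row weight scale is now a 2D (out_features, 1) tensor, so a plain `w_scale.T` already gives the (1, out_features) layout the rowwise scaled-mm path wants; the previous `w_scale.unsqueeze(-1).T` appears to have been written for a 1D scale. A plain-torch emulation of the rowwise scaling math (illustrative only; the real path dispatches to a fused scaled-mm kernel):

```python
import torch

# Shapes only; these tensors stand in for the fp8 payloads and their per-row scales.
M, K, N = 16, 512, 1024
x_data = torch.randn(M, K)
w_data = torch.randn(N, K)
x_scale = torch.rand(M, 1) + 0.5   # per-row activation scale, shape (M, 1)
w_scale = torch.rand(N, 1) + 0.5   # per-row weight scale, shape (N, 1)

# Reference: dequantize first, then matmul.
out_ref = (x_data * x_scale) @ (w_data * w_scale).T

# Factored form applied after the raw GEMM: the weight scale must broadcast along
# the output columns, i.e. as w_scale.T with shape (1, N).
out = (x_data @ w_data.T) * x_scale * w_scale.T
assert torch.allclose(out, out_ref, rtol=1e-4, atol=1e-4)
```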
