diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index 70b16b86fda..e0256437022 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -372,3 +372,30 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: # Add 16-bit quantizers for LinearPattern quantizers.append(CadenceAtenQuantizer(LinearPattern(), qconfig_A16)) super().__init__(quantizers) + + +class CadenceWith16BitConvActivationsQuantizer(CadenceQuantizer): + """ + Quantizer including A16 conv + """ + + def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: + if quantizers is None: + quantizers = [] + # Add 16-bit quantizers for Conv patterns + quantizers.append(CadenceAtenQuantizer(Conv1dPattern(), qconfig_A16)) + quantizers.append(CadenceAtenQuantizer(Conv2dPattern(), qconfig_A16)) + super().__init__(quantizers) + + +class CadenceWith16BitMatmulActivationsQuantizer(CadenceQuantizer): + """ + Quantizer including A16 matmul + """ + + def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: + if quantizers is None: + quantizers = [] + # Add 16-bit quantizers for MatmulPattern + quantizers.append(CadenceAtenQuantizer(MatmulPattern(), qconfig_A16)) + super().__init__(quantizers) diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp index 984747d9316..fdc2c9ad5dc 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) @@ -532,6 +533,30 @@ void quantized_conv2d_nchw_out( __ET_UNUSED const Tensor& out_multiplier, __ET_UNUSED const Tensor& out_shift, Tensor& out) { + // Handle W8A16 heterogeneous type (int16_t activations, int8_t weights) + 
if (out.scalar_type() == ::executorch::aten::ScalarType::Short && + input.scalar_type() == ::executorch::aten::ScalarType::Short && + weight.scalar_type() == ::executorch::aten::ScalarType::Char) { + ::impl::generic::native::quantized_conv2d_nchw_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + out); + return; + } + const float bias_scale_float = bias_scale.const_data_ptr()[0]; const int32_t weight_zero_point_int = weight_zero_point.const_data_ptr()[0]; @@ -596,6 +621,30 @@ void quantized_conv2d_nchw_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { + // Handle W8A16 heterogeneous type (int16_t activations, int8_t weights) + if (out.scalar_type() == ::executorch::aten::ScalarType::Short && + input.scalar_type() == ::executorch::aten::ScalarType::Short && + weight.scalar_type() == ::executorch::aten::ScalarType::Char) { + ::impl::generic::native::quantized_conv2d_nchw_per_tensor_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + out); + return; + } + bool optimized = 0; if ((input.scalar_type() == ScalarType::Char) || diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp index a5d503853c4..55074199a77 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) @@ -438,6 +439,29 @@ void quantized_conv2d_nhwc_out( __ET_UNUSED const Tensor& out_multiplier, __ET_UNUSED const Tensor& out_shift, Tensor& out) { + // 
Handle W8A16 heterogeneous type (int16_t activations, int8_t weights) + if (out.scalar_type() == ::executorch::aten::ScalarType::Short && + input.scalar_type() == ::executorch::aten::ScalarType::Short && + weight.scalar_type() == ::executorch::aten::ScalarType::Char) { + ::impl::generic::native::quantized_conv2d_nhwc_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + out); + return; + } const float bias_scale_float = bias_scale.const_data_ptr()[0]; const int32_t weight_zero_point_int = weight_zero_point.const_data_ptr()[0]; @@ -502,8 +526,31 @@ void quantized_conv2d_nhwc_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - bool optimized = 0; + // Handle W8A16 heterogeneous type (int16_t activations, int8_t weights) + if (out.scalar_type() == ::executorch::aten::ScalarType::Short && + input.scalar_type() == ::executorch::aten::ScalarType::Short && + weight.scalar_type() == ::executorch::aten::ScalarType::Char) { + ::impl::generic::native::quantized_conv2d_nhwc_per_tensor_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + out); + return; + } + bool optimized = 0; if ((input.scalar_type() == ScalarType::Char) || (input.scalar_type() == ScalarType::Byte)) optimized = 1; diff --git a/backends/cadence/hifi/operators/op_quantized_linear_out.cpp b/backends/cadence/hifi/operators/op_quantized_linear_out.cpp index 84aff1c2f41..4f0973ce6bf 100644 --- a/backends/cadence/hifi/operators/op_quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_linear_out.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -218,7 +219,24 @@ void quantized_linear_out( int64_t out_zero_point, 
__ET_UNUSED const optional& offset, Tensor& out) { - if (out.scalar_type() == executorch::aten::ScalarType::Byte) { + + if (out.scalar_type() == ::executorch::aten::ScalarType::Short && + in.scalar_type() == ::executorch::aten::ScalarType::Short && + weight.scalar_type() == ::executorch::aten::ScalarType::Char) { + ::impl::generic::native::quantized_linear_out( + ctx, + in, + weight, + bias, + in_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + offset, + out); + } + else if (out.scalar_type() == executorch::aten::ScalarType::Byte) { _quantized_linear_asym8u( in, weight, @@ -260,7 +278,23 @@ void quantized_linear_per_tensor_out( int64_t out_zero_point, __ET_UNUSED const optional& offset, Tensor& out) { - if (out.scalar_type() == executorch::aten::ScalarType::Byte) { + if (out.scalar_type() == ::executorch::aten::ScalarType::Short && + in.scalar_type() == ::executorch::aten::ScalarType::Short && + weight.scalar_type() == ::executorch::aten::ScalarType::Char) { + ::impl::generic::native::quantized_linear_per_tensor_out( + ctx, + in, + weight, + bias, + in_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + offset, + out); + } + else if (out.scalar_type() == executorch::aten::ScalarType::Byte) { _quantized_linear_per_tensor_asym8u( in, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp b/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp index 90fe483660b..5b615c41386 100644 --- a/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp @@ -8,6 +8,7 @@ #include #include +#include #include using executorch::aten::ScalarType; @@ -192,8 +193,20 @@ void quantized_matmul_out( size_t leading_dim = X.size(X.dim() - 2); size_t out_dim = Y.size(Y.dim() - 1 - transposed); size_t in_dim = X.size(X.dim() - 1); - - if (out.scalar_type() == exec_aten::ScalarType::Byte) { + if (out.scalar_type() == 
exec_aten::ScalarType::Short) { + ::impl::generic::native::quantized_matmul_out( + ctx, + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + out); + } else if (out.scalar_type() == exec_aten::ScalarType::Byte) { _typed_quantized_matmul( ctx, X, diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.h b/backends/cadence/hifi/operators/op_quantized_matmul_out.h new file mode 100644 index 00000000000..c53a07b58aa --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_matmul_out.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include "executorch/runtime/core/exec_aten/exec_aten.h" +#include "executorch/runtime/kernel/kernel_runtime_context.h" + +namespace impl { +namespace HiFi { +namespace native { + +::executorch::aten::Tensor& quantized_matmul_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& X, + int64_t X_zero_point, + const ::executorch::aten::Tensor& Y, + int64_t Y_zero_point, + const ::executorch::aten::optional<::executorch::aten::Tensor>& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + ::executorch::aten::Tensor& out); + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index a25dfd1bcbc..9ff7f060277 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -2,7 +2,7 @@ load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -def define_operator(name: str, deps: list[str] | None = None) -> None: +def define_operator(name: 
str, deps: list[str] | None = None, exported_headers: list[str] | None = None) -> None: op_name = "op_{}".format(name) # Deps used by all operators. @@ -21,6 +21,8 @@ def define_operator(name: str, deps: list[str] | None = None) -> None: ] if deps == None: deps = [] + if exported_headers == None: + exported_headers = ["operators.h"] runtime.cxx_library( name = op_name, @@ -32,7 +34,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None: ], compatible_with = ["ovr_config//cpu:xtensa"], deps = deps + common_deps, - exported_headers = ["operators.h"], + exported_headers = exported_headers, ) OPERATORS = [ @@ -65,7 +67,6 @@ OPERATORS = [ "ne", "permute_copy", "pow", - "quantized_conv2d_nchw_out", "quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out", @@ -74,7 +75,6 @@ OPERATORS = [ "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv2d_nhwc_out", "quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out", @@ -87,10 +87,8 @@ OPERATORS = [ "quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor_out", "quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out", "quantized_layer_norm", - "quantized_linear_out", "quantized_linear_asym8sxasym8s_asym8s_per_tensor_out", "quantized_linear_asym8uxasym8u_asym8u_per_tensor_out", - "quantized_matmul_out", "quantized_matmul_asym8sxasym8s_asym8s_out", "quantized_matmul_asym8uxasym8u_asym8u_out", "quantized_relu_out", @@ -122,3 +120,14 @@ def define_common_targets(): # Define build targets for all operators registered in the tables above. 
for op in OPERATORS: define_operator(op) + + # quantized_linear_out and quantized_linear_per_tensor_out need an additional dependency for int16 support + define_operator("quantized_linear_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_linear"]) + define_operator("quantized_linear_per_tensor_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_linear"]) + + # quantized_conv2d_nchw_out and quantized_conv2d_nhwc_out need an additional dependency for int16 support + define_operator("quantized_conv2d_nchw_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"]) + define_operator("quantized_conv2d_nhwc_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"]) + + # quantized_matmul_out needs an additional dependency for int16 support + define_operator("quantized_matmul_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_matmul"], exported_headers=["op_quantized_matmul_out.h"]) diff --git a/backends/cadence/hifi/operators/tests/test_op_quantized_conv2d_out.cpp b/backends/cadence/hifi/operators/tests/test_op_quantized_conv2d_out.cpp new file mode 100644 index 00000000000..2c963f9d1a6 --- /dev/null +++ b/backends/cadence/hifi/operators/tests/test_op_quantized_conv2d_out.cpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace impl { +namespace HiFi { +namespace native { +namespace { + +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::aten::TensorImpl; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::runtime::runtime_init; +using ::executorch::runtime::testing::TensorFactory; + +class HiFiQuantizedConv2dTest : public OperatorTest { + public: + protected: + void quantized_conv2d_nchw_out( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + ::executorch::aten::IntArrayRef stride, + ::executorch::aten::IntArrayRef padding, + ::executorch::aten::IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + return ::impl::HiFi::native::quantized_conv2d_nchw_out( + context_, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + output); + } + + void quantized_conv2d_nhwc_out( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + ::executorch::aten::IntArrayRef stride, + ::executorch::aten::IntArrayRef padding, + ::executorch::aten::IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + return ::impl::HiFi::native::quantized_conv2d_nhwc_out( + context_, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + 
output_scale, + output_zero_point, + out_multiplier, + out_shift, + output); + } +}; + +// Test quantized_conv2d_nchw_out with int16 activations and int8 weights +TEST_F(HiFiQuantizedConv2dTest, QuantizedConv2dNchwInt16Test) { + TensorFactory tf_int16; + TensorFactory tf_int32; + TensorFactory tf_int8; + TensorFactory tf_float; + + // Simple 2D case: input [1, 8, 20, 28] with kernel [16, 8, 3, 5] + // Using simple values for testing + Tensor input = tf_int16.ones({1, 8, 20, 28}); + Tensor weight = tf_int8.ones({16, 8, 3, 5}); + Tensor bias = tf_int32.zeros({16}); + + // Calculate output dimensions: (20-3)/1+1=18, (28-5)/1+1=24 + Tensor output = tf_int16.zeros({1, 16, 18, 24}); + + int64_t in_zero_point = 0; + Tensor weight_zero_point = tf_int32.make({1}, {0}); + Tensor bias_scale = tf_float.make({1}, {1.0f}); + double output_scale = 1.0; + int64_t output_zero_point = 0; + Tensor out_multiplier = tf_int32.make({1}, {1073741824}); // 0.5 * 2^31 + Tensor out_shift = tf_int32.make({1}, {0}); + + std::array stride_arr = {1, 1}; + std::array padding_arr = {0, 0}; + std::array dilation_arr = {1, 1}; + + ::executorch::aten::ArrayRef stride(stride_arr.data(), 2); + ::executorch::aten::ArrayRef padding(padding_arr.data(), 2); + ::executorch::aten::ArrayRef dilation(dilation_arr.data(), 2); + + quantized_conv2d_nchw_out( + input, + weight, + bias, + stride, + padding, + dilation, + 1, // groups + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + output); + + // Verify the output is correct + // With all ones input and weights, and kernel size 3x5=15 * 8 channels = 120 + // The assertion below expects the unscaled sum (bias_scale and output_scale + // are 1.0; out_multiplier is apparently not applied). Expected value: 120 + Tensor expected = tf_int16.full({1, 16, 18, 24}, 120); + EXPECT_TENSOR_EQ(output, expected); +} + +// Test quantized_conv2d_nhwc_out with int16 activations and int8 weights +TEST_F(HiFiQuantizedConv2dTest, 
QuantizedConv2dNhwcInt16Test) { + TensorFactory tf_int16; + TensorFactory tf_int32; + TensorFactory tf_int8; + TensorFactory tf_float; + + // Simple 2D case in NHWC format: input [1, 20, 28, 8] with kernel [16, 3, 5, + // 8] + Tensor input = tf_int16.ones({1, 20, 28, 8}); + Tensor weight = tf_int8.ones({16, 3, 5, 8}); + Tensor bias = tf_int32.zeros({16}); + + // Calculate output dimensions: (20-3)/1+1=18, (28-5)/1+1=24 + Tensor output = tf_int16.zeros({1, 18, 24, 16}); + + int64_t in_zero_point = 0; + Tensor weight_zero_point = tf_int32.make({1}, {0}); + Tensor bias_scale = tf_float.make({1}, {1.0f}); + double output_scale = 1.0; + int64_t output_zero_point = 0; + Tensor out_multiplier = tf_int32.make({1}, {1073741824}); // 0.5 * 2^31 + Tensor out_shift = tf_int32.make({1}, {0}); + + std::array stride_arr = {1, 1}; + std::array padding_arr = {0, 0}; + std::array dilation_arr = {1, 1}; + + ::executorch::aten::ArrayRef stride(stride_arr.data(), 2); + ::executorch::aten::ArrayRef padding(padding_arr.data(), 2); + ::executorch::aten::ArrayRef dilation(dilation_arr.data(), 2); + + quantized_conv2d_nhwc_out( + input, + weight, + bias, + stride, + padding, + dilation, + 1, // groups + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + output); + + // Verify the output is correct + // With all ones input and weights, and kernel size 3x5=15 * 8 channels = 120 + // The assertion below expects the unscaled sum (bias_scale and output_scale + // are 1.0; out_multiplier is apparently not applied). Expected value: 120 + Tensor expected = tf_int16.full({1, 18, 24, 16}, 120); + EXPECT_TENSOR_EQ(output, expected); +} + +} // namespace +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/tests/test_op_quantized_linear_out.cpp b/backends/cadence/hifi/operators/tests/test_op_quantized_linear_out.cpp new file mode 100644 index 00000000000..aabad72d9ec --- /dev/null +++ 
b/backends/cadence/hifi/operators/tests/test_op_quantized_linear_out.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. +*/ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace impl { +namespace HiFi { +namespace native { +namespace { + +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::aten::TensorImpl; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::runtime::runtime_init; +using ::executorch::runtime::testing::TensorFactory; +using std::optional; +using std::string_view; + +class HiFiQuantizedLinearTest : public OperatorTest { + public: + protected: + void quantized_linear_out( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + int64_t out_zero_point, + const optional& offset, + Tensor& output) { + return ::impl::HiFi::native::quantized_linear_out( + context_, + input, + weight, + bias, + in_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + offset, + output); + } + + void quantized_linear_per_tensor_out( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + int64_t in_zero_point, + int64_t weight_zero_point, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + const optional& offset, + Tensor& output) { + return ::impl::HiFi::native::quantized_linear_per_tensor_out( + context_, + input, + weight, + bias, + in_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + offset, + output); + } +}; + +// Test quantized_linear_out with int16 activations 
and int8 weights +TEST_F(HiFiQuantizedLinearTest, QuantizedLinearInt16Test) { + TensorFactory tf_int16; + TensorFactory tf_int32; + TensorFactory tf_int8; + + // Simple 2D case: input [2, 3] x weight [4, 3] = output [2, 4] + // Values captured from e2e test with + // CadenceWith16BitLinearActivationsQuantizer + Tensor input = + tf_int16.make({2, 3}, {-28170, -26389, -32768, -31474, -32266, -29076}); + Tensor weight = tf_int8.make( + {4, 3}, {1, 87, -128, -114, -59, 44, -1, 127, -12, 44, -46, -29}); + Tensor bias = tf_int32.zeros({4}); + Tensor output = tf_int16.zeros({2, 4}); + + int64_t in_zero_point = -29822; + Tensor weight_zero_point = tf_int32.make({1}, {2}); + Tensor out_multiplier = tf_int32.make({1}, {2011373824}); + Tensor out_shift = tf_int32.make({1}, {-8}); + int64_t out_zero_point = -30847; + quantized_linear_out( + input, + weight, + bias, + in_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + std::nullopt, + output); + // Expected output from e2e test + Tensor expected_output = tf_int16.make( + {2, 4}, {-28384, -32767, -29144, -30862, -31956, -29486, -31985, -30756}); + EXPECT_TENSOR_CLOSE(output, expected_output); +} + +} // namespace +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp b/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp new file mode 100644 index 00000000000..c2d3815ba19 --- /dev/null +++ b/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace impl { +namespace HiFi { +namespace native { +namespace { + +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::aten::TensorImpl; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::runtime::runtime_init; +using ::executorch::runtime::testing::TensorFactory; + +class HiFiQuantizedMatmulTest : public OperatorTest { + public: + protected: + Tensor& quantized_matmul_out( + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const std::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& output) { + return impl::HiFi::native::quantized_matmul_out( + context_, + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + output); + } +}; + +// Test quantized_matmul_out with int16 activations and int8 weights +TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16Test) { + TensorFactory tf_int16; + TensorFactory tf_int32; + TensorFactory tf_int8; + + // Simple 2D case: X [64, 33] x Y [33, 128] = output [64, 128] + // Using simple values for testing + Tensor X = tf_int16.ones({64, 33}); + Tensor Y = tf_int8.ones({33, 128}); + // Bias is passed to the op, but the expected output below excludes it + Tensor bias = tf_int32.full({128}, -30); + Tensor output = tf_int16.zeros({64, 128}); + + int64_t X_zero_point = 0; + int64_t Y_zero_point = 0; + int64_t out_multiplier = 1073741824; // 0.5 * 2^31 + int64_t out_shift = 0; + int64_t out_zero_point = 0; + + quantized_matmul_out( + X, + X_zero_point, + Y, + Y_zero_point, + bias, // pass bias tensor + out_multiplier, + out_shift, + out_zero_point, + false, // transposed + output); + + // Verify the output is correct + // With all ones input and weights, inner dimension is 33 + // Matmul result: 33, with 
out_multiplier = 0.5 * 2^31 (scales by 0.5) + // Expected value: 33 * 0.5 = 16.5 ≈ 16 + Tensor expected = tf_int16.full({64, 128}, 16); + EXPECT_TENSOR_EQ(output, expected); +} + +// Test quantized_matmul_out with transposed Y (int16 activations and int8 +// weights) +TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16TransposedTest) { + TensorFactory tf_int16; + TensorFactory tf_int32; + TensorFactory tf_int8; + + // Transposed case: X [64, 33] x Y^T [128, 33] = output [64, 128] + Tensor X = tf_int16.ones({64, 33}); + Tensor Y = tf_int8.ones({128, 33}); // Transposed + // Bias is passed to the op, but the expected output below excludes it + Tensor bias = tf_int32.full({128}, -30); + Tensor output = tf_int16.zeros({64, 128}); + + int64_t X_zero_point = 0; + int64_t Y_zero_point = 0; + int64_t out_multiplier = 1073741824; // 0.5 * 2^31 + int64_t out_shift = 0; + int64_t out_zero_point = 0; + + quantized_matmul_out( + X, + X_zero_point, + Y, + Y_zero_point, + bias, // pass bias tensor + out_multiplier, + out_shift, + out_zero_point, + true, // transposed + output); + + // Verify the output is correct + // With all ones input and weights, inner dimension is 33 + // Matmul result: 33, with out_multiplier = 0.5 * 2^31 (scales by 0.5) + // Expected value: 33 * 0.5 = 16.5 ≈ 16 + Tensor expected = tf_int16.full({64, 128}, 16); + EXPECT_TENSOR_EQ(output, expected); +} + +} // namespace +} // namespace native +} // namespace HiFi +} // namespace impl