[QNN EP] Fix 16x16 MatMul translation #24846

Open · wants to merge 2 commits into main

133 changes: 77 additions & 56 deletions onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
@@ -49,50 +49,6 @@ class MatMulOpBuilder : public BaseOpBuilder {
};

namespace {

// Inserts a QNN Convert operator to convert from one quantization type (e.g., uint16) to another (e.g., uint8).
Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
const std::string& convert_input_name,
const std::string& convert_output_name,
Qnn_DataType_t input_qnn_data_type,
Qnn_DataType_t output_qnn_data_type,
int32_t input_offset,
float input_scale,
const std::vector<uint32_t>& output_shape,
bool do_op_validation) {
// Assume input is already handled.
float qmin = 0.0f;
float qmax = 255.0f;
ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax));
double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin);
double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax);
float scale = 0.0f;
int32_t offset = 0;
ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast<float>(value_min),
static_cast<float>(value_max),
output_qnn_data_type,
scale,
offset));

std::vector<uint32_t> output_shape_copy = output_shape;
QnnTensorWrapper convert_output_tensorwrapper(convert_output_name,
QNN_TENSOR_TYPE_NATIVE,
output_qnn_data_type,
QnnQuantParamsWrapper(scale, offset),
std::move(output_shape_copy));
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor.");

ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name,
QNN_OP_PACKAGE_NAME_QTI_AISW,
"Convert",
{convert_input_name},
{convert_output_name},
{},
do_op_validation),
"Failed to add node.");
return Status::OK();
}

inline bool IsQuant16bit(Qnn_DataType_t qnn_data_type) {
return qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16 || qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16;
}
@@ -253,7 +209,8 @@ Status MatMulOpBuilder::ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wra
}
input_names.emplace_back(input_1_name);

// Workaround that inserts a QNN Convert op before input[1] (converts from quantized uint16 to quantized uint8)
// Workaround that inserts a QNN Convert op before input[1] (converts from quantized uint16 to quantized uint8
// OR converts from asymmetric quantized uint16 to symmetric quantized int16)
// to avoid a QNN validation failure.
//
// QNN graph WITHOUT workaround (fails validation):
@@ -262,12 +219,18 @@ Status MatMulOpBuilder::ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wra
// |
// input_1_uint16 -----+
//
// QNN graph WITH workaround (passes validation):
// For Dynamic weights, QNN graph WITH workaround (passes validation):
// input_0_uint16 ----------------------> MatMul ---> output_uint16
// ^
// |
// input_1_uint16 --> Convert(to uint8) --+
if (!input_info_0.is_initializer && !input_info_1.is_initializer &&
//
// For Static weights, QNN graph WITH workaround (passes validation):
// input_0_uint16 ------------------------------> MatMul ---> output_uint16
// ^
// |
// input_1_uint16 --> Convert(to symmetric int16) --+
if (!input_info_0.is_initializer &&
input_info_0.qnn_data_type == input_info_1.qnn_data_type &&
input_info_0.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
ORT_RETURN_IF_NOT(input_info_1.quant_param.IsPerTensor(),
@@ -282,15 +245,29 @@ Status MatMulOpBuilder::ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wra
if (reshape_input_1) {
input_1_shape = {input_info_1.shape[0], 1};
}
ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper,
convert_input_name,
convert_output_name,
input_info_1.qnn_data_type,
QNN_DATATYPE_UFIXED_POINT_8,
quant_param.scaleOffsetEncoding.offset,
quant_param.scaleOffsetEncoding.scale,
input_1_shape,
do_op_validation));
if (!input_info_1.is_initializer) {
ORT_RETURN_IF_ERROR(utils::InsertConvertOp(qnn_model_wrapper,
convert_input_name,
convert_output_name,
input_info_1.qnn_data_type,
QNN_DATATYPE_UFIXED_POINT_8,
quant_param.scaleOffsetEncoding.offset,
quant_param.scaleOffsetEncoding.scale,
input_1_shape,
false, // asymmetric
do_op_validation));
} else {
ORT_RETURN_IF_ERROR(utils::InsertConvertOp(qnn_model_wrapper,
convert_input_name,
convert_output_name,
input_info_1.qnn_data_type,
QNN_DATATYPE_SFIXED_POINT_16,
quant_param.scaleOffsetEncoding.offset,
quant_param.scaleOffsetEncoding.scale,
input_1_shape,
true, // symmetric
do_op_validation));
}
input_names.push_back(convert_output_name);
}
return Status::OK();
@@ -355,6 +332,50 @@ Status MatMulOpBuilder::ProcessInputsForQnnFullyConnected(QnnModelWrapper& qnn_m
qnn_model_wrapper.IsGraphInput(org_input_1_name), false));
}
input_names.emplace_back(input_1_name);

// Workaround that inserts a QNN Convert op before input[1] (converts from quantized uint16 to signed symmetric int16)
// to avoid a QNN validation failure.
//
// QNN graph WITHOUT workaround (fails validation):
// input_0_uint16 ---> FC ---> output_uint16
// ^
// |
// input_1_uint16 -----+
//
// QNN graph WITH workaround (passes validation):
// input_0_uint16 ----------------------> FC ---> output_uint16
// ^
// |
// input_1_uint16 --> Convert(to int16) --+

std::string weight_input_name = input_names.back();
const auto& weight_tensor_wrapper = qnn_model_wrapper.GetQnnTensorWrapper(weight_input_name);

if (weight_tensor_wrapper.GetTensorDataType() == QNN_DATATYPE_UFIXED_POINT_16) {
const auto& quant_param_wrapper = weight_tensor_wrapper.GetQnnQuantParams();
const Qnn_QuantizeParams_t& quant_param = quant_param_wrapper.Get();
const auto& transformed_input1_shape = weight_tensor_wrapper.GetTensorDims();

ORT_RETURN_IF_NOT(quant_param_wrapper.IsPerTensor(),
"FC's INT16 weight inputs only support INT16 per-tensor quantization");

// Pop the FC weight and insert a Convert op after it.
input_names.pop_back();
const std::string& conv_output_name = node_unit.Outputs()[0].node_arg.Name();
std::string convert_output_name = weight_input_name + "_convert_" + conv_output_name;

ORT_RETURN_IF_ERROR(utils::InsertConvertOp(qnn_model_wrapper,
weight_input_name,
convert_output_name,
QNN_DATATYPE_UFIXED_POINT_16,
QNN_DATATYPE_SFIXED_POINT_16,
quant_param.scaleOffsetEncoding.offset,
quant_param.scaleOffsetEncoding.scale,
transformed_input1_shape,
true, // Symmetric
do_op_validation));
input_names.push_back(convert_output_name);
}
return Status::OK();
}

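For context, the static-weight path hands the QNN Convert op a symmetric int16 encoding of input[1]'s value range. The following is a minimal sketch of how an asymmetric uint16 (scale, zero point) pair could be remapped to symmetric int16 parameters; the helper name and the affine convention real = scale * (q - zero_point) are assumptions for illustration, not necessarily how utils::InsertConvertOp derives its parameters.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: derive symmetric int16 quantization parameters that cover
// the same real-valued range as an asymmetric uint16 (scale, zero_point) pair.
void AsymmetricU16ToSymmetricS16(float u16_scale, int32_t u16_zero_point,
                                 float& s16_scale, int32_t& s16_offset) {
  // Real range representable by the uint16 encoding (q in [0, 65535]).
  const double range_min = static_cast<double>(u16_scale) * (0.0 - u16_zero_point);
  const double range_max = static_cast<double>(u16_scale) * (65535.0 - u16_zero_point);

  // Symmetric encoding: the zero point is fixed at 0, and the scale maps the
  // larger absolute bound onto the int16 extreme (32767).
  const double abs_max = std::max(std::fabs(range_min), std::fabs(range_max));
  s16_scale = static_cast<float>(abs_max / 32767.0);
  s16_offset = 0;
}
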
54 changes: 54 additions & 0 deletions onnxruntime/test/providers/qnn/matmul_test.cpp
@@ -340,6 +340,60 @@ TEST_F(QnnHTPBackendTests, MatMulOp_QDQ_Regression_uint16_dynamic_inputs) {
}
}

#ifndef __linux__
// Tests MatMul with two uint16 (quantized) inputs where the weight (input[1]) is a static initializer.
// This exercises a workaround in QNN EP that inserts a QNN Convert op before input[1] (converts from uint16 to sint16).
// This workaround prevents a validation error for this specific MatMul configuration.
// The specific shapes and input ranges (quant params) come from a customer model.
TEST_F(QnnHTPBackendTests, MatMulOp_QDQ_Regression_uint16_static_weight) {
ProviderOptions provider_options;
provider_options["backend_type"] = "htp";
provider_options["offload_graph_io_quantization"] = "0";

// Test with rank 4 inputs
{
std::vector<int64_t> shape_0 = {1, 12, 512, 96};
TestInputDef<float> input0_def(
{1, 12, 512, 96}, false,
GetFloatDataInRange(-5.087f, 4.992f,
static_cast<size_t>(std::accumulate(shape_0.begin(), shape_0.end(), static_cast<int64_t>(1),
std::multiplies<int64_t>()))));
std::vector<int64_t> shape_1 = {1, 12, 96, 512};
TestInputDef<float> input1_def(
shape_1, true,
GetFloatDataInRange(-6.772f, 7.258f,
static_cast<size_t>(std::accumulate(shape_1.begin(), shape_1.end(), static_cast<int64_t>(1),
std::multiplies<int64_t>()))));

TestQDQModelAccuracy(
BuildMatMulOpTestCase(input0_def, input1_def),
BuildMatMulOpQDQTestCase<uint16_t, uint16_t, uint16_t>(input0_def, input1_def, false),
provider_options, 21, ExpectedEPNodeAssignment::All, QDQTolerance());
}

// Test with input[1] as rank 1
{
std::vector<int64_t> shape_0 = {1, 12, 512, 96};
TestInputDef<float> input0_def(
{1, 12, 512, 96}, false,
GetFloatDataInRange(-5.087f, 4.992f,
static_cast<size_t>(std::accumulate(shape_0.begin(), shape_0.end(), static_cast<int64_t>(1),
std::multiplies<int64_t>()))));
std::vector<int64_t> shape_1 = {96};
TestInputDef<float> input1_def(
shape_1, true,
GetFloatDataInRange(-6.772f, 7.258f,
static_cast<size_t>(std::accumulate(shape_1.begin(), shape_1.end(), static_cast<int64_t>(1),
std::multiplies<int64_t>()))));

TestQDQModelAccuracy(
BuildMatMulOpTestCase(input0_def, input1_def),
BuildMatMulOpQDQTestCase<uint16_t, uint16_t, uint16_t>(input0_def, input1_def, false),
provider_options, 21, ExpectedEPNodeAssignment::All, QDQTolerance());
}
}
#endif

#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)

} // namespace test
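
If useful, the new regression test can be run in isolation with a gtest filter; the binary name below assumes a standard ONNX Runtime test build:

./onnxruntime_test_all --gtest_filter=QnnHTPBackendTests.MatMulOp_QDQ_Regression_uint16_static_weight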