
Commit 0fca5ef

quic-tirupath (Tirupathi Reddy T) authored and committed

[QNN EP] Fix 16x16 MatMul translation
- QNN's 16x16 FC doesn't support an asymmetric int16 weight.
- QNN's 16x16 MatMul doesn't support an asymmetric int16 weight initializer.
- Insert a Convert op to convert the asymmetric uint16 weight to a symmetric int16 weight.
- Add unit tests to verify the 16x16 MatMul translations.
1 parent 915a999 commit 0fca5ef
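For context on the fix: the inserted Convert op re-expresses the weight's real-valued range under symmetric int16 parameters (offset forced to 0). Below is a minimal standalone sketch of that requantization math, mirroring the helper logic visible in the diff; it is not the ORT implementation, and both the QNN convention used (real = scale * (quantized + offset), with offset being the negated zero point) and the exact symmetric scale formula are assumptions for illustration.

// Standalone sketch (not the ORT helper): derive symmetric int16 quant params
// from asymmetric uint16 params covering the same real-valued range.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

struct QuantParams {
  float scale;
  int32_t offset;  // QNN-style offset: the negated zero point (assumption)
};

QuantParams ToSymmetricInt16(float in_scale, int32_t in_offset) {
  // Dequantize the uint16 endpoints to recover the represented real range,
  // using the assumed QNN convention real = scale * (quantized + offset).
  const double value_min = in_scale * (0.0 + in_offset);
  const double value_max = in_scale * (65535.0 + in_offset);
  // Symmetric int16 pins the offset to 0; choose the scale so the larger
  // magnitude endpoint still fits into [-32767, 32767] (formula assumed).
  const double abs_max = std::max(std::fabs(value_min), std::fabs(value_max));
  return QuantParams{static_cast<float>(abs_max / 32767.0), 0};
}

int main() {
  // Example range taken from the new unit test below: roughly [-6.772, 7.258].
  const float in_scale = (7.258f - (-6.772f)) / 65535.0f;
  const int32_t in_offset = static_cast<int32_t>(std::lround(-6.772f / in_scale));
  const QuantParams sym = ToSymmetricInt16(in_scale, in_offset);
  std::printf("symmetric int16: scale=%g offset=%d\n", sym.scale, sym.offset);
  return 0;
}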

File tree

2 files changed: +129 -56 lines changed

onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
Lines changed: 77 additions & 56 deletions

@@ -49,50 +49,6 @@ class MatMulOpBuilder : public BaseOpBuilder {
 };
 
 namespace {
-
-// Inserts a QNN Convert operator to convert from one quantization type (e.g., uint16) to another (e.g., uint8).
-Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
-                       const std::string& convert_input_name,
-                       const std::string& convert_output_name,
-                       Qnn_DataType_t input_qnn_data_type,
-                       Qnn_DataType_t output_qnn_data_type,
-                       int32_t input_offset,
-                       float input_scale,
-                       const std::vector<uint32_t>& output_shape,
-                       bool do_op_validation) {
-  // Assume input is already handled.
-  float qmin = 0.0f;
-  float qmax = 255.0f;
-  ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax));
-  double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin);
-  double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax);
-  float scale = 0.0f;
-  int32_t offset = 0;
-  ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast<float>(value_min),
-                                                 static_cast<float>(value_max),
-                                                 output_qnn_data_type,
-                                                 scale,
-                                                 offset));
-
-  std::vector<uint32_t> output_shape_copy = output_shape;
-  QnnTensorWrapper convert_output_tensorwrapper(convert_output_name,
-                                                QNN_TENSOR_TYPE_NATIVE,
-                                                output_qnn_data_type,
-                                                QnnQuantParamsWrapper(scale, offset),
-                                                std::move(output_shape_copy));
-  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor.");
-
-  ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name,
-                                                    QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                                    "Convert",
-                                                    {convert_input_name},
-                                                    {convert_output_name},
-                                                    {},
-                                                    do_op_validation),
-                    "Failed to add node.");
-  return Status::OK();
-}
-
 inline bool IsQuant16bit(Qnn_DataType_t qnn_data_type) {
   return qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16 || qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16;
 }
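The deleted local helper resurfaces later in this diff as utils::InsertConvertOp with one extra boolean argument ahead of do_op_validation. The actual utils header is not part of this diff; inferred from the call sites alone, its declaration presumably looks like this (the flag's parameter name is a guess):

// Presumed declaration of the relocated helper (sketch inferred from the call
// sites in this diff, not copied from onnxruntime's qnn utils header).
Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
                       const std::string& convert_input_name,
                       const std::string& convert_output_name,
                       Qnn_DataType_t input_qnn_data_type,
                       Qnn_DataType_t output_qnn_data_type,
                       int32_t input_offset,
                       float input_scale,
                       const std::vector<uint32_t>& output_shape,
                       bool is_symmetric,  // guessed name: true pins the output offset to 0
                       bool do_op_validation);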
@@ -253,7 +209,8 @@ Status MatMulOpBuilder::ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wra
   }
   input_names.emplace_back(input_1_name);
 
-  // Workaround that inserts a QNN Convert op before input[1] (converts from quantized uint16 to quantized uint8)
+  // Workaround that inserts a QNN Convert op before input[1] (converts from quantized uint16 to quantized uint8
+  // OR converts from asymmetric quantized uint16 to symmetric quantized uint16)
   // to avoid a QNN validation failure.
   //
   // QNN graph WITHOUT workaround (fails validation):
@@ -262,12 +219,18 @@ Status MatMulOpBuilder::ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wra
   //                     |
   // input_1_uint16 -----+
   //
-  // QNN graph WITH workaround (passes validation):
+  // For Dynamic weights, QNN graph WITH workaround (passes validation):
   // input_0_uint16 ----------------------> MatMul ---> output_uint16
   //                                           ^
   //                                           |
   // input_1_uint16 --> Convert(to uint8) -----+
-  if (!input_info_0.is_initializer && !input_info_1.is_initializer &&
+  //
+  // For Static weights, QNN graph WITH workaround (passes validation):
+  // input_0_uint16 --------------------------------> MatMul ---> output_uint16
+  //                                                     ^
+  //                                                     |
+  // input_1_uint16 --> Convert(to symmetric int16) -----+
+  if (!input_info_0.is_initializer &&
       input_info_0.qnn_data_type == input_info_1.qnn_data_type &&
       input_info_0.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
     ORT_RETURN_IF_NOT(input_info_1.quant_param.IsPerTensor(),
@@ -282,15 +245,29 @@ Status MatMulOpBuilder::ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wra
     if (reshape_input_1) {
       input_1_shape = {input_info_1.shape[0], 1};
     }
-    ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper,
-                                        convert_input_name,
-                                        convert_output_name,
-                                        input_info_1.qnn_data_type,
-                                        QNN_DATATYPE_UFIXED_POINT_8,
-                                        quant_param.scaleOffsetEncoding.offset,
-                                        quant_param.scaleOffsetEncoding.scale,
-                                        input_1_shape,
-                                        do_op_validation));
+    if (!input_info_1.is_initializer) {
+      ORT_RETURN_IF_ERROR(utils::InsertConvertOp(qnn_model_wrapper,
+                                                 convert_input_name,
+                                                 convert_output_name,
+                                                 input_info_1.qnn_data_type,
+                                                 QNN_DATATYPE_UFIXED_POINT_8,
+                                                 quant_param.scaleOffsetEncoding.offset,
+                                                 quant_param.scaleOffsetEncoding.scale,
+                                                 input_1_shape,
+                                                 false,  // asymmetric
+                                                 do_op_validation));
+    } else {
+      ORT_RETURN_IF_ERROR(utils::InsertConvertOp(qnn_model_wrapper,
+                                                 convert_input_name,
+                                                 convert_output_name,
+                                                 input_info_1.qnn_data_type,
+                                                 QNN_DATATYPE_SFIXED_POINT_16,
+                                                 quant_param.scaleOffsetEncoding.offset,
+                                                 quant_param.scaleOffsetEncoding.scale,
+                                                 input_1_shape,
+                                                 true,  // symmetric
+                                                 do_op_validation));
+    }
     input_names.push_back(convert_output_name);
   }
   return Status::OK();
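Design note on the two branches above: a dynamic (non-initializer) input[1] keeps the pre-existing uint16-to-uint8 Convert, while a static (initializer) weight is instead converted to symmetric int16 with the new flag set. This matches the commit message: QNN's 16x16 MatMul accepts a symmetric int16 weight initializer but not an asymmetric one.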
@@ -355,6 +332,50 @@ Status MatMulOpBuilder::ProcessInputsForQnnFullyConnected(QnnModelWrapper& qnn_m
                                      qnn_model_wrapper.IsGraphInput(org_input_1_name), false));
   }
   input_names.emplace_back(input_1_name);
+
+  // Workaround that inserts a QNN Convert op before input[1] (converts from quantized uint16 to signed symmetric int16)
+  // to avoid a QNN validation failure.
+  //
+  // QNN graph WITHOUT workaround (fails validation):
+  // input_0_uint16 ---> FC ---> output_uint16
+  //                      ^
+  //                      |
+  // input_1_uint16 -----+
+  //
+  // QNN graph WITH workaround (passes validation):
+  // input_0_uint16 ----------------------> FC ---> output_uint16
+  //                                           ^
+  //                                           |
+  // input_1_uint16 --> Convert(to int16) -----+
+
+  std::string weight_input_name = input_names.back();
+  const auto& weight_tensor_wrapper = qnn_model_wrapper.GetQnnTensorWrapper(weight_input_name);
+
+  if (weight_tensor_wrapper.GetTensorDataType() == QNN_DATATYPE_UFIXED_POINT_16) {
+    const auto& quant_param_wrapper = weight_tensor_wrapper.GetQnnQuantParams();
+    const Qnn_QuantizeParams_t& quant_param = quant_param_wrapper.Get();
+    const auto& transformed_input1_shape = weight_tensor_wrapper.GetTensorDims();
+
+    ORT_RETURN_IF_NOT(quant_param_wrapper.IsPerTensor(),
+                      "FC's INT16 weight inputs only support INT16 per-tensor quantization");
+
+    // Pop the FC weight. Insert the Convert op after the weight.
+    input_names.pop_back();
+    const std::string& conv_output_name = node_unit.Outputs()[0].node_arg.Name();
+    std::string convert_output_name = weight_input_name + "_convert_" + conv_output_name;
+
+    ORT_RETURN_IF_ERROR(utils::InsertConvertOp(qnn_model_wrapper,
+                                               weight_input_name,
+                                               convert_output_name,
+                                               QNN_DATATYPE_UFIXED_POINT_16,
+                                               QNN_DATATYPE_SFIXED_POINT_16,
+                                               quant_param.scaleOffsetEncoding.offset,
+                                               quant_param.scaleOffsetEncoding.scale,
+                                               transformed_input1_shape,
+                                               true,  // symmetric
+                                               do_op_validation));
+    input_names.push_back(convert_output_name);
+  }
   return Status::OK();
 }
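Note on the new FC path: the Convert output tensor name concatenates the weight name, "_convert_", and the node's first output name, so the converted weight stays unique per consuming node even when the same initializer feeds several FCs. With an illustrative weight "W" feeding a node whose output is "Y" (hypothetical names), the inserted tensor would be named "W_convert_Y".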

onnxruntime/test/providers/qnn/matmul_test.cpp
Lines changed: 52 additions & 0 deletions

@@ -340,6 +340,58 @@ TEST_F(QnnHTPBackendTests, MatMulOp_QDQ_Regression_uint16_dynamic_inputs) {
   }
 }
 
+// Tests MatMul with two uint16 (quantized) inputs with weight as static.
+// This exercises a workaround in QNN EP that inserts a QNN Convert op before input[1] (converts from uint16 to sint16).
+// This workaround prevents a validation error for this specific MatMul configuration.
+// Got specific shapes and input ranges (quant params) from customer model.
+TEST_F(QnnHTPBackendTests, MatMulOp_QDQ_Regression_uint16_static_weight) {
+  ProviderOptions provider_options;
+  provider_options["backend_type"] = "htp";
+  provider_options["offload_graph_io_quantization"] = "0";
+
+  // Test with rank 4 inputs
+  {
+    std::vector<int64_t> shape_0 = {1, 12, 512, 96};
+    TestInputDef<float> input0_def(
+        {1, 12, 512, 96}, false,
+        GetFloatDataInRange(-5.087f, 4.992f,
+                            static_cast<size_t>(std::accumulate(shape_0.begin(), shape_0.end(), static_cast<int64_t>(1),
+                                                                std::multiplies<int64_t>()))));
+    std::vector<int64_t> shape_1 = {1, 12, 96, 512};
+    TestInputDef<float> input1_def(
+        shape_1, true,
+        GetFloatDataInRange(-6.772f, 7.258f,
+                            static_cast<size_t>(std::accumulate(shape_1.begin(), shape_1.end(), static_cast<int64_t>(1),
+                                                                std::multiplies<int64_t>()))));
+
+    TestQDQModelAccuracy(
+        BuildMatMulOpTestCase(input0_def, input1_def),
+        BuildMatMulOpQDQTestCase<uint16_t, uint16_t, uint16_t>(input0_def, input1_def, false),
+        provider_options, 21, ExpectedEPNodeAssignment::All, QDQTolerance());
+  }
+
+  // Test with input[1] as rank 1
+  {
+    std::vector<int64_t> shape_0 = {1, 12, 512, 96};
+    TestInputDef<float> input0_def(
+        {1, 12, 512, 96}, false,
+        GetFloatDataInRange(-5.087f, 4.992f,
+                            static_cast<size_t>(std::accumulate(shape_0.begin(), shape_0.end(), static_cast<int64_t>(1),
+                                                                std::multiplies<int64_t>()))));
+    std::vector<int64_t> shape_1 = {96};
+    TestInputDef<float> input1_def(
+        shape_1, true,
+        GetFloatDataInRange(-6.772f, 7.258f,
+                            static_cast<size_t>(std::accumulate(shape_1.begin(), shape_1.end(), static_cast<int64_t>(1),
+                                                                std::multiplies<int64_t>()))));
+
+    TestQDQModelAccuracy(
+        BuildMatMulOpTestCase(input0_def, input1_def),
+        BuildMatMulOpQDQTestCase<uint16_t, uint16_t, uint16_t>(input0_def, input1_def, false),
+        provider_options, 21, ExpectedEPNodeAssignment::All, QDQTolerance());
+  }
+}
+
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 
 }  // namespace test
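Aside: the std::accumulate expressions in both test blocks just compute the number of elements as the product of the shape's dimensions. A hypothetical helper (not part of the test suite) showing the same computation:

// Hypothetical helper equivalent to the std::accumulate expressions above:
// number of elements = product of the shape's dimensions.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

size_t NumElements(const std::vector<int64_t>& shape) {
  return static_cast<size_t>(std::accumulate(shape.begin(), shape.end(),
                                             static_cast<int64_t>(1),
                                             std::multiplies<int64_t>()));
}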
