Skip to content

Commit d6a8cba

Browse files
YUNQIUGUO and rachguo authored
[NNAPI QDQ] Add nnapi qdq softmax op support (#10591)
* wip * save * update pr comments * update Co-authored-by: rachguo <[email protected]>
1 parent 4d3cd2f commit d6a8cba

File tree

7 files changed

+152
-24
lines changed

7 files changed

+152
-24
lines changed

onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ static const OpVersionsAndSelector::OpVersionsMap GetMiscOpVersionsMap() { retur
3434
{"Resize", {}}}; }
3535

3636
// Op-type -> supported-ONNX-versions map for the unary QDQ selector.
static const OpVersionsAndSelector::OpVersionsMap GetUnaryOpVersionsMap() {
  return {{"AveragePool", {}},
          {"Softmax", {}},
          {"LeakyRelu", {}}};
}
3839
// Op-type -> supported-ONNX-versions map for the binary QDQ selector.
static const OpVersionsAndSelector::OpVersionsMap GetBinaryOpVersionsMap() {
  return {{"Add", {}},
          {"Mul", {}}};
}

onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) {
8282
return QuantizedOpType::QDQTranspose;
8383
else if (op_type == "Reshape")
8484
return QuantizedOpType::QDQReshape;
85+
else if (op_type == "Softmax")
86+
return QuantizedOpType::QDQSoftmax;
8587
} else {
8688
// throw?
8789
}

onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ enum class QuantizedOpType : uint8_t {
9292
QDQMul,
9393
QDQTranspose,
9494
QDQReshape,
95+
QDQSoftmax,
9596
// TODO, add other QDQ NodeUnit types
9697
};
9798

onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1478,10 +1478,25 @@ Status CastOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
14781478
#pragma region op_softmax
14791479

14801480
class SoftMaxOpBuilder : public BaseOpBuilder {
1481+
public:
1482+
void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
1483+
14811484
private:
14821485
Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
1486+
bool IsQuantizedOp(const NodeUnit& node_unit) const override;
14831487
};
14841488

1489+
bool SoftMaxOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) const {
1490+
return GetQuantizedOpType(node_unit) == QuantizedOpType::QDQSoftmax;
1491+
}
1492+
1493+
void SoftMaxOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
  // Only the QDQ form carries scale/zero-point initializers to skip.
  if (!IsQuantizedOp(node_unit))
    return;

  // x_scale, x_zp
  AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Inputs()[0].quant_param);
  // y_scale, y_zp
  AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Outputs()[0].quant_param);
}
1499+
14851500
Status SoftMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
14861501
auto& shaper(model_builder.GetShaper());
14871502
const auto& operand_indices(model_builder.GetOperandIndices());
@@ -1499,6 +1514,21 @@ Status SoftMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons
14991514

15001515
int32_t axis = helper.Get("axis", 1);
15011516

1517+
// Check if the quantization scale and ZP are correct
1518+
float x_scale = 0.0f;
1519+
int32_t x_zero_point = 0;
1520+
float y_scale = 0.0f;
1521+
int32_t y_zero_point = 0;
1522+
if (IsQuantizedOp(node_unit)) {
1523+
ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint(
1524+
model_builder.GetInitializerTensors(), node_unit.Inputs()[0], node_unit.ModelPath(),
1525+
x_scale, x_zero_point));
1526+
1527+
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input, x_scale, x_zero_point));
1528+
1529+
y_scale = 1.f / 256;
1530+
}
1531+
15021532
const auto& output = node_unit.Outputs()[0].node_arg.Name();
15031533
float beta = 1.f;
15041534
std::vector<uint32_t> input_indices;
@@ -1511,7 +1541,7 @@ Status SoftMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons
15111541
}
15121542

15131543
ORT_RETURN_IF_ERROR(shaper.Identity(input, output));
1514-
const OperandType output_operand_type(operand_types.at(input).type, shaper[output]);
1544+
const OperandType output_operand_type(operand_types.at(input).type, shaper[output], y_scale, y_zero_point);
15151545
ORT_RETURN_IF_ERROR(model_builder.AddOperation(ANEURALNETWORKS_SOFTMAX, input_indices,
15161546
{output}, {output_operand_type}));
15171547
return Status::OK();

onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc

Lines changed: 77 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,42 @@ static bool IsQuantizedIOSupported(const InitializedTensorSet& initializers, con
272272
return true;
273273
}
274274

275+
// Some Quantized NNAPI operations have required output scale and zero point
276+
// e.g. Softmax (uint8) requires output scale be 1.f/256 and zp be 0
277+
// This helper function checks if the given io_def has required scale and zp
278+
static bool HasRequiredScaleAndZeroPoint(const InitializedTensorSet& initializers,
279+
const std::string& op_desc,
280+
const NodeUnitIODef& io_def,
281+
const Path& path,
282+
float required_scale, int32_t required_zp) {
283+
float scale = 0.0f;
284+
int32_t zp = 0;
285+
auto status = GetQuantizationScaleAndZeroPoint(initializers, io_def, path,
286+
scale, zp);
287+
if (!status.IsOK()) {
288+
LOGS_DEFAULT(ERROR) << op_desc
289+
<< " GetQuantizationScaleAndZeroPoint failed, message: "
290+
<< status.ErrorMessage();
291+
return false;
292+
}
293+
294+
if (scale != required_scale) {
295+
LOGS_DEFAULT(VERBOSE) << op_desc
296+
<< " scale can only be [" << required_scale
297+
<< "], actual scale: " << scale;
298+
return false;
299+
}
300+
301+
if (zp != required_zp) {
302+
LOGS_DEFAULT(VERBOSE) << op_desc
303+
<< "] zero point can only be [" << required_zp
304+
<< "], actual zero point: " << scale;
305+
return false;
306+
}
307+
308+
return true;
309+
}
310+
275311
#pragma endregion helpers
276312

277313
#pragma region op_base
@@ -1142,8 +1178,19 @@ class SoftMaxOpSupportChecker : public BaseOpSupportChecker {
11421178
const OpSupportCheckParams& /* params */) const override {
11431179
return ANEURALNETWORKS_FEATURE_LEVEL_2;
11441180
}
1181+
bool HasSupportedInputOutputsImpl(
1182+
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
1183+
const OpSupportCheckParams& params) const override;
1184+
1185+
bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; }
1186+
1187+
bool IsQuantizedOp(const NodeUnit& node_unit) const override;
11451188
};
11461189

1190+
bool SoftMaxOpSupportChecker::IsQuantizedOp(const NodeUnit& node_unit) const {
1191+
return GetQuantizedOpType(node_unit) == QuantizedOpType::QDQSoftmax;
1192+
}
1193+
11471194
bool SoftMaxOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
11481195
const OpSupportCheckParams& params) const {
11491196
Shape input_shape;
@@ -1171,6 +1218,32 @@ bool SoftMaxOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* i
11711218
return true;
11721219
}
11731220

1221+
bool SoftMaxOpSupportChecker::HasSupportedInputOutputsImpl(
1222+
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
1223+
const OpSupportCheckParams& params) const {
1224+
if (!IsQuantizedOp(node_unit)) {
1225+
return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);
1226+
}
1227+
1228+
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input)) {
1229+
return false;
1230+
}
1231+
1232+
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) {
1233+
return false;
1234+
}
1235+
1236+
// NNAPI requires the scale be 1.f/256 and zero point to be 0
1237+
if (!HasRequiredScaleAndZeroPoint(initializers,
1238+
MakeString("Op [", node_unit.OpType(), "] name [", node_unit.Name(), "]'s output 0 "),
1239+
node_unit.Outputs()[0], node_unit.ModelPath(),
1240+
1.f / 256 /* required_scale */, 0 /* required_zp */)) {
1241+
return false;
1242+
}
1243+
1244+
return true;
1245+
}
1246+
11741247
#pragma endregion
11751248

11761249
#pragma region op_gemm
@@ -1443,29 +1516,13 @@ int UnaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) const
14431516
const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) {
14441517
const auto& op_type = node_unit.OpType();
14451518
ORT_ENFORCE(op_type == "QLinearSigmoid");
1446-
const auto& op_name = node_unit.Name();
14471519

14481520
// NNAPI requires the scale be 1.f/256 and zero point to be 0
14491521
// See https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/android10-c2f2-release/nn/common/operations/Activation.cpp#180
1450-
float output_scale = 0.0f;
1451-
int32_t output_zp = 0;
1452-
auto status = GetQuantizationScaleAndZeroPoint(initializers, node_unit.Outputs()[0], node_unit.ModelPath(),
1453-
output_scale, output_zp);
1454-
if (!status.IsOK()) {
1455-
LOGS_DEFAULT(ERROR) << "Op [" << op_type << "] name [" << op_name
1456-
<< "] GetQuantizationScaleAndZeroPoint failed, message: " << status.ErrorMessage();
1457-
return false;
1458-
}
1459-
1460-
if (output_scale != 1.f / 256) {
1461-
LOGS_DEFAULT(VERBOSE) << "Op [" << op_type << "] name [" << op_name
1462-
<< "] output scale can only be 1.f/256, actual scale: " << output_scale;
1463-
return false;
1464-
}
1465-
1466-
if (output_zp != 0) {
1467-
LOGS_DEFAULT(VERBOSE) << "Op [" << op_type << "] name [" << op_name
1468-
<< "] output zero point can only be 0, actual zero point: " << output_scale;
1522+
if (!HasRequiredScaleAndZeroPoint(initializers,
1523+
MakeString("Op [", op_type, "] name [", node_unit.Name(), "]'s output 0 "),
1524+
node_unit.Outputs()[0], node_unit.ModelPath(),
1525+
1.f / 256 /* required_scale */, 0 /* required_zp */)) {
14691526
return false;
14701527
}
14711528

onnxruntime/test/optimizer/qdq_test_utils.h

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,9 @@ GetQDQTestCaseFn BuildQDQTransposeTestCase(
193193
const std::vector<int64_t>& input_shape,
194194
const std::vector<int64_t>& perms) {
195195
return [input_shape, perms](ModelTestBuilder& builder) {
196-
auto* input_arg = builder.MakeInput<InputType>(input_shape, -128, 127);
196+
auto* input_arg = builder.MakeInput<InputType>(input_shape,
197+
std::numeric_limits<InputType>::min(),
198+
std::numeric_limits<InputType>::max());
197199
auto* output_arg = builder.MakeOutput();
198200

199201
InputType dq_zp = std::numeric_limits<InputType>::max() / 2;
@@ -215,5 +217,30 @@ GetQDQTestCaseFn BuildQDQTransposeTestCase(
215217

216218
GetQDQTestCaseFn BuildQDQReshapeTestCase(const std::vector<int64_t>& input_shape,
217219
const std::vector<int64_t>& reshape_shape);
220+
221+
template <typename InputType, typename OutputType>
222+
GetQDQTestCaseFn BuildQDQSoftMaxTestCase(const std::vector<int64_t>& input_shape, const int64_t& axis = -1) {
223+
return [input_shape, axis](ModelTestBuilder& builder) {
224+
auto* input_arg = builder.MakeInput<InputType>(input_shape,
225+
std::numeric_limits<InputType>::min(),
226+
std::numeric_limits<InputType>::max());
227+
228+
auto* output_arg = builder.MakeOutput();
229+
230+
// add DQ
231+
auto* dq_output = builder.MakeIntermediate();
232+
builder.AddDequantizeLinearNode<InputType>(input_arg, .003f, std::numeric_limits<InputType>::max() / 2, dq_output);
233+
234+
// add SoftMax
235+
auto* softmax_output = builder.MakeIntermediate();
236+
Node& softmax_node = builder.AddNode("Softmax", {dq_output}, {softmax_output});
237+
238+
softmax_node.AddAttribute("axis", axis);
239+
240+
// add Q
241+
builder.AddQuantizeLinearNode<OutputType>(softmax_output, 1.f / 256, 0, output_arg);
242+
};
243+
}
244+
218245
} // namespace test
219246
} // namespace onnxruntime

onnxruntime/test/providers/nnapi/nnapi_basic_test.cc

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,8 @@ TEST(NnapiExecutionProviderTest, TestQDQConv) {
306306
uint8_t /* WeightType */,
307307
int32_t /* BiasType */,
308308
uint8_t /* OutputType */>(
309-
{1, 1, 5, 5} /*input_shape*/,
310-
{1, 1, 3, 3} /*weights_shape*/),
309+
{1, 1, 5, 5} /* input_shape */,
310+
{1, 1, 3, 3} /* weights_shape */),
311311
"nnapi_qdq_test_graph_conv",
312312
{true /* verify_entire_graph_use_ep */});
313313
}
@@ -384,6 +384,16 @@ TEST(NnapiExecutionProviderTest, TestQDQReshape) {
384384
});
385385
}
386386

387+
TEST(NnapiExecutionProviderTest, TestQDQSoftMax) {
  // QDQ Softmax over a [1, 32] input along axis 1; the entire graph is
  // expected to be assigned to the NNAPI EP.
  const auto build_test_case = BuildQDQSoftMaxTestCase<uint8_t, uint8_t>(
      {1, 32} /* input_shape */,
      static_cast<int64_t>(1) /* axis */);
  RunQDQModelTest(build_test_case,
                  "nnapi_qdq_test_graph_softmax",
                  {true /* verify_entire_graph_use_ep */});
}
396+
387397
#endif // !(ORT_MINIMAL_BUILD)
388398

389399
TEST(NnapiExecutionProviderTest, NNAPIFlagsTest) {

0 commit comments

Comments (0)