add arg_max tensorrt converter, fix identity_scale_op_clean_pass (#42850)

5efc4146 · zhupengyang · GitHub · 5d1bbecb · 5efc4146 · 5efc4146
10 changed file
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 #include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -29,55 +29,62 @@ void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const {
  // ->
  // pre_op -> scale_out
  GraphPatternDetector detector;
-  auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op();
+  auto scale_in =
-  auto scale_in = detector.mutable_pattern()
+      detector.mutable_pattern()
-                      ->NewNode("scale_in")
+          ->NewNode("scale_in")
-                      ->assert_is_op_input("scale")
+          ->assert_is_op_input("scale")
-                      ->AsIntermediate();
+          ->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
  auto scale_op = detector.mutable_pattern()
                      ->NewNode("scale_fuse")
                      ->assert_is_op("scale")
                      ->assert_op_attr<float>("scale", 1.)
                      ->assert_op_attr<float>("bias", 0.);
-  auto scale_out =
+  auto scale_out = detector.mutable_pattern()
-      detector.mutable_pattern()
+                       ->NewNode("scale_out")
-          ->NewNode("scale_out")
+                       ->assert_is_op_output("scale");
-          ->assert_is_op_output("scale")
-          // scale's output var should has only one consumer, or it can't be
-          // removed.
-          ->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
-  pre_op->LinksTo({scale_in});
  scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
+  int found_subgraph_count = 0;
  GraphPatternDetector::handle_t handler = [&](
      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
    Node* scale_op_var = subgraph.at(scale_op);
    Node* scale_in_var = subgraph.at(scale_in);
    Node* scale_out_var = subgraph.at(scale_out);
-    Node* pre_op_var = subgraph.at(pre_op);
-    // Link pre_op directly to scale_out
    const std::string scale_in_name = scale_in_var->Name();
    const std::string scale_out_name = scale_out_var->Name();
    // Remove links in graph
    GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var});
-    // Modify proto message
+    // Modify pre_op_desc
-    auto* pre_op_desc = pre_op_var->Op();
+    // Link pre_op directly to scale_out
-    for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) {
+    for (auto& node : graph->Nodes()) {
-      auto* arguments = parameter.mutable_arguments();
+      if (node->IsOp()) {
-      auto it = std::find(arguments->begin(), arguments->end(), scale_in_name);
+        auto* op_desc = node->Op();
-      PADDLE_ENFORCE_NE(
+        auto out_vars_map = op_desc->Outputs();
-          it, arguments->end(),
+        for (auto out_var_map : out_vars_map) {
-          platform::errors::NotFound(
+          auto names = out_var_map.second;
-              "Can not find input variable(%s) from scale op(%s).",
+          bool reset = false;
-              scale_in_name, pre_op_desc->Type()));
+          for (size_t i = 0; i < names.size(); i++) {
-      *it = scale_out_name;
+            if (names[i] == scale_in_name) {
+              reset = true;
+              names[i] = scale_out_name;
+              break;
+            }
+          }
+          if (reset) {
+            op_desc->SetOutput(out_var_map.first, names);
+            op_desc->Flush();
+            IR_NODE_LINK_TO(node, scale_out_var);
+            break;
+          }
+        }
+      }
    }
+    found_subgraph_count++;
-    IR_NODE_LINK_TO(pre_op_var, scale_out_var);
  };
  detector(graph, handler);
+  AddStatis(found_subgraph_count);
 }
 }  // namespace ir
@@ -86,3 +93,7 @@ void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(identity_scale_op_clean_pass,
              paddle::framework::ir::IdentityScaleOpCleanPass);
+REGISTER_PASS_CAPABILITY(identity_scale_op_clean_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "scale", 0));
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -139,6 +139,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  block_desc.Proto()->set_parent_idx(-1);
  block_desc.Proto()->set_idx(0);
  LOG(INFO) << "---  detect a sub-graph with " << subgraph.size() << " nodes";
+  for (auto node : subgraph) {
+    if (node->NodeType() == Node::Type::kOperation) {
+      VLOG(5) << "trt subgraph has op: " << (node->Op()->Type());
+    }
+  }
  for (auto *node : subgraph) {
    auto *new_block_op = new_block->AppendOp();

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1782,6 +1782,7 @@ USE_TRT_CONVERTER(gather);
 USE_TRT_CONVERTER(anchor_generator);
 USE_TRT_CONVERTER(yolo_box);
 USE_TRT_CONVERTER(yolo_box_head);
+USE_TRT_CONVERTER(arg_max);
 USE_TRT_CONVERTER(roi_align);
 USE_TRT_CONVERTER(affine_channel);
 USE_TRT_CONVERTER(multiclass_nms);

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -82,7 +82,8 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
 void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
 const std::vector<std::string> kTRTSubgraphPasses({
-  "adaptive_pool2d_convert_global_pass",
+  "identity_scale_op_clean_pass",              //
+      "adaptive_pool2d_convert_global_pass",   //
      "shuffle_channel_detect_pass",           //
      "quant_conv2d_dequant_fuse_pass",        //
      "delete_fill_constant_op_pass",          //

--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -37,6 +37,7 @@ nv_library(tensorrt_converter
                anchor_generator_op.cc
                yolo_box_op.cc
                yolo_box_head_op.cc
+                arg_max_op.cc
                roi_align_op.cc
                affine_channel_op.cc
                multiclass_nms_op.cc

--- a/paddle/fluid/inference/tensorrt/convert/arg_max_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/arg_max_op.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+class ArgMaxOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a fluid arg_max op to tensorrt topk layer";
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto input_dims = input->getDimensions();
+    int rank = input_dims.nbDims;
+    int axis = op_desc.HasAttr("axis")
+                   ? BOOST_GET_CONST(int64_t, op_desc.GetAttr("axis"))
+                   : -1;
+    if (axis > 0) axis -= 1;
+    if (axis < 0) axis += rank;
+    auto* topk_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, TopK, *input, nvinfer1::TopKOperation::kMAX, 1, 1 << axis);
+    auto output_name = op_desc.Output("Out")[0];
+    bool keepdims = BOOST_GET_CONST(bool, op_desc.GetAttr("keepdims"));
+    if (keepdims) {
+      RreplenishLayerAndOutput(topk_layer, "arg_max",
+                               {output_name + "_value", output_name},
+                               test_mode);
+    } else {
+      auto squeeze_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *topk_layer->getOutput(1));
+      auto dims = input_dims;
+      dims.nbDims -= 1;
+      for (int i = axis; i < dims.nbDims; i++) {
+        dims.d[i] = dims.d[i + 1];
+      }
+      squeeze_layer->setReshapeDimensions(dims);
+      RreplenishLayerAndOutput(squeeze_layer, "arg_max", {output_name},
+                               test_mode);
+    }
+  }
+};
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+REGISTER_TRT_OP_CONVERTER(arg_max, ArgMaxOpConverter);
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -102,6 +102,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "gather_nd",
      "yolo_box",
      "yolo_box_head",
+      "arg_max",
      "roi_align",
      "affine_channel",
      "nearest_interp",
@@ -169,6 +170,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "gather_nd",
      "yolo_box",
      "yolo_box_head",
+      "arg_max",
      "roi_align",
      "affine_channel",
      "nearest_interp",
@@ -644,6 +646,16 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
      if (!has_attrs) return false;
    }
+    if (op_type == "arg_max") {
+      if (with_dynamic_shape) return false;
+      int axis = desc.HasAttr("axis")
+                     ? BOOST_GET_CONST(int64_t, desc.GetAttr("axis"))
+                     : -1;
+      bool flatten = BOOST_GET_CONST(bool, desc.GetAttr("flatten"));
+      int dtype = BOOST_GET_CONST(int, desc.GetAttr("dtype"));
+      if (axis == 0 || flatten || dtype != 2) return false;
+    }
    if (op_type == "affine_channel") {
      if (!desc.HasAttr("data_layout")) return false;
      auto data_layout = framework::StringToDataLayout(

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_identity_scale_clean_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_identity_scale_clean_pass.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+from auto_scan_test import PassAutoScanTest
+from program_config import TensorConfig, ProgramConfig, OpConfig
+import paddle.inference as paddle_infer
+import unittest
+import hypothesis.strategies as st
+class TestIdentityScaleCleanPass(PassAutoScanTest):
+    def sample_predictor_configs(self, program_config):
+        config = self.create_trt_inference_config()
+        config.enable_tensorrt_engine(
+            max_batch_size=8,
+            workspace_size=0,
+            min_subgraph_size=0,
+            precision_mode=paddle_infer.PrecisionType.Float32,
+            use_static=False,
+            use_calib_mode=False)
+        yield config, ['relu'], (1e-5, 1e-5)
+    def sample_program_config(self, draw):
+        bias_after_scale = draw(st.booleans())
+        n = draw(st.integers(min_value=1, max_value=4))
+        c = draw(st.integers(min_value=1, max_value=20))
+        h = draw(st.integers(min_value=1, max_value=20))
+        w = draw(st.integers(min_value=1, max_value=20))
+        relu_op = OpConfig(
+            "relu", inputs={"X": ["relu_x"]}, outputs={"Out": ["relu_out"]})
+        scale_op = OpConfig(
+            "scale",
+            inputs={"X": ["relu_out"]},
+            outputs={"Out": ["scale_out"]},
+            bias=0.,
+            scale=1.,
+            bias_after_scale=True)
+        program_config = ProgramConfig(
+            ops=[relu_op, scale_op],
+            weights={},
+            inputs={"relu_x": TensorConfig(shape=[n, c, h, w])},
+            outputs=["scale_out"])
+        return program_config
+    def test(self):
+        self.run_and_statis(
+            max_examples=25, passes=["identity_scale_op_clean_pass"])
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest
+from program_config import TensorConfig, ProgramConfig
+import unittest
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import List
+class TrtConvertArgMaxTest(TrtLayerAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        input_shape = program_config.inputs["arg_max_input"].shape
+        axis = program_config.ops[0].attrs["axis"]
+        if axis < 0:
+            axis += len(input_shape)
+        if len(input_shape) <= axis or axis == 0:
+            return False
+        return True
+    def sample_program_configs(self):
+        def generate_input(rank, batch):
+            dims = [batch]
+            for i in range(rank - 1):
+                dims.append((i + 1) * 8)
+            size = np.prod(dims)
+            return (np.arange(size) % 10 - 5).astype("float32").reshape(dims)
+        for rank in [3, 4]:
+            for batch in [1, 4]:
+                for axis in [-1, 0, 1, 2, 3]:
+                    for keepdims in [True, False]:
+                        flatten = False
+                        dtype = 2
+                        ops_config = [{
+                            "op_type": "arg_max",
+                            "op_inputs": {
+                                "X": ["arg_max_input"]
+                            },
+                            "op_outputs": {
+                                "Out": ["arg_max_out"]
+                            },
+                            "op_attrs": {
+                                "axis": axis,
+                                "keepdims": keepdims,
+                                "flatten": flatten,
+                                "dtype": dtype
+                            }
+                        }]
+                        ops = self.generate_op_config(ops_config)
+                        program_config = ProgramConfig(
+                            ops=ops,
+                            weights={},
+                            inputs={
+                                "arg_max_input": TensorConfig(data_gen=partial(
+                                    generate_input, rank, batch))
+                            },
+                            outputs=["arg_max_out"])
+                        yield program_config
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        self.trt_param.workspace_size = 1024000
+        yield self.create_inference_config(), [1, 2], 1e-5
+    def test(self):
+        self.run_test()
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py
@@ -42,7 +42,7 @@ class TrtConvertScaleTest(TrtLayerAutoScanTest):
        for num_input in [0, 1]:
            for dims in [1, 2, 3, 4]:
                for batch in [1, 2]:
-                    for scale in [0.1, 1.0]:
+                    for scale in [0.1, -1.0]:
                        for bias in [0.0, 1.2]:
                            for bias_after_scale in [False, True]:
                                self.num_input = num_input