Unverified commit 26ae6d49, authored by Zhaolong Xing, committed via GitHub

Update trt5 for paddle-trt (#18645)

* Update Paddle-TRT:
    1. Fix a bug where the split plugin core-dumped when batch > 2.
    2. Add TRT 5 leaky_relu support (YOLOv3 inference time drops from 65 ms to 42 ms).
    3. Support the new dropout_implementation attribute in the dropout converter.
    4. Add shuffle_channel, swish, and relu6 support.
    test=develop

* Fix CI.
test=develop
Parent commit: d8396281
@@ -1880,6 +1880,9 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
 void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) {
   auto reshape1_op =
       pattern->NewNode(reshape1_op_repr())->assert_is_op("reshape2");
+  reshape1_op->assert_more([&](Node *x) {
+    return boost::get<std::vector<int>>(x->Op()->GetAttr("shape")).size() == 5;
+  });
   auto reshape1_out = pattern->NewNode(reshape1_out_repr())
                           ->assert_is_op_output("reshape2", "Out")
......
@@ -968,6 +968,8 @@ USE_TRT_CONVERTER(split);
 USE_TRT_CONVERTER(prelu);
 USE_TRT_CONVERTER(conv2d_transpose);
 USE_TRT_CONVERTER(leaky_relu);
+USE_TRT_CONVERTER(shuffle_channel);
+USE_TRT_CONVERTER(swish);
 #endif
 #if PADDLE_WITH_ANAKIN
......
@@ -74,6 +74,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
   "infer_clean_graph_pass",                    //
   "conv_affine_channel_fuse_pass",             //
   "conv_eltwiseadd_affine_channel_fuse_pass",  //
+  "shuffle_channel_detect_pass",               //
   "quant_conv2d_dequant_fuse_pass",            //
   "delete_quant_dequant_op_pass",              //
   // "fc_fuse_pass",                        //
......
@@ -3,6 +3,7 @@ nv_library(tensorrt_converter
       SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
       batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
       pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc
+      shuffle_channel_op.cc swish_op.cc
       DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -42,3 +43,9 @@ nv_test(test_op_converter SRCS test_op_converter.cc DEPS
 #           prelu_op)
 #nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc
 #          DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op)
+#nv_test(test_shuffle_channel_op SRCS test_shuffle_channel_op.cc shuffle_channel_op.cc
+#          DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine shuffle_channel_op)
+#nv_test(test_swish_op SRCS test_swish_op.cc swish_op.cc
+#          DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op tensorrt_plugin)
@@ -42,11 +42,20 @@ class ActivationOpConverter : public OpConverter {
     nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
         engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
         op_pair->second);
+#if IS_TRT_VERSION_GE(5130)
+    // max(alpha, min(beta, x))
+    if (op_type_ == "relu6") {
+      layer->setAlpha(0.);
+      layer->setBeta(6.);
+    }
+#endif
     auto output_name = op_desc.Output("Out")[0];
     RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode);
     if (op_desc.HasAttr("out_scale")) {
-#if IS_TRT_VERSION_GE(5000)
+#if IS_TRT_VERSION_GE(5130)
       float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
       engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
 #endif
@@ -63,6 +72,9 @@ const std::unordered_map<std::string, nvinfer1::ActivationType>
     {"relu", nvinfer1::ActivationType::kRELU},
     {"sigmoid", nvinfer1::ActivationType::kSIGMOID},
     {"tanh", nvinfer1::ActivationType::kTANH},
+#if IS_TRT_VERSION_GE(5130)
+    {"relu6", nvinfer1::ActivationType::kCLIP},
+#endif
 };

 class ReluOpConverter : public ActivationOpConverter {
@@ -80,6 +92,11 @@ class TanhOpConverter : public ActivationOpConverter {
   TanhOpConverter() { op_type_ = "tanh"; }
 };

+class Relu6OpConverter : public ActivationOpConverter {
+ public:
+  Relu6OpConverter() { op_type_ = "relu6"; }
+};
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
@@ -87,3 +104,4 @@ class TanhOpConverter : public ActivationOpConverter {
 REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
 REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter);
 REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter);
+REGISTER_TRT_OP_CONVERTER(relu6, Relu6OpConverter);
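(Note: TensorRT's kCLIP activation computes max(alpha, min(beta, x)), so alpha = 0 and beta = 6 reproduce relu6 exactly. A minimal reference of the same math, with a hypothetical helper name, illustration only:)

#include <algorithm>

// relu6(x) = max(0, min(6, x)) -- what kCLIP computes with
// setAlpha(0.f) and setBeta(6.f) in the converter above.
float relu6_reference(float x) {
  return std::max(0.0f, std::min(6.0f, x));
}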
@@ -31,6 +31,20 @@ class DropoutOpConverter : public OpConverter {
     auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
     float dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));

+    std::string downgrade_in_infer = "";
+    if (op_desc.HasAttr("dropout_implementation")) {
+      downgrade_in_infer =
+          boost::get<std::string>(op_desc.GetAttr("dropout_implementation"));
+    }
+
+    if (!downgrade_in_infer.empty() &&
+        downgrade_in_infer == "upscale_in_train") {
+      auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
+      auto output_name = op_desc.Output("Out")[0];
+      RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode);
+      return;
+    }
+
     platform::CPUPlace cpu_place;
     std::unique_ptr<framework::LoDTensor> weight_tensor(
         new framework::LoDTensor());
......
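(Note on the new branch above: with dropout_implementation = "upscale_in_train", Paddle's dropout already divides by (1 - dropout_prob) during training, so at inference the op is a pure identity and can be lowered to a pass-through Shuffle layer; the legacy "downgrade_in_infer" mode still multiplies by (1 - dropout_prob) at inference, which the scale-weight path kept below handles. A small sketch of the two inference-time behaviors, hypothetical helper name:)

#include <string>

// Inference-time semantics of Paddle's dropout for the two
// dropout_implementation modes.
float dropout_infer_reference(float x, float dropout_prob,
                              const std::string& impl) {
  if (impl == "upscale_in_train") {
    return x;  // scaling already happened during training -> identity here
  }
  return x * (1.0f - dropout_prob);  // "downgrade_in_infer" (default)
}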
@@ -35,7 +35,14 @@ class LeakyReluOpConverter : public OpConverter {
     PADDLE_ENFORCE(output_num == 1);
     // Get attrs
     float alpha = boost::get<float>(op_desc.GetAttr("alpha"));
+    nvinfer1::ILayer* output_layer = nullptr;
+
+#if IS_TRT_VERSION_GE(5100)
+    nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Activation, *input, nvinfer1::ActivationType::kLEAKY_RELU);
+    layer->setAlpha(alpha);
+    output_layer = layer;
+#else
     platform::CPUPlace place;
     std::unique_ptr<framework::LoDTensor> alpha_tensor(
         new framework::LoDTensor());
@@ -65,7 +72,7 @@ class LeakyReluOpConverter : public OpConverter {
                              nvinfer1::ScaleMode::kUNIFORM, shift.get(),
                              sub_scale.get(), power.get());
     PADDLE_ENFORCE(nullptr != scale_relu_layer);
-    auto* output_layer =
+    output_layer =
         TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(scale_layer->getOutput(0)),
                              *(scale_relu_layer->getOutput(0)),
                              nvinfer1::ElementWiseOperation::kSUM);
@@ -75,7 +82,7 @@ class LeakyReluOpConverter : public OpConverter {
     PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) ==
                    engine_->weight_map.end());
     engine_->weight_map[alpha_name] = std::move(alpha_tensor);
+#endif
     auto output_name = op_desc.Output("Out")[0];
     RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name},
                              test_mode);
......
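(Note: on TensorRT >= 5.1 the converter now uses the native kLEAKY_RELU activation with setAlpha(alpha); the pre-5.1 fallback kept under #else assembles the same function from Scale and ElementWise layers, roughly alpha*x + (1 - alpha)*relu(x). A one-line reference of that identity, hypothetical helper name, illustration only:)

#include <algorithm>

// leaky_relu(x) = x for x >= 0, alpha*x for x < 0,
// which equals alpha*x + (1 - alpha)*max(x, 0).
float leaky_relu_reference(float x, float alpha) {
  return alpha * x + (1.0f - alpha) * std::max(x, 0.0f);
}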
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * ShuffleChannelOp, built from two TensorRT Shuffle layers
 * (reshape -> transpose -> reshape).
 */
class ShuffleChannelOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
    auto input_dims = input->getDimensions();
    PADDLE_ENFORCE(input_dims.nbDims == 3);
    int c = input_dims.d[0];
    int h = input_dims.d[1];
    int w = input_dims.d[2];
    int group = boost::get<int>(op_desc.GetAttr("group"));

    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
    nvinfer1::Dims4 reshape_dim(group, c / group, h, w);
    layer->setReshapeDimensions(reshape_dim);
    layer->setSecondTranspose({1, 0, 2, 3});
    auto* output = layer->getOutput(0);

    auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *output);
    nvinfer1::DimsCHW reshape_dim2(c, h, w);
    reshape_layer->setReshapeDimensions(reshape_dim2);

    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(reshape_layer, "concat", {output_name}, test_mode);
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(shuffle_channel, ShuffleChannelOpConverter);
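(Note: the two Shuffle layers above implement channel shuffle: view the C channels as (group, C/group), swap those two axes, and flatten back, i.e. input channel ch moves to (ch % (C/group)) * group + ch / (C/group). A CPU sketch of that mapping on CHW data, hypothetical helper name, illustration only:)

#include <vector>

// CPU reference of channel shuffle: reshape to (group, C/group, H, W),
// transpose the first two axes, reshape back to (C, H, W).
std::vector<float> shuffle_channel_reference(const std::vector<float>& in,
                                             int c, int h, int w, int group) {
  std::vector<float> out(in.size());
  int group_size = c / group;
  for (int ch = 0; ch < c; ++ch) {
    int g_idx = ch / group_size;  // position in the (group, C/group) view ...
    int k = ch % group_size;
    int new_ch = k * group + g_idx;  // ... after swapping the two axes
    for (int i = 0; i < h * w; ++i) {
      out[new_ch * h * w + i] = in[ch * h * w + i];
    }
  }
  return out;
}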
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {

class SwishOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4) << "convert fluid swish op to tensorrt layer";

    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    int input_num = op_desc.Input("X").size();
    PADDLE_ENFORCE(input_num == 1);
    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
    // Get output
    size_t output_num = op_desc.Output("Out").size();
    PADDLE_ENFORCE(output_num == 1);
    // Get attrs
    float beta = boost::get<float>(op_desc.GetAttr("beta"));

    plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);

    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(layer, "swish", {output_name}, test_mode);
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(swish, SwishOpConverter);
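(Note: swish(x) = x * sigmoid(beta * x); since this TensorRT version has no built-in swish activation, the converter delegates to the SwishPlugin added later in this patch. Reference of the math, hypothetical helper name, illustration only:)

#include <cmath>

// swish(x) = x * sigmoid(beta * x) = x / (1 + exp(-beta * x))
float swish_reference(float x, float beta) {
  return x / (1.0f + std::exp(-beta * x));
}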
@@ -46,6 +46,8 @@ TEST(SigmoidOpConverter, main) { test_activation("sigmoid"); }
 TEST(TanhOpConverter, main) { test_activation("tanh"); }

+TEST(Relu6OpConverter, main) { test_activation("relu6"); }
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
@@ -53,3 +55,4 @@ TEST(TanhOpConverter, main) { test_activation("tanh"); }
 USE_OP(relu);
 USE_OP(sigmoid);
 USE_OP(tanh);
+USE_OP(relu6);
@@ -34,6 +34,7 @@ TEST(DropoutOpConverter, main) {
   framework::OpDesc desc;
   int is_test = 1;
   float dropout_prob = 0.4;
+  std::string dropout_implementation = "upscale_in_train";

   desc.SetType("dropout");
   desc.SetInput("X", {"dropout-X"});
@@ -42,6 +43,8 @@ TEST(DropoutOpConverter, main) {
   desc.SetAttr("is_test", is_test);
   desc.SetAttr("dropout_prob", dropout_prob);
+  desc.SetAttr("dropout_implementation", dropout_implementation);
+
   LOG(INFO) << "set OP";
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {

TEST(shuffle_channel_op, test_shuffle_channel) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1000);
  validator.DeclInputVar("sc_input", nvinfer1::DimsCHW(4, 2, 2));
  validator.DeclOutputVar("sc_out", nvinfer1::DimsCHW(4, 2, 2));

  // Prepare Op description
  framework::OpDesc desc;
  desc.SetType("shuffle_channel");
  desc.SetInput("X", {"sc_input"});
  desc.SetOutput("Out", {"sc_out"});

  int group = 2;
  desc.SetAttr("group", group);

  validator.SetOp(*desc.Proto());
  validator.Execute(1);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

USE_OP(shuffle_channel);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {

TEST(swish_op, test_swish) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1000);
  validator.DeclInputVar("sw_input", nvinfer1::DimsCHW(3, 2, 2));
  validator.DeclOutputVar("sw_out", nvinfer1::DimsCHW(3, 2, 2));

  // Prepare Op description
  framework::OpDesc desc;
  desc.SetType("swish");
  desc.SetInput("X", {"sw_input"});
  desc.SetOutput("Out", {"sw_out"});
  desc.SetAttr("beta", 2.0f);

  validator.SetOp(*desc.Proto());
  validator.Execute(1);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

USE_OP(swish);
@@ -20,7 +20,11 @@ namespace tensorrt {
 // Just tell by the op_types.
 struct SimpleOpTypeSetTeller : public Teller {
-  SimpleOpTypeSetTeller() {}
+  SimpleOpTypeSetTeller() {
+#if IS_TRT_VERSION_GE(5130)
+    teller_set.insert("relu6");
+#endif
+  }

   bool operator()(const std::string& op_type,
                   const framework::OpDesc& desc) override {
@@ -28,11 +32,27 @@ struct SimpleOpTypeSetTeller : public Teller {
   }

  private:
-  std::unordered_set<std::string> teller_set{
-      {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
-       "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-       "elementwise_add", "elementwise_mul", "dropout", "prelu",
-       "conv2d_transpose", "leaky_relu", "fc"}};
+  std::unordered_set<std::string> teller_set{{"mul",
+                                              "conv2d",
+                                              "pool2d",
+                                              "relu",
+                                              "softmax",
+                                              "sigmoid",
+                                              "depthwise_conv2d",
+                                              "batch_norm",
+                                              "concat",
+                                              "tanh",
+                                              "pad",
+                                              "elementwise_add",
+                                              "elementwise_mul",
+                                              "dropout",
+                                              "prelu",
+                                              "conv2d_transpose",
+                                              "leaky_relu",
+                                              "fc",
+                                              "shuffle_channel",
+                                              "swish",
+                                              "split"}};
 };

 bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
......
@@ -18,6 +18,7 @@
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"

 namespace paddle {
 namespace inference {
......
 nv_library(tensorrt_plugin
   SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
   prelu_op_plugin.cu trt_plugin_factory.cc
-  avg_pool_op_plugin.cu
+  avg_pool_op_plugin.cu swish_op_plugin.cu
   DEPS enforce tensorrt_engine prelu)
@@ -34,6 +34,7 @@ int PReluPlugin::initialize() {
   cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
   cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
              cudaMemcpyHostToDevice);
+  return 0;
 }

 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
......
@@ -27,50 +27,20 @@ SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
 }
 REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);

-// copied from operators::math::SplitFunctor
 template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int* out_cols,
-                            int out_cols_size, T** outputs_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int curr_segment = 0;
-  int curr_offset = out_cols[0];
-  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    int curr_col_offset = out_cols[curr_segment + 1];
-    while (curr_col_offset <= tid_x) {
-      curr_offset = curr_col_offset;
-      ++curr_segment;
-      curr_col_offset = out_cols[curr_segment + 1];
-    }
-    int local_col = tid_x - curr_offset;
-    int segment_width = curr_col_offset - curr_offset;
-    T* output_ptr = outputs_data[curr_segment];
-    if (output_ptr != nullptr) {
-      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-        output_ptr[tid_y * segment_width + local_col] =
-            input_data[tid_y * in_col + tid_x];
-    }
-  }
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
-                            T** outputs_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x / fixed_out_col;
-    int in_offset = tid_x - split * fixed_out_col;
-    T* output_ptr = outputs_data[split];
-    if (output_ptr != nullptr) {
-      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-        output_ptr[tid_y * fixed_out_col + in_offset] =
-            input_data[tid_y * in_col + tid_x];
-    }
+__device__ int upper_bound(T const* vals, int n, T const& key) {
+  int i = 0;
+  while (n > 0) {
+    int m = n / 2;
+    int j = i + m;
+    if (!(key < vals[j])) {
+      i = j + 1;
+      n -= m + 1;
+    } else {
+      n = m;
+    }
   }
+  return i;
 }

 nvinfer1::Dims SplitPlugin::getOutputDimensions(
@@ -101,80 +101,61 @@ int SplitPlugin::initialize() {
     if (output_length_[i] != output_length_[0]) {
       same_shape_ = false;
     }
-    segment_offsets.push_back(segment_offsets.back() +
-                              output_length_[i] * inner_cols_);
+    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
   }
-  inner_cols_ *= dims.d[axis_];
+  axis_shape_ = dims.d[axis_];
   d_segment_offsets_ = segment_offsets;
   segment_offsets_ = std::move(segment_offsets);
   d_output_ptrs_.resize(this->getNbOutputs(), nullptr);
   return 0;
 }

+// The following part of the code refers to onnx-tensorrt
+// https://github.com/onnx/onnx-tensorrt/blob/master/Split.cu
 template <typename T>
-inline void Split(cudaStream_t stream, const bool same_shape,
-                  const int outer_rows, const int inner_cols,
-                  const std::vector<int>& segment_offsets,
-                  const int* d_segment_offsets, const T* input, T** outputs) {
-  const int kThreadsPerBlock = 1024;
-  const int kMaxBlocks = 65535;
-  int block_cols = kThreadsPerBlock;
-  if (inner_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
-    block_cols = ((inner_cols + 31) >> 5) << 5;
-  }
-  int block_rows = kThreadsPerBlock / block_cols;
-  dim3 block_size = dim3(block_cols, block_rows, 1);
-
-  int grid_cols =
-      std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
-  int grid_rows =
-      std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));
-  dim3 grid_size = dim3(grid_cols, grid_rows, 1);
-  if (same_shape) {
-    SplitKernel<<<grid_size, block_size, 0, stream>>>(
-        input, outer_rows, inner_cols, segment_offsets[1], outputs);
-  } else {
-    SplitKernel<<<grid_size, block_size, 0, stream>>>(
-        input, outer_rows, inner_cols, d_segment_offsets,
-        static_cast<int>(segment_offsets.size()), outputs);
+__global__ void split_kernel(int nsegment,
+                             int const* __restrict__ segment_offsets,
+                             T const* __restrict__ idata, T* const* odatas,
+                             int inner_cols, int axis_shape, int outer_rows) {
+  int x0 = threadIdx.x + blockIdx.x * blockDim.x;
+  int src_y0 = threadIdx.y + blockIdx.y * blockDim.y;
+  int z0 = threadIdx.z + blockIdx.z * blockDim.z;
+  for (int z = z0; z < outer_rows; z += blockDim.z * gridDim.z) {
+    for (int src_y = src_y0; src_y < axis_shape;
+         src_y += blockDim.y * gridDim.y) {
+      for (int x = x0; x < inner_cols; x += blockDim.x * gridDim.x) {
+        int segment = upper_bound(segment_offsets, nsegment, src_y) - 1;
+        int dst_y = src_y - segment_offsets[segment];
+        int dst_ny = segment_offsets[segment + 1] - segment_offsets[segment];
+        odatas[segment][x + inner_cols * (dst_y + dst_ny * z)] =
+            idata[x + inner_cols * (src_y + axis_shape * z)];
+      }
+    }
   }
 }

 int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                          void** outputs, void* workspace, cudaStream_t stream) {
+  const int* d_segment_offsets_ptr =
+      thrust::raw_pointer_cast(&d_segment_offsets_[0]);
   float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
-  if (((batchSize == 1 && axis_ == 0) || axis_ == -1) &&
-      this->getNbOutputs() < 10) {
-    float** output_ptrs = reinterpret_cast<float**>(outputs);
-    int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT)
-                             ? sizeof(float)
-                             : sizeof(__half);
-    for (int i = 0; i < this->getNbOutputs(); ++i) {
-      PADDLE_ENFORCE(
-          cudaMemcpyAsync(
-              output_ptrs[i], input_ptr + segment_offsets_[i],
-              (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size,
-              cudaMemcpyDeviceToDevice, stream) == cudaSuccess);
-    }
-  } else {
-    outer_rows_ *= batchSize;
-    const int* d_segment_offsets_ptr =
-        thrust::raw_pointer_cast(&d_segment_offsets_[0]);
-    float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
-    PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs,
-                                   this->getNbOutputs() * sizeof(float*),
-                                   cudaMemcpyHostToDevice,
-                                   stream) == cudaSuccess);
-    if (this->getDataType() == nvinfer1::DataType::kFLOAT) {
-      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
-            d_segment_offsets_ptr, input_ptr, output_ptrs);
-    } else {
-      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
-            d_segment_offsets_ptr, (__half*)input_ptr,  // NOLINT
-            (__half**)output_ptrs);                     // NOLINT
-    }
-  }
+  float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
+  float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
+  PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, h_odatas,
+                                 d_output_ptrs_.size() * sizeof(float*),
+                                 cudaMemcpyHostToDevice,
+                                 stream) == cudaSuccess);
+
+  int outer_rows = outer_rows_ * batchSize;
+
+  dim3 block(32, 16);
+  dim3 grid(std::min((inner_cols_ - 1) / block.x + 1, 65535u),
+            std::min((axis_shape_ - 1) / block.y + 1, 65535u),
+            std::min((outer_rows_ - 1) / block.z + 1, 65535u));
+
+  split_kernel<<<grid, block, 0, stream>>>(
+      d_segment_offsets_.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
+      inner_cols_, axis_shape_, outer_rows);
   return cudaGetLastError() != cudaSuccess;
 }
......
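(Note: the rewritten plugin views the input as (outer_rows, axis_shape, inner_cols) and lets a single 3-D grid-stride kernel write every output at once; for each index along the split axis it binary-searches segment_offsets with upper_bound to find the destination output, replacing the old batch-dependent fast path — presumably this is the rewrite the commit message credits for the batch > 2 crash fix. A CPU sketch of the per-index segment lookup, hypothetical helper name, illustration only:)

#include <vector>

// CPU reference of the lookup done per src_y in split_kernel.
// segment_offsets = {0, len0, len0+len1, ...}; returns which output
// tensor (segment) a given index along the split axis falls into.
int find_segment(const std::vector<int>& segment_offsets, int src_y) {
  int segment = 0;
  while (segment + 1 < static_cast<int>(segment_offsets.size()) &&
         segment_offsets[segment + 1] <= src_y) {
    ++segment;
  }
  return segment;  // equivalent to upper_bound(offsets, n, src_y) - 1
}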
@@ -66,6 +66,7 @@ class SplitPlugin : public PluginTensorRT {
   int axis_;
   int outer_rows_;
   int inner_cols_;
+  int axis_shape_;
   bool same_shape_;
   std::vector<int> output_length_;
   std::vector<int> segment_offsets_;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <cassert>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

SwishPlugin *CreateSwishPluginDeserialize(const void *buffer, size_t length) {
  return new SwishPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("swish_plugin", CreateSwishPluginDeserialize);

int SwishPlugin::initialize() { return 0; }

nvinfer1::Dims SwishPlugin::getOutputDimensions(int index,
                                                const nvinfer1::Dims *inputDims,
                                                int nbInputs) {
  assert(nbInputs == 1);
  assert(index < this->getNbOutputs());
  nvinfer1::Dims const &input_dims = inputDims[0];
  nvinfer1::Dims output_dims = input_dims;
  return output_dims;
}

__global__ void swish_kernel(int num, const float *input, float *output,
                             float beta) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < num) {
#if __CUDA_ARCH__ >= 350
    output[index] =
        __ldg(input + index) / (1.0f + expf(-beta * __ldg(input + index)));
#else
    output[index] = input[index] / (1.0f + expf(-beta * input[index]));
#endif
  }
}

int SwishPlugin::enqueue(int batch_size, const void *const *inputs,
                         void **outputs, void *workspace, cudaStream_t stream) {
  // input dims is CHW.
  const auto &input_dims = this->getInputDims(0);
  const float *input = reinterpret_cast<const float *>(inputs[0]);
  float *output = reinterpret_cast<float **>(outputs)[0];
  int num = batch_size;
  for (int i = 0; i < input_dims.nbDims; i++) {
    num *= input_dims.d[i];
  }
  int threads = 1024;
  int blocks = (num + threads - 1) / threads;
  swish_kernel<<<blocks, threads, 0, stream>>>(num, input, output, beta_);

  return cudaGetLastError() != cudaSuccess;
}

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

class SwishPlugin : public PluginTensorRT {
 private:
  float beta_;

 protected:
  size_t getSerializationSize() override {
    return getBaseSerializationSize() + SerializedSize(beta_);
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt. It should not be called by users.
  void serialize(void *buffer) override {
    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, beta_);
  }

 public:
  explicit SwishPlugin(const float beta) : beta_(beta) {}

  // It was used for tensorrt deserialization.
  // It should not be called by users.
  SwishPlugin(void const *serialData, size_t serialLength) {
    deserializeBase(serialData, serialLength);
    DeserializeValue(&serialData, &serialLength, &beta_);
  }

  ~SwishPlugin() {}
  int initialize() override;

  SwishPlugin *clone() const override { return new SwishPlugin(beta_); }

  const char *getPluginType() const override { return "swish_plugin"; }
  int getNbOutputs() const override { return 1; }
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
                                     int nbInputDims) override;
  int enqueue(int batchSize, const void *const *inputs, void **outputs,
              void *workspace, cudaStream_t stream) override;
};

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
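(Note: getSerializationSize(), serialize(), and the deserializing constructor must stay in lockstep — whatever serialize() appends (plugin type, base fields, then beta_) is exactly what the factory registered via REGISTER_TRT_PLUGIN("swish_plugin", ...) reads back. A generic sketch of that symmetric write/read contract, using hypothetical helpers unrelated to the TensorRT API:)

#include <cstring>
#include <vector>

// Minimal symmetric writer/reader: bytes must be consumed in the same
// order they were produced, here a single float standing in for beta_.
void write_float(std::vector<char>* buf, float v) {
  const char* p = reinterpret_cast<const char*>(&v);
  buf->insert(buf->end(), p, p + sizeof(float));
}

float read_float(const char** cursor) {
  float v;
  std::memcpy(&v, *cursor, sizeof(float));
  *cursor += sizeof(float);
  return v;
}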