fix ernie fc shape error (#31803)

* fix conflict * fix compile error * cherry-pick #31316 * Refine cudnn softmax (#25757) * refine cudnn softmax * Trt elementwise plugin serialize (#31587) * add serialize unittest * fix element_op trt plugin serialize bug * remove PassVersionChecker.IsCompatible * fix unittest Co-authored-by: N Pei Yang <peiyang@baidu.com> Co-authored-by: N GaoWei8 <53294385+GaoWei8@users.noreply.github.com>

fix ernie fc shape error (#31803)
* fix conflict * fix compile error * cherry-pick #31316 * Refine cudnn softmax (#25757) * refine cudnn softmax * Trt elementwise plugin serialize (#31587) * add serialize unittest * fix element_op trt plugin serialize bug * remove PassVersionChecker.IsCompatible * fix unittest Co-authored-by: N Pei Yang <peiyang@baidu.com> Co-authored-by: N GaoWei8 <53294385+GaoWei8@users.noreply.github.com>
ba82757e · Shang Zhizhou · GitHub · 28c36d86 · ba82757e · ba82757e
18 changed file
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -147,11 +147,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  std::set<std::string> output_names;
  std::set<std::string> output_names_with_id;
-  std::vector<int> origin_output_dims;
+  std::map<std::string, int> origin_name_output_dims;
  for (auto *x : node->outputs) {
    output_names.insert(x->Name());
    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
-    origin_output_dims.push_back(x->Var()->GetShape().size());
+    origin_name_output_dims[x->Name()] = x->Var()->GetShape().size();
  }
  std::unordered_map<std::string, std::string> output_name_map;
@@ -195,9 +195,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  // output_mapping help us copy the data from the renamed ITensor
  // to Tensor.
  std::vector<std::string> output_mapping;
+  std::vector<int> renamed_output_dims;
  for (auto name : output_names) {
    PADDLE_ENFORCE(output_name_map.count(name) != 0);
    output_mapping.push_back(output_name_map[name]);
+    renamed_output_dims.push_back(origin_name_output_dims[name]);
  }
  PADDLE_ENFORCE(!output_mapping.empty());
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
@@ -217,7 +219,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
  op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
  op_desc->SetAttr("output_name_mapping", output_mapping);
-  op_desc->SetAttr("origin_output_dims", origin_output_dims);
+  op_desc->SetAttr("origin_output_dims", renamed_output_dims);
  op_desc->SetAttr("parameters", params);
  // we record all inputs' shapes in attr to check if they are consistent

--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -133,7 +133,69 @@ class FcOpConverter : public OpConverter {
                                static_cast<size_t>(bias_num)};
    if (engine_->with_dynamic_shape()) {
+      // not NCHW layout, but NLP layout with added 'x 1 x 1'
+      auto x_dim = X->getDimensions();
+      if (x_dim.nbDims == 3 || x_dim.nbDims == 2) {
+        auto output_name = op_desc.Output("Out").front();
+        // add shuffle before fc
+        nvinfer1::Dims reshape_before_fc_dim;
+        reshape_before_fc_dim.nbDims = x_dim.nbDims + 2;
+        for (int i = 0; i < x_dim.nbDims; i++) {
+          reshape_before_fc_dim.d[i] = 0;
+        }
+        reshape_before_fc_dim.d[x_dim.nbDims] = 1;
+        reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_fc(Output: " + output_name + ")").c_str());
+        // add fc layer
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0),
+            n_output, weight.get(), bias.get());
+        fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str());
+        // add shuffle after fc
+        nvinfer1::Dims reshape_after_fc_dim;
+        if (x_dim.nbDims == 3) {
+          if (x_num_col_dims == 2) {
+            reshape_after_fc_dim.nbDims = 3;
+            reshape_after_fc_dim.d[0] = 0;
+            reshape_after_fc_dim.d[1] = 0;
+            reshape_after_fc_dim.d[2] = 0;
+          } else {
+            reshape_after_fc_dim.nbDims = 2;
+            reshape_after_fc_dim.d[0] = 0;
+            auto dim = fc_layer->getOutput(0)->getDimensions();
+            reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2];
+          }
+          // x_dim.nbDims == 2
+        } else {
+          reshape_after_fc_dim.nbDims = 2;
+          reshape_after_fc_dim.d[0] = 0;
+          reshape_after_fc_dim.d[1] = 0;
+        }
+        auto* reshape_after_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
+        reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim);
+        if (activation_type == "relu") {
+          reshape_after_fc_layer->setName(
+              ("shuffle_after_fc(Output: " + output_name + ")").c_str());
+          nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, Activation, *(reshape_after_fc_layer->getOutput(0)),
+              nvinfer1::ActivationType::kRELU);
+          RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle",
+                                   {output_name}, test_mode);
+        } else {
+          RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc",
+                                   {output_name}, test_mode);
+        }
+      } else {
        regist_fc(X, n_output, weight, bias);
+      }
      return;
    }
    // in order to handle situations in NLP models(input dims < 3,
@@ -143,12 +205,6 @@ class FcOpConverter : public OpConverter {
    auto input_d = X->getDimensions().d;
    int reshape_dim3[3] = {0};
    int reshape_dim4[4] = {0};
-    PADDLE_ENFORCE_EQ(
-        x_num_col_dims == 1 || x_num_col_dims == 2, true,
-        platform::errors::InvalidArgument(
-            "Wrong x_num_col_dims param of op mul. Paddle-TRT FC converter "
-            "expects x_num_col_dims is either 1 or 2, but got %d",
-            x_num_col_dims));
    PADDLE_ENFORCE_LE(x_num_col_dims, input_dims,
                      platform::errors::InvalidArgument(
                          "Params and input dims mismatch. Paddle-TRT FC "

--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -8,8 +8,8 @@ http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See
-See the License for the specific language governing permissions and
+the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
@@ -28,7 +28,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
               "network structure";
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
-    // Shouble be a 5 dims tensor.
    auto* input = engine_->GetITensor(op_desc.Input("Input").front());
    // fc weights and fc bias
@@ -69,6 +68,7 @@ class MultiheadMatMulOpConverter : public OpConverter {
    int head_number = boost::get<int>(op_desc.GetAttr("head_number"));
    nvinfer1::ILayer* layer = nullptr;
+    auto output_name = op_desc.Output("Out")[0];
    if (engine_->with_dynamic_shape()) {
      if (engine_->use_oss()) {
@@ -170,6 +170,12 @@ class MultiheadMatMulOpConverter : public OpConverter {
            plugin_inputs.data(), plugin_inputs.size(), *plugin);
        layer = plugin_layer;
      } else {
+        PADDLE_ENFORCE_EQ(
+            input->getDimensions().nbDims, 3,
+            platform::errors::InvalidArgument(
+                "The Input dim of the MultiheadMatMul should be 3, "
+                "but it's (%d) now.",
+                input->getDimensions().nbDims));
        // transpose weight_data from m * n to  n * m
        auto* input_bias_qk =
            engine_->GetITensor(op_desc.Input("BiasQK").front());
@@ -183,15 +189,37 @@ class MultiheadMatMulOpConverter : public OpConverter {
                                    static_cast<void*>(bias_data),
                                    static_cast<size_t>(bias_t->numel())};
-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
+        // add shuffle before fc
-                                              n, weight.get(), bias.get());
+        nvinfer1::Dims reshape_before_fc_dim;
-        auto* fc_out = fc_layer->getOutput(0);
+        reshape_before_fc_dim.nbDims = 5;
+        reshape_before_fc_dim.d[0] = 0;
+        reshape_before_fc_dim.d[1] = 0;
+        reshape_before_fc_dim.d[2] = 0;
+        reshape_before_fc_dim.d[3] = 1;
+        reshape_before_fc_dim.d[4] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
+                .c_str());
+        // add layer fc
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n,
+            weight.get(), bias.get());
+        fc_layer->setName(
+            ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
+        // no need to add shuffle after fc, just change it in
+        // QkvToContextPluginDynamic
        // add qkv to context
        int head_size = hidden_out / head_number;
        float scale = boost::get<float>(op_desc.GetAttr("alpha"));
        std::vector<nvinfer1::ITensor*> plugin_inputs;
-        plugin_inputs.push_back(fc_out);
+        plugin_inputs.push_back(fc_layer->getOutput(0));
        plugin_inputs.push_back(input_bias_qk);
        bool with_fp16 =
            engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
@@ -207,7 +235,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
          "You can use the config.SetTRTDynamicShapeInfo(...) interface to set "
          "the shape information to run the dynamic shape mode."));
    }
-    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name},
                             test_mode);
 #else

--- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc
@@ -48,6 +48,8 @@ class ScaleOpConverter : public OpConverter {
      return tmp_data;
    };
+    int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0;
    float* bias_ptr = create_weights(bias, "bias");
    float* scale_ptr = create_weights(scale, "scale");
@@ -60,19 +62,22 @@ class ScaleOpConverter : public OpConverter {
    nvinfer1::ILayer* layer = nullptr;
    auto input_dim = input->getDimensions();
-    PADDLE_ENFORCE_GE(input_dim.nbDims, 3,
-                      platform::errors::Fatal(
-                          "Paddle-TRT scale mode only support dimension >= 3"));
    nvinfer1::IShuffleLayer* expand_layer = nullptr;
    nvinfer1::IShuffleLayer* squeeze_layer = nullptr;
-    if (input_dim.nbDims == 3) {
+    if (input_dim.nbDims < 3 + dynamic_shape_offset) {
-      // TensorRT scale layer is not supporting input dims < 4 when using
+      nvinfer1::Dims expand_shape;
-      // explicit batch
+      expand_shape.nbDims = 3 + dynamic_shape_offset;
+      for (int i = 0; i < 3 + dynamic_shape_offset; i++) {
+        if (i < input_dim.nbDims) {
+          expand_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i];
+        } else {
+          expand_shape.d[i] = 1;
+        }
+      }
      expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-      nvinfer1::Dims4 target_shape(0, 0, 0, 1);  // expand 1 dims
+      expand_layer->setReshapeDimensions(expand_shape);
-      expand_layer->setReshapeDimensions(target_shape);
      input = expand_layer->getOutput(0);
    }
@@ -94,13 +99,15 @@ class ScaleOpConverter : public OpConverter {
    PADDLE_ENFORCE_EQ(layer != nullptr, true,
                      platform::errors::Fatal("Create scale layer failed."));
-    if (input_dim.nbDims == 3) {
+    if (input_dim.nbDims < 3 + dynamic_shape_offset) {
-      // TensorRT scale layer is not supporting input dims < 4 when using
+      nvinfer1::Dims squeeze_shape;
-      // explicit batch
+      squeeze_shape.nbDims = input_dim.nbDims;
+      for (int i = 0; i < squeeze_shape.nbDims; i++) {
+        squeeze_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i];
+      }
      squeeze_layer =
          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
-      nvinfer1::Dims3 target_shape(0, 0, 0);  // expand 1 dims
+      squeeze_layer->setReshapeDimensions(squeeze_shape);
-      squeeze_layer->setReshapeDimensions(target_shape);
      layer = static_cast<nvinfer1::ILayer*>(squeeze_layer);
    }
    RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode);

--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
@@ -50,6 +50,7 @@ class SoftMaxOpConverter : public OpConverter {
    uint32_t axes = std::max(0, input_dims - 3);
    // TODO(cryoco): Poor workaround. Fix padded dims problem when TRT layers
    // support Nd.
+    // Tips: Dynammic shape alreay fixes.
    int padded_dims = 0;
    int explicit_batch = 0;
    if (engine_->with_dynamic_shape()) explicit_batch = 1;
@@ -61,16 +62,16 @@ class SoftMaxOpConverter : public OpConverter {
      }
    }
    if (!engine_->with_dynamic_shape()) {
-      if (axis == -1) {
+      if (axis < 0) {
-        axes = input_dims - 1 - padded_dims;
+        axes = input_dims + axis - padded_dims;
      } else {
-        axes = axis;
+        axes = axis - 1;
      }
    } else {
-      if (axis == -1) {
+      if (axis < 0) {
-        axes = input_dims - 1 - padded_dims;
+        axes = input_dims + axis;
      } else {
-        axes = axis + 1;
+        axes = axis;
      }
    }
    layer->setAxes(1 << axes);

--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -131,6 +131,18 @@ bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc,
        }
      }
    }
+    if (op_type == "fc" || op_type == "mul") {
+      const int x_num_col_dims =
+          desc.HasAttr("x_num_col_dims")
+              ? boost::get<int>(desc.GetAttr("x_num_col_dims"))
+              : (desc.HasAttr("in_num_col_dims")
+                     ? boost::get<int>(desc.GetAttr("in_num_col_dims"))
+                     : 1);
+      if (x_num_col_dims != 1 && x_num_col_dims != 2) {
+        return false;
+      }
+    }
    if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
  }
  return false;

--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
@@ -127,9 +127,14 @@ int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs,
 int ElementwisePluginDynamic::initialize() { return 0; }
-size_t ElementwisePluginDynamic::getSerializationSize() const { return 0; }
+size_t ElementwisePluginDynamic::getSerializationSize() const {
+  return SerializedSize(type_.c_str()) + SerializedSize(axis_);
+}
-void ElementwisePluginDynamic::serialize(void *buffer) const {}
+void ElementwisePluginDynamic::serialize(void *buffer) const {
+  SerializeValue(&buffer, type_.c_str());
+  SerializeValue(&buffer, axis_);
+}
 nvinfer1::DimsExprs ElementwisePluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,

--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
@@ -92,7 +92,12 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT {
 public:
  explicit ElementwisePluginDynamic(const std::string& type, int axis)
      : type_(type), axis_(axis) {}
-  ElementwisePluginDynamic(void const* serialData, size_t serialLength) {}
+  ElementwisePluginDynamic(void const* serialData, size_t serialLength) {
+    const char* elementwise_type;
+    DeserializeValue(&serialData, &serialLength, &elementwise_type);
+    type_ = std::string(elementwise_type);
+    DeserializeValue(&serialData, &serialLength, &axis_);
+  }
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new ElementwisePluginDynamic(type_, axis_);
  }
@@ -138,6 +143,46 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT {
  std::string type_;
  int axis_;
 };
+class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator {
+ public:
+  ElementwisePluginV2Creator() {}
+  const char* getPluginName() const override { return "elementwise_plugin"; }
+  const char* getPluginVersion() const override { return "1"; }
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    return nullptr;
+  }
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    auto plugin = new ElementwisePluginDynamic(serial_data, serial_length);
+    return plugin;
+  }
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+REGISTER_TRT_PLUGIN_V2(ElementwisePluginV2Creator);
 #endif
 }  // namespace plugin

--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -182,12 +182,10 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
                        "but it's (%d)",
                        output_index));
  nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
  ret.d[0] = inputs[0].d[0];
  ret.d[1] = inputs[0].d[1];
  ret.d[2] = expr_builder.constant(hidden_size_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
  return ret;
 }

--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -158,12 +158,10 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions(
          "it has (%d) inputs",
          nb_inputs));
  nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
  ret.d[0] = inputs[0].d[0];
  ret.d[1] = inputs[0].d[1];
  ret.d[2] = expr_builder.constant(head_size_ * head_number_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
  return ret;
 }

--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
@@ -43,11 +43,6 @@ int SkipLayerNormPluginDynamic::initialize() {
 nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
    nvinfer1::IExprBuilder &expr_builder) {
-  PADDLE_ENFORCE_EQ(
-      inputs[0].nbDims, 5,
-      platform::errors::InvalidArgument(
-          "The Input dim of the SkipLayernorm should be 5, but it's (%d) now.",
-          inputs[0].nbDims));
  return inputs[0];
 }

--- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
@@ -62,6 +62,8 @@ nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions(
  output.d[1] = one;
  output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB,
                                       *inputs[1].d[0], *one);
+  // remove padding 1
+  output.nbDims -= 2;
  return output;
 }

--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
@@ -126,7 +126,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
  run(config, &out_data);
  for (size_t i = 0; i < out_data.size(); i++) {
-    EXPECT_NEAR(result[i], out_data[i], 1e-4);
+    EXPECT_NEAR(result[i], out_data[i], 3e-3);
  }
 }

--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -12,60 +12,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/softmax_op.h"
+#include "paddle/fluid/platform/cudnn_desc.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 namespace paddle {
 namespace operators {
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using DataLayout = platform::DataLayout;
 using Tensor = framework::Tensor;
+static inline int SizeOutAxis(const int axis, DDim dims) {
+  int size = 1;
+  for (int i = axis + 1; i < dims.size(); i++) {
+    size *= dims[i];
+  }
+  return size;
+}
 template <typename T>
 class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* X = context.Input<Tensor>("X");
+    auto* x = ctx.Input<Tensor>("X");
-    auto* Out = context.Output<Tensor>("Out");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
-    // allocate memory on device.
+    auto* out_data = out->data<T>();
-    Out->mutable_data<T>(context.GetPlace());
+    auto dims = x->dims();
-    auto dims = X->dims();
+    const int rank = dims.size();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
-    framework::LoDTensor flattened_x;
+    const int dim = dims[axis];
-    framework::LoDTensor flattened_out;
+    const int N = SizeToAxis(axis, dims);
-    flattened_x.ShareDataWith(*X).Resize(flattened_dims);
+    const int D = SizeOutAxis(axis, dims);
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+    ScopedTensorDescriptor desc;
-    math::SoftmaxCUDNNFunctor<T>()(
+    std::vector<int> tensor_dims = {N, dim, D, 1};
-        context.template device_context<platform::CUDADeviceContext>(),
+    DataLayout layout = DataLayout::kNCHW;
-        &flattened_x, &flattened_out);
+    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward(
+        handle, CUDNN_SOFTMAX_ACCURATE, mode,
+        platform::CudnnDataType<T>::kOne(), desc_, x->data<T>(),
+        platform::CudnnDataType<T>::kZero(), desc_, out_data));
  }
 };
 template <typename T>
 class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* Out = context.Input<Tensor>("Out");
+    auto* out = ctx.Input<Tensor>("Out");
-    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(ctx.GetPlace());
-    // allocate memory on device.
+    auto* dx_data = dx->data<T>();
-    dX->mutable_data<T>(context.GetPlace());
+    auto dims = out->dims();
-    auto dims = Out->dims();
+    const int rank = dims.size();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
-    framework::LoDTensor flattened_out;
+    const int dim = dims[axis];
-    framework::LoDTensor flattened_d_out;
+    const int N = SizeToAxis(axis, dims);
-    framework::LoDTensor flattened_d_x;
+    const int D = SizeOutAxis(axis, dims);
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
-    flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
+    ScopedTensorDescriptor desc;
-    flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
+    std::vector<int> tensor_dims = {N, dim, D, 1};
+    DataLayout layout = DataLayout::kNCHW;
-    math::SoftmaxGradCUDNNFunctor<T>()(
+    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-        context.template device_context<platform::CUDADeviceContext>(),
-        &flattened_out, &flattened_d_out, &flattened_d_x);
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
+        handle, CUDNN_SOFTMAX_ACCURATE, mode,
+        platform::CudnnDataType<T>::kOne(), desc_, out->data<T>(), desc_,
+        dout->data<T>(), platform::CudnnDataType<T>::kZero(), desc_, dx_data));
  }
 };

--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -53,13 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel {
                          "Attr(axis) value should be in range [-R, R-1], "
                          "R is the rank of Input(X)."));
-    auto use_cudnn = ctx->Attrs().Get<bool>("use_cudnn");
-    if (axis != rank_x - 1 && axis != -1) {
-      PADDLE_ENFORCE_EQ(use_cudnn, false,
-                        platform::errors::InvalidArgument(
-                            "CUDNN kernel only support axis as -1."));
-    }
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    ctx->ShareLoD("X", /*->*/ "Out");
  }

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+class TRTScaleTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[-1, 512], dtype="float32")
+            scale_out = self.append_scale(data)
+            out = fluid.layers.batch_norm(scale_out, is_test=True)
+        self.feeds = {"data": np.random.random([1, 512]).astype("float32"), }
+        self.enable_trt = True
+        self.trt_parameters = TRTScaleTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+    def append_scale(self, data):
+        return fluid.layers.scale(
+            x=data, scale=2.0, bias=-1.0, bias_after_scale=False)
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
@@ -654,6 +654,56 @@ class TensorRTSubgraphPassElementwiseMulTest(
        return fluid.layers.elementwise_mul(x=data1, y=data2)
+class TensorRTSubgraphPassElementwiseSerializeTest(
+        TensorRTSubgraphPassElementwiseTest):
+    def setUp(self):
+        super(TensorRTSubgraphPassElementwiseSerializeTest, self).setUp()
+        self.trt_parameters = TensorRTSubgraphPassElementwiseTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False)
+    def test_check_output(self):
+        if os.path.exists(self.path + "_opt_cache"):
+            shutil.rmtree(self.path + "_opt_cache")
+        super(TensorRTSubgraphPassElementwiseSerializeTest,
+              self).test_check_output()
+class TensorRTSubgraphPassElementwiseBroadcastDynamicTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data1 = fluid.data(
+                name="data1", shape=[-1, 3, 64, 64], dtype="float32")
+            data2 = fluid.data(name="data2", shape=[64, 64], dtype="float32")
+            eltwise_out = self.append_eltwise(data1, data2)
+            out = fluid.layers.batch_norm(eltwise_out, is_test=True)
+        self.feeds = {
+            "data1": np.random.random([1, 3, 64, 64]).astype("float32"),
+            "data2": np.random.random([64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassElementwiseBroadcastDynamicTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False)
+        self.dynamic_shape_params = TensorRTSubgraphPassElementwiseBroadcastDynamicTest.DynamicShapeParam(
+            {
+                'data1': [1, 3, 8, 64],
+                'data2': [8, 64]
+            }, {'data1': [1, 3, 512, 64],
+                'data2':
+                [512, 64]}, {'data1': [1, 3, 256, 64],
+                             'data2': [256, 64]}, False)
+        self.fetch_list = [out]
+    def append_eltwise(self, data1, data2):
+        return fluid.layers.elementwise_add(x=data1, y=data2)
+    def test_check_output(self):
+        if os.path.exists(self.path + "_opt_cache"):
+            shutil.rmtree(self.path + "_opt_cache")
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
 class TensorRTSubgraphPassShuffleChannelTest(InferencePassTest):
    def setUp(self):
        with fluid.program_guard(self.main_program, self.startup_program):

--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -153,16 +153,103 @@ class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp):
        return [2, 3, 4, 5]
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp3(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+    def get_axis(self):
+        return 0
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp4(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+    def get_axis(self):
+        return 1
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp):
    def get_x_shape(self):
        return [2, 3, 4, 5]
+    def get_axis(self):
+        return 2
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp6(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
    def get_axis(self):
        return 3
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp7(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp8(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+    def get_axis(self):
+        return 0
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp9(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+    def get_axis(self):
+        return 1
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp10(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+    def get_axis(self):
+        return 2
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp11(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+    def get_axis(self):
+        return 3
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp12(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+    def get_axis(self):
+        return 4
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
 class TestSoftmaxFP16Op(TestSoftmaxOp):