diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index dc075b9f79a92fe086bd2b653a0c597377340c0a..e628216a5ed87b42a3f37a1adb86f441a735151e 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1237,6 +1237,8 @@ USE_TRT_CONVERTER(affine_channel);
 USE_TRT_CONVERTER(multiclass_nms);
 USE_TRT_CONVERTER(nearest_interp);
 USE_TRT_CONVERTER(reshape);
+USE_TRT_CONVERTER(reduce_sum);
+USE_TRT_CONVERTER(gather_nd);
 #endif
 
 namespace paddle_infer {
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 99328e6076891392fad019635a6f02839b7b6b8a..2e4a175566a7a100749d14c712e8ef9a89eb6019 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -13,6 +13,8 @@ nv_library(tensorrt_converter
   multiclass_nms_op.cc
   nearest_interp_op.cc
   reshape_op.cc
+  reduce_op.cc
+  gather_nd_op.cc
   DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index 66a682db07b91195046d3d11031b8739b72b81c4..04c51202f022f65cf0c6b6a9671248acbc547a4a 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -40,10 +40,19 @@
     auto word_emb_name = op_desc.Input("WordEmbedding").front();
     auto pos_emb_name = op_desc.Input("PosEmbedding").front();
     auto sent_emb_name = op_desc.Input("SentEmbedding").front();
-    std::vector<std::string> id_names = {word_id_name, pos_id_name,
-                                         sent_id_name};
-    std::vector<std::string> emb_names = {word_emb_name, pos_emb_name,
-                                          sent_emb_name};
+
+    std::vector<std::string> id_names;
+    std::vector<std::string> emb_names;
+
+    if (engine_->use_oss()) {
+      id_names =
+          std::vector<std::string>{word_id_name, pos_id_name, sent_id_name};
+      emb_names =
+          std::vector<std::string>{word_emb_name, pos_emb_name, sent_emb_name};
+    } else {
+      id_names = op_desc.Input("Ids");
+      emb_names = op_desc.Input("Embs");
+    }
 
     int input_num = id_names.size();
 
diff --git a/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..489fc987dfec2a13b4baccb06911c940b627d908
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class GatherNdOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a paddle gather_nd op to tensorrt gather_nd plugin";
+    framework::OpDesc op_desc(op, nullptr);
+
+    // Declare inputs
+    std::vector<nvinfer1::ITensor*> inputs;
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* index = engine_->GetITensor(op_desc.Input("Index")[0]);
+    inputs.emplace_back(input);
+    inputs.emplace_back(index);
+
+    nvinfer1::ILayer* layer = nullptr;
+    bool with_fp16 =
+        engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+    plugin::GatherNdPluginDynamic* plugin =
+        new plugin::GatherNdPluginDynamic(with_fp16);
+    layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin);
+
+    std::string layer_name = "gather_nd (Output: ";
+    auto output_name = op_desc.Output("Out")[0];
+    layer->getOutput(0)->setName(output_name.c_str());
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    layer_name += output_name;
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+    layer->setName((layer_name + ")").c_str());
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(gather_nd, GatherNdOpConverter);
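The converter above only wires X and Index into GatherNdPluginDynamic; the output shape is worked out later by the plugin. For reference, gather_nd is expected to produce Index.shape[:-1] + X.shape[Index.shape[-1]:], and the plugin below additionally assumes X and Index have the same rank. A minimal host-side sketch of that rule (illustrative only, the helper name is not part of this patch):

// Illustrative shape rule for gather_nd:
//   out_shape = Index.shape[:-1] + X.shape[Index.shape[-1]:]
#include <vector>

std::vector<int> GatherNdOutShape(const std::vector<int>& x_shape,
                                  const std::vector<int>& index_shape) {
  // Everything but the last index dimension is kept as-is.
  std::vector<int> out(index_shape.begin(), index_shape.end() - 1);
  // The last index dimension says how many leading dims of X are indexed;
  // the remaining dims of X are appended to the output.
  int end_size = index_shape.back();
  for (size_t i = static_cast<size_t>(end_size); i < x_shape.size(); ++i) {
    out.push_back(x_shape[i]);
  }
  return out;
}

For example, with X = [2, 3, 4] and Index = [2, 2, 2] (the shapes used by the unit test later in this patch), the last index dimension indexes the first two axes of X, so the result is [2, 2] + [4] = [2, 2, 4].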
diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..66d2680fe9969cf7857130f1aa6e6aef742ca805
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc
@@ -0,0 +1,90 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <NvInfer.h>
+#include <sys/types.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class ReduceSumOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a paddle reduce_sum op to tensorrt reduce layer";
+    framework::OpDesc op_desc(op, nullptr);
+
+    auto* x = engine_->GetITensor(op_desc.Input("X").front());
+    nvinfer1::Dims input_shape = x->getDimensions();
+    int input_dims = input_shape.nbDims;
+
+    bool keep_dim = BOOST_GET_CONST(bool, op_desc.GetAttr("keep_dim"));
+    std::vector<int32_t> dim =
+        BOOST_GET_CONST(std::vector<int32_t>, op_desc.GetAttr("dim"));
+    bool reduce_all = BOOST_GET_CONST(bool, op_desc.GetAttr("reduce_all"));
+
+    // Now we only support dynamic_shape mode.
+    nvinfer1::IReduceLayer* layer = nullptr;
+    if (reduce_all) {
+      uint32_t reduce_dim = 0;
+      for (int i = 0; i < input_dims; ++i) {
+        reduce_dim |= 1 << i;
+      }
+      layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x,
+                                   nvinfer1::ReduceOperation::kSUM, reduce_dim,
+                                   keep_dim);
+    } else {
+      auto CvtToBitMask = [&](const std::vector<int32_t>& dims) -> uint32_t {
+        uint32_t res = 0;
+        for (auto x : dims) {
+          if (x < 0) {
+            res |= 1 << (x + input_dims);
+          } else {
+            res |= 1 << x;
+          }
+        }
+        return res;
+      };
+      layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x,
+                                   nvinfer1::ReduceOperation::kSUM,
+                                   CvtToBitMask(dim), keep_dim);
+    }
+
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(layer, "reduce_sum", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(reduce_sum, ReduceSumOpConverter);
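TensorRT's IReduceLayer takes the reduction axes as a uint32_t bitmask rather than an axis list, which is what the CvtToBitMask lambda and the reduce_all branch above compute. A standalone restatement (illustrative only, assuming the same negative-axis wrapping as the converter):

#include <cstdint>
#include <vector>

// Map a Paddle "dim" attribute to the IReduceLayer axis bitmask.
uint32_t AxesToBitMask(const std::vector<int32_t>& dims, int input_rank) {
  uint32_t mask = 0;
  for (int32_t d : dims) {
    // Negative axes count from the end, exactly as in CvtToBitMask above.
    mask |= 1u << (d < 0 ? d + input_rank : d);
  }
  return mask;
}

For the dim = {2, -1} case exercised by the new unit test on a 4-D input, bits 2 and 3 are set, i.e. mask 0b1100 (decimal 12); reduce_all sets bits 0 through rank-1 instead (0b1111 for the same input).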
diff --git a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc
index 3d8c72728c66711b0869c3e495385833b576d25c..489603e20cda2f1143fd4791c8cbe5e8e58e4148 100644
--- a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc
@@ -34,7 +34,7 @@ class ReshapeOpConverter : public OpConverter {
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
-    const std::vector<int>& shape =
+    std::vector<int> shape =
         BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("shape"));
     int nbDims_num = shape.size();
     nvinfer1::Dims reshape_dim;
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 85c466e4644e013509b1528e854626198d06b907..6c6a59e98d9e21728a9fbca08de1ffb455b55ccd 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/data_layout.h"
 
@@ -122,11 +123,13 @@ struct SimpleOpTypeSetTeller : public Teller {
       "flatten2",
      "flatten",
      "gather",
+      "gather_nd",
      "yolo_box",
      "roi_align",
      "affine_channel",
      "nearest_interp",
      "anchor_generator",
+      "reduce_sum",
   };
 };
 
@@ -324,6 +327,30 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
       if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false;
     }
 
+    if (op_type == "gather_nd") {
+      auto* block = desc.Block();
+      auto x_var_name = desc.Input("X")[0];
+      auto index_var_name = desc.Input("Index")[0];
+      auto* x_var_desc = block->FindVar(x_var_name);
+      auto* index_var_desc = block->FindVar(index_var_name);
+
+      // The index input must be int32 datatype.
+      if (index_var_desc->GetDataType() !=
+          paddle::framework::proto::VarType_Type::VarType_Type_INT32) {
+        VLOG(3) << "gather_nd op Index input data type must be int32";
+        return false;
+      }
+
+      const auto index_shape = index_var_desc->GetShape();
+      const auto x_shape = x_var_desc->GetShape();
+      if (x_shape.size() != index_shape.size()) {
+        VLOG(3) << "gather_nd op Index input dims size [" << index_shape.size()
+                << " ] not equal to x dims size [" << x_shape.size() << "]";
+        return false;
+      }
+      if (!with_dynamic_shape) return false;
+    }
+
     if (op_type == "yolo_box") {
       if (with_dynamic_shape) return false;
       bool has_attrs =
@@ -684,6 +711,21 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
         if (shape.size() >= nvinfer1::Dims::MAX_DIMS) return false;
       }
     }
+
+    if (op_type == "reduce_sum") {
+      if (!with_dynamic_shape) {
+        VLOG(3) << "the reduce_sum does not support static shape yet";
+        return false;
+      }
+
+      if (!(desc.HasAttr("keep_dim") && desc.HasAttr("dim") &&
+            desc.HasAttr("reduce_all"))) {
+        VLOG(3) << "the reduce_sum does not have attr (keep_dim or dim or "
+                   "reduce_all)";
+        return false;
+      }
+    }
+
     if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
   }
   return false;
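Both new tellers bail out unless with_dynamic_shape is set, so gather_nd and reduce_sum only reach TensorRT when the predictor is configured with a dynamic shape profile. A minimal, hypothetical configuration sketch (header name, model path, tensor names and shapes are placeholders and not part of this patch):

#include <map>
#include <string>
#include <vector>

#include "paddle_inference_api.h"  // assumed deployed inference header

void BuildConfig(paddle_infer::Config* config) {
  config->SetModel("model_dir");          // placeholder model path
  config->EnableUseGpu(100 /*MB*/, 0 /*gpu_id*/);
  config->EnableTensorRtEngine(1 << 30, 1, 3,
                               paddle_infer::PrecisionType::kFloat32,
                               false, false);
  // Without these three shape maps, with_dynamic_shape is false and both new
  // ops fall back to the native GPU kernels instead of TensorRT.
  std::map<std::string, std::vector<int>> min_shape{{"data", {1, 3, 4}}};
  std::map<std::string, std::vector<int>> max_shape{{"data", {3, 3, 4}}};
  std::map<std::string, std::vector<int>> opt_shape{{"data", {3, 3, 4}}};
  config->SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);
}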
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 1804e6c5571d3a15b0b9adc67dc535b46635caa8..26125d21ad7d1a8e9348ce547494289b3a7dd6ad 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -8,6 +8,7 @@ nv_library(tensorrt_plugin
   anchor_generator_op_plugin.cu
   yolo_box_op_plugin.cu
   roi_align_op_plugin.cu
+  gather_nd_op_plugin.cu
   DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
 
 nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5f4ac054c95b34f305d0213d773f5bcfe5e9e3e9
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu
@@ -0,0 +1,229 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda_fp16.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <numeric>
+#include <vector>
+
+#include "NvInferRuntimeCommon.h"
+#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#if IS_TRT_VERSION_GE(6000)
+
+template <typename T, typename IndexT = int>
+__global__ void GatherNdCUDAKernel(const T* input, const int32_t* input_dims,
+                                   const IndexT* indices, T* output,
+                                   int32_t remain_size, int32_t slice_size,
+                                   int32_t end_size) {
+  CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
+    int indices_i = i / slice_size;
+    int slice_i = i - indices_i * slice_size;  // offset inside the slice
+    IndexT gather_i = 0;
+    int32_t temp = slice_size;
+    for (int32_t j = end_size - 1; j >= 0; --j) {
+      auto index_value = indices[indices_i * end_size + j];
+      PADDLE_ENFORCE(
+          index_value >= 0 && index_value < input_dims[j],
+          "The index is out of bounds, "
+          "please check whether the dimensions of index and "
+          "input meet the requirements. It should "
+          "be less than [%d] and greater or equal to 0, but received [%d]",
+          input_dims[j], index_value);
+      gather_i += (index_value * temp);
+      temp *= input_dims[j];
+    }
+    IndexT input_i = gather_i + slice_i;
+    *(output + i) = *(input + input_i);
+  }
+}
+
+int GatherNdPluginDynamic::initialize() { return 0; }
+
+size_t GatherNdPluginDynamic::getSerializationSize() const {
+  return SerializedSize(with_fp16_);
+}
+
+void GatherNdPluginDynamic::serialize(void* buffer) const {
+  SerializeValue(&buffer, with_fp16_);
+}
+
+nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions(
+    int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
+    nvinfer1::IExprBuilder& expr_builder) {
+  PADDLE_ENFORCE_EQ(
+      nb_inputs, 2,
+      platform::errors::InvalidArgument(
+          "The gather_nd plugin should have 2 input, but got %d.", nb_inputs));
+  PADDLE_ENFORCE_EQ(output_index, 0,
+                    platform::errors::InvalidArgument(
+                        "When GetOutputDimensions in gather_nd "
+                        "plugin, the output_index should be 0."));
+
+  nvinfer1::DimsExprs x_dims = inputs[0];
+  nvinfer1::DimsExprs index_dims = inputs[1];
+
+  int32_t x_dims_size = x_dims.nbDims;
+  int32_t index_dims_size = index_dims.nbDims;
+
+  // TODO(wilber): The result dims should be Index.shape[:-1] +
+  // X.shape[Index.shape[-1]:], but the trt DimsExprs is an expression we can't
+  // get the actual value. So we only support one scenario: input_dims.size ==
+  // index_dims.size.
+  nvinfer1::DimsExprs ret(x_dims);
+  for (int i = 0; i < index_dims_size - 1; ++i) {
+    ret.d[i] = index_dims.d[i];
+  }
+
+  return ret;
+}
+
+bool GatherNdPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
+    int nb_outputs) {
+  PADDLE_ENFORCE_NOT_NULL(
+      in_out, platform::errors::InvalidArgument(
+                  "The input of gather_nd plugin should not be nullptr."));
+
+  PADDLE_ENFORCE_LT(
+      pos, nb_inputs + nb_outputs,
+      platform::errors::InvalidArgument("The pos(%d) should be less than the "
+                                        "num(%d) of the input and the output.",
+                                        pos, nb_inputs + nb_outputs));
+  (in_out && pos < (nb_inputs + nb_outputs));
+
+  const nvinfer1::PluginTensorDesc& in = in_out[pos];
+  if (pos == 0) {
+    if (with_fp16_) {
+      return (in.type == nvinfer1::DataType::kFLOAT ||
+              in.type == nvinfer1::DataType::kHALF) &&
+             (in.format == nvinfer1::TensorFormat::kLINEAR);
+    } else {
+      return (in.type == nvinfer1::DataType::kFLOAT) &&
+             (in.format == nvinfer1::TensorFormat::kLINEAR);
+    }
+  } else if (pos == 1) {
+    return in.type == nvinfer1::DataType::kINT32 &&
+           in.format == nvinfer1::TensorFormat::kLINEAR;
+  } else if (pos == 2) {
+    return in.type == in_out[0].type &&
+           in.format == nvinfer1::TensorFormat::kLINEAR;
+  }
+
+  return true;
+}
+
+nvinfer1::DataType GatherNdPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType* input_types, int nb_inputs) const {
+  return input_types[0];
+}
+
+int GatherNdPluginDynamic::enqueue(
+    const nvinfer1::PluginTensorDesc* input_desc,
+    const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  auto input_dims = input_desc[0].dims;
+  auto index_dims = input_desc[1].dims;
+  auto input_dims_size = input_dims.nbDims;
+  auto index_dims_size = index_dims.nbDims;
+
+  std::vector<int32_t> input_shape, index_shape, out_shape;
+  for (int i = 0; i < input_dims.nbDims; i++)
+    input_shape.push_back(input_dims.d[i]);
+  for (int i = 0; i < index_dims.nbDims; i++)
+    index_shape.push_back(index_dims.d[i]);
+  // The out_shape is
+  //   Index.shape[:-1] + X.shape[Index.shape[-1]:]
+  for (int i = 0; i < index_dims_size - 1; ++i) {
+    out_shape.emplace_back(index_shape[i]);
+  }
+  for (int i = index_shape[index_dims_size - 1]; i < input_dims_size; ++i) {
+    out_shape.emplace_back(input_shape[i]);
+  }
+
+  // final dim
+  int end_size = index_shape[index_dims_size - 1];
+  // remain dim
+  std::vector<int32_t> remain_ddim(index_shape.begin(), index_shape.end() - 1);
+  int remain_numel = std::accumulate(remain_ddim.begin(), remain_ddim.end(), 1,
+                                     std::multiplies<int>());
+  // slice size
+  int slice_size = 1;
+  for (int i = end_size; i < input_dims_size; ++i) {
+    slice_size *= input_shape[i];
+  }
+
+  auto input_type = input_desc[0].type;
+  if (input_type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp32";
+
+    const float* p_input = static_cast<const float*>(inputs[0]);
+    const int32_t* p_index = static_cast<const int32_t*>(inputs[1]);
+    float* p_output = static_cast<float*>(outputs[0]);
+
+    if (input_dims_data_ == nullptr) {
+      cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int));
+    }
+    cudaMemcpyAsync(input_dims_data_, input_shape.data(),
+                    sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice,
+                    stream);
+
+    int block = 512;
+    int n = slice_size * remain_numel;
+    int grid = (n + block - 1) / block;
+
+    GatherNdCUDAKernel<float, int32_t><<<grid, block, 0, stream>>>(
+        p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size,
+        end_size);
+  } else if (input_type == nvinfer1::DataType::kHALF) {
+    VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp16";
+
+    const half* p_input = static_cast<const half*>(inputs[0]);
+    const int32_t* p_index = static_cast<const int32_t*>(inputs[1]);
+    half* p_output = static_cast<half*>(outputs[0]);
+
+    if (input_dims_data_ == nullptr) {
+      cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int));
+    }
+    cudaMemcpyAsync(input_dims_data_, input_shape.data(),
+                    sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice,
+                    stream);
+
+    int block = 512;
+    int n = slice_size * remain_numel;
+    int grid = (n + block - 1) / block;
+
+    GatherNdCUDAKernel<half, int32_t><<<grid, block, 0, stream>>>(
+        p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size,
+        end_size);
+  }
+
+  return cudaGetLastError() != cudaSuccess;
+}
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
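The enqueue call flattens X and Index into shape vectors and lets GatherNdCUDAKernel turn each index tuple into a row-major offset into X. A host-side restatement of that offset arithmetic for a single output element, kept deliberately close to the kernel loop above (illustrative only, not part of the patch):

#include <cstdint>
#include <vector>

// What one iteration of GatherNdCUDAKernel computes for output element `i`.
float GatherNdOneElement(const std::vector<float>& input,
                         const std::vector<int32_t>& input_dims,  // X shape
                         const std::vector<int32_t>& indices,     // flat Index
                         int32_t i, int32_t slice_size, int32_t end_size) {
  int32_t indices_i = i / slice_size;            // which index tuple
  int32_t slice_i = i - indices_i * slice_size;  // offset inside the slice
  int64_t gather_i = 0;
  int64_t temp = slice_size;
  // Walk the index tuple from the innermost indexed dim outwards, exactly as
  // the kernel does, accumulating a row-major offset into X.
  for (int32_t j = end_size - 1; j >= 0; --j) {
    int32_t index_value = indices[indices_i * end_size + j];
    gather_i += index_value * temp;
    temp *= input_dims[j];
  }
  return input[gather_i + slice_i];
}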
diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a242238c81fb3b34888905a393bc992179712b2
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
@@ -0,0 +1,132 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#if IS_TRT_VERSION_GE(6000)
+class GatherNdPluginDynamic : public DynamicPluginTensorRT {
+ public:
+  explicit GatherNdPluginDynamic(bool with_fp16) { with_fp16_ = with_fp16; }
+
+  GatherNdPluginDynamic(void const* serial_data, size_t serial_length) {
+    DeserializeValue(&serial_data, &serial_length, &with_fp16_);
+  }
+
+  nvinfer1::IPluginV2DynamicExt* clone() const override {
+    return new GatherNdPluginDynamic(with_fp16_);
+  }
+
+  const char* getPluginType() const override { return "gather_nd_plugin"; }
+  int getNbOutputs() const override { return 1; }
+  int initialize() override;
+
+  size_t getSerializationSize() const override;
+  void serialize(void* buffer) const override;
+
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) override;
+
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc* inOut,
+                                 int nbInputs, int nbOutputs) override;
+
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc* out,
+                       int nbOutputs) override {}
+
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc* outputs,
+                          int nbOutputs) const override {
+    return 0;
+  }
+
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream) override;
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType* inputTypes,
+                                       int nbInputs) const override;
+
+  void destroy() override {
+    if (input_dims_data_) {
+      cudaFree(input_dims_data_);
+    }
+    delete this;
+  }
+
+ private:
+  int32_t* input_dims_data_{nullptr};
+};
+
+class GatherNdPluginDynamicCreator : public nvinfer1::IPluginCreator {
+ public:
+  GatherNdPluginDynamicCreator() {}
+  const char* getPluginName() const override { return "gather_nd_plugin"; }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    return nullptr;
+  }
+
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    auto plugin = new GatherNdPluginDynamic(serial_data, serial_length);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+
+REGISTER_TRT_PLUGIN_V2(GatherNdPluginDynamicCreator);
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
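REGISTER_TRT_PLUGIN_V2 registers GatherNdPluginDynamicCreator so that an engine serialized with this plugin can be reloaded later; the creator is found again by the strings returned from getPluginName()/getPluginVersion(). A sketch of that lookup path using the standard TensorRT registry API (illustrative only; Paddle's engine performs the equivalent steps internally, this is not code from the patch):

#include <NvInferRuntimeCommon.h>

nvinfer1::IPluginV2* RecreateGatherNdPlugin(const void* serial_data,
                                            size_t serial_length) {
  // "gather_nd_plugin" / "1" must match getPluginName()/getPluginVersion()
  // of GatherNdPluginDynamicCreator above.
  nvinfer1::IPluginCreator* creator =
      getPluginRegistry()->getPluginCreator("gather_nd_plugin", "1");
  if (creator == nullptr) return nullptr;
  // Invokes GatherNdPluginDynamicCreator::deserializePlugin, which runs the
  // (serial_data, serial_length) constructor and restores with_fp16_.
  return creator->deserializePlugin("gather_nd_plugin", serial_data,
                                    serial_length);
}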
diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu
index 512f9c62415e5d1b09a1b649e78c72ac2d9f2d88..4d7218cd89e04b5122ff4385abfb2c7305e40c0a 100644
--- a/paddle/fluid/operators/math/bert_encoder_functor.cu
+++ b/paddle/fluid/operators/math/bert_encoder_functor.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
@@ -311,6 +312,156 @@ __global__ void SoftmaxKernelWithEltadd2(
 #endif
 }
 
+template <typename T>
+__global__ void SoftmaxKernelWithEltaddForLarge(T *qk_buf, const T *bias_qk,
+                                                const int batch_size,
+                                                const int head_num,
+                                                const int seq_len,
+                                                const unsigned mask) {
+  int qk_offset = blockIdx.x * seq_len;
+  assert(blockDim.x % 32 == 0);
+
+  T stride_max = -1e20f;
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    stride_max = qk_buf[threadIdx.x + i + qk_offset] +
+                             bias_qk[threadIdx.x + i + qk_offset] >
+                         stride_max
+                     ? qk_buf[threadIdx.x + i + qk_offset] +
+                           bias_qk[threadIdx.x + i + qk_offset]
+                     : stride_max;
+  }
+  T max_val = blockReduceMax<T>(stride_max, mask);
+
+  T stride_sum = 0.f;
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    stride_sum += __expf(qk_buf[threadIdx.x + i + qk_offset] +
+                         bias_qk[threadIdx.x + i + qk_offset] - max_val);
+  }
+  T sum_val = blockReduceSum<T>(stride_sum, mask);
+
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    qk_buf[threadIdx.x + i + qk_offset] =
+        (T)(__expf(qk_buf[threadIdx.x + i + qk_offset] +
+                   bias_qk[threadIdx.x + i + qk_offset] - max_val) /
+            sum_val);
+  }
+}
+
+// HIP defined __HIP_NO_HALF_CONVERSIONS__
+#ifndef __HIPCC__  // @{ Half kernel: SoftmaxKernelWithEltadd
+template <>
+__global__ void SoftmaxKernelWithEltaddForLarge(
+    half *qk_buf, const half *bias_qk, const int batch_size, const int head_num,
+    const int seq_len, const unsigned mask) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+  int qk_offset = blockIdx.x * seq_len;
+  assert(blockDim.x % 32 == 0);
+
+  float stride_max = -1e20f;
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    float tmp = static_cast<float>(qk_buf[threadIdx.x + i + qk_offset] +
+                                   bias_qk[threadIdx.x + i + qk_offset]);
+    stride_max = tmp > stride_max ? tmp : stride_max;
+  }
+  float max_val = blockReduceMax<float>(stride_max, mask);
+
+  float stride_sum = 0.f;
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    float tmp = static_cast<float>(qk_buf[threadIdx.x + i + qk_offset] +
+                                   bias_qk[threadIdx.x + i + qk_offset]);
+    stride_sum += __expf(tmp - max_val);
+  }
+  float sum_val = blockReduceSum<float>(stride_sum, mask);
+
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    float tmp =
+        __expf(static_cast<float>(qk_buf[threadIdx.x + i + qk_offset] +
+                                  bias_qk[threadIdx.x + i + qk_offset]) -
+               max_val);
+    qk_buf[threadIdx.x + i + qk_offset] = (half)(tmp / sum_val);
+  }
+#endif
+}
+#endif  // @} End Half kernel: SoftmaxKernelWithEltadd
+
+template <typename T>
+__global__ void SoftmaxKernelWithEltaddForLarge2(T *qk_buf_, const T *bias_qk_,
+                                                 const int batch_size,
+                                                 const int head_num,
+                                                 const int seq_len,
+                                                 const unsigned mask) {
+  int qk_offset = blockIdx.x * seq_len;
+  assert(blockDim.x % 32 == 0);
+
+  float2 stride_max = make_float2(-1e20f, -1e20f);
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
+                             bias_qk_[threadIdx.x + i + qk_offset]);
+    stride_max.x = max(stride_max.x, cur.x);
+    stride_max.y = max(stride_max.y, cur.y);
+  }
+  float max_val = blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
+
+  float2 stride_sum = make_float2(0.f, 0.f);
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
+                             bias_qk_[threadIdx.x + i + qk_offset]);
+    stride_sum.x += __expf(cur.x - max_val);
+    stride_sum.y += __expf(cur.y - max_val);
+  }
+
+  float sum_val =
+      blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) + 1e-6f;
+
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
+                             bias_qk_[threadIdx.x + i + qk_offset]);
+    qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair<T>(
+        __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val);
+  }
+}
+
+template <>
+__global__ void SoftmaxKernelWithEltaddForLarge2(
+    half2 *qk_buf_, const half2 *bias_qk_, const int batch_size,
+    const int head_num, const int seq_len, const unsigned mask) {
+// operator "+" of half only supported after cuda version 10.0
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#if defined(PADDLE_WITH_CUDA) && \
+    (CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000)
+
+  int qk_offset = blockIdx.x * seq_len;
+  assert(blockDim.x % 32 == 0);
+
+  float2 stride_max = make_float2(-1e20f, -1e20f);
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                 bias_qk_[threadIdx.x + i + qk_offset]);
+    stride_max.x = max(stride_max.x, cur.x);
+    stride_max.y = max(stride_max.y, cur.y);
+  }
+  float max_val = blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
+
+  float2 stride_sum = make_float2(0.f, 0.f);
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                 bias_qk_[threadIdx.x + i + qk_offset]);
+    stride_sum.x += __expf(cur.x - max_val);
+    stride_sum.y += __expf(cur.y - max_val);
+  }
+
+  float sum_val =
+      blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) + 1e-6f;
+
+  for (int i = 0; i < seq_len; i += blockDim.x) {
+    float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                 bias_qk_[threadIdx.x + i + qk_offset]);
+    qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair<half2>(
+        __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val);
+  }
+#endif
+}
+
 template <typename T>
 inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context,
                              int head_num, int seq_len, int size_per_head,
@@ -332,31 +483,48 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context,
                    reinterpret_cast<run_type *>(qk_buf_), batch_size * head_num,
                    seq_len * size_per_head, seq_len * size_per_head);
 
-  int grid = batch_size * head_num * seq_len;
-  int block = seq_len;
-
-  // Align block to 32, also limit seq_len to max block size.
-  PADDLE_ENFORCE_LE(seq_len, 1024, platform::errors::InvalidArgument(
-                                       "seq_len should <= 1024, "
-                                       "but received seq_len is:%d",
-                                       seq_len));
-  if (seq_len % 2 == 0) {
-    block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32;
-    if (std::is_same<T, float>::value) {
-      SoftmaxKernelWithEltadd2<float2><<<grid, block, 0, stream>>>(
-          reinterpret_cast<float2 *>(qk_buf_),
-          reinterpret_cast<const float2 *>(bias_qk), batch_size, head_num,
-          seq_len / 2, FINAL_MASK);
+  if (seq_len <= 1024) {
+    int grid = batch_size * head_num * seq_len;
+    int block = seq_len;
+
+    // Align block to 32, also limit seq_len to max block size.
+    if (seq_len % 2 == 0) {
+      block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32;
+      if (std::is_same<T, float>::value) {
+        SoftmaxKernelWithEltadd2<float2><<<grid, block, 0, stream>>>(
+            reinterpret_cast<float2 *>(qk_buf_),
+            reinterpret_cast<const float2 *>(bias_qk), batch_size, head_num,
+            seq_len / 2, FINAL_MASK);
+      } else {
+        SoftmaxKernelWithEltadd2<__half2><<<grid, block, 0, stream>>>(
+            reinterpret_cast<__half2 *>(qk_buf_),
+            reinterpret_cast<const __half2 *>(bias_qk), batch_size, head_num,
+            seq_len / 2, FINAL_MASK);
+      }
     } else {
-      SoftmaxKernelWithEltadd2<__half2><<<grid, block, 0, stream>>>(
-          reinterpret_cast<__half2 *>(qk_buf_),
-          reinterpret_cast<const __half2 *>(bias_qk), batch_size, head_num,
-          seq_len / 2, FINAL_MASK);
+      block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32;
+      SoftmaxKernelWithEltadd<T><<<grid, block, 0, stream>>>(
+          qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK);
     }
   } else {
-    block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32;
-    SoftmaxKernelWithEltadd<T><<<grid, block, 0, stream>>>(
-        qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK);
+    int grid = batch_size * head_num * seq_len;
+    int block = 512;
+    if (seq_len % 2 == 0) {
+      if (std::is_same<T, float>::value) {
+        SoftmaxKernelWithEltaddForLarge2<float2><<<grid, block, 0, stream>>>(
+            reinterpret_cast<float2 *>(qk_buf_),
+            reinterpret_cast<const float2 *>(bias_qk), batch_size, head_num,
+            seq_len / 2, FINAL_MASK);
+      } else {
+        SoftmaxKernelWithEltaddForLarge2<__half2><<<grid, block, 0, stream>>>(
+            reinterpret_cast<__half2 *>(qk_buf_),
+            reinterpret_cast<const __half2 *>(bias_qk), batch_size, head_num,
+            seq_len / 2, FINAL_MASK);
+      }
+    } else {
+      SoftmaxKernelWithEltaddForLarge<T><<<grid, block, 0, stream>>>(
+          qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK);
+    }
   }
 }
 
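The behavioural change in MatMulWithHeadQK is the launch configuration: instead of rejecting seq_len > 1024 with PADDLE_ENFORCE_LE, it now falls back to the *ForLarge kernels, which use a fixed 512-thread block and let each thread stride over its row. A small helper restating how (grid, block) is chosen (illustrative only, the helper name is not part of this patch):

#include <utility>

// Mirrors the dispatch in MatMulWithHeadQK: one block per attention row
// (batch_size * head_num * seq_len rows of length seq_len).
std::pair<int, int> SoftmaxLaunchConfig(int batch_size, int head_num,
                                        int seq_len) {
  int grid = batch_size * head_num * seq_len;
  int block;
  if (seq_len <= 1024) {
    // One thread per element (or per float2/half2 pair when seq_len is even),
    // rounded up to a multiple of 32 as before.
    block = (seq_len % 2 == 0)
                ? ((seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32)
                : ((seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32);
  } else {
    // New path: fixed 512-thread block; threads loop with stride blockDim.x.
    block = 512;
  }
  return {grid, block};
}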
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..75f5328ac1c419015b379b08cc050f478a5b2112
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTGatherNdTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[-1, 3, 4], dtype="float32")
+            index = fluid.data(name="index", shape=[-1, 2, 2], dtype="int32")
+            gather_nd = fluid.layers.gather_nd(data, index)
+            out = fluid.layers.batch_norm(gather_nd, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([2, 3, 4]).astype("float32"),
+            "index":
+            np.array([[[0, 1], [1, 0]], [[1, 2], [0, 1]]]).astype("int32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTGatherNdTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+        self.dynamic_shape_params = TRTGatherNdTest.DynamicShapeParam({
+            'data': [1, 3, 4],
+            'index': [1, 2, 2]
+        }, {'data': [3, 3, 4],
+            'index': [3, 2, 2]}, {'data': [3, 3, 4],
+                                  'index': [3, 2, 2]}, False)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+class TRTGatherNdFp16Test(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 5120, 768], dtype="float32")
+            index = fluid.data(name="index", shape=[-1, 4096, 2], dtype="int32")
+            gather_nd = fluid.layers.gather_nd(data, index)
+            out = fluid.layers.batch_norm(gather_nd, is_test=True)
+
+        index_data = np.zeros((1, 4096, 2), dtype='int32')
+        self.feeds = {
+            "data": np.random.random([1, 5120, 768]).astype("float32"),
+            "index": index_data,
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTGatherNdFp16Test.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
+        self.fetch_list = [out]
+        self.dynamic_shape_params = TRTGatherNdFp16Test.DynamicShapeParam({
+            'data': [1, 5120, 768],
+            'index': [1, 4096, 2]
+        }, {'data': [3, 5120, 768],
+            'index':
+            [3, 4096, 2]}, {'data': [3, 5120, 768],
+                            'index': [3, 4096, 2]}, False)
+
+    def test_check_output(self, atol=1e-3):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb5e8e99b092690a2c88d75ba3a77de2dba2e720
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTReduceSumTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 10, 768], dtype="float32")
+            reduce_sum = fluid.layers.reduce_sum(
+                data, dim=[2, -1], keep_dim=True)
+            out = fluid.layers.batch_norm(reduce_sum, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([3, 3, 10, 768]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTReduceSumTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+        self.dynamic_shape_params = TRTReduceSumTest.DynamicShapeParam({
+            'data': [1, 3, 8, 8]
+        }, {'data': [3, 3, 10, 768]}, {'data': [3, 3, 10, 768]}, False)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+class TRTReduceSumAllTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 10, 768], dtype="float32")
+            reduce_sum = fluid.layers.reduce_sum(data, keep_dim=True)
+            out = fluid.layers.batch_norm(reduce_sum, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([3, 3, 10, 768]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTReduceSumAllTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+        self.dynamic_shape_params = TRTReduceSumAllTest.DynamicShapeParam({
+            'data': [1, 3, 8, 8]
+        }, {'data': [3, 3, 10, 768]}, {'data': [3, 3, 10, 768]}, False)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()