Unverified commit 51ecd933, authored by Yuanle Liu, committed by GitHub

[Inference] clean unused code/targets to reduce inference .so volume (PART I) (#53762)

* remove prelu and lookup_table plugin, adjust .h include location

* clean code and adjust some .h

* update
Parent commit db407bf0
@@ -107,11 +107,7 @@ void AttentionLSTMFusePass::FindWhileOp(Graph* graph) const {
   gpd.mutable_pattern()->NewNode(
       [&](Node* n) { return fused_external_ops.count(n->id()); }, "while");
-  if (!graph->Has(kGraphvizMarkedNodeAttr)) {
-    graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
-  }
-  auto& marked_nodes =
-      graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
+  auto& marked_nodes = GetMarkedNodes(graph);
   auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
......
@@ -14,6 +14,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include <fstream>
+#include <ostream>
 #include <string>
 #include "paddle/fluid/framework/ir/graph_helper.h"
......
@@ -14,12 +14,7 @@ limitations under the License. */
 #pragma once
-#include <fstream>
-#include <map>
-#include <memory>
-#include <string>
 #include <unordered_set>
-#include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
......
@@ -15,7 +15,6 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
-#include <vector>
 #include "paddle/fluid/inference/analysis/passes/passes.h"
 #include "paddle/fluid/string/pretty_log.h"
......
@@ -35,12 +35,10 @@ limitations under the License. */
  * phase in the inference service.
  */
-#include <string>
 #include <vector>
 #include "gflags/gflags.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/flags.h"
 namespace paddle {
 namespace inference {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace inference {
namespace analysis {
enum class Device { CPU, GPU };
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include "paddle/fluid/inference/analysis/dot.h"
namespace paddle {
namespace inference {
namespace analysis {
class DotTester : public ::testing::Test {
protected:
void SetUp() override {
std::vector<Dot::Attr> attrs({{"title", "hello"}});
dot.reset(new Dot(attrs));
dot->AddNode("a", {Dot::Attr{"shape", "box"}, Dot::Attr("color", "blue")});
dot->AddNode("b", {});
dot->AddNode("c", {});
dot->AddEdge("a", "b", {});
dot->AddEdge("b", "c", {});
dot->AddEdge("a", "c", {});
}
std::unique_ptr<Dot> dot;
};
TEST_F(DotTester, Build) {
auto codes = dot->Build();
// Output the DOT language code; the generated code is too long to compare
// as a string.
//
// The output is
//
// digraph G {
// title="hello"
// node_1
// node_2
// node_0[label="a" shape="box" color="blue"]
// node_0->node_1
// node_1->node_2
// node_0->node_2
// } // end G
LOG(INFO) << '\n' << codes;
}
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "gflags/gflags.h"
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
@@ -28,6 +28,7 @@
 #include "paddle/fluid/framework/ir/subgraph_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
 #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
......
@@ -21,7 +21,6 @@
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 namespace paddle {
......
@@ -14,16 +14,9 @@
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
-#include <memory>
-#include <utility>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
-#endif
 namespace paddle {
 namespace inference {
 namespace analysis {
......
@@ -33,10 +33,6 @@ class IrAnalysisPass : public AnalysisPass {
   void CollectFusionStatis(Argument* argument);
-  void ReadCalibrationInfo(
-      Argument* argument,
-      std::unordered_map<std::string, std::vector<float>>* var_quant_scales);
   std::string repr() const override;
 };
......
@@ -14,7 +14,6 @@
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
-#include <memory>
 #include <string>
 #include "paddle/fluid/framework/executor.h"
......
@@ -17,15 +17,15 @@
 #include <cstdlib>
 #include <string>
 #include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/phi/common/data_type.h"
-#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/fluid/platform/place.h"
 DEFINE_bool(
     custom_model_save_cpu,
......
@@ -15,12 +15,9 @@
 #pragma once
 #include <string>
-#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace inference {
......
@@ -15,7 +15,9 @@
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include <string>
+#include <unordered_set>
 #include <utility>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
......
@@ -15,12 +15,9 @@
 #pragma once
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <utility>
-#include <vector>
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/phi/backends/dynload/port.h"
 namespace paddle {
 namespace framework {
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
if(APPLE)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif()
......
# Embed Paddle Inference in Your Application
Paddle Inference offers APIs in both C and C++.
You can easily deploy a model trained by Paddle by following the steps below:
1. Optimize the native model;
2. Write some code for deployment.
## The APIs
All the released APIs are located in the `paddle_inference_api.h` header file.
The stable APIs are wrapped in `namespace paddle`; the unstable ones are kept in `namespace paddle::contrib`.
## Write some code
Read `paddle_inference_api.h` for more information.
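As an illustration of step 2, here is a minimal sketch of a deployment program against the native engine; the model directory and the input name `"x"` are placeholders rather than values taken from these docs.
```c++
#include <vector>

#include "paddle_inference_api.h"

int main() {
  // Configure a native (CPU) predictor; "path/to/model" is a placeholder.
  paddle::NativeConfig config;
  config.model_dir = "path/to/model";
  config.use_gpu = false;

  // The factory returns the predictor; the caller owns it.
  auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);

  // Wrap one int64 input of shape [4, 1]; a PaddleBuf constructed this way
  // only references the external buffer `data`.
  int64_t data[4] = {1, 2, 3, 4};
  paddle::PaddleTensor input;
  input.name = "x";
  input.shape = std::vector<int>({4, 1});
  input.data = paddle::PaddleBuf(data, sizeof(data));
  input.dtype = paddle::PaddleDType::INT64;

  // Run inference and collect the outputs.
  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({input}, &outputs);
  return 0;
}
```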
# Inference High-level APIs
This document describes the high-level inference APIs, which one can use to quickly deploy a Paddle model in an application.
The APIs are declared in a single header, `paddle_inference_api.h`; two libraries, `libpaddle_inference.so` and `libpaddle_inference_io.so`, are needed for deployment.
## PaddleTensor
We provide the `PaddleTensor` data structure to give a general tensor interface.
The definition is
```c++
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
```
The data is stored in a contiguous block of memory, `PaddleBuf`, and a `PaddleDType` specifies the tensor's data type.
The `name` field specifies the name of an input variable,
which is important when there are multiple inputs and you need to distinguish which variable to set.
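For illustration, a small sketch of filling a `PaddleTensor`; the variable name `"x"`, the shape, and the use of `FLOAT32` are placeholders for whatever the deployed model actually expects.
```c++
#include <vector>

#include "paddle_inference_api.h"

// Wrap four float values as an input tensor of shape [2, 2]. A PaddleBuf
// constructed from an external pointer only references that memory, so
// `buffer` must stay alive for as long as the tensor is used.
paddle::PaddleTensor MakeInput(float* buffer) {
  paddle::PaddleTensor tensor;
  tensor.name = "x";  // placeholder input variable name
  tensor.shape = std::vector<int>({2, 2});
  tensor.data = paddle::PaddleBuf(buffer, 4 * sizeof(float));
  tensor.dtype = paddle::PaddleDType::FLOAT32;
  return tensor;
}
```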
## engine
The inference APIs have two different underlying engines:
- the native engine
- the tensorrt engine
The native engine, which consists of the native operators and framework, takes a native Paddle model
as input and supports any model trained by Paddle.
```c++
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
};
```
## PaddlePredictor and how to create one
The main interface is `PaddlePredictor`, which has the following methods:
- `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)`
  - takes the inputs and fills `output_data` with the prediction results.
- `Clone`, which clones a predictor from an existing one, with the model parameters shared.
There is a factory method to create a predictor, and the user takes ownership of the returned object.
```c++
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
```
By specifying the engine kind and config, one can get a specific implementation.
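A hedged sketch of engine selection and cloning follows; the config fields mirror the native-engine example elsewhere in these docs, and whether the TensorRT mixed engine accepts the same config type is not specified here.
```c++
#include <vector>

#include "paddle_inference_api.h"

void Demo(const std::vector<paddle::PaddleTensor>& inputs) {
  paddle::NativeConfig config;
  config.model_dir = "path/to/model";  // placeholder
  config.use_gpu = false;

  // Explicitly request the native engine; kNative is also the default value
  // of the second template parameter.
  auto predictor =
      paddle::CreatePaddlePredictor<paddle::NativeConfig,
                                    paddle::PaddleEngineKind::kNative>(config);

  // Switching the second template argument to
  // paddle::PaddleEngineKind::kAutoMixedTensorRT selects the TensorRT mixed
  // engine; the config type it expects may differ and is not covered here.

  // Clone creates a second predictor that shares the model parameters with
  // the original one.
  auto cloned = predictor->Clone();

  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run(inputs, &outputs);
  cloned->Run(inputs, &outputs);
}
```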
## Reference
- [paddle_inference_api.h](./paddle_inference_api.h)
- [some demos](./demo_ci)
# Paddle Inference API
To make inference deployment simpler, Fluid provides a set of high-level APIs that hide the different underlying optimized implementations.
The inference library contains:
- the header file `paddle_inference_api.h`, which defines all the interfaces
- the library files `libpaddle_inference.so/.a` (Linux/Mac) and `libpaddle_inference.lib/paddle_inference.dll` (Windows)
The main API concepts are introduced in detail below.
## PaddleTensor
`PaddleTensor` defines the basic input/output data format for inference. Its definition is
```c++
struct PaddleTensor {
  std::string name;  // variable name.
  std::vector<int> shape;
  PaddleBuf data;  // blob of data.
  PaddleDType dtype;
};
```
- `name` specifies the name of the corresponding variable in the model (currently unused, but it will be enabled once arbitrary targets are supported)
- `shape` is the shape of the tensor
- `data` holds the data in a contiguous memory block, `PaddleBuf`, which can either wrap external data or `malloc` its own memory; see the related definitions in the header file for details
- `dtype` is the data type of the tensor
## engine
The high-level API is backed by several optimized implementations, which we call engines. There are currently two engines:
- the native engine, composed of Paddle's native forward operators, which naturally supports any model trained by Paddle
- the TensorRT mixed engine, which integrates [TensorRT](https://developer.nvidia.com/tensorrt) at the subgraph level; it supports all Paddle models and automatically offloads parts of the computation graph to TensorRT for acceleration (WIP)
They are selected via
```c++
enum class PaddleEngineKind {
  kNative = 0,        // Use the native Fluid facility.
  kAutoMixedTensorRT  // Automatically mixing TensorRT with the Fluid ops.
};
```
## Inference deployment workflow
Overall, deployment consists of the following steps:
1. Create a `PaddlePredictor` with a suitable config;
2. Create the input `PaddleTensor`s and pass them to the `PaddlePredictor`;
3. Fetch the output `PaddleTensor`s and read out the results.
The following demonstrates a complete run with a simple model; some detail code is omitted.
```c++
#include "paddle_inference_api.h"
// Create a config and set the relevant options
paddle::NativeConfig config;
config.model_dir = "xxx";
config.use_gpu = false;
// Create a native PaddlePredictor
auto predictor =
    paddle::CreatePaddlePredictor<paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);
// Create the input tensor
int64_t data[4] = {1, 2, 3, 4};
paddle::PaddleTensor tensor{.name = "",
                            .shape = std::vector<int>({4, 1}),
                            .data = paddle::PaddleBuf(data, sizeof(data)),
                            .dtype = paddle::PaddleDType::INT64};
// Create the output tensors; their memory can be reused across runs
std::vector<paddle::PaddleTensor> outputs;
// Run inference
CHECK(predictor->Run({tensor}, &outputs));
// Read the outputs ...
```
At build time, link against `libpaddle_inference.a/.so` (Linux/Mac) or `libpaddle_inference.lib/paddle_inference.dll` (Windows).
## Detailed code references
- [inference demos](./demo_ci)
- [single-threaded and multi-threaded examples](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
namespace paddle {
namespace inference {
struct Buffer;
enum class DeviceType { UNK = -1, CPU, GPU };
/*
* EngineBase is the base class of all inference engines. An inference engine
* takes a Paddle program as input, and outputs the result in fluid Tensor
* format. It can be used to optimize the performance of computation sub-blocks,
* for example, by breaking the original block down into sub-blocks and
* executing each sub-block in a different engine.
*
* For example:
* At inference time, most of the ResNet50 model can be put into a subgraph and
* run on a TensorRT engine.
*
* There are several engines such as TensorRT and other frameworks, so an
* EngineBase is put forward to give a unified interface for all the
* different engine implementations.
*/
class EngineBase {
public:
using DescType = ::paddle::framework::proto::BlockDesc;
// Build the model and do some preparation, for example, in TensorRT, run
// createInferBuilder, buildCudaEngine.
virtual void Build(const DescType& paddle_model) = 0;
// Execute the engine, that will run the inference network.
virtual void Execute(int batch_size) = 0;
virtual ~EngineBase() {}
}; // class EngineBase
struct Buffer {
void* buffer{nullptr}; // buffer should be allocated only once.
size_t max_size; // buffer allocated space.
size_t size; // data size.
DeviceType device{DeviceType::UNK}; // tells which device this buffer is on.
};
} // namespace inference
} // namespace paddle
@@ -99,7 +99,6 @@ list(
     merge_layernorm_op.cc
     skip_merge_layernorm_op.cc
     generic_and_custom_plugin_creater.cc
-    fused_lookup_tables_op.cc
     lookup_table_op.cc
     elementwiseadd_transpose_op.cc
     skip_groupnorm_act_op.cc
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/utils.h"
#include "paddle/fluid/inference/tensorrt/plugin/lookup_table.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class FusedLookupTablesOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) override {
if (!engine_->with_dynamic_shape()) {
PADDLE_THROW(
platform::errors::Fatal("lookup_table_op must with dynamic shape"));
}
framework::OpDesc op_desc(op, nullptr);
auto ids_name = op_desc.Input("Ids").front();
auto w_name = op_desc.Input("W").front();
auto output_name = op_desc.Output("Out").front();
bool enable_int8 = op_desc.HasAttr("enable_int8");
std::vector<nvinfer1::ITensor*> plugin_inputs;
auto ids_dims = engine_->GetITensor(ids_name)->getDimensions();
if (ids_dims.d[ids_dims.nbDims - 1] == 1) {
nvinfer1::Dims new_ids_dims;
new_ids_dims.nbDims = ids_dims.nbDims - 1;
for (int i = 0; i < ids_dims.nbDims - 1; i++) {
new_ids_dims.d[i] = 0;
}
auto* reshape_layer = TRT_ENGINE_ADD_LAYER(
engine_, Shuffle, *(engine_->GetITensor(ids_name)));
reshape_layer->setReshapeDimensions(new_ids_dims);
reshape_layer->setName(
("lookup_table: Shuffle (Output: " + output_name + ")").c_str());
plugin_inputs.push_back(reshape_layer->getOutput(0));
} else {
plugin_inputs.push_back(engine_->GetITensor(ids_name));
}
TensorRTEngine::Weight weight;
auto* w_var = scope.FindVar(w_name);
auto* w_tensor = w_var->GetMutable<phi::DenseTensor>();
auto w_dims = w_tensor->dims();
weight = engine_->GetTrtWeight(w_name, *w_tensor);
auto weight_size = phi::product(w_dims);
int output_fp16;
if (engine_->precision() == AnalysisConfig::Precision::kFloat32) {
output_fp16 = 0;
} else {
output_fp16 = 1;
}
int32_t weight_width = static_cast<int32_t>(w_dims[1]);
std::vector<nvinfer1::PluginField> fields;
fields.emplace_back("lookup_table_weight",
weight.get().values,
GetPluginFieldType(weight.get().type),
static_cast<int32_t>(weight_size));
fields.emplace_back("lookup_table_weight_width",
&weight_width,
nvinfer1::PluginFieldType::kINT32,
1);
fields.emplace_back(
"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1);
nvinfer1::PluginFieldCollection* plugin_ptr =
static_cast<nvinfer1::PluginFieldCollection*>(
malloc(sizeof(*plugin_ptr) +
fields.size() * sizeof(nvinfer1::PluginField)));
plugin_ptr->nbFields = static_cast<int>(fields.size());
plugin_ptr->fields = fields.data();
auto creator =
GetPluginRegistry()->getPluginCreator("LookupTablePluginDynamic", "1");
auto plugin_obj =
creator->createPlugin("LookupTablePluginDynamic", plugin_ptr);
auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
plugin_layer->setName(
("lookup_table: (Output: " + output_name + ")").c_str());
engine_->SetITensor(output_name, plugin_layer->getOutput(0));
free(plugin_ptr);
if (enable_int8) {
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), out_scale);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// NOTE(liuyuanle): We will remove the implementation here later. Ref to
// tensorrt/convert/lookup_table_op.cc.
// REGISTER_TRT_OP_CONVERTER(lookup_table, FusedLookupTablesOpConverter);
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
 namespace paddle {
 namespace inference {
......
@@ -31,7 +31,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
......
@@ -4,7 +4,6 @@ list(
     trt_plugin.cc
     split_op_plugin.cu
     elementwise_op_plugin.cu
-    prelu_op_plugin.cu
     gelu_op_plugin.cu
     pool_op_plugin.cu
     swish_op_plugin.cu
@@ -40,7 +39,6 @@ list(
     preln_groupnorm_act_op_plugin.cu
     elementwiseadd_transpose_op_plugin.cu
     generic_plugin.cu
-    lookup_table.cu
     many_emb_layernorm_plugin.cu
     many_emb_layernorm_kernel.cu)
@@ -57,11 +55,7 @@ endif()
 nv_library(
   tensorrt_plugin
   SRCS ${TRT_FILES}
-  DEPS enforce
-       tensorrt_engine
-       prelu
-       tensor
-       bert_encoder_functor
+  DEPS enforce tensorrt_engine tensor bert_encoder_functor
        tensorrt_dynamic_shape_infermeta_factory
        tensorrt_plugin_arg_mapping_context)
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/inference/tensorrt/plugin/lookup_table.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
char const* PLUGINVERSION{"1"};
char const* LOOKUPTABLEPLUGINNAME{"LookupTablePluginDynamic"};
template <typename T, unsigned TPB>
__global__ void lookup_table_kernel(int weight_height,
int32_t const* inputIds,
T const* wordEmb,
int32_t const wordSize,
T* output) {
// 1. lookup word and token of the block
// blockIdx.x = position in the sequence
// blockIdx.y = batch
// gridDim.x = S
// gridDim.y = B
__shared__ int wordId;
int32_t const seqPos = blockIdx.x + blockIdx.y * gridDim.x;
if (threadIdx.x == 0) {
wordId = inputIds[seqPos];
}
__syncthreads();
// 2. load word embeddings and add them together
// offset into embeddings is given by wordId * hidden_size
int32_t const woffset = wordId * weight_height;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
int32_t const outOffset = seqPos * weight_height;
if (wordId >= 0 && wordId < wordSize) {
for (int it = threadIdx.x; it < weight_height; it += TPB) {
T const w(wordEmb[woffset + it]);
output[outOffset + it] = w;
}
} else {
printf(
"Error!!!!!!(LookupTablePlugin): ID cannot be lookup "
"table: ID < 0 or ID > max ");
return;
}
}
template <typename T>
int lookup_table(cudaStream_t stream,
int weight_height,
int B,
int S,
int32_t const* inputIds,
T const* wordEmb,
int32_t const wordSize,
T* output) {
constexpr int tpb = 256;
dim3 const grid(S, B, 1);
dim3 const block(tpb, 1, 1);
lookup_table_kernel<T, tpb><<<grid, block, 0, stream>>>(
weight_height, inputIds, wordEmb, wordSize, output);
return 0;
}
// Static class fields initialization
nvinfer1::PluginFieldCollection LookupTablePluginDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
LookupTablePluginDynamicCreator::mPluginAttributes;
LookupTablePluginDynamic::LookupTablePluginDynamic(
nvinfer1::DataType const type,
void* weight_dev,
int32_t weight_size,
int32_t width)
: mType(type),
mWeightDev(weight_dev),
mWeightSize(weight_size),
mWeightWidth(width) {}
LookupTablePluginDynamic::LookupTablePluginDynamic(void const* data,
size_t length) {
// Deserialize in the same order as serialization
deserialize_value(&data, &length, &mType);
deserialize_value(&data, &length, &mWeightSize);
deserialize_value(&data, &length, &mWeightWidth);
char const* d = static_cast<char const*>(data);
cudaMalloc(&mWeightDev, mWeightSize * getElementSize(mType));
cudaMemcpy(mWeightDev,
d,
mWeightSize * getElementSize(mType),
cudaMemcpyHostToDevice);
}
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt* LookupTablePluginDynamic::clone()
const noexcept {
auto p = new LookupTablePluginDynamic(
mType, mWeightDev, mWeightSize, mWeightWidth);
p->setPluginNamespace(mNamespace.c_str());
return p;
}
nvinfer1::DimsExprs LookupTablePluginDynamic::getOutputDimensions(
int32_t outputIndex,
nvinfer1::DimsExprs const* inputs,
int32_t nbInputs,
nvinfer1::IExprBuilder& exprBuilder) noexcept {
nvinfer1::DimsExprs ret;
ret.nbDims = inputs[0].nbDims + 1;
for (int i = 0; i < inputs[0].nbDims; ++i) {
ret.d[i] = inputs[0].d[i];
}
ret.d[inputs[0].nbDims] = exprBuilder.constant(mWeightWidth);
return ret;
}
bool LookupTablePluginDynamic::supportsFormatCombination(
int32_t pos,
nvinfer1::PluginTensorDesc const* inOut,
int32_t nbInputs,
int32_t nbOutputs) noexcept {
nvinfer1::PluginTensorDesc const& desc = inOut[pos];
if (desc.format != nvinfer1::TensorFormat::kLINEAR) {
return false;
}
if (pos == 0) {
return desc.type == nvinfer1::DataType::kINT32;
}
if (pos == 1) {
if (mType == nvinfer1::DataType::kFLOAT) {
return desc.type == nvinfer1::DataType::kFLOAT;
} else {
return desc.type == nvinfer1::DataType::kHALF;
}
}
}
void LookupTablePluginDynamic::configurePlugin(
nvinfer1::DynamicPluginTensorDesc const* inputs,
int32_t nbInputs,
nvinfer1::DynamicPluginTensorDesc const* outputs,
int32_t nbOutputs) noexcept {}
size_t LookupTablePluginDynamic::getWorkspaceSize(
nvinfer1::PluginTensorDesc const* inputs,
int32_t nbInputs,
nvinfer1::PluginTensorDesc const* outputs,
int32_t nbOutputs) const noexcept {
return 0;
}
int32_t LookupTablePluginDynamic::enqueue(
nvinfer1::PluginTensorDesc const* inputDesc,
nvinfer1::PluginTensorDesc const* outputDesc,
void const* const* inputs,
void* const* outputs,
void* workspace,
cudaStream_t stream) noexcept {
int32_t const batchSize = inputDesc->dims.d[0];
int32_t S;
if (inputDesc->dims.nbDims == 1) {
S = 1;
} else {
S = inputDesc->dims.d[1];
}
int32_t mWeightHeight = mWeightSize / mWeightWidth;
int32_t status = STATUS_FAILURE;
auto const inputIds = static_cast<int32_t const*>(inputs[0]);
if (mType == nvinfer1::DataType::kFLOAT) {
auto output = static_cast<float*>(outputs[0]);
auto const Weight = static_cast<const float*>(mWeightDev);
status = lookup_table<float>(stream,
static_cast<int32_t>(mWeightWidth),
batchSize,
S,
inputIds,
Weight,
mWeightHeight,
output);
} else if (mType == nvinfer1::DataType::kHALF) {
auto output = static_cast<half*>(outputs[0]);
auto const Weight = static_cast<const half*>(mWeightDev);
status = lookup_table<half>(stream,
static_cast<int32_t>(mWeightWidth),
batchSize,
S,
inputIds,
Weight,
mWeightHeight,
output);
}
return status;
}
// IPluginV2Ext Methods
nvinfer1::DataType LookupTablePluginDynamic::getOutputDataType(
int32_t index,
nvinfer1::DataType const* inputTypes,
int32_t nbInputs) const noexcept {
if (index == 0) {
assert(mType == nvinfer1::DataType::kHALF ||
mType == nvinfer1::DataType::kFLOAT);
return mType;
}
}
// IPluginV2 Methods
char const* LookupTablePluginDynamic::getPluginType() const noexcept {
return LOOKUPTABLEPLUGINNAME;
}
char const* LookupTablePluginDynamic::getPluginVersion() const noexcept {
return PLUGINVERSION;
}
int32_t LookupTablePluginDynamic::getNbOutputs() const noexcept { return 1; }
int32_t LookupTablePluginDynamic::initialize() noexcept { return 0; }
void LookupTablePluginDynamic::terminate() noexcept { cudaFree(mWeightDev); }
size_t LookupTablePluginDynamic::getSerializationSize() const noexcept {
size_t const wordSize = getElementSize(mType);
return sizeof(mType) //
+ sizeof(mWeightSize) //
+ sizeof(mWeightWidth) //
+ wordSize * mWeightSize; //
}
void LookupTablePluginDynamic::serialize(void* buffer) const noexcept {
serialize_value(&buffer, mType);
serialize_value(&buffer, mWeightSize);
serialize_value(&buffer, mWeightWidth);
char* d = static_cast<char*>(buffer);
size_t const wordSize = getElementSize(mType);
serFromDev(&d, static_cast<char*>(mWeightDev), mWeightSize * wordSize);
}
void LookupTablePluginDynamic::destroy() noexcept {
// This gets called when the network containing plugin is destroyed
delete this;
}
void LookupTablePluginDynamic::setPluginNamespace(
char const* libNamespace) noexcept {
mNamespace = libNamespace;
}
char const* LookupTablePluginDynamic::getPluginNamespace() const noexcept {
return mNamespace.c_str();
}
LookupTablePluginDynamicCreator::LookupTablePluginDynamicCreator() {}
char const* LookupTablePluginDynamicCreator::getPluginName() const noexcept {
return LOOKUPTABLEPLUGINNAME;
}
char const* LookupTablePluginDynamicCreator::getPluginVersion() const noexcept {
return PLUGINVERSION;
}
nvinfer1::PluginFieldCollection const*
LookupTablePluginDynamicCreator::getFieldNames() noexcept {
return &mFC;
}
bool initializeFields(nvinfer1::PluginFieldCollection const* fc,
nvinfer1::Weights* weight,
int32_t& mWeightWidth) { // NOLINT
bool output_fp16 = false;
for (int32_t i = 0; i < fc->nbFields; i++) {
std::string field_name(fc->fields[i].name);
if (field_name.compare("lookup_table_weight") == 0) {
weight->values = fc->fields[i].data;
weight->count = fc->fields[i].length;
weight->type = fieldTypeToDataType(fc->fields[i].type);
}
if (field_name.compare("lookup_table_weight_width") == 0) {
assert(fc->fields[i].type == nvinfer1::PluginFieldType::kINT32);
mWeightWidth = const_cast<int32_t*>(
static_cast<int32_t const*>(fc->fields[i].data))[0]; // NOLINT
}
if (field_name.compare("output_fp16") == 0) {
assert(fc->fields[i].type == nvinfer1::PluginFieldType::kINT32);
output_fp16 = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0;
}
}
return output_fp16;
}
nvinfer1::IPluginV2* LookupTablePluginDynamicCreator::createPlugin(
char const* name, const nvinfer1::PluginFieldCollection* fc) noexcept {
nvinfer1::Weights weight;
int32_t mWeightWidth;
bool output_fp16 = initializeFields(fc, &weight, mWeightWidth);
nvinfer1::DataType type;
if (output_fp16) {
type = nvinfer1::DataType::kHALF;
} else {
type = nvinfer1::DataType::kFLOAT;
}
WeightsWithOwnership mWeight;
mWeight.convertAndCopy(weight, type);
void* cudaMem{nullptr};
cudaMalloc(&cudaMem, getWeightsSize(mWeight, type));
cudaMemcpy(cudaMem,
mWeight.values,
getWeightsSize(mWeight, type),
cudaMemcpyHostToDevice);
LookupTablePluginDynamic* p =
new LookupTablePluginDynamic(type, cudaMem, mWeight.count, mWeightWidth);
return p;
}
nvinfer1::IPluginV2* LookupTablePluginDynamicCreator::deserializePlugin(
char const* name, void const* serialData, size_t serialLength) noexcept {
return new LookupTablePluginDynamic(serialData, serialLength);
}
void LookupTablePluginDynamicCreator::setPluginNamespace(
char const* libNamespace) noexcept {
mNamespace = libNamespace;
}
char const* LookupTablePluginDynamicCreator::getPluginNamespace()
const noexcept {
return mNamespace.c_str();
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include "NvInferPlugin.h"
#include "NvInferRuntime.h"
#include "paddle/fluid/inference/tensorrt/plugin/common/bertCommon.h"
#include "paddle/fluid/inference/tensorrt/plugin/common/serialize.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class LookupTablePluginDynamic : public nvinfer1::IPluginV2DynamicExt {
public:
LookupTablePluginDynamic(nvinfer1::DataType const type,
void* weight_dev,
int32_t weight_size,
int32_t width);
LookupTablePluginDynamic(void const* data, size_t length);
// It doesn't make sense to make EmbLayerNormVarSeqlenPlugin without
// arguments, so we delete default constructor.
LookupTablePluginDynamic() = delete;
// IPluginV2DynamicExt Methods
bool supportsFormatCombination(int32_t pos,
nvinfer1::PluginTensorDesc const* inOut,
int32_t nbInputs,
int32_t nbOutputs) noexcept override;
size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs,
int32_t nbInputs,
nvinfer1::PluginTensorDesc const* outputs,
int32_t nbOutputs) const noexcept override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(
int32_t index,
nvinfer1::DataType const* inputTypes,
int32_t nbInputs) const noexcept override;
// IPluginV2 Methods
nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
nvinfer1::DimsExprs getOutputDimensions(
int32_t outputIndex,
const nvinfer1::DimsExprs* inputs,
int32_t nbInputs,
nvinfer1::IExprBuilder& exprBuilder) noexcept override;
void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in,
int32_t nbInputs,
nvinfer1::DynamicPluginTensorDesc const* out,
int32_t nbOutputs) noexcept override;
char const* getPluginType() const noexcept override;
int32_t getNbOutputs() const noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void* buffer) const noexcept override;
void destroy() noexcept override;
char const* getPluginNamespace() const noexcept override;
void setPluginNamespace(char const* pluginNamespace) noexcept override;
int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc,
nvinfer1::PluginTensorDesc const* outputDesc,
void const* const* inputs,
void* const* outputs,
void* workspace,
cudaStream_t stream) noexcept override;
int32_t initialize() noexcept override;
void terminate() noexcept override;
char const* getPluginVersion() const noexcept override;
protected:
std::string mNamespace;
nvinfer1::DataType mType;
void* mWeightDev{nullptr};
int32_t mWeightSize;
int32_t mWeightWidth;
};
class LookupTablePluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
LookupTablePluginDynamicCreator();
char const* getPluginName() const noexcept override;
const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override;
void setPluginNamespace(char const* pluginNamespace) noexcept override;
char const* getPluginNamespace() const noexcept override;
nvinfer1::IPluginV2* createPlugin(
char const* name,
const nvinfer1::PluginFieldCollection* fc) noexcept override;
char const* getPluginVersion() const noexcept override;
nvinfer1::IPluginV2* deserializePlugin(char const* name,
void const* serialData,
size_t serialLength) noexcept override;
protected:
static nvinfer1::PluginFieldCollection mFC;
static std::vector<nvinfer1::PluginField> mPluginAttributes;
std::string mNamespace;
};
REGISTER_TRT_PLUGIN_V2(LookupTablePluginDynamicCreator);
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
int PReluPlugin::initialize() TRT_NOEXCEPT {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_,
weight_.data(),
weight_.size() * sizeof(float),
cudaMemcpyHostToDevice);
return 0;
}
void PReluPlugin::terminate() TRT_NOEXCEPT {
if (p_gpu_weight_) {
cudaFree(p_gpu_weight_);
p_gpu_weight_ = nullptr;
}
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) TRT_NOEXCEPT {
assert(nbInputs == 1);
assert(index < this->getNbOutputs());
nvinfer1::Dims const &input_dims = inputDims[0];
nvinfer1::Dims output_dims = input_dims;
return output_dims;
}
int PReluPlugin::enqueue(int batch_size,
const void *const *inputs,
#if IS_TRT_VERSION_LT(8000)
void **outputs,
void *workspace,
cudaStream_t stream) {
#else
void *const *outputs,
void *workspace,
cudaStream_t stream) TRT_NOEXCEPT {
#endif
// input dims is CHW.
const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
// const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
const float *alpha = p_gpu_weight_;
float *const output = reinterpret_cast<float *const *>(outputs)[0];
int numel = 1;
for (int i = 0; i < input_dims.nbDims; i++) {
numel *= input_dims.d[i];
}
if (mode_ == "channel") {
bool channel_last = data_format_ == "NHWC";
operators::math::PreluChannelWiseDirectCUDAFunctor<float>
prelu_channel_wise;
prelu_channel_wise(stream,
input,
alpha,
output,
input_dims.d[0],
input_dims.d[1],
channel_last,
numel);
} else if (mode_ == "element") {
operators::math::PreluElementWiseDirectCUDAFunctor<float>
prelu_element_wise;
prelu_element_wise(stream, input, alpha, output, input_dims.d[0], numel);
} else {
operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
prelu_scalar(stream, input, alpha, output, numel);
}
return cudaGetLastError() != cudaSuccess;
}
#if IS_TRT_VERSION_GE(6000)
void PReluPluginDynamic::terminate() TRT_NOEXCEPT {
if (p_gpu_weight_) {
cudaFree(p_gpu_weight_);
}
}
int PReluPluginDynamic::initialize() TRT_NOEXCEPT {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_,
weight_.data(),
weight_.size() * sizeof(float),
cudaMemcpyHostToDevice);
return 0;
}
PReluPluginDynamic::PReluPluginDynamic(void const *serialData,
size_t serialLength) {
DeserializeValue(&serialData, &serialLength, &weight_);
const char *prelu_mode;
DeserializeValue(&serialData, &serialLength, &prelu_mode);
mode_ = std::string(prelu_mode);
}
size_t PReluPluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
return SerializedSize(mode_.c_str()) + SerializedSize(weight_);
}
void PReluPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {
SerializeValue(&buffer, weight_);
SerializeValue(&buffer, mode_.c_str());
}
nvinfer1::DimsExprs PReluPluginDynamic::getOutputDimensions(
int output_index,
const nvinfer1::DimsExprs *inputs,
int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT {
return inputs[0];
}
bool PReluPluginDynamic::supportsFormatCombination(
int pos,
const nvinfer1::PluginTensorDesc *in_out,
int nb_inputs,
int nb_outputs) TRT_NOEXCEPT {
PADDLE_ENFORCE_NOT_NULL(
in_out,
platform::errors::InvalidArgument(
"The input of swish plugin shoule not be nullptr."));
PADDLE_ENFORCE_LT(
pos,
nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos,
nb_inputs + nb_outputs));
(in_out && pos < (nb_inputs + nb_outputs));
return ((in_out[pos].type == nvinfer1::DataType::kFLOAT) &&
in_out[pos].format == nvinfer1::PluginFormat::kLINEAR);
}
nvinfer1::DataType PReluPluginDynamic::getOutputDataType(
int index,
const nvinfer1::DataType *input_types,
int nb_inputs) const TRT_NOEXCEPT {
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The PRelu Plugin only has one input, so the "
"index value should be 0, but get %d.",
index));
PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT),
true,
platform::errors::InvalidArgument(
"The input type should be half or float"));
return input_types[0];
}
int PReluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc,
const void *const *inputs,
void *const *outputs,
void *workspace,
cudaStream_t stream) TRT_NOEXCEPT {
auto input_dims = input_desc[0].dims;
const float *alpha = p_gpu_weight_;
const float *input = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
int numel = 1;
for (int i = 0; i < input_dims.nbDims; i++) {
numel *= input_dims.d[i];
}
if (mode_ == "channel") {
bool channel_last = data_format_ == "NHWC";
operators::math::PreluChannelWiseDirectCUDAFunctor<float>
prelu_channel_wise;
prelu_channel_wise(stream,
input,
alpha,
output,
input_dims.d[0],
input_dims.d[1],
channel_last,
numel);
} else if (mode_ == "element") {
operators::math::PreluElementWiseDirectCUDAFunctor<float>
prelu_element_wise;
prelu_element_wise(stream, input, alpha, output, input_dims.d[0], numel);
} else {
operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
prelu_scalar(stream, input, alpha, output, numel);
}
return cudaGetLastError() != cudaSuccess;
}
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class PReluPlugin : public PluginTensorRT {
std::vector<float> weight_;
float* p_gpu_weight_;
std::string mode_;
std::string data_format_;
public:
size_t getSerializationSize() const TRT_NOEXCEPT override {
return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
SerializedSize(data_format_.c_str()) + SerializedSize(weight_);
}
// TRT will call this func when we need to serialize the configuration of
// tensorrt.
// It should not be called by users.
void serialize(void* buffer) const TRT_NOEXCEPT override {
serializeBase(buffer);
SerializeValue(&buffer, weight_);
SerializeValue(&buffer, mode_.c_str());
SerializeValue(&buffer, data_format_.c_str());
}
PReluPlugin(const float* weight,
const int weight_num,
std::string const& mode,
std::string const& data_format)
: mode_(mode), data_format_(data_format) {
weight_.resize(weight_num);
std::copy(weight, weight + weight_num, weight_.data());
}
// It was used for tensorrt deserialization.
// It should not be called by users.
PReluPlugin(void const* serialData, size_t serialLength) {
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &weight_);
const char* prelu_mode;
DeserializeValue(&serialData, &serialLength, &prelu_mode);
mode_ = std::string(prelu_mode);
const char* prelu_data_format;
DeserializeValue(&serialData, &serialLength, &prelu_data_format);
data_format_ = std::string(prelu_data_format);
}
~PReluPlugin() {}
int initialize() TRT_NOEXCEPT override;
void terminate() TRT_NOEXCEPT override;
PReluPlugin* clone() const TRT_NOEXCEPT override {
auto* ptr =
new PReluPlugin(weight_.data(), weight_.size(), mode_, data_format_);
ptr->p_gpu_weight_ = p_gpu_weight_;
return ptr;
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "prelu_plugin";
}
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims* inputs,
int nbInputDims) TRT_NOEXCEPT override;
#if IS_TRT_VERSION_LT(8000)
int enqueue(int batchSize,
const void* const* inputs,
void** outputs,
#else
int enqueue(int batchSize,
const void* const* inputs,
void* const* outputs,
#endif
void* workspace,
cudaStream_t stream) TRT_NOEXCEPT override;
};
class PReluPluginCreator : public TensorRTPluginCreator {
public:
const char* getPluginName() const TRT_NOEXCEPT override {
return "prelu_plugin";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length)
TRT_NOEXCEPT override {
return new PReluPlugin(serial_data, serial_length);
}
};
REGISTER_TRT_PLUGIN_V2(PReluPluginCreator);
#if IS_TRT_VERSION_GE(6000)
class PReluPluginDynamic : public DynamicPluginTensorRT {
public:
PReluPluginDynamic(const float* weight,
const int weight_num,
std::string const& mode,
std::string const& data_format)
: mode_(mode), data_format_(data_format) {
weight_.resize(weight_num);
std::copy(weight, weight + weight_num, weight_.data());
}
PReluPluginDynamic(void const* serialData, size_t serialLength);
~PReluPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
auto ptr = new PReluPluginDynamic(
weight_.data(), weight_.size(), mode_, data_format_);
ptr->p_gpu_weight_ = p_gpu_weight_;
return ptr;
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "prelu_plugin_dynamic";
}
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
int initialize() TRT_NOEXCEPT override;
void terminate() TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void* buffer) const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int output_index,
const nvinfer1::DimsExprs* inputs,
int nb_inputs,
nvinfer1::IExprBuilder& expr_builder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) TRT_NOEXCEPT override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs,
void* const* outputs,
void* workspace,
cudaStream_t stream) TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const
TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override { delete this; }
private:
std::vector<float> weight_;
float* p_gpu_weight_;
std::string mode_;
std::string data_format_;
};
#endif
class PReluPluginDynamicCreator : public TensorRTPluginCreator {
public:
const char* getPluginName() const TRT_NOEXCEPT override {
return "prelu_plugin_dynamic";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length)
TRT_NOEXCEPT override {
return new PReluPluginDynamic(serial_data, serial_length);
}
};
REGISTER_TRT_PLUGIN_V2(PReluPluginDynamicCreator);
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle