From b9094509942d45eed9dbc674ff98952d53724f3f Mon Sep 17 00:00:00 2001
From: Shang Zhizhou
Date: Tue, 2 Feb 2021 10:02:42 +0800
Subject: [PATCH] fix trt plugin clone and initialize bugs in TRT7.1+ (#30709)

* fix trt plugin clone and initialize bugs
* fix unit test error
* enable trt in ci py3
* update unittest timeout
---
 .../plugin/emb_eltwise_layernorm_plugin.cu    |  22 +-
 .../plugin/emb_eltwise_layernorm_plugin.h     |   9 +
 .../plugin/instance_norm_op_plugin.cu         |   8 +-
 .../tensorrt/plugin/instance_norm_op_plugin.h |  16 +-
 .../tensorrt/plugin/prelu_op_plugin.cu        |   7 +
 .../tensorrt/plugin/prelu_op_plugin.h         |   9 +-
 .../plugin/skip_layernorm_op_plugin.cu        |  11 +
 .../plugin/skip_layernorm_op_plugin.h         |   5 +-
 .../unittests/ir/inference/CMakeLists.txt     |   2 +
 .../ir/inference/test_trt_activation_pass.py  | 228 ++++++++++++
 .../ir/inference/test_trt_conv_pass.py        | 155 ++++++++
 .../ir/inference/test_trt_subgraph_pass.py    | 331 +-----------------
 tools/dockerfile/ci_dockerfile.sh             |   3 +-
 13 files changed, 459 insertions(+), 347 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py

diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 30667514ac8..238daa4a886 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -39,8 +39,27 @@ EmbEltwiseLayernormPluginDynamicImpl<
 inline half fp32tofp16(float x) { return static_cast<half>(x); }
 
+template <typename T>
+void EmbEltwiseLayernormPluginDynamicImpl<T>::shareGPUData(
+    const EmbEltwiseLayernormPluginDynamicImplBase *anthor) {
+  auto *ptr =
+      dynamic_cast<const EmbEltwiseLayernormPluginDynamicImpl<T> *>(anthor);
+  if (!ptr->is_initialized_) {
+    return;
+  }
+  embs_gpu_ = ptr->embs_gpu_;
+  scale_gpu_ = ptr->scale_gpu_;
+  bias_gpu_ = ptr->bias_gpu_;
+  int input_num = embs_.size();
+  in_ptr_tensor_.Resize({input_num});
+  emb_ptr_tensor_.ShareDataWith(ptr->emb_ptr_tensor_);
+}
+
 template <typename T>
 int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
+  if (is_initialized_) {
+    return 0;
+  }
   embs_gpu_.resize(embs_.size());
   for (int i = 0; i < embs_.size(); i++) {
     if (embs_[i]) {
@@ -77,13 +96,12 @@ int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
   int input_num = embs_.size();
   in_ptr_tensor_.Resize({input_num});
   emb_ptr_tensor_.Resize({input_num});
-  cudaGetDevice(&device_id_);
   auto emb_ptr_gpu_d =
       emb_ptr_tensor_.mutable_data(platform::CUDAPlace(device_id_));
   cudaMemcpy(emb_ptr_gpu_d, embs_gpu_.data(), sizeof(uintptr_t) * input_num,
              cudaMemcpyHostToDevice);
-
+  is_initialized_ = true;
   return 0;
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
index fcba85daf9f..6c8381a750c 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
@@ -39,6 +39,8 @@ class EmbEltwiseLayernormPluginDynamicImplBase {
                       const nvinfer1::PluginTensorDesc* outputDesc,
                       const void* const* inputs, void* const* outputs,
                       void* workspace, cudaStream_t stream) = 0;
+  virtual void shareGPUData(
+      const EmbEltwiseLayernormPluginDynamicImplBase* anthor) = 0;
 };
 
 template <typename T>
@@ -67,6 +69,7 @@ class EmbEltwiseLayernormPluginDynamicImpl
               const nvinfer1::PluginTensorDesc* outputDesc,
               const void* const* inputs, void* const* outputs,
               void* workspace, cudaStream_t stream);
+  void shareGPUData(const EmbEltwiseLayernormPluginDynamicImplBase* anthor);
 
  private:
  std::vector embs_;
@@ -87,6 +90,7 @@ class EmbEltwiseLayernormPluginDynamicImpl
  framework::Tensor in_ptr_tensor_, emb_ptr_tensor_;
  int device_id_{0};
  uintptr_t old_input_ptr_{0};
+ bool is_initialized_{false};
 };
 
 class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
@@ -189,6 +193,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
     auto ptr = new EmbEltwiseLayernormPluginDynamic(
         embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
         hidden_size_, eps_, with_fp16_);
+    ptr->shareGPUData(this);
     return ptr;
   }
 
@@ -295,6 +300,10 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   bool own_host_buff_{false};
   EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr};
+
+  void shareGPUData(const EmbEltwiseLayernormPluginDynamic* anthor) {
+    impl_->shareGPUData(anthor->impl_);
+  }
 };
 
 class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
index a22714aa92f..a579743ee8a 100644
--- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
@@ -47,13 +47,7 @@ InstanceNormPlugin *CreateInstanceNormPluginDeserialize(const void *buffer,
 REGISTER_TRT_PLUGIN("instance_norm_plugin", CreateInstanceNormPluginDeserialize);
 
-int InstanceNormPlugin::initialize() {
-  platform::dynload::cudnnCreate(&handle_);
-  platform::dynload::cudnnCreateTensorDescriptor(&x_desc_);
-  platform::dynload::cudnnCreateTensorDescriptor(&y_desc_);
-  platform::dynload::cudnnCreateTensorDescriptor(&b_desc_);
-  return 0;
-}
+int InstanceNormPlugin::initialize() { return 0; }
 
 nvinfer1::Dims InstanceNormPlugin::getOutputDimensions(
     int index, const nvinfer1::Dims *inputDims, int nbInputs) {
diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h
index ec1d8e6517e..83422708f59 100644
--- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h
@@ -65,6 +65,10 @@ class InstanceNormPlugin : public PluginTensorRT {
                           "The instanceNorm's scale and bias should be the "
                           "same size. Got scale size = %d, but bias size = %d",
                           scale.size(), bias.size()));
+    platform::dynload::cudnnCreate(&handle_);
+    platform::dynload::cudnnCreateTensorDescriptor(&x_desc_);
+    platform::dynload::cudnnCreateTensorDescriptor(&y_desc_);
+    platform::dynload::cudnnCreateTensorDescriptor(&b_desc_);
   }
 
   // It was used for tensorrt deserialization.
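The emb_eltwise_layernorm change above boils down to one pattern: initialize() becomes idempotent behind an is_initialized_ flag, and clone() hands the already-uploaded GPU buffers to the new instance through shareGPUData() instead of repeating the upload. A minimal sketch of that pattern, with illustrative names and a shared_ptr standing in for the device buffers (not Paddle's actual classes):

#include <memory>
#include <utility>
#include <vector>

// FakePlugin, weights_ and gpu_buffer_ are made-up names for illustration.
class FakePlugin {
 public:
  explicit FakePlugin(std::vector<float> weights)
      : weights_(std::move(weights)) {}

  // initialize() may be called more than once; the flag makes the expensive
  // upload happen only the first time.
  int initialize() {
    if (is_initialized_) {
      return 0;
    }
    gpu_buffer_ = std::make_shared<std::vector<float>>(weights_);
    is_initialized_ = true;
    return 0;
  }

  // A clone of an initialized plugin reuses the existing buffer instead of
  // re-uploading it, mirroring what shareGPUData() does in the patch.
  FakePlugin *clone() const {
    auto *copy = new FakePlugin(weights_);
    copy->shareGPUData(*this);
    return copy;
  }

 private:
  void shareGPUData(const FakePlugin &other) {
    if (!other.is_initialized_) {
      return;  // nothing uploaded yet; the clone will initialize itself
    }
    gpu_buffer_ = other.gpu_buffer_;  // shared ownership, no second upload
    is_initialized_ = true;
  }

  std::vector<float> weights_;
  std::shared_ptr<std::vector<float>> gpu_buffer_;
  bool is_initialized_{false};
};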
@@ -74,9 +78,19 @@ class InstanceNormPlugin : public PluginTensorRT {
     DeserializeValue(&serialData, &serialLength, &eps_);
     DeserializeValue(&serialData, &serialLength, &scale_);
     DeserializeValue(&serialData, &serialLength, &bias_);
+
+    platform::dynload::cudnnCreate(&handle_);
+    platform::dynload::cudnnCreateTensorDescriptor(&x_desc_);
+    platform::dynload::cudnnCreateTensorDescriptor(&y_desc_);
+    platform::dynload::cudnnCreateTensorDescriptor(&b_desc_);
   }
 
-  ~InstanceNormPlugin() {}
+  ~InstanceNormPlugin() {
+    platform::dynload::cudnnDestroy(handle_);
+    platform::dynload::cudnnDestroyTensorDescriptor(x_desc_);
+    platform::dynload::cudnnDestroyTensorDescriptor(y_desc_);
+    platform::dynload::cudnnDestroyTensorDescriptor(b_desc_);
+  }
 
   int initialize() override;
 
   InstanceNormPlugin *clone() const override {
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
index 860f1039d5e..00182b87e98 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
@@ -39,6 +39,13 @@ int PReluPlugin::initialize() {
   return 0;
 }
 
+void PReluPlugin::terminate() {
+  if (p_gpu_weight_) {
+    cudaFree(p_gpu_weight_);
+    p_gpu_weight_ = nullptr;
+  }
+}
+
 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
                                                 const nvinfer1::Dims *inputDims,
                                                 int nbInputs) {
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
index 3126366c5fd..a0a24e70a01 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
@@ -66,11 +66,14 @@ class PReluPlugin : public PluginTensorRT {
     DeserializeValue(&serialData, &serialLength, &prelu_mode);
     mode_ = std::string(prelu_mode);
   }
-  ~PReluPlugin() { cudaFree(p_gpu_weight_); }
+  ~PReluPlugin() {}
   int initialize() override;
+  void terminate() override;
   PReluPlugin* clone() const override {
-    return new PReluPlugin(weight_.data(), weight_.size(), mode_);
+    auto* ptr = new PReluPlugin(weight_.data(), weight_.size(), mode_);
+    ptr->p_gpu_weight_ = p_gpu_weight_;
+    return ptr;
   }
 
   const char* getPluginType() const override { return "prelu_plugin"; }
@@ -100,7 +103,7 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
     DeserializeValue(&serialData, &serialLength, &prelu_mode);
     mode_ = std::string(prelu_mode);
   }
-  ~PReluPluginDynamic() { cudaFree(p_gpu_weight_); }
+  ~PReluPluginDynamic() {}
   nvinfer1::IPluginV2DynamicExt* clone() const override {
     auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
     ptr->p_gpu_weight_ = p_gpu_weight_;
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
index 6b2b93ba223..3b9eea22199 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
@@ -40,6 +40,17 @@ int SkipLayerNormPluginDynamic::initialize() {
   return 0;
 }
 
+void SkipLayerNormPluginDynamic::terminate() {
+  if (bias_gpu_) {
+    cudaFree(bias_gpu_);
+    bias_gpu_ = nullptr;
+  }
+  if (scale_gpu_) {
+    cudaFree(scale_gpu_);
+    scale_gpu_ = nullptr;
+  }
+}
+
 nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
     nvinfer1::IExprBuilder &expr_builder) {
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
index 563e2e119f5..0e457fdc8f4 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
@@ -104,13 +104,14 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
       int nb_inputs) const override;
 
   void destroy() override { delete this; }
+  void terminate() override;
 
  private:
   std::vector<float> bias_;
   std::vector<float> scale_;
 
-  float* bias_gpu_;
-  float* scale_gpu_;
+  float* bias_gpu_{nullptr};
+  float* scale_gpu_{nullptr};
 
   int bias_size_;
   int scale_size_;
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
index b667f522c09..dfec1cc7572 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
@@ -30,4 +30,6 @@ foreach(target ${TEST_INFERENCE_IR_PASSES})
 endforeach()
 if(WITH_GPU AND TENSORRT_FOUND)
 set_tests_properties(test_trt_subgraph_pass PROPERTIES TIMEOUT 120)
+set_tests_properties(test_trt_activation_pass PROPERTIES TIMEOUT 120)
+set_tests_properties(test_trt_conv_pass PROPERTIES TIMEOUT 120)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
new file mode 100644
index 00000000000..f71951497f2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
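Before moving on to the new Python tests: taken together, the instance_norm, prelu, and skip_layernorm hunks above apply one resource-lifetime rule. Handles that must survive repeated initialize() calls are created in the constructor and destroyed in the destructor, while device memory allocated in initialize() is released in terminate(), with the pointers null-initialized so terminate() is harmless even if initialize() never ran. A rough sketch of that rule (illustrative class and member names, not one of the real plugins):

#include <cuda_runtime.h>
#include <vector>

class FakeTrtPlugin {
 public:
  explicit FakeTrtPlugin(const std::vector<float> &weight) : weight_(weight) {}

  // The destructor no longer frees device memory; the runtime is expected to
  // call terminate() before the object goes away.
  ~FakeTrtPlugin() {}

  int initialize() {
    if (weight_gpu_ != nullptr) {
      return 0;  // already uploaded
    }
    cudaMalloc(&weight_gpu_, weight_.size() * sizeof(float));
    cudaMemcpy(weight_gpu_, weight_.data(), weight_.size() * sizeof(float),
               cudaMemcpyHostToDevice);
    return 0;
  }

  void terminate() {
    if (weight_gpu_ != nullptr) {
      cudaFree(weight_gpu_);
      weight_gpu_ = nullptr;  // makes repeated terminate() calls harmless
    }
  }

 private:
  std::vector<float> weight_;
  float *weight_gpu_{nullptr};
};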
+ +import os +import shutil +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TensorRTSubgraphPassActivationTest(InferencePassTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + + def setUp(self): + self.setUpTensorRTParam() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 64, 64], dtype="float32") + act_out = self.append_act(data) + out = fluid.layers.batch_norm(act_out, is_test=True) + self.feeds = { + "data": np.random.random([1, 6, 64, 64]).astype("float32"), + } + self.fetch_list = [out] + + def append_act(self, x): + return fluid.layers.relu(x) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + if os.path.exists(self.path + "_opt_cache"): + shutil.rmtree(self.path + "_opt_cache") + if self.trt_parameters.precision == AnalysisConfig.Precision.Float32: + self.check_output_with_option(use_gpu) + else: + self.check_output_with_option(use_gpu, 1e-3) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TensorRTSubgraphPassLeakyReluTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.leaky_relu(x) + + +class TensorRTSubgraphPassRelu6Test(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.relu6(x) + + +class TensorRTSubgraphPassSoftMaxTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.softmax(x) + + +class TensorRTSubgraphPassSigmoidTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.sigmoid(x) + + +class TensorRTSubgraphPassHardSwishTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.hard_swish(x) + + +class TensorRTSubgraphPassHardSigmoidTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.hard_sigmoid(x) + + +class TensorRTSubgraphPassHardSwishPluginTest( + TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.hard_swish(x, threshold=4.0, scale=8.0) + + +class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.clip(x, 0, 1) + + +class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.tanh(x) + + +class TensorRTSubgraphPassSwishTest(TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False) + + def append_act(self, x): + return fluid.layers.swish(x) + + +class TensorRTSubgraphPassSwishFp16SerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) + + def append_act(self, x): + return fluid.layers.swish(x) + + +class TensorRTSubgraphPassDynamicSwishFp16SerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = 
True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) + self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam( + { + 'data': [1, 6, 8, 8] + }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) + + def append_act(self, x): + return fluid.layers.swish(x) + + +class TensorRTSubgraphPassPreluAllTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.prelu(x, mode='all') + + +class TensorRTSubgraphPassPreluChannelTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.prelu(x, mode='channel') + + +class TensorRTSubgraphPassPreluElementTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.prelu(x, mode='element') + + +class TensorRTSubgraphPassGeluTest(TensorRTSubgraphPassActivationTest): + def append_act(self, x): + return fluid.layers.gelu(x) + + +class TensorRTSubgraphPassGeluDynamicTest(TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam( + { + 'data': [1, 6, 8, 8] + }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) + + def append_act(self, x): + return fluid.layers.gelu(x) + + +class TensorRTSubgraphPassGeluFp16Test(TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False) + + def append_act(self, x): + return fluid.layers.gelu(x) + + +class TensorRTSubgraphPassGeluFp16SerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) + + def append_act(self, x): + return fluid.layers.gelu(x) + + +class TensorRTSubgraphPassGeluFp16DynamicTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False) + self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam( + { + 'data': [1, 6, 8, 8] + }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) + + def append_act(self, x): + return fluid.layers.gelu(x) + + +class TensorRTSubgraphPassGeluFp16DynamicSerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) + self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam( + { + 'data': [1, 6, 8, 8] + }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) + + def append_act(self, x): + return fluid.layers.gelu(x) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py new file mode 100644 index 00000000000..0de37fce0ae --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py @@ -0,0 +1,155 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TensorRTSubgraphPassConvTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=self.conv_num_filters, + filter_size=self.conv_filter_size, + groups=self.conv_groups, + padding=self.conv_padding, + bias_attr=False, + act=None) + self.feeds = { + "data": np.random.random([1, 6, 64, 64]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassConvTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [conv_out] + + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 3 + self.conv_padding = [1, 1] + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TensorRTSubgraphPassConvValidPaddingTest(TensorRTSubgraphPassConvTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 3 + self.conv_padding = 'VALID' + + +''' +# conv2d padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete. 
+class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 3 + self.conv_padding = 'SAME' +''' + + +class TensorRTSubgraphPassDepthwiseConvTest(TensorRTSubgraphPassConvTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 6 + self.conv_padding = [1, 1] + + +class TensorRTSubgraphPassConvTransposeTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d_transpose( + input=data, + num_filters=self.conv_num_filters, + filter_size=self.conv_filter_size, + groups=self.conv_groups, + padding=self.conv_padding, + bias_attr=False, + act=None) + self.feeds = { + "data": np.random.random([1, 6, 64, 64]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassConvTransposeTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [conv_out] + + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 1 + self.conv_padding = [1, 1] + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TensorRTSubgraphPassConvTransposeValidPaddingTest( + TensorRTSubgraphPassConvTransposeTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 1 + self.conv_padding = 'VALID' + + +''' +# conv2d_transpose padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete. 
+class TensorRTSubgraphPassConvTransposeSamePaddingTest(TensorRTSubgraphPassConvTransposeTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 1 + self.conv_padding = 'SAME' +''' + + +class TensorRTSubgraphPassDepthwiseConvTransposeTest( + TensorRTSubgraphPassConvTransposeTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 1 + self.conv_padding = [1, 1] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index e5cee55a31d..e4a7305f70f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -23,134 +23,6 @@ from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig -class TensorRTSubgraphPassConvTest(InferencePassTest): - def setUp(self): - self.set_params() - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 6, 64, 64], dtype="float32") - conv_out = fluid.layers.conv2d( - input=data, - num_filters=self.conv_num_filters, - filter_size=self.conv_filter_size, - groups=self.conv_groups, - padding=self.conv_padding, - bias_attr=False, - act=None) - self.feeds = { - "data": np.random.random([1, 6, 64, 64]).astype("float32"), - } - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassConvTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) - self.fetch_list = [conv_out] - - def set_params(self): - self.conv_num_filters = 6 - self.conv_filter_size = 6 - self.conv_groups = 3 - self.conv_padding = [1, 1] - - def test_check_output(self): - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) - self.assertTrue( - PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) - - -class TensorRTSubgraphPassConvValidPaddingTest(TensorRTSubgraphPassConvTest): - def set_params(self): - self.conv_num_filters = 6 - self.conv_filter_size = 6 - self.conv_groups = 3 - self.conv_padding = 'VALID' - - -''' -# conv2d padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete. 
-class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest): - def set_params(self): - self.conv_num_filters = 6 - self.conv_filter_size = 6 - self.conv_groups = 3 - self.conv_padding = 'SAME' -''' - - -class TensorRTSubgraphPassDepthwiseConvTest(TensorRTSubgraphPassConvTest): - def set_params(self): - self.conv_num_filters = 6 - self.conv_filter_size = 6 - self.conv_groups = 6 - self.conv_padding = [1, 1] - - -class TensorRTSubgraphPassConvTransposeTest(InferencePassTest): - def setUp(self): - self.set_params() - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 6, 64, 64], dtype="float32") - conv_out = fluid.layers.conv2d_transpose( - input=data, - num_filters=self.conv_num_filters, - filter_size=self.conv_filter_size, - groups=self.conv_groups, - padding=self.conv_padding, - bias_attr=False, - act=None) - self.feeds = { - "data": np.random.random([1, 6, 64, 64]).astype("float32"), - } - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassConvTransposeTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) - self.fetch_list = [conv_out] - - def set_params(self): - self.conv_num_filters = 6 - self.conv_filter_size = 6 - self.conv_groups = 1 - self.conv_padding = [1, 1] - - def test_check_output(self): - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) - self.assertTrue( - PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) - - -class TensorRTSubgraphPassConvTransposeValidPaddingTest( - TensorRTSubgraphPassConvTransposeTest): - def set_params(self): - self.conv_num_filters = 6 - self.conv_filter_size = 6 - self.conv_groups = 1 - self.conv_padding = 'VALID' - - -''' -# conv2d_transpose padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete. 
-class TensorRTSubgraphPassConvTransposeSamePaddingTest(TensorRTSubgraphPassConvTransposeTest): - def set_params(self): - self.conv_num_filters = 6 - self.conv_filter_size = 6 - self.conv_groups = 1 - self.conv_padding = 'SAME' -''' - - -class TensorRTSubgraphPassDepthwiseConvTransposeTest( - TensorRTSubgraphPassConvTransposeTest): - def set_params(self): - self.conv_num_filters = 6 - self.conv_filter_size = 6 - self.conv_groups = 1 - self.conv_padding = [1, 1] - - class TensorRTSubgraphPassFcTest(InferencePassTest): def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): @@ -282,207 +154,6 @@ class TensorRTSubgraphPassValidPaddingPoolTest(InferencePassTest): self.exclusive = False -class TensorRTSubgraphPassActivationTest(InferencePassTest): - def setUpTensorRTParam(self): - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) - - def setUp(self): - self.setUpTensorRTParam() - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 6, 64, 64], dtype="float32") - act_out = self.append_act(data) - out = fluid.layers.batch_norm(act_out, is_test=True) - self.feeds = { - "data": np.random.random([1, 6, 64, 64]).astype("float32"), - } - self.fetch_list = [out] - - def append_act(self, x): - return fluid.layers.relu(x) - - def test_check_output(self): - if core.is_compiled_with_cuda(): - use_gpu = True - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") - if self.trt_parameters.precision == AnalysisConfig.Precision.Float32: - self.check_output_with_option(use_gpu) - else: - self.check_output_with_option(use_gpu, 1e-3) - self.assertTrue( - PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) - - -class TensorRTSubgraphPassLeakyReluTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.leaky_relu(x) - - -class TensorRTSubgraphPassRelu6Test(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.relu6(x) - - -class TensorRTSubgraphPassSoftMaxTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.softmax(x) - - -class TensorRTSubgraphPassSigmoidTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.sigmoid(x) - - -class TensorRTSubgraphPassHardSwishTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.hard_swish(x) - - -class TensorRTSubgraphPassHardSigmoidTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.hard_sigmoid(x) - - -class TensorRTSubgraphPassHardSwishPluginTest( - TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.hard_swish(x, threshold=4.0, scale=8.0) - - -class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.clip(x, 0, 1) - - -class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.tanh(x) - - -class TensorRTSubgraphPassSwishTest(TensorRTSubgraphPassActivationTest): - def setUpTensorRTParam(self): - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False) - - def append_act(self, x): - return fluid.layers.swish(x) - - -class TensorRTSubgraphPassSwishFp16SerializeTest( - 
TensorRTSubgraphPassActivationTest): - def setUpTensorRTParam(self): - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) - - def append_act(self, x): - return fluid.layers.swish(x) - - -class TensorRTSubgraphPassDynamicSwishFp16SerializeTest( - TensorRTSubgraphPassActivationTest): - def setUpTensorRTParam(self): - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) - self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam( - { - 'data': [1, 6, 8, 8] - }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) - - def append_act(self, x): - return fluid.layers.swish(x) - - -class TensorRTSubgraphPassPreluAllTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.prelu(x, mode='all') - - -class TensorRTSubgraphPassPreluChannelTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.prelu(x, mode='channel') - - -class TensorRTSubgraphPassPreluElementTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.prelu(x, mode='element') - - -class TensorRTSubgraphPassGeluTest(TensorRTSubgraphPassActivationTest): - def append_act(self, x): - return fluid.layers.gelu(x) - - -class TensorRTSubgraphPassGeluDynamicTest(TensorRTSubgraphPassActivationTest): - def setUpTensorRTParam(self): - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) - self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam( - { - 'data': [1, 6, 8, 8] - }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) - - def append_act(self, x): - return fluid.layers.gelu(x) - - -class TensorRTSubgraphPassGeluFp16Test(TensorRTSubgraphPassActivationTest): - def setUpTensorRTParam(self): - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False) - - def append_act(self, x): - return fluid.layers.gelu(x) - - -class TensorRTSubgraphPassGeluFp16SerializeTest( - TensorRTSubgraphPassActivationTest): - def setUpTensorRTParam(self): - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) - - def append_act(self, x): - return fluid.layers.gelu(x) - - -class TensorRTSubgraphPassGeluFp16DynamicTest( - TensorRTSubgraphPassActivationTest): - def setUpTensorRTParam(self): - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False) - self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam( - { - 'data': [1, 6, 8, 8] - }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) - - def append_act(self, x): - return fluid.layers.gelu(x) - - -class TensorRTSubgraphPassGeluFp16DynamicSerializeTest( - TensorRTSubgraphPassActivationTest): - def setUpTensorRTParam(self): - self.enable_trt = True - self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) - self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam( - { - 'data': [1, 6, 8, 8] - 
}, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False)
-
-    def append_act(self, x):
-        return fluid.layers.gelu(x)
-
-
 class TensorRTSubgraphPassConcatTest(InferencePassTest):
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
@@ -570,7 +241,7 @@ class TensorRTSubgraphPassDynamicSplitFp16SerializeTest(InferencePassTest):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassSplitTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
-        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
+        self.dynamic_shape_params = TensorRTSubgraphPassDynamicSplitFp16SerializeTest.DynamicShapeParam(
             {
                 'data': [1, 3, 8, 64]
             }, {'data': [1, 3, 512, 64]}, {'data': [1, 3, 256, 64]}, False)
diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh
index 04594b2917a..15196e30516 100644
--- a/tools/dockerfile/ci_dockerfile.sh
+++ b/tools/dockerfile/ci_dockerfile.sh
@@ -43,8 +43,7 @@ function make_centos_dockerfile(){
   dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}')
   sed -i "${dockerfile_line}i RUN rm -f /usr/bin/cc && ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc" ${dockerfile_name}
   sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so \\
-    RUN ln -s /usr/local/lib/libnccl.so /usr/local/cuda/lib64/ \\
-    RUN rm -rf /usr/include/NvInfer*" ${dockerfile_name}
+    RUN ln -s /usr/local/lib/libnccl.so /usr/local/cuda/lib64/" ${dockerfile_name}
   sed -i $"${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \\
     RUN tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
   sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" ${dockerfile_name}
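The test and Dockerfile changes above exist so the clone/initialize paths are actually exercised in CI: the TensorRT headers are no longer stripped from the CentOS image, and the new test files get their own timeouts. For reference, the call order those plugin fixes assume looks roughly like this (a plain C++ mock for illustration, not TensorRT's real interface; real plugins derive from nvinfer1::IPluginV2 and are driven by the runtime):

#include <iostream>

// Mock plugin used only to show the call order the fixes rely on.
struct LifecycleMockPlugin {
  bool ready{false};
  int initialize() { ready = true; std::cout << "initialize\n"; return 0; }
  void terminate() { ready = false; std::cout << "terminate\n"; }
  LifecycleMockPlugin *clone() const {
    std::cout << "clone\n";
    return new LifecycleMockPlugin(*this);
  }
};

int main() {
  LifecycleMockPlugin plugin;
  plugin.initialize();                         // build: weights uploaded once
  LifecycleMockPlugin *copy = plugin.clone();  // TRT 7.1+ clones plugins internally
  copy->initialize();                          // must stay cheap after a clone
  copy->terminate();                           // per-instance GPU cleanup
  delete copy;
  plugin.terminate();
  return 0;
}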