Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
a64bea0c
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a64bea0c
编写于
2月 05, 2021
作者:
S
Shang Zhizhou
提交者:
GitHub
2月 05, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix trt plugin clone and initialize bugs in TRT7.1+ (#30709) (#30822)
Co-authored-by:
N
tianshuo78520a
<
707759223@qq.com
>
上级
d199edd8
变更
17
隐藏空白更改
内联
并排
Showing
17 changed file
with
489 addition
and
349 deletion
+489
-349
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
...inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+20
-2
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
.../inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
+9
-0
paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
...luid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
+1
-7
paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h
...fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h
+15
-1
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+7
-0
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+6
-3
paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
...uid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+11
-0
paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
...luid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
+3
-2
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+13
-0
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+7
-1
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+3
-1
python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
.../paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
+2
-0
python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
.../tests/unittests/ir/inference/test_trt_activation_pass.py
+228
-0
python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
.../fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
+155
-0
python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
...id/tests/unittests/ir/inference/test_trt_subgraph_pass.py
+1
-330
tools/dockerfile/build_scripts/build_utils.sh
tools/dockerfile/build_scripts/build_utils.sh
+6
-0
tools/dockerfile/ci_dockerfile.sh
tools/dockerfile/ci_dockerfile.sh
+2
-2
未找到文件。
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
浏览文件 @
a64bea0c
...
@@ -39,8 +39,27 @@ EmbEltwiseLayernormPluginDynamicImpl<
...
@@ -39,8 +39,27 @@ EmbEltwiseLayernormPluginDynamicImpl<
inline
half
fp32tofp16
(
float
x
)
{
return
static_cast
<
half
>
(
x
);
}
inline
half
fp32tofp16
(
float
x
)
{
return
static_cast
<
half
>
(
x
);
}
template
<
typename
T
>
void
EmbEltwiseLayernormPluginDynamicImpl
<
T
>::
shareGPUData
(
const
EmbEltwiseLayernormPluginDynamicImplBase
*
anthor
)
{
auto
*
ptr
=
dynamic_cast
<
const
EmbEltwiseLayernormPluginDynamicImpl
<
T
>
*>
(
anthor
);
if
(
!
ptr
->
is_initialized_
)
{
return
;
}
embs_gpu_
=
ptr
->
embs_gpu_
;
scale_gpu_
=
ptr
->
scale_gpu_
;
bias_gpu_
=
ptr
->
bias_gpu_
;
int
input_num
=
embs_
.
size
();
in_ptr_tensor_
.
Resize
({
input_num
});
emb_ptr_tensor_
.
ShareDataWith
(
ptr
->
emb_ptr_tensor_
);
}
template
<
typename
T
>
template
<
typename
T
>
int
EmbEltwiseLayernormPluginDynamicImpl
<
T
>::
initialize
()
{
int
EmbEltwiseLayernormPluginDynamicImpl
<
T
>::
initialize
()
{
if
(
is_initialized_
)
{
return
0
;
}
embs_gpu_
.
resize
(
embs_
.
size
());
embs_gpu_
.
resize
(
embs_
.
size
());
for
(
int
i
=
0
;
i
<
embs_
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
embs_
.
size
();
i
++
)
{
if
(
embs_
[
i
])
{
if
(
embs_
[
i
])
{
...
@@ -77,13 +96,12 @@ int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
...
@@ -77,13 +96,12 @@ int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
int
input_num
=
embs_
.
size
();
int
input_num
=
embs_
.
size
();
in_ptr_tensor_
.
Resize
({
input_num
});
in_ptr_tensor_
.
Resize
({
input_num
});
emb_ptr_tensor_
.
Resize
({
input_num
});
emb_ptr_tensor_
.
Resize
({
input_num
});
cudaGetDevice
(
&
device_id_
);
cudaGetDevice
(
&
device_id_
);
auto
emb_ptr_gpu_d
=
auto
emb_ptr_gpu_d
=
emb_ptr_tensor_
.
mutable_data
<
int64_t
>
(
platform
::
CUDAPlace
(
device_id_
));
emb_ptr_tensor_
.
mutable_data
<
int64_t
>
(
platform
::
CUDAPlace
(
device_id_
));
cudaMemcpy
(
emb_ptr_gpu_d
,
embs_gpu_
.
data
(),
sizeof
(
uintptr_t
)
*
input_num
,
cudaMemcpy
(
emb_ptr_gpu_d
,
embs_gpu_
.
data
(),
sizeof
(
uintptr_t
)
*
input_num
,
cudaMemcpyHostToDevice
);
cudaMemcpyHostToDevice
);
is_initialized_
=
true
;
return
0
;
return
0
;
}
}
...
...
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
浏览文件 @
a64bea0c
...
@@ -39,6 +39,8 @@ class EmbEltwiseLayernormPluginDynamicImplBase {
...
@@ -39,6 +39,8 @@ class EmbEltwiseLayernormPluginDynamicImplBase {
const
nvinfer1
::
PluginTensorDesc
*
outputDesc
,
const
nvinfer1
::
PluginTensorDesc
*
outputDesc
,
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
=
0
;
void
*
workspace
,
cudaStream_t
stream
)
=
0
;
virtual
void
shareGPUData
(
const
EmbEltwiseLayernormPluginDynamicImplBase
*
anthor
)
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
...
@@ -67,6 +69,7 @@ class EmbEltwiseLayernormPluginDynamicImpl
...
@@ -67,6 +69,7 @@ class EmbEltwiseLayernormPluginDynamicImpl
const
nvinfer1
::
PluginTensorDesc
*
outputDesc
,
const
nvinfer1
::
PluginTensorDesc
*
outputDesc
,
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
void
*
workspace
,
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
void
*
workspace
,
cudaStream_t
stream
);
cudaStream_t
stream
);
void
shareGPUData
(
const
EmbEltwiseLayernormPluginDynamicImplBase
*
anthor
);
private:
private:
std
::
vector
<
float
*>
embs_
;
std
::
vector
<
float
*>
embs_
;
...
@@ -87,6 +90,7 @@ class EmbEltwiseLayernormPluginDynamicImpl
...
@@ -87,6 +90,7 @@ class EmbEltwiseLayernormPluginDynamicImpl
framework
::
Tensor
in_ptr_tensor_
,
emb_ptr_tensor_
;
framework
::
Tensor
in_ptr_tensor_
,
emb_ptr_tensor_
;
int
device_id_
{
0
};
int
device_id_
{
0
};
uintptr_t
old_input_ptr_
{
0
};
uintptr_t
old_input_ptr_
{
0
};
bool
is_initialized_
{
false
};
};
};
class
EmbEltwiseLayernormPluginDynamic
:
public
DynamicPluginTensorRT
{
class
EmbEltwiseLayernormPluginDynamic
:
public
DynamicPluginTensorRT
{
...
@@ -189,6 +193,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
...
@@ -189,6 +193,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
auto
ptr
=
new
EmbEltwiseLayernormPluginDynamic
(
auto
ptr
=
new
EmbEltwiseLayernormPluginDynamic
(
embs_
,
bias_
,
scale_
,
emb_sizes_
,
bias_size_
,
scale_size_
,
hidden_size_
,
embs_
,
bias_
,
scale_
,
emb_sizes_
,
bias_size_
,
scale_size_
,
hidden_size_
,
eps_
,
with_fp16_
);
eps_
,
with_fp16_
);
ptr
->
shareGPUData
(
this
);
return
ptr
;
return
ptr
;
}
}
...
@@ -295,6 +300,10 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
...
@@ -295,6 +300,10 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
bool
own_host_buff_
{
false
};
bool
own_host_buff_
{
false
};
EmbEltwiseLayernormPluginDynamicImplBase
*
impl_
{
nullptr
};
EmbEltwiseLayernormPluginDynamicImplBase
*
impl_
{
nullptr
};
void
shareGPUData
(
const
EmbEltwiseLayernormPluginDynamic
*
anthor
)
{
impl_
->
shareGPUData
(
anthor
->
impl_
);
}
};
};
class
EmbEltwiseLayernormPluginV2Creator
:
public
nvinfer1
::
IPluginCreator
{
class
EmbEltwiseLayernormPluginV2Creator
:
public
nvinfer1
::
IPluginCreator
{
...
...
paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
浏览文件 @
a64bea0c
...
@@ -47,13 +47,7 @@ InstanceNormPlugin *CreateInstanceNormPluginDeserialize(const void *buffer,
...
@@ -47,13 +47,7 @@ InstanceNormPlugin *CreateInstanceNormPluginDeserialize(const void *buffer,
REGISTER_TRT_PLUGIN
(
"instance_norm_plugin"
,
REGISTER_TRT_PLUGIN
(
"instance_norm_plugin"
,
CreateInstanceNormPluginDeserialize
);
CreateInstanceNormPluginDeserialize
);
int
InstanceNormPlugin
::
initialize
()
{
int
InstanceNormPlugin
::
initialize
()
{
return
0
;
}
platform
::
dynload
::
cudnnCreate
(
&
handle_
);
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
x_desc_
);
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
y_desc_
);
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
b_desc_
);
return
0
;
}
nvinfer1
::
Dims
InstanceNormPlugin
::
getOutputDimensions
(
nvinfer1
::
Dims
InstanceNormPlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
)
{
int
index
,
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
)
{
...
...
paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h
浏览文件 @
a64bea0c
...
@@ -65,6 +65,10 @@ class InstanceNormPlugin : public PluginTensorRT {
...
@@ -65,6 +65,10 @@ class InstanceNormPlugin : public PluginTensorRT {
"The instanceNorm's scale and bias should be the "
"The instanceNorm's scale and bias should be the "
"same size. Got scale size = %d, but bias size = %d"
,
"same size. Got scale size = %d, but bias size = %d"
,
scale
.
size
(),
bias
.
size
()));
scale
.
size
(),
bias
.
size
()));
platform
::
dynload
::
cudnnCreate
(
&
handle_
);
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
x_desc_
);
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
y_desc_
);
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
b_desc_
);
}
}
// It was used for tensorrt deserialization.
// It was used for tensorrt deserialization.
...
@@ -74,9 +78,19 @@ class InstanceNormPlugin : public PluginTensorRT {
...
@@ -74,9 +78,19 @@ class InstanceNormPlugin : public PluginTensorRT {
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
eps_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
eps_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
scale_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
scale_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
bias_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
bias_
);
platform
::
dynload
::
cudnnCreate
(
&
handle_
);
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
x_desc_
);
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
y_desc_
);
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
b_desc_
);
}
}
~
InstanceNormPlugin
()
{}
~
InstanceNormPlugin
()
{
platform
::
dynload
::
cudnnDestroy
(
handle_
);
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
x_desc_
);
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
y_desc_
);
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
b_desc_
);
}
int
initialize
()
override
;
int
initialize
()
override
;
InstanceNormPlugin
*
clone
()
const
override
{
InstanceNormPlugin
*
clone
()
const
override
{
...
...
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
浏览文件 @
a64bea0c
...
@@ -39,6 +39,13 @@ int PReluPlugin::initialize() {
...
@@ -39,6 +39,13 @@ int PReluPlugin::initialize() {
return
0
;
return
0
;
}
}
void
PReluPlugin
::
terminate
()
{
if
(
p_gpu_weight_
)
{
cudaFree
(
p_gpu_weight_
);
p_gpu_weight_
=
nullptr
;
}
}
nvinfer1
::
Dims
PReluPlugin
::
getOutputDimensions
(
int
index
,
nvinfer1
::
Dims
PReluPlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
inputDims
,
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
)
{
int
nbInputs
)
{
...
...
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
浏览文件 @
a64bea0c
...
@@ -66,11 +66,14 @@ class PReluPlugin : public PluginTensorRT {
...
@@ -66,11 +66,14 @@ class PReluPlugin : public PluginTensorRT {
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
prelu_mode
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
prelu_mode
);
mode_
=
std
::
string
(
prelu_mode
);
mode_
=
std
::
string
(
prelu_mode
);
}
}
~
PReluPlugin
()
{
cudaFree
(
p_gpu_weight_
);
}
~
PReluPlugin
()
{}
int
initialize
()
override
;
int
initialize
()
override
;
void
terminate
()
override
;
PReluPlugin
*
clone
()
const
override
{
PReluPlugin
*
clone
()
const
override
{
return
new
PReluPlugin
(
weight_
.
data
(),
weight_
.
size
(),
mode_
);
auto
*
ptr
=
new
PReluPlugin
(
weight_
.
data
(),
weight_
.
size
(),
mode_
);
ptr
->
p_gpu_weight_
=
p_gpu_weight_
;
return
ptr
;
}
}
const
char
*
getPluginType
()
const
override
{
return
"prelu_plugin"
;
}
const
char
*
getPluginType
()
const
override
{
return
"prelu_plugin"
;
}
...
@@ -100,7 +103,7 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
...
@@ -100,7 +103,7 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
prelu_mode
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
prelu_mode
);
mode_
=
std
::
string
(
prelu_mode
);
mode_
=
std
::
string
(
prelu_mode
);
}
}
~
PReluPluginDynamic
()
{
cudaFree
(
p_gpu_weight_
);
}
~
PReluPluginDynamic
()
{}
nvinfer1
::
IPluginV2DynamicExt
*
clone
()
const
override
{
nvinfer1
::
IPluginV2DynamicExt
*
clone
()
const
override
{
auto
ptr
=
new
PReluPluginDynamic
(
weight_
.
data
(),
weight_
.
size
(),
mode_
);
auto
ptr
=
new
PReluPluginDynamic
(
weight_
.
data
(),
weight_
.
size
(),
mode_
);
ptr
->
p_gpu_weight_
=
p_gpu_weight_
;
ptr
->
p_gpu_weight_
=
p_gpu_weight_
;
...
...
paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
浏览文件 @
a64bea0c
...
@@ -40,6 +40,17 @@ int SkipLayerNormPluginDynamic::initialize() {
...
@@ -40,6 +40,17 @@ int SkipLayerNormPluginDynamic::initialize() {
return
0
;
return
0
;
}
}
void
SkipLayerNormPluginDynamic
::
terminate
()
{
if
(
bias_gpu_
)
{
cudaFree
(
bias_gpu_
);
bias_gpu_
=
nullptr
;
}
if
(
scale_gpu_
)
{
cudaFree
(
scale_gpu_
);
scale_gpu_
=
nullptr
;
}
}
nvinfer1
::
DimsExprs
SkipLayerNormPluginDynamic
::
getOutputDimensions
(
nvinfer1
::
DimsExprs
SkipLayerNormPluginDynamic
::
getOutputDimensions
(
int
output_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_inputs
,
int
output_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_inputs
,
nvinfer1
::
IExprBuilder
&
expr_builder
)
{
nvinfer1
::
IExprBuilder
&
expr_builder
)
{
...
...
paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
浏览文件 @
a64bea0c
...
@@ -104,13 +104,14 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
...
@@ -104,13 +104,14 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
int
nb_inputs
)
const
override
;
int
nb_inputs
)
const
override
;
void
destroy
()
override
{
delete
this
;
}
void
destroy
()
override
{
delete
this
;
}
void
terminate
()
override
;
private:
private:
std
::
vector
<
float
>
bias_
;
std
::
vector
<
float
>
bias_
;
std
::
vector
<
float
>
scale_
;
std
::
vector
<
float
>
scale_
;
float
*
bias_gpu_
;
float
*
bias_gpu_
{
nullptr
}
;
float
*
scale_gpu_
;
float
*
scale_gpu_
{
nullptr
}
;
int
bias_size_
;
int
bias_size_
;
int
scale_size_
;
int
scale_size_
;
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
浏览文件 @
a64bea0c
...
@@ -62,6 +62,16 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(
...
@@ -62,6 +62,16 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(
return
output_dims
;
return
output_dims
;
}
}
void
SplitPlugin
::
shareData
(
const
SplitPlugin
*
another
)
{
outer_rows_
=
another
->
outer_rows_
;
inner_cols_
=
another
->
inner_cols_
;
same_shape_
=
another
->
same_shape_
;
axis_shape_
=
another
->
axis_shape_
;
d_segment_offsets_
=
another
->
d_segment_offsets_
;
segment_offsets_
=
another
->
segment_offsets_
;
d_output_ptrs_
.
resize
(
another
->
d_output_ptrs_
.
size
(),
nullptr
);
}
int
SplitPlugin
::
initialize
()
{
int
SplitPlugin
::
initialize
()
{
PADDLE_ENFORCE_LE
(
axis_
,
nvinfer1
::
Dims
::
MAX_DIMS
,
PADDLE_ENFORCE_LE
(
axis_
,
nvinfer1
::
Dims
::
MAX_DIMS
,
platform
::
errors
::
InvalidArgument
(
platform
::
errors
::
InvalidArgument
(
...
@@ -93,6 +103,9 @@ int SplitPlugin::initialize() {
...
@@ -93,6 +103,9 @@ int SplitPlugin::initialize() {
return
0
;
return
0
;
}
}
// nothing to release according to initialize
void
SplitPlugin
::
terminate
()
{}
// The following part of the code refers to onnx-tensorrt
// The following part of the code refers to onnx-tensorrt
// https://github.com/onnx/onnx-tensorrt/blob/master/Split.cu
// https://github.com/onnx/onnx-tensorrt/blob/master/Split.cu
template
<
typename
T
>
template
<
typename
T
>
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
浏览文件 @
a64bea0c
...
@@ -40,7 +40,9 @@ class SplitPlugin : public PluginTensorRT {
...
@@ -40,7 +40,9 @@ class SplitPlugin : public PluginTensorRT {
}
}
SplitPlugin
*
clone
()
const
override
{
SplitPlugin
*
clone
()
const
override
{
return
new
SplitPlugin
(
axis_
,
output_length_
,
with_fp16_
);
auto
*
ptr
=
new
SplitPlugin
(
axis_
,
output_length_
,
with_fp16_
);
ptr
->
shareData
(
this
);
return
ptr
;
}
}
const
char
*
getPluginType
()
const
override
{
return
"split_plugin"
;
}
const
char
*
getPluginType
()
const
override
{
return
"split_plugin"
;
}
...
@@ -50,6 +52,7 @@ class SplitPlugin : public PluginTensorRT {
...
@@ -50,6 +52,7 @@ class SplitPlugin : public PluginTensorRT {
int
num_inputs
)
override
;
int
num_inputs
)
override
;
int
initialize
()
override
;
int
initialize
()
override
;
void
terminate
()
override
;
int
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
int
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
void
*
workspace
,
cudaStream_t
stream
)
override
;
...
@@ -75,6 +78,9 @@ class SplitPlugin : public PluginTensorRT {
...
@@ -75,6 +78,9 @@ class SplitPlugin : public PluginTensorRT {
std
::
vector
<
int
>
segment_offsets_
;
std
::
vector
<
int
>
segment_offsets_
;
thrust
::
device_vector
<
int
>
d_segment_offsets_
;
thrust
::
device_vector
<
int
>
d_segment_offsets_
;
thrust
::
device_vector
<
float
*>
d_output_ptrs_
;
thrust
::
device_vector
<
float
*>
d_output_ptrs_
;
private:
void
shareData
(
const
SplitPlugin
*
another
);
};
};
#if IS_TRT_VERSION_GE(6000)
#if IS_TRT_VERSION_GE(6000)
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
a64bea0c
...
@@ -642,7 +642,9 @@ set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120)
...
@@ -642,7 +642,9 @@ set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120)
set_tests_properties
(
test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT 120
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set_tests_properties
(
trt_mobilenet_test PROPERTIES TIMEOUT 120
)
set_tests_properties
(
trt_mobilenet_test PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120
)
if
(
WITH_MKLDNN
)
set_tests_properties
(
test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120
)
endif
()
endif
()
endif
()
if
(
ON_INFER OR WITH_GPU
)
if
(
ON_INFER OR WITH_GPU
)
set_tests_properties
(
test_analyzer_transformer_profile PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_transformer_profile PROPERTIES TIMEOUT 120
)
...
...
python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
浏览文件 @
a64bea0c
...
@@ -30,4 +30,6 @@ foreach(target ${TEST_INFERENCE_IR_PASSES})
...
@@ -30,4 +30,6 @@ foreach(target ${TEST_INFERENCE_IR_PASSES})
endforeach
()
endforeach
()
if
(
WITH_GPU AND TENSORRT_FOUND
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set_tests_properties
(
test_trt_subgraph_pass PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_trt_subgraph_pass PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_trt_activation_pass PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_trt_conv_pass PROPERTIES TIMEOUT 120
)
endif
()
endif
()
python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
0 → 100644
浏览文件 @
a64bea0c
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
PassVersionChecker
from
paddle.fluid.core
import
AnalysisConfig
class
TensorRTSubgraphPassActivationTest
(
InferencePassTest
):
def
setUpTensorRTParam
(
self
):
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassActivationTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Float32
,
False
,
False
)
def
setUp
(
self
):
self
.
setUpTensorRTParam
()
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
fluid
.
data
(
name
=
"data"
,
shape
=
[
-
1
,
6
,
64
,
64
],
dtype
=
"float32"
)
act_out
=
self
.
append_act
(
data
)
out
=
fluid
.
layers
.
batch_norm
(
act_out
,
is_test
=
True
)
self
.
feeds
=
{
"data"
:
np
.
random
.
random
([
1
,
6
,
64
,
64
]).
astype
(
"float32"
),
}
self
.
fetch_list
=
[
out
]
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
relu
(
x
)
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
use_gpu
=
True
if
os
.
path
.
exists
(
self
.
path
+
"_opt_cache"
):
shutil
.
rmtree
(
self
.
path
+
"_opt_cache"
)
if
self
.
trt_parameters
.
precision
==
AnalysisConfig
.
Precision
.
Float32
:
self
.
check_output_with_option
(
use_gpu
)
else
:
self
.
check_output_with_option
(
use_gpu
,
1e-3
)
self
.
assertTrue
(
PassVersionChecker
.
IsCompatible
(
'tensorrt_subgraph_pass'
))
class
TensorRTSubgraphPassLeakyReluTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
leaky_relu
(
x
)
class
TensorRTSubgraphPassRelu6Test
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
relu6
(
x
)
class
TensorRTSubgraphPassSoftMaxTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
softmax
(
x
)
class
TensorRTSubgraphPassSigmoidTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
sigmoid
(
x
)
class
TensorRTSubgraphPassHardSwishTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
hard_swish
(
x
)
class
TensorRTSubgraphPassHardSigmoidTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
hard_sigmoid
(
x
)
class
TensorRTSubgraphPassHardSwishPluginTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
hard_swish
(
x
,
threshold
=
4.0
,
scale
=
8.0
)
class
TensorRTSubgraphPassClipTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
clip
(
x
,
0
,
1
)
class
TensorRTSubgraphPassTanhTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
tanh
(
x
)
class
TensorRTSubgraphPassSwishTest
(
TensorRTSubgraphPassActivationTest
):
def
setUpTensorRTParam
(
self
):
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassActivationTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Float32
,
True
,
False
)
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
swish
(
x
)
class
TensorRTSubgraphPassSwishFp16SerializeTest
(
TensorRTSubgraphPassActivationTest
):
def
setUpTensorRTParam
(
self
):
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassActivationTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Half
,
True
,
False
)
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
swish
(
x
)
class
TensorRTSubgraphPassDynamicSwishFp16SerializeTest
(
TensorRTSubgraphPassActivationTest
):
def
setUpTensorRTParam
(
self
):
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassActivationTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Half
,
True
,
False
)
self
.
dynamic_shape_params
=
TensorRTSubgraphPassActivationTest
.
DynamicShapeParam
(
{
'data'
:
[
1
,
6
,
8
,
8
]
},
{
'data'
:
[
1
,
6
,
512
,
512
]},
{
'data'
:
[
1
,
6
,
256
,
256
]},
False
)
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
swish
(
x
)
class
TensorRTSubgraphPassPreluAllTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
prelu
(
x
,
mode
=
'all'
)
class
TensorRTSubgraphPassPreluChannelTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
prelu
(
x
,
mode
=
'channel'
)
class
TensorRTSubgraphPassPreluElementTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
prelu
(
x
,
mode
=
'element'
)
class
TensorRTSubgraphPassGeluTest
(
TensorRTSubgraphPassActivationTest
):
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
gelu
(
x
)
class
TensorRTSubgraphPassGeluDynamicTest
(
TensorRTSubgraphPassActivationTest
):
def
setUpTensorRTParam
(
self
):
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassActivationTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Float32
,
False
,
False
)
self
.
dynamic_shape_params
=
TensorRTSubgraphPassActivationTest
.
DynamicShapeParam
(
{
'data'
:
[
1
,
6
,
8
,
8
]
},
{
'data'
:
[
1
,
6
,
512
,
512
]},
{
'data'
:
[
1
,
6
,
256
,
256
]},
False
)
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
gelu
(
x
)
class
TensorRTSubgraphPassGeluFp16Test
(
TensorRTSubgraphPassActivationTest
):
def
setUpTensorRTParam
(
self
):
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassActivationTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Half
,
False
,
False
)
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
gelu
(
x
)
class
TensorRTSubgraphPassGeluFp16SerializeTest
(
TensorRTSubgraphPassActivationTest
):
def
setUpTensorRTParam
(
self
):
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassActivationTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Half
,
True
,
False
)
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
gelu
(
x
)
class
TensorRTSubgraphPassGeluFp16DynamicTest
(
TensorRTSubgraphPassActivationTest
):
def
setUpTensorRTParam
(
self
):
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassActivationTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Half
,
False
,
False
)
self
.
dynamic_shape_params
=
TensorRTSubgraphPassActivationTest
.
DynamicShapeParam
(
{
'data'
:
[
1
,
6
,
8
,
8
]
},
{
'data'
:
[
1
,
6
,
512
,
512
]},
{
'data'
:
[
1
,
6
,
256
,
256
]},
False
)
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
gelu
(
x
)
class
TensorRTSubgraphPassGeluFp16DynamicSerializeTest
(
TensorRTSubgraphPassActivationTest
):
def
setUpTensorRTParam
(
self
):
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassActivationTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Half
,
True
,
False
)
self
.
dynamic_shape_params
=
TensorRTSubgraphPassActivationTest
.
DynamicShapeParam
(
{
'data'
:
[
1
,
6
,
8
,
8
]
},
{
'data'
:
[
1
,
6
,
512
,
512
]},
{
'data'
:
[
1
,
6
,
256
,
256
]},
False
)
def
append_act
(
self
,
x
):
return
fluid
.
layers
.
gelu
(
x
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
0 → 100644
浏览文件 @
a64bea0c
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
PassVersionChecker
from
paddle.fluid.core
import
AnalysisConfig
class
TensorRTSubgraphPassConvTest
(
InferencePassTest
):
def
setUp
(
self
):
self
.
set_params
()
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
fluid
.
data
(
name
=
"data"
,
shape
=
[
-
1
,
6
,
64
,
64
],
dtype
=
"float32"
)
conv_out
=
fluid
.
layers
.
conv2d
(
input
=
data
,
num_filters
=
self
.
conv_num_filters
,
filter_size
=
self
.
conv_filter_size
,
groups
=
self
.
conv_groups
,
padding
=
self
.
conv_padding
,
bias_attr
=
False
,
act
=
None
)
self
.
feeds
=
{
"data"
:
np
.
random
.
random
([
1
,
6
,
64
,
64
]).
astype
(
"float32"
),
}
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassConvTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Float32
,
False
,
False
)
self
.
fetch_list
=
[
conv_out
]
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
3
self
.
conv_padding
=
[
1
,
1
]
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
use_gpu
=
True
self
.
check_output_with_option
(
use_gpu
)
self
.
assertTrue
(
PassVersionChecker
.
IsCompatible
(
'tensorrt_subgraph_pass'
))
class
TensorRTSubgraphPassConvValidPaddingTest
(
TensorRTSubgraphPassConvTest
):
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
3
self
.
conv_padding
=
'VALID'
'''
# conv2d padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete.
class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest):
def set_params(self):
self.conv_num_filters = 6
self.conv_filter_size = 6
self.conv_groups = 3
self.conv_padding = 'SAME'
'''
class
TensorRTSubgraphPassDepthwiseConvTest
(
TensorRTSubgraphPassConvTest
):
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
6
self
.
conv_padding
=
[
1
,
1
]
class
TensorRTSubgraphPassConvTransposeTest
(
InferencePassTest
):
def
setUp
(
self
):
self
.
set_params
()
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
fluid
.
data
(
name
=
"data"
,
shape
=
[
-
1
,
6
,
64
,
64
],
dtype
=
"float32"
)
conv_out
=
fluid
.
layers
.
conv2d_transpose
(
input
=
data
,
num_filters
=
self
.
conv_num_filters
,
filter_size
=
self
.
conv_filter_size
,
groups
=
self
.
conv_groups
,
padding
=
self
.
conv_padding
,
bias_attr
=
False
,
act
=
None
)
self
.
feeds
=
{
"data"
:
np
.
random
.
random
([
1
,
6
,
64
,
64
]).
astype
(
"float32"
),
}
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassConvTransposeTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Float32
,
False
,
False
)
self
.
fetch_list
=
[
conv_out
]
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
1
self
.
conv_padding
=
[
1
,
1
]
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
use_gpu
=
True
self
.
check_output_with_option
(
use_gpu
)
self
.
assertTrue
(
PassVersionChecker
.
IsCompatible
(
'tensorrt_subgraph_pass'
))
class
TensorRTSubgraphPassConvTransposeValidPaddingTest
(
TensorRTSubgraphPassConvTransposeTest
):
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
1
self
.
conv_padding
=
'VALID'
'''
# conv2d_transpose padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete.
class TensorRTSubgraphPassConvTransposeSamePaddingTest(TensorRTSubgraphPassConvTransposeTest):
def set_params(self):
self.conv_num_filters = 6
self.conv_filter_size = 6
self.conv_groups = 1
self.conv_padding = 'SAME'
'''
class TensorRTSubgraphPassDepthwiseConvTransposeTest(
        TensorRTSubgraphPassConvTransposeTest):
    """Transpose-conv test exercising the depthwise configuration path."""

    def set_params(self):
        self.conv_num_filters = 6
        self.conv_filter_size = 6
        self.conv_groups = 1
        self.conv_padding = [1, 1]
# Run the whole suite when invoked directly.
if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
浏览文件 @
a64bea0c
...
@@ -23,134 +23,6 @@ from paddle.fluid.core import PassVersionChecker
...
@@ -23,134 +23,6 @@ from paddle.fluid.core import PassVersionChecker
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
AnalysisConfig
class
TensorRTSubgraphPassConvTest
(
InferencePassTest
):
def
setUp
(
self
):
self
.
set_params
()
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
fluid
.
data
(
name
=
"data"
,
shape
=
[
-
1
,
6
,
64
,
64
],
dtype
=
"float32"
)
conv_out
=
fluid
.
layers
.
conv2d
(
input
=
data
,
num_filters
=
self
.
conv_num_filters
,
filter_size
=
self
.
conv_filter_size
,
groups
=
self
.
conv_groups
,
padding
=
self
.
conv_padding
,
bias_attr
=
False
,
act
=
None
)
self
.
feeds
=
{
"data"
:
np
.
random
.
random
([
1
,
6
,
64
,
64
]).
astype
(
"float32"
),
}
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassConvTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Float32
,
False
,
False
)
self
.
fetch_list
=
[
conv_out
]
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
3
self
.
conv_padding
=
[
1
,
1
]
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
use_gpu
=
True
self
.
check_output_with_option
(
use_gpu
)
self
.
assertTrue
(
PassVersionChecker
.
IsCompatible
(
'tensorrt_subgraph_pass'
))
class
TensorRTSubgraphPassConvValidPaddingTest
(
TensorRTSubgraphPassConvTest
):
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
3
self
.
conv_padding
=
'VALID'
'''
# conv2d padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete.
class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest):
def set_params(self):
self.conv_num_filters = 6
self.conv_filter_size = 6
self.conv_groups = 3
self.conv_padding = 'SAME'
'''
class
TensorRTSubgraphPassDepthwiseConvTest
(
TensorRTSubgraphPassConvTest
):
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
6
self
.
conv_padding
=
[
1
,
1
]
class
TensorRTSubgraphPassConvTransposeTest
(
InferencePassTest
):
def
setUp
(
self
):
self
.
set_params
()
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
fluid
.
data
(
name
=
"data"
,
shape
=
[
-
1
,
6
,
64
,
64
],
dtype
=
"float32"
)
conv_out
=
fluid
.
layers
.
conv2d_transpose
(
input
=
data
,
num_filters
=
self
.
conv_num_filters
,
filter_size
=
self
.
conv_filter_size
,
groups
=
self
.
conv_groups
,
padding
=
self
.
conv_padding
,
bias_attr
=
False
,
act
=
None
)
self
.
feeds
=
{
"data"
:
np
.
random
.
random
([
1
,
6
,
64
,
64
]).
astype
(
"float32"
),
}
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassConvTransposeTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Float32
,
False
,
False
)
self
.
fetch_list
=
[
conv_out
]
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
1
self
.
conv_padding
=
[
1
,
1
]
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
use_gpu
=
True
self
.
check_output_with_option
(
use_gpu
)
self
.
assertTrue
(
PassVersionChecker
.
IsCompatible
(
'tensorrt_subgraph_pass'
))
class
TensorRTSubgraphPassConvTransposeValidPaddingTest
(
TensorRTSubgraphPassConvTransposeTest
):
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
1
self
.
conv_padding
=
'VALID'
'''
# conv2d_transpose padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete.
class TensorRTSubgraphPassConvTransposeSamePaddingTest(TensorRTSubgraphPassConvTransposeTest):
def set_params(self):
self.conv_num_filters = 6
self.conv_filter_size = 6
self.conv_groups = 1
self.conv_padding = 'SAME'
'''
class
TensorRTSubgraphPassDepthwiseConvTransposeTest
(
TensorRTSubgraphPassConvTransposeTest
):
def
set_params
(
self
):
self
.
conv_num_filters
=
6
self
.
conv_filter_size
=
6
self
.
conv_groups
=
1
self
.
conv_padding
=
[
1
,
1
]
class
TensorRTSubgraphPassFcTest
(
InferencePassTest
):
class
TensorRTSubgraphPassFcTest
(
InferencePassTest
):
def
setUp
(
self
):
def
setUp
(
self
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
...
@@ -282,207 +154,6 @@ class TensorRTSubgraphPassValidPaddingPoolTest(InferencePassTest):
...
@@ -282,207 +154,6 @@ class TensorRTSubgraphPassValidPaddingPoolTest(InferencePassTest):
self
.
exclusive
=
False
self
.
exclusive
=
False
class TensorRTSubgraphPassActivationTest(InferencePassTest):
    """Base activation test: builds act -> batch_norm and checks that the
    TensorRT subgraph pass reproduces the native result. Subclasses override
    append_act() (and optionally setUpTensorRTParam) to cover each op."""

    def setUpTensorRTParam(self):
        self.enable_trt = True
        # FP32, no serialization; precision/serialization are overridden
        # by the FP16 / serialize subclasses.
        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)

    def setUp(self):
        self.setUpTensorRTParam()
        with fluid.program_guard(self.main_program, self.startup_program):
            data = fluid.data(
                name="data", shape=[-1, 6, 64, 64], dtype="float32")
            act_out = self.append_act(data)
            out = fluid.layers.batch_norm(act_out, is_test=True)
        self.feeds = {
            "data": np.random.random([1, 6, 64, 64]).astype("float32"),
        }
        self.fetch_list = [out]

    def append_act(self, x):
        # Default activation; subclasses substitute their own op.
        return fluid.layers.relu(x)

    def test_check_output(self):
        if core.is_compiled_with_cuda():
            use_gpu = True
            # Drop any serialized engine left by a previous run so the
            # serialization path is actually exercised again.
            if os.path.exists(self.path + "_opt_cache"):
                shutil.rmtree(self.path + "_opt_cache")
            if self.trt_parameters.precision == AnalysisConfig.Precision.Float32:
                self.check_output_with_option(use_gpu)
            else:
                # FP16 needs a looser tolerance.
                self.check_output_with_option(use_gpu, 1e-3)
            self.assertTrue(
                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
class TensorRTSubgraphPassLeakyReluTest(TensorRTSubgraphPassActivationTest):
    """Covers leaky_relu under the TensorRT subgraph pass."""

    def append_act(self, x):
        return fluid.layers.leaky_relu(x)
class TensorRTSubgraphPassRelu6Test(TensorRTSubgraphPassActivationTest):
    """Covers relu6 under the TensorRT subgraph pass."""

    def append_act(self, x):
        return fluid.layers.relu6(x)
class TensorRTSubgraphPassSoftMaxTest(TensorRTSubgraphPassActivationTest):
    """Covers softmax under the TensorRT subgraph pass."""

    def append_act(self, x):
        return fluid.layers.softmax(x)
class TensorRTSubgraphPassSigmoidTest(TensorRTSubgraphPassActivationTest):
    """Covers sigmoid under the TensorRT subgraph pass."""

    def append_act(self, x):
        return fluid.layers.sigmoid(x)
class TensorRTSubgraphPassHardSwishTest(TensorRTSubgraphPassActivationTest):
    """Covers hard_swish (default attributes) under the TensorRT pass."""

    def append_act(self, x):
        return fluid.layers.hard_swish(x)
class TensorRTSubgraphPassHardSwishPluginTest(
        TensorRTSubgraphPassActivationTest):
    """Non-default hard_swish attributes force the TRT plugin path."""

    def append_act(self, x):
        return fluid.layers.hard_swish(x, threshold=4.0, scale=8.0)
class TensorRTSubgraphPassHardSigmoidTest(TensorRTSubgraphPassActivationTest):
    """Covers hard_sigmoid under the TensorRT subgraph pass."""

    def append_act(self, x):
        return fluid.layers.hard_sigmoid(x)
class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest):
    """Covers clip(x, 0, 1) under the TensorRT subgraph pass."""

    def append_act(self, x):
        return fluid.layers.clip(x, 0, 1)
class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest):
    """Covers tanh under the TensorRT subgraph pass."""

    def append_act(self, x):
        return fluid.layers.tanh(x)
class TensorRTSubgraphPassSwishTest(TensorRTSubgraphPassActivationTest):
    """Covers swish with engine serialization enabled (FP32)."""

    def setUpTensorRTParam(self):
        self.enable_trt = True
        # use_static=True turns on engine serialization.
        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False)

    def append_act(self, x):
        return fluid.layers.swish(x)
class TensorRTSubgraphPassSwishFp16SerializeTest(
        TensorRTSubgraphPassActivationTest):
    """Covers swish with FP16 precision plus engine serialization."""

    def setUpTensorRTParam(self):
        self.enable_trt = True
        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)

    def append_act(self, x):
        return fluid.layers.swish(x)
class TensorRTSubgraphPassDynamicSwishFp16SerializeTest(
        TensorRTSubgraphPassActivationTest):
    """Covers swish with FP16, serialization, and dynamic input shapes."""

    def setUpTensorRTParam(self):
        self.enable_trt = True
        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
        # min / max / optimal shape profiles for the dynamic-shape engine.
        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 512, 512]},
            {'data': [1, 6, 256, 256]}, False)

    def append_act(self, x):
        return fluid.layers.swish(x)
class TensorRTSubgraphPassPreluAllTest(TensorRTSubgraphPassActivationTest):
    """Covers prelu with a single shared alpha (mode='all')."""

    def append_act(self, x):
        return fluid.layers.prelu(x, mode='all')
class TensorRTSubgraphPassPreluChannelTest(TensorRTSubgraphPassActivationTest):
    """Covers prelu with per-channel alpha (mode='channel')."""

    def append_act(self, x):
        return fluid.layers.prelu(x, mode='channel')
class TensorRTSubgraphPassPreluElementTest(TensorRTSubgraphPassActivationTest):
    """Covers prelu with per-element alpha (mode='element')."""

    def append_act(self, x):
        return fluid.layers.prelu(x, mode='element')
class TensorRTSubgraphPassGeluTest(TensorRTSubgraphPassActivationTest):
    """Covers gelu under the TensorRT subgraph pass (FP32, static shape)."""

    def append_act(self, x):
        return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluDynamicTest(TensorRTSubgraphPassActivationTest):
    """Covers gelu with FP32 and dynamic input shapes."""

    def setUpTensorRTParam(self):
        self.enable_trt = True
        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
        # min / max / optimal shape profiles for the dynamic-shape engine.
        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 512, 512]},
            {'data': [1, 6, 256, 256]}, False)

    def append_act(self, x):
        return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluFp16Test(TensorRTSubgraphPassActivationTest):
    """Covers gelu with FP16 precision (no serialization, static shape)."""

    def setUpTensorRTParam(self):
        self.enable_trt = True
        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False)

    def append_act(self, x):
        return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluFp16SerializeTest(
        TensorRTSubgraphPassActivationTest):
    """Covers gelu with FP16 precision plus engine serialization."""

    def setUpTensorRTParam(self):
        self.enable_trt = True
        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)

    def append_act(self, x):
        return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluFp16DynamicTest(
        TensorRTSubgraphPassActivationTest):
    """Covers gelu with FP16 precision and dynamic input shapes."""

    def setUpTensorRTParam(self):
        self.enable_trt = True
        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False)
        # min / max / optimal shape profiles for the dynamic-shape engine.
        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 512, 512]},
            {'data': [1, 6, 256, 256]}, False)

    def append_act(self, x):
        return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluFp16DynamicSerializeTest(
        TensorRTSubgraphPassActivationTest):
    """Covers gelu with FP16, engine serialization, and dynamic shapes."""

    def setUpTensorRTParam(self):
        self.enable_trt = True
        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
        # min / max / optimal shape profiles for the dynamic-shape engine.
        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 512, 512]},
            {'data': [1, 6, 256, 256]}, False)

    def append_act(self, x):
        return fluid.layers.gelu(x)
class
TensorRTSubgraphPassConcatTest
(
InferencePassTest
):
class
TensorRTSubgraphPassConcatTest
(
InferencePassTest
):
def
setUp
(
self
):
def
setUp
(
self
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
...
@@ -570,7 +241,7 @@ class TensorRTSubgraphPassDynamicSplitFp16SerializeTest(InferencePassTest):
...
@@ -570,7 +241,7 @@ class TensorRTSubgraphPassDynamicSplitFp16SerializeTest(InferencePassTest):
self
.
enable_trt
=
True
self
.
enable_trt
=
True
self
.
trt_parameters
=
TensorRTSubgraphPassSplitTest
.
TensorRTParam
(
self
.
trt_parameters
=
TensorRTSubgraphPassSplitTest
.
TensorRTParam
(
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Half
,
True
,
False
)
1
<<
30
,
32
,
0
,
AnalysisConfig
.
Precision
.
Half
,
True
,
False
)
self
.
dynamic_shape_params
=
TensorRTSubgraphPass
Activation
Test
.
DynamicShapeParam
(
self
.
dynamic_shape_params
=
TensorRTSubgraphPass
DynamicSplitFp16Serialize
Test
.
DynamicShapeParam
(
{
{
'data'
:
[
1
,
3
,
8
,
64
]
'data'
:
[
1
,
3
,
8
,
64
]
},
{
'data'
:
[
1
,
3
,
512
,
64
]},
{
'data'
:
[
1
,
3
,
256
,
64
]},
False
)
},
{
'data'
:
[
1
,
3
,
512
,
64
]},
{
'data'
:
[
1
,
3
,
256
,
64
]},
False
)
...
...
tools/dockerfile/build_scripts/build_utils.sh
浏览文件 @
a64bea0c
...
@@ -130,6 +130,12 @@ function build_cpython {
...
@@ -130,6 +130,12 @@ function build_cpython {
function
build_cpythons
{
function
build_cpythons
{
for
py_ver
in
$@
;
do
for
py_ver
in
$@
;
do
if
[
${
py_ver
}
==
"2.7.15"
]
;
then
GET_PIP_URL
=
"https://bootstrap.pypa.io/2.7/get-pip.py"
elif
[
${
py_ver
}
==
"3.5.1"
]
;
then
GET_PIP_URL
=
"https://bootstrap.pypa.io/3.5/get-pip.py"
fi
check_var
$GET_PIP_URL
check_var
$GET_PIP_URL
curl
-sLO
$GET_PIP_URL
curl
-sLO
$GET_PIP_URL
build_cpython
$py_ver
build_cpython
$py_ver
...
...
tools/dockerfile/ci_dockerfile.sh
浏览文件 @
a64bea0c
...
@@ -41,9 +41,9 @@ function make_centos_dockerfile(){
...
@@ -41,9 +41,9 @@ function make_centos_dockerfile(){
sed
"s/<baseimg>/11.0-cudnn8-devel-centos7/g"
Dockerfile.centos
>
${
dockerfile_name
}
sed
"s/<baseimg>/11.0-cudnn8-devel-centos7/g"
Dockerfile.centos
>
${
dockerfile_name
}
sed
-i
"s#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g"
${
dockerfile_name
}
sed
-i
"s#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g"
${
dockerfile_name
}
dockerfile_line
=
$(
wc
-l
${
dockerfile_name
}
|awk
'{print $1}'
)
dockerfile_line
=
$(
wc
-l
${
dockerfile_name
}
|awk
'{print $1}'
)
sed
-i
"
${
dockerfile_line
}
i RUN rm -f /usr/bin/cc && ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc"
${
dockerfile_name
}
sed
-i
"
${
dockerfile_line
}
i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so
\\
sed
-i
"
${
dockerfile_line
}
i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so
\\
RUN ln -s /usr/local/lib/libnccl.so /usr/local/cuda/lib64/
\\
RUN ln -s /usr/local/lib/libnccl.so /usr/local/cuda/lib64/"
${
dockerfile_name
}
RUN rm -rf /usr/include/NvInfer*"
${
dockerfile_name
}
sed
-i
$"
${
dockerfile_line
}
i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz
\\
sed
-i
$"
${
dockerfile_line
}
i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz
\\
RUN tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/"
${
dockerfile_name
}
RUN tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/"
${
dockerfile_name
}
sed
-i
"s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82
\n
RUN mv /usr/bin/cc /usr/bin/cc.bak
\&\&
ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc
\n
ENV PATH=/usr/local/gcc-8.2/bin:
\$
PATH
\n
RUN bash build_scripts/build.sh#g"
${
dockerfile_name
}
sed
-i
"s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82
\n
RUN mv /usr/bin/cc /usr/bin/cc.bak
\&\&
ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc
\n
ENV PATH=/usr/local/gcc-8.2/bin:
\$
PATH
\n
RUN bash build_scripts/build.sh#g"
${
dockerfile_name
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录