PaddlePaddle/Paddle · commit 2012aeb6 (unverified signature)

add trt pool and ut (#41258)

Authored on Apr 02, 2022 by Wilber; committed via GitHub on Apr 02, 2022.
Parent: ad0c106c

Showing 11 changed files with 289 additions and 119 deletions (+289 −119):
- paddle/infrt/dialect/tensorrt/convert.h (+155 −1)
- paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td (+2 −1)
- paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc (+0 −27)
- paddle/infrt/dialect/tensorrt/trt_ops.td (+4 −1)
- paddle/infrt/kernel/tensorrt/trt_helper.h (+5 −5)
- paddle/infrt/kernel/tensorrt/trt_kernels.cc (+3 −0)
- paddle/infrt/kernel/tensorrt/trt_layers.h (+54 −2)
- paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir (+0 −37)
- paddle/infrt/tests/dialect/tensorrt/disabled_trt_activation.mlir (+21 −0)
- paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir (+24 −45)
- paddle/infrt/tests/dialect/tensorrt/disabled_trt_pool.mlir (+21 −0)
paddle/infrt/dialect/tensorrt/convert.h (view file @ 2012aeb6)

@@ -14,17 +14,49 @@
#pragma once
#include <glog/logging.h>
#include <llvm/Support/ErrorHandling.h>
#include <llvm/include/mlir/IR/Attributes.h>
#include <mlir/IR/Builders.h>
#include <mlir/IR/BuiltinAttributes.h>
#include <mlir/IR/PatternMatch.h>
#include <mlir/Transforms/DialectConversion.h>

#include "paddle/infrt/dialect/infrt/common/types.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
#include "paddle/infrt/kernel/tensorrt/trt_helper.h"

namespace infrt {
namespace trt {

#ifdef INFRT_WITH_TRT
#define STRING_TO_ENUM_TYPE(enum_type) enum_type
#define STRING_TO_ENUM_VALUE(enum_value) enum_value
#include <NvInfer.h>
#else  // INFRT_WITH_TRT
#define STRING_TO_ENUM_TYPE(enum_type) std::string
#define STRING_TO_ENUM_VALUE(enum_value) #enum_value
#endif  // INFRT_WITH_TRT

template <typename T>
::mlir::IntegerAttr createNvinferEnumAttr(
    ::mlir::PatternRewriter &rewriter,  // NOLINT
    T enum_value) {
  return rewriter.getSI32IntegerAttr((int32_t)enum_value);
}

template <>
::mlir::IntegerAttr createNvinferEnumAttr<std::string>(
    ::mlir::PatternRewriter &rewriter, std::string enum_value) {  // NOLINT
  (void)enum_value;
  return rewriter.getSI32IntegerAttr(-1);
}

static mlir::Value createTRTConv2dOp(mlir::PatternRewriter &rewriter,  // NOLINT
                                     mlir::Operation *op) {
  auto conv_op = ::llvm::dyn_cast<infrt::pd::Conv2dOp>(op);
...

@@ -205,5 +237,127 @@ static mlir::Value createTRTShuffledOp(
  return rewriter.create<trt::ShuffleOp>(
      op->getLoc(), resultTypes, operands, attributes);
}

inline mlir::IntegerAttr CreatePoolingType(
    mlir::PatternRewriter &builder,  // NOLINT
    mlir::StringAttr pool_type) {
  // Map the paddle pool_type string to the TensorRT pooling enum.
  auto ptype = pool_type.str();
  if (ptype == "max") {
    return createNvinferEnumAttr(builder, nvinfer1::PoolingType::kMAX);
  } else if (ptype == "avg") {
    return createNvinferEnumAttr(builder, nvinfer1::PoolingType::kAVERAGE);
  } else {
    llvm_unreachable("unknown pool_type.");
    return {};
  }
}

inline mlir::IntegerAttr CreatePaddingMode(
    mlir::PatternRewriter &builder,  // NOLINT
    mlir::StringAttr padding_algorithm,
    mlir::BoolAttr ceil_mode) {
  // TODO(Inference): The phi pool kernel does not seem to process ceil_mode.
  auto padding_algo = padding_algorithm.str();
  if (padding_algo == "SAME") {
    return createNvinferEnumAttr(builder, nvinfer1::PaddingMode::kSAME_UPPER);
  }
  if (ceil_mode.getValue() && padding_algo != "SAME") {
    return createNvinferEnumAttr(builder,
                                 nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP);
  } else {
    return createNvinferEnumAttr(builder,
                                 nvinfer1::PaddingMode::kEXPLICIT_ROUND_DOWN);
  }
}

inline ::llvm::SmallVector<::mlir::Value, 4> CreatePaddleTrtPoolingOp(
    mlir::PatternRewriter &builder,  // NOLINT
    mlir::Value input,
    mlir::StringAttr pool_type,
    mlir::ArrayAttr ksize,
    mlir::BoolAttr global_pooling,
    mlir::ArrayAttr strides,
    mlir::ArrayAttr paddings,
    mlir::BoolAttr exclusive,
    mlir::BoolAttr adaptive,
    mlir::BoolAttr ceil_mode,
    mlir::StringAttr data_format,
    mlir::StringAttr padding_algorithm) {
  ::llvm::SmallVector<::mlir::Value, 4> tblgen_repl_values;

  // TODO(inference): Support NHWC.
  if (data_format.str() != "NCHW") {
    CHECK(false) << "The pool2d converter now only supports NCHW.";
  }

  // TODO(Wilber): How to support dynamic shape?

  auto *input_producer = input.getDefiningOp();

  // Process pool_type.
  auto pool_type_attr = CreatePoolingType(builder, pool_type);

  // Update padding.
  auto padding_algorithm_str = padding_algorithm.str();
  auto paddings_attr = paddings;
  if (padding_algorithm_str == "EXPLICIT") {
    // Do nothing on paddings.
  } else if (padding_algorithm_str == "SAME") {
    // We should process this case in the trt network build phase.
  } else if (padding_algorithm_str == "VALID") {
    // Set padding to zero.
    paddings_attr = builder.getI32ArrayAttr({0, 0});
  } else {
    CHECK(false) << "Unknown padding_algorithm.";
  }

  // If global_pooling == true or adaptive == true, padding will be ignored.
  if (global_pooling.getValue() || adaptive.getValue()) {
    paddings_attr = builder.getI32ArrayAttr({0, 0});
  }
  // If global_pooling == true, then we should update the kernel size to the
  // input dims.
  if (global_pooling.getValue() == true) {
    // Update ksize to input dims.
  }

  // The adaptive logic should be processed when we get the context of
  // INetworkDefinition, so we place the logic in the infrt runtime (trt
  // compile time).

  // The `exclusive` may be a naive attr, which can be forwarded to trt.

  auto padding_mode_attr =
      CreatePaddingMode(builder, padding_algorithm, ceil_mode);

  if (global_pooling.getValue() == true) {
    CHECK(false) << "Temporarily not support global_pool";
    return tblgen_repl_values;
  }

  PoolingOp pool_op;
  {
    auto ods_loc = builder.getFusedLoc({input_producer->getLoc()});
    pool_op = builder.create<PoolingOp>(ods_loc,
                                        input.getType(),
                                        input,
                                        pool_type_attr,
                                        ksize,
                                        strides,
                                        paddings_attr,
                                        padding_mode_attr,
                                        exclusive,
                                        adaptive,
                                        padding_algorithm);
  }

  for (auto v :
       ::llvm::SmallVector<::mlir::Value, 4>{pool_op.getODSResults(0)}) {
    tblgen_repl_values.push_back(v);
  }
  return tblgen_repl_values;
}

}  // namespace trt
}  // namespace infrt
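Note: the STRING_TO_ENUM_* macros above let convert.h compile even when TensorRT is absent. With INFRT_WITH_TRT defined, enum names are used directly; otherwise they degrade to their spelled-out strings, and the createNvinferEnumAttr<std::string> specialization collapses them to -1. A minimal standalone sketch of the mechanism (DEMO_WITH_TRT and the nv namespace are stand-ins for this note only, not names from the commit):

```cpp
#include <string>

// Stand-ins: DEMO_WITH_TRT plays the role of INFRT_WITH_TRT,
// and namespace nv plays the role of nvinfer1.
#ifdef DEMO_WITH_TRT
namespace nv {
enum class PoolingType : int { kMAX = 0, kAVERAGE = 1 };
}
#define STRING_TO_ENUM_TYPE(enum_type) enum_type
#define STRING_TO_ENUM_VALUE(enum_value) enum_value
#else  // no TensorRT headers available
#define STRING_TO_ENUM_TYPE(enum_type) std::string
#define STRING_TO_ENUM_VALUE(enum_value) #enum_value
#endif

int main() {
  // With DEMO_WITH_TRT: v is nv::PoolingType::kMAX (the real enum value).
  // Without it: v is the literal string "nv::PoolingType::kMAX", which the
  // std::string overload of createNvinferEnumAttr would map to -1.
  STRING_TO_ENUM_TYPE(nv::PoolingType)
  v = STRING_TO_ENUM_VALUE(nv::PoolingType::kMAX);
  (void)v;
  return 0;
}
```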
paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td (view file @ 2012aeb6)

@@ -31,9 +31,10 @@ def PD2TRT_Conv2d_Lower : Pat<
   (PD_Conv2dOp:$old_value $Input, $Filter, $strides, $paddings, $padding_algorithm, $groups, $dilations, $data_format),
   (createTRTConv2dOp $old_value)>;

+def createTrtPoolingOp : NativeCodeCall<"::infrt::trt::CreatePaddleTrtPoolingOp($_builder, $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10)">;
 def PD2TRT_Pooling_Lower : Pat<
   (PD_Pool2dOp $Input, $pooling_type, $ksize, $global_pooling, $strides, $paddings, $exclusive, $adaptive, $ceil_mode, $data_format, $padding_algorithm),
-  (TRT_PoolingOp $Input, (INFRT_createI32Attr<"0">)/*kmax*/, $ksize, $strides, $paddings, $padding_algorithm)>;
+  (createTrtPoolingOp $Input, $pooling_type, $ksize, $global_pooling, $strides, $paddings, $exclusive, $adaptive, $ceil_mode, $data_format, $padding_algorithm)>;

 def PD2TRT_MatrixMultipl_Lower : Pat<
   (PD_MulOp $Input1, $Input2, $x_num_col_dims, $y_num_col_dims),
...
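Note: the NativeCodeCall splices a call to ::infrt::trt::CreatePaddleTrtPoolingOp into the rewrite pattern that mlir-tblgen generates; $_builder binds to the pattern rewriter and $0–$10 bind to the eleven captured values in source-pattern order ($Input through $padding_algorithm). Roughly, the generated matchAndRewrite behaves like the sketch below; this is a paraphrase, not the literal emitted code, and the ODS accessor names are approximations:

```cpp
// Paraphrase of the pattern mlir-tblgen emits for PD2TRT_Pooling_Lower.
mlir::LogicalResult matchAndRewrite(mlir::Operation *op,
                                    mlir::PatternRewriter &rewriter) const {
  auto pool = ::llvm::dyn_cast<infrt::pd::Pool2dOp>(op);
  if (!pool) return mlir::failure();
  auto repl = ::infrt::trt::CreatePaddleTrtPoolingOp(
      rewriter,                        // $_builder
      pool.x(),                        // $0  <- $Input (accessor approximate)
      pool.pooling_typeAttr(),         // $1  <- $pooling_type
      pool.ksizeAttr(),                // $2  <- $ksize
      pool.global_poolingAttr(),       // $3  <- $global_pooling
      pool.stridesAttr(),              // $4  <- $strides
      pool.paddingsAttr(),             // $5  <- $paddings
      pool.exclusiveAttr(),            // $6  <- $exclusive
      pool.adaptiveAttr(),             // $7  <- $adaptive
      pool.ceil_modeAttr(),            // $8  <- $ceil_mode
      pool.data_formatAttr(),          // $9  <- $data_format
      pool.padding_algorithmAttr());   // $10 <- $padding_algorithm
  rewriter.replaceOp(op, repl);
  return mlir::success();
}
```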
paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc (view file @ 2012aeb6)

@@ -28,33 +28,6 @@
 namespace infrt {
 namespace trt {

-#ifdef INFRT_WITH_TRT
-#define STRING_TO_ENUM_TYPE(enum_type) enum_type
-#define STRING_TO_ENUM_VALUE(enum_value) enum_value
-#include <NvInfer.h>
-#else  // INFRT_WITH_TRT
-#define STRING_TO_ENUM_TYPE(enum_type) std::string
-#define STRING_TO_ENUM_VALUE(enum_value) #enum_value
-#endif  // INFRT_WITH_TRT
-
-template <typename T>
-::mlir::IntegerAttr createNvinferEnumAttr(
-    ::mlir::PatternRewriter &rewriter,  // NOLINT
-    T enum_value) {
-  return rewriter.getSI32IntegerAttr((int32_t)enum_value);
-}
-
-template <>
-::mlir::IntegerAttr createNvinferEnumAttr<std::string>(
-    ::mlir::PatternRewriter &rewriter, std::string enum_value) {  // NOLINT
-  (void)enum_value;
-  return rewriter.getSI32IntegerAttr(-1);
-}
-
 #include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc"  // NOLINT

 struct PD2TRT_GraphLower : public ::mlir::RewritePattern {
...
paddle/infrt/dialect/tensorrt/trt_ops.td (view file @ 2012aeb6)

@@ -101,7 +101,10 @@ def TRT_PoolingOp : TRT_Op<"Pooling", [NoSideEffect]> {
   I32ArrayAttr:$window_size,
   I32ArrayAttr:$strides,
   I32ArrayAttr:$paddings,
-  StrAttr:$padding_mode
+  I32Attr:$padding_mode,
+  BoolAttr:$exclusive,
+  BoolAttr:$adaptive,
+  StrAttr:$padding_algorithm
 );
 let results = (outs
   DenseTensor:$output_tensor
...
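Note: switching $padding_mode from StrAttr to I32Attr means trt.Pooling now stores the numeric nvinfer1 enum value directly, so the runtime can recover it with a plain static_cast (see PoolFunc in trt_layers.h below). For orientation, a sketch mirroring the relevant TensorRT enum values as of TRT 7/8; worth re-checking against the NvInfer.h you build with:

```cpp
#include <cstdint>

// Mirror of the nvinfer1 enum values this commit relies on.
// disabled_trt_pool.mlir below uses pool_type = 0 (kMAX) and
// padding_mode = 0 (kEXPLICIT_ROUND_DOWN).
enum class PoolingType : int32_t { kMAX = 0, kAVERAGE = 1, kMAX_AVERAGE_BLEND = 2 };
enum class PaddingMode : int32_t {
  kEXPLICIT_ROUND_DOWN = 0,
  kEXPLICIT_ROUND_UP = 1,
  kSAME_UPPER = 2,
  kSAME_LOWER = 3,
};

// The i32 attribute round-trips through a static_cast, as PoolFunc does.
inline PaddingMode PaddingModeFromAttr(int32_t v) {
  return static_cast<PaddingMode>(v);
}
```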
paddle/infrt/kernel/tensorrt/trt_helper.h (view file @ 2012aeb6)

@@ -28,13 +28,13 @@ namespace infrt {
 namespace kernel {
 namespace tensorrt {

-static nvinfer1::DataType TensorTypeToWeightType(phi::DataType tensor_type) {
+static nvinfer1::DataType TensorTypeToWeightType(::phi::DataType tensor_type) {
   switch (tensor_type) {
-    case phi::DataType::FLOAT32:
+    case ::phi::DataType::FLOAT32:
       return nvinfer1::DataType::kFLOAT;
-    case phi::DataType::INT32:
+    case ::phi::DataType::INT32:
       return nvinfer1::DataType::kINT32;
-    case phi::DataType::FLOAT16:
+    case ::phi::DataType::FLOAT16:
       return nvinfer1::DataType::kHALF;
     default:
       llvm_unreachable("should not reach here");
...

@@ -52,7 +52,7 @@ static nvinfer1::Dims ArrayAttrToNvDims(const mlir::ArrayAttr& int_array_attr) {
   return dims;
 }

-static nvinfer1::Weights TensorToWeights(phi::DenseTensor* tensor) {
+static nvinfer1::Weights TensorToWeights(::phi::DenseTensor* tensor) {
   CHECK_NOTNULL(tensor);
   nvinfer1::Weights ret;
   ret.type = TensorTypeToWeightType(tensor->dtype());
...
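Note: the only change here is anchoring phi to the global scope. Inside namespace infrt::kernel::tensorrt, an unqualified phi:: can be captured by infrt's own phi-related namespaces (infrt ships phi dialect code under paddle/infrt/dialect/phi), while ::phi:: always names the top-level phi library. A minimal repro of the ambiguity, with hypothetical stand-in types:

```cpp
namespace phi {
struct DenseTensor {};  // stands in for the real paddle/phi tensor type
}

namespace infrt {
namespace phi {  // infrt's own nested `phi` namespace, as in the infrt dialects
struct DenseTensor {};
}
namespace kernel {
// Unqualified lookup walks outward: infrt::kernel::phi (absent),
// then infrt::phi (present!) before ever reaching ::phi.
// phi::DenseTensor* t1;   // would resolve to infrt::phi::DenseTensor
::phi::DenseTensor* t2;    // unambiguous: the global phi library type
}  // namespace kernel
}  // namespace infrt
```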
paddle/infrt/kernel/tensorrt/trt_kernels.cc (view file @ 2012aeb6)

@@ -129,6 +129,7 @@ namespace tensorrt {
   // TODO(wilber): Find a way to add layer.
   for (auto& operation : block.without_terminator()) {
+    VLOG(1) << "process " << operation.getName().getStringRef().str() << " ...";
     if (trt::ActivationOp op = llvm::dyn_cast<trt::ActivationOp>(operation)) {
       ActivationFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map);
...

@@ -138,6 +139,8 @@ namespace tensorrt {
     } else if (trt::ConvolutionOp op = llvm::dyn_cast<trt::ConvolutionOp>(operation)) {
       ConvFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map);
+    } else if (trt::PoolingOp op = llvm::dyn_cast<trt::PoolingOp>(operation)) {
+      PoolFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map);
     } else {
       CHECK(false) << "not supported operation.";
     }
...
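Note: the engine-build loop dispatches on each op's concrete type with llvm::dyn_cast, binding a typed handle directly in the if condition; supporting a new layer kind is one more else-if plus a lowering helper like PoolFunc. A toy, self-contained version of the idiom (the two-class hierarchy is invented for illustration, not an infrt type):

```cpp
#include <llvm/Support/Casting.h>

// Minimal LLVM-style RTTI hierarchy so llvm::dyn_cast works.
struct Op { virtual ~Op() = default; virtual int kind() const = 0; };
struct ActivationOp : Op {
  int kind() const override { return 0; }
  static bool classof(const Op* o) { return o->kind() == 0; }
};
struct PoolingOp : Op {
  int kind() const override { return 1; }
  static bool classof(const Op* o) { return o->kind() == 1; }
};

void Dispatch(Op& operation) {
  // Each branch tests the type and binds a typed handle, mirroring
  // `if (trt::ActivationOp op = llvm::dyn_cast<trt::ActivationOp>(operation))`.
  if (auto* op = llvm::dyn_cast<ActivationOp>(&operation)) {
    // ActivationFunc(*op, ...);
  } else if (auto* op = llvm::dyn_cast<PoolingOp>(&operation)) {
    // PoolFunc(*op, ...);
  } else {
    // CHECK(false) << "not supported operation.";
  }
}
```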
paddle/infrt/kernel/tensorrt/trt_layers.h (view file @ 2012aeb6)

@@ -15,13 +15,15 @@
#pragma once

#include <NvInfer.h>
#include <llvm/ADT/StringRef.h>
#include <mlir/IR/BuiltinAttributes.h>
#include <mlir/IR/Operation.h>
#include <mlir/IR/Value.h>

#include <string>

#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
#include "paddle/infrt/kernel/tensorrt/trt_helper.h"
#include "paddle/phi/core/dense_tensor.h"

namespace infrt {
...

@@ -63,7 +65,12 @@ inline void ConvFunc(trt::ConvolutionOp& op,  // NOLINT
   nvinfer1::Dims dims = ArrayAttrToNvDims(size_attrs);
   auto kernel_weights = TensorToWeights(value_to_tensor_map[op.kernel_weights()]);
-  auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]);
+  nvinfer1::Weights bias_weights;
+  if (op.bias_weights() == mlir::Value()) {
+    bias_weights = nvinfer1::Weights{};
+  } else {
+    bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]);
+  }
   auto* layer = network->addConvolutionNd(*value_to_trt_tensor_map[input_tensor_repr],
...

@@ -77,6 +84,51 @@ inline void ConvFunc(trt::ConvolutionOp& op,  // NOLINT
  value_to_trt_tensor_map[out_repr] = out_tensor;
}

inline void PoolFunc(trt::PoolingOp& op,  // NOLINT
                     nvinfer1::INetworkDefinition* network,
                     ValueToITensorMap& value_to_trt_tensor_map,  // NOLINT
                     ValueToTensorMap& value_to_tensor_map) {  // NOLINT
  mlir::Value input_tensor_repr = op.input_tensor();
  nvinfer1::ITensor* input_itensor = value_to_trt_tensor_map[input_tensor_repr];
  // nvinfer1::Dims input_shape = input_itensor->getDimensions();
  // int input_dims = input_shape.nbDims;

  auto padding_mode = op.padding_mode();
  auto pool_type = op.pool_type();
  mlir::ArrayAttr paddings = op.paddings();
  mlir::ArrayAttr strides = op.strides();
  mlir::ArrayAttr ksize = op.window_size();
  bool exclusive = op.exclusive();
  bool adaptive = op.adaptive();
  auto padding_algorithm = op.padding_algorithm().str();

  if (padding_algorithm == "SAME") {
    // TODO(wilber)
    CHECK(false) << "Not supported `same` padding algorithm";
  }

  if (adaptive) {
    // TODO(Inference)
    CHECK(false) << "Not supported adaptive pool";
  }

  nvinfer1::Dims window_size = ArrayAttrToNvDims(ksize);

  auto* layer = network->addPoolingNd(
      *input_itensor, static_cast<nvinfer1::PoolingType>(pool_type),
      window_size);
  CHECK_NOTNULL(layer);
  layer->setPaddingMode(static_cast<nvinfer1::PaddingMode>(padding_mode));
  layer->setPaddingNd(ArrayAttrToNvDims(paddings));
  layer->setStrideNd(ArrayAttrToNvDims(strides));
  layer->setAverageCountExcludesPadding(exclusive);

  mlir::Value out_repr = op.output_tensor();
  nvinfer1::ITensor* out_tensor = layer->getOutput(0);
  value_to_trt_tensor_map[out_repr] = out_tensor;
}

inline void FcFunc(trt::FullyConnectedOp& op,  // NOLINT
                   nvinfer1::INetworkDefinition* network,
                   ValueToITensorMap& value_to_trt_tensor_map,  // NOLINT
...
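Note: PoolFunc is essentially a thin adapter from the trt.Pooling attributes to TensorRT's IPoolingLayer setters. For reference, the same calls against the raw TensorRT API might look like the sketch below (assumes TensorRT 7+, where the *Nd variants exist; builder/network setup and error handling elided):

```cpp
#include <NvInfer.h>

// Build-time sketch: add a 3x3, stride-2, pad-1 max pool to an existing
// network, mirroring the calls PoolFunc makes. `network` and `input` are
// assumed to come from an nvinfer1::IBuilder session created elsewhere.
nvinfer1::ITensor* AddMaxPool(nvinfer1::INetworkDefinition* network,
                              nvinfer1::ITensor* input) {
  nvinfer1::Dims window{2, {3, 3}};
  auto* layer =
      network->addPoolingNd(*input, nvinfer1::PoolingType::kMAX, window);
  if (layer == nullptr) return nullptr;
  layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_DOWN);
  layer->setPaddingNd(nvinfer1::Dims{2, {1, 1}});
  layer->setStrideNd(nvinfer1::Dims{2, {2, 2}});
  // Only meaningful for average pooling; harmless for max pooling.
  layer->setAverageCountExcludesPadding(false);
  return layer->getOutput(0);
}
```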
paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir (deleted, 100644 → 0; view file @ ad0c106c)
// RUN: infrtexec -i %s | FileCheck %s
// CHECK-LABEL: @run_trt
func @run_trt(%0 : !infrt.dense_tensor<GPU, FP32, NCHW>, %ctx : !phi.context<GPU>) {
%a = "trt.create_engine"(%0) ({
%1 = "trt.Activation"(%0) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
"infrt.return"(%1) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
}) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !trt.engine
"trt.inspect_engine"(%a) {} : (!trt.engine) -> ()
%res = "trt.compute"(%a, %ctx) {} : (!trt.engine, !phi.context<GPU>) -> (!infrt.tensor_list)
%size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32)
"infrt.print.i32"(%size) {} : (i32) -> ()
%ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
"phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
infrt.return
}
// CHECK-LABEL: @main
func @main() {
%ctx = "phi_dt.create_context.gpu" (): () -> !phi.context<GPU>
%t = "phi_dt.create_dense_tensor.gpu" (%ctx) {
precision=#infrt.precision<FP32>,
layout=#infrt.layout<NCHW>,
dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context<GPU>) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
"phi_dt.print_tensor" (%t) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
//%res =
infrt.call @run_trt(%t, %ctx) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !phi.context<GPU>) -> ()
//-> (!infrt.dense_tensor<GPU, FP32, NCHW>)
infrt.return
}
paddle/infrt/tests/dialect/tensorrt/disabled_trt_activation.mlir (new file, 0 → 100644; view file @ 2012aeb6)
module {
func @main_graph(%arg0: !infrt.dense_tensor<CPU, FP32, ANY>) -> !infrt.dense_tensor<CPU, FP32, ANY> {
%0 = "phi_dt.create_context.gpu"() : () -> !phi.context<GPU>
%1 = "phi_dt.memcpy.gpu"(%arg0, %0) {d2h = false} : (!infrt.dense_tensor<CPU, FP32, ANY>, !phi.context<GPU>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%2 = "trt.create_engine"(%1) ( {
%6 = "trt.Activation"(%1) {activation_type = 1 : si32, alpha = 0.000000e+00 : f32, beta = 0.000000e+00 : f32} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
infrt.return %6 : !infrt.dense_tensor<GPU, FP32, NCHW>
}) {run_once = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !trt.engine
%3 = "trt.compute"(%2, %0) : (!trt.engine, !phi.context<GPU>) -> !infrt.tensor_list
%4 = "dt.tensor_list_get_tensor"(%3) {id = 0 : i32} : (!infrt.tensor_list) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%5 = "phi_dt.memcpy.gpu"(%4, %0) {d2h = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !phi.context<GPU>) -> !infrt.dense_tensor<CPU, FP32, ANY>
infrt.return %5 : !infrt.dense_tensor<CPU, FP32, ANY>
}
func @main() {
%0 = "phi_dt.create_context.cpu"() : () -> !phi.context<CPU>
%1 = "phi_dt.create_inited_dense_tensor.cpu.f32"(%0) {dims = [3, 6, 1, 1], layout = #infrt.layout<NCHW>, lod = [0], value = 1.500000e+00 : f32} : (!phi.context<CPU>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
%2 = infrt.call @main_graph(%1) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
phi_dt.print_tensor(%2 : !infrt.dense_tensor<CPU, FP32, NCHW>)
infrt.return
}
}
paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir (view file @ 2012aeb6)

Removed:
// RUN: infrtexec -i %s | FileCheck %s
// CHECK-LABEL: @main
func @main() {
%ctx = "phi_dt.create_context.gpu" (): () -> !phi.context<GPU>
%cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
%input_tensor = "phi_dt.create_dense_tensor.gpu" (%ctx) {
precision=#infrt.precision<FP32>,
layout=#infrt.layout<NCHW>,
dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context<GPU>) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
//"phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
%kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) {
precision=#infrt.precision<FP32>,
layout=#infrt.layout<NCHW>,
dims=[2:i64, 3:i64], lod=[1:i64]} : (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
//"phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) {
precision=#infrt.precision<FP32>,
layout=#infrt.layout<NCHW>,
dims=[2:i64], lod=[1:i64]} : (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32, 2.:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
//"phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%engine = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({
%1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%2 = "trt.FullyConnected"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 2 : si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
"infrt.return"(%1, %2) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
}) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !trt.engine
%res = "trt.compute"(%engine, %ctx) {} : (!trt.engine, !phi.context<GPU>) -> (!infrt.tensor_list)
%size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32)
"infrt.print.i32"(%size) {} : (i32) -> ()
%ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
"phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
%ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
"phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
module {
func @main_graph(%arg0: !infrt.dense_tensor<CPU, FP32, ANY>) -> !infrt.dense_tensor<CPU, FP32, ANY> {
%ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
%0 = "phi_dt.create_context.gpu"() : () -> !phi.context<GPU>
%1 = "phi_dt.memcpy.gpu"(%arg0, %0) {d2h = false} : (!infrt.dense_tensor<CPU, FP32, ANY>, !phi.context<GPU>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%4 = "phi_dt.create_inited_dense_tensor.cpu.f32" (%ctx) {value=1.5:f32, layout=#infrt.layout<NCHW>, lod=[0], dims=[2, 6]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
%3 = "phi_dt.create_inited_dense_tensor.cpu.f32" (%ctx) {value=1.5:f32, layout=#infrt.layout<NCHW>, lod=[0], dims=[2]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
%5 = "trt.create_engine"(%1, %4, %3) ( {
%10 = "trt.FullyConnected"(%1, %4, %3) {out_channel_num = 2 : si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
infrt.return %10 : !infrt.dense_tensor<GPU, FP32, NCHW>
}) {run_once = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !trt.engine
%6 = "trt.compute"(%5, %0) : (!trt.engine, !phi.context<GPU>) -> !infrt.tensor_list
%7 = "dt.tensor_list_get_tensor"(%6) {id = 0 : i32} : (!infrt.tensor_list) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%8 = "phi_dt.memcpy.gpu"(%7, %0) {d2h = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !phi.context<GPU>) -> !infrt.dense_tensor<CPU, FP32, ANY>
infrt.return %8 : !infrt.dense_tensor<CPU, FP32, ANY>
}
func @main() {
%ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
%input_tensor = "phi_dt.create_inited_dense_tensor.cpu.f32" (%ctx) {value=1.5:f32, layout=#infrt.layout<NCHW>, lod=[0], dims=[3, 6, 1, 1]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
%res = infrt.call @main_graph(%input_tensor) {} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
"phi_dt.print_tensor" (%res) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
infrt.return
}
}
paddle/infrt/tests/dialect/tensorrt/disabled_trt_pool.mlir (new file, 0 → 100644; view file @ 2012aeb6)
module {
func @main_graph(%arg0: !infrt.dense_tensor<CPU, FP32, ANY>) -> !infrt.dense_tensor<CPU, FP32, ANY> {
%0 = "phi_dt.create_context.gpu"() : () -> !phi.context<GPU>
%1 = "phi_dt.memcpy.gpu"(%arg0, %0) {d2h = false} : (!infrt.dense_tensor<CPU, FP32, ANY>, !phi.context<GPU>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%2 = "trt.create_engine"(%1) ( {
%6 = "trt.Pooling"(%1) {padding_mode = 0 : i32, paddings = [1 : i32, 1 : i32], pool_type = 0 : i32, strides = [2 : i32, 2 : i32], window_size = [3 : i32, 3 : i32], exclusive = false, adaptive = false, padding_algorithm = "EXPLICIT"} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
infrt.return %6 : !infrt.dense_tensor<GPU, FP32, NCHW>
}) {run_once = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !trt.engine
%3 = "trt.compute"(%2, %0) : (!trt.engine, !phi.context<GPU>) -> !infrt.tensor_list
%4 = "dt.tensor_list_get_tensor"(%3) {id = 0 : i32} : (!infrt.tensor_list) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%5 = "phi_dt.memcpy.gpu"(%4, %0) {d2h = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !phi.context<GPU>) -> !infrt.dense_tensor<CPU, FP32, ANY>
infrt.return %5 : !infrt.dense_tensor<CPU, FP32, ANY>
}
func @main() {
%0 = "phi_dt.create_context.cpu"() : () -> !phi.context<CPU>
%1 = "phi_dt.create_inited_dense_tensor.cpu.f32"(%0) {dims = [1, 3, 10, 10], layout = #infrt.layout<NCHW>, lod = [0], value = 1.500000e+00 : f32} : (!phi.context<CPU>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
%2 = infrt.call @main_graph(%1) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
phi_dt.print_tensor(%2 : !infrt.dense_tensor<CPU, FP32, NCHW>)
infrt.return
}
}
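Note: a quick sanity check on this test's attributes. With input [1, 3, 10, 10], a 3x3 window, stride 2, padding 1, and padding_mode = 0 (explicit round-down), each spatial dim comes out to floor((10 + 2·1 − 3)/2) + 1 = 5, so the printed result should be a [1, 3, 5, 5] tensor filled with 1.5 (max pooling over a constant input). The arithmetic as a one-liner:

```cpp
#include <cassert>

// Output size under kEXPLICIT_ROUND_DOWN: floor((in + 2*pad - k) / s) + 1.
int PoolOutDim(int in, int k, int pad, int stride) {
  return (in + 2 * pad - k) / stride + 1;  // integer division = floor here
}

int main() {
  // disabled_trt_pool.mlir: input [1, 3, 10, 10], window 3x3, stride 2, pad 1.
  assert(PoolOutDim(10, 3, 1, 2) == 5);  // -> output [1, 3, 5, 5]
  return 0;
}
```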