Commit 27536a32 (unverified)
Authored by Wilber on Feb 28, 2022; committed via GitHub on Feb 28, 2022.
infrt add trt engine (#39885)
Parent: aceb25e1

Showing 12 changed files with 993 additions and 8 deletions (+993 −8):
paddle/fluid/platform/dynload/tensorrt.h                            +6    -6
paddle/infrt/CMakeLists.txt                                         +1    -0
paddle/infrt/backends/CMakeLists.txt                                +3    -0
paddle/infrt/backends/tensorrt/CMakeLists.txt                       +3    -0
paddle/infrt/backends/tensorrt/test_trt_engine.cc                   +254  -0
paddle/infrt/backends/tensorrt/trt_engine.cc                        +365  -0
paddle/infrt/backends/tensorrt/trt_engine.h                         +114  -0
paddle/infrt/backends/tensorrt/trt_options.h                        +94   -0
paddle/infrt/backends/tensorrt/trt_utils.h                          +147  -0
paddle/infrt/kernel/phi/CMakeLists.txt                              +4    -0
paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc    +1    -1
tools/infrt/get_phi_kernel_info.py                                  +1    -1
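Before the per-file diffs, here is a condensed usage sketch of the new TRTEngine API, distilled from the run_static test added in test_trt_engine.cc below. It is a sketch only: the tensor names ("model_input", "model_output1") and option values are the ones the test uses, and the network construction, GPU context initialization, and input filling that the test performs are assumed to have already happened.

// Condensed usage sketch of the new API (assumptions noted in the lead-in above).
#include <string>
#include <unordered_map>
#include <utility>

#include "paddle/infrt/backends/tensorrt/trt_engine.h"
#include "paddle/infrt/backends/tensorrt/trt_options.h"

namespace trt = infrt::backends::tensorrt;

void RunOnce(trt::TrtUniquePtr<nvinfer1::INetworkDefinition> net,
             const phi::GPUContext& context,  // assumed fully initialized
             phi::DenseTensor* input,         // assumed filled on the GPU
             phi::DenseTensor* output) {
  trt::TRTEngine engine(/*device_id=*/0);

  trt::BuildOptions build_options;
  build_options.max_batch = 4;                  // static-shape build
  engine.Build(std::move(net), build_options);  // builds and deserializes the engine

  trt::InferenceOptions inference_options;
  inference_options.batch = 2;

  // Binding names must match the tensors in the network definition;
  // these are the names used by the run_static test.
  std::unordered_map<std::string, phi::DenseTensor*> inputs{{"model_input", input}};
  std::unordered_map<std::string, phi::DenseTensor*> outputs{{"model_output1", output}};
  engine.SetUpInference(inference_options, inputs, &outputs);
  engine.Run(context);  // dispatches to StaticRun() or DynamicRun()
}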
paddle/fluid/platform/dynload/tensorrt.h
@@ -37,7 +37,7 @@ void* GetTensorRtPluginHandle();
 extern std::once_flag tensorrt_plugin_dso_flag;
 extern void* tensorrt_plugin_dso_handle;
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name)           \
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP_(__name)          \
   struct DynLoad__##__name {                                         \
     template <typename... Args>                                      \
     void* operator()(Args... args) {                                 \
@@ -55,7 +55,7 @@ extern void* tensorrt_plugin_dso_handle;
   };                                                                 \
   extern DynLoad__##__name __name
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name)       \
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP_(__name)      \
   struct DynLoad__##__name {                                         \
     template <typename... Args>                                      \
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
@@ -72,7 +72,7 @@ extern void* tensorrt_plugin_dso_handle;
   };                                                                 \
   extern DynLoad__##__name __name
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name)            \
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP_(__name)           \
   struct DynLoad__##__name {                                         \
     template <typename... Args>                                      \
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
@@ -109,10 +109,10 @@ extern void* tensorrt_plugin_dso_handle;
 #define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \
   __macro(initLibNvInferPlugins);
-TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP)
-TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP)
-TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP)
+TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP_)
+TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP_)
+TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP_)
 #endif  // end of NV_TENSORRT_MAJOR
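For reference, each of these wrappers stamps out one lazy-loading functor per TensorRT symbol; only the macro names gain a trailing underscore here, the expansion itself is unchanged. A rough illustration for one pointer-returning symbol follows. createInferBuilder_INTERNAL is used as the example name, and whether it appears in TENSORRT_RAND_ROUTINE_EACH_POINTER is an assumption based on how trt_engine.cc later calls the phi::dynload counterpart; the macro body elided as "..." in the hunks above (the once_flag-guarded dlopen/dlsym) is not reproduced.

// Roughly what DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP_(createInferBuilder_INTERNAL)
// expands to (illustration only):
struct DynLoad__createInferBuilder_INTERNAL {
  template <typename... Args>
  void* operator()(Args... args);  // forwards the call to the symbol in the TensorRT .so
};
extern DynLoad__createInferBuilder_INTERNAL createInferBuilder_INTERNAL;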
paddle/infrt/CMakeLists.txt
@@ -74,6 +74,7 @@ endif()
 add_subdirectory(api)
+add_subdirectory(backends)
 add_subdirectory(common)
 add_subdirectory(dialect)
 add_subdirectory(host_context)
paddle/infrt/backends/CMakeLists.txt  (new file, mode 100644)
if (INFRT_WITH_PHI AND WITH_GPU AND WITH_TENSORRT)
  add_subdirectory(tensorrt)
endif()
paddle/infrt/backends/tensorrt/CMakeLists.txt  (new file, mode 100644)
cc_library(infrt_trt SRCS trt_engine.cc DEPS glog phi_dynload_cuda phi)
cc_test_tiny(test_infrt_trt SRCS test_trt_engine.cc DEPS infrt_trt phi_dynload_cuda tensorrt_converter)
paddle/infrt/backends/tensorrt/test_trt_engine.cc  (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <math.h>

#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvInferRuntimeCommon.h>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/infrt/backends/tensorrt/trt_engine.h"
#include "paddle/infrt/backends/tensorrt/trt_options.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"

namespace infrt {
namespace backends {
namespace tensorrt {

const char* model_input = "model_input";
const char* model_output = "model_output1";
const char* model_output2 = "model_output2";

TrtUniquePtr<nvinfer1::INetworkDefinition> ConstructNetwork(
    nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) {
  TrtUniquePtr<nvinfer1::INetworkDefinition> network;
  if (is_static_shape) {
    network.reset(builder->createNetworkV2(0U));
  } else {
    auto networkFlags =
        1U << static_cast<uint32_t>(
            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    network.reset(builder->createNetworkV2(networkFlags));
  }

  ITensor* data =
      network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims);
  CHECK_NOTNULL(data);
  IActivationLayer* act =
      network->addActivation(*data, ActivationType::kSIGMOID);
  CHECK_NOTNULL(act);
  auto* act_out = act->getOutput(0);
  std::vector<int> output_length{1, 2};
  int axis;
  nvinfer1::IPluginV2Layer* split_layer;
  if (is_static_shape) {
    axis = 0;
    paddle::inference::tensorrt::plugin::SplitPlugin plugin(
        axis, output_length, false);
    split_layer = network->addPluginV2(&act_out, 1, plugin);
  } else {
    axis = 1;
    paddle::inference::tensorrt::plugin::SplitPluginDynamic plugin(
        axis, output_length, false);
    split_layer = network->addPluginV2(&act_out, 1, plugin);
  }

  split_layer->getOutput(0)->setName(model_output);
  split_layer->getOutput(1)->setName(model_output2);
  network->markOutput(*split_layer->getOutput(0));
  network->markOutput(*split_layer->getOutput(1));
  return network;
}

// sigmoid(x) = 1 / (1 + exp(-x))
inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); }

TEST(trt, run_static) {
  TRTEngine static_trt_engine(0);
  auto net = ConstructNetwork(
      static_trt_engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true);
  BuildOptions static_build_options;
  static_build_options.max_batch = 4;
  static_trt_engine.Build(std::move(net), static_build_options);
  InferenceOptions inference_options;
  inference_options.batch = 2;

  phi::GPUPlace place;
  phi::GPUContext context;
  context.PartialInitWithoutAllocator();
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(place, context.stream())
                           .get());
  context.PartialInitWithAllocator();

  phi::DenseTensorMeta meta(
      phi::DataType::FLOAT32,
      phi::make_ddim({inference_options.batch, 3, 28, 28}));
  phi::DenseTensor input;
  input.set_meta(meta);
  context.Alloc<float>(&input, input.numel() * sizeof(float));
  std::vector<float> host_data(inference_options.batch * 3 * 28 * 28, 0);
  for (size_t i = 0; i < host_data.size(); ++i) {
    host_data[i] = i % 100 * 0.016f;
  }
  paddle::memory::Copy(place,
                       input.data<float>(),
                       phi::CPUPlace(),
                       host_data.data(),
                       sizeof(float) * host_data.size(),
                       context.stream());

  std::unordered_map<std::string, phi::DenseTensor*> inputs;
  inputs.emplace(std::make_pair(model_input, &input));
  phi::DenseTensor output, output2;
  std::unordered_map<std::string, phi::DenseTensor*> outputs;
  outputs.emplace(std::make_pair(model_output, &output));
  outputs.emplace(std::make_pair(model_output2, &output2));

  static_trt_engine.SetUpInference(inference_options, inputs, &outputs);
  static_trt_engine.GetEngineInfo();
  static_trt_engine.Run(context);

  std::vector<float> output_data1(inference_options.batch * 1 * 28 * 28, 0);
  std::vector<float> output_data2(inference_options.batch * 2 * 28 * 28, 0);
  paddle::memory::Copy(phi::CPUPlace(),
                       output_data1.data(),
                       place,
                       output.data<float>(),
                       sizeof(float) * output_data1.size(),
                       context.stream());
  paddle::memory::Copy(phi::CPUPlace(),
                       output_data2.data(),
                       place,
                       output2.data<float>(),
                       sizeof(float) * output_data2.size(),
                       context.stream());
  cudaStreamSynchronize(context.stream());

  for (size_t i = 0; i < host_data.size(); ++i) {
    int w = i % 28;
    int h = (i / 28) % 28;
    int c = i / (28 * 28) % 3;
    int n = i / (28 * 28 * 3);
    if (c == 0) {
      CHECK_NEAR(sigmoid(host_data[i]),
                 output_data1[n * 28 * 28 + h * 28 + w],
                 1e-5);
    } else {
      CHECK_NEAR(sigmoid(host_data[i]),
                 output_data2[n * 28 * 28 * 2 + (c - 1) * 28 * 28 + h * 28 + w],
                 1e-5);
    }
  }
}

TEST(trt, run_dynamic) {
  TRTEngine engine(0);
  auto net = ConstructNetwork(
      engine.GetTrtBuilder(), nvinfer1::Dims4{-1, 3, -1, -1}, false);
  BuildOptions build_options;
  build_options.max_batch = 4;
  build_options.workspace = 32;
  // build_options.fp16 = true;
  std::vector<int32_t> min_shape{1, 3, 16, 16};
  std::vector<int32_t> opt_shape{2, 3, 28, 28};
  std::vector<int32_t> max_shape{4, 3, 28, 28};
  build_options.shapes[model_input][0] = min_shape;
  build_options.shapes[model_input][1] = opt_shape;
  build_options.shapes[model_input][2] = max_shape;
  engine.Build(std::move(net), build_options);

  InferenceOptions inference_options;
  inference_options.batch = 2;

  phi::GPUPlace place;
  phi::GPUContext context;
  context.PartialInitWithoutAllocator();
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(place, context.stream())
                           .get());
  context.PartialInitWithAllocator();

  phi::DenseTensorMeta meta(
      phi::DataType::FLOAT32,
      phi::make_ddim({inference_options.batch, 3, 16, 16}));
  phi::DenseTensor input, output, output2;
  input.set_meta(meta);
  context.Alloc<float>(&input, input.numel() * sizeof(float));
  std::vector<float> host_data(inference_options.batch * 3 * 16 * 16, 0);
  for (size_t i = 0; i < host_data.size(); ++i) {
    host_data[i] = i % 100 * 0.016f;
  }
  paddle::memory::Copy(place,
                       input.data<float>(),
                       phi::CPUPlace(),
                       host_data.data(),
                       sizeof(float) * host_data.size(),
                       context.stream());

  std::unordered_map<std::string, phi::DenseTensor*> inputs;
  std::unordered_map<std::string, phi::DenseTensor*> outputs;
  inputs.emplace(std::make_pair(model_input, &input));
  outputs.emplace(std::make_pair(model_output, &output));
  outputs.emplace(std::make_pair(model_output2, &output2));

  engine.SetUpInference(inference_options, inputs, &outputs);
  engine.GetEngineInfo();
  engine.Run(context);

  std::vector<float> output_data1(inference_options.batch * 1 * 16 * 16, 0);
  std::vector<float> output_data2(inference_options.batch * 2 * 16 * 16, 0);
  paddle::memory::Copy(phi::CPUPlace(),
                       output_data1.data(),
                       place,
                       output.data<float>(),
                       sizeof(float) * output_data1.size(),
                       context.stream());
  paddle::memory::Copy(phi::CPUPlace(),
                       output_data2.data(),
                       place,
                       output2.data<float>(),
                       sizeof(float) * output_data2.size(),
                       context.stream());
  cudaStreamSynchronize(context.stream());

  for (size_t i = 0; i < host_data.size(); ++i) {
    int w = i % 16;
    int h = (i / 16) % 16;
    int c = i / (16 * 16) % 3;
    int n = i / (16 * 16 * 3);
    if (c == 0) {
      CHECK_NEAR(sigmoid(host_data[i]),
                 output_data1[n * 16 * 16 + h * 16 + w],
                 1e-5);
    } else {
      CHECK_NEAR(sigmoid(host_data[i]),
                 output_data2[n * 16 * 16 * 2 + (c - 1) * 16 * 16 + h * 16 + w],
                 1e-5);
    }
  }
}

}  // namespace tensorrt
}  // namespace backends
}  // namespace infrt
paddle/infrt/backends/tensorrt/trt_engine.cc  (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/infrt/backends/tensorrt/trt_engine.h"

#include <NvInferRuntime.h>
#include <NvInferRuntimeCommon.h>
#include "glog/logging.h"
#include "paddle/phi/backends/dynload/tensorrt.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/ddim.h"

namespace infrt {
namespace backends {
namespace tensorrt {

// The following two API are implemented in TensorRT's header file, cannot load
// from the dynamic library. So create our own implementation and directly
// trigger the method from the dynamic library.
static nvinfer1::IBuilder* createInferBuilder(
    nvinfer1::ILogger& logger) {  // NOLINT
  return static_cast<nvinfer1::IBuilder*>(
      phi::dynload::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
}
static nvinfer1::IRuntime* createInferRuntime(
    nvinfer1::ILogger& logger) {  // NOLINT
  return static_cast<nvinfer1::IRuntime*>(
      phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
}

TRTEngine::TRTEngine(int device_id) : device_id_(device_id) {
  FreshDeviceId();
  logger_.reset(new TrtLogger());
  builder_.reset(createInferBuilder(logger_->GetTrtLogger()));
  phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), "");
}

nvinfer1::IBuilder* TRTEngine::GetTrtBuilder() {
  CHECK_NOTNULL(builder_);
  return builder_.get();
}

void TRTEngine::Build(TrtUniquePtr<nvinfer1::INetworkDefinition> network,
                      const BuildOptions& build_options) {
  FreshDeviceId();
  ModelToBuildEnv(std::move(network), build_options);
  CHECK_NOTNULL(engine_);
}

bool TRTEngine::ModelToBuildEnv(
    TrtUniquePtr<nvinfer1::INetworkDefinition> network,
    const BuildOptions& build) {
  CHECK_NOTNULL(builder_);
  std::swap(network, network_);
  CHECK_NOTNULL(network_);
  // ModelToNetwork(network_, logger);
  NetworkToEngine(build);
  return true;
}

bool TRTEngine::NetworkToEngine(const BuildOptions& build) {
  TrtUniquePtr<IBuilderConfig> config{builder_->createBuilderConfig()};
  CHECK_NOTNULL(config);
  CHECK(SetupNetworkAndConfig(build, *network_, *config));

#if IS_TRT_VERSION_LT(8000)
  engine_.reset(builder_->buildEngineWithConfig(*network_, *config));
#else
  serialized_engine_.reset(
      builder_->buildSerializedNetwork(*network_, *config));
  CHECK_NOTNULL(serialized_engine_);

  TrtUniquePtr<IRuntime> runtime{createInferRuntime(logger_->GetTrtLogger())};
  CHECK_NOTNULL(runtime);
  engine_.reset(runtime->deserializeCudaEngine(serialized_engine_->data(),
                                               serialized_engine_->size()));
  CHECK_NOTNULL(engine_);
#endif
  return true;
}

bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build,
                                      INetworkDefinition& network,
                                      IBuilderConfig& config) {
  builder_->setMaxBatchSize(build.max_batch);
  // TODO(wilber): handle one engine - multi execution context case.
  IOptimizationProfile* profile{nullptr};
  if (!build.shapes.empty()) {
    profile = builder_->createOptimizationProfile();
    CHECK_NOTNULL(profile);
  }

  // Set formats and data types of inputs
  for (int32_t i = 0; i < network.getNbInputs(); ++i) {
    auto* input = network.getInput(i);
    if (!build.input_formats.empty()) {
      input->setType(build.input_formats[i].first);
      input->setAllowedFormats(build.input_formats[i].second);
    } else {
      switch (input->getType()) {
        case DataType::kINT32:
        case DataType::kBOOL:
        case DataType::kHALF:
          // Leave these as is.
          break;
        case DataType::kFLOAT:
        case DataType::kINT8:
          // User did not specify a floating-point format. Default to kFLOAT.
          input->setType(DataType::kFLOAT);
          break;
      }
      input->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
    }

    if (profile) {
      Dims dims = input->getDimensions();
      // TODO(wilber): shape tensor.
      const bool is_dynamic_input = std::any_of(
          dims.d, dims.d + dims.nbDims, [](int dim) { return dim == -1; });
      if (is_dynamic_input) {
        is_dynamic_shape_ = true;
        auto shape = build.shapes.find(input->getName());

        // If no shape is provided
        if (shape == build.shapes.end()) {
          // TODO(wilber): add infomation.
          CHECK(false);
        }
        LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode.";
        std::vector<int> profile_dims{};
        profile_dims =
            shape->second[static_cast<size_t>(OptProfileSelector::kMIN)];
        CHECK(profile->setDimensions(input->getName(),
                                     OptProfileSelector::kMIN,
                                     VecToDims(profile_dims)));
        profile_dims =
            shape->second[static_cast<size_t>(OptProfileSelector::kOPT)];
        CHECK(profile->setDimensions(input->getName(),
                                     OptProfileSelector::kOPT,
                                     VecToDims(profile_dims)));
        profile_dims =
            shape->second[static_cast<size_t>(OptProfileSelector::kMAX)];
        CHECK(profile->setDimensions(input->getName(),
                                     OptProfileSelector::kMAX,
                                     VecToDims(profile_dims)));
      }
    }
  }

  if (profile && is_dynamic_shape_) {
    CHECK(profile->isValid());  // Required optimization profile is invalid
    CHECK_NE(config.addOptimizationProfile(profile), -1);
  }

  // Set formats and data types of outputs
  for (int32_t i = 0, n = network.getNbOutputs(); i < n; i++) {
    auto* output = network.getOutput(i);
    if (!build.output_formats.empty()) {
      // int outputFormatIndex = broadcastOutputFormats ? 0 : i;
      output->setType(build.output_formats[i].first);
      output->setAllowedFormats(build.output_formats[i].second);
    } else {
      output->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
    }
  }

  config.setMaxWorkspaceSize(static_cast<size_t>(build.workspace) << 20);

  if (build.fp16) {
    config.setFlag(BuilderFlag::kFP16);
    bool support_fp16 = builder_->platformHasFastFp16();
    if (support_fp16) {
      LOG(INFO) << "Run INFRT-TRT FP16 mode";
    } else {
      LOG(INFO) << "You specify FP16 mode, but the hardware do not support "
                   "FP16 speed up, use FP32 instead.";
    }
  }

  if (build.tf32) {
    config.setFlag(BuilderFlag::kTF32);
    bool support_tf32 = builder_->platformHasTf32();
    if (support_tf32) {
      LOG(INFO) << "Run INFRT-TRT TF32 mode";
    } else {
      LOG(INFO) << "You specify TF32 mode, but the hardware do not support "
                   "TF32 speed up, use FP32 instead.";
    }
  }

  // TODO(wilber): other precision.

  // TODO(wilber): precision config.
  switch (build.precision_constraints) {
    case PrecisionConstraints::kNONE:
      // It's the default for TensorRT.
      break;
    case PrecisionConstraints::kOBEY:
      config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS);
      break;
    case PrecisionConstraints::kPREFER:
      config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
      break;
  }

  // TODO(TRT): DLA config.
  // TODO(TRT): int8 config.
  // TODO(TRT): support int8
  if (build.int8) {
    assert(false);
    config.setFlag(BuilderFlag::kINT8);
    bool support_int8 = builder_->platformHasFastInt8();
    if (support_int8) {
      LOG(INFO) << "Run INFRT-TRT FP16 mode";
    }
  }

  // TODO(TRT): calib config.
  // TODO(TRT): sparse config.

  return true;
}

bool TRTEngine::SetUpInference(
    const InferenceOptions& inference,
    const std::unordered_map<std::string, phi::DenseTensor*>& inputs,
    std::unordered_map<std::string, phi::DenseTensor*>* outputs) {
  // TODO(wilber): now only create one exec_context
  FreshDeviceId();
  CHECK(engine_ != nullptr);
  nvinfer1::IExecutionContext* ec = engine_->createExecutionContext();
  CHECK(ec != nullptr);
  contexts_.emplace_back(ec);
  bindings_.emplace_back(new Bindings());

  for (const auto& it : inputs) {
    const int bind_index = engine_->getBindingIndex(it.first.c_str());
    bindings_.front()->AddBinding(
        bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT);
  }
  for (auto& it : *outputs) {
    const int bind_index = engine_->getBindingIndex(it.first.c_str());
    bindings_.front()->AddBinding(
        bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT);
  }

  return true;
}

void TRTEngine::Run(const phi::GPUContext& ctx) {
  if (is_dynamic_shape_) {
    DynamicRun(ctx);
  } else {
    StaticRun(ctx);
  }
}

void TRTEngine::StaticRun(const phi::GPUContext& ctx) {
  const int num_bindings = engine_->getNbBindings();
  std::vector<void*> buffers(num_bindings, nullptr);

  int runtime_batch = -1;
  auto input_binds = bindings_.front()->GetInputBindings();
  for (auto bind : input_binds) {
    const int bind_index = engine_->getBindingIndex(bind.name.c_str());
    buffers[bind_index] = const_cast<void*>(
        static_cast<const void*>(bind.buffer->data<float>()));
    if (runtime_batch != -1) {
      CHECK_EQ(runtime_batch, phi::vectorize<int64_t>(bind.buffer->dims())[0]);
    }
    runtime_batch = bind.buffer->dims()[0];
  }

  auto output_binds = bindings_.front()->GetOutputBindings();
  for (auto bind : output_binds) {
    const int bind_index = engine_->getBindingIndex(bind.name.c_str());
    std::vector<int32_t> ddim;
    auto dims = engine_->getBindingDimensions(bind_index);
    ddim.push_back(runtime_batch);
    for (int i = 0; i < dims.nbDims; ++i) {
      ddim.push_back(dims.d[i]);
    }
    bind.buffer->Resize(phi::make_ddim(ddim));
    ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
    buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
  }

  contexts_.front()->enqueue(
      runtime_batch, buffers.data(), ctx.stream(), nullptr);
}

void TRTEngine::DynamicRun(const phi::GPUContext& ctx) {
  const int num_bindings = engine_->getNbBindings();
  std::vector<void*> buffers(num_bindings, nullptr);

  auto input_binds = bindings_.front()->GetInputBindings();
  for (auto bind : input_binds) {
    const int bind_index = engine_->getBindingIndex(bind.name.c_str());
    buffers[bind_index] = const_cast<void*>(
        static_cast<const void*>(bind.buffer->data<float>()));
    nvinfer1::Dims trt_dims;
    trt_dims.nbDims = bind.buffer->dims().size();
    for (int i = 0; i < trt_dims.nbDims; ++i) {
      trt_dims.d[i] = bind.buffer->dims()[i];
    }
    contexts_.front()->setBindingDimensions(bind_index, trt_dims);
  }

  CHECK(contexts_.front()->allInputDimensionsSpecified());

  auto output_binds = bindings_.front()->GetOutputBindings();
  for (auto bind : output_binds) {
    const int bind_index = engine_->getBindingIndex(bind.name.c_str());
    auto dims = contexts_.front()->getBindingDimensions(bind_index);
    std::vector<int32_t> ddim(dims.nbDims);
    for (int i = 0; i < dims.nbDims; ++i) {
      ddim[i] = dims.d[i];
    }
    bind.buffer->Resize(phi::make_ddim(ddim));
    ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
    buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
  }

  contexts_.front()->enqueueV2(buffers.data(), ctx.stream(), nullptr);
}

void TRTEngine::FreshDeviceId() {
  int count;
  cudaGetDeviceCount(&count);
  CHECK_LT(device_id_, count);
  phi::backends::gpu::SetDeviceId(device_id_);
}

void TRTEngine::GetEngineInfo() {
#if IS_TRT_VERSION_GE(8200)
  LOG(INFO) << "====== engine info ======";
  std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
      engine_->createEngineInspector());
  infer_inspector->setExecutionContext(contexts_.front().get());
  LOG(INFO) << infer_inspector->getEngineInformation(
      nvinfer1::LayerInformationFormat::kONELINE);
  LOG(INFO) << "====== engine info end ======";
#else
  LOG(INFO) << "Inspector needs TensorRT version 8.2 and after.";
#endif
}

}  // namespace tensorrt
}  // namespace backends
}  // namespace infrt
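trt_engine.cc selects its build path with IS_TRT_VERSION_LT(8000) and gates the engine inspector with IS_TRT_VERSION_GE(8200); both macros come from trt_utils.h further down in this diff. A small sanity check of how the version value composes, taking TensorRT 8.2.1 as an assumed example version:

// With NV_TENSORRT_MAJOR=8, NV_TENSORRT_MINOR=2, NV_TENSORRT_PATCH=1 the composed
// value is 8*1000 + 2*100 + 1*10 + NV_TENSORRT_BUILD = 8210 + BUILD, so
// IS_TRT_VERSION_LT(8000) is false (the serialized-network path is taken) and
// IS_TRT_VERSION_GE(8200) is true (the IEngineInspector output is available).
static_assert(8 * 1000 + 2 * 100 + 1 * 10 >= 8200,
              "TensorRT 8.2.1 composes to 8210, which is >= 8200");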
paddle/infrt/backends/tensorrt/trt_engine.h  (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <NvInfer.h>
#include <NvInferRuntime.h>
#include "paddle/infrt/backends/tensorrt/trt_options.h"
#include "paddle/infrt/backends/tensorrt/trt_utils.h"
#include "paddle/phi/backends/dynload/tensorrt.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"

namespace infrt {
namespace backends {
namespace tensorrt {
using namespace nvinfer1;  // NOLINT

// The trt programing model as follows:
// 1. The build phase:
//    IBuilder* builder = createInferBuilder(&logger_);
// 2. Create a network definition:
//    INetworkDefinition* network = builder->createNetworkV2(...);
// 3. Build network:
//    network->AddLayer(...)
// 4. Configure network:
//    IBuilderConfig* config = builder->createBuilderConfig();
//    config->setMaxWorkspaceSize(...)
// 5. Get cuda engine and deserializing a plan:
//    IHostMemory* serialized_model = builder->buildSerializedNetwork(...);
//    IRuntime* runtime = createInferRuntime(&logger_);
//    ICudaEngine* engine = runtime->deserializeCudaEngine(...);
// 6. Get execution context:
//    IExecutionContext* exec_context = engine->createExecutionContext();
// 7. Set input data:
//    int32_t input_index = engine->getBindingIndex("input");
//    int32_t output_index = engine->getBindingIndex("output");
//    void* buffers[2];
//    buffers[input_index] = input_buffer;
//    buffers[output_index] = output_buffer;
// 8. Performance inference:
//    exec_context->enqueueV2(buffers, stream, nullptr);
//
// We have encapsulated this logic, please use the following programming model.
//
// TRTEngine trt_engine;
// trt_engine.Build(...);
// trt_engine.SetUpInference(...);
// trt_engine.Run(...);
class TRTEngine {
 public:
  explicit TRTEngine(int device_id);

  nvinfer1::IBuilder* GetTrtBuilder();

  // TODO(wilber): Modify signature after infrt-trt ready.
  void Build(TrtUniquePtr<nvinfer1::INetworkDefinition> network,
             const BuildOptions& build_options);

  // TODO(wilber): Modify signature after infrt-trt ready.
  void Run(const phi::GPUContext& ctx);

  // TODO(wilber): How to support multiple execution contexts?
  bool SetUpInference(
      const InferenceOptions& inference,
      const std::unordered_map<std::string, phi::DenseTensor*>& inputs,
      std::unordered_map<std::string, phi::DenseTensor*>* outputs);

  void GetEngineInfo();

 private:
  void FreshDeviceId();

  bool SetupNetworkAndConfig(const BuildOptions& build,
                             INetworkDefinition& network,  // NOLINT
                             IBuilderConfig& config);      // NOLINT

  bool NetworkToEngine(const BuildOptions& build);

  bool ModelToBuildEnv(TrtUniquePtr<nvinfer1::INetworkDefinition> network,
                       const BuildOptions& build);

  void StaticRun(const phi::GPUContext& ctx);

  void DynamicRun(const phi::GPUContext& ctx);

 private:
  std::unique_ptr<TrtLogger> logger_{nullptr};
  TrtUniquePtr<nvinfer1::IBuilder> builder_{nullptr};
  TrtUniquePtr<INetworkDefinition> network_{nullptr};
  std::unique_ptr<IHostMemory> serialized_engine_{nullptr};
  TrtUniquePtr<nvinfer1::ICudaEngine> engine_{nullptr};
  std::vector<TrtUniquePtr<nvinfer1::IExecutionContext>> contexts_;
  std::vector<std::unique_ptr<Bindings>> bindings_;
  int device_id_{0};
  bool is_dynamic_shape_{false};
};

}  // namespace tensorrt
}  // namespace backends
}  // namespace infrt
paddle/infrt/backends/tensorrt/trt_options.h  (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <string>
#include <unordered_map>
#include <vector>

#include <NvInfer.h>

namespace infrt {
namespace backends {
namespace tensorrt {

// Build default params
constexpr int32_t max_batch_not_provided{0};
constexpr int32_t default_workspace{16};

// Inference default params
constexpr int32_t default_batch{1};
constexpr int32_t batch_not_provided{0};

enum class PrecisionConstraints { kNONE, kOBEY, kPREFER };

enum class SparsityFlag { kDISABLE, kENABLE, kFORCE };

using ShapeRange =
    std::array<std::vector<int32_t>,
               nvinfer1::EnumMax<nvinfer1::OptProfileSelector>()>;

using IOFormat = std::pair<nvinfer1::DataType, nvinfer1::TensorFormats>;

struct BuildOptions {
  // Set max batch size.
  int32_t max_batch{max_batch_not_provided};

  // Set workspace size in megabytes (default = 16)
  int32_t workspace{default_workspace};

  // Enable tf32 precision, in addition to fp32 (default = disabled)
  bool tf32{false};

  // Enable fp16 precision, in addition to fp32 (default = disabled)
  bool fp16{false};

  // Enable int8 precision, in addition to fp32 (default = disabled)
  bool int8{false};

  // Control precision constraints. (default = none)
  // Precision Constaints: = none, obey, prefer
  //   none = no constraints
  //   prefer = meet precision constraints if possible
  //   obey = meet precision constraints or fail otherwise
  PrecisionConstraints precision_constraints{PrecisionConstraints::kNONE};

  // Save the serialized engine.
  bool save{false};

  // Load a serialized engine.
  bool load{false};

  // Build with dynamic shapes using a profile with the min, max and opt shapes
  // provided
  std::unordered_map<std::string, ShapeRange> shapes;

  // Type and format of each of the input tensors (default = all inputs in
  // fp32:chw)
  std::vector<IOFormat> input_formats;

  // Type and format of each of the output tensors (default = all outputs in
  // fp32:chw)
  std::vector<IOFormat> output_formats;
};

struct InferenceOptions {
  int32_t batch{batch_not_provided};
  std::unordered_map<std::string, std::vector<int32_t>> shapes;
};

}  // namespace tensorrt
}  // namespace backends
}  // namespace infrt
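BuildOptions::shapes keys each dynamic input by tensor name and stores its ShapeRange indexed by nvinfer1::OptProfileSelector (kMIN = 0, kOPT = 1, kMAX = 2). The sketch below repeats, in compact form, how the run_dynamic test earlier in this diff populates it; "model_input" is the test's tensor name and the shapes are the test's values.

#include "paddle/infrt/backends/tensorrt/trt_options.h"

infrt::backends::tensorrt::BuildOptions MakeDynamicShapeOptions() {
  infrt::backends::tensorrt::BuildOptions build_options;
  build_options.max_batch = 4;
  build_options.workspace = 32;  // in MB; shifted by <<20 in SetupNetworkAndConfig()
  // Indices 0/1/2 correspond to nvinfer1::OptProfileSelector::kMIN / kOPT / kMAX.
  build_options.shapes["model_input"][0] = {1, 3, 16, 16};  // minimum shape
  build_options.shapes["model_input"][1] = {2, 3, 28, 28};  // optimal shape
  build_options.shapes["model_input"][2] = {4, 3, 28, 28};  // maximum shape
  return build_options;
}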
paddle/infrt/backends/tensorrt/trt_utils.h  (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <cassert>
#include <functional>
#include <memory>
#include <unordered_map>

#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvInferRuntimeCommon.h>
#include "glog/logging.h"
#include "paddle/phi/core/dense_tensor.h"

namespace infrt {
namespace backends {
namespace tensorrt {

#define IS_TRT_VERSION_GE(version)                       \
  ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
    NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)

#define IS_TRT_VERSION_LT(version)                       \
  ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
    NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) < version)

#define TRT_VERSION                                    \
  NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
      NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD

inline nvinfer1::Dims VecToDims(const std::vector<int>& vec) {
  int limit = static_cast<int>(nvinfer1::Dims::MAX_DIMS);
  if (static_cast<int>(vec.size()) > limit) {
    assert(false);
  }
  // Pick first nvinfer1::Dims::MAX_DIMS elements
  nvinfer1::Dims dims{std::min(static_cast<int>(vec.size()), limit), {}};
  std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
  return dims;
}

template <typename T>
struct TrtDestroyer {
  void operator()(T* t) { t->destroy(); }
};

template <typename T>
using TrtUniquePtr = std::unique_ptr<T, TrtDestroyer<T>>;

class TrtLogger : public nvinfer1::ILogger {
 public:
  void log(nvinfer1::ILogger::Severity severity,
           const char* msg) noexcept override {
    switch (severity) {
      case Severity::kVERBOSE:
        VLOG(3) << msg;
        break;
      case Severity::kINFO:
        VLOG(2) << msg;
        break;
      case Severity::kWARNING:
        LOG(WARNING) << msg;
        break;
      case Severity::kINTERNAL_ERROR:
      case Severity::kERROR:
        LOG(ERROR) << msg;
        break;
      default:
        break;
    }
  }
  nvinfer1::ILogger& GetTrtLogger() noexcept { return *this; }
  ~TrtLogger() override = default;
};

struct Binding {
  bool is_input{false};
  nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT};
  phi::DenseTensor* buffer{nullptr};
  std::string name;
};

class Bindings {
 public:
  Bindings() = default;

  void AddBinding(int32_t b,
                  const std::string& name,
                  bool is_input,
                  phi::DenseTensor* buffer,
                  nvinfer1::DataType data_type) {
    while (bindings_.size() <= static_cast<size_t>(b)) {
      bindings_.emplace_back();
    }
    names_[name] = b;
    bindings_[b].buffer = buffer;
    bindings_[b].is_input = is_input;
    bindings_[b].data_type = data_type;
    bindings_[b].name = name;
  }

  std::vector<Binding> GetInputBindings() {
    return GetBindings([](const Binding& b) -> bool { return b.is_input; });
  }

  std::vector<Binding> GetOutputBindings() {
    return GetBindings([](const Binding& b) -> bool { return !b.is_input; });
  }

  std::vector<Binding> GetBindings() {
    return GetBindings([](const Binding& b) -> bool { return true; });
  }

  std::vector<Binding> GetBindings(
      std::function<bool(const Binding& b)> predicate) {
    std::vector<Binding> bindings;
    for (const auto& b : bindings_) {
      if (predicate(b)) {
        bindings.push_back(b);
      }
    }
    return bindings;
  }

 private:
  std::unordered_map<std::string, int32_t> names_;
  std::vector<Binding> bindings_;
};

}  // namespace tensorrt
}  // namespace backends
}  // namespace infrt
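Two small helpers from this header, shown in isolation: TrtUniquePtr releases TensorRT objects through their destroy() method (via TrtDestroyer) rather than delete, and VecToDims copies at most nvinfer1::Dims::MAX_DIMS entries from a shape vector. A hedged micro-example; the builder pointer is assumed to come from TRTEngine::GetTrtBuilder() or another valid source.

#include "paddle/infrt/backends/tensorrt/trt_utils.h"

void UtilsSketch(nvinfer1::IBuilder* builder) {
  using infrt::backends::tensorrt::TrtUniquePtr;
  using infrt::backends::tensorrt::VecToDims;

  // Owns the config and calls config->destroy() when it goes out of scope.
  TrtUniquePtr<nvinfer1::IBuilderConfig> config{builder->createBuilderConfig()};

  // Build an nvinfer1::Dims from a shape vector (nbDims = 4 here).
  nvinfer1::Dims dims = VecToDims({1, 3, 28, 28});
  (void)dims;
}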
paddle/infrt/kernel/phi/CMakeLists.txt
@@ -18,6 +18,10 @@ set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/gener
 add_custom_command(
   OUTPUT ${infrt_register_phi_kernels_gen_source_file}
   COMMAND sh ${infrt_register_phi_kernels_gen_file}
   DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
   VERBATIM)
 add_custom_target(
   infrt_register_phi_kernel
   COMMAND sh ${infrt_register_phi_kernels_gen_file}
   DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
   COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}"
   ...
paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc
@@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) {
   host_context::KernelRegistry registry;
   RegisterInferShapeLaunchers(&registry);
   ASSERT_GE(registry.size(), 1UL);
-  auto creator = registry.GetKernel("add.cpu.any.fp32");
+  auto creator = registry.GetKernel("pten.add.cpu.any.fp32");
   const phi::DDim dims({1, 2});
   const phi::DataType dtype{phi::DataType::FLOAT32};
tools/infrt/get_phi_kernel_info.py
@@ -219,7 +219,7 @@ def gen_register_info(resources: List[List[str]]):
         for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes):
             kernel_func = gen_kernel_func(update_item[3], ctx_name, origin_dtype)
-            ir_name = '.'.join([it.lower() for it in update_item[:3]]) + "." + ir_dtype
+            ir_name = 'pten.' + '.'.join([it.lower() for it in update_item[:3]]) + "." + ir_dtype
             res += f"""
 registry->AddKernel("{ir_name}","""