From 5a6182b80481542a81b1cfdc7ff3dcdbf1926d4e Mon Sep 17 00:00:00 2001
From: Wilber
Date: Thu, 14 Apr 2022 11:08:23 +0800
Subject: [PATCH] infrt run once (A trick version) (#41634)

* temporarily run once

* update

* update

* update

* update

* fix ci problem
---
 paddle/infrt/dialect/phi/ir/phi_base.cc       |  2 +-
 paddle/infrt/dialect/tensorrt/convert.h       |  2 +-
 paddle/infrt/dialect/tensorrt/trt_exec.cc     |  2 +-
 paddle/infrt/host_context/op_executable.cc    | 11 ++++-
 .../infrt/kernel/phi/dense_tensor_kernels.cc  | 47 +++++++++++++------
 .../infrt/kernel/phi/dense_tensor_kernels.h   |  7 +--
 paddle/infrt/kernel/tensor_kernels.cc         |  7 ++-
 paddle/infrt/kernel/tensorrt/trt_kernels.cc   |  2 +-
 paddle/infrt/kernel/tensorrt/trt_layers.h     | 21 +++++++++
 9 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc
index f91381fe72..1bd6068d3f 100644
--- a/paddle/infrt/dialect/phi/ir/phi_base.cc
+++ b/paddle/infrt/dialect/phi/ir/phi_base.cc
@@ -14,7 +14,7 @@

 #include "paddle/infrt/dialect/phi/ir/phi_base.h"

-#include
+#include
 #include
 #include
 #include
diff --git a/paddle/infrt/dialect/tensorrt/convert.h b/paddle/infrt/dialect/tensorrt/convert.h
index be363e7784..2a242ca285 100644
--- a/paddle/infrt/dialect/tensorrt/convert.h
+++ b/paddle/infrt/dialect/tensorrt/convert.h
@@ -15,7 +15,7 @@

 #include
 #include
-#include
+#include
 #include
 #include
 #include
diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc
index 2682a744bb..dcb84ceb50 100644
--- a/paddle/infrt/dialect/tensorrt/trt_exec.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc
@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
     std::cout << "\npass failed!\n" << std::endl;
     return 4;
   }
-  // module->dump();
+  module->dump();
   ::infrt::host_context::TestMlir(module.get(), &registry);
   return 0;
 }
diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc
index 59a73e7108..4d588a9c2b 100644
--- a/paddle/infrt/host_context/op_executable.cc
+++ b/paddle/infrt/host_context/op_executable.cc
@@ -16,6 +16,7 @@

 #include
 #include
+#include <unordered_set>

 #include "paddle/infrt/host_context/kernel_frame.h"
 #include "paddle/infrt/host_context/kernel_registry.h"
@@ -71,7 +72,15 @@
   // TODO(Superjomn) support other device other than CPU.
   CHECK(impl_->kernel_impl) << "No CPU kernel called " << op_name;

-  if (op_name == "dt.get_param") {
+  // TODO(wilber): Maybe we can use the MLIR trait or other facilities to remove
+  // the run_once set.
+  std::unordered_set<std::string> run_once_set{
+      "dt.get_param",
+      "trt.create_engine",
+      "phi_dt.create_host_inited_dense_tensor.f32",
+      "phi_dt.create_context.cpu",
+      "phi_dt.create_context.gpu"};
+  if (run_once_set.count(op_name)) {
     impl_->run_once = true;
   }
 }
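Note on the change above: `run_once` is what lets the executor skip re-executing setup ops (parameter loading, TensorRT engine creation, context creation) after the first pass over the graph. A minimal sketch of that idea, assuming a simplified executor loop; `Op`, `SimpleExecutor`, and `executed_once_` are illustrative stand-ins, not types from this patch:

#include <string>
#include <utility>
#include <vector>

// Illustrative stand-in for an executable op; the real OpExecutable
// carries a kernel frame, results, and so on.
struct Op {
  std::string name;
  bool run_once = false;  // set by OpExecutableBuilder, as in the patch
  void Execute() { /* invoke the kernel */ }
};

class SimpleExecutor {
 public:
  explicit SimpleExecutor(std::vector<Op> ops) : ops_(std::move(ops)) {}

  void Run() {
    for (auto& op : ops_) {
      // Setup ops flagged run_once execute only on the first pass.
      if (op.run_once && executed_once_) continue;
      op.Execute();
    }
    executed_once_ = true;
  }

 private:
  std::vector<Op> ops_;
  bool executed_once_ = false;
};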
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
index fe1cda0e10..7ffc8de151 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
@@ -22,6 +22,7 @@
 #include "paddle/infrt/tensor/tensor_map.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"

 #ifdef INFRT_WITH_GPU
 #include
@@ -308,34 +309,50 @@ inline size_t SizeOfDataType(::phi::DataType data_type) {
   }
   return 0;
 }
-::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
-                             const ::phi::GPUContext& context,
-                             bool d2h) {
+void GpuMemCpy(const ::phi::DenseTensor& input,
+               const ::phi::GPUContext& context,
+               bool d2h,
+               ::phi::DenseTensor* output) {
   if (d2h) {
-    ::phi::DenseTensor ret(
-        const_cast<::phi::Allocator*>(&context.GetHostAllocator()),
-        input.meta());
     CHECK(input.place().GetType() == ::phi::AllocationType::GPU);
-    // TODO(wilber): Add sync op and stream.
-    cudaMemcpyAsync(ret.data(),
+
+    // TODO(wilber): Just a trick to avoid malloc.
+    if (input.numel() > output->numel()) {
+      // TODO(wilber): Use pinned memory.
+      output->Resize(input.dims());
+      context.HostAlloc(
+          output, input.dtype(), input.numel() * SizeOfDataType(input.dtype()));
+    }
+
+    cudaMemcpyAsync(output->data(),
                     input.data(),
                     SizeOfDataType(input.dtype()) * input.numel(),
                     cudaMemcpyDeviceToHost,
-                    nullptr);
-    return ret;
+                    context.stream());
+    // TODO(wilber): Ir add sync op.
+    cudaStreamSynchronize(context.stream());
   } else {  // h2d
-    ::phi::DenseTensor ret(
-        const_cast<::phi::Allocator*>(&context.GetAllocator()), input.meta());
     CHECK(input.place().GetType() == ::phi::AllocationType::CPU ||
           input.place().GetType() == ::phi::AllocationType::GPUPINNED);
+
+    if (input.numel() > output->numel()) {
+      output->Resize(input.dims());
+      context.Alloc(output,
+                    input.dtype(),
+                    input.numel() * SizeOfDataType(input.dtype()),
+                    false);
+
+    } else {
+      output->Resize(input.dims());
+    }
+
     // TODO(wilber): Add sync op and stream.
-    cudaMemcpyAsync(ret.data(),
+    cudaMemcpyAsync(output->data(),
                     input.data(),
                     SizeOfDataType(input.dtype()) * input.numel(),
                     cudaMemcpyHostToDevice,
-                    nullptr);
-    return ret;
+                    context.stream());
   }
 }
 #endif
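The reworked `GpuMemCpy` takes its destination as an out-parameter, so a caller can keep one staging tensor alive across executions and the copy only reallocates when the incoming tensor outgrows it, which is the malloc-avoiding "trick" the TODOs refer to. A hedged usage sketch; `ctx`, `num_steps`, `RunModelStep`, and `Consume` are assumed surroundings, not part of this patch:

// One host-side staging tensor, reused across steps so HostAlloc
// only fires when the incoming tensor grows.
::phi::DenseTensor host_staging;
for (int step = 0; step < num_steps; ++step) {
  const ::phi::DenseTensor& gpu_out = RunModelStep(ctx, step);  // assumed helper
  // d2h = true copies device -> host on ctx's stream.
  GpuMemCpy(gpu_out, ctx, /*d2h=*/true, &host_staging);
  Consume(host_staging);  // assumed consumer
}

Note that the D2H path synchronizes on `context.stream()` before returning, so the staging tensor is safe to read immediately; the H2D path still relies on stream ordering.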
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
index b107544473..c401fb9997 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
@@ -76,9 +76,10 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor);
 int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map);

 #ifdef INFRT_WITH_GPU
-::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
-                             const ::phi::GPUContext& context,
-                             bool d2h);
+void GpuMemCpy(const ::phi::DenseTensor& input,
+               const ::phi::GPUContext& context,
+               bool d2h,
+               ::phi::DenseTensor* output);
 #endif

 }  // namespace phi
diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc
index 65e137472b..2e952e77d1 100644
--- a/paddle/infrt/kernel/tensor_kernels.cc
+++ b/paddle/infrt/kernel/tensor_kernels.cc
@@ -119,6 +119,7 @@ void NaiveMatmul(const DenseHostTensor &x,
   const int N = w.shape().GetDim(1);
   for (int i = 0; i < M; i++) {
     for (int j = 0; j < N; j++) {
+      out_data[i * N + j] = 0;
       for (int k = 0; k < K; k++) {
         out_data[i * N + j] += x_data[i * K + k] * w_data[k * N + j];
       }
@@ -134,9 +135,11 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) {
                       {"shape"});
   registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor));
   registry->AddKernel("dt.fill_tensor_with_constant.f32",
-                      INFRT_KERNEL(FillTensorWithConstant<float>));
+                      INFRT_KERNEL(FillTensorWithConstant<float>),
+                      {"value"});
   registry->AddKernel("dt.fill_tensor_with_constant.f64",
-                      INFRT_KERNEL(FillTensorWithConstant<double>));
+                      INFRT_KERNEL(FillTensorWithConstant<double>),
+                      {"value"});

   // TensorMap related methods.
   registry->AddKernel("dt.load_params", INFRT_KERNEL(LoadParams));
diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
index c182dda270..c0f5ebb4a7 100644
--- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc
+++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
@@ -57,7 +57,7 @@ namespace tensorrt {
   // TODO(wilber): The build option should be filled from mlir info.
   backends::tensorrt::BuildOptions options;
   options.max_batch = 4;
-  options.workspace = 1024;
+  options.workspace = 128;

   // Parse mlir Region which only has one block.
   mlir::Operation& operation = *create_engine_op.operation;
diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h
index 9d8eba0bb3..0f2c2c88ca 100644
--- a/paddle/infrt/kernel/tensorrt/trt_layers.h
+++ b/paddle/infrt/kernel/tensorrt/trt_layers.h
@@ -115,6 +115,27 @@ inline void PoolFunc(trt::PoolingOp& op,  // NOLINT
     // TODO(Inference)
     // CHECK(false) << "Not supported adaptive pool";

+    // TODO(wilber): Reformat.
+    // global average pooling.
+    auto ksize_vec = ArrayAttrToVec<int>(ksize);
+    if (static_cast<nvinfer1::PoolingType>(pool_type) ==
+            nvinfer1::PoolingType::kAVERAGE &&
+        ksize_vec.size() == 2 && ksize_vec[0] == 1 && ksize_vec[1] == 1) {
+      nvinfer1::Dims dims;
+      dims.nbDims = 2;
+      dims.d[0] = input_shape.d[1];
+      dims.d[1] = input_shape.d[2];
+      auto* layer = network->addPoolingNd(
+          *input_itensor, static_cast<nvinfer1::PoolingType>(pool_type), dims);
+      CHECK_NOTNULL(layer);
+
+      mlir::Value out_repr = op.output_tensor();
+      nvinfer1::ITensor* out_tensor = layer->getOutput(0);
+      value_to_trt_tensor_map[out_repr] = out_tensor;
+      return;
+    }
+
+    // plugin...
     std::vector<int> input_shape_v;
     for (int i = 0; i < input_dims; i++) {
       input_shape_v.push_back(input_shape.d[i]);
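On the global-average-pooling fast path added to `PoolFunc` above: an adaptive average pool with target output size [1, 1] is equivalent to an ordinary average pool whose window spans the whole spatial extent, which is why the kernel dims are taken from `input_shape.d[1]` and `input_shape.d[2]` (H and W in the batchless CHW layout used here). A standalone check of that equivalence, as a minimal sketch:

#include <cassert>
#include <vector>

// Global average pooling over one H x W plane is just the mean of all
// elements -- the same value a pooling window of {H, W} produces.
float GlobalAvgPool(const std::vector<float>& plane, int h, int w) {
  float sum = 0.f;
  for (int i = 0; i < h * w; ++i) sum += plane[i];
  return sum / static_cast<float>(h * w);
}

int main() {
  std::vector<float> plane{1, 2, 3, 4, 5, 6};  // 2 x 3 plane
  assert(GlobalAvgPool(plane, 2, 3) == 3.5f);  // (1+2+...+6) / 6
  return 0;
}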
-- 
GitLab
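A closing note on the `NaiveMatmul` fix in tensor_kernels.cc: the inner loop accumulates with `+=`, so without the added `out_data[i * N + j] = 0;` any stale contents of a reused output buffer leak into the product, which is exactly what run-once execution with reused tensors exposes. A distilled reproduction, independent of the infrt types:

#include <cstdio>
#include <vector>

// C (MxN) = A (MxK) * B (KxN). Resetting each slot before the k-loop is
// the fix from this patch; without it, calling twice on the same C
// doubles every entry.
void NaiveMatmul(const std::vector<float>& a, const std::vector<float>& b,
                 std::vector<float>* c, int m, int k, int n) {
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      (*c)[i * n + j] = 0;  // the added line
      for (int kk = 0; kk < k; kk++) {
        (*c)[i * n + j] += a[i * k + kk] * b[kk * n + j];
      }
    }
  }
}

int main() {
  std::vector<float> a{1, 2, 3, 4};  // 2x2
  std::vector<float> b{5, 6, 7, 8};  // 2x2
  std::vector<float> c(4, 0);
  NaiveMatmul(a, b, &c, 2, 2, 2);
  NaiveMatmul(a, b, &c, 2, 2, 2);  // second call yields the same result
  std::printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);  // 19 22 43 50
  return 0;
}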