From 5a6182b80481542a81b1cfdc7ff3dcdbf1926d4e Mon Sep 17 00:00:00 2001
From: Wilber
Date: Thu, 14 Apr 2022 11:08:23 +0800
Subject: [PATCH] infrt run once (A trick version) (#41634)

* temporarily run once

* update

* update

* update

* update

* fix ci problem
---
 paddle/infrt/dialect/phi/ir/phi_base.cc       |  2 +-
 paddle/infrt/dialect/tensorrt/convert.h       |  2 +-
 paddle/infrt/dialect/tensorrt/trt_exec.cc     |  2 +-
 paddle/infrt/host_context/op_executable.cc    | 11 ++++-
 .../infrt/kernel/phi/dense_tensor_kernels.cc  | 47 +++++++++++++------
 .../infrt/kernel/phi/dense_tensor_kernels.h   |  7 +--
 paddle/infrt/kernel/tensor_kernels.cc         |  7 ++-
 paddle/infrt/kernel/tensorrt/trt_kernels.cc   |  2 +-
 paddle/infrt/kernel/tensorrt/trt_layers.h     | 21 +++++++++
 9 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc
index f91381fe72..1bd6068d3f 100644
--- a/paddle/infrt/dialect/phi/ir/phi_base.cc
+++ b/paddle/infrt/dialect/phi/ir/phi_base.cc
@@ -14,7 +14,7 @@

 #include "paddle/infrt/dialect/phi/ir/phi_base.h"

-#include
+#include
 #include
 #include
 #include
diff --git a/paddle/infrt/dialect/tensorrt/convert.h b/paddle/infrt/dialect/tensorrt/convert.h
index be363e7784..2a242ca285 100644
--- a/paddle/infrt/dialect/tensorrt/convert.h
+++ b/paddle/infrt/dialect/tensorrt/convert.h
@@ -15,7 +15,7 @@

 #include
 #include
-#include
+#include
 #include
 #include
 #include
diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc
index 2682a744bb..dcb84ceb50 100644
--- a/paddle/infrt/dialect/tensorrt/trt_exec.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc
@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
     std::cout << "\npass failed!\n" << std::endl;
     return 4;
   }
-  // module->dump();
+  module->dump();
   ::infrt::host_context::TestMlir(module.get(), &registry);
   return 0;
 }
diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc
index 59a73e7108..4d588a9c2b 100644
--- a/paddle/infrt/host_context/op_executable.cc
+++ b/paddle/infrt/host_context/op_executable.cc
@@ -16,6 +16,7 @@

 #include
 #include
+#include <unordered_set>

 #include "paddle/infrt/host_context/kernel_frame.h"
 #include "paddle/infrt/host_context/kernel_registry.h"
@@ -71,7 +72,15 @@
   // TODO(Superjomn) support other device other than CPU.
   CHECK(impl_->kernel_impl) << "No CPU kernel called " << op_name;

-  if (op_name == "dt.get_param") {
+  // TODO(wilber): Maybe we can use the MLIR trait or other facilities to remove
+  // the run_once set.
+  std::unordered_set<std::string> run_once_set{
+      "dt.get_param",
+      "trt.create_engine",
+      "phi_dt.create_host_inited_dense_tensor.f32",
+      "phi_dt.create_context.cpu",
+      "phi_dt.create_context.gpu"};
+  if (run_once_set.count(op_name)) {
     impl_->run_once = true;
   }
 }
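Note on the change above: `run_once` is what lets the executor skip re-executing setup ops (parameter loading, TensorRT engine creation, context creation) after the first pass over the graph. A minimal sketch of that idea, assuming a simplified executor loop; `Op`, `SimpleExecutor`, and `executed_once_` are illustrative stand-ins, not types from this patch:

#include <string>
#include <utility>
#include <vector>

// Illustrative stand-in for an executable op; the real OpExecutable
// carries a kernel frame, results, and so on.
struct Op {
  std::string name;
  bool run_once = false;  // set by OpExecutableBuilder, as in the patch
  void Execute() { /* invoke the kernel */ }
};

class SimpleExecutor {
 public:
  explicit SimpleExecutor(std::vector<Op> ops) : ops_(std::move(ops)) {}

  void Run() {
    for (auto& op : ops_) {
      // Setup ops flagged run_once execute only on the first pass.
      if (op.run_once && executed_once_) continue;
      op.Execute();
    }
    executed_once_ = true;
  }

 private:
  std::vector<Op> ops_;
  bool executed_once_ = false;
};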
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
index fe1cda0e10..7ffc8de151 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
@@ -22,6 +22,7 @@
 #include "paddle/infrt/tensor/tensor_map.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"

 #ifdef INFRT_WITH_GPU
 #include
@@ -308,34 +309,50 @@ inline size_t SizeOfDataType(::phi::DataType data_type) {
   }
   return 0;
 }
-::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
-                             const ::phi::GPUContext& context,
-                             bool d2h) {
+void GpuMemCpy(const ::phi::DenseTensor& input,
+               const ::phi::GPUContext& context,
+               bool d2h,
+               ::phi::DenseTensor* output) {
   if (d2h) {
-    ::phi::DenseTensor ret(
-        const_cast<::phi::Allocator*>(&context.GetHostAllocator()),
-        input.meta());
     CHECK(input.place().GetType() == ::phi::AllocationType::GPU);
-    // TODO(wilber): Add sync op and stream.
-    cudaMemcpyAsync(ret.data(),
+
+    // TODO(wilber): Just a trick to avoid malloc.
+    if (input.numel() > output->numel()) {
+      // TODO(wilber): Use pinned memory.
+      output->Resize(input.dims());
+      context.HostAlloc(
+          output, input.dtype(), input.numel() * SizeOfDataType(input.dtype()));
+    }
+
+    cudaMemcpyAsync(output->data(),
                     input.data(),
                     SizeOfDataType(input.dtype()) * input.numel(),
                     cudaMemcpyDeviceToHost,
-                    nullptr);
-    return ret;
+                    context.stream());
+    // TODO(wilber): Ir add sync op.
+    cudaStreamSynchronize(context.stream());
   } else {  // h2d
-    ::phi::DenseTensor ret(
-        const_cast<::phi::Allocator*>(&context.GetAllocator()), input.meta());
     CHECK(input.place().GetType() == ::phi::AllocationType::CPU ||
           input.place().GetType() == ::phi::AllocationType::GPUPINNED);
+
+    if (input.numel() > output->numel()) {
+      output->Resize(input.dims());
+      context.Alloc(output,
+                    input.dtype(),
+                    input.numel() * SizeOfDataType(input.dtype()),
+                    false);
+
+    } else {
+      output->Resize(input.dims());
+    }
+
     // TODO(wilber): Add sync op and stream.
-    cudaMemcpyAsync(ret.data(),
+    cudaMemcpyAsync(output->data(),
                     input.data(),
                     SizeOfDataType(input.dtype()) * input.numel(),
                     cudaMemcpyHostToDevice,
-                    nullptr);
-    return ret;
+                    context.stream());
   }
 }
 #endif
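The reworked `GpuMemCpy` takes its destination as an out-parameter, so a caller can keep one staging tensor alive across executions and the copy only reallocates when the incoming tensor outgrows it, which is the malloc-avoiding "trick" the TODOs refer to. A hedged usage sketch; `ctx`, `num_steps`, `RunModelStep`, and `Consume` are assumed surroundings, not part of this patch:

// One host-side staging tensor, reused across steps so HostAlloc
// only fires when the incoming tensor grows.
::phi::DenseTensor host_staging;
for (int step = 0; step < num_steps; ++step) {
  const ::phi::DenseTensor& gpu_out = RunModelStep(ctx, step);  // assumed helper
  // d2h = true copies device -> host on ctx's stream.
  GpuMemCpy(gpu_out, ctx, /*d2h=*/true, &host_staging);
  Consume(host_staging);  // assumed consumer
}

Note that the D2H path synchronizes on `context.stream()` before returning, so the staging tensor is safe to read immediately; the H2D path still relies on stream ordering.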
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
index b107544473..c401fb9997 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
@@ -76,9 +76,10 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor);
 int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map);

 #ifdef INFRT_WITH_GPU
-::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
-                             const ::phi::GPUContext& context,
-                             bool d2h);
+void GpuMemCpy(const ::phi::DenseTensor& input,
+               const ::phi::GPUContext& context,
+               bool d2h,
+               ::phi::DenseTensor* output);
 #endif

 }  // namespace phi
diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc
index 65e137472b..2e952e77d1 100644
--- a/paddle/infrt/kernel/tensor_kernels.cc
+++ b/paddle/infrt/kernel/tensor_kernels.cc
@@ -119,6 +119,7 @@ void NaiveMatmul(const DenseHostTensor &x,
   const int N = w.shape().GetDim(1);
   for (int i = 0; i < M; i++) {
     for (int j = 0; j < N; j++) {
+      out_data[i * N + j] = 0;
       for (int k = 0; k < K; k++) {
         out_data[i * N + j] += x_data[i * K + k] * w_data[k * N + j];
       }
@@ -134,9 +135,11 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) {
                       {"shape"});
   registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor));
   registry->AddKernel("dt.fill_tensor_with_constant.f32",
-                      INFRT_KERNEL(FillTensorWithConstant<float>));
+                      INFRT_KERNEL(FillTensorWithConstant<float>),
+                      {"value"});
   registry->AddKernel("dt.fill_tensor_with_constant.f64",
-                      INFRT_KERNEL(FillTensorWithConstant<double>));
+                      INFRT_KERNEL(FillTensorWithConstant<double>),
+                      {"value"});

   // TensorMap related methods.
   registry->AddKernel("dt.load_params", INFRT_KERNEL(LoadParams));
diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
index c182dda270..c0f5ebb4a7 100644
--- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc
+++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
@@ -57,7 +57,7 @@ namespace tensorrt {
   // TODO(wilber): The build option should be filled from mlir info.
   backends::tensorrt::BuildOptions options;
   options.max_batch = 4;
-  options.workspace = 1024;
+  options.workspace = 128;

   // Parse mlir Region which only has one block.
   mlir::Operation& operation = *create_engine_op.operation;
diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h
index 9d8eba0bb3..0f2c2c88ca 100644
--- a/paddle/infrt/kernel/tensorrt/trt_layers.h
+++ b/paddle/infrt/kernel/tensorrt/trt_layers.h
@@ -115,6 +115,27 @@ inline void PoolFunc(trt::PoolingOp& op,  // NOLINT
     // TODO(Inference)
     // CHECK(false) << "Not supported adaptive pool";

+    // TODO(wilber): Reformat.
+    // global average pooling.
+    auto ksize_vec = ArrayAttrToVec<int>(ksize);
+    if (static_cast<nvinfer1::PoolingType>(pool_type) ==
+            nvinfer1::PoolingType::kAVERAGE &&
+        ksize_vec.size() == 2 && ksize_vec[0] == 1 && ksize_vec[1] == 1) {
+      nvinfer1::Dims dims;
+      dims.nbDims = 2;
+      dims.d[0] = input_shape.d[1];
+      dims.d[1] = input_shape.d[2];
+      auto* layer = network->addPoolingNd(
+          *input_itensor, static_cast<nvinfer1::PoolingType>(pool_type), dims);
+      CHECK_NOTNULL(layer);
+
+      mlir::Value out_repr = op.output_tensor();
+      nvinfer1::ITensor* out_tensor = layer->getOutput(0);
+      value_to_trt_tensor_map[out_repr] = out_tensor;
+      return;
+    }
+
+    // plugin...
     std::vector<int> input_shape_v;
     for (int i = 0; i < input_dims; i++) {
       input_shape_v.push_back(input_shape.d[i]);
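On the global-average-pooling fast path added to `PoolFunc` above: an adaptive average pool with target output size [1, 1] is equivalent to an ordinary average pool whose window spans the whole spatial extent, which is why the kernel dims are taken from `input_shape.d[1]` and `input_shape.d[2]` (H and W in the batchless CHW layout used here). A standalone check of that equivalence, as a minimal sketch:

#include <cassert>
#include <vector>

// Global average pooling over one H x W plane is just the mean of all
// elements -- the same value a pooling window of {H, W} produces.
float GlobalAvgPool(const std::vector<float>& plane, int h, int w) {
  float sum = 0.f;
  for (int i = 0; i < h * w; ++i) sum += plane[i];
  return sum / static_cast<float>(h * w);
}

int main() {
  std::vector<float> plane{1, 2, 3, 4, 5, 6};  // 2 x 3 plane
  assert(GlobalAvgPool(plane, 2, 3) == 3.5f);  // (1+2+...+6) / 6
  return 0;
}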
-- 
GitLab
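A closing note on the `NaiveMatmul` fix in tensor_kernels.cc: the inner loop accumulates with `+=`, so without the added `out_data[i * N + j] = 0;` any stale contents of a reused output buffer leak into the product, which is exactly what run-once execution with reused tensors exposes. A distilled reproduction, independent of the infrt types:

#include <cstdio>
#include <vector>

// C (MxN) = A (MxK) * B (KxN). Resetting each slot before the k-loop is
// the fix from this patch; without it, calling twice on the same C
// doubles every entry.
void NaiveMatmul(const std::vector<float>& a, const std::vector<float>& b,
                 std::vector<float>* c, int m, int k, int n) {
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      (*c)[i * n + j] = 0;  // the added line
      for (int kk = 0; kk < k; kk++) {
        (*c)[i * n + j] += a[i * k + kk] * b[kk * n + j];
      }
    }
  }
}

int main() {
  std::vector<float> a{1, 2, 3, 4};  // 2x2
  std::vector<float> b{5, 6, 7, 8};  // 2x2
  std::vector<float> c(4, 0);
  NaiveMatmul(a, b, &c, 2, 2, 2);
  NaiveMatmul(a, b, &c, 2, 2, 2);  // second call yields the same result
  std::printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);  // 19 22 43 50
  return 0;
}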