From aa67c292e612ad99844a4da0aa2ff9bf9a38fc2a Mon Sep 17 00:00:00 2001
From: huzhiqiang <912790387@qq.com>
Date: Tue, 19 Apr 2022 15:27:04 +0800
Subject: [PATCH] [infrt] support resnet50 on gpu backend (#41473)

---
 paddle/infrt/api/infrt_api.cc                      |  23 +++-
 paddle/infrt/api/infrt_api.h                       |   4 +
 paddle/infrt/api/infrt_api_test.cc.in              | 114 ++++++++++++++++++
 paddle/infrt/backends/host/phi_allocator.h         |   9 +-
 paddle/infrt/dialect/init_dialects.cc              |   3 +-
 .../infrt/dialect/phi/ir/infrt_phi_tensor.td       |  15 +++
 .../dialect/phi/pass/phi_op_convert_pass.cc        |  75 ++++++++++--
 .../infrt/kernel/phi/dense_tensor_kernels.cc       |  39 +++++-
 .../infrt/kernel/phi/dense_tensor_kernels.h        |   3 +
 paddle/infrt/kernel/phi/registry.cc                |   3 +
 paddle/infrt/tests/dialect/phi/phi_pass.mlir       |  16 +--
 11 files changed, 269 insertions(+), 35 deletions(-)

diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc
index ec6293f9a2b..f0bf46567a5 100644
--- a/paddle/infrt/api/infrt_api.cc
+++ b/paddle/infrt/api/infrt_api.cc
@@ -270,6 +270,12 @@ int InfRtPredictor::Init(const InfRtConfig& config) {
         {::infrt::TargetType::CPU,
          ::infrt::PrecisionType::FLOAT32,
          ::infrt::LayoutType::NCHW}};
+    if (config.gpu_enabled()) {
+      valid_places.insert(valid_places.begin(),
+                          ::infrt::Place(::infrt::TargetType::GPU,
+                                         ::infrt::PrecisionType::FLOAT32,
+                                         ::infrt::LayoutType::NCHW));
+    }
     pass_manager.addPass(CreatePhiOpCvtPass(valid_places));
     pass_manager.addPass(CreateInfrtOpFusePass());
   }
@@ -300,12 +306,19 @@ int InfRtPredictor::Init(const InfRtConfig& config) {
   }
 
   // Load params
-  auto tensor_map = ::infrt::kernel::phi::LoadCombinedParameters(
-      config.model_dir(), config.param_dir());
+  if (config.gpu_enabled() && !config.tensorrt_enabled()) {
+    auto tensor_map = ::infrt::kernel::phi::LoadCombinedParamsToGpu(
+        config.model_dir(), config.param_dir());
+    impl_->executor.reset(
+        new PredictExecutor(module_op, registry, std::move(tensor_map)));
+
+  } else {
+    auto tensor_map = ::infrt::kernel::phi::LoadCombinedParameters(
+        config.model_dir(), config.param_dir());
+    impl_->executor.reset(
+        new PredictExecutor(module_op, registry, std::move(tensor_map)));
+  }
 
-  // Create PredictExecutor
-  impl_->executor.reset(
-      new PredictExecutor(module_op, registry, std::move(tensor_map)));
   return 0;
 }
 
diff --git a/paddle/infrt/api/infrt_api.h b/paddle/infrt/api/infrt_api.h
index 231f496bb89..fcaed78bdd9 100644
--- a/paddle/infrt/api/infrt_api.h
+++ b/paddle/infrt/api/infrt_api.h
@@ -27,6 +27,7 @@ class InfRtConfig {
   std::vector<std::string> shared_libs_;
 
   // TODO(wilber): Design an easy-to-use interface.
+  bool gpu_enabled_{false};
   bool tensorrt_enabled_{false};
 
  public:
@@ -42,6 +43,9 @@ class InfRtConfig {
   }
   const std::vector<std::string>& shared_libs() const { return shared_libs_; }
 
+  void enable_gpu() { gpu_enabled_ = true; }
+  bool gpu_enabled() const { return gpu_enabled_; }
+
   // TODO(wilber): Design an easy-to-use interface.
   void enable_tensorrt() { tensorrt_enabled_ = true; }
   void disable_tensorrt() { tensorrt_enabled_ = false; }
 
diff --git a/paddle/infrt/api/infrt_api_test.cc.in b/paddle/infrt/api/infrt_api_test.cc.in
index 13635ddaaab..f7d1c97603c 100644
--- a/paddle/infrt/api/infrt_api_test.cc.in
+++ b/paddle/infrt/api/infrt_api_test.cc.in
@@ -57,6 +57,57 @@ TEST(InfRtPredictor, predictor) {
   ASSERT_EQ(output->dims(), ::phi::DDim({16, 10}));
 }
 
+TEST(InfRtPredictor, cpu_predictor) {
+  std::vector<std::string> shared_libs;
+
+  InfRtConfig config;
+
+  config.set_model_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdmodel");
+  config.set_param_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdiparams");
+
+  std::unique_ptr<InfRtPredictor> predictor = CreateInfRtPredictor(config);
+
+  ::infrt::backends::CpuPhiAllocator cpu_allocator;
+  ::phi::DenseTensor* input = predictor->GetInput(0);
+  input->Resize({2, 3, 256, 256});
+  input->AllocateFrom(&cpu_allocator, ::phi::DataType::FLOAT32);
+  auto* input_data = reinterpret_cast<float*>(input->data());
+  for (int i = 0; i < input->numel(); i++) input_data[i] = 1.0;
+
+  for(int i = 0; i < 10; i++) {
+    predictor->Run();
+  }
+  auto start = std::chrono::steady_clock::now();
+  for(int i = 0; i < 10; i++) {
+    predictor->Run();
+  }
+  auto end = std::chrono::steady_clock::now();
+  auto msec = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
+  std::cout <<"One predict period costs " << msec.count()/1000 << "ms.\n";
+
+  // get and print output tensor
+  auto* output = predictor->GetOutput(0);
+
+  ASSERT_EQ(output->dims(), ::phi::DDim({2, 1000}));
+  const std::vector<float> true_vals {
+    -3.319006264209747314e-01, -1.418896913528442383e+00,
+    -6.934890151023864746e-01, -1.498023152351379395e+00,
+    3.078042864799499512e-01, -1.340998053550720215e+00,
+    3.508620023727416992e+00, 2.274388313293457031e+00,
+    -1.321727275848388672e+00, -8.888689428567886353e-02,
+    -3.319006264209747314e-01, -1.418896913528442383e+00,
+    -6.934890151023864746e-01, -1.498023152351379395e+00,
+    3.078042864799499512e-01, -1.340998053550720215e+00,
+    3.508620023727416992e+00, 2.274388313293457031e+00,
+    -1.321727275848388672e+00, -8.888689428567886353e-02
+  };
+
+  for (size_t i = 0; i < true_vals.size(); i+=100) {
+    CHECK_NEAR(output->data<float>()[i*100], true_vals[i], 1e-5);
+  }
+}
+
+
 #ifdef INFRT_WITH_TRT
 TEST(InfRtPredictor, trt_predictor) {
   std::vector<std::string> shared_libs;
@@ -100,4 +151,67 @@
 }
 #endif
 
+#ifdef INFRT_WITH_GPU
+TEST(InfRtPredictor, gpu_predictor) {
+  std::vector<std::string> shared_libs;
+
+  InfRtConfig config;
+  config.enable_gpu();
+
+  config.set_model_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdmodel");
+  config.set_param_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdiparams");
+
+  std::unique_ptr<InfRtPredictor> predictor = CreateInfRtPredictor(config);
+
+  ::infrt::backends::GpuPhiAllocator gpu_allocator;
+
+
+  ::phi::DenseTensor* input = predictor->GetInput(0);
+  input->Resize({2, 3, 256, 256});
+  input->AllocateFrom(&gpu_allocator, ::phi::DataType::FLOAT32);
+  auto* data = reinterpret_cast<float*>(input->data());
+
+  std::vector<float> input_data(2 * 3 * 256 * 256, 1.0);
+  cudaMemcpy(data,
+             input_data.data(),
+             sizeof(float) * input->numel(),
+             cudaMemcpyHostToDevice);
+
+  for(int i = 0; i < 10; i++) {
+    predictor->Run();
+  }
+  auto start = std::chrono::steady_clock::now();
+  for(int i = 0; i < 1000; i++) {
+    predictor->Run();
+  }
+  auto end = std::chrono::steady_clock::now();
+  auto msec = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
+  std::cout <<"One predict period costs " << msec.count()/1000 << "ms.\n";
+
+  auto* output = predictor->GetOutput(0);
+
+  std::vector<float> output_data(output->numel());
+  cudaMemcpy(output_data.data(),
+             output->data(),
+             sizeof(float) * output->numel(),
+             cudaMemcpyDeviceToHost);
+
+  ASSERT_EQ(output->dims(), ::phi::DDim({2, 1000}));
+  const std::vector<float> true_vals {
+    -3.319006264209747314e-01, -1.418896913528442383e+00,
+    -6.934890151023864746e-01, -1.498023152351379395e+00,
+    3.078042864799499512e-01, -1.340998053550720215e+00,
+    3.508620023727416992e+00, 2.274388313293457031e+00,
+    -1.321727275848388672e+00, -8.888689428567886353e-02,
+    -3.319006264209747314e-01, -1.418896913528442383e+00,
+    -6.934890151023864746e-01, -1.498023152351379395e+00,
+    3.078042864799499512e-01, -1.340998053550720215e+00,
+    3.508620023727416992e+00, 2.274388313293457031e+00,
+    -1.321727275848388672e+00, -8.888689428567886353e-02
+  };
+  for (size_t i = 0; i < true_vals.size(); i+=100) {
+    CHECK_NEAR(output_data[i*100], true_vals[i], 1e-5);
+  }
+}
+#endif
+
 }  // namespace infrt
diff --git a/paddle/infrt/backends/host/phi_allocator.h b/paddle/infrt/backends/host/phi_allocator.h
index 6e3bef92991..810c79509e7 100644
--- a/paddle/infrt/backends/host/phi_allocator.h
+++ b/paddle/infrt/backends/host/phi_allocator.h
@@ -11,6 +11,7 @@ limitations under the License. */
 #pragma once
 
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/phi/core/allocator.h"
 
 #ifdef INFRT_WITH_GPU
@@ -40,12 +41,8 @@ class GpuPhiAllocator : public phi::Allocator {
   static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); }
 
   AllocationPtr Allocate(size_t bytes_size) {
-    void* ptr;
-    cudaMalloc(&ptr, bytes_size);
-    return AllocationPtr(
-        new phi::Allocation(
-            ptr, bytes_size, phi::Place(phi::AllocationType::GPU)),
-        deleter);
+    return paddle::memory::Alloc(phi::Place(phi::AllocationType::GPU),
+                                 bytes_size);
   }
 };
 #endif
diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc
index 56c375c72d2..8da34bd404b 100644
--- a/paddle/infrt/dialect/init_dialects.cc
+++ b/paddle/infrt/dialect/init_dialects.cc
@@ -34,9 +34,8 @@ void registerCinnDialects(mlir::DialectRegistry &registry) {  // NOLINT
                   InfrtDialect,
                   dt::DTDialect,
                   pd::PaddleDialect,
-                  trt::TensorRTDialect
+                  trt::TensorRTDialect,
 #ifdef INFRT_WITH_PHI
-                  ,
                   phi::PHIDenseTensorDialect,
                   phi::PHICPUKernelDialect,
                   phi::PHIGPUKernelDialect,
diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td
index 2078ebb1442..7e612be05b1 100644
--- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td
+++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td
@@ -40,6 +40,13 @@ def CreateHostInitedDenseTensorOp : PDT_Op<"create_host_inited_dense_tensor.f32"
   let results = (outs DenseTensor:$output);
 }
 
+def CreateInitedGpuFLOAT32DenseTensorOp
+    : PDT_Op<"create_inited_dense_tensor.gpu.f32", [NoSideEffect]> {
+  let arguments = (ins Context:$context, I64ArrayAttr:$dims,
+                       LayoutAttr:$layout, I64ArrayAttr:$lod, F32Attr:$value);
+  let results = (outs DenseTensor:$output);
+}
+
 def CreateInitedCpuFLOAT32DenseTensorOp
     : PDT_Op<"create_inited_dense_tensor.cpu.f32", [NoSideEffect]> {
   let arguments = (ins Context:$context, I64ArrayAttr:$dims,
@@ -86,6 +93,14 @@ def PDT_LoadCombinedParamsOp : PDT_Op<"load_combined_params", [NoSideEffect]> {
   let assemblyFormat = "`(``)`attr-dict";
 }
 
+def PDT_LoadCombinedParamsGpuOp : PDT_Op<"load_combined_params_to_gpu", [NoSideEffect]> {
+  // input path of model params.
+  let arguments = (ins StrAttr:$model_path, StrAttr:$params_path);
+  let results = (outs PD_DenseTensorMap:$out);
+
+  let assemblyFormat = "`(``)`attr-dict";
+}
+
 def PDT_TensorMapGetSizeOp : PDT_Op<"tensor_map_get_size", [NoSideEffect]> {
   let arguments = (ins PD_DenseTensorMap:$map);
   let results = (outs I32:$size);
diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc
index e9b426a5088..4bf39d4f660 100644
--- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc
+++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc
@@ -76,6 +76,7 @@ class PhiOpConvertPass
   void getDependentDialects(mlir::DialectRegistry &registry) const override;
 
  private:
+  void updateInputsAndResults(infrt::TargetType target);
   void convertStage();
   void dispatchStage();
 
@@ -110,10 +111,50 @@ mlir::LogicalResult PhiOpConvertPass::initialize(mlir::MLIRContext *context) {
 
 // Implementation of the PhiOpConvertPass.
 void PhiOpConvertPass::runOnFunction() {
+  updateInputsAndResults(valid_places_[0].target);
   convertStage();
   dispatchStage();
 }
 
+void PhiOpConvertPass::updateInputsAndResults(infrt::TargetType target) {
+  mlir::Block &body = getFunction().front();
+  auto loc = getFunction().getLoc();
+  mlir::Operation &operation = body.front();
+  mlir::MLIRContext *context = operation.getContext();
+  size_t num_input = body.getNumArguments();
+
+  // step1. update input cpu tensors into gpu tensors
+  for (size_t index = 0; index < num_input; index++) {
+    auto argument = body.getArgument(index);
+    if (auto t = argument.getType().dyn_cast<::infrt::DenseTensorType>()) {
+      mlir::Type replace_type = infrt::DenseTensorType::get(
+          context, target, t.getPrecision(), infrt::LayoutType::NCHW);
+      getFunction().insertArgument(index, replace_type, {}, loc);
+      argument.replaceAllUsesWith(getFunction().getArgument(index));
+      getFunction().eraseArgument(index + 1);
+    }
+  }
+  // update output tensors
+  unsigned int num_result = getFunction().getNumResults();
+  for (unsigned int index = 0; index < num_result; index++) {
+    mlir::Type replace_type =
+        infrt::DenseTensorType::get(context,
+                                    target,
+                                    infrt::PrecisionType::FLOAT32,
+                                    infrt::LayoutType::NCHW);
+    getFunction().eraseResult(index);
+    getFunction().insertResult(index, replace_type, {});
+  }
+  // update dense_tensor_map
+  mlir::Type replace_type = infrt::DenseTensorType::get(
+      context, target, infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW);
+
+  for (auto &op : body.without_terminator()) {
+    if (op.getName().getIdentifier().str() == "phi_dt.tensor_map_get_tensor")
+      op.getResult(0).setType(replace_type);
+  }
+}
+
 void PhiOpConvertPass::convertStage() {
   mlir::Block &body = getFunction().front();
   std::vector<mlir::Operation *> worklist;
@@ -200,6 +241,7 @@ void PhiOpConvertPass::dispatchStage() {
 
   mlir::OpBuilder builder(&block, block.begin());
   std::map<infrt::TargetType, mlir::Value> phi_context;
+
   for (infrt::KernelOp kernel_op : worklist) {
     std::string kernel_name = kernel_op.name().str();
     std::vector<infrt::PhiKernelDesc> candidates =
@@ -257,15 +299,25 @@ void PhiOpConvertPass::dispatchStage() {
     for (size_t index = 0; index < phi_kernel_desc.input_types.size();
          ++index) {
       mlir::Value input = kernel_op.getOperand(index);
-      auto cvt_tensor_type_op = builder.create<infrt::TensorCastOp>(
-          kernel_op.getLoc(),
-          infrt::DenseTensorType::get(
-              kernel_op.getContext(),
-              phi_kernel_desc.input_types[index].target,
-              phi_kernel_desc.input_types[index].precision,
-              phi_kernel_desc.input_types[index].layout),
-          input);
-      operation_state.addOperands(cvt_tensor_type_op.output());
+      if (input.getType().dyn_cast<::infrt::DenseTensorType>().getTarget() ==
+              ::infrt::TargetType::CPU &&
+          phi_kernel_desc.input_types[index].target ==
+              ::infrt::TargetType::GPU) {
+        auto cvt_tensor_type_op = builder.create<::infrt::phi::GpuMemCopyOp>(
+            kernel_op.getLoc(),
+            infrt::DenseTensorType::get(
+                kernel_op.getContext(),
+                phi_kernel_desc.input_types[index].target,
+                phi_kernel_desc.input_types[index].precision,
+                phi_kernel_desc.input_types[index].layout),
+            input,
+            phi_context[infrt::TargetType::GPU],
+            mlir::BoolAttr::get(kernel_op.getContext(), /*d2h*/ false));
+
+        operation_state.addOperands(cvt_tensor_type_op.output());
+      } else {
+        operation_state.addOperands(input);
+      }
     }
 
     for (size_t index = 0; index < phi_kernel_desc.output_types.size();
@@ -280,11 +332,8 @@ void PhiOpConvertPass::dispatchStage() {
     mlir::Operation *phi_operation = builder.createOperation(operation_state);
     for (size_t index = 0; index < phi_kernel_desc.output_types.size();
          ++index) {
-      mlir::Value input = phi_operation->getResult(index);
-      auto cvt_tensor_type_op = builder.create<infrt::TensorCastOp>(
-          kernel_op.getLoc(), kernel_op.getResultTypes()[index], input);
       kernel_op.getResult(index).replaceAllUsesWith(
-          cvt_tensor_type_op.output());
+          phi_operation->getResult(index));
     }
     kernel_op.erase();
   }
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
index a9b18c769dc..95e25b243f3 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
@@ -62,7 +62,7 @@ namespace phi {
       ::phi::make_ddim(dims.get()),
       ConvertLayoutToPhi(layout.get()),
       {}));
-  float* a_data = dense_tensor.mutable_data<float>(::phi::CPUPlace());
+  float* a_data = dense_tensor.mutable_data<float>(context.GetPlace());
   for (int64_t i = 0; i < dense_tensor.numel(); ++i) {
     a_data[i] = value.get();
   }
@@ -260,6 +260,43 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor) {
   return map;
 }
 
+::infrt::phi::DenseTensorMap LoadCombinedParamsToGpu(
+    const std::string& model_path, const std::string& params_path) {
+  ::infrt::phi::DenseTensorMap map;
+
+  auto pb_proto_prog = paddle::LoadProgram(model_path);
+  auto main_block = pb_proto_prog->blocks(0);
+
+  std::ifstream param_file(params_path, std::ios::binary);
+
+  std::set<std::string> tmp;
+  for (auto& var : main_block.vars()) {
+    if (var.name() == "feed" || var.name() == "fetch" || !var.persistable()) {
+      continue;
+    }
+    if (var.type().type() ==
+        ::paddle::framework::proto::VarType_Type_LOD_TENSOR) {
+      tmp.emplace(var.name());
+    } else {
+      llvm_unreachable("the tensor type is illegal.");
+    }
+  }
+
+#ifdef INFRT_WITH_GPU
+  ::phi::GPUContext ctx;
+  ctx.PartialInitWithoutAllocator();
+
+  for (auto& var : tmp) {
+    std::unique_ptr<::phi::DenseTensor> tensor{
+        std::make_unique<::phi::DenseTensor>()};
+    ::paddle::framework::DeserializeFromStream(param_file, tensor.get(), ctx);
+    map.SetDenseTensor(var, std::move(tensor));
+  }
+#endif
+
+  return map;
+}
+
 ::infrt::phi::DenseTensorMap LoadCombinedParams(
     host_context::Attribute<std::string> model_path,
     host_context::Attribute<std::string> params_path) {
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
index c401fb99978..573b8f102ec 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
@@ -73,6 +73,9 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor);
 ::infrt::phi::DenseTensorMap LoadCombinedParameters(
     const std::string& model_path, const std::string& params_path);
 
+::infrt::phi::DenseTensorMap LoadCombinedParamsToGpu(
+ const std::string& model_path, const std::string& params_path); + int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map); #ifdef INFRT_WITH_GPU diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 848ff28faff..fa51ab3566d 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -68,6 +68,9 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.load_params", INFRT_KERNEL(infrt::kernel::phi::LoadParams), {"path"}); + registry->AddKernel("phi_dt.load_combined_params_to_gpu", + INFRT_KERNEL(infrt::kernel::phi::LoadCombinedParamsToGpu), + {"model_path", "params_path"}); registry->AddKernel("phi_dt.load_combined_params", INFRT_KERNEL(infrt::kernel::phi::LoadCombinedParams), {"model_path", "params_path"}); diff --git a/paddle/infrt/tests/dialect/phi/phi_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir index 0d9e312ce0b..6c5a98f45ce 100644 --- a/paddle/infrt/tests/dialect/phi/phi_pass.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir @@ -1,15 +1,15 @@ // RUN: infrtopt -phi-op-convert=valid-targets=CPU-FP32-NCHW -infrt-op-fuse %s // CHECK-LABEL: @ops -func @ops(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor) { - %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor - %h = "pd.abs"(%g):(tensor) -> tensor - infrt.return %h:tensor +func @ops(%a:!infrt.dense_tensor, %b:!infrt.dense_tensor) { + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %h = "pd.abs"(%g):(!infrt.dense_tensor) -> !infrt.dense_tensor + infrt.return %h:!infrt.dense_tensor } // CHECK-LABEL: @op_execute -func @op_execute(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor, %c:!infrt.lod_tensor) -> !infrt.lod_tensor { - %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor - %h = "pd.abs"(%g):(tensor) -> tensor - infrt.return %h:tensor +func @op_execute(%a:!infrt.dense_tensor, %b:!infrt.dense_tensor, %c:!infrt.dense_tensor) -> !infrt.dense_tensor { + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %h = "pd.abs"(%g):(!infrt.dense_tensor) -> !infrt.dense_tensor + infrt.return %h:!infrt.dense_tensor } -- GitLab
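
Reviewer note (not part of the patch): for anyone trying the new backend end to end, here is a condensed sketch of the user-facing flow this change enables. It mirrors the gpu_predictor test above; the model/param paths are placeholders, error checking is omitted, and it assumes a build with INFRT_WITH_GPU and CUDA available.

    #include <cuda_runtime.h>
    #include <vector>
    #include "paddle/infrt/api/infrt_api.h"
    #include "paddle/infrt/backends/host/phi_allocator.h"

    int main() {
      infrt::InfRtConfig config;
      // New in this patch: params load straight into GPU memory and
      // phi kernels dispatch on the GPU place.
      config.enable_gpu();
      config.set_model_dir("/path/to/model.pdmodel");    // placeholder
      config.set_param_dir("/path/to/model.pdiparams");  // placeholder

      auto predictor = infrt::CreateInfRtPredictor(config);

      // With the GPU backend enabled, inputs are expected in device memory.
      infrt::backends::GpuPhiAllocator gpu_allocator;
      ::phi::DenseTensor* input = predictor->GetInput(0);
      input->Resize({1, 3, 256, 256});
      input->AllocateFrom(&gpu_allocator, ::phi::DataType::FLOAT32);

      std::vector<float> host_input(1 * 3 * 256 * 256, 1.0f);
      cudaMemcpy(input->data(), host_input.data(),
                 sizeof(float) * input->numel(), cudaMemcpyHostToDevice);

      predictor->Run();

      // Outputs also live on the device; copy back to host to inspect.
      auto* output = predictor->GetOutput(0);
      std::vector<float> host_output(output->numel());
      cudaMemcpy(host_output.data(), output->data(),
                 sizeof(float) * output->numel(), cudaMemcpyDeviceToHost);
      return 0;
    }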
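
One more note on the compiler side: phi_pass.mlir still pins valid-targets=CPU-FP32-NCHW, so the GPU branch of PhiOpConvertPass gets no lit coverage from this patch. Assuming the valid-targets option accepts the same TARGET-PRECISION-LAYOUT spelling for GPU (a hypothetical, untested invocation), a GPU variant of the RUN line would look roughly like:

    // RUN: infrtopt -phi-op-convert=valid-targets=GPU-FP32-NCHW -infrt-op-fuse %s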