From a66b29d714d46bf6471bfa21e37a6c8d02909dbb Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Tue, 12 May 2020 19:35:50 +0800 Subject: [PATCH] [NPU] share buffer between lite tensor and npu tensor (#3606) --- lite/core/mir/subgraph/subgraph_pass_test.cc | 1 + lite/kernels/npu/subgraph_compute.cc | 40 +++++++++++++------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index c638793c08..ee2d67e918 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -139,6 +139,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel( predictor->Run(); } for (int i = 0; i < FLAGS_repeats; i++) { + FillInputTensors(predictor, input_tensor_shape, input_tensor_type, i); auto start = GetCurrentUS(); predictor->Run(); LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index 1a991bfc74..da2fd3ead2 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -212,12 +212,6 @@ int SubgraphEngine::LaunchDeviceProgram() { hiai::AI_SUCCESS); VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; - // Copy the data of output HiAI tensor to the buffer of origin output tensors - for (size_t i = 0; i < device_otensors_.size(); i++) { - std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()), - device_otensors_[i]->GetBuffer(), - device_otensors_[i]->GetSize()); - } return 0; } @@ -236,16 +230,34 @@ int SubgraphEngine::Build() { void SubgraphEngine::InitDeviceTensor() { auto device_program = device_program_map_[inputs_shape_]; for (size_t i = 0; i < device_itensors_.size(); i++) { - device_itensors_[i]->Init(&(device_program->device_idims[i])); - std::memcpy(device_itensors_[i]->GetBuffer(), - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); + if (device_itensors_[i]->GetBuffer() != 
origin_itensors_[i]->raw_data()) { + VLOG(3) << "init device_itensors and share input tensor buf between " + "device and host"; + device_itensors_[i]->Init(&(device_program->device_idims[i])); + std::memcpy(device_itensors_[i]->GetBuffer(), + origin_itensors_[i]->raw_data(), + origin_itensors_[i]->memory_size()); + // share data buf between device_itensor and origin_itensor + std::shared_ptr<lite::Buffer> buffer = + std::make_shared<lite::Buffer>(device_itensors_[i]->GetBuffer(), + lite_api::TargetType::kHost, + device_itensors_[i]->GetSize()); + origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize()); + } } for (size_t i = 0; i < device_otensors_.size(); i++) { - device_otensors_[i]->Init(&(device_program->device_odims[i])); - } - for (size_t i = 0; i < origin_otensors_.size(); i++) { - origin_otensors_[i]->Resize(device_program->origin_odims[i]); + if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) { + VLOG(3) << "init device_otensors and share output tensor buf between " + "device and host"; + device_otensors_[i]->Init(&(device_program->device_odims[i])); + // share data buf between device_otensor and origin_otensor + origin_otensors_[i]->Resize(device_program->origin_odims[i]); + std::shared_ptr<lite::Buffer> buffer = + std::make_shared<lite::Buffer>(device_otensors_[i]->GetBuffer(), + lite_api::TargetType::kHost, + device_otensors_[i]->GetSize()); + origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize()); + } } } -- GitLab