Commit a66b29d7 authored by zhupengyang, committed by GitHub

[NPU] share buffer between lite tensor and npu tensor (#3606)

Parent 6fa68a5a
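This change rebinds the origin (lite) input and output tensors to the buffers of the HiAI device tensors, so LaunchDeviceProgram no longer has to memcpy the outputs back after every inference. Below is a minimal, self-contained C++ sketch of that zero-copy idea; the Buffer and Tensor classes are simplified stand-ins written only for illustration, not the real lite::Buffer / lite::Tensor APIs that appear in the hunks that follow.

#include <cstddef>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>

// Simplified stand-ins for illustration only (not the real PaddleLite classes).
class Buffer {
 public:
  // Wraps externally owned memory; this Buffer never frees it.
  Buffer(void* data, size_t size) : data_(data), size_(size) {}
  void* data() const { return data_; }
  size_t size() const { return size_; }

 private:
  void* data_;
  size_t size_;
};

class Tensor {
 public:
  // Rebind the tensor to an existing buffer (e.g. one owned by a device
  // tensor) so that host and device views alias the same memory.
  void ResetBuffer(std::shared_ptr<Buffer> buffer) { buffer_ = std::move(buffer); }
  void* raw_data() const { return buffer_ ? buffer_->data() : nullptr; }

 private:
  std::shared_ptr<Buffer> buffer_;
};

int main() {
  // Pretend this is the storage backing a HiAI output tensor.
  std::vector<float> device_storage(4, 0.0f);

  // Share it with the "origin" (host-side) output tensor instead of copying.
  Tensor origin_output;
  origin_output.ResetBuffer(std::make_shared<Buffer>(
      device_storage.data(), device_storage.size() * sizeof(float)));

  // When the device writes results, the host tensor sees them directly;
  // the per-inference memcpy that LaunchDeviceProgram used to do goes away.
  device_storage[0] = 42.0f;
  std::cout << static_cast<float*>(origin_output.raw_data())[0] << "\n";
  return 0;
}

In the actual patch, the inputs still get one memcpy when the sharing is first set up: the existing host data is copied into the freshly initialized device buffer before ResetBuffer makes both views alias it.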
...
@@ -139,6 +139,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
     predictor->Run();
   }
   for (int i = 0; i < FLAGS_repeats; i++) {
+    FillInputTensors(predictor, input_tensor_shape, input_tensor_type, i);
     auto start = GetCurrentUS();
     predictor->Run();
     LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
...
...
@@ -212,12 +212,6 @@ int SubgraphEngine::LaunchDeviceProgram() {
            hiai::AI_SUCCESS);
   VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
-  // Copy the data of output HiAI tensor to the buffer of origin output tensors
-  for (size_t i = 0; i < device_otensors_.size(); i++) {
-    std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
-                device_otensors_[i]->GetBuffer(),
-                device_otensors_[i]->GetSize());
-  }
   return 0;
 }
...
@@ -236,16 +230,34 @@ int SubgraphEngine::Build() {
 void SubgraphEngine::InitDeviceTensor() {
   auto device_program = device_program_map_[inputs_shape_];
   for (size_t i = 0; i < device_itensors_.size(); i++) {
-    device_itensors_[i]->Init(&(device_program->device_idims[i]));
-    std::memcpy(device_itensors_[i]->GetBuffer(),
-                origin_itensors_[i]->raw_data(),
-                origin_itensors_[i]->memory_size());
+    if (device_itensors_[i]->GetBuffer() != origin_itensors_[i]->raw_data()) {
+      VLOG(3) << "init device_itensors and share input tensor buf between "
+                 "device and host";
+      device_itensors_[i]->Init(&(device_program->device_idims[i]));
+      std::memcpy(device_itensors_[i]->GetBuffer(),
+                  origin_itensors_[i]->raw_data(),
+                  origin_itensors_[i]->memory_size());
+      // Share the data buffer between device_itensor and origin_itensor
+      std::shared_ptr<Buffer> buffer =
+          std::make_shared<Buffer>(device_itensors_[i]->GetBuffer(),
+                                   lite_api::TargetType::kHost,
+                                   device_itensors_[i]->GetSize());
+      origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize());
+    }
   }
   for (size_t i = 0; i < device_otensors_.size(); i++) {
-    device_otensors_[i]->Init(&(device_program->device_odims[i]));
-  }
-  for (size_t i = 0; i < origin_otensors_.size(); i++) {
-    origin_otensors_[i]->Resize(device_program->origin_odims[i]);
+    if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) {
+      VLOG(3) << "init device_otensors and share output tensor buf between "
+                 "device and host";
+      device_otensors_[i]->Init(&(device_program->device_odims[i]));
+      // Share the data buffer between device_otensor and origin_otensor
+      origin_otensors_[i]->Resize(device_program->origin_odims[i]);
+      std::shared_ptr<Buffer> buffer =
+          std::make_shared<Buffer>(device_otensors_[i]->GetBuffer(),
+                                   lite_api::TargetType::kHost,
+                                   device_otensors_[i]->GetSize());
+      origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize());
+    }
   }
 }
...
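Taken together: the second hunk can drop the per-inference output memcpy from LaunchDeviceProgram because, after the InitDeviceTensor change above, origin_otensors_ already alias the HiAI output buffers. The test change in the first hunk, which refills the inputs on every timed iteration, appears to follow from the same sharing: once the input tensors' buffers can be rebound by ResetBuffer, data written before an earlier run is not guaranteed to still be in place, so the benchmark presumably repopulates the inputs each time.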