Unverified commit b2142d4a, authored by zhupengyang and committed by GitHub

[NPU] share buffer between lite tensor and npu tensor (#3606)

Parent 620b801b
@@ -139,6 +139,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
     predictor->Run();
   }
   for (int i = 0; i < FLAGS_repeats; i++) {
+    FillInputTensors(predictor, input_tensor_shape, input_tensor_type, i);
     auto start = GetCurrentUS();
     predictor->Run();
     LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
...
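Relocating FillInputTensors into the repeat loop appears to follow from the buffer sharing introduced below: once the origin input tensor is rebound to the HiAI buffer, data written before the first run cannot be assumed to persist across runs, so the test refills the inputs on every iteration, before the timer starts. A minimal self-contained sketch of that benchmark pattern, where every name (FakePredictor, FillInput, NowUS) is hypothetical rather than Paddle-Lite API:

#include <chrono>
#include <cstdio>
#include <vector>

static double NowUS() {
  using std::chrono::duration_cast;
  using std::chrono::microseconds;
  using std::chrono::steady_clock;
  return static_cast<double>(
      duration_cast<microseconds>(steady_clock::now().time_since_epoch())
          .count());
}

struct FakePredictor {
  std::vector<float> shared_input;  // aliases the device buffer in the real code
  void Run() {}  // the device program may consume or rebind shared_input
};

static void FillInput(FakePredictor* p, int iter) {
  p->shared_input.assign(16, static_cast<float>(iter));
}

int main() {
  FakePredictor predictor;
  const int repeats = 10;
  for (int i = 0; i < repeats; ++i) {
    FillInput(&predictor, i);  // refill before each run, outside the timer
    const double start = NowUS();
    predictor.Run();
    std::printf("%d, %.0f us\n", i, NowUS() - start);
  }
  return 0;
}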
@@ -212,12 +212,6 @@ int SubgraphEngine::LaunchDeviceProgram() {
            hiai::AI_SUCCESS);
   VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
-  // Copy the data of output HiAI tensor to the buffer of origin output tensors
-  for (size_t i = 0; i < device_otensors_.size(); i++) {
-    std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
-                device_otensors_[i]->GetBuffer(),
-                device_otensors_[i]->GetSize());
-  }
   return 0;
 }
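The memcpy removed above becomes redundant once InitDeviceTensor (next hunk) makes each origin output tensor alias the corresponding HiAI tensor's buffer: the device program then writes results directly into the memory the host tensor reads. A sketch of the before/after using stand-in types (DeviceTensor and HostTensor are hypothetical, not the lite or HiAI classes):

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

struct DeviceTensor {
  std::vector<char> storage;
  void* GetBuffer() { return storage.data(); }
  size_t GetSize() const { return storage.size(); }
};

struct HostTensor {
  const void* data = nullptr;  // borrowed from the device when sharing is on
  const void* raw_data() const { return data; }
};

int main() {
  DeviceTensor device{std::vector<char>(64)};
  HostTensor host;
  host.data = device.GetBuffer();  // what ResetBuffer establishes in the diff

  // Before this commit the copy below always ran; with sharing in place the
  // pointers already match, so the branch (and the memcpy) disappears.
  if (host.raw_data() != device.GetBuffer()) {
    std::memcpy(const_cast<void*>(host.raw_data()),
                device.GetBuffer(),
                device.GetSize());
  }
  assert(host.raw_data() == device.GetBuffer());
  return 0;
}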
@@ -236,16 +230,34 @@ int SubgraphEngine::Build() {
 void SubgraphEngine::InitDeviceTensor() {
   auto device_program = device_program_map_[inputs_shape_];
   for (size_t i = 0; i < device_itensors_.size(); i++) {
-    device_itensors_[i]->Init(&(device_program->device_idims[i]));
-    std::memcpy(device_itensors_[i]->GetBuffer(),
-                origin_itensors_[i]->raw_data(),
-                origin_itensors_[i]->memory_size());
+    if (device_itensors_[i]->GetBuffer() != origin_itensors_[i]->raw_data()) {
+      VLOG(3) << "init device_itensors and share input tensor buf between "
+                 "device and host";
+      device_itensors_[i]->Init(&(device_program->device_idims[i]));
+      std::memcpy(device_itensors_[i]->GetBuffer(),
+                  origin_itensors_[i]->raw_data(),
+                  origin_itensors_[i]->memory_size());
+      // share data buf between device_itensor and origin_itensor
+      std::shared_ptr<Buffer> buffer =
+          std::make_shared<Buffer>(device_itensors_[i]->GetBuffer(),
+                                   lite_api::TargetType::kHost,
+                                   device_itensors_[i]->GetSize());
+      origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize());
+    }
   }
   for (size_t i = 0; i < device_otensors_.size(); i++) {
-    device_otensors_[i]->Init(&(device_program->device_odims[i]));
-  }
-  for (size_t i = 0; i < origin_otensors_.size(); i++) {
-    origin_otensors_[i]->Resize(device_program->origin_odims[i]);
+    if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) {
+      VLOG(3) << "init device_otensors and share output tensor buf between "
+                 "device and host";
+      device_otensors_[i]->Init(&(device_program->device_odims[i]));
+      // share data buf between device_otensor and origin_otensor
+      origin_otensors_[i]->Resize(device_program->origin_odims[i]);
+      std::shared_ptr<Buffer> buffer =
+          std::make_shared<Buffer>(device_otensors_[i]->GetBuffer(),
+                                   lite_api::TargetType::kHost,
+                                   device_otensors_[i]->GetSize());
+      origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize());
+    }
   }
 }
...
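Taken together, the pattern is: wrap the device-owned buffer in a shared Buffer handle, rebind the host tensor to it, and guard with a pointer-equality check so repeated runs skip reinitialization. The sketch below models that flow with simplified stand-ins; the real lite::Buffer constructor also takes a lite_api::TargetType, and the real ResetBuffer takes an explicit size, as the diff shows:

#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

class Buffer {  // borrows memory it does not own (the HiAI tensor owns it)
 public:
  Buffer(void* data, size_t size) : data_(data), size_(size) {}
  void* data() const { return data_; }
  size_t size() const { return size_; }

 private:
  void* data_;
  size_t size_;
};

class Tensor {
 public:
  const void* raw_data() const { return buffer_ ? buffer_->data() : nullptr; }
  // Rebind to an externally owned buffer; the shared_ptr keeps the wrapper
  // alive as long as any tensor still references it.
  void ResetBuffer(std::shared_ptr<Buffer> buffer) {
    buffer_ = std::move(buffer);
  }

 private:
  std::shared_ptr<Buffer> buffer_;
};

int main() {
  std::vector<char> device_storage(128);  // stands in for the HiAI buffer
  Tensor origin;

  for (int run = 0; run < 3; ++run) {
    // Same guard as InitDeviceTensor: only (re)bind when the pointers differ,
    // so repeated runs with an unchanged device program skip this work.
    if (origin.raw_data() != device_storage.data()) {
      std::puts("binding host tensor to device buffer");
      origin.ResetBuffer(std::make_shared<Buffer>(device_storage.data(),
                                                  device_storage.size()));
    }
  }
  return 0;  // the message prints once; later runs reuse the shared buffer
}

Using shared_ptr for the wrapper is what makes the aliasing safe on the host side: the origin tensor can be read (or re-shared) after InitDeviceTensor returns without the Buffer handle going away underneath it, while ownership of the underlying memory stays with the device tensor.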