diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc
index 6009e71e05c33f6dedfd995020612e112c888d36..9e2ef7538675486072d43913c1a3973971277a23 100644
--- a/lite/kernels/apu/subgraph_compute.cc
+++ b/lite/kernels/apu/subgraph_compute.cc
@@ -28,7 +28,7 @@ namespace lite {
 namespace kernels {
 namespace apu {
 
-int SubgraphEngine::BuildDeviceProgram() {
+bool SubgraphEngine::BuildDeviceProgram() {
   unsigned int version;
   Neuron_getVersion(&version);
   VLOG(3) << "Neuron Adapter version: " << version;
@@ -38,7 +38,7 @@ int SubgraphEngine::BuildDeviceProgram() {
   int neuron_errCode = NeuronModel_create(&model_);
   if (NEURON_NO_ERROR != neuron_errCode) {
     LOG(WARNING) << "Fail to create model";
-    return subgraph::FAILED;
+    return false;
   }
   graph.set_model(model_);
   graph.set_input_names(input_names_);
@@ -46,6 +46,9 @@ int SubgraphEngine::BuildDeviceProgram() {
 
   // Convert all of ops and their input vars and weights and added into the APU
   // NIR graph
+  if (origin_program_.empty()) {
+    BuildOriginProgram();
+  }
   const auto& bridges = subgraph::Registry::Instance();
   for (auto& inst : origin_program_) {
     auto op = const_cast<OpLite*>(inst.op());
@@ -54,7 +57,7 @@ int SubgraphEngine::BuildDeviceProgram() {
     op->InferShape();
     std::string op_type = op->op_info()->Type();
     if (!bridges.Exists(op_type, TARGET(kAPU))) {
-      return subgraph::FAILED;
+      return false;
     }
 
     auto kernel = inst.kernel();
@@ -63,7 +66,7 @@ int SubgraphEngine::BuildDeviceProgram() {
         const_cast<OpLite*>(op),
         const_cast<KernelBase*>(kernel));
     if (subgraph::CHECK_FAILED(status)) {
-      return subgraph::FAILED;
+      return false;
     }
   }
 
@@ -84,7 +87,7 @@ int SubgraphEngine::BuildDeviceProgram() {
       VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index();
     } else {
       LOG(WARNING) << "Fail to find input: " << input_names_[i];
-      return subgraph::FAILED;
+      return false;
     }
   }
 
@@ -105,7 +108,7 @@ int SubgraphEngine::BuildDeviceProgram() {
       VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index();
     } else {
       LOG(WARNING) << "Fail to find output: " << output_names_[i];
-      return subgraph::FAILED;
+      return false;
     }
   }
 
@@ -116,7 +119,7 @@ int SubgraphEngine::BuildDeviceProgram() {
   neuron_errCode = NeuronModel_finish(model_);
   if (NEURON_NO_ERROR != neuron_errCode) {
     LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode;
-    return subgraph::FAILED;
+    return false;
   }
   VLOG(3) << "[APU] APU NIR model created!";
 
@@ -129,15 +132,14 @@ int SubgraphEngine::BuildDeviceProgram() {
   compilation_ = lite::apu::Device::Global().Build(model_);
   if (compilation_ == nullptr) {
     LOG(WARNING) << "[APU] Build APU DLA model failed!";
-    return subgraph::FAILED;
+    return false;
   }
   VLOG(3) << "[APU] APU DLA model created, Build cost "
           << GetCurrentUS() - start_time << " us";
-
-  return status;
+  return true;
 }
 
-int SubgraphEngine::LaunchDeviceProgram() {
+bool SubgraphEngine::LaunchDeviceProgram() {
   auto GetCurrentUS = []() -> double {
     struct timeval time;
     gettimeofday(&time, NULL);
@@ -149,7 +151,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
   int neuron_errCode = NeuronExecution_create(compilation_, &run);
   if (NEURON_NO_ERROR != neuron_errCode) {
     LOG(WARNING) << "[APU] Build APU runtime failed!";
-    return subgraph::FAILED;
+    return false;
   }
 
   // Set input buffer
@@ -180,7 +182,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
   neuron_errCode = NeuronExecution_compute(run);
   if (NEURON_NO_ERROR != neuron_errCode) {
     LOG(WARNING) << "Fail to run execution!" << neuron_errCode;
-    return subgraph::FAILED;
+    return false;
   }
 
   for (size_t i = 0; i < origin_otensors_.size(); i++) {
@@ -192,7 +194,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
   }
   NeuronExecution_free(run);
   VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us";
-  return 0;
+  return true;
 }
 
 SubgraphEngine::~SubgraphEngine() {
@@ -213,12 +215,11 @@ void SubgraphCompute::PrepareForRun() {
                                   param.output_data_names,
                                   param.scope));
   CHECK(engine_);
-  engine_->Build();
 }
 
 void SubgraphCompute::Run() {
   CHECK(engine_);
-  engine_->Launch();
+  engine_->Run();
 }
 
 }  // namespace apu
diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h
index ecd8a38343cd1f62bb5a3bf8e948384b90cfe826..beb582b8cc16e456491c28ace5e2d1695143216a 100644
--- a/lite/kernels/apu/subgraph_compute.h
+++ b/lite/kernels/apu/subgraph_compute.h
@@ -41,8 +41,8 @@ class SubgraphEngine : public subgraph::Engine {
   ~SubgraphEngine();
 
  protected:
-  int BuildDeviceProgram() override;
-  int LaunchDeviceProgram() override;
+  bool BuildDeviceProgram() override;
+  bool LaunchDeviceProgram() override;
 
   NeuronModel *model_;
   NeuronCompilation *compilation_;
diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc
index c6059461d1e790064407009cfc0aa3cfcdec8935..868481f4b8419c39131c145eb85ff450686482a8 100644
--- a/lite/kernels/bm/subgraph_compute.cc
+++ b/lite/kernels/bm/subgraph_compute.cc
@@ -28,12 +28,35 @@ namespace lite {
 namespace kernels {
 namespace bm {
 
-int SubgraphEngine::BuildDeviceProgram() {
+bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
+  // Obtain the origin input tensors, and create the origin output
+  // tensors(Don't try to access them before launch the device program or the
+  // origin program)
+  PrepareWorkspaceForOriginProgram();
+  // Create the device input and output tensors, but don't initialize them
+  // with the dimensions
+  device_inputs_.resize(input_names_.size());
+  for (int i = 0; i < input_names_.size(); i++) {
+    device_inputs_[i].reset(new hiai::AiTensor);
+    CHECK(device_inputs_[i]);
+  }
+  device_outputs_.resize(output_names_.size());
+  for (int i = 0; i < output_names_.size(); i++) {
+    device_outputs_[i].reset(new hiai::AiTensor);
+    CHECK(device_outputs_[i]);
+  }
+  return true;
+}
+
+bool SubgraphEngine::BuildDeviceProgram() {
   int status = 0;
   subgraph::bm::Graph graph;
   const auto& bridges = subgraph::Registry::Instance();
   graph.CreateCompilerHandle();
   auto& ctx = this->ctx_->template As();
+  if (origin_program_.empty()) {
+    BuildOriginProgram();
+  }
   for (auto& inst : origin_program_) {
     auto op = const_cast<OpLite*>(inst.op());
     CHECK(op);
@@ -41,7 +64,7 @@ int SubgraphEngine::BuildDeviceProgram() {
     op->InferShape();
     std::string op_type = op->op_info()->Type();
     if (!bridges.Exists(op_type, TARGET(kBM))) {
-      return subgraph::FAILED;
+      return false;
     }
     auto kernel = inst.kernel();
     status |=
@@ -49,7 +72,7 @@ int SubgraphEngine::BuildDeviceProgram() {
                                              const_cast<OpLite*>(op),
                                              const_cast<KernelBase*>(kernel));
     if (subgraph::CHECK_FAILED(status)) {
-      return subgraph::FAILED;
+      return false;
     }
   }
   std::string net_name = "bmnetc_f32umodel";
@@ -61,7 +84,7 @@ int SubgraphEngine::BuildDeviceProgram() {
   finish_bmcompiler_data(graph.GetCompilerHandle(), &bmodel_data, &data_size);
   bmrt_hd_ = bmrt_create(bm_hd_);
   if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) {
-    return subgraph::FAILED;
+    return false;
   }
   bmrt_get_network_names(bmrt_hd_, &net_names_);
   net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]);
@@ -114,10 +137,10 @@ int SubgraphEngine::BuildDeviceProgram() {
                    net_info_->output_dtypes[i],
                    stage.output_shapes[i]);
   }
-  return status;
+  return true;
 }
 
-int SubgraphEngine::LaunchDeviceProgram() {
+bool SubgraphEngine::LaunchDeviceProgram() {
   for (size_t i = 0; i < device_inputs_.size(); i++) {
     bm_memcpy_s2d(bm_hd_,
                   device_inputs_[i].device_mem,
@@ -141,7 +164,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
       out_index++;
     }
   }
-  return 0;
+  return true;
 }
 
 void SubgraphCompute::PrepareForRun() {
@@ -153,12 +176,11 @@ void SubgraphCompute::PrepareForRun() {
                                   param.output_data_names,
                                   param.scope));
   CHECK(engine_);
-  engine_->Build();
 }
 
 void SubgraphCompute::Run() {
   CHECK(engine_);
-  engine_->Launch();
+  engine_->Run();
 }
 
 }  // namespace bm
diff --git a/lite/kernels/bm/subgraph_compute.h b/lite/kernels/bm/subgraph_compute.h
index 60f7661c7990d90020dbfc7ec3a6e0d178dceb70..7a5b2552ff95681da09346ba11f40f1a6acb7f01 100644
--- a/lite/kernels/bm/subgraph_compute.h
+++ b/lite/kernels/bm/subgraph_compute.h
@@ -44,8 +44,9 @@ class SubgraphEngine : public subgraph::Engine {
             ctx, block_idx, block_desc, input_names, output_names, scope) {}
 
  protected:
-  int BuildDeviceProgram() override;
-  int LaunchDeviceProgram() override;
+  bool PrepareWorkspaceForDeviceProgram() override;
+  bool BuildDeviceProgram() override;
+  bool LaunchDeviceProgram() override;
 
  private:
  void *bmrt_hd_;
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index 3bfba33f4d7e8fd86f7aaf276da2ca4a8b0bd7cf..dbd055fe226aa1853bc8e33de7b4db17666558cc 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -46,34 +46,8 @@ class SubgraphEngine : public subgraph::Engine {
     graph_.SetFPType(type);
   }
 
-  int Build() {
-    // In order to attach all of the ops of the block desc, we need to build
-    // the original program firstly.
-    BuildOriginProgram();
-    // Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph
-    build_device_program_status_ = BuildDeviceProgram();
-    return build_device_program_status_;
-  }
-
-  int Launch() {
-    // Rebuild device program when the shapes of input tensors have been
-    // changed.
-    if (subgraph::CHECK_SUCCESS(build_device_program_status_) &&
-        subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(
-            build_device_program_status_) &&
-        InputShapeChanged()) {
-      Build();
-    }
-    if (subgraph::CHECK_FAILED(build_device_program_status_)) {
-      LaunchOriginProgram();
-    } else {
-      LaunchDeviceProgram();
-    }
-    return 0;
-  }
-
  protected:
-  int BuildDeviceProgram() override {
+  bool BuildDeviceProgram() override {
     int status = 0;
     // Convert all of input data vars and added into the MLU IR graph
     for (auto& input_name : input_names_) {
@@ -94,6 +68,9 @@ class SubgraphEngine : public subgraph::Engine {
     LOG(INFO) << "START TO CONVERT ";
     // Convert all of ops and its weights and added into the MLU IR graph
     const auto& bridges = subgraph::Registry::Instance();
+    if (origin_program_.empty()) {
+      BuildOriginProgram();
+    }
     for (auto& inst : origin_program_) {
       auto op = inst.op();
       CHECK(op);
@@ -102,7 +79,7 @@ class SubgraphEngine : public subgraph::Engine {
       const_cast<OpLite*>(op)->InferShape();
       if (!bridges.Exists(op_type, TARGET(kMLU))) {
         LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
-        return subgraph::FAILED;
+        return false;
       }
       auto kernel = inst.kernel();
       status |= bridges.Select(op_type, TARGET(kMLU))(
@@ -110,7 +87,7 @@ class SubgraphEngine : public subgraph::Engine {
           const_cast<OpLite*>(op),
           const_cast<KernelBase*>(kernel));
       if (subgraph::CHECK_FAILED(status)) {
-        return subgraph::FAILED;
+        return false;
       }
     }
     // Obtain the output nodes of the MLU IR graph and build the graph to MLU
@@ -138,10 +115,10 @@ class SubgraphEngine : public subgraph::Engine {
     auto core_version = mlu_context.MLUCoreVersion();
     auto core_number = mlu_context.MLUCoreNumber();
     graph_.Compile(core_version, core_number);
-    return status;
+    return true;
   }
 
-  int LaunchDeviceProgram() override {
+  bool LaunchDeviceProgram() override {
     auto& mlu_context = this->ctx_->template As();
     auto exec_queue = mlu_context.exec_queue();
     u32_t affinity = mlu_context.affinity();
@@ -151,7 +128,7 @@ class SubgraphEngine : public subgraph::Engine {
     forward_param.affinity = &affinity;
     forward_param.end = CNRT_PARAM_END;
     graph_.Compute(forward_param, exec_queue);
-    return 0;
+    return true;
   }
 
   paddle::lite::subgraph::mlu::Graph graph_;
@@ -174,12 +151,11 @@ class SubgraphCompute
                                     param.scope,
                                     this->precision()));
     CHECK(engine_);
-    engine_->Build();
   }
 
   void Run() override {
     CHECK(engine_);
-    engine_->Launch();
+    engine_->Run();
   }
 
   virtual ~SubgraphCompute() = default;
diff --git a/lite/kernels/rknpu/subgraph_compute.cc b/lite/kernels/rknpu/subgraph_compute.cc
index e0b63205705609b6899918ce8e254ccdf6cbad47..a50505c38c0740f762256cd71e006caf9249838e 100644
--- a/lite/kernels/rknpu/subgraph_compute.cc
+++ b/lite/kernels/rknpu/subgraph_compute.cc
@@ -28,13 +28,36 @@ namespace lite {
 namespace kernels {
 namespace rknpu {
 
-int SubgraphEngine::BuildDeviceProgram() {
+bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
+  // Obtain the origin input tensors, and create the origin output
+  // tensors(Don't try to access them before launch the device program or the
+  // origin program)
+  PrepareWorkspaceForOriginProgram();
+  // Create the device input and output tensors, but don't initialize them
+  // with the dimensions
+  device_itensors_.resize(input_names_.size());
+  for (int i = 0; i < input_names_.size(); i++) {
+    device_itensors_[i].reset(new hiai::AiTensor);
+    CHECK(device_itensors_[i]);
+  }
+  device_otensors_.resize(output_names_.size());
+  for (int i = 0; i < output_names_.size(); i++) {
+    device_otensors_[i].reset(new hiai::AiTensor);
+    CHECK(device_otensors_[i]);
+  }
+  return true;
+}
+
+bool SubgraphEngine::BuildDeviceProgram() {
   LOG(INFO) << "[RKNPU]:BuildDeviceProgram";
   int status = 0;
   // Convert all of ops and their input vars and weights and added into the NPU
   // RKNPU IR graph
   subgraph::rknpu::Graph graph;
   const auto& bridges = subgraph::Registry::Instance();
+  if (origin_program_.empty()) {
+    BuildOriginProgram();
+  }
   for (auto& inst : origin_program_) {
     auto op = const_cast<OpLite*>(inst.op());
     CHECK(op);
@@ -42,13 +65,13 @@ int SubgraphEngine::BuildDeviceProgram() {
     op->InferShape();
     std::string op_type = op->op_info()->Type();
     if (!bridges.Exists(op_type, TARGET(kRKNPU))) {
-      return subgraph::FAILED;
+      return false;
    }
     auto kernel = inst.kernel();
     status |= bridges.Select(op_type, TARGET(kRKNPU))(
        reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
     if (subgraph::CHECK_FAILED(status)) {
-      return subgraph::FAILED;
+      return false;
     }
   }
   // Collect the valid input and output nodes in the RKNPU IR graph and update
@@ -91,7 +114,7 @@ int SubgraphEngine::BuildDeviceProgram() {
       model_name_, graph.GetHandle(), device_itensors_, device_otensors_);
   if (device_program_ == nullptr) {
     LOG(WARNING) << "[RKNPU] Build model failed!";
-    return subgraph::FAILED;
+    return false;
   }
 
   // input
@@ -165,10 +188,10 @@ int SubgraphEngine::BuildDeviceProgram() {
       break;
     }
   }
-  return status;
+  return true;
 }
 
-int SubgraphEngine::LaunchDeviceProgram() {
+bool SubgraphEngine::LaunchDeviceProgram() {
   LOG(INFO) << "[RKNPU]:LaunchDeviceProgram";
   std::vector inputs;
   std::vector outputs;
@@ -195,7 +218,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
   device_program_->SetInputs(inputs);
   device_program_->Run();
   device_program_->GetOutputs(outputs);
-  return 0;
+  return true;
 }
 
 void SubgraphCompute::PrepareForRun() {
@@ -208,13 +231,12 @@ void SubgraphCompute::PrepareForRun() {
                                   param.output_data_names,
                                   param.scope));
   CHECK(engine_);
-  engine_->Build();
 }
 
 void SubgraphCompute::Run() {
   LOG(INFO) << "[RKNPU]:Run";
   CHECK(engine_);
-  engine_->Launch();
+  engine_->Run();
 }
 
 }  // namespace rknpu
diff --git a/lite/kernels/rknpu/subgraph_compute.h b/lite/kernels/rknpu/subgraph_compute.h
index 863e6aef39ad54f0e9d94d4b507c6fca4128ebb8..a4bdadc658a81decd8107072f7b5948613d0c68a 100644
--- a/lite/kernels/rknpu/subgraph_compute.h
+++ b/lite/kernels/rknpu/subgraph_compute.h
@@ -42,14 +42,15 @@ class SubgraphEngine : public subgraph::Engine {
             ctx, block_idx, block_desc, input_names, output_names, scope) {}
 
  protected:
-  int BuildDeviceProgram() override;
-  int LaunchDeviceProgram() override;
+  bool PrepareWorkspaceForDeviceProgram() override;
+  bool BuildDeviceProgram() override;
+  bool LaunchDeviceProgram() override;
 
   std::string model_name_;
   std::vector device_inames_;
   std::vector device_onames_;
-  std::vector> device_itensors_;
-  std::vector> device_otensors_;
+  std::vector> device_itensors_{};
+  std::vector> device_otensors_{};
   std::unique_ptr device_program_{nullptr};
 };
 
diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc
index 9c2191331c85a7f99ffb5a2e9662ed5831cb1dda..981922f8eacab57da4638e1fdcdd3df72465b379 100644
--- a/lite/kernels/xpu/subgraph_compute.cc
+++ b/lite/kernels/xpu/subgraph_compute.cc
@@ -27,12 +27,35 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
-int SubgraphEngine::BuildDeviceProgram() {
+bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
+  // Obtain the origin input tensors, and create the origin output
+  // tensors(Don't try to access them before launch the device program or the
+  // origin program)
+  PrepareWorkspaceForOriginProgram();
+  // Create the device input and output tensors, but don't initialize them
+  // with the dimensions
+  device_itensors_.resize(input_names_.size());
+  for (int i = 0; i < input_names_.size(); i++) {
+    device_itensors_[i].reset(new hiai::AiTensor);
+    CHECK(device_itensors_[i]);
+  }
+  device_otensors_.resize(output_names_.size());
+  for (int i = 0; i < output_names_.size(); i++) {
+    device_otensors_[i].reset(new hiai::AiTensor);
+    CHECK(device_otensors_[i]);
+  }
+  return true;
+}
+
+bool SubgraphEngine::BuildDeviceProgram() {
   int status = 0;
   // Convert all of ops and their input vars and weights and added into the XPU
   // IR graph
   subgraph::xpu::Graph graph;
   const auto& bridges = subgraph::Registry::Instance();
+  if (origin_program_.empty()) {
+    BuildOriginProgram();
+  }
   for (auto& inst : origin_program_) {
     auto op = const_cast<OpLite*>(inst.op());
     CHECK(op);
@@ -40,13 +63,13 @@ int SubgraphEngine::BuildDeviceProgram() {
     op->InferShape();
     std::string op_type = op->op_info()->Type();
     if (!bridges.Exists(op_type, TARGET(kXPU))) {
-      return subgraph::FAILED;
+      return false;
     }
     auto kernel = inst.kernel();
     status |= bridges.Select(op_type, TARGET(kXPU))(
        reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
     if (subgraph::CHECK_FAILED(status)) {
-      return subgraph::FAILED;
+      return false;
     }
   }
   // Obtain the output nodes of the XPU IR graph and build the graph to the XPU
@@ -86,7 +109,7 @@ int SubgraphEngine::BuildDeviceProgram() {
       &graph.builder_, &graph.params_, &device_onodes);
   if (device_program_ == nullptr) {
     LOG(WARNING) << "[XPU] Build model failed!";
-    return subgraph::FAILED;
+    return false;
   }
 
   // Query and check the dimensions of input and output tensors
@@ -166,10 +189,10 @@ int SubgraphEngine::BuildDeviceProgram() {
     device_otensors_[i].strides = nullptr;
     device_otensors_[i].byte_offset = 0;
   }
-  return status;
+  return true;
 }
 
-int SubgraphEngine::LaunchDeviceProgram() {
+bool SubgraphEngine::LaunchDeviceProgram() {
   for (size_t i = 0; i < device_itensors_.size(); i++) {
     // Update the data pointer of DLTensor to track the origin input tensors
     device_itensors_[i].data =
@@ -191,7 +214,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
         const_cast<void*>(origin_otensors_[i]->raw_data());
     device_program_->CopyOutputTo(i, &device_otensors_[i]);
   }
-  return 0;
+  return true;
 }
 
 void SubgraphCompute::PrepareForRun() {
@@ -203,12 +226,11 @@ void SubgraphCompute::PrepareForRun() {
                                   param.output_data_names,
                                   param.scope));
   CHECK(engine_);
-  engine_->Build();
 }
 
 void SubgraphCompute::Run() {
   CHECK(engine_);
-  engine_->Launch();
+  engine_->Run();
 }
 
 }  // namespace xpu
diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h
index 601c8821bc826e350c233573bf7eff89cdf5c1f5..f09a06a85d5382c72e9efb20cede8bea1922f2da 100644
--- a/lite/kernels/xpu/subgraph_compute.h
+++ b/lite/kernels/xpu/subgraph_compute.h
@@ -39,13 +39,14 @@ class SubgraphEngine : public subgraph::Engine {
             ctx, block_idx, block_desc, input_names, output_names, scope) {}
 
  protected:
-  int BuildDeviceProgram() override;
-  int LaunchDeviceProgram() override;
+  bool PrepareWorkspaceForDeviceProgram() override;
+  bool BuildDeviceProgram() override;
+  bool LaunchDeviceProgram() override;
 
   std::vector device_inames_;
   std::vector device_onames_;
-  std::vector device_itensors_;
-  std::vector device_otensors_;
+  std::vector device_itensors_{};
+  std::vector device_otensors_{};
   std::unique_ptr device_program_{nullptr};
 };
 