// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/apu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <utility>
#include "lite/backends/apu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/paddle_use_bridges.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/utils/io.h"
#include "lite/utils/md5.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace apu {

// Generate the model name by using md5 hashes based on:
// 1. the sorted variable input names
// 2. the shapes of the origin input tensors
// 3. the sorted variable output names
std::string DeviceProgram::GenerateModelName(
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims) {
  std::ostringstream os;
  CHECK_EQ(input_names.size(), origin_idims.size());
  for (int i = 0; i < input_names.size(); i++) {
    os << input_names[i];
    for (auto dim : origin_idims[i]) {
      os << dim;
    }
  }
  for (auto output_name : output_names) {
    os << output_name;
  }
  // e.g. input "x" with dims {1, 3, 224, 224} and output "y" are concatenated
  // into "x13224224y" before hashing.
  return MD5(os.str());
}

// Deserialize the generated model
bool DeviceProgram::LoadFromCacheFile(
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims,
    const std::string& model_cache_dir) {
  int status;
  // Generate the model name if not initialized
  if (model_name_.empty()) {
    model_name_ = GenerateModelName(input_names, output_names, origin_idims);
  }
  // Load from the cached model file
  auto model_path = model_cache_dir + "/" + model_name_ + ".dla";
  VLOG(3) << "[APU] Load model from " << model_path;
  std::vector<char> compilationBuffer;
  if (!ReadFile(model_path, &compilationBuffer)) {
    LOG(WARNING) << "[APU] Open " << model_path << " for reading failed!";
    return false;
  }
  model_ = nullptr;
  compilation_ = nullptr;
  status = NeuronModel_restoreFromCompiledNetwork(
      &model_, &compilation_, &compilationBuffer[0], compilationBuffer.size());
  if (status != NEURON_NO_ERROR) {
    LOG(WARNING) << "[APU] Load model failed! size: "
                 << compilationBuffer.size();
    return false;
  }
  VLOG(3) << "[APU] Complete Load model!";
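
  // For reference, each record in the .cfg file parsed below takes the form
  // "<precision>:<dim0>,<dim1>,...,;" per output tensor, where <precision> is
  // the integer value of the lite PrecisionType enum. For example, a single
  // output with shape {1, 1000} is stored as "<p>:1,1000,;" (the serializer
  // in BuildGraphAndCacheToFile appends a comma after every dim).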
  // Deserialize the precisions and shapes of the origin output tensors from
  // the cached configuration file
  auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
  VLOG(3) << "[APU] Load configuration from " << config_path;
  std::vector<char> config_buffer;
  if (!ReadFile(config_path, &config_buffer)) {
    LOG(WARNING) << "[APU] Read from " << config_path << " failed!";
    return false;
  }
  std::string str(config_buffer.begin(), config_buffer.end());
  // Parse the precisions and shapes of the output tensors
  auto output_options = Split<std::string>(str, ";");
  CHECK_EQ(output_options.size(), output_names.size());
  origin_otypes_.resize(output_names.size());
  origin_odims_.resize(output_names.size());
  for (int i = 0; i < output_names.size(); i++) {
    auto items = Split<std::string>(output_options[i], ":");
    CHECK_EQ(items.size(), 2);  // precision and shapes
    origin_otypes_[i] = static_cast<PrecisionType>(std::stoi(items[0]));
    origin_odims_[i] = Split<int64_t>(items[1], ",");
  }
  return true;
}

bool DeviceProgram::BuildGraphAndCacheToFile(
    RuntimeProgram* origin_program,
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims,
    const std::vector<Tensor*>& origin_itensors,
    const std::vector<Tensor*>& origin_otensors,
    const std::string& model_cache_dir) {
  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, NULL);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };
  auto start_time = GetCurrentUS();

  unsigned int version;
  Neuron_getVersion(&version);
  VLOG(3) << "Neuron Adapter version: " << version;

  int status = 0;
  subgraph::apu::Graph graph;
  int neuron_errCode = NeuronModel_create(&model_);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] Failed to create the neuron model!";
    return false;
  }
  graph.set_model(model_);
  graph.set_input_names(input_names);
  graph.set_output_names(output_names);

  // Convert all of the ops with their input vars and weights, and add them
  // into the APU NIR graph
  const auto& bridges = subgraph::SubgraphBridgeRegistry::Instance();
  const auto& insts = origin_program->instructions(kRootBlockIdx);
  for (auto& inst : insts) {
    auto op = const_cast<OpLite*>(inst.op());
    CHECK(op);
    op->CheckShape();
    op->InferShape();
    std::string op_type = op->op_info()->Type();
    if (!bridges.Exists(op_type, TARGET(kAPU))) {
      return false;
    }
    auto kernel = inst.kernel();
    status |= bridges.Select(op_type, TARGET(kAPU))(
        reinterpret_cast<void*>(&graph),
        op,
        const_cast<KernelBase*>(kernel));
    if (subgraph::CHECK_FAILED(status)) {
      return false;
    }
  }

  // Get the index of input tensors
  std::vector<uint32_t> input_indices;
  for (int i = 0; i < input_names.size(); i++) {
    CHECK(graph.Has(input_names[i])) << "[APU] Failed to find input node "
                                     << input_names[i];
    auto index = graph.Get(input_names[i])->index();
    input_indices.push_back(index);
    VLOG(3) << "[APU] Input[" << i << "] name " << input_names[i] << " dims "
            << origin_itensors[i]->dims() << " index " << index;
  }

  // Get the index of output tensors
  std::vector<uint32_t> output_indices;
  for (int i = 0; i < output_names.size(); i++) {
    CHECK(graph.Has(output_names[i])) << "[APU] Failed to find output node "
                                      << output_names[i];
    origin_otensors[i]->mutable_data<int8_t>();
    auto index = graph.Get(output_names[i])->index();
    output_indices.push_back(index);
    VLOG(3) << "[APU] Output[" << i << "] name " << output_names[i] << " dims "
            << origin_otensors[i]->dims() << " index " << index;
  }
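
  // NeuronModel_identifyInputsAndOutputs marks which operands of the model
  // act as its inputs and outputs. The positions of the entries in
  // input_indices/output_indices determine the index arguments later passed
  // to NeuronExecution_setInput/NeuronExecution_setOutput, mirroring the
  // NNAPI-style convention that the Neuron adapter follows.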
  // Identify the input and output tensors of the neuron model
  NeuronModel_identifyInputsAndOutputs(model_,
                                       input_indices.size(),
                                       &input_indices[0],
                                       output_indices.size(),
                                       &output_indices[0]);
  neuron_errCode = NeuronModel_finish(model_);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] Failed to create the NIR model: " << neuron_errCode;
    return false;
  }
  VLOG(1) << "[APU] APU NIR model created, Create cost "
          << GetCurrentUS() - start_time << " us";

  start_time = GetCurrentUS();
  compilation_ = lite::apu::Device::Global().Build(model_);
  if (compilation_ == nullptr) {
    LOG(WARNING) << "[APU] Build APU DLA model failed!";
    return false;
  }
  VLOG(1) << "[APU] APU DLA model created, Build cost "
          << GetCurrentUS() - start_time << " us";

  start_time = GetCurrentUS();
  // Record the precisions and shapes of the origin output tensors
  CHECK_EQ(origin_otensors.size(), output_names.size());
  origin_otypes_.resize(output_names.size());
  origin_odims_.resize(output_names.size());
  for (size_t i = 0; i < output_names.size(); i++) {
    origin_otypes_[i] = origin_otensors[i]->precision();
    origin_odims_[i] = origin_otensors[i]->dims().Vectorize();
  }
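
  // When a cache directory is given, two artifacts are written so that
  // LoadFromCacheFile() can skip the bridge conversion and the compilation
  // the next time the subgraph runs with the same input shapes:
  //   <model_cache_dir>/<md5>.dla - the compiled network blob
  //   <model_cache_dir>/<md5>.cfg - the precisions/shapes of the outputs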
  if (!model_cache_dir.empty()) {
    // Save the generated model to file
    auto model_path = model_cache_dir + "/" + model_name_ + ".dla";
    VLOG(3) << "[APU] Save model to " << model_path;
    size_t compilationSize;
    status = NeuronCompilation_getCompiledNetworkSize(compilation_,
                                                      &compilationSize);
    std::vector<char> model_buffer;
    if (status == NEURON_NO_ERROR) {
      // Serialize the DLA model
      model_buffer.resize(compilationSize);
      status = NeuronCompilation_storeCompiledNetwork(
          compilation_, &model_buffer[0], compilationSize);
      if (status != NEURON_NO_ERROR) {
        LOG(WARNING) << "[APU] Serialize the DLA model failed!";
      }
      VLOG(3) << "[APU] Export the model to " << model_path;
      if (!WriteFile(model_path, model_buffer)) {
        LOG(WARNING) << "[APU] Open " << model_path << " for writing failed!";
      }
    }

    // Serialize the precisions and shapes of the origin output tensors into
    // the configuration file
    std::ostringstream os;
    for (int i = 0; i < output_names.size(); i++) {
      os << static_cast<int32_t>(origin_otypes_[i]) << ":";
      for (auto dim : origin_odims_[i]) {
        os << dim << ",";
      }
      os << ";";
    }
    auto str = os.str();
    std::vector<char> config_buffer(str.begin(), str.end());
    auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
    VLOG(3) << "[APU] Save configuration to " << config_path;
    if (!WriteFile(config_path, config_buffer)) {
      LOG(WARNING) << "[APU] Open " << config_path << " for writing failed!";
    }

    // Workaround: NeuronCompilation_storeCompiledNetwork modifies the model
    // in a way that degrades the execution performance, so restore it from
    // the serialized buffer. Remove the code below once the underlying bug
    // is fixed.
    NeuronCompilation_free(compilation_);
    NeuronModel_free(model_);
    model_ = nullptr;
    compilation_ = nullptr;
    status = NeuronModel_restoreFromCompiledNetwork(
        &model_, &compilation_, &model_buffer[0], compilationSize);
    if (status != NEURON_NO_ERROR) {
      LOG(WARNING) << "[APU] Load model failed! size: " << compilationSize;
      return false;
    }
    VLOG(3) << "[APU] Complete Load model!";
    VLOG(1) << "[APU] APU DLA model cached, cache cost "
            << GetCurrentUS() - start_time << " us";
  }
  return true;
}

bool SubgraphEngine::BuildDeviceProgram() {
  // Check if the cached device program exists
  if (!device_programs_.count(origin_idims_)) {
    auto device_program = std::make_shared<DeviceProgram>();
    // Obtain the model cache dir from the APU context of the subgraph op
    auto model_cache_dir =
        ctx_->As<APUContext>().SubgraphModelCacheDir(exec_scope_);
    VLOG(3) << "[APU] Getting subgraph_model_cache_dir: " << model_cache_dir;
    // Check and load the cached model and configuration file if they exist
    if (model_cache_dir.empty() ||
        !device_program->LoadFromCacheFile(
            input_names_, output_names_, origin_idims_, model_cache_dir)) {
      // Build the model online, including converting the paddle ops to the
      // NIR nodes, building the MTK NIR graph, and compiling the MTK NIR
      // graph into a DLA model
      if (!origin_program_) {
        BuildOriginProgram();
      }
      CHECK(origin_program_) << "[APU] The origin program is not initialized!";
      CHECK_GT(origin_program_->instructions().size(), 0)
          << "[APU] No instructions found in the origin program!";
      if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(),
                                                    input_names_,
                                                    output_names_,
                                                    origin_idims_,
                                                    origin_itensors_,
                                                    origin_otensors_,
                                                    model_cache_dir)) {
        return false;
      }
    }
    if (device_program->model_ == nullptr) {
      LOG(WARNING) << "[APU] Failed to create the DLA model!";
      return false;
    }
    device_programs_[origin_idims_] = device_program;
  }
  // Resize the output tensors to the shapes recorded in the device program
  auto device_program = device_programs_[origin_idims_];
  CHECK(device_program && device_program->model_);
  for (int i = 0; i < output_names_.size(); i++) {
    origin_otensors_[i]->Resize(device_program->origin_odims_[i]);
    origin_otensors_[i]->mutable_data<int8_t>();
    VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i]
            << " dims " << origin_otensors_[i]->dims() << " memory_size "
            << origin_otensors_[i]->memory_size();
  }
  return true;
}
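
// Note on the +/-128 conversions in LaunchDeviceProgram below: the tensors of
// the int8 quantized Paddle model hold symmetric int8 data, while the Neuron
// runtime consumes the same buffers reinterpreted as uint8 (asymmetric
// quantization with a zero point of 128 is assumed here). Adding 128 maps
// int8 [-128, 127] onto uint8 [0, 255] in place, e.g. -128 -> 0, 0 -> 128 and
// 127 -> 255; the outputs are shifted back by -128 after
// NeuronExecution_compute returns.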
bool SubgraphEngine::LaunchDeviceProgram() {
  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, NULL);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };

  // Fall back to the origin program if no device program is available
  if (device_programs_.count(origin_idims_) == 0 ||
      device_programs_[origin_idims_]->model_ == nullptr) {
    return LaunchOriginProgram();
  }
  auto device_program = device_programs_[origin_idims_];

  auto start_time = GetCurrentUS();
  NeuronExecution* run = NULL;
  int neuron_errCode =
      NeuronExecution_create(device_program->compilation_, &run);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] Build APU runtime failed!";
    return false;
  }

  // Set input buffer
  for (size_t i = 0; i < origin_itensors_.size(); i++) {
    auto origin_data = origin_itensors_[i]->mutable_data<int8_t>();
    auto converted_data = reinterpret_cast<uint8_t*>(origin_data);
    for (int j = 0; j < origin_itensors_[i]->data_size(); j++) {
      converted_data[j] =
          static_cast<uint8_t>(static_cast<int16_t>(origin_data[j]) + 128);
    }
    NeuronExecution_setInput(
        run, i, NULL, converted_data, origin_itensors_[i]->memory_size());
  }

  // Set output buffer
  for (size_t i = 0; i < origin_otensors_.size(); i++) {
    NeuronExecution_setOutput(
        run,
        i,
        NULL,
        reinterpret_cast<void*>(origin_otensors_[i]->raw_data()),
        origin_otensors_[i]->memory_size());
  }

  neuron_errCode = NeuronExecution_compute(run);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] Failed to run the execution: " << neuron_errCode;
    return false;
  }

  // Convert the output buffers back from uint8 to int8
  for (size_t i = 0; i < origin_otensors_.size(); i++) {
    auto converted_data = origin_otensors_[i]->mutable_data<int8_t>();
    auto origin_data = reinterpret_cast<uint8_t*>(converted_data);
    for (int j = 0; j < origin_otensors_[i]->data_size(); j++) {
      converted_data[j] =
          static_cast<int8_t>(static_cast<int16_t>(origin_data[j]) - 128);
    }
  }
  NeuronExecution_free(run);
  VLOG(1) << "[APU] Process cost " << GetCurrentUS() - start_time << " us";
  return true;
}

SubgraphEngine::~SubgraphEngine() {
  for (auto& device_program : device_programs_) {
    if (device_program.second->compilation_) {
      NeuronCompilation_free(device_program.second->compilation_);
    }
    if (device_program.second->model_) {
      NeuronModel_free(device_program.second->model_);
    }
  }
}

void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
  engine_.reset(new SubgraphEngine(ctx_.get(),
                                   param.block_idx,
                                   param.program_desc,
                                   param.exec_scope,
                                   param.input_data_names,
                                   param.output_data_names));
  CHECK(engine_);
}

void SubgraphCompute::Run() {
  CHECK(engine_);
  engine_->Run();
}

}  // namespace apu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

// Register the subgraph kernel for the APU target: it consumes and produces
// int8 NCHW tensors resident on the host.
REGISTER_LITE_KERNEL(subgraph,
                     kAPU,
                     kInt8,
                     kNCHW,
                     paddle::lite::kernels::apu::SubgraphCompute,
                     def)
    .BindInput("Inputs",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kInt8),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Outputs",
                {LiteType::GetTensorTy(TARGET(kHost),
                                       PRECISION(kInt8),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
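
// Usage note: the cache directory consumed by SubgraphModelCacheDir() is
// typically supplied by the application through the runtime configuration,
// e.g. (assuming the Paddle-Lite C++ API; the path below is illustrative):
//   paddle::lite_api::MobileConfig config;
//   config.set_subgraph_model_cache_dir("/data/local/tmp/cache");
// With the cache dir set, the first run compiles and stores <md5>.dla and
// <md5>.cfg, and subsequent runs restore the compiled network directly.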