Unverified commit e6b3d883, authored by Qi Li, committed by GitHub

[NPU] apply npu cache offline model to other devices, test=develop (#3925)

* [NPU] apply npu cache offline model to other devices, test=develop

* [NPU] address review comments, test=develop
Parent b85bc6e5
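
The common pattern across every backend touched below: the device-program hooks now return `bool` instead of an `int` status code, `BuildOriginProgram()` is called lazily inside `BuildDeviceProgram()` when `origin_program_` is still empty, and the kernels call `engine_->Run()` instead of the old `engine_->Build()`/`engine_->Launch()` pair. A minimal sketch of the base-class contract this implies; the control flow is an assumption for illustration, and only the hook names and `bool` return types visible in the diff are taken from the commit itself:

```cpp
// Sketch only: approximates the subgraph engine contract implied by this diff,
// not the verbatim Paddle-Lite implementation.
class SubgraphEngineSketch {
 public:
  virtual ~SubgraphEngineSketch() = default;

  // Called by the kernel's Run(); replaces the Build()/Launch() pair that each
  // backend previously drove by itself.
  bool Run() {
    if (!workspace_ready_) {
      workspace_ready_ = PrepareWorkspaceForDeviceProgram();
      if (!workspace_ready_) return false;
    }
    if (!device_program_ready_) {
      // BuildDeviceProgram() builds the origin program first when it is still
      // empty (see the hunks below), then converts it to the device IR graph.
      device_program_ready_ = BuildDeviceProgram();
    }
    // Fall back to the origin (CPU) program whenever the device build failed.
    return device_program_ready_ ? LaunchDeviceProgram() : LaunchOriginProgram();
  }

 protected:
  virtual bool PrepareWorkspaceForDeviceProgram() { return true; }
  virtual bool BuildDeviceProgram() { return false; }
  virtual bool LaunchDeviceProgram() { return false; }
  virtual bool LaunchOriginProgram() { return true; }

 private:
  bool workspace_ready_{false};
  bool device_program_ready_{false};
};
```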
......@@ -28,7 +28,7 @@ namespace lite {
namespace kernels {
namespace apu {
int SubgraphEngine::BuildDeviceProgram() {
bool SubgraphEngine::BuildDeviceProgram() {
unsigned int version;
Neuron_getVersion(&version);
VLOG(3) << "Neuron Adapter version: " << version;
......@@ -38,7 +38,7 @@ int SubgraphEngine::BuildDeviceProgram() {
int neuron_errCode = NeuronModel_create(&model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create model";
return subgraph::FAILED;
return false;
}
graph.set_model(model_);
graph.set_input_names(input_names_);
......@@ -46,6 +46,9 @@ int SubgraphEngine::BuildDeviceProgram() {
// Convert all of the ops and their input vars and weights, and add them into
// the APU NIR graph
if (origin_program_.empty()) {
BuildOriginProgram();
}
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
......@@ -54,7 +57,7 @@ int SubgraphEngine::BuildDeviceProgram() {
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kAPU))) {
return subgraph::FAILED;
return false;
}
auto kernel = inst.kernel();
......@@ -63,7 +66,7 @@ int SubgraphEngine::BuildDeviceProgram() {
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
return false;
}
}
......@@ -84,7 +87,7 @@ int SubgraphEngine::BuildDeviceProgram() {
VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find input: " << input_names_[i];
return subgraph::FAILED;
return false;
}
}
......@@ -105,7 +108,7 @@ int SubgraphEngine::BuildDeviceProgram() {
VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find output: " << output_names_[i];
return subgraph::FAILED;
return false;
}
}
......@@ -116,7 +119,7 @@ int SubgraphEngine::BuildDeviceProgram() {
neuron_errCode = NeuronModel_finish(model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode;
return subgraph::FAILED;
return false;
}
VLOG(3) << "[APU] APU NIR model created!";
......@@ -129,15 +132,14 @@ int SubgraphEngine::BuildDeviceProgram() {
compilation_ = lite::apu::Device::Global().Build(model_);
if (compilation_ == nullptr) {
LOG(WARNING) << "[APU] Build APU DLA model failed!";
return subgraph::FAILED;
return false;
}
VLOG(3) << "[APU] APU DLA model created, Build cost "
<< GetCurrentUS() - start_time << " us";
return status;
return true;
}
int SubgraphEngine::LaunchDeviceProgram() {
bool SubgraphEngine::LaunchDeviceProgram() {
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
......@@ -149,7 +151,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
int neuron_errCode = NeuronExecution_create(compilation_, &run);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] Build APU runtime failed!";
return subgraph::FAILED;
return false;
}
// Set input buffer
......@@ -177,7 +179,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
neuron_errCode = NeuronExecution_compute(run);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to run execution!" << neuron_errCode;
return subgraph::FAILED;
return false;
}
for (size_t i = 0; i < origin_otensors_.size(); i++) {
......@@ -190,7 +192,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
}
NeuronExecution_free(run);
VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us";
return 0;
return true;
}
SubgraphEngine::~SubgraphEngine() {
......@@ -211,12 +213,11 @@ void SubgraphCompute::PrepareForRun() {
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
engine_->Run();
}
} // namespace apu
......
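
On the kernel side, the change is identical for every backend in this diff: `PrepareForRun()` no longer calls `engine_->Build()`, and `Run()` calls `engine_->Run()` instead of `engine_->Launch()`, so the device program is built lazily on the first execution. A self-contained sketch of that flow, using a stand-in engine type (the real classes, macros, and constructor arguments are omitted):

```cpp
#include <cassert>
#include <memory>

// Stand-in for the real subgraph engine; Run() is assumed to build the device
// program on demand and fall back to the origin program on failure.
struct EngineStub {
  bool Run() { return true; }
};

class SubgraphComputeSketch {
 public:
  void PrepareForRun() {
    // Only construct the engine; no eager engine_->Build() here any more.
    engine_.reset(new EngineStub());
    assert(engine_);
  }
  void Run() {
    assert(engine_);
    engine_->Run();  // replaces the old engine_->Launch()
  }

 private:
  std::unique_ptr<EngineStub> engine_;
};
```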
......@@ -41,8 +41,8 @@ class SubgraphEngine : public subgraph::Engine {
~SubgraphEngine();
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
NeuronModel *model_;
NeuronCompilation *compilation_;
......
......@@ -28,12 +28,35 @@ namespace lite {
namespace kernels {
namespace bm {
int SubgraphEngine::BuildDeviceProgram() {
bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
// Obtain the origin input tensors, and create the origin output tensors
// (don't try to access them before launching the device program or the
// origin program)
PrepareWorkspaceForOriginProgram();
// Create the device input and output tensors, but don't initialize them
// with the dimensions
device_inputs_.resize(input_names_.size());
device_outputs_.resize(output_names_.size());
return true;
}
bool SubgraphEngine::BuildDeviceProgram() {
int status = 0;
subgraph::bm::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
graph.CreateCompilerHandle();
auto& ctx = this->ctx_->template As<BMContext>();
if (origin_program_.empty()) {
BuildOriginProgram();
}
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
......@@ -42,7 +65,7 @@ int SubgraphEngine::BuildDeviceProgram() {
std::string op_type = op->op_info()->Type();
LOG(INFO) << op_type;
if (!bridges.Exists(op_type, TARGET(kBM))) {
return subgraph::FAILED;
return false;
}
auto kernel = inst.kernel();
status |=
......@@ -50,7 +73,7 @@ int SubgraphEngine::BuildDeviceProgram() {
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
return false;
}
}
std::string net_name = "bmnetc_f32umodel";
......@@ -63,7 +86,7 @@ int SubgraphEngine::BuildDeviceProgram() {
graph.UnlockCompilerMutex();
bmrt_hd_ = bmrt_create(bm_hd_);
if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) {
return subgraph::FAILED;
return false;
}
bmrt_get_network_names(bmrt_hd_, &net_names_);
net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]);
......@@ -116,10 +139,10 @@ int SubgraphEngine::BuildDeviceProgram() {
net_info_->output_dtypes[i],
stage.output_shapes[i]);
}
return status;
return true;
}
int SubgraphEngine::LaunchDeviceProgram() {
bool SubgraphEngine::LaunchDeviceProgram() {
for (size_t i = 0; i < device_inputs_.size(); i++) {
bm_memcpy_s2d(bm_hd_,
device_inputs_[i].device_mem,
......@@ -143,7 +166,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
out_index++;
}
}
return 0;
return true;
}
void SubgraphCompute::PrepareForRun() {
......@@ -155,12 +178,11 @@ void SubgraphCompute::PrepareForRun() {
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
engine_->Run();
}
} // namespace bm
......
......@@ -44,8 +44,9 @@ class SubgraphEngine : public subgraph::Engine {
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
bool PrepareWorkspaceForDeviceProgram() override;
bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
private:
void *bmrt_hd_;
......
......@@ -62,32 +62,6 @@ class SubgraphEngine : public subgraph::Engine {
}
}
int Build() {
// In order to attach all of the ops of the block desc, we need to build
// the original program firstly.
BuildOriginProgram();
// Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph
build_device_program_status_ = BuildDeviceProgram();
return build_device_program_status_;
}
int Launch() {
// Rebuild device program when the shapes of input tensors have been
// changed.
if (subgraph::CHECK_SUCCESS(build_device_program_status_) &&
subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(
build_device_program_status_) &&
InputShapeChanged()) {
Build();
}
if (subgraph::CHECK_FAILED(build_device_program_status_)) {
LaunchOriginProgram();
} else {
LaunchDeviceProgram();
}
return 0;
}
bool InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
// used in batch changable situation
......@@ -127,7 +101,10 @@ class SubgraphEngine : public subgraph::Engine {
}
protected:
int BuildDeviceProgram() override {
bool BuildDeviceProgram() override {
if (origin_program_.empty()) {
BuildOriginProgram();
}
if (!error_compile_batch_size_changeable_ &&
!disable_batch_size_changeable_) {
int status = BuildDeviceProgramImpl();
......@@ -142,7 +119,7 @@ class SubgraphEngine : public subgraph::Engine {
return BuildDeviceProgramImpl();
}
int BuildDeviceProgramImpl() {
bool BuildDeviceProgramImpl() {
int status = 0;
auto graph = std::make_shared<paddle::lite::subgraph::mlu::Graph>();
graph->SetFPType(fp_type_);
......@@ -197,13 +174,16 @@ class SubgraphEngine : public subgraph::Engine {
status |= subgraph::FAILED;
VLOG(4) << "[MLU] found unsupported batch_size changeable op type: "
<< op_type;
return status;
if (subgraph::CHECK_FAILED(status)) {
return false;
}
return true;
}
op->CheckShape();
const_cast<OpLite*>(op)->InferShape();
if (!bridges.Exists(op_type, TARGET(kMLU))) {
LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
return subgraph::FAILED;
return false;
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kMLU))(
......@@ -211,7 +191,7 @@ class SubgraphEngine : public subgraph::Engine {
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
return false;
}
}
// Obtain the output nodes of the MLU IR graph and build the graph to MLU
......@@ -242,7 +222,7 @@ class SubgraphEngine : public subgraph::Engine {
if (GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL")) {
graph->GenOfflineModel(GetOfflineModName());
}
return status;
return true;
}
std::string TrimStrings(const std::string& origin_str) {
......@@ -329,7 +309,7 @@ class SubgraphEngine : public subgraph::Engine {
}
}
int LaunchDeviceProgram() override {
bool LaunchDeviceProgram() override {
// prepare input and output memory
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto exec_queue = mlu_context.exec_queue();
......@@ -453,7 +433,7 @@ class SubgraphEngine : public subgraph::Engine {
// =========== DUMP END ================
}
return 0;
return true;
}
paddle::lite_api::PrecisionType fp_type_;
......@@ -501,12 +481,11 @@ class SubgraphCompute
param.scope,
this->precision()));
CHECK(engine_);
engine_->Build();
}
void Run() override {
CHECK(engine_);
engine_->Launch();
engine_->Run();
}
virtual ~SubgraphCompute() = default;
......
......@@ -28,13 +28,36 @@ namespace lite {
namespace kernels {
namespace rknpu {
int SubgraphEngine::BuildDeviceProgram() {
bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
// Obtain the origin input tensors, and create the origin output tensors
// (don't try to access them before launching the device program or the
// origin program)
PrepareWorkspaceForOriginProgram();
// Create the device input and output tensors, but don't initialize them
// with the dimensions
device_itensors_.resize(input_names_.size());
device_otensors_.resize(output_names_.size());
return true;
}
bool SubgraphEngine::BuildDeviceProgram() {
LOG(INFO) << "[RKNPU]:BuildDeviceProgram";
int status = 0;
// Convert all of the ops and their input vars and weights, and add them into
// the RKNPU IR graph
subgraph::rknpu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
if (origin_program_.empty()) {
BuildOriginProgram();
}
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
......@@ -42,13 +65,13 @@ int SubgraphEngine::BuildDeviceProgram() {
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kRKNPU))) {
return subgraph::FAILED;
return false;
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kRKNPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
return false;
}
}
// Collect the valid input and output nodes in the RKNPU IR graph and update
......@@ -91,7 +114,7 @@ int SubgraphEngine::BuildDeviceProgram() {
model_name_, graph.GetHandle(), device_itensors_, device_otensors_);
if (device_program_ == nullptr) {
LOG(WARNING) << "[RKNPU] Build model failed!";
return subgraph::FAILED;
return false;
}
// input
......@@ -165,10 +188,10 @@ int SubgraphEngine::BuildDeviceProgram() {
break;
}
}
return status;
return true;
}
int SubgraphEngine::LaunchDeviceProgram() {
bool SubgraphEngine::LaunchDeviceProgram() {
LOG(INFO) << "[RKNPU]:LaunchDeviceProgram";
std::vector<rk::nn::InputInfo> inputs;
std::vector<rk::nn::OutputInfo> outputs;
......@@ -195,7 +218,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
device_program_->SetInputs(inputs);
device_program_->Run();
device_program_->GetOutputs(outputs);
return 0;
return true;
}
void SubgraphCompute::PrepareForRun() {
......@@ -208,13 +231,12 @@ void SubgraphCompute::PrepareForRun() {
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
LOG(INFO) << "[RKNPU]:Run";
CHECK(engine_);
engine_->Launch();
engine_->Run();
}
} // namespace rknpu
......
......@@ -42,14 +42,15 @@ class SubgraphEngine : public subgraph::Engine {
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
bool PrepareWorkspaceForDeviceProgram() override;
bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
std::string model_name_;
std::vector<std::string> device_inames_;
std::vector<std::string> device_onames_;
std::vector<std::shared_ptr<rk::nn::Tensor>> device_itensors_;
std::vector<std::shared_ptr<rk::nn::Tensor>> device_otensors_;
std::vector<std::shared_ptr<rk::nn::Tensor>> device_itensors_{};
std::vector<std::shared_ptr<rk::nn::Tensor>> device_otensors_{};
std::unique_ptr<rk::nn::Exection> device_program_{nullptr};
};
......
......@@ -27,12 +27,35 @@ namespace lite {
namespace kernels {
namespace xpu {
int SubgraphEngine::BuildDeviceProgram() {
bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
// Obtain the origin input tensors, and create the origin output tensors
// (don't try to access them before launching the device program or the
// origin program)
PrepareWorkspaceForOriginProgram();
// Create the device input and output tensors, but don't initialize them
// with the dimensions
device_itensors_.resize(input_names_.size());
device_otensors_.resize(output_names_.size());
return true;
}
bool SubgraphEngine::BuildDeviceProgram() {
int status = 0;
// Convert all of the ops and their input vars and weights, and add them into
// the XPU IR graph
subgraph::xpu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
if (origin_program_.empty()) {
BuildOriginProgram();
}
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
......@@ -40,13 +63,13 @@ int SubgraphEngine::BuildDeviceProgram() {
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kXPU))) {
return subgraph::FAILED;
return false;
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kXPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
return false;
}
}
// Obtain the output nodes of the XPU IR graph and build the graph to the XPU
......@@ -86,7 +109,7 @@ int SubgraphEngine::BuildDeviceProgram() {
&graph.builder_, &graph.params_, &device_onodes);
if (device_program_ == nullptr) {
LOG(WARNING) << "[XPU] Build model failed!";
return subgraph::FAILED;
return false;
}
// Query and check the dimensions of input and output tensors
......@@ -166,10 +189,10 @@ int SubgraphEngine::BuildDeviceProgram() {
device_otensors_[i].strides = nullptr;
device_otensors_[i].byte_offset = 0;
}
return status;
return true;
}
int SubgraphEngine::LaunchDeviceProgram() {
bool SubgraphEngine::LaunchDeviceProgram() {
for (size_t i = 0; i < device_itensors_.size(); i++) {
// Update the data pointer of DLTensor to track the origin input tensors
device_itensors_[i].data =
......@@ -191,7 +214,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
const_cast<void*>(origin_otensors_[i]->raw_data());
device_program_->CopyOutputTo(i, &device_otensors_[i]);
}
return 0;
return true;
}
void SubgraphCompute::PrepareForRun() {
......@@ -203,12 +226,11 @@ void SubgraphCompute::PrepareForRun() {
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
engine_->Run();
}
} // namespace xpu
......
......@@ -39,13 +39,14 @@ class SubgraphEngine : public subgraph::Engine {
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
bool PrepareWorkspaceForDeviceProgram() override;
bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
std::vector<std::string> device_inames_;
std::vector<std::string> device_onames_;
std::vector<DLTensor> device_itensors_;
std::vector<DLTensor> device_otensors_;
std::vector<DLTensor> device_itensors_{};
std::vector<DLTensor> device_otensors_{};
std::unique_ptr<xtcl::network::xRuntimeInstance> device_program_{nullptr};
};
......