From 789112e80c74193f265dac3a1a25460804ca389f Mon Sep 17 00:00:00 2001
From: jackzhang235
Date: Thu, 9 Apr 2020 13:55:04 +0000
Subject: [PATCH] support changeable input dims

---
 lite/kernels/mlu/bridges/batch_norm_op.cc |  15 +--
 lite/kernels/mlu/bridges/graph.h          |  47 +++++++---
 lite/kernels/mlu/bridges/tensor.cc        |   1 +
 lite/kernels/mlu/bridges/tensor.h         |   3 +
 lite/kernels/mlu/subgraph_compute.h       | 109 +++++++++++++++-------
 5 files changed, 122 insertions(+), 53 deletions(-)

diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc
index 61f098ec8b..0eadd6b62c 100644
--- a/lite/kernels/mlu/bridges/batch_norm_op.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op.cc
@@ -61,12 +61,13 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 
   int co = static_cast<int>(mean_dims[0]);
 
+  std::vector<float> variance_trans(co);
+  std::vector<float> mean_trans(co);
   for (int i = 0; i < co; ++i) {
-    variance->mutable_data<float>()[i] =
+    variance_trans[i] =
         scale->data<float>()[i] / sqrtf(variance->data<float>()[i] + epsilon);
-    mean->mutable_data<float>()[i] =
-        mean->data<float>()[i] -
-        bias->data<float>()[i] / variance->data<float>()[i];
+    mean_trans[i] =
+        mean->data<float>()[i] - bias->data<float>()[i] / variance_trans[i];
   }
 
   auto input_tensor = graph->GetNode(x_var_name);
@@ -77,8 +78,10 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                      mean_tensor->mlu_tensor(),
                                      variance_tensor->mlu_tensor()));
 
-  graph->BindConstData(variance_var_name, variance);
-  graph->BindConstData(mean_var_name, mean);
+  graph->BindConstRawData(
+      variance_var_name, variance_trans.data(), variance_trans.size(), true);
+  graph->BindConstRawData(
+      mean_var_name, mean_trans.data(), mean_trans.size(), true);
 
   graph->FuseOp(bn_op);
   CNML_CALL(cnmlDestroyBaseOp(&bn_op));
diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h
index 0583a0c953..c4fb10bdb5 100644
--- a/lite/kernels/mlu/bridges/graph.h
+++ b/lite/kernels/mlu/bridges/graph.h
@@ -89,6 +89,14 @@ class Graph {
     output_tensors_.push_back(tensor);
   }
 
+  std::vector<std::shared_ptr<MLUTensor>>* MutableInputs() {
+    return &input_tensors_;
+  }
+
+  std::vector<std::shared_ptr<MLUTensor>>* MutableOutputs() {
+    return &output_tensors_;
+  }
+
   void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); }
 
   void Compile(cnmlCoreVersion_t core_version, int core_number) {
@@ -100,15 +108,18 @@ class Graph {
     CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number));
     CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version));
     CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
-    for (auto in : input_tensors_) {
-      input_addrs_.push_back(in->mlu_data());
-    }
-    for (auto out : output_tensors_) {
-      output_addrs_.push_back(out->mlu_data());
-    }
   }
 
   void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
+    input_addrs_.resize(input_tensors_.size());
+    output_addrs_.resize(output_tensors_.size());
+    for (size_t i = 0; i < input_addrs_.size(); ++i) {
+      input_addrs_[i] = input_tensors_[i]->mlu_data();
+    }
+    for (size_t i = 0; i < output_addrs_.size(); ++i) {
+      output_addrs_[i] = output_tensors_[i]->mlu_data();
+    }
+
 #if PRINT_HW_TIME
     thread_local float hw_time;
     CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
@@ -159,7 +170,7 @@ class Graph {
       CNML_CALL(cnmlBindConstData_V2(
           nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
     } else if (fp_type_ == CNML_DATA_FLOAT16) {
-      void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
+      void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
       CNRT_CALL(
           cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
                            CNRT_FLOAT32,
@@ -174,7 +185,7 @@ class Graph {
     }
   }
 
-  void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
+  void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) {
     const float* data = tensor->data<float>();
     size_t len = tensor->data_size();
     if (fp_type_ == CNML_DATA_FLOAT32) {
@@ -183,10 +194,14 @@ class Graph {
           const_cast<void*>(static_cast<const void*>(data)),
           false));
     } else if (fp_type_ == CNML_DATA_FLOAT16) {
-      auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>();
-      for (size_t i = 0; i < len; ++i) {
-        data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]);
-      }
+      void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
+      CNRT_CALL(
+          cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
+                           CNRT_FLOAT32,
+                           data_fp16,
+                           CNRT_FLOAT16,
+                           len,
+                           nullptr));
       CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(),
                                      static_cast<void*>(data_fp16),
                                      false));
@@ -207,12 +222,13 @@ class Graph {
     CNML_CALL(cnmlDestroyQuantizedParam(&quant_param));
   }
 
-  void SetFPType(::paddle::lite_api::PrecisionType type) {
+  void SetFPType(paddle::lite_api::PrecisionType type) {
+    origin_fp_type_ = type;
     switch (type) {
-      case ::paddle::lite_api::PrecisionType::kFP16:
+      case paddle::lite_api::PrecisionType::kFP16:
         fp_type_ = CNML_DATA_FLOAT16;
         break;
-      case ::paddle::lite_api::PrecisionType::kFloat:
+      case paddle::lite_api::PrecisionType::kFloat:
         fp_type_ = CNML_DATA_FLOAT32;
         break;
       default:
@@ -224,6 +240,7 @@ class Graph {
 
  private:
   cnmlDataType_t fp_type_{CNML_DATA_FLOAT32};
+  paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)};
   std::unordered_map<std::string, std::shared_ptr<MLUTensor>> nodes_;
   std::vector<cnmlTensor_t> inputs_;
   std::vector<cnmlTensor_t> outputs_;
diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc
index c426069c7d..4fd1d573f1 100644
--- a/lite/kernels/mlu/bridges/tensor.cc
+++ b/lite/kernels/mlu/bridges/tensor.cc
@@ -46,6 +46,7 @@ void MLUTensor::remember(const std::vector<int64_t>& shape,
                          cnmlDataOrder_t shape_order) {
   tensor_type_ = tensor_type;
   mlu_dtype_ = mlu_dtype;
+  origin_shape_.assign(shape.begin(), shape.end());
 
   int size = 4;
   if (shape.size() > 4 || shape_order == CNML_ARRAY) {
diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h
index 2a4cc23a73..24e181a47b 100644
--- a/lite/kernels/mlu/bridges/tensor.h
+++ b/lite/kernels/mlu/bridges/tensor.h
@@ -51,6 +51,8 @@ class MLUTensor {
 
   void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
 
+  const std::vector<int64_t>& get_origin_shape() const { return origin_shape_; }
+
   ~MLUTensor();
 
   void ToFile(std::string file_name);
@@ -59,6 +61,7 @@ class MLUTensor {
   cnmlTensor_t mlu_tensor_;
 
   std::vector<int64_t> shape_;
+  std::vector<int64_t> origin_shape_;
   cnmlTensorType_t tensor_type_;
   cnmlDataType_t mlu_dtype_;
   int dim_{0};
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index 7d9db21133..00ec78a0b4 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -40,11 +41,10 @@ class SubgraphEngine : public subgraph::Engine {
                  const std::vector<std::string>& input_names,
                  const std::vector<std::string>& output_names,
                  Scope* scope,
-                 ::paddle::lite_api::PrecisionType type)
+                 paddle::lite_api::PrecisionType type)
       : subgraph::Engine(
-            ctx, block_idx, block_desc, input_names, output_names, scope) {
-    graph_.SetFPType(type);
-  }
+            ctx, block_idx, block_desc, input_names, output_names, scope),
+        fp_type_(type) {}
 
   int Build() {
     // In order to attach all of the ops of the block desc, we need to build
@@ -72,24 +72,44 @@ class SubgraphEngine : public subgraph::Engine {
     return 0;
   }
 
+  bool InputShapeChanged() {
+    std::vector<std::vector<int64_t>> new_shape;
+    for (auto origin_itensor : origin_itensors_) {
+      new_shape.push_back(origin_itensor->dims().Vectorize());
+    }
+    inputs_shape_ = new_shape;
+    if (shape_graph_map_.count(inputs_shape_) > 0) {
+      return false;
+    }
+    return true;
+  }
+
  protected:
   int BuildDeviceProgram() override {
     int status = 0;
+    auto graph = std::make_shared<paddle::lite::subgraph::mlu::Graph>();
+    graph->SetFPType(fp_type_);
+    std::vector<std::vector<int64_t>> new_shape;
+    origin_itensors_.clear();
+    origin_otensors_.clear();
+
     // Convert all of input data vars and added into the MLU IR graph
+    status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
     for (auto& input_name : input_names_) {
       auto input_tensor = scope_->FindMutableTensor(input_name);
+
+      origin_itensors_.push_back(input_tensor);
+      new_shape.push_back(input_tensor->dims().Vectorize());
+
       CHECK(input_tensor);
-      auto input_node =
-          graph_.AddNode(input_name,
-                         input_tensor->dims().Vectorize(),
-                         CNML_TENSOR,
-                         CNML_NCHW,
-                         graph_.FPType(),
-                         const_cast<void*>(input_tensor->raw_data()));
+      auto input_node = graph->AddNode(input_name,
+                                       input_tensor->dims().Vectorize(),
+                                       CNML_TENSOR,
+                                       CNML_NCHW,
+                                       graph->FPType());
       CHECK(input_node);
       // MLU doesn't support dynamic dimensions/shapes, so need to rebuild
       // the program when the shape of any input tensor is changed.
-      status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
     }
     LOG(INFO) << "START TO CONVERT ";
     // Convert all of ops and its weights and added into the MLU IR graph
@@ -106,7 +126,7 @@ class SubgraphEngine : public subgraph::Engine {
       }
       auto kernel = inst.kernel();
       status |= bridges.Select(op_type, TARGET(kMLU))(
-          reinterpret_cast<void*>(&graph_),
+          reinterpret_cast<void*>(graph.get()),
          const_cast<OpLite*>(op),
          const_cast<KernelBase*>(kernel));
      if (subgraph::CHECK_FAILED(status)) {
@@ -115,33 +135,51 @@ class SubgraphEngine : public subgraph::Engine {
        return subgraph::FAILED;
      }
    }
    // Obtain the output nodes of the MLU IR graph and build the graph to MLU
    // runtime
-    std::vector<std::string> valid_output_names;
    for (auto& output_name : output_names_) {
-      if (graph_.HasNode(output_name)) {
-        graph_.AddOutput(graph_.GetNode(output_name));
+      if (graph->HasNode(output_name)) {
+        graph->AddOutput(graph->GetNode(output_name));
        auto output_tensor = scope_->FindMutableTensor(output_name);
-        void* p_data = static_cast<void*>(
-            output_tensor->mutable_data<typename subgraph::mlu::FPTypeTraits<Precision>::T>(
-                TARGET(kMLU)));
-        auto node = graph_.GetNode(output_name);
-        CHECK(p_data);
-        node->set_mlu_ptr(p_data);
-        valid_output_names.push_back(output_name);
+        origin_otensors_.push_back(output_tensor);
+
+        // auto node = graph->GetNode(output_name);
+        // CHECK(p_data);
+        // node->set_mlu_ptr(p_data);
      }
    }
    for (auto& input_name : input_names_) {
-      graph_.AddInput(graph_.GetNode(input_name));
+      graph->AddInput(graph->GetNode(input_name));
    }
-    CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
+
+    CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names";
    auto& mlu_context = this->ctx_->template As<MLUContext>();
    auto core_version = mlu_context.MLUCoreVersion();
    auto core_number = mlu_context.MLUCoreNumber();
-    graph_.Compile(core_version, core_number);
+    graph->Compile(core_version, core_number);
+    shape_graph_map_[new_shape] = graph;
    return status;
  }

  int LaunchDeviceProgram() override {
+    // prepare input and output memory
+    auto graph = shape_graph_map_[inputs_shape_];
+    auto* graph_input = graph->MutableInputs();
+    auto* graph_output = graph->MutableOutputs();
+    CHECK_EQ(graph_input->size(), origin_itensors_.size());
+    CHECK_EQ(graph_output->size(), origin_otensors_.size());
+
+    for (size_t i = 0; i < origin_itensors_.size(); ++i) {
+      graph_input->at(i)->set_mlu_ptr(
+          const_cast<void*>(origin_itensors_[i]->raw_data()));
+    }
+    for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+      origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
+      void* p_data = static_cast<void*>(
+          origin_otensors_[i]
+              ->mutable_data<typename subgraph::mlu::FPTypeTraits<Precision>::T>(
+                  TARGET(kMLU)));
+      graph_output->at(i)->set_mlu_ptr(p_data);
+    }
+
     auto& mlu_context = this->ctx_->template As<MLUContext>();
     auto exec_queue = mlu_context.exec_queue();
     u32_t affinity = mlu_context.affinity();
@@ -150,11 +188,13 @@ class SubgraphEngine : public subgraph::Engine {
     forward_param.data_parallelism = &data_param;
     forward_param.affinity = &affinity;
     forward_param.end = CNRT_PARAM_END;
-    graph_.Compute(forward_param, exec_queue);
+
+    graph->Compute(forward_param, exec_queue);
 
     // // =========== DUMP ===================
     // for (auto input_name : input_names_) {
-    //   auto input_tensor = graph_.GetNode(input_name);
+    //   auto input_tensor =
+    //       shape_graph_map_[inputs_shape_]->GetNode(input_name);
     //   auto dump_name = input_name;
     //   while (dump_name.find("/") != std::string::npos) {
     //     dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
@@ -163,8 +203,9 @@ class SubgraphEngine : public subgraph::Engine {
     //   input_tensor->ToFile(dump_name);
     // }
     // for (auto output_name : output_names_) {
-    //   if (graph_.HasNode(output_name)) {
-    //     auto output_tensor = graph_.GetNode(output_name);
+    //   if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) {
+    //     auto output_tensor =
+    //         shape_graph_map_[inputs_shape_]->GetNode(output_name);
     //     auto dump_name = output_name;
     //     while (dump_name.find("/") != std::string::npos) {
     //       dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
@@ -180,7 +221,11 @@ class SubgraphEngine : public subgraph::Engine {
     return 0;
   }
 
-  paddle::lite::subgraph::mlu::Graph graph_;
+  paddle::lite_api::PrecisionType fp_type_;
+  std::vector<std::vector<int64_t>> inputs_shape_{};
+  std::map<std::vector<std::vector<int64_t>>,
+           std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
+      shape_graph_map_{};
 };
 
 template <PrecisionType Precision>
-- 
GitLab
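Editor's note (not part of the commit): the core mechanism of this patch is a cache keyed by the current input shapes (inputs_shape_ -> shape_graph_map_), so the MLU program is compiled once per distinct combination of input dimensions and reused whenever the same shapes come back, with input/output device pointers rebound on every launch. The sketch below illustrates only that lookup-or-build pattern in isolation; CompiledGraph, ShapeKey, and GetOrBuild() are hypothetical stand-ins for illustration and are not Paddle-Lite or CNML APIs.

#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <vector>

// Stand-in for a compiled device program (subgraph::mlu::Graph in the patch).
struct CompiledGraph {
  std::vector<std::vector<int64_t>> shapes;  // input shapes it was built for
};

// One entry per input tensor, each entry holding that tensor's dimensions.
using ShapeKey = std::vector<std::vector<int64_t>>;

class Engine {
 public:
  // Return the program compiled for `shapes`, building it on first sight.
  std::shared_ptr<CompiledGraph> GetOrBuild(const ShapeKey& shapes) {
    auto it = cache_.find(shapes);
    if (it != cache_.end()) {
      return it->second;  // shapes seen before: reuse the compiled program
    }
    auto graph = std::make_shared<CompiledGraph>();  // otherwise build anew
    graph->shapes = shapes;
    cache_[shapes] = graph;
    return graph;
  }

 private:
  // Mirrors shape_graph_map_: std::map works because std::vector compares
  // lexicographically, giving a strict weak ordering over shape keys.
  std::map<ShapeKey, std::shared_ptr<CompiledGraph>> cache_;
};

int main() {
  Engine engine;
  auto g1 = engine.GetOrBuild({{1, 3, 224, 224}});
  auto g2 = engine.GetOrBuild({{4, 3, 224, 224}});  // new batch size -> new graph
  auto g3 = engine.GetOrBuild({{1, 3, 224, 224}});  // same shapes -> cached graph
  std::cout << (g1 == g3) << " " << (g1 == g2) << "\n";  // prints "1 0"
  return 0;
}

Rebinding the MLU data pointers at launch time (rather than at compile time, as the old Graph::Compile did) is what makes the cached programs safe to share: the compiled graph stays shape-specific, while the host-side tensors it reads and writes can move between runs.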