From c7ffa64a1b24a86bc5b4b9dd376f7ad8fb398fa2 Mon Sep 17 00:00:00 2001
From: jackzhang235
Date: Mon, 11 May 2020 09:14:44 +0000
Subject: [PATCH] fix bug when setting output tensors' dimensions

---
 lite/kernels/mlu/bridges/graph.h    |  54 ++++++++++
 lite/kernels/mlu/subgraph_compute.h | 151 ++++++++++++++++++----------
 2 files changed, 151 insertions(+), 54 deletions(-)

diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h
index b03a23bc99..f950ee9c19 100644
--- a/lite/kernels/mlu/bridges/graph.h
+++ b/lite/kernels/mlu/bridges/graph.h
@@ -79,6 +79,27 @@ class Graph {
     return nodes_.find(name) != nodes_.end();
   }
 
+  // const std::vector<std::vector<int>> InferOutputsShape(std::vector<std::shared_ptr<MLUTensor>> graph_in) {
+  //   CHECK_EQ(graph_in.size(), inputs_.size());
+  //   std::vector<cnmlTensor_t> inputs(inputs_.size());
+  //   for (size_t i = 0; i < graph_in.size(); ++i) {
+  //     inputs[i] = graph_in[i]->mlu_tensor();
+  //   }
+  //   std::vector<cnmlTensor_t> outputs(outputs_.size());
+  //   cnmlInferFusionOpOutputShape(fusion_op_, inputs.data(), inputs.size(), outputs.data(), outputs.size());
+  //
+  //   std::vector<std::vector<int>> outputs_shape;
+  //   for (size_t i = 0; i < outputs.size(); ++i) {
+  //     int len;
+  //     cnmlGetTensorLen(outputs[i], &len);
+  //     std::vector<int> tmp_shape(len);
+  //     cnmlGetTensorShape(outputs[i], tmp_shape.data());
+  //     outputs_shape.push_back(std::move(tmp_shape));
+  //   }
+  //
+  //   return outputs_shape;
+  // }
+
   void AddInput(std::shared_ptr<MLUTensor> tensor) {
     inputs_.push_back(tensor->mlu_tensor());
     input_tensors_.push_back(tensor);
@@ -123,6 +144,39 @@ class Graph {
     CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
   }
 
+  void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
+    input_addrs_.resize(input_tensors_.size());
+    output_addrs_.resize(output_tensors_.size());
+    for (size_t i = 0; i < input_addrs_.size(); ++i) {
+      input_addrs_[i] = input_tensors_[i]->mlu_data();
+    }
+    for (size_t i = 0; i < output_addrs_.size(); ++i) {
+      output_addrs_[i] = output_tensors_[i]->mlu_data();
+    }
+
+#if PRINT_HW_TIME
+    thread_local float hw_time;
+    CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
+#endif
+    CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
+                                            input_addrs_.data(),
+                                            input_addrs_.size(),
+                                            output_addrs_.data(),
+                                            output_addrs_.size(),
+                                            &forward_param,
+                                            que));
+#if PRINT_HW_TIME
+    CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
+    CNRT_CALL(cnrtSyncQueue(que));
+    CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
+    hw_time /= 1000.0f;
+    DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
+    std::lock_guard<std::mutex> lk(time_mut_);
+    time_log_.push_back(hw_time);
+#endif
+  }
+
+
   void Compute(cnrtInvokeFuncParam_t forward_param,
                cnrtQueue_t que,
                const std::vector<std::shared_ptr<MLUTensor>>& in,
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index e846be82a4..ef3cdf4242 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -80,9 +80,12 @@ class SubgraphEngine : public subgraph::Engine {
 
   bool InputShapeChanged() {
     std::vector<std::vector<int64_t>> new_shape;
+    // used when the batch size is changeable
+    std::vector<std::vector<int64_t>> all_shape;
     for (auto origin_itensor : origin_itensors_) {
       if (GetBoolFromEnv("BATCH_SIZE_CHANGEBLE")) {
         auto iv = origin_itensor->dims().Vectorize();
+        all_shape.push_back(iv);
         iv.erase(iv.begin());
         new_shape.push_back(iv);
       } else {
@@ -90,6 +93,7 @@ class SubgraphEngine : public subgraph::Engine {
       }
     }
     inputs_shape_ = new_shape;
+    all_inputs_shape_ = all_shape;
     if (shape_graph_map_.count(inputs_shape_) > 0) {
       return false;
     }
@@ -229,78 +233,115 @@
     return name;
   }
 
+  void InferOutputsShapeOnly() {
+    const auto iter = in_out_shape_map_.find(all_inputs_shape_);
+    if (iter != in_out_shape_map_.end()) {
+      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+        origin_otensors_[i]->Resize(iter->second[i]);
+      }
+    } else {
+      for (auto& inst : origin_program_) {
+        auto op = inst.op();
+        CHECK(op);
+        op->CheckShape();
+        const_cast<OpLite*>(op)->InferShape();
+      }
+      std::vector<std::vector<int64_t>> outs_shape;
+      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+        outs_shape.push_back(origin_otensors_[i]->dims().Vectorize());
+      }
+      in_out_shape_map_[all_inputs_shape_] = outs_shape;
+    }
+  }
+
   int LaunchDeviceProgram() override {
     // prepare input and output memory
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    auto exec_queue = mlu_context.exec_queue();
+    u32_t affinity = mlu_context.affinity();
+    cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
+    int data_param = 1;
+    forward_param.data_parallelism = &data_param;
+    forward_param.affinity = &affinity;
+    forward_param.end = CNRT_PARAM_END;
+
     auto graph = shape_graph_map_[inputs_shape_];
     auto* graph_input = graph->MutableInputs();
     auto* graph_output = graph->MutableOutputs();
     CHECK_EQ(graph_input->size(), origin_itensors_.size());
     CHECK_EQ(graph_output->size(), origin_otensors_.size());
-    std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
-        graph_in;
-    if (shape_tensor_map_in_.find(inputs_shape_) !=
-        shape_tensor_map_in_.end()) {
-      graph_in = shape_tensor_map_in_[inputs_shape_];
-      for (size_t i = 0; i < origin_itensors_.size(); ++i) {
-        graph_in[i]->set_mlu_ptr(
-            const_cast<void*>(origin_itensors_[i]->raw_data()));
-      }
-    } else {
-      graph_in.reserve(origin_itensors_.size());
-      for (size_t i = 0; i < origin_itensors_.size(); ++i) {
-        paddle::lite::subgraph::mlu::MLUTensor tmp(
-            origin_itensors_[i]->dims().Vectorize());
-        // graph_input->at(i)->get_origin_shape());
-        tmp.set_mlu_dtype(graph_input->at(i)->dtype());
-        tmp.set_mlu_ptr(const_cast<void*>(origin_itensors_[i]->raw_data()));
-        graph_in.push_back(
-            std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
+    if (GetBoolFromEnv("BATCH_SIZE_CHANGEBLE")) {
+      std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
+          graph_in;
+      if (shape_tensor_map_in_.find(all_inputs_shape_) !=
+          shape_tensor_map_in_.end()) {
+        graph_in = shape_tensor_map_in_[all_inputs_shape_];
+        for (size_t i = 0; i < origin_itensors_.size(); ++i) {
+          graph_in[i]->set_mlu_ptr(
+              const_cast<void*>(origin_itensors_[i]->raw_data()));
+        }
+      } else {
+        graph_in.reserve(origin_itensors_.size());
+        for (size_t i = 0; i < origin_itensors_.size(); ++i) {
+          paddle::lite::subgraph::mlu::MLUTensor tmp(
+              origin_itensors_[i]->dims().Vectorize());
+          tmp.set_mlu_dtype(graph_input->at(i)->dtype());
+          tmp.set_mlu_ptr(const_cast<void*>(origin_itensors_[i]->raw_data()));
+          graph_in.push_back(
+              std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
+        }
+        shape_tensor_map_in_[all_inputs_shape_] = graph_in;
       }
-      shape_tensor_map_in_[inputs_shape_] = graph_in;
-    }
-    std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
-        graph_out;
-    if (shape_tensor_map_out_.find(inputs_shape_) !=
-        shape_tensor_map_out_.end()) {
-      graph_out = shape_tensor_map_out_[inputs_shape_];
-      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
-        void* p_data = static_cast<void*>(
-            origin_otensors_[i]
-                ->mutable_data<typename FPTypeTraits<Precision>::T>(TARGET(kMLU)));
-        graph_out[i]->set_mlu_ptr(p_data);
+      InferOutputsShapeOnly();
+      // const std::vector<std::vector<int>> new_output_size = graph->InferOutputsShape(graph_in);
+
+      std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
+          graph_out;
+
+      if (shape_tensor_map_out_.find(all_inputs_shape_) !=
+          shape_tensor_map_out_.end()) {
+        graph_out = shape_tensor_map_out_[all_inputs_shape_];
+        for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+          // origin_otensors_[i]->Resize(new_output_size.at(i));
+          void* p_data = static_cast<void*>(
+              origin_otensors_[i]
+                  ->mutable_data<typename FPTypeTraits<Precision>::T>(TARGET(kMLU)));
+          graph_out[i]->set_mlu_ptr(p_data);
+        }
+      } else {
+        graph_out.reserve(origin_otensors_.size());
+        for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+          // origin_otensors_[i]->Resize(new_output_size.at(i));
+          void* p_data = static_cast<void*>(
+              origin_otensors_[i]
+                  ->mutable_data<typename FPTypeTraits<Precision>::T>(TARGET(kMLU)));
+          paddle::lite::subgraph::mlu::MLUTensor tmp(
+              origin_otensors_[i]->dims().Vectorize());
+          tmp.set_mlu_dtype(graph_output->at(i)->dtype());
+          tmp.set_mlu_ptr(p_data);
+          graph_out.push_back(
+              std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
+        }
+        shape_tensor_map_out_[all_inputs_shape_] = graph_out;
       }
+      graph->Compute(forward_param, exec_queue, graph_in, graph_out);
     } else {
-      graph_out.reserve(origin_otensors_.size());
+      for (size_t i = 0; i < origin_itensors_.size(); ++i) {
+        graph_input->at(i)->set_mlu_ptr(const_cast<void*>(origin_itensors_[i]->raw_data()));
+      }
       for (size_t i = 0; i < origin_otensors_.size(); ++i) {
         origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
-        void* p_data = static_cast<void*>(
-            origin_otensors_[i]
-                ->mutable_data<typename FPTypeTraits<Precision>::T>(TARGET(kMLU)));
-        paddle::lite::subgraph::mlu::MLUTensor tmp(
-            origin_otensors_[i]->dims().Vectorize());
-        // graph_output->at(i)->get_origin_shape());
-        tmp.set_mlu_dtype(graph_output->at(i)->dtype());
-        tmp.set_mlu_ptr(p_data);
-        graph_out.push_back(
-            std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
+        void* p_data = static_cast<void*>(origin_otensors_[i]->mutable_data<typename FPTypeTraits<Precision>::T>(TARGET(kMLU)));
+        graph_output->at(i)->set_mlu_ptr(p_data);
       }
-      shape_tensor_map_out_[inputs_shape_] = graph_out;
+      graph->Compute(forward_param, exec_queue);
     }
-    auto& mlu_context = this->ctx_->template As<MLUContext>();
-    auto exec_queue = mlu_context.exec_queue();
-    u32_t affinity = mlu_context.affinity();
-    cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
-    int data_param = 1;
-    forward_param.data_parallelism = &data_param;
-    forward_param.affinity = &affinity;
-    forward_param.end = CNRT_PARAM_END;
-    graph->Compute(forward_param, exec_queue, graph_in, graph_out);
 
     //
     // =========== DUMP ===================
     // for (auto input_name : input_names_) {
@@ -334,6 +375,7 @@ class SubgraphEngine : public subgraph::Engine {
 
   paddle::lite_api::PrecisionType fp_type_;
   std::vector<std::vector<int64_t>> inputs_shape_{};
+  std::vector<std::vector<int64_t>> all_inputs_shape_{};
   std::map<std::vector<std::vector<int64_t>>,
            std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
       shape_graph_map_{};
@@ -343,6 +385,7 @@ class SubgraphEngine : public subgraph::Engine {
   std::map<std::vector<std::vector<int64_t>>,
            std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
       shape_tensor_map_in_{};
+  std::map<std::vector<std::vector<int64_t>>, std::vector<std::vector<int64_t>>> in_out_shape_map_{};
 };  // namespace mlu
 
 template
-- 
GitLab