diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h
index ab6a075e71045a8d98826e524f5a64e4cdab56c2..e77993290ce01905775835930ebf1889a1e8cd0b 100644
--- a/lite/kernels/mlu/bridges/graph.h
+++ b/lite/kernels/mlu/bridges/graph.h
@@ -47,7 +47,6 @@ class Graph {
     CNRT_CALL(cnrtCreateNotifier(&notifier_end_));
 #endif
   }
-
   ~Graph() {
     FreeConstData();
     CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
@@ -62,7 +61,6 @@ class Graph {
               << " process:" << total_time / time_log_.size() << std::endl;
 #endif
   }
-
   // Data node
   std::shared_ptr<MLUTensor> AddNode(
       const std::string& name,
@@ -81,9 +79,39 @@ class Graph {
     return nodes_.find(name) != nodes_.end();
   }
 
+  // const std::vector<std::vector<int64_t>>
+  // InferOutputsShape(std::vector<std::shared_ptr<MLUTensor>> graph_in) {
+  //   CHECK_EQ(graph_in.size(), inputs_.size());
+  //   std::vector<cnmlTensor_t> inputs(inputs_.size());
+  //   for (size_t i = 0; i < graph_in.size(); ++i) {
+  //     inputs[i] = graph_in[i]->mlu_tensor();
+  //   }
+  //   std::vector<cnmlTensor_t> outputs(outputs_.size());
+  //   cnmlInferFusionOpOutputShape(fusion_op_, inputs.data(), inputs.size(),
+  //                                outputs.data(), outputs.size());
+  //
+  //   std::vector<std::vector<int64_t>> outputs_shape;
+  //   for (size_t i = 0; i < outputs.size(); ++i) {
+  //     int len;
+  //     cnmlGetTensorLen(outputs[i], &len);
+  //     std::vector<int> tmp_shape(len);
+  //     cnmlGetTensorShape(outputs[i], tmp_shape.data());
+  //     outputs_shape.emplace_back(tmp_shape.begin(), tmp_shape.end());
+  //   }
+  //
+  //   return outputs_shape;
+  // }
+
   void AddInput(std::shared_ptr<MLUTensor> tensor) {
     inputs_.push_back(tensor->mlu_tensor());
     input_tensors_.push_back(tensor);
+    if (GetBoolFromEnv("BATCH_SIZE_CHANGEABLE")) {
+      // mark dim 0 (the batch dimension of a 4-D input) as mutable so one
+      // compiled fusion op can serve different batch sizes
+      constexpr int input_dimNb = 4;
+      bool input_dim_mutable[4] = {true, false, false, false};
+      cnmlSetTensorDimMutable(
+          tensor->mlu_tensor(), input_dim_mutable, input_dimNb);
+    }
   }
 
   void AddOutput(std::shared_ptr<MLUTensor> tensor) {
@@ -151,6 +179,49 @@ class Graph {
 #endif
   }
 
+  void Compute(cnrtInvokeFuncParam_t forward_param,
+               cnrtQueue_t que,
+               const std::vector<std::shared_ptr<MLUTensor>>& in,
+               const std::vector<std::shared_ptr<MLUTensor>>& out) {
+    std::vector<cnmlTensor_t> in_tensor;
+    std::vector<cnmlTensor_t> out_tensor;
+    input_addrs_.resize(in.size());
+    output_addrs_.resize(out.size());
+    for (size_t i = 0; i < input_addrs_.size(); ++i) {
+      input_addrs_[i] = in[i]->mlu_data();
+      in_tensor.push_back(in[i]->mlu_tensor());
+    }
+    for (size_t i = 0; i < output_addrs_.size(); ++i) {
+      output_addrs_[i] = out[i]->mlu_data();
+      out_tensor.push_back(out[i]->mlu_tensor());
+    }
+
+#if PRINT_HW_TIME
+    thread_local float hw_time;
+    CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
+#endif
+    /* Since the inputs are marked dim-mutable via cnmlSetTensorDimMutable,
+     * cnmlComputeFusionOpForward_V3 is replaced by
+     * cnmlComputeFusionOpForward_V4, which also takes the cnmlTensor_t
+     * handles of the runtime tensors. */
+    CNML_CALL(cnmlComputeFusionOpForward_V4(fusion_op_,
+                                            &in_tensor[0],
+                                            input_addrs_.data(),
+                                            input_addrs_.size(),
+                                            &out_tensor[0],
+                                            output_addrs_.data(),
+                                            output_addrs_.size(),
+                                            que,
+                                            NULL));
+#if PRINT_HW_TIME
+    CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
+    CNRT_CALL(cnrtSyncQueue(que));
+    CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
+    hw_time /= 1000.0f;
+    DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
+    std::lock_guard<std::mutex> lk(time_mut_);
+    time_log_.push_back(hw_time);
+#endif
+  }
+
   template <typename T>
   void* RegisterConstData(size_t len) {
     void* addr = malloc(len * sizeof(T));
diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h
index 24e181a47b1e91741e53a9b63dcf5cab4171f7df..79b4b9c0d932052d6f5fe52c1e3c784570d13843 100644
--- a/lite/kernels/mlu/bridges/tensor.h
+++ b/lite/kernels/mlu/bridges/tensor.h
@@ -49,6 +49,7 @@ class MLUTensor {
     return mlu_ptr_;
   }
 
+  cnmlDataType_t dtype() { return mlu_dtype_; }
   void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
   const std::vector<int64_t>& get_origin_shape() const { return origin_shape_; }
diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc
index 7dca67fc30845f23c2c1334697094bea982ef897..7676e8426583ead52fb00a6c19f48bc1760bacbc 100644
--- a/lite/kernels/mlu/bridges/test_helper.cc
+++ b/lite/kernels/mlu/bridges/test_helper.cc
@@ -89,7 +89,10 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
   }
   graph.Compile(CNML_MLU270, 1);
-  graph.Compute(forward_param, queue_);
+  graph.Compute(forward_param,
+                queue_,
+                *(graph.MutableInputs()),
+                *(graph.MutableOutputs()));
   CNRT_CALL(cnrtSyncQueue(queue_));
 
   for (auto& output_name : output_var_names) {
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index 4b1ffad2b0582283b138d47a2ecd70a578cde9be..b293494d7987fa17d1f455cf62fa13b3bb27b5fe 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -22,12 +22,16 @@
 #include "lite/api/paddle_place.h"
 #include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
 #include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
 #include "lite/core/type_system.h"
 #include "lite/core/types.h"
 #include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/tensor.h"
 #include "lite/kernels/npu/bridges/engine.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/utils/env.h"
 
 namespace paddle {
 namespace lite {
@@ -76,10 +80,20 @@ class SubgraphEngine : public subgraph::Engine {
   bool InputShapeChanged() {
     std::vector<std::vector<int64_t>> new_shape;
+    // full input shapes (batch dim included); only used when
+    // BATCH_SIZE_CHANGEABLE is enabled
+    std::vector<std::vector<int64_t>> all_shape;
     for (auto origin_itensor : origin_itensors_) {
-      new_shape.push_back(origin_itensor->dims().Vectorize());
+      if (GetBoolFromEnv("BATCH_SIZE_CHANGEABLE")) {
+        auto iv = origin_itensor->dims().Vectorize();
+        all_shape.push_back(iv);
+        // drop the batch dim so that shapes differing only in batch size
+        // share one compiled graph
+        iv.erase(iv.begin());
+        new_shape.push_back(iv);
+      } else {
+        new_shape.push_back(origin_itensor->dims().Vectorize());
+      }
     }
     inputs_shape_ = new_shape;
+    all_inputs_shape_ = all_shape;
     if (shape_graph_map_.count(inputs_shape_) > 0) {
       return false;
     }
@@ -99,9 +113,14 @@ class SubgraphEngine : public subgraph::Engine {
     status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
     for (auto& input_name : input_names_) {
       auto input_tensor = scope_->FindMutableTensor(input_name);
-      origin_itensors_.push_back(input_tensor);
-      new_shape.push_back(input_tensor->dims().Vectorize());
+      if (GetBoolFromEnv("BATCH_SIZE_CHANGEABLE")) {
+        auto iv = input_tensor->dims().Vectorize();
+        iv.erase(iv.begin());
+        new_shape.push_back(iv);
+      } else {
+        new_shape.push_back(input_tensor->dims().Vectorize());
+      }
       CHECK(input_tensor);
 
       auto input_node =
           graph->AddNode(input_name,
@@ -214,27 +233,30 @@ class SubgraphEngine : public subgraph::Engine {
     return name;
   }
 
-  int LaunchDeviceProgram() override {
-    // prepare input and output memory
-    auto graph = shape_graph_map_[inputs_shape_];
-    auto* graph_input = graph->MutableInputs();
-    auto* graph_output = graph->MutableOutputs();
-    CHECK_EQ(graph_input->size(), origin_itensors_.size());
-    CHECK_EQ(graph_output->size(), origin_otensors_.size());
-
-    for (size_t i = 0; i < origin_itensors_.size(); ++i) {
-      graph_input->at(i)->set_mlu_ptr(
-          const_cast<void*>(origin_itensors_[i]->raw_data()));
-    }
-    for (size_t i = 0; i < origin_otensors_.size(); ++i) {
-      origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
-      void* p_data = static_cast<void*>(
-          origin_otensors_[i]
-              ->mutable_data<typename paddle::lite::subgraph::mlu::
-                                 FPTypeTraits<Precision>::T>(TARGET(kMLU)));
-      graph_output->at(i)->set_mlu_ptr(p_data);
+  void InferOutputsShapeOnly() {
+    // infer only the output shapes; used when BATCH_SIZE_CHANGEABLE is enabled
+    const auto iter = in_out_shape_map_.find(all_inputs_shape_);
+    if (iter != in_out_shape_map_.end()) {
+      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+        origin_otensors_[i]->Resize(iter->second[i]);
+      }
+    } else {
+      for (auto& inst : origin_program_) {
+        auto op = inst.op();
+        CHECK(op);
+        op->CheckShape();
+        const_cast<OpLite*>(op)->InferShape();
+      }
+      std::vector<std::vector<int64_t>> outs_shape;
+      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+        outs_shape.push_back(origin_otensors_[i]->dims().Vectorize());
+      }
+      in_out_shape_map_[all_inputs_shape_] = outs_shape;
     }
+  }
 
+  int LaunchDeviceProgram() override {
+    // prepare input and output memory
     auto& mlu_context = this->ctx_->template As<MLUContext>();
     auto exec_queue = mlu_context.exec_queue();
     u32_t affinity = mlu_context.affinity();
@@ -244,7 +266,91 @@ class SubgraphEngine : public subgraph::Engine {
     forward_param.affinity = &affinity;
     forward_param.end = CNRT_PARAM_END;
 
-    graph->Compute(forward_param, exec_queue);
+    auto graph = shape_graph_map_[inputs_shape_];
+    auto* graph_input = graph->MutableInputs();
+    auto* graph_output = graph->MutableOutputs();
+    CHECK_EQ(graph_input->size(), origin_itensors_.size());
+    CHECK_EQ(graph_output->size(), origin_otensors_.size());
+
+    if (GetBoolFromEnv("BATCH_SIZE_CHANGEABLE")) {
+      // look up (or create) the runtime input MLUTensors for this input shape
+      std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
+          graph_in;
+      if (shape_tensor_map_in_.find(all_inputs_shape_) !=
+          shape_tensor_map_in_.end()) {
+        graph_in = shape_tensor_map_in_[all_inputs_shape_];
+        for (size_t i = 0; i < origin_itensors_.size(); ++i) {
+          graph_in[i]->set_mlu_ptr(
+              const_cast<void*>(origin_itensors_[i]->raw_data()));
+        }
+      } else {
+        graph_in.reserve(origin_itensors_.size());
+        for (size_t i = 0; i < origin_itensors_.size(); ++i) {
+          paddle::lite::subgraph::mlu::MLUTensor tmp(
+              origin_itensors_[i]->dims().Vectorize());
+          tmp.set_mlu_dtype(graph_input->at(i)->dtype());
+          tmp.set_mlu_ptr(const_cast<void*>(origin_itensors_[i]->raw_data()));
+          graph_in.push_back(
+              std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
+        }
+        shape_tensor_map_in_[all_inputs_shape_] = graph_in;
+      }
+
+      // TODO(zhangmingwei): for now we simply call every op's InferShape to
+      // get the output shapes; it may be better to use CNML's API for this,
+      // once CNML's tensor dimensions exactly match Lite's tensor shapes.
+      InferOutputsShapeOnly();
+      // const std::vector<std::vector<int64_t>> new_output_size =
+      //     graph->InferOutputsShape(graph_in);
+
+      // look up (or create) the runtime output MLUTensors for this input shape
+      std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
+          graph_out;
+
+      if (shape_tensor_map_out_.find(all_inputs_shape_) !=
+          shape_tensor_map_out_.end()) {
+        graph_out = shape_tensor_map_out_[all_inputs_shape_];
+        for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+          // origin_otensors_[i]->Resize(new_output_size.at(i));
+          void* p_data = static_cast<void*>(
+              origin_otensors_[i]
+                  ->mutable_data<typename paddle::lite::subgraph::mlu::
+                                     FPTypeTraits<Precision>::T>(
+                      TARGET(kMLU)));
+          graph_out[i]->set_mlu_ptr(p_data);
+        }
+      } else {
+        graph_out.reserve(origin_otensors_.size());
+        for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+          // origin_otensors_[i]->Resize(new_output_size.at(i));
+          void* p_data = static_cast<void*>(
+              origin_otensors_[i]
+                  ->mutable_data<typename paddle::lite::subgraph::mlu::
+                                     FPTypeTraits<Precision>::T>(
+                      TARGET(kMLU)));
+          paddle::lite::subgraph::mlu::MLUTensor tmp(
+              origin_otensors_[i]->dims().Vectorize());
+          tmp.set_mlu_dtype(graph_output->at(i)->dtype());
+          tmp.set_mlu_ptr(p_data);
+          graph_out.push_back(
+              std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
+        }
+        shape_tensor_map_out_[all_inputs_shape_] = graph_out;
+      }
+      graph->Compute(forward_param, exec_queue, graph_in, graph_out);
+    } else {
+      for (size_t i = 0; i < origin_itensors_.size(); ++i) {
+        graph_input->at(i)->set_mlu_ptr(
+            const_cast<void*>(origin_itensors_[i]->raw_data()));
+      }
+      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
+        origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
+        void* p_data = static_cast<void*>(
+            origin_otensors_[i]
+                ->mutable_data<typename paddle::lite::subgraph::mlu::
+                                   FPTypeTraits<Precision>::T>(TARGET(kMLU)));
+        graph_output->at(i)->set_mlu_ptr(p_data);
+      }
+      graph->Compute(forward_param, exec_queue);
+    }
     // // =========== DUMP ===================
     // for (auto input_name : input_names_) {
@@ -278,9 +384,24 @@ class SubgraphEngine : public subgraph::Engine {
 
   paddle::lite_api::PrecisionType fp_type_;
   std::vector<std::vector<int64_t>> inputs_shape_{};
+  std::vector<std::vector<int64_t>> all_inputs_shape_{};
   std::map<std::vector<std::vector<int64_t>>,
            std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
       shape_graph_map_{};
+  // runtime output MLUTensors cached per full input shape; used when
+  // BATCH_SIZE_CHANGEABLE is enabled
+  std::map<std::vector<std::vector<int64_t>>,
+           std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
+      shape_tensor_map_out_{};
+  // runtime input MLUTensors cached per full input shape; used when
+  // BATCH_SIZE_CHANGEABLE is enabled
+  std::map<std::vector<std::vector<int64_t>>,
+           std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
+      shape_tensor_map_in_{};
+  // output shapes cached per full input shape; used when
+  // BATCH_SIZE_CHANGEABLE is enabled
+  std::map<std::vector<std::vector<int64_t>>, std::vector<std::vector<int64_t>>>
+      in_out_shape_map_{};
 };  // namespace mlu
 
 template
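
Usage note (illustrative, not part of the patch): the feature is gated on the BATCH_SIZE_CHANGEABLE environment variable, which has to be set before the MLU subgraph program is built so that Graph::AddInput marks dim 0 of every graph input as mutable. A minimal host-side sketch follows; BuildAndRunMluModel and the input shape are placeholders standing in for whatever builds a Paddle-Lite predictor with an MLU place, not APIs introduced by this patch.

    #include <cstdint>
    #include <cstdlib>
    #include <vector>

    // Placeholder: builds a Paddle-Lite predictor with an MLU valid place,
    // resizes input 0 to `shape`, fills it, and runs one inference.
    void BuildAndRunMluModel(const std::vector<int64_t>& shape);

    int main() {
      // Enable the mutable-batch path before any subgraph program is built.
      setenv("BATCH_SIZE_CHANGEABLE", "1", /*overwrite=*/1);
      // Only dim 0 varies between runs; the same compiled fusion op is reused,
      // and the per-shape MLUTensor caches above are keyed by the full shape.
      for (int batch : {1, 4, 8}) {
        BuildAndRunMluModel({batch, 3, 224, 224});
      }
      return 0;
    }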
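For reference, a self-contained sketch (not from the patch, single input assumed for brevity) of the two shape keys used by the engine: inputs_shape_ drops the batch dimension so any batch size maps onto the same compiled graph in shape_graph_map_, while all_inputs_shape_ keeps the batch dimension and keys the cached runtime MLUTensors and inferred output shapes.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      // Example full input shape, as stored in all_inputs_shape_.
      std::vector<int64_t> full_shape{8, 3, 224, 224};
      // Key used for shape_graph_map_: the batch dim is erased, as in
      // InputShapeChanged(), so batches 1/4/8 share one compiled Graph.
      std::vector<int64_t> graph_key(full_shape);
      graph_key.erase(graph_key.begin());
      assert((graph_key == std::vector<int64_t>{3, 224, 224}));
      // shape_tensor_map_in_/out_ and in_out_shape_map_ remain keyed by
      // full_shape, so each concrete batch size gets its own runtime tensors.
      return 0;
    }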