Commit 547750ef authored by jackzhang235

Merge branch 'Batch_Size' into develop

@@ -47,7 +47,6 @@ class Graph {
CNRT_CALL(cnrtCreateNotifier(&notifier_end_));
#endif
}
~Graph() {
FreeConstData();
CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
@@ -62,7 +61,6 @@ class Graph {
<< " process:" << total_time / time_log_.size() << std::endl;
#endif
}
// Data node
std::shared_ptr<MLUTensor> AddNode(
const std::string& name,
@@ -81,9 +79,39 @@ class Graph {
return nodes_.find(name) != nodes_.end();
}
// const std::vector<std::vector<int64_t>>
// InferOutputsShape(std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
// graph_in){
// CHECK_EQ(graph_in.size(), inputs_.size());
// std::vector<cnmlTensor_t> inputs(inputs_.size());
// for (size_t i = 0; i < graph_in.size(); ++i) {
// inputs[i] = graph_in[i]->mlu_tensor();
// }
// std::vector<cnmlTensor_t> outputs(outputs_.size());
// cnmlInferFusionOpOutputShape(fusion_op_, inputs.data(), inputs.size(),
// outputs.data(), outputs.size());
//
// std::vector<std::vector<int64_t>> outputs_shape;
// for (size_t i = 0; i < outputs.size(); ++i) {
// int len;
// cnmlGetTensorLen(outputs[i], &len);
// std::vector<int64_t> tmp_shape(len);
// cnmlGetTensorShape(outputs[i], tmp_shape.data());
// outputs_shape.push_back(std::move(tmp_shape));
// }
//
// return outputs_shape;
// }
void AddInput(std::shared_ptr<MLUTensor> tensor) {
inputs_.push_back(tensor->mlu_tensor());
input_tensors_.push_back(tensor);
if (GetBoolFromEnv("BATCH_SIZE_CHANGEABLE")) {
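// Only the first (batch) dimension is marked mutable below; the remaining
// three dimensions stay fixed.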
constexpr int input_dimNb = 4;
bool input_dim_mutable[4] = {true, false, false, false};
cnmlSetTensorDimMutable(
tensor->mlu_tensor(), input_dim_mutable, input_dimNb);
}
}
void AddOutput(std::shared_ptr<MLUTensor> tensor) {
@@ -151,6 +179,49 @@ class Graph {
#endif
}
void Compute(cnrtInvokeFuncParam_t forward_param,
cnrtQueue_t que,
const std::vector<std::shared_ptr<MLUTensor>>& in,
const std::vector<std::shared_ptr<MLUTensor>>& out) {
std::vector<cnmlTensor_t> in_tensor;
std::vector<cnmlTensor_t> out_tensor;
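// Gather the cnml tensor handles and device buffer addresses from the
// runtime MLUTensors passed in.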
input_addrs_.resize(in.size());
output_addrs_.resize(out.size());
for (size_t i = 0; i < input_addrs_.size(); ++i) {
input_addrs_[i] = in[i]->mlu_data();
in_tensor.push_back(in[i]->mlu_tensor());
}
for (size_t i = 0; i < output_addrs_.size(); ++i) {
output_addrs_[i] = out[i]->mlu_data();
out_tensor.push_back(out[i]->mlu_tensor());
}
#if PRINT_HW_TIME
thread_local float hw_time;
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
#endif
/* Because cnmlSetTensorDimMutable is used, call cnmlComputeFusionOpForward_V4
 * instead of cnmlComputeFusionOpForward_V3. */
CNML_CALL(cnmlComputeFusionOpForward_V4(fusion_op_,
&in_tensor[0],
input_addrs_.data(),
input_addrs_.size(),
&out_tensor[0],
output_addrs_.data(),
output_addrs_.size(),
que,
NULL));
#if PRINT_HW_TIME
CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
CNRT_CALL(cnrtSyncQueue(que));
CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
hw_time /= 1000.0f;
DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
std::lock_guard<std::mutex> lk(time_mut_);
time_log_.push_back(hw_time);
#endif
}
template <typename T>
void* RegisterConstData(size_t len) {
void* addr = malloc(len * sizeof(T));
...
@@ -49,6 +49,7 @@ class MLUTensor {
return mlu_ptr_;
}
cnmlDataType_t dtype() { return mlu_dtype_; }
void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
const std::vector<int64_t>& get_origin_shape() const { return origin_shape_; }
...
@@ -89,7 +89,10 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
}
graph.Compile(CNML_MLU270, 1);
graph.Compute(forward_param,
queue_,
*(graph.MutableInputs()),
*(graph.MutableOutputs()));
CNRT_CALL(cnrtSyncQueue(queue_));
for (auto& output_name : output_var_names) {
...
@@ -22,12 +22,16 @@
#include "lite/api/paddle_place.h"
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h" #include "lite/core/type_system.h"
#include "lite/core/types.h" #include "lite/core/types.h"
#include "lite/kernels/mlu/bridges/graph.h" #include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/tensor.h"
#include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/utils/env.h"
namespace paddle {
namespace lite {
@@ -76,10 +80,20 @@ class SubgraphEngine : public subgraph::Engine {
bool InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
// used when the batch size is changeable
std::vector<std::vector<int64_t>> all_shape;
for (auto origin_itensor : origin_itensors_) { for (auto origin_itensor : origin_itensors_) {
if (GetBoolFromEnv("BATCH_SIZE_CHANGEABLE")) {
auto iv = origin_itensor->dims().Vectorize();
all_shape.push_back(iv);
iv.erase(iv.begin());
new_shape.push_back(iv);
} else {
new_shape.push_back(origin_itensor->dims().Vectorize());
}
}
inputs_shape_ = new_shape;
all_inputs_shape_ = all_shape;
if (shape_graph_map_.count(inputs_shape_) > 0) {
return false;
}
@@ -99,9 +113,14 @@ class SubgraphEngine : public subgraph::Engine {
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
origin_itensors_.push_back(input_tensor);
if (GetBoolFromEnv("BATCH_SIZE_CHANGEABLE")) {
auto iv = input_tensor->dims().Vectorize();
iv.erase(iv.begin());
new_shape.push_back(iv);
} else {
new_shape.push_back(input_tensor->dims().Vectorize());
}
CHECK(input_tensor);
auto input_node = graph->AddNode(input_name,
@@ -214,14 +233,110 @@ class SubgraphEngine : public subgraph::Engine {
return name;
}
void InferOutputsShapeOnly() {
// infer outputs shape when BATCH_SIZE_CHANGEABLE is enabled
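// Results are cached per input-shape set, so the host-side InferShape pass
// below only runs on a cache miss.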
const auto iter = in_out_shape_map_.find(all_inputs_shape_);
if (iter != in_out_shape_map_.end()) {
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
origin_otensors_[i]->Resize(iter->second[i]);
}
} else {
for (auto& inst : origin_program_) {
auto op = inst.op();
CHECK(op);
op->CheckShape();
const_cast<OpLite*>(op)->InferShape();
}
std::vector<std::vector<int64_t>> outs_shape;
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
outs_shape.push_back(origin_otensors_[i]->dims().Vectorize());
}
in_out_shape_map_[all_inputs_shape_] = outs_shape;
}
}
int LaunchDeviceProgram() override {
// prepare input and output memory
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto exec_queue = mlu_context.exec_queue();
u32_t affinity = mlu_context.affinity();
cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
int data_param = 1;
forward_param.data_parallelism = &data_param;
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
auto graph = shape_graph_map_[inputs_shape_];
auto* graph_input = graph->MutableInputs();
auto* graph_output = graph->MutableOutputs();
CHECK_EQ(graph_input->size(), origin_itensors_.size());
CHECK_EQ(graph_output->size(), origin_otensors_.size());
if (GetBoolFromEnv("BATCH_SIZE_CHANGEABLE")) {
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
graph_in;
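// Reuse runtime input MLUTensors cached for this exact input-shape set;
// only their device data pointers need refreshing.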
if (shape_tensor_map_in_.find(all_inputs_shape_) !=
shape_tensor_map_in_.end()) {
graph_in = shape_tensor_map_in_[all_inputs_shape_];
for (size_t i = 0; i < origin_itensors_.size(); ++i) {
graph_in[i]->set_mlu_ptr(
const_cast<void*>(origin_itensors_[i]->raw_data()));
}
} else {
graph_in.reserve(origin_itensors_.size());
for (size_t i = 0; i < origin_itensors_.size(); ++i) {
paddle::lite::subgraph::mlu::MLUTensor tmp(
origin_itensors_[i]->dims().Vectorize());
tmp.set_mlu_dtype(graph_input->at(i)->dtype());
tmp.set_mlu_ptr(const_cast<void*>(origin_itensors_[i]->raw_data()));
graph_in.push_back(
std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
}
shape_tensor_map_in_[all_inputs_shape_] = graph_in;
}
// TODO(zhangmingwei): we just call every op's infer_shape to get the outputs'
// shape; it may be better to use cnml's API to get the output shapes. That
// becomes possible once cnml's tensor dimensions match lite's tensor shapes
// exactly.
InferOutputsShapeOnly();
// const std::vector<std::vector<int64_t>> new_output_size =
// graph->InferOutputsShape(graph_in);
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
graph_out;
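// The same caching scheme is used for the runtime output MLUTensors,
// keyed by the full (batch-included) input shapes.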
if (shape_tensor_map_out_.find(all_inputs_shape_) !=
shape_tensor_map_out_.end()) {
graph_out = shape_tensor_map_out_[all_inputs_shape_];
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
// origin_otensors_[i]->Resize(new_output_size.at(i));
void* p_data = static_cast<void*>(
origin_otensors_[i]
->mutable_data<typename paddle::lite::subgraph::mlu::
FPTypeTraits<Precision>::T>(TARGET(kMLU)));
graph_out[i]->set_mlu_ptr(p_data);
}
} else {
graph_out.reserve(origin_otensors_.size());
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
// origin_otensors_[i]->Resize(new_output_size.at(i));
void* p_data = static_cast<void*>(
origin_otensors_[i]
->mutable_data<typename paddle::lite::subgraph::mlu::
FPTypeTraits<Precision>::T>(TARGET(kMLU)));
paddle::lite::subgraph::mlu::MLUTensor tmp(
origin_otensors_[i]->dims().Vectorize());
tmp.set_mlu_dtype(graph_output->at(i)->dtype());
tmp.set_mlu_ptr(p_data);
graph_out.push_back(
std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
}
shape_tensor_map_out_[all_inputs_shape_] = graph_out;
}
graph->Compute(forward_param, exec_queue, graph_in, graph_out);
} else {
for (size_t i = 0; i < origin_itensors_.size(); ++i) {
graph_input->at(i)->set_mlu_ptr(
const_cast<void*>(origin_itensors_[i]->raw_data()));
@@ -230,21 +345,12 @@ class SubgraphEngine : public subgraph::Engine {
origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
void* p_data = static_cast<void*>(
origin_otensors_[i]
->mutable_data<typename paddle::lite::subgraph::mlu::
FPTypeTraits<Precision>::T>(TARGET(kMLU)));
graph_output->at(i)->set_mlu_ptr(p_data);
}
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto exec_queue = mlu_context.exec_queue();
u32_t affinity = mlu_context.affinity();
cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
int data_param = 1;
forward_param.data_parallelism = &data_param;
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
graph->Compute(forward_param, exec_queue);
}
// // =========== DUMP ===================
// for (auto input_name : input_names_) {
@@ -278,9 +384,24 @@ class SubgraphEngine : public subgraph::Engine {
paddle::lite_api::PrecisionType fp_type_;
std::vector<std::vector<int64_t>> inputs_shape_{};
std::vector<std::vector<int64_t>> all_inputs_shape_{};
std::map<std::vector<std::vector<int64_t>>,
std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
shape_graph_map_{};
// search output runtime MLUTensor for certain output shape when enable
// BATCH_SIZE_CHANGEABLE
std::map<std::vector<std::vector<int64_t>>,
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
shape_tensor_map_out_{};
// search input runtime MLUTensor for certain input shape when enable
// BATCH_SIZE_CHANGEABLE
std::map<std::vector<std::vector<int64_t>>,
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
shape_tensor_map_in_{};
// search output shape for certain input shape when enable
// BATCH_SIZE_CHANGEABLE
std::map<std::vector<std::vector<int64_t>>, std::vector<std::vector<int64_t>>>
in_out_shape_map_{};
}; // namespace mlu
template <PrecisionType Precision>
...