Commit c7ffa64a, authored by jackzhang235

fix bug when setting output tensors' dimensions

Parent: b2e1827a
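In short: with BATCH_SIZE_CHANGEBLE enabled, the MLUTensor and shape caches are now keyed by the complete input shapes including the batch dimension (all_inputs_shape_) rather than by the batch-stripped inputs_shape_, and output tensors are resized through a new InferOutputsShapeOnly() helper that caches the host-side InferShape() results per input-shape key. A minimal, self-contained sketch of that keying idea follows; the LookupOrInferOutputShapes function and its scaffolding are illustrative only and are not code from this commit.

#include <cstdint>
#include <map>
#include <vector>

// All tensor shapes of a subgraph, e.g. {{4, 3, 224, 224}} for one input.
using Shapes = std::vector<std::vector<int64_t>>;

// Keyed by the *full* input shapes, so two runs that differ only in batch
// size get separate entries (this mirrors the new in_out_shape_map_ member).
std::map<Shapes, Shapes> in_out_shape_map;

Shapes LookupOrInferOutputShapes(const Shapes& all_inputs_shape) {
  auto it = in_out_shape_map.find(all_inputs_shape);
  if (it != in_out_shape_map.end()) {
    return it->second;  // cache hit: reuse the previously inferred output dims
  }
  Shapes outs_shape;  // cache miss: the patch runs each op's InferShape() here
  in_out_shape_map[all_inputs_shape] = outs_shape;
  return outs_shape;
}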
@@ -79,6 +79,27 @@ class Graph {
    return nodes_.find(name) != nodes_.end();
  }
  // const std::vector<std::vector<int64_t>> InferOutputsShape(
  //     std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
  //         graph_in) {
  //   CHECK_EQ(graph_in.size(), inputs_.size());
  //   std::vector<cnmlTensor_t> inputs(inputs_.size());
  //   for (size_t i = 0; i < graph_in.size(); ++i) {
  //     inputs[i] = graph_in[i]->mlu_tensor();
  //   }
  //   std::vector<cnmlTensor_t> outputs(outputs_.size());
  //   cnmlInferFusionOpOutputShape(
  //       fusion_op_, inputs.data(), inputs.size(), outputs.data(), outputs.size());
  //
  //   std::vector<std::vector<int64_t>> outputs_shape;
  //   for (size_t i = 0; i < outputs.size(); ++i) {
  //     int len;
  //     cnmlGetTensorLen(outputs[i], &len);
  //     std::vector<int64_t> tmp_shape(len);
  //     cnmlGetTensorShape(outputs[i], tmp_shape.data());
  //     outputs_shape.push_back(std::move(tmp_shape));
  //   }
  //
  //   return outputs_shape;
  // }
  void AddInput(std::shared_ptr<MLUTensor> tensor) {
    inputs_.push_back(tensor->mlu_tensor());
    input_tensors_.push_back(tensor);
@@ -123,6 +144,39 @@ class Graph {
    CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
  }
  void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
    // Refresh the device addresses from the cached input/output MLU tensors.
    input_addrs_.resize(input_tensors_.size());
    output_addrs_.resize(output_tensors_.size());
    for (size_t i = 0; i < input_addrs_.size(); ++i) {
      input_addrs_[i] = input_tensors_[i]->mlu_data();
    }
    for (size_t i = 0; i < output_addrs_.size(); ++i) {
      output_addrs_[i] = output_tensors_[i]->mlu_data();
    }
#if PRINT_HW_TIME
    thread_local float hw_time;
    CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
#endif
    CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
                                            input_addrs_.data(),
                                            input_addrs_.size(),
                                            output_addrs_.data(),
                                            output_addrs_.size(),
                                            &forward_param,
                                            que));
#if PRINT_HW_TIME
    CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
    CNRT_CALL(cnrtSyncQueue(que));
    CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
    hw_time /= 1000.0f;  // microseconds -> milliseconds
    DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
    std::lock_guard<std::mutex> lk(time_mut_);
    time_log_.push_back(hw_time);
#endif
  }
  void Compute(cnrtInvokeFuncParam_t forward_param,
               cnrtQueue_t que,
               const std::vector<std::shared_ptr<MLUTensor>>& in,
...
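Side note on the PRINT_HW_TIME path in the new Compute() overload above: each invocation appends one hardware-time sample (already converted to milliseconds) to time_log_ while holding time_mut_. Below is a minimal sketch of how those samples could be summarized afterwards; the AverageHardwareTimeMs helper is hypothetical and not part of this commit.

#include <mutex>
#include <numeric>
#include <vector>

// Hypothetical helper (illustration only): average the per-invocation
// hardware times that Compute() pushes into time_log_ under time_mut_.
float AverageHardwareTimeMs(std::mutex& time_mut,
                            const std::vector<float>& time_log) {
  std::lock_guard<std::mutex> lk(time_mut);  // same lock Compute() takes
  if (time_log.empty()) return 0.0f;
  float sum = std::accumulate(time_log.begin(), time_log.end(), 0.0f);
  return sum / static_cast<float>(time_log.size());
}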
@@ -80,9 +80,12 @@ class SubgraphEngine : public subgraph::Engine {
  bool InputShapeChanged() {
    std::vector<std::vector<int64_t>> new_shape;
    // used when the batch size may change between runs
    std::vector<std::vector<int64_t>> all_shape;
    for (auto origin_itensor : origin_itensors_) {
      if (GetBoolFromEnv("BATCH_SIZE_CHANGEBLE")) {
        auto iv = origin_itensor->dims().Vectorize();
        all_shape.push_back(iv);
        iv.erase(iv.begin());
        new_shape.push_back(iv);
      } else {
@@ -90,6 +93,7 @@ class SubgraphEngine : public subgraph::Engine {
      }
    }
    inputs_shape_ = new_shape;
    all_inputs_shape_ = all_shape;
    if (shape_graph_map_.count(inputs_shape_) > 0) {
      return false;
    }
@@ -229,78 +233,115 @@ class SubgraphEngine : public subgraph::Engine {
    return name;
  }
  // Resize the origin output tensors for the current (full) input shapes:
  // reuse the cached result if this input-shape key has been seen, otherwise
  // run the ops' host-side InferShape() once and cache the output dims.
  void InferOutputsShapeOnly() {
    const auto iter = in_out_shape_map_.find(all_inputs_shape_);
    if (iter != in_out_shape_map_.end()) {
      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
        origin_otensors_[i]->Resize(iter->second[i]);
      }
    } else {
      for (auto& inst : origin_program_) {
        auto op = inst.op();
        CHECK(op);
        op->CheckShape();
        const_cast<OpLite*>(op)->InferShape();
      }
      std::vector<std::vector<int64_t>> outs_shape;
      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
        outs_shape.push_back(origin_otensors_[i]->dims().Vectorize());
      }
      in_out_shape_map_[all_inputs_shape_] = outs_shape;
    }
  }
  int LaunchDeviceProgram() override {
    // prepare input and output memory
    auto& mlu_context = this->ctx_->template As<MLUContext>();
    auto exec_queue = mlu_context.exec_queue();
    u32_t affinity = mlu_context.affinity();
    cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
    int data_param = 1;
    forward_param.data_parallelism = &data_param;
    forward_param.affinity = &affinity;
    forward_param.end = CNRT_PARAM_END;
    auto graph = shape_graph_map_[inputs_shape_];
    auto* graph_input = graph->MutableInputs();
    auto* graph_output = graph->MutableOutputs();
    CHECK_EQ(graph_input->size(), origin_itensors_.size());
    CHECK_EQ(graph_output->size(), origin_otensors_.size());
    if (GetBoolFromEnv("BATCH_SIZE_CHANGEBLE")) {
      std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
          graph_in;
      if (shape_tensor_map_in_.find(all_inputs_shape_) !=
          shape_tensor_map_in_.end()) {
        graph_in = shape_tensor_map_in_[all_inputs_shape_];
        for (size_t i = 0; i < origin_itensors_.size(); ++i) {
          graph_in[i]->set_mlu_ptr(
              const_cast<void*>(origin_itensors_[i]->raw_data()));
        }
      } else {
        graph_in.reserve(origin_itensors_.size());
        for (size_t i = 0; i < origin_itensors_.size(); ++i) {
          paddle::lite::subgraph::mlu::MLUTensor tmp(
              origin_itensors_[i]->dims().Vectorize());
          tmp.set_mlu_dtype(graph_input->at(i)->dtype());
          tmp.set_mlu_ptr(const_cast<void*>(origin_itensors_[i]->raw_data()));
          graph_in.push_back(
              std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
        }
        shape_tensor_map_in_[all_inputs_shape_] = graph_in;
      }
      InferOutputsShapeOnly();
      // const std::vector<std::vector<int64_t>> new_output_size =
      //     graph->InferOutputsShape(graph_in);
      std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
          graph_out;
      if (shape_tensor_map_out_.find(all_inputs_shape_) !=
          shape_tensor_map_out_.end()) {
        graph_out = shape_tensor_map_out_[all_inputs_shape_];
        for (size_t i = 0; i < origin_otensors_.size(); ++i) {
          // origin_otensors_[i]->Resize(new_output_size.at(i));
          void* p_data = static_cast<void*>(
              origin_otensors_[i]
                  ->mutable_data<typename paddle::lite::subgraph::mlu::
                                     FPTypeTraits<Precision>::T>(TARGET(kMLU)));
          graph_out[i]->set_mlu_ptr(p_data);
        }
      } else {
        graph_out.reserve(origin_otensors_.size());
        for (size_t i = 0; i < origin_otensors_.size(); ++i) {
          // origin_otensors_[i]->Resize(new_output_size.at(i));
          void* p_data = static_cast<void*>(
              origin_otensors_[i]
                  ->mutable_data<typename paddle::lite::subgraph::mlu::
                                     FPTypeTraits<Precision>::T>(TARGET(kMLU)));
          paddle::lite::subgraph::mlu::MLUTensor tmp(
              origin_otensors_[i]->dims().Vectorize());
          tmp.set_mlu_dtype(graph_output->at(i)->dtype());
          tmp.set_mlu_ptr(p_data);
          graph_out.push_back(
              std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
        }
        shape_tensor_map_out_[all_inputs_shape_] = graph_out;
      }
      graph->Compute(forward_param, exec_queue, graph_in, graph_out);
    } else {
      for (size_t i = 0; i < origin_itensors_.size(); ++i) {
        graph_input->at(i)->set_mlu_ptr(
            const_cast<void*>(origin_itensors_[i]->raw_data()));
      }
      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
        origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
        void* p_data = static_cast<void*>(
            origin_otensors_[i]
                ->mutable_data<typename paddle::lite::subgraph::mlu::
                                   FPTypeTraits<Precision>::T>(TARGET(kMLU)));
        graph_output->at(i)->set_mlu_ptr(p_data);
      }
      graph->Compute(forward_param, exec_queue);
    }
    // // =========== DUMP ===================
    // for (auto input_name : input_names_) {
@@ -334,6 +375,7 @@ class SubgraphEngine : public subgraph::Engine {
  paddle::lite_api::PrecisionType fp_type_;
  std::vector<std::vector<int64_t>> inputs_shape_{};
  std::vector<std::vector<int64_t>> all_inputs_shape_{};
  std::map<std::vector<std::vector<int64_t>>,
           std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
      shape_graph_map_{};
@@ -343,6 +385,7 @@ class SubgraphEngine : public subgraph::Engine {
  std::map<std::vector<std::vector<int64_t>>,
           std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
      shape_tensor_map_in_{};
  std::map<std::vector<std::vector<int64_t>>,
           std::vector<std::vector<int64_t>>>
      in_out_shape_map_{};
};  // namespace mlu
template <PrecisionType Precision>
...