Commit a25331bc authored by nhzlx

cherry-pick from feature/anakin-engine: deal with changing input shapes when using Anakin #16189

Parent c79f06d3
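The user-facing change is the new max_input_shape argument on AnalysisConfig::EnableAnakinEngine: each graph input is given the largest 4-D (NCHW) shape it may take at runtime, so the engine can size its buffers once instead of rebuilding the net whenever an input grows. A minimal usage sketch, assuming the usual paddle namespace and header layout; the input name "image" and its shape are placeholders, not part of the commit:

#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

// Hypothetical configuration helper; "image" is a placeholder input name.
void ConfigureAnakin(paddle::AnalysisConfig *config) {
  std::map<std::string, std::vector<int>> max_input_shape;
  // Largest NCHW shape this input may take; dim 0 is overwritten by
  // max_batch_size inside the converter.
  max_input_shape["image"] = {1, 3, 608, 608};
  config->EnableAnakinEngine(/*max_batch_size=*/1, max_input_shape);
}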
@@ -72,6 +72,7 @@ pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
 pass_library(runtime_context_cache_pass base)
 pass_library(simplify_anakin_detection_pattern_pass inference)
+pass_library(anakin_fillconstant_elementwisemul_fuse inference)
 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -82,7 +83,7 @@ foreach (index RANGE 3 6)
   file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
 endforeach()
-foreach (index RANGE 3 6)
+foreach (index RANGE 2 6)
   file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
 endforeach()
...
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(fill_constant); \
GET_IR_NODE(fill_constant_out); \
GET_IR_NODE(elementwise_mul); \
GET_IR_NODE(elementwise_mul_out);
std::unique_ptr<ir::Graph> AnakinFillconstantElementwisemulFuse::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_op_input("elementwise_mul", "X")
->AsInput();
patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
pattern_name);
pattern(x);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_NODES;
PADDLE_ENFORCE(subgraph.count(x));
auto* elementwise_in = subgraph.at(x);
float constant_value =
boost::get<float>(fill_constant->Op()->GetAttr("value"));
framework::OpDesc new_op_desc;
new_op_desc.SetType("scale");
new_op_desc.SetInput("X", {elementwise_in->Name()});
new_op_desc.SetAttr("scale", constant_value);
new_op_desc.SetAttr("bias", static_cast<float>(0.0));
new_op_desc.SetAttr("bias_after_scale", true);
new_op_desc.SetOutput("Out", {elementwise_mul_out->Name()});
new_op_desc.Flush();
// Create a new node for the fused op.
auto* scale_op = graph->CreateOpNode(&new_op_desc);
IR_NODE_LINK_TO(elementwise_in, scale_op); // Input
IR_NODE_LINK_TO(scale_op, elementwise_mul_out); // Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(),
{fill_constant, fill_constant_out, elementwise_mul});
};
gpd(graph.get(), handler);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse,
paddle::framework::ir::AnakinFillconstantElementwisemulFuse);
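The pass above hinges on a small algebraic identity: elementwise-multiplying a tensor by the output of a fill_constant op is the same as applying Paddle's scale op with scale = value and bias = 0 (bias applied after scaling). A standalone numeric sketch of that identity, illustrative only and not part of the commit:

#include <cstddef>
#include <vector>

// elementwise_mul(x, fill_constant(value)) == scale(x, scale=value, bias=0)
// with bias_after_scale = true; this is the rewrite the pass performs.
std::vector<float> ScaleEquivalent(const std::vector<float> &x, float value) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = value * x[i] + 0.0f;  // scale first, then the (zero) bias
  }
  return out;
}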
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class AnakinFillconstantElementwisemulFuse : public FusePassBase {
public:
virtual ~AnakinFillconstantElementwisemulFuse() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
@@ -1596,6 +1596,29 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
   return multiclass_nms_out;
 }
+PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
+    PDNode *elementwise_op_input) {
+  auto fill_constant =
+      pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
+
+  auto fill_constant_out = pattern->NewNode(fill_constant_out_repr())
+                               ->assert_is_op_output("fill_constant")
+                               ->assert_is_op_input("elementwise_mul", "Y")
+                               ->AsIntermediate();
+
+  auto elementwise_mul_op =
+      pattern->NewNode(elementwise_mul_repr())->assert_is_op("elementwise_mul");
+
+  auto elementwise_mul_out = pattern->NewNode(elementwise_mul_out_repr())
+                                 ->assert_is_op_output("elementwise_mul")
+                                 ->AsOutput();
+
+  fill_constant_out->LinksFrom({fill_constant});
+  elementwise_mul_op->LinksFrom({elementwise_op_input, fill_constant_out});
+  elementwise_mul_out->LinksFrom({elementwise_mul_op});
+  return elementwise_mul_out;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -856,6 +856,21 @@ struct AnakinDetectionPattern : public PatternBase {
   }
 };
+struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
+  AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
+                                       const std::string& name_scope)
+      : PatternBase(pattern, name_scope,
+                    "anakin_fillconstant_elementwisemul_fuse") {}
+
+  PDNode* operator()(PDNode* elementwise_op_input);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(fill_constant);
+  PATTERN_DECL_NODE(fill_constant_out);
+  PATTERN_DECL_NODE(elementwise_mul);
+  PATTERN_DECL_NODE(elementwise_mul_out);
+};
+
 }  // namespace patterns
 // Link two ir::Nodes from each other.
...
@@ -215,6 +215,7 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
 }
 template class SimplifyAnakinDetectionPatternPass<1>;
+template class SimplifyAnakinDetectionPatternPass<2>;
 template class SimplifyAnakinDetectionPatternPass<3>;
 template class SimplifyAnakinDetectionPatternPass<4>;
 template class SimplifyAnakinDetectionPatternPass<5>;
@@ -227,6 +228,9 @@ template class SimplifyAnakinDetectionPatternPass<6>;
 REGISTER_PASS(simplify_anakin_detection_pattern_pass,
               paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
+REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
 REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
               paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
...
-cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
-  elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc DEPS anakin_engine framework_proto scope op_registry)
+cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
+  elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc
+  batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc
+  detection_out.cc scale.cc DEPS anakin_engine framework_proto scope op_registry)
 cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
 cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv)
 cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter)
@@ -13,3 +16,4 @@ cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter res
 cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op)
 cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op)
 cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op)
+cc_test(test_anakin_scale SRCS test_scale_op.cc DEPS anakin_op_converter scale_op math_function)
@@ -14,6 +14,7 @@
 #pragma once
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -72,32 +73,71 @@ class AnakinOpConverter {
   // The scope here should be inited with the parameter vars.
   void ConvertBlockToAnakinEngine(
-      framework::BlockDesc *block_desc, const framework::Scope &scope,
+      framework::BlockDesc *block_desc, framework::Scope *scope,
       const std::vector<std::string> &inputs,
       const std::unordered_set<std::string> &parameters,
       const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
     framework::proto::BlockDesc *block_proto = block_desc->Proto();
-    ConvertBlock(*block_proto, parameters, scope, engine);
+    ConvertBlock(*block_proto, parameters, *scope, engine);
     engine->Freeze();
+    // Check that the max_batch_size set by the user is valid.
+    int max_batch_size = engine->GetMaxBatchSize();
+    PADDLE_ENFORCE(max_batch_size > 0,
+                   "the max_batch_size set from config->EnableAnakinEngine "
+                   "must be greater than 0");
+    // If the user does not specify max_input_shape, we use the input shape
+    // from the block_desc.
+    auto max_input_shape = engine->GetMaxInputShape();
+    std::map<std::string, std::vector<int>> temp_max_input_shape;
     for (auto &input : inputs) {
       if (parameters.count(input)) continue;
-      auto *var = block_desc->FindVar(input);
-      PADDLE_ENFORCE(var, "no variable called %s", input);
-      auto var_shape = var->GetShape();
-      PADDLE_ENFORCE(var_shape.size() == 4);
       std::vector<int> input_shape;
-      for (int i = 0; i < var_shape.size(); i++) {
-        input_shape.push_back(var_shape[i]);
-      }
-      input_shape[0] = engine->GetMaxBatch();
+      input_shape.resize(4);
+      input_shape[0] = max_batch_size;
+      if (max_input_shape.count(input)) {
+        PADDLE_ENFORCE(max_input_shape[input].size() == 4,
+                       "the dimensions of max_input_shape set from "
+                       "config->EnableAnakinEngine must be 4");
+        for (int i = 1; i < 4; i++) {
+          input_shape[i] = max_input_shape[input][i];
+        }
+      } else {
+        auto *var = block_desc->FindVar(input);
+        PADDLE_ENFORCE(var, "no variable called %s", input);
+        auto var_shape = var->GetShape();
+        std::cout << "input :" << input << std::endl;
+        PADDLE_ENFORCE(var_shape.size() == 4);
+        for (size_t i = 1; i < var_shape.size(); i++) {
+          input_shape[i] = var_shape[i];
+        }
+      }
+      temp_max_input_shape[input] = input_shape;
       engine->SetInputShape(input, input_shape);
+      // engine->Graph()->RegistVar(input); // For share from data.
     }
+    engine->SetMaxInputShape(temp_max_input_shape);
+    // engine->Graph()->RegistAllOut();
     engine->Optimize();
     engine->InitGraph();
+    /*
+    for (auto &input : inputs) {
+      platform::CUDAPlace gpu_place(engine->GetDevice());
+      auto input_var = scope->Var();
+      auto input_tensor = input_var->GetMutable<framework::LoDTensor>();
+      auto input_max_shape = temp_max_input_shape[input];
+      input_tensor->Resize(framework::make_ddim(input_max_shape));
+      auto input_data = input_tensor->mutable_data<float>(gpu_place);
+      auto *anakin_input = engine->Net()->get_in(input);
+      ::anakin::saber::Tensor<::anakin::saber::NV> tmp_anakin_tensor(
+          input_data, ::anakin::saber::NV(), 0, input_max_shape);
+      anakin_input->share_from(tmp_anakin_tensor);
+    }
+    */
   }
   void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
...
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/scale.h"
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
float scale = boost::get<float>(op_desc.GetAttr("scale"));
float bias = boost::get<float>(op_desc.GetAttr("bias"));
bool bias_after_scale =
    boost::get<bool>(op_desc.GetAttr("bias_after_scale"));
PADDLE_ENFORCE(bias_after_scale,
               "The anakin scale layer only supports bias after scale now.");
engine_->AddOp(op_name, "Power", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "shift", bias);
engine_->AddOpAttr(op_name, "scale", scale);
engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter);
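The attribute mapping above assumes Anakin's Power layer follows the usual Caffe-style semantics, computing (scale * x + shift)^power per element; with power = 1 and shift = bias this reduces exactly to Paddle's scale op with bias_after_scale = true, which is why the converter rejects the other mode. A scalar sketch of the assumed mapping, illustrative only:

#include <cmath>

// Assumed Power layer semantics per element: (scale * x + shift)^power.
float PowerLayer(float x, float scale, float shift, float power) {
  return std::pow(scale * x + shift, power);
}
// With power = 1.0f: PowerLayer(x, scale, bias, 1.0f) == scale * x + bias,
// i.e. Paddle's scale op with bias_after_scale = true.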
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ScaleOpConverter : public AnakinOpConverter {
public:
ScaleOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ScaleOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
@@ -122,6 +122,8 @@ class AnakinConvertValidation {
     Singleton<AnakinOpConverter>::Global().ConvertOp(
         desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
     engine_->Freeze();
+
+    std::map<std::string, std::vector<int>> temp_max_input_shape;
     for (const auto& input : op_desc_->InputArgumentNames()) {
       if (parameters_.count(input)) continue;
       auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(scope_,
@@ -131,7 +133,9 @@ class AnakinConvertValidation {
         t_shape.push_back(1);
       }
       engine_->SetInputShape(input, t_shape);
+      temp_max_input_shape[input] = t_shape;
     }
+    engine_->SetMaxInputShape(temp_max_input_shape);
     engine_->Optimize();
     engine_->InitGraph();
   }
...
@@ -33,13 +33,14 @@ namespace inference {
 namespace anakin {
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary,
-                                                            int device,
-                                                            int max_batch_size)
+AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(
+    bool need_summary, int device, int max_batch_size,
+    std::map<std::string, std::vector<int>> max_input_shape)
     : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
       net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
   device_ = device;
   max_batch_size_ = max_batch_size;
+  max_input_shape_ = max_input_shape;
 }
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
@@ -75,20 +76,31 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     auto *data = tensor->data<float>();
     auto fluid_input_shape = framework::vectorize2int(tensor->dims());
+    while (fluid_input_shape.size() < 4) {
+      fluid_input_shape.push_back(1);
+    }
     auto *anakin_input = net_->get_in(input.first);
-    auto net_shape = anakin_input->shape();
+    std::vector<int> max_input_shape = max_input_shape_[input.first];
+    int max_shape_sum =
+        std::accumulate(max_input_shape.begin(), max_input_shape.end(), 1,
+                        std::multiplies<int>());
+    PADDLE_ENFORCE(max_shape_sum >= tensor->numel(),
+                   "The anakin input max shape should be greater than"
+                   " or equal to the real input shape. Please set the max "
+                   "input shape using EnableAnakinEngine");
+    /*
     if (tensor->numel() > net_shape.count()) {
       graph_->Reshape(input.first, fluid_input_shape);
       net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(true));
      net_->init(*graph_);
       anakin_input = net_->get_in(input.first);
     }
+    */
     anakin_input->reshape(fluid_input_shape);
-    net_shape = anakin_input->shape();
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
+                                                       // net_shape);
                                                        fluid_input_shape);
     anakin_input->copy_from(tmp_anakin_tensor);
   }
...
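With the reshape-and-rebuild path commented out, Execute now only verifies that the incoming tensor fits into the buffer sized from the configured max input shape; the check is a plain element-count comparison, roughly equivalent to this sketch (names are illustrative):

#include <cstdint>
#include <functional>  // std::multiplies, newly included by this commit
#include <numeric>     // std::accumulate
#include <vector>

// Capacity check mirroring AnakinEngine::Execute: the product of the max
// shape's dims must cover the real input's element count.
bool FitsMaxShape(const std::vector<int> &max_shape, int64_t numel) {
  int64_t capacity =
      std::accumulate(max_shape.begin(), max_shape.end(),
                      static_cast<int64_t>(1), std::multiplies<int64_t>());
  return capacity >= numel;
}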
@@ -15,6 +15,7 @@
 #pragma once
 #include <algorithm>
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
@@ -55,8 +56,9 @@ class AnakinEngine {
   using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
  public:
-  explicit AnakinEngine(bool need_summary = false, int device = 0,
-                        int max_batch_size = 1);
+  explicit AnakinEngine(
+      bool need_summary = false, int device = 0, int max_batch_size = 1,
+      std::map<std::string, std::vector<int>> max_input_shape = {});
   ~AnakinEngine();
   void InitGraph();
   void SetInputShape(const std::string &name, std::vector<int> shape);
@@ -73,10 +75,17 @@ class AnakinEngine {
   NetT *Net() { return net_.get(); }
   GraphT *Graph() { return graph_.get(); }
   std::unique_ptr<AnakinEngine> Clone();
+  const std::map<std::string, std::vector<int>> &GetMaxInputShape() {
+    return max_input_shape_;
+  }
+  void SetMaxInputShape(std::map<std::string, std::vector<int>> shape) {
+    max_input_shape_ = shape;
+  }
+  int GetMaxBatchSize() { return max_batch_size_; }
   void Freeze();
   void Optimize();
   void Save(std::string path) { graph_->save(path); }
-  int GetMaxBatch() { return max_batch_size_; }
+  int GetDevice() { return device_; }
   // void SaveSerializedData(std::string& data) { graph_->save_to_string(data);
   // }
   // void LoadSerializedData(const std::string& data) {
@@ -87,6 +96,7 @@ class AnakinEngine {
  private:
   int max_batch_size_;
+  std::map<std::string, std::vector<int>> max_input_shape_;
   int device_;
   std::unique_ptr<GraphT> graph_;
   std::unique_ptr<NetT> net_;
@@ -104,11 +114,13 @@ class AnakinEngineManager {
     return engines_.at(name).get();
   }
-  AnakinNvEngineT *Create(bool need_summary, int device, int max_batch_size,
-                          std::string engine_name) {
+  AnakinNvEngineT *Create(
+      bool need_summary, int device, int max_batch_size,
+      std::map<std::string, std::vector<int>> max_input_shape,
+      std::string engine_name) {
     std::unique_lock<std::mutex> lk(mut_);
-    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device,
-                                                    max_batch_size);
+    auto *p = new AnakinEngine<NV, Precision::FP32>(
+        need_summary, device, max_batch_size, max_input_shape);
     engines_[engine_name].reset(p);
     return p;
   }
...
@@ -38,6 +38,7 @@ struct SimpleOpTypeSetTeller : public Teller {
     teller_set.insert("transpose2");
     teller_set.insert("density_prior_box");
     teller_set.insert("detection_out");
+    teller_set.insert("scale");
   }
   bool operator()(const std::string& op_type,
...
@@ -57,6 +57,7 @@ struct Argument {
   using unique_ptr_t = std::unique_ptr<void, std::function<void(void*)>>;
   using fusion_statis_t = std::unordered_map<std::string, int>;
   using engine_opt_info_t = std::map<std::string, std::string>;
+  using anakin_max_shape_t = std::map<std::string, std::vector<int>>;
   bool Has(const std::string& key) const { return valid_fields_.count(key); }
@@ -150,6 +151,8 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);
+  DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape,
+                      anakin_max_shape_t);
   DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
...
@@ -77,6 +77,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("engine_opt_info", new std::map<std::string, std::string>(
                                        argument->engine_opt_info()));
       pass->Set("predictor_id", new int(argument->predictor_id()));
+      pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
+                                       argument->anakin_max_input_shape()));
       pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
     }
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include <algorithm>
+#include <map>
 #include <memory>
 #include <set>
 #include <string>
@@ -256,11 +257,14 @@ void AnakinSubgraphPass::CreateAnakinOp(
       input_names_with_id, output_names_with_id, std::to_string(predictor_id));
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
-  int max_batch_size = Get<int>("max_batch_size");
+  auto max_input_shape =
+      Get<std::map<std::string, std::vector<int>>>("max_input_shape");
+  auto max_batch_size = Get<int>("max_batch_size");
   auto *anakin_engine =
       inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
-          true, Get<int>("gpu_device_id"), max_batch_size, engine_key);
+          true, Get<int>("gpu_device_id"), max_batch_size, max_input_shape,
+          engine_key);
   auto *scope = param_scope();
   std::unordered_set<std::string> param_set(params.begin(), params.end());
@@ -268,7 +272,7 @@ void AnakinSubgraphPass::CreateAnakinOp(
   inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
       .ConvertBlockToAnakinEngine(
-          &block_desc_temp, *scope,
+          &block_desc_temp, scope,
           std::vector<std::string>(input_names.begin(), input_names.end()),
          param_set, output_mapping, anakin_engine);
 }
...
@@ -214,13 +214,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
                                    std::to_string(0));
   // Get "" when there is no cached calibration table data.
-  std::string calibration_data = GetTrtCalibTableData(
-      Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
+  bool load_from_memory = Get<bool>("model_from_memory");
+  std::string calibration_data = "";
+  if (!load_from_memory) {
+    calibration_data = GetTrtCalibTableData(
+        Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
+  }
   SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
-  bool load_from_memory = Get<bool>("model_from_memory");
   std::string trt_engine_serialized_data = "";
   if (load_from_memory) {
     std::map<std::string, std::string> engine_opt_info =
...
@@ -30,7 +30,6 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // The parameters are on the cpu, therefore, synchronization is not necessary.
   if (!argument->use_gpu()) return;
-  return;
   auto &graph = argument->main_graph();
   std::vector<std::string> repetitive_params;
...
@@ -111,6 +111,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(use_anakin_);
   CP_MEMBER(anakin_max_batchsize_);
+  CP_MEMBER(anakin_max_input_shape_);
   // Ir related.
   CP_MEMBER(enable_ir_optim_);
@@ -355,8 +356,11 @@ void AnalysisConfig::SwitchIrDebug(int x) {
   ir_debug_ = x;
   Update();
 }
-void AnalysisConfig::EnableAnakinEngine(int max_batch_size) {
+void AnalysisConfig::EnableAnakinEngine(
+    int max_batch_size,
+    std::map<std::string, std::vector<int>> max_input_shape) {
   anakin_max_batchsize_ = max_batch_size;
+  anakin_max_input_shape_ = max_input_shape;
   use_anakin_ = true;
   Update();
 }
...
@@ -380,6 +380,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   if (config_.use_gpu() && config_.anakin_engine_enabled()) {
     argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
+    argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
     LOG(INFO) << "Anakin subgraph engine is enabled";
   }
@@ -835,3 +836,4 @@ USE_ANAKIN_CONVERTER(softmax);
 USE_ANAKIN_CONVERTER(detection_out);
 USE_ANAKIN_CONVERTER(density_prior_box);
+USE_ANAKIN_CONVERTER(scale);
@@ -145,7 +145,9 @@ struct AnalysisConfig {
   /**
    * \brief Turn on the usage of Anakin sub-graph engine.
    */
-  void EnableAnakinEngine(int max_batch_size = 1);
+  void EnableAnakinEngine(
+      int max_batch_size = 1,
+      std::map<std::string, std::vector<int>> max_input_shape = {});
   /** A boolean state indicating whether the Anakin sub-graph engine is used.
    */
@@ -271,6 +273,7 @@ struct AnalysisConfig {
   mutable std::unique_ptr<PassStrategy> pass_builder_;
   bool use_anakin_{false};
   int anakin_max_batchsize_;
+  std::map<std::string, std::vector<int>> anakin_max_input_shape_;
   std::map<std::string, std::string> engine_opt_info_;
 };
...
@@ -71,7 +71,11 @@ void GpuPassStrategy::EnableMKLDNN() {
 // The following passes works for Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
     "infer_clean_graph_pass",                   //
+    "simplify_anakin_detection_pattern_pass5",  //
+    "simplify_anakin_detection_pattern_pass4",  //
     "simplify_anakin_detection_pattern_pass3",  //
+    "simplify_anakin_detection_pattern_pass2",  //
+    "anakin_fillconstant_elementwisemul_fuse",  //
     "fc_fuse_pass",                             //
     "conv_elementwise_add_fuse_pass",           //
     "conv_bn_fuse_pass",                        //
...
@@ -97,6 +97,7 @@ class AnakinEngineOp : public framework::OperatorBase {
       if (param_names_.count(x)) continue;
       auto &t =
           inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      /*
       auto t_shape = framework::vectorize(t.dims());
       auto *anakin_input = engine->Net()->get_in(x);
       auto net_shape = anakin_input->shape();
@@ -112,20 +113,16 @@ class AnakinEngineOp : public framework::OperatorBase {
         t.mutable_data<float>(dev_place);
         TensorCopySync(temp_t, dev_place, &t);
       }
+      */
       inputs.insert({x, &t});
     }
     std::map<std::string, framework::LoDTensor *> outputs;
     int output_index = 0;
     for (const auto &y : Outputs("Ys")) {
-      // std::vector<int> ddim =
-      //     engine->Net()->get_out(output_maps[output_index])->valid_shape();
-      // we need get the output anakin output shape.
       auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      // fluid_t->Resize(framework::make_ddim(ddim));
-      // fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
       outputs.insert({output_maps[output_index], fluid_t});
       output_index += 1;
     }
...