提交 f3a2e4b3 编写于 作者: N nhzlx

1. Add ANAKIN_ROOT compile option

2. refine trt code
test=develop
上级 4f4daa4b
......@@ -66,7 +66,6 @@ option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_ANAKIN_SUBGRAPH "Compile with Anakin subgraph library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
......@@ -192,6 +191,7 @@ include(configure) # add paddle env configuration
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(anakin_subgraph)
endif()
if(WITH_MKL OR WITH_MKLML)
include(external/anakin)
......
if(NOT WITH_GPU)
return()
endif()
set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
find_path(ANAKIN_INCLUDE_DIR anakin_config.h
PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include
NO_DEFAULT_PATH
)
find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
PATHS ${ANAKIN_ROOT}
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib
NO_DEFAULT_PATH
DOC "Path to ANAKIN library.")
if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
if(WITH_DSO)
set(ANAKIN_FOUND ON)
endif(WITH_DSO)
else()
set(ANAKIN_FOUND OFF)
endif()
if(ANAKIN_FOUND)
message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
include_directories(${ANAKIN_ROOT}/include)
include_directories(${ANAKIN_ROOT}/include/saber)
link_directories(${ANAKIN_ROOT})
add_definitions(-DPADDLE_WITH_ANAKIN)
endif()
......@@ -33,5 +33,6 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR})
link_directories(${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT)
endif()
......@@ -17,7 +17,7 @@ if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
if (WITH_ANAKIN_SUBGRAPH)
if (ANAKIN_FOUND)
add_subdirectory(anakin)
endif()
......
cc_library(anakin_engine SRCS engine.cc)
cc_library(anakin_engine SRCS engine.cc DEPS framework_proto)
cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
target_link_libraries(anakin_engine anakin anakin_saber_common)
cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
......
cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv)
cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter)
cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling)
cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split)
cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split)
cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op)
cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL)
cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax)
cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op)
cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op)
cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op)
cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op)
cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op)
cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor)
cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL)
cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling SERIAL)
cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split SERIAL)
cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split SERIAL)
cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op SERIAL)
cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL SERIAL)
cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax SERIAL)
cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op SERIAL)
cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op SERIAL)
cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL)
cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL)
cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL)
#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL)
......@@ -26,7 +26,7 @@ static void test_activation_op(const std::string &op_type) {
PADDLE_ENFORCE(converter != nullptr);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
......
......@@ -24,7 +24,7 @@ TEST(batch_norm_op, test) {
{"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
"batch_norm_variance"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
std::vector<int> param_shape{2};
validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5});
......
......@@ -24,7 +24,7 @@ namespace anakin {
TEST(concat_op, test) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
......@@ -47,7 +47,7 @@ TEST(concat_op, test) {
TEST(concat_op, test2) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("concat_x1", {1, 4});
validator.DeclInputVar("concat_x2", {3, 4});
validator.DeclInputVar("concat_x3", {2, 4});
......
......@@ -27,7 +27,7 @@ TEST(conv2d_op, test) {
ASSERT_TRUE(conv2d_converter != nullptr);
std::unordered_set<std::string> parameters({"conv2d-Y"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});
......
......@@ -24,7 +24,7 @@ namespace anakin {
TEST(dropout_op, native) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
validator.DeclOutputVar("mask", {1, 1, 2, 2});
......
......@@ -24,7 +24,7 @@ namespace anakin {
static void test_elementwise_op(const std::string &op_type) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclInputVar("y", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
......
......@@ -26,7 +26,7 @@ TEST(fc_op, test) {
std::unordered_set<std::string> parameters({"mul_y"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("mul_x", {1, 1, 2, 2});
validator.DeclParamVar("mul_y", {4, 2});
validator.DeclOutputVar("mul_out", {1, 2});
......
......@@ -26,7 +26,7 @@ TEST(flatten_op, test) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
framework::OpDesc desc;
......
......@@ -24,7 +24,7 @@ namespace anakin {
TEST(im2sequence_op, native) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
std::vector<int> kernels = {6, 1};
std::vector<int> strides = {1, 1};
......
......@@ -27,7 +27,7 @@ void test_pool2d(bool global_pooling, bool ceil_mode,
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
......@@ -72,7 +72,7 @@ void test_pool2d2(bool global_pooling, bool ceil_mode,
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
......
......@@ -26,7 +26,7 @@ static void test_activation_op(const std::string &op_type) {
PADDLE_ENFORCE(converter != nullptr);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
......
......@@ -25,7 +25,7 @@ TEST(reshape, test) {
ASSERT_TRUE(converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
// validator.DeclInputVar("reshape-X", {2, 3, 3, 1});
// validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3});
......@@ -48,7 +48,7 @@ TEST(reshape, test) {
TEST(reshape, test2) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("reshape-X", {1, 2, 4});
validator.DeclOutputVar("reshape-Out", {1, 4, 2});
......
......@@ -25,10 +25,10 @@ TEST(softmax, test) {
ASSERT_TRUE(converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("softmax-X", {1, 10});
validator.DeclOutputVar("softmax-Out", {1, 10});
validator.DeclInputVar("softmax-X", {1, 10, 2});
validator.DeclOutputVar("softmax-Out", {1, 10, 2});
framework::OpDesc desc;
desc.SetType("softmax");
......
......@@ -26,7 +26,7 @@ void AnakinSliceTest(const std::vector<int> &in_shape,
const std::vector<int> &sections) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("split_input", in_shape);
std::vector<std::string> output_vars;
......
......@@ -25,7 +25,7 @@ namespace anakin {
TEST(sum, native) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("sum_x1", {1, 2, 1, 2});
validator.DeclInputVar("sum_x2", {1, 2, 1, 2});
validator.DeclOutputVar("sum_out", {1, 2, 1, 2});
......
......@@ -25,7 +25,7 @@ TEST(transpose_op, test) {
ASSERT_TRUE(converter != nullptr);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("transpose-X", {2, 3, 4, 5});
validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3});
......@@ -47,7 +47,7 @@ TEST(transpose_op, test) {
TEST(transpose_op, test2) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, scope);
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("transpose-X", {3, 4, 5});
validator.DeclOutputVar("transpose-Out", {3, 5, 4});
......
......@@ -84,7 +84,7 @@ class AnakinConvertValidation {
AnakinConvertValidation() = delete;
AnakinConvertValidation(const std::unordered_set<std::string>& parameters,
framework::Scope& scope)
framework::Scope* scope)
: parameters_(parameters), scope_(scope), place_(0) {
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
......@@ -108,7 +108,7 @@ class AnakinConvertValidation {
void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDADeviceContext ctx(place_);
auto* x = scope_.Var(name);
auto* x = scope_->Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place_, ctx);
......@@ -120,13 +120,13 @@ class AnakinConvertValidation {
// should init anakin engine here.
Singleton<AnakinOpConverter>::Global().ConvertOp(
desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
desc, parameters_, *scope_, engine_.get(), true /*test_mode*/);
engine_->Freeze();
std::map<std::string, std::vector<int>> temp_max_input_shape;
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(scope_,
auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(*scope_,
input);
auto t_shape = framework::vectorize2int(t.dims());
while (t_shape.size() < 4) {
......@@ -147,14 +147,14 @@ class AnakinConvertValidation {
std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op
platform::CUDADeviceContext ctx(place_);
op_->Run(scope_, place_);
op_->Run(*scope_, place_);
// std::vector<framework::LoDTensor> input_vector;
// std::vector<framework::LoDTensor> output_vector;
std::map<std::string, framework::LoDTensor*> inputs;
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input);
auto* var = scope_->FindVar(input);
auto tensor = var->GetMutable<framework::LoDTensor>();
inputs.insert({input, tensor});
}
......@@ -164,7 +164,7 @@ class AnakinConvertValidation {
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
std::vector<float> fluid_out;
auto* var = scope_.FindVar(output);
auto* var = scope_->FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &fluid_out);
fluid_outputs.push_back(fluid_out);
......@@ -177,7 +177,7 @@ class AnakinConvertValidation {
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
std::vector<float> anakin_out;
auto* var = scope_.FindVar(output);
auto* var = scope_->FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &anakin_out);
......@@ -189,15 +189,13 @@ class AnakinConvertValidation {
}
}
framework::Scope& scope() { return scope_; }
private:
std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_;
std::unique_ptr<framework::OpDesc> op_desc_;
const std::unordered_set<std::string>& parameters_;
framework::Scope& scope_;
framework::Scope* scope_;
platform::CUDAPlace place_;
};
......
......@@ -97,7 +97,11 @@ void IRPassManager::CreatePasses(Argument *argument,
bool use_static_engine = argument->tensorrt_use_static_engine();
bool model_from_memory = argument->model_from_memory();
if ((!model_from_memory && use_static_engine)) {
bool int8_valid = !(model_from_memory && enable_int8);
PADDLE_ENFORCE(int8_valid,
"TRT INT8 Now don't support model load from memory.");
if ((!model_from_memory && use_static_engine) || enable_int8) {
std::string model_opt_cache_dir =
argument->Has("model_dir")
? argument->model_dir()
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
cc_library(subgraph_detector SRCS subgraph_detector.cc subgraph_util.cc DEPS proto_desc)
if(WITH_TESTING)
add_dependencies(subgraph_detector gtest)
endif()
......@@ -15,7 +15,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif()
if (WITH_ANAKIN_SUBGRAPH)
if (ANAKIN_FOUND)
cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller)
set(analysis_deps ${analysis_deps}
......
......@@ -35,9 +35,6 @@ namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractAnakinParameters(
const std::unordered_set<Node *> &nodes);
std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get());
......@@ -51,11 +48,10 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
fuser();
std::vector<std::string> graph_param_names =
ExtractAnakinParameters(graph->Nodes());
ExtractParameters(graph->Nodes());
// those parameter already exist in anakin, and should not have another copy
// in
// fluid.
// in fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
......@@ -157,74 +153,13 @@ void AnakinSubgraphPass::CreateAnakinOp(
op_desc->SetType("anakin_engine");
std::unordered_map<std::string, std::string> output_name_map;
auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
// Why we do this?
// During the transition from fluid OP to anakin OP, we map
// the input and output Tensor(fluid data structure) of fluid OP
// to the corresponding ITensor (trt data structure) through the
// Tensor name. When we set up ITensor for an variable, we must
// ensure that it has not been set before.
// If there is variable in the fluid graph, which is not only the
// input of a OP, but also the output of a Op, there will be problems.
// So we have to rename the variable in the subgraph to make sure
// it is either an OP's input or an OP's output.
auto &subgraph_nodes = *Agent(node).subgraph();
for (size_t index = 0; index < block_desc.OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id.count(arg_value_with_id)) {
output_name_map[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map,
false);
// When anakin engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor
......@@ -249,8 +184,7 @@ void AnakinSubgraphPass::CreateAnakinOp(
SetAttr(op_desc->Proto(), "subgraph",
block_desc.Proto()->SerializeAsString());
// Set attrs
SetAttr(op_desc->Proto(), "parameters",
ExtractAnakinParameters(graph->Nodes()));
SetAttr(op_desc->Proto(), "parameters", params);
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
int predictor_id = Get<int>("predictor_id");
auto engine_key = GenerateAnakinEngineKey(
......@@ -277,34 +211,6 @@ void AnakinSubgraphPass::CreateAnakinOp(
param_set, output_mapping, anakin_engine);
}
std::vector<std::string> ExtractAnakinParameters(
const std::unordered_set<Node *> &nodes) {
// We can judge whether a variable is a parameter by
// its presistable property, but sometimes the presistable
// of the feed op output is true, so we have to identify it.
std::vector<std::string> feed_outputs;
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
}
}
std::vector<std::string> parameters;
for (const auto &node : nodes) {
if (!node->IsVar()) continue;
if (node->Var()->Persistable() &&
std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
feed_outputs.end()) {
parameters.push_back(node->Name());
}
}
return parameters;
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......
......@@ -19,6 +19,7 @@
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
using anakin::Precision;
using anakin::saber::NV;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file defines the the class to partition a graph.
*/
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
#include <algorithm>
#include <string>
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes) {
// We can judge whether a variable is a parameter by
// its presistable property, but sometimes the presistable
// of the feed op output is true, so we have to identify it.
std::vector<std::string> feed_outputs;
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
}
}
std::vector<std::string> parameters;
for (const auto &node : nodes) {
if (!node->IsVar()) continue;
if (node->Var()->Persistable() &&
std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
feed_outputs.end()) {
parameters.push_back(node->Name());
}
}
return parameters;
}
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map,
bool is_trt) {
//// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv
// into one conv, and then trigger bug. So, We should use strategy to avoid
// this optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
same_hierarchy_conv2d_num_map;
for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
in_vars[in_var->Name()] = in_var;
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
if (op_desc.Type() == "conv2d" && is_trt) {
auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front();
auto out_var_name = op_desc.Output("Output").front();
auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
(*output_names_with_id)
.insert(out_var_name + std::to_string(var2id[out_var_name]));
(*output_names).insert(out_var_name);
} else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
paddings[1] == 0) {
same_hierarchy_conv2d_num_map[input_var_name] += 1;
}
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file defines the the class to partition a graph.
*/
#pragma once
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes);
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map,
bool is_trt = true);
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -31,17 +31,6 @@ namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes);
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
......@@ -217,7 +206,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// Get "" when there is no cached calibration table data.
bool load_from_memory = Get<bool>("model_from_memory");
std::string calibration_data = "";
if (!load_from_memory && use_static_engine) {
if (enable_int8) {
calibration_data = GetTrtCalibTableData(
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
}
......@@ -226,13 +215,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key);
std::string trt_engine_serialized_data = "";
if (load_from_memory) {
std::map<std::string, std::string> engine_opt_info =
Get<std::map<std::string, std::string>>("engine_opt_info");
if (engine_opt_info.count(engine_key)) {
trt_engine_serialized_data = engine_opt_info[engine_key];
}
}
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
......@@ -240,27 +223,41 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
if (enable_int8 && calibration_data.size() != 0) {
calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
}
// When in int8 mode and calibration_mode, the program just produce the
// calibration table data.
bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
if (!calibration_mode && use_static_engine &&
trt_engine_serialized_data.empty()) {
if (calibration_mode) {
// calibraion mode means generate int8 calibration table data process.
return;
}
std::copy(params.begin(), params.end(),
std::back_inserter(*repetitive_params));
bool need_serialize = (use_static_engine && !load_from_memory);
if (use_static_engine && !load_from_memory) {
if (need_serialize) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
// we can load the engine info serialized before from the disk.
if (!trt_engine_serialized_data.empty()) {
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
return;
}
}
if (trt_engine_serialized_data.empty()) {
// the following code will NOT run in following situation:
// 1. calibraion mode (generate trt int8 calibraiton table data)
// 2. already load serialized trt engine info.
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
new tensorrt::TensorRTEngine(
Get<int>("max_batch_size"), Get<int>("workspace_size"),
enable_int8, calibrator.get(), Get<int>("gpu_device_id")));
Get<int>("max_batch_size"), Get<int>("workspace_size"), enable_int8,
calibrator.get(), Get<int>("gpu_device_id")));
auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
std::unordered_set<std::string> param_set(params.begin(), params.end());
......@@ -274,142 +271,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
if (use_static_engine && !load_from_memory) {
if (need_serialize) {
SaveTrtEngineSerializedDataToFile(
GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
engine_key),
trt_engine_serialized_data);
}
} else {
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
}
}
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes) {
// We can judge whether a variable is a parameter by
// its presistable property, but sometimes the presistable
// of the feed op output is true, so we have to identify it.
std::vector<std::string> feed_outputs;
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
}
}
std::vector<std::string> parameters;
for (const auto &node : nodes) {
if (!node->IsVar()) continue;
if (node->Var()->Persistable() &&
std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
feed_outputs.end()) {
parameters.push_back(node->Name());
}
}
return parameters;
}
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map) {
//// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv
// into one conv, and then trigger bug. So, We should use strategy to avoid
// this optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
same_hierarchy_conv2d_num_map;
for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
in_vars[in_var->Name()] = in_var;
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
if (op_desc.Type() == "conv2d") {
auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front();
auto out_var_name = op_desc.Output("Output").front();
auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
(*output_names_with_id)
.insert(out_var_name + std::to_string(var2id[out_var_name]));
(*output_names).insert(out_var_name);
} else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
paddings[1] == 0) {
same_hierarchy_conv2d_num_map[input_var_name] += 1;
}
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
}
} // namespace analysis
......
......@@ -20,6 +20,7 @@
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
namespace paddle {
namespace inference {
......
......@@ -27,7 +27,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
endif()
if (WITH_ANAKIN_SUBGRAPH)
if (ANAKIN_FOUND)
set(inference_deps ${inference_deps} anakin_op_converter anakin_engine)
endif()
......
......@@ -40,10 +40,11 @@
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#endif
#if PADDLE_WITH_ANAKIN
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#endif
DECLARE_bool(profile);
......@@ -817,6 +818,7 @@ USE_TRT_CONVERTER(conv2d_transpose);
USE_TRT_CONVERTER(leaky_relu);
#endif
#if PADDLE_WITH_ANAKIN
USE_ANAKIN_CONVERTER(mul);
USE_ANAKIN_CONVERTER(fc);
USE_ANAKIN_CONVERTER(conv2d);
......@@ -838,3 +840,4 @@ USE_ANAKIN_CONVERTER(detection_out);
USE_ANAKIN_CONVERTER(density_prior_box);
USE_ANAKIN_CONVERTER(dropout);
USE_ANAKIN_CONVERTER(sum);
#endif
......@@ -81,6 +81,8 @@ PaddleDType ZeroCopyTensor::type() {
return PaddleDType::FLOAT32;
} else if (type == framework::proto::VarType::INT64) {
return PaddleDType::INT64;
} else if (type == framework::proto::VarType::INT32) {
return PaddleDType::INT32;
} else {
LOG(ERROR) << "unknown type, only support float32 and int64 now.";
}
......
......@@ -34,7 +34,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
if (WITH_ANAKIN_SUBGRAPH)
if (ANAKIN_FOUND)
add_subdirectory(anakin)
endif()
......
......@@ -39,8 +39,7 @@ class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker {
class AnakinEngineInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {}
void operator()(framework::InferVarTypeContext *ctx) const override {}
};
} // namespace operators
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册