From f3a2e4b3d8df8bc699caa04077112338d5f11ed9 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 22 Mar 2019 07:01:35 +0000 Subject: [PATCH] 1. Add ANAKIN_ROOT compile option 2. refine trt code test=develop --- CMakeLists.txt | 2 +- cmake/anakin_subgraph.cmake | 32 +++ cmake/tensorrt.cmake | 1 + paddle/fluid/inference/CMakeLists.txt | 2 +- paddle/fluid/inference/anakin/CMakeLists.txt | 2 +- .../inference/anakin/convert/CMakeLists.txt | 32 +-- .../anakin/convert/test_activation_op.cc | 2 +- .../anakin/convert/test_batch_norm_op.cc | 2 +- .../anakin/convert/test_concat_op.cc | 4 +- .../anakin/convert/test_conv2d_op.cc | 2 +- .../anakin/convert/test_dropout_op.cc | 2 +- .../anakin/convert/test_elementwise_op.cc | 2 +- .../inference/anakin/convert/test_fc_op.cc | 2 +- .../anakin/convert/test_flatten_op.cc | 2 +- .../anakin/convert/test_im2sequence_op.cc | 2 +- .../anakin/convert/test_pool2d_op.cc | 4 +- .../inference/anakin/convert/test_relu_op.cc | 2 +- .../anakin/convert/test_reshape_op.cc | 4 +- .../anakin/convert/test_softmax_op.cc | 6 +- .../inference/anakin/convert/test_split_op.cc | 2 +- .../inference/anakin/convert/test_sum_op.cc | 2 +- .../anakin/convert/test_transpose_op.cc | 4 +- .../inference/anakin/convert/ut_helper.h | 20 +- .../inference/analysis/ir_pass_manager.cc | 6 +- .../analysis/ir_passes/CMakeLists.txt | 4 +- .../ir_passes/anakin_subgraph_pass.cc | 108 +-------- .../analysis/ir_passes/anakin_subgraph_pass.h | 1 + .../analysis/ir_passes/subgraph_util.cc | 152 ++++++++++++ .../analysis/ir_passes/subgraph_util.h | 49 ++++ .../ir_passes/tensorrt_subgraph_pass.cc | 225 ++++-------------- .../ir_passes/tensorrt_subgraph_pass.h | 1 + paddle/fluid/inference/api/CMakeLists.txt | 2 +- .../fluid/inference/api/analysis_predictor.cc | 5 +- .../inference/api/details/zero_copy_tensor.cc | 2 + paddle/fluid/operators/CMakeLists.txt | 2 +- .../operators/anakin/anakin_engine_op.cc | 3 +- 36 files changed, 356 insertions(+), 339 deletions(-) create mode 100644 cmake/anakin_subgraph.cmake create mode 100644 paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc create mode 100644 paddle/fluid/inference/analysis/ir_passes/subgraph_util.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d515c6e..a38e32b73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,7 +66,6 @@ option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. option(WITH_ANAKIN "Compile with Anakin library" OFF) -option(WITH_ANAKIN_SUBGRAPH "Compile with Anakin subgraph library" OFF) option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF) option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. 
ignored when WITH_ANAKIN=OFF" ON) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) @@ -192,6 +191,7 @@ include(configure) # add paddle env configuration if(WITH_GPU) include(cuda) include(tensorrt) + include(anakin_subgraph) endif() if(WITH_MKL OR WITH_MKLML) include(external/anakin) diff --git a/cmake/anakin_subgraph.cmake b/cmake/anakin_subgraph.cmake new file mode 100644 index 000000000..4a7d32a63 --- /dev/null +++ b/cmake/anakin_subgraph.cmake @@ -0,0 +1,32 @@ +if(NOT WITH_GPU) + return() +endif() + +set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT") +find_path(ANAKIN_INCLUDE_DIR anakin_config.h + PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include + $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include + NO_DEFAULT_PATH +) + +find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so + PATHS ${ANAKIN_ROOT} + $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib + NO_DEFAULT_PATH + DOC "Path to ANAKIN library.") + +if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) + if(WITH_DSO) + set(ANAKIN_FOUND ON) + endif(WITH_DSO) +else() + set(ANAKIN_FOUND OFF) +endif() + +if(ANAKIN_FOUND) + message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ") + include_directories(${ANAKIN_ROOT}/include) + include_directories(${ANAKIN_ROOT}/include/saber) + link_directories(${ANAKIN_ROOT}) + add_definitions(-DPADDLE_WITH_ANAKIN) +endif() diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 891ff2226..3bf12094e 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -33,5 +33,6 @@ if(TENSORRT_FOUND) message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") include_directories(${TENSORRT_INCLUDE_DIR}) + link_directories(${TENSORRT_LIBRARY}) add_definitions(-DPADDLE_WITH_TENSORRT) endif() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 5ee12aab9..4cd29486a 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -17,7 +17,7 @@ if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (WITH_ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) add_subdirectory(anakin) endif() diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt index 1646c7d16..e8fb56590 100644 --- a/paddle/fluid/inference/anakin/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(anakin_engine SRCS engine.cc) +cc_library(anakin_engine SRCS engine.cc DEPS framework_proto) cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto) target_link_libraries(anakin_engine anakin anakin_saber_common) cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine) diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt index 9cfe6671f..1e7f5ac79 100644 --- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -1,19 +1,19 @@ cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry) -cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op) -cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col 
depthwise_conv) -cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter) -cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling) -cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split) -cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split) -cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op) -cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL) -cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax) -cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op) -cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op) -cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op) -cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op) -cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op) -cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col) -cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor) +cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL) +cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL) +cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL) +cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling SERIAL) +cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split SERIAL) +cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split SERIAL) +cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op SERIAL) +cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL) +cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax SERIAL) +cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op SERIAL) +cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op SERIAL) +cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL) +cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL) +cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL) +#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col) +cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL) diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc b/paddle/fluid/inference/anakin/convert/test_activation_op.cc index 6a81ec54e..8bedd4a74 100644 --- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc @@ -26,7 +26,7 @@ static void test_activation_op(const std::string &op_type) { PADDLE_ENFORCE(converter != nullptr); std::unordered_set
parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("act-X", {10, 6, 1, 1}); validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); framework::OpDesc desc; diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc index 2e438dd72..2832e1c8d 100644 --- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc @@ -24,7 +24,7 @@ TEST(batch_norm_op, test) { {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean", "batch_norm_variance"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); std::vector param_shape{2}; validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5}); diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc index 8c6949cb2..ecf44def5 100644 --- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc @@ -24,7 +24,7 @@ namespace anakin { TEST(concat_op, test) { std::unordered_set parameters({""}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("concat_x1", {1, 2, 1, 1}); validator.DeclInputVar("concat_x2", {1, 3, 1, 1}); validator.DeclInputVar("concat_x3", {1, 1, 1, 1}); @@ -47,7 +47,7 @@ TEST(concat_op, test) { TEST(concat_op, test2) { std::unordered_set parameters({""}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("concat_x1", {1, 4}); validator.DeclInputVar("concat_x2", {3, 4}); validator.DeclInputVar("concat_x3", {2, 4}); diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc index 76f11c7b9..6d93e50bc 100644 --- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc @@ -27,7 +27,7 @@ TEST(conv2d_op, test) { ASSERT_TRUE(conv2d_converter != nullptr); std::unordered_set parameters({"conv2d-Y"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("conv2d-X", {1, 3, 3, 3}); validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1}); validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3}); diff --git a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc index ded279bf4..b2de5ae0a 100644 --- a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc @@ -24,7 +24,7 @@ namespace anakin { TEST(dropout_op, native) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("x", {1, 1, 2, 2}); validator.DeclOutputVar("out", {1, 1, 2, 2}); validator.DeclOutputVar("mask", {1, 1, 2, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc index eec8b96f7..3a437f5fd 100644 --- 
a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc @@ -24,7 +24,7 @@ namespace anakin { static void test_elementwise_op(const std::string &op_type) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("x", {1, 1, 2, 2}); validator.DeclInputVar("y", {1, 1, 2, 2}); validator.DeclOutputVar("out", {1, 1, 2, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc index c72974cd5..87bce3640 100644 --- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc @@ -26,7 +26,7 @@ TEST(fc_op, test) { std::unordered_set parameters({"mul_y"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("mul_x", {1, 1, 2, 2}); validator.DeclParamVar("mul_y", {4, 2}); validator.DeclOutputVar("mul_out", {1, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc index 381fab786..d13281f11 100644 --- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc @@ -26,7 +26,7 @@ TEST(flatten_op, test) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("flatten-X", {3, 10, 10, 4}); validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1}); framework::OpDesc desc; diff --git a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc index 2f6c48629..5e5764633 100644 --- a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc @@ -24,7 +24,7 @@ namespace anakin { TEST(im2sequence_op, native) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); std::vector kernels = {6, 1}; std::vector strides = {1, 1}; diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc index 95cb41949..1ac019467 100644 --- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc @@ -27,7 +27,7 @@ void test_pool2d(bool global_pooling, bool ceil_mode, framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. @@ -72,7 +72,7 @@ void test_pool2d2(bool global_pooling, bool ceil_mode, framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. 
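[Reviewer note] All of the converter tests in this patch share one harness and differ only in the op they exercise; the mechanical change above is that `AnakinConvertValidation` now takes the scope by pointer instead of by non-const reference. As orientation, here is a minimal sketch of the pattern (the op and variable names mirror the relu test below; `SetOp`/`Execute` are used as declared in `ut_helper.h` later in this patch, so treat exact signatures as assumptions):

```cpp
#include <gtest/gtest.h>
#include <string>
#include <unordered_set>
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"

namespace paddle {
namespace inference {
namespace anakin {

TEST(converter_harness, sketch) {
  std::unordered_set<std::string> parameters;  // persistable weights, if any
  framework::Scope scope;
  // The scope is now passed by pointer (the change made throughout above).
  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("act-X", {10, 6, 1, 1});
  validator.DeclOutputVar("act-Out", {10, 6, 1, 1});

  framework::OpDesc desc;
  desc.SetType("relu");
  desc.SetInput("X", {"act-X"});
  desc.SetOutput("Out", {"act-Out"});

  validator.SetOp(*desc.Proto());  // convert the Fluid op into the engine
  validator.Execute(10);           // run Fluid and Anakin, compare outputs
}

}  // namespace anakin
}  // namespace inference
}  // namespace paddle
```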
diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc index 1695c0dbf..04e624518 100644 --- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc @@ -26,7 +26,7 @@ static void test_activation_op(const std::string &op_type) { PADDLE_ENFORCE(converter != nullptr); std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("act-X", {10, 6, 1, 1}); validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); framework::OpDesc desc; diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc index a60544e65..306ebf510 100644 --- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc @@ -25,7 +25,7 @@ TEST(reshape, test) { ASSERT_TRUE(converter); framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); // validator.DeclInputVar("reshape-X", {2, 3, 3, 1}); // validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3}); @@ -48,7 +48,7 @@ TEST(reshape, test) { TEST(reshape, test2) { framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("reshape-X", {1, 2, 4}); validator.DeclOutputVar("reshape-Out", {1, 4, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc index 933e3eb2b..8c14fae0a 100644 --- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc @@ -25,10 +25,10 @@ TEST(softmax, test) { ASSERT_TRUE(converter); framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); - validator.DeclInputVar("softmax-X", {1, 10}); - validator.DeclOutputVar("softmax-Out", {1, 10}); + validator.DeclInputVar("softmax-X", {1, 10, 2}); + validator.DeclOutputVar("softmax-Out", {1, 10, 2}); framework::OpDesc desc; desc.SetType("softmax"); diff --git a/paddle/fluid/inference/anakin/convert/test_split_op.cc b/paddle/fluid/inference/anakin/convert/test_split_op.cc index e5a782329..aa61c01a5 100644 --- a/paddle/fluid/inference/anakin/convert/test_split_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc @@ -26,7 +26,7 @@ void AnakinSliceTest(const std::vector &in_shape, const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("split_input", in_shape); std::vector output_vars; diff --git a/paddle/fluid/inference/anakin/convert/test_sum_op.cc b/paddle/fluid/inference/anakin/convert/test_sum_op.cc index a93539303..d6a59a016 100644 --- a/paddle/fluid/inference/anakin/convert/test_sum_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_sum_op.cc @@ -25,7 +25,7 @@ namespace anakin { TEST(sum, native) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); 
validator.DeclInputVar("sum_x1", {1, 2, 1, 2}); validator.DeclInputVar("sum_x2", {1, 2, 1, 2}); validator.DeclOutputVar("sum_out", {1, 2, 1, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc index 67c9c23f1..016ed26f0 100644 --- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc @@ -25,7 +25,7 @@ TEST(transpose_op, test) { ASSERT_TRUE(converter != nullptr); std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("transpose-X", {2, 3, 4, 5}); validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3}); @@ -47,7 +47,7 @@ TEST(transpose_op, test) { TEST(transpose_op, test2) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); + AnakinConvertValidation validator(parameters, &scope); validator.DeclInputVar("transpose-X", {3, 4, 5}); validator.DeclOutputVar("transpose-Out", {3, 5, 4}); diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index d62d11d25..e0371d953 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -84,7 +84,7 @@ class AnakinConvertValidation { AnakinConvertValidation() = delete; AnakinConvertValidation(const std::unordered_set& parameters, - framework::Scope& scope) + framework::Scope* scope) : parameters_(parameters), scope_(scope), place_(0) { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); engine_.reset(new AnakinEngine(true)); @@ -108,7 +108,7 @@ class AnakinConvertValidation { void DeclVar(const std::string& name, const std::vector dim_vec) { platform::CUDADeviceContext ctx(place_); - auto* x = scope_.Var(name); + auto* x = scope_->Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); RandomizeTensor(x_tensor, place_, ctx); @@ -120,13 +120,13 @@ class AnakinConvertValidation { // should init anakin engine here. 
Singleton::Global().ConvertOp( - desc, parameters_, scope_, engine_.get(), true /*test_mode*/); + desc, parameters_, *scope_, engine_.get(), true /*test_mode*/); engine_->Freeze(); std::map> temp_max_input_shape; for (const auto& input : op_desc_->InputArgumentNames()) { if (parameters_.count(input)) continue; - auto& t = inference::analysis::GetFromScope(scope_, + auto& t = inference::analysis::GetFromScope(*scope_, input); auto t_shape = framework::vectorize2int(t.dims()); while (t_shape.size() < 4) { @@ -147,14 +147,14 @@ class AnakinConvertValidation { std::unordered_set neglected_output = {}) { // Execute Fluid Op platform::CUDADeviceContext ctx(place_); - op_->Run(scope_, place_); + op_->Run(*scope_, place_); // std::vector input_vector; // std::vector output_vector; std::map inputs; for (const auto& input : op_desc_->InputArgumentNames()) { if (parameters_.count(input)) continue; - auto* var = scope_.FindVar(input); + auto* var = scope_->FindVar(input); auto tensor = var->GetMutable(); inputs.insert({input, tensor}); } @@ -164,7 +164,7 @@ class AnakinConvertValidation { for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; std::vector fluid_out; - auto* var = scope_.FindVar(output); + auto* var = scope_->FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); fluid_outputs.push_back(fluid_out); @@ -177,7 +177,7 @@ class AnakinConvertValidation { for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; std::vector anakin_out; - auto* var = scope_.FindVar(output); + auto* var = scope_->FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &anakin_out); @@ -189,15 +189,13 @@ class AnakinConvertValidation { } } - framework::Scope& scope() { return scope_; } - private: std::unique_ptr engine_{nullptr}; cudaStream_t stream_; std::unique_ptr op_; std::unique_ptr op_desc_; const std::unordered_set& parameters_; - framework::Scope& scope_; + framework::Scope* scope_; platform::CUDAPlace place_; }; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b0e07fdf1..882e1d93b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -97,7 +97,11 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); - if ((!model_from_memory && use_static_engine)) { + bool int8_valid = !(model_from_memory && enable_int8); + PADDLE_ENFORCE(int8_valid, + "TRT INT8 does not yet support loading the model from memory."); + + if ((!model_from_memory && use_static_engine) || enable_int8) { std::string model_opt_cache_dir = argument->Has("model_dir") ?
argument->model_dir() diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index bd9f08d84..05a3d7ddf 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +cc_library(subgraph_detector SRCS subgraph_detector.cc subgraph_util.cc DEPS proto_desc) if(WITH_TESTING) add_dependencies(subgraph_detector gtest) endif() @@ -15,7 +15,7 @@ if (WITH_GPU AND TENSORRT_FOUND) set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") endif() -if (WITH_ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller) set(analysis_deps ${analysis_deps} diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index 0cd1d327e..12deed253 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -35,9 +35,6 @@ namespace analysis { using framework::ir::Node; -std::vector ExtractAnakinParameters( - const std::unordered_set &nodes); - std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get()); @@ -51,11 +48,10 @@ std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( fuser(); std::vector graph_param_names = - ExtractAnakinParameters(graph->Nodes()); + ExtractParameters(graph->Nodes()); // those parameter already exist in anakin, and should not have another copy - // in - // fluid. + // in fluid. std::vector repetitive_params; for (auto *node : graph->Nodes()) { @@ -157,74 +153,13 @@ void AnakinSubgraphPass::CreateAnakinOp( op_desc->SetType("anakin_engine"); std::unordered_map output_name_map; + auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate // variables and the output variables of the subgraph. - // Why we do this? - // During the transition from fluid OP to anakin OP, we map - // the input and output Tensor(fluid data structure) of fluid OP - // to the corresponding ITensor (trt data structure) through the - // Tensor name. When we set up ITensor for an variable, we must - // ensure that it has not been set before. - // If there is variable in the fluid graph, which is not only the - // input of a OP, but also the output of a Op, there will be problems. - // So we have to rename the variable in the subgraph to make sure - // it is either an OP's input or an OP's output. 
- - auto &subgraph_nodes = *Agent(node).subgraph(); - for (size_t index = 0; index < block_desc.OpSize(); ++index) { - framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); - - std::unordered_map var2id; - for (auto *in_var : correspond_node->inputs) { - var2id[in_var->Name()] = in_var->id(); - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - // one input - auto *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outputs) { - var2id[out_var->Name()] = out_var->id(); - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id.count(arg_value_with_id)) { - output_name_map[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } - } + RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, + &output_names_with_id, &output_names, &output_name_map, + false); // When anakin engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -249,8 +184,7 @@ void AnakinSubgraphPass::CreateAnakinOp( SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); // Set attrs - SetAttr(op_desc->Proto(), "parameters", - ExtractAnakinParameters(graph->Nodes())); + SetAttr(op_desc->Proto(), "parameters", params); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); int predictor_id = Get("predictor_id"); auto engine_key = GenerateAnakinEngineKey( @@ -277,34 +211,6 @@ void AnakinSubgraphPass::CreateAnakinOp( param_set, output_mapping, anakin_engine); } -std::vector ExtractAnakinParameters( - const std::unordered_set &nodes) { - // We can judge whether a variable is a parameter by - // its presistable property, but sometimes the presistable - // of the feed op output is true, so we have to identify it. 
- std::vector feed_outputs; - for (const auto &node : nodes) { - if (!node->IsOp()) continue; - std::string op_type = node->Op()->Type(); - if (op_type == "feed" || op_type == "fetch") { - std::vector output_names = node->Op()->OutputArgumentNames(); - std::copy(output_names.begin(), output_names.end(), - std::back_inserter(feed_outputs)); - } - } - - std::vector parameters; - for (const auto &node : nodes) { - if (!node->IsVar()) continue; - if (node->Var()->Persistable() && - std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == - feed_outputs.end()) { - parameters.push_back(node->Name()); - } - } - return parameters; -} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h index a732cb559..c13b9ecda 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h @@ -19,6 +19,7 @@ #include #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" using anakin::Precision; using anakin::saber::NV; diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc new file mode 100644 index 000000000..a17ee1b70 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the utilities used to partition a graph. + */ + +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include +#include + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes) { + // We can judge whether a variable is a parameter by + // its persistable property, but sometimes the persistable + // flag of a feed op's output is also true, so we have to filter those out.
+ std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed" || op_type == "fetch") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + + std::vector parameters; + for (const auto &node : nodes) { + if (!node->IsVar()) continue; + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { + parameters.push_back(node->Name()); + } + } + return parameters; +} + +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map, + bool is_trt) { + // Paddle-TRT currently has a bug when running GoogleNet: when more than two + // 1 * 1 convolutions share the same input, paddle-tensorrt performs a merging + // optimization that fuses those convs into one conv, which triggers the bug. + // So we use a strategy to avoid this optimization for the time being. This + // bug will be fixed in the future. + std::unordered_map + same_hierarchy_conv2d_num_map; + + for (size_t index = 0; index < block_desc->OpSize(); ++index) { + framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); + framework::OpDesc op_desc(*op, nullptr); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + + std::unordered_map var2id; + std::unordered_map in_vars; + for (auto *in_var : correspond_node->inputs) { + var2id[in_var->Name()] = in_var->id(); + in_vars[in_var->Name()] = in_var; + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + // one input + auto *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments + std::string arg_value = in_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value_with_id); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outputs) { + var2id[out_var->Name()] = out_var->id(); + } + + if (op_desc.Type() == "conv2d" && is_trt) { + auto input_var_name = op_desc.Input("Input").front(); + auto filter_var_name = op_desc.Input("Filter").front(); + auto out_var_name = op_desc.Output("Output").front(); + auto filter_shape = in_vars[filter_var_name]->Var()->GetShape(); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + if (same_hierarchy_conv2d_num_map[input_var_name] > 0) { + (*output_names_with_id) + .insert(out_var_name + std::to_string(var2id[out_var_name])); + (*output_names).insert(out_var_name); + } else if (filter_shape[2] == 1 && filter_shape[3] == 1 && + strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 && + paddings[1] == 0) { + same_hierarchy_conv2d_num_map[input_var_name] += 1; + } + } + + // rename for the output variables of op inside subgraph + for (int i = 0; i <
op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id->count(arg_value_with_id)) { + (*output_name_map)[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h new file mode 100644 index 000000000..3cf21bf5f --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the utilities used to partition a graph. + */ + +#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes); + +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map, + bool is_trt = true); + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 00490fbb1..593994032 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -31,17 +31,6 @@ namespace analysis { using framework::ir::Node; -std::vector ExtractParameters( - const std::unordered_set &nodes); - -void RenameAndGetOutputs( - const std::vector &subgraph_nodes, - framework::BlockDesc *block_desc, - const std::set &input_names_with_id, - std::set *output_names_with_id, - std::set *output_names, - std::unordered_map *output_name_map); - std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); @@ -217,7 +206,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // Get "" when there is no cached calibration table data.
bool load_from_memory = Get("model_from_memory"); std::string calibration_data = ""; - if (!load_from_memory && use_static_engine) { + if (enable_int8) { calibration_data = GetTrtCalibTableData( Get("model_opt_cache_dir"), engine_key, enable_int8); } @@ -226,13 +215,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); std::string trt_engine_serialized_data = ""; - if (load_from_memory) { - std::map engine_opt_info = - Get>("engine_opt_info"); - if (engine_opt_info.count(engine_key)) { - trt_engine_serialized_data = engine_opt_info[engine_key]; - } - } + SetAttr(op_desc->Proto(), "engine_serialized_data", trt_engine_serialized_data); @@ -240,176 +223,62 @@ void TensorRtSubgraphPass::CreateTensorRTOp( if (enable_int8 && calibration_data.size() != 0) { calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); } - // When in int8 mode and calibration_mode, the program just produce the // calibration table data. bool calibration_mode = (enable_int8 && calibration_data.size() == 0); - if (!calibration_mode && use_static_engine && - trt_engine_serialized_data.empty()) { - std::copy(params.begin(), params.end(), - std::back_inserter(*repetitive_params)); - - if (use_static_engine && !load_from_memory) { - trt_engine_serialized_data = GetTrtEngineSerializedData( - Get("model_opt_cache_dir"), engine_key); - } + if (calibration_mode) { + // Calibration mode: this run only generates the INT8 calibration table data. + return; + } - if (trt_engine_serialized_data.empty()) { - LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " - "kernel etc). This process may cost a lot of time."; - std::unique_ptr trt_engine( - new tensorrt::TensorRTEngine( - Get("max_batch_size"), Get("workspace_size"), - enable_int8, calibrator.get(), Get("gpu_device_id"))); - auto *scope = param_scope(); - framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); - std::unordered_set param_set(params.begin(), params.end()); - inference::Singleton::Global() - .ConvertBlockToTRTEngine( - &block_desc_temp, *scope, - std::vector(input_names.begin(), input_names.end()), - param_set, output_mapping, trt_engine.get()); - nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); - trt_engine_serialized_data = - std::string((const char *)serialized_engine_data->data(), - serialized_engine_data->size()); - - if (use_static_engine && !load_from_memory) { - SaveTrtEngineSerializedDataToFile( - GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), - engine_key), - trt_engine_serialized_data); - } - } else { + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); + bool need_serialize = (use_static_engine && !load_from_memory); + + if (need_serialize) { + trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); + // Try to load engine info that was serialized to disk on a previous run.
+ if (!trt_engine_serialized_data.empty()) { + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); LOG(INFO) << "Load TRT Optimized Info from " << GetTrtEngineSerializedPath( Get("model_opt_cache_dir"), engine_key); - } - - SetAttr(op_desc->Proto(), "engine_serialized_data", - trt_engine_serialized_data); - } -} - -std::vector ExtractParameters( - const std::unordered_set &nodes) { - // We can judge whether a variable is a parameter by - // its presistable property, but sometimes the presistable - // of the feed op output is true, so we have to identify it. - std::vector feed_outputs; - for (const auto &node : nodes) { - if (!node->IsOp()) continue; - std::string op_type = node->Op()->Type(); - if (op_type == "feed" || op_type == "fetch") { - std::vector output_names = node->Op()->OutputArgumentNames(); - std::copy(output_names.begin(), output_names.end(), - std::back_inserter(feed_outputs)); + return; } } - std::vector parameters; - for (const auto &node : nodes) { - if (!node->IsVar()) continue; - if (node->Var()->Persistable() && - std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == - feed_outputs.end()) { - parameters.push_back(node->Name()); - } - } - return parameters; -} - -void RenameAndGetOutputs( - const std::vector &subgraph_nodes, - framework::BlockDesc *block_desc, - const std::set &input_names_with_id, - std::set *output_names_with_id, - std::set *output_names, - std::unordered_map *output_name_map) { - //// In the normal case, the paddle-trt exists bug when runing the googlenet. - // When there are more than two convolutions of 1 * 1 with the same input, the - // paddle-tensorrt will do the merging optimization, which fuse those conv - // into one conv, and then trigger bug. So, We should use strategy to avoid - // this optimization for the time being. This bug will be fixed in the future. 
- std::unordered_map - same_hierarchy_conv2d_num_map; - - for (size_t index = 0; index < block_desc->OpSize(); ++index) { - framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); - framework::OpDesc op_desc(*op, nullptr); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); - - std::unordered_map var2id; - std::unordered_map in_vars; - for (auto *in_var : correspond_node->inputs) { - var2id[in_var->Name()] = in_var->id(); - in_vars[in_var->Name()] = in_var; - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - // one input - auto *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outputs) { - var2id[out_var->Name()] = out_var->id(); - } - - if (op_desc.Type() == "conv2d") { - auto input_var_name = op_desc.Input("Input").front(); - auto filter_var_name = op_desc.Input("Filter").front(); - auto out_var_name = op_desc.Output("Output").front(); - auto filter_shape = in_vars[filter_var_name]->Var()->GetShape(); - const std::vector strides = - boost::get>(op_desc.GetAttr("strides")); - const std::vector paddings = - boost::get>(op_desc.GetAttr("paddings")); - if (same_hierarchy_conv2d_num_map[input_var_name] > 0) { - (*output_names_with_id) - .insert(out_var_name + std::to_string(var2id[out_var_name])); - (*output_names).insert(out_var_name); - } else if (filter_shape[2] == 1 && filter_shape[3] == 1 && - strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 && - paddings[1] == 0) { - same_hierarchy_conv2d_num_map[input_var_name] += 1; - } - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id->count(arg_value_with_id)) { - (*output_name_map)[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } + // The following code will NOT run in these situations: + // 1. calibration mode (generating the TRT INT8 calibration table data); + // 2. a serialized TRT engine was already loaded from the cache above. + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc).
This process may cost a lot of time."; + std::unique_ptr trt_engine( + new tensorrt::TensorRTEngine( + Get("max_batch_size"), Get("workspace_size"), enable_int8, + calibrator.get(), Get("gpu_device_id"))); + auto *scope = param_scope(); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + std::unordered_set param_set(params.begin(), params.end()); + inference::Singleton::Global() + .ConvertBlockToTRTEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, trt_engine.get()); + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); + trt_engine_serialized_data = + std::string((const char *)serialized_engine_data->data(), + serialized_engine_data->size()); + + if (need_serialize) { + SaveTrtEngineSerializedDataToFile( + GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), + engine_key), + trt_engine_serialized_data); } + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index 6689a668f..f043670c5 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -20,6 +20,7 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 38313754e..90f09505c 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -27,7 +27,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -if (WITH_ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) set(inference_deps ${inference_deps} anakin_op_converter anakin_engine) endif() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4278d579e..001e8e66d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -40,10 +40,11 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" - #endif +#if PADDLE_WITH_ANAKIN #include "paddle/fluid/inference/anakin/convert/op_converter.h" +#endif DECLARE_bool(profile); @@ -817,6 +818,7 @@ USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(leaky_relu); #endif +#if PADDLE_WITH_ANAKIN USE_ANAKIN_CONVERTER(mul); USE_ANAKIN_CONVERTER(fc); USE_ANAKIN_CONVERTER(conv2d); @@ -838,3 +840,4 @@ USE_ANAKIN_CONVERTER(detection_out); USE_ANAKIN_CONVERTER(density_prior_box); USE_ANAKIN_CONVERTER(dropout); USE_ANAKIN_CONVERTER(sum); +#endif diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 33de430fc..d2e2a98a8 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -81,6 +81,8 @@ PaddleDType ZeroCopyTensor::type() { return PaddleDType::FLOAT32; } else if (type == framework::proto::VarType::INT64) { return PaddleDType::INT64; + } else if (type == framework::proto::VarType::INT32) { + return PaddleDType::INT32; } 
else { LOG(ERROR) << "unknown type, only support float32 and int64 now."; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8ccb6d5a1..afac8e4d2 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -34,7 +34,7 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (WITH_ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) add_subdirectory(anakin) endif() diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.cc b/paddle/fluid/operators/anakin/anakin_engine_op.cc index 48b0490d0..58db16ea0 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.cc +++ b/paddle/fluid/operators/anakin/anakin_engine_op.cc @@ -39,8 +39,7 @@ class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker { class AnakinEngineInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext *ctx) const override {} }; } // namespace operators -- GitLab
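[Reviewer note] The new `cmake/anakin_subgraph.cmake` deliberately mirrors `tensorrt.cmake`: `ANAKIN_FOUND` gates the CMake subdirectories, and the `-DPADDLE_WITH_ANAKIN` definition gates the C++ code. A minimal sketch of how a translation unit keys off that define, in the same style `analysis_predictor.cc` now uses (header path and converter name are taken from this patch):

```cpp
// Compiled only when cmake/anakin_subgraph.cmake located the Anakin headers
// and libraries and therefore added -DPADDLE_WITH_ANAKIN.
#if PADDLE_WITH_ANAKIN
#include "paddle/fluid/inference/anakin/convert/op_converter.h"

USE_ANAKIN_CONVERTER(sum);  // converter registration, as in the patch above
#endif
```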
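[Reviewer note] With `subgraph_util.{h,cc}`, `ExtractParameters` and `RenameAndGetOutputs` are now defined once and shared by both subgraph passes; the only behavioral switch is the `is_trt` flag, which gates the 1x1-conv renaming workaround described in the comment. A sketch of the two call sites implied by the diffs (the TensorRT-side call is unchanged context not shown in the hunks, so it is reconstructed here under the header's `is_trt = true` default):

```cpp
// Anakin pass (anakin_subgraph_pass.cc): the GoogleNet 1x1-conv fusion
// workaround is TensorRT-specific, so it is switched off explicitly.
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
                    &output_names_with_id, &output_names, &output_name_map,
                    false /*is_trt*/);

// TensorRT pass: relying on the default argument keeps the workaround on.
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
                    &output_names_with_id, &output_names, &output_name_map);
```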
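[Reviewer note] The refactor of `CreateTensorRTOp` replaces the deeply nested branches with early returns, which is why the diff above is mostly deletions. A condensed sketch of the resulting control flow (helper names are from the diff; `BuildAndSerializeEngine` is a hypothetical stand-in for the inline engine-building block, and `cache_dir` abbreviates `Get<std::string>("model_opt_cache_dir")`):

```cpp
if (calibration_mode) {
  return;  // INT8 without a table yet: this run only collects calibration data
}
std::copy(params.begin(), params.end(),
          std::back_inserter(*repetitive_params));

bool need_serialize = use_static_engine && !load_from_memory;
if (need_serialize) {
  trt_engine_serialized_data =
      GetTrtEngineSerializedData(cache_dir, engine_key);
  if (!trt_engine_serialized_data.empty()) {
    // Cache hit: attach the engine serialized on an earlier run and stop.
    SetAttr(op_desc->Proto(), "engine_serialized_data",
            trt_engine_serialized_data);
    return;
  }
}
// Slow path: build the TRT engine, serialize it, and cache it if allowed.
trt_engine_serialized_data = BuildAndSerializeEngine(/*block, params, ...*/);
if (need_serialize) {
  SaveTrtEngineSerializedDataToFile(
      GetTrtEngineSerializedPath(cache_dir, engine_key),
      trt_engine_serialized_data);
}
SetAttr(op_desc->Proto(), "engine_serialized_data",
        trt_engine_serialized_data);
```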