From bce259e5bfe6f61b5eb431402a983d6fe134e729 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com>
Date: Tue, 11 Jun 2019 13:55:49 +0800
Subject: [PATCH] Update the Anakin interfaces for content-dnn and MLU (#17890)

* update anakin-engine interfaces for content-dnn test=develop
* support only-gpu mode of Anakin; modify eltwise parse test=develop
* modification for thread-safe test=develop
* Integrated template instance test=develop
* increase template parameters test=develop
* support MLU predictor test=develop
* update anakin cmake files test=develop
* update TargetWrapper::set_device
* update the initialization of anakin subgraph test=develop
* use the default constructor of base class test=develop
---
 cmake/anakin_subgraph.cmake                   |  11 +-
 paddle/fluid/framework/ir/CMakeLists.txt      |   2 +-
 paddle/fluid/inference/CMakeLists.txt         |   8 +-
 .../inference/anakin/convert/elementwise.cc   |   2 +-
 .../inference/anakin/convert/op_converter.h   |  27 +-
 .../anakin/convert/test_activation_op.cc      |  32 --
 .../anakin/convert/test_affine_channel_op.cc  |   7 +-
 .../anakin/convert/test_batch_norm_op.cc      |   8 +-
 .../anakin/convert/test_concat_op.cc          |   8 +-
 .../anakin/convert/test_conv2d_op.cc          |   8 +-
 .../anakin/convert/test_dropout_op.cc         |   7 +-
 .../anakin/convert/test_elementwise_op.cc     |  10 +-
 .../inference/anakin/convert/test_fc_op.cc    |   7 +-
 .../anakin/convert/test_flatten_op.cc         |   7 +-
 .../anakin/convert/test_pool2d_op.cc          |   8 +-
 .../inference/anakin/convert/test_relu_op.cc  |   5 -
 .../anakin/convert/test_reshape_op.cc         |   8 +-
 .../anakin/convert/test_softmax_op.cc         |   8 +-
 .../inference/anakin/convert/test_split_op.cc |   7 +-
 .../inference/anakin/convert/test_sum_op.cc   |   7 +-
 .../anakin/convert/test_transpose_op.cc       |   7 +-
 .../inference/anakin/convert/ut_helper.h      |   8 +-
 paddle/fluid/inference/anakin/engine.cc       |  11 +-
 paddle/fluid/inference/anakin/engine.h        |   4 +-
 .../inference/anakin/test_anakin_engine.cc    |   1 -
 .../analysis/ir_passes/CMakeLists.txt         |   2 +-
 .../ir_passes/anakin_subgraph_pass.cc         |   6 +-
 paddle/fluid/inference/api/CMakeLists.txt     |  38 +-
 .../fluid/inference/api/api_anakin_engine.cc  | 445 +++++++++++-------
 .../fluid/inference/api/api_anakin_engine.h   |  60 ++-
 paddle/fluid/inference/api/helper.h           |   5 +-
 .../inference/api/paddle_anakin_config.h      |  20 +-
 .../inference/api/paddle_inference_api.h      |   2 +-
 .../inference/tests/api/anakin_mlu_tester.cc  |  98 ++++
 .../tests/api/anakin_mobilenet_tester.cc      |   4 +-
 .../inference/tests/api/anakin_rnn1_tester.cc |   4 +-
 .../inference/tests/api/anakin_rnn2_tester.cc | 261 ++++++++++
 paddle/fluid/operators/CMakeLists.txt         |   2 +-
 .../fluid/operators/anakin/anakin_engine_op.h |   4 +
 39 files changed, 818 insertions(+), 351 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/anakin_mlu_tester.cc
 create mode 100644 paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc

diff --git a/cmake/anakin_subgraph.cmake b/cmake/anakin_subgraph.cmake
index b5437e776d3..eb7bce9f3b7 100644
--- a/cmake/anakin_subgraph.cmake
+++ b/cmake/anakin_subgraph.cmake
@@ -1,7 +1,3 @@
-if(NOT WITH_GPU)
-  return()
-endif()
-
 set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
 find_path(ANAKIN_INCLUDE_DIR anakin_config.h
       PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
@@ -16,9 +12,7 @@ find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
       DOC "Path to ANAKIN library.")

 if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
-  if(WITH_DSO)
     set(ANAKIN_FOUND ON)
-  endif(WITH_DSO)
 else()
   set(ANAKIN_FOUND OFF)
 endif()
@@ -31,3 +25,8 @@ if(ANAKIN_FOUND)
link_directories(${ANAKIN_ROOT}) add_definitions(-DPADDLE_WITH_ANAKIN) endif() + +if(ANAKIN_FOUND AND WITH_GPU AND WITH_DSO) + message(STATUS "Compile with anakin subgraph.") + set(ANAKIN_SUBGRAPH ON) +endif() diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 4e2549ed511..5228840c960 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -77,7 +77,7 @@ pass_library(fillconstant_elementwisemul_fuse inference) pass_library(shuffle_channel_detect_pass inference) pass_library(delete_quant_dequant_op_pass inference) -if(ANAKIN_FOUND) +if(ANAKIN_SUBGRAPH) pass_library(simplify_anakin_priorbox_detection_out_pass inference) endif() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 5e0be5d445e..44eaf90371d 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -17,7 +17,7 @@ if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (ANAKIN_FOUND) +if (ANAKIN_SUBGRAPH) add_subdirectory(anakin) endif() @@ -43,11 +43,15 @@ if(WITH_MKLDNN) endif() set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor) +if (ANAKIN_FOUND) + set(ANAKIN_SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/api_anakin_engine.cc) +endif() set(SHARED_INFERENCE_SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${mkldnn_quantizer_src} - ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) + ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc + ${ANAKIN_SHARED_INFERENCE_SRCS}) if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc index dd32baa0b90..d221f26e119 100644 --- a/paddle/fluid/inference/anakin/convert/elementwise.cc +++ b/paddle/fluid/inference/anakin/convert/elementwise.cc @@ -60,7 +60,7 @@ void ElementwiseMulOpConverter::operator()( auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name}); - std::string elementwise_type = "Prod"; + std::string elementwise_type = "Mul"; this->engine_->template AddOpAttr(op_name, "type", elementwise_type); std::vector coeff = {1.0, 1.0}; diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index a6ae51bd4b1..1058e744bca 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -153,11 +153,12 @@ template class AnakinOpConverter<::anakin::saber::NV, ::anakin::Precision::FP32>; template class AnakinOpConverter<::anakin::saber::NV, ::anakin::Precision::INT8>; - +#ifdef ANAKIN_X86_PLACE template class AnakinOpConverter<::anakin::saber::X86, ::anakin::Precision::FP32>; template class AnakinOpConverter<::anakin::saber::X86, ::anakin::Precision::INT8>; +#endif } // namespace anakin } // namespace inference } // namespace paddle @@ -203,16 +204,16 @@ template class AnakinOpConverter<::anakin::saber::X86, CPU, ::anakin::saber::X86, precision_type__, \ ::anakin::Precision::precision_type__) -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE) #define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ 
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8); \ REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) -#else -#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ - REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ - REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) +#elif defined(PADDLE_WITH_CUDA) +#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ + REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ + REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) #endif #define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \ @@ -221,12 +222,16 @@ template class AnakinOpConverter<::anakin::saber::X86, __attribute__((unused)) = \ Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); +#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE) +#define USE_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32) +#define USE_INT8_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8) +#elif defined(PADDLE_WITH_CUDA) #define USE_ANAKIN_CONVERTER(op_type__) \ USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) #define USE_INT8_ANAKIN_CONVERTER(op_type__) \ USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) - -#define USE_CPU_ANAKIN_CONVERTER(op_type__) \ - USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32) -#define USE_CPU_INT8_ANAKIN_CONVERTER(op_type__) \ - USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8) +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc b/paddle/fluid/inference/anakin/convert/test_activation_op.cc index 4f898252d27..5ac8b45882f 100644 --- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc @@ -77,32 +77,6 @@ TEST(swish_op, gpu) { } #endif -/* -TEST(sigm_op, cpu) { - platform::CPUPlace cpu_place; - platform::CPUDeviceContext ctx(cpu_place); - test_activation_op<::anakin::saber::X86>("sigmoid", ctx, false); -} - -TEST(tanh_op, cpu) { - platform::CPUPlace cpu_place; - platform::CPUDeviceContext ctx(cpu_place); - test_activation_op<::anakin::saber::X86>("tanh", ctx, false); -} - -TEST(relu6_op, cpu) { - platform::CPUPlace cpu_place; - platform::CPUDeviceContext ctx(cpu_place); - test_activation_op<::anakin::saber::X86>("relu6", ctx, false); -} - -TEST(swish_op, cpu) { - platform::CPUPlace cpu_place; - platform::CPUDeviceContext ctx(cpu_place); - test_activation_op<::anakin::saber::X86>("swish", ctx, false); -} -*/ - } // namespace anakin } // namespace inference } // namespace paddle @@ -112,13 +86,7 @@ USE_OP(tanh); USE_OP(relu6); USE_OP(swish); -USE_CPU_ANAKIN_CONVERTER(sigmoid); -USE_CPU_ANAKIN_CONVERTER(tanh); -USE_CPU_ANAKIN_CONVERTER(relu6); -USE_CPU_ANAKIN_CONVERTER(swish); -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(sigmoid); USE_ANAKIN_CONVERTER(tanh); USE_ANAKIN_CONVERTER(relu6); USE_ANAKIN_CONVERTER(swish); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc index f6399387aa2..008537dc8a5 100644 --- a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc @@ -57,19 +57,16 @@ 
TEST(affine_channel_op, gpu) { test_affine_channel_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(affine_channel_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_affine_channel_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(affine_channel); -USE_CPU_ANAKIN_CONVERTER(affine_channel); -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(affine_channel); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc index c008ef1bd5e..edba90235fa 100644 --- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc @@ -73,19 +73,15 @@ TEST(batch_norm_op, gpu) { test_batchnorm_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(batch_norm_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_batchnorm_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(batch_norm); -USE_CPU_ANAKIN_CONVERTER(batch_norm); - -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(batch_norm); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc index 42dfbeb5cdc..6870260c865 100644 --- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc @@ -53,19 +53,15 @@ TEST(concat_op, gpu) { test_concat_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(concat_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_concat_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(concat); -USE_CPU_ANAKIN_CONVERTER(concat); - -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(concat); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc index e95e11c4f96..723a348b12e 100644 --- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc @@ -60,20 +60,16 @@ TEST(conv2d_op, gpu) { test_conv2d_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(conv2d_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_conv2d_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(conv2d); -USE_CPU_ANAKIN_CONVERTER(conv2d); - -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(conv2d); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc index ae27e27ded5..83792676a00 100644 --- a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc @@ -54,19 +54,16 @@ TEST(dropout_op, gpu) { test_dropout_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(dropout_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_dropout_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(dropout); -USE_CPU_ANAKIN_CONVERTER(dropout); -#ifdef PADDLE_WITH_CUDA 
USE_ANAKIN_CONVERTER(dropout); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc index bff75294908..ee128c1ec9a 100644 --- a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc @@ -59,29 +59,23 @@ TEST(elementwise_op, native_mul_gpu) { test_elementwise_op<::anakin::saber::NV>("elementwise_mul", ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(elementwise_op, native_add_cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_elementwise_op<::anakin::saber::X86>("elementwise_add", ctx, false); } - TEST(elementwise_op, native_mul_cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_elementwise_op<::anakin::saber::X86>("elementwise_mul", ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(elementwise_add); USE_OP(elementwise_mul); -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(elementwise_add); USE_ANAKIN_CONVERTER(elementwise_mul); -#endif - -USE_CPU_ANAKIN_CONVERTER(elementwise_add); -USE_CPU_ANAKIN_CONVERTER(elementwise_mul); diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc index a24c809c022..3e68d8fed6a 100644 --- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc @@ -49,19 +49,16 @@ TEST(mul_op, gpu) { test_mul_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(mul_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_mul_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(mul); -USE_CPU_ANAKIN_CONVERTER(fc); -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(fc); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc index 5765f5ebd1f..5e4cfdabfd7 100644 --- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc @@ -48,20 +48,17 @@ TEST(flatten_op, gpu) { test_flatten_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(flatten_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_flatten_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(reshape); USE_OP_ITSELF(flatten); -USE_CPU_ANAKIN_CONVERTER(flatten); -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(flatten); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc index 90503b1fbba..9b23b5b93df 100644 --- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc @@ -87,7 +87,7 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d<::anakin::saber::NV>(ctx, true, false, true, "avg"); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(Pool2dOpConverter, normal_cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); @@ -110,14 +110,10 @@ TEST(Pool2dOpConverter, avg_ceil_test_cpu) { platform::CPUDeviceContext ctx(cpu_place); test_pool2d<::anakin::saber::X86>(ctx, false, false, true, "avg"); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle 
USE_OP(pool2d); -USE_CPU_ANAKIN_CONVERTER(pool2d); - -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(pool2d); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc index 3f224796519..eb6429f3383 100644 --- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc @@ -66,10 +66,5 @@ TEST(leaky_relu_op, gpu) { USE_OP(relu); USE_OP(leaky_relu); -USE_CPU_ANAKIN_CONVERTER(relu); -USE_CPU_ANAKIN_CONVERTER(leaky_relu); - -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(relu); USE_ANAKIN_CONVERTER(leaky_relu); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc index e102bd3ac3e..b1be42e542c 100644 --- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc @@ -81,7 +81,7 @@ TEST(reshape2_op, gpu) { test_reshape2_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(reshape1_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); @@ -93,14 +93,10 @@ TEST(reshape2_op, cpu) { platform::CPUDeviceContext ctx(cpu_place); test_reshape2_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(reshape); -USE_CPU_ANAKIN_CONVERTER(reshape); - -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(reshape); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc index de0b18fdbfd..1a324739d98 100644 --- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc @@ -48,20 +48,16 @@ TEST(softmax_op, gpu) { test_softmax_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(relu_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_softmax_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(softmax); -USE_CPU_ANAKIN_CONVERTER(softmax); - -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(softmax); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_split_op.cc b/paddle/fluid/inference/anakin/convert/test_split_op.cc index 9a42ffd853b..f9ef54fdcac 100644 --- a/paddle/fluid/inference/anakin/convert/test_split_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc @@ -92,7 +92,7 @@ TEST(split_op, test_different_shape_axis3_batch1) { platform::CUDADeviceContext ctx(gpu_place); AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 3}, {2, 1}); } - +#ifdef ANAKIN_X86_PLACE TEST(split_op, test_different_shape_axis1_batch1_cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); @@ -110,13 +110,10 @@ TEST(split_op, test_different_shape_axis3_batch1_cpu) { platform::CPUDeviceContext ctx(cpu_place); AnakinSliceTest<::anakin::saber::X86, 3>(ctx, false, {1, 3, 2, 4}, {2, 2}); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(split); -USE_CPU_ANAKIN_CONVERTER(split); -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(split); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_sum_op.cc b/paddle/fluid/inference/anakin/convert/test_sum_op.cc index 65f67ebd129..9d26430ea68 100644 --- a/paddle/fluid/inference/anakin/convert/test_sum_op.cc +++ 
b/paddle/fluid/inference/anakin/convert/test_sum_op.cc @@ -49,19 +49,16 @@ TEST(sum_op, gpu) { test_sum_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(sum_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_sum_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(sum); -USE_CPU_ANAKIN_CONVERTER(sum); -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(sum); -#endif diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc index 51b69dfbb08..466e2f1a49f 100644 --- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc @@ -79,7 +79,7 @@ TEST(transpose2_op, gpu) { test_transpose2_op<::anakin::saber::NV>(ctx, true); } #endif - +#ifdef ANAKIN_X86_PLACE TEST(transpose1_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); @@ -91,13 +91,10 @@ TEST(transpose2_op, cpu) { platform::CPUDeviceContext ctx(cpu_place); test_transpose2_op<::anakin::saber::X86>(ctx, false); } - +#endif } // namespace anakin } // namespace inference } // namespace paddle USE_OP(transpose); -USE_CPU_ANAKIN_CONVERTER(transpose); -#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(transpose); -#endif diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index 2f8f953892c..92441f2560f 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -33,7 +33,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" using anakin::Precision; -using anakin::saber::X86; namespace paddle { namespace inference { @@ -215,13 +214,14 @@ class AnakinConvertValidation { template class AnakinConvertValidation<::anakin::saber::NV, ::anakin::Precision::FP32>; -template class AnakinConvertValidation<::anakin::saber::X86, - ::anakin::Precision::FP32>; - template class AnakinConvertValidation<::anakin::saber::NV, ::anakin::Precision::INT8>; +#ifdef ANAKIN_X86_PLACE +template class AnakinConvertValidation<::anakin::saber::X86, + ::anakin::Precision::FP32>; template class AnakinConvertValidation<::anakin::saber::X86, ::anakin::Precision::INT8>; +#endif } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index 98bbeddd76c..13f16c4c898 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -46,10 +46,9 @@ AnakinEngine::AnakinEngine( max_input_shape_(max_input_shape), program_inputs_(program_inputs), auto_config_layout_(auto_config_layout) { - std::call_once(init_anakin_, [this]() { - ::anakin::TargetWrapper::set_device(device_); - ::anakin::Env::env_init(); - }); + ::anakin::TargetWrapper::set_device(device_); + std::call_once(init_anakin_, + [this]() { ::anakin::Env::env_init(); }); graph_.reset(new AnakinGraphT()); net_.reset(new AnakinNetT(need_summary)); } @@ -194,14 +193,14 @@ template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>; template class AnakinEngineManager<::anakin::saber::NV, ::anakin::Precision::INT8>; #endif - +#ifdef ANAKIN_X86_PLACE template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; template class AnakinEngineManager<::anakin::saber::X86, ::anakin::Precision::FP32>; template class 
AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>; template class AnakinEngineManager<::anakin::saber::X86, ::anakin::Precision::INT8>; - +#endif // template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; } // namespace anakin } // namespace inference diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h index 04ac000e1ec..e62bb82fd12 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -24,7 +24,9 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/utils/singleton.h" - +#ifdef EXIT // NOLINT +#undef EXIT // NOLINT +#endif // NOLINT #include "framework/core/net/net.h" #include "framework/core/types.h" #include "framework/graph/graph.h" diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc b/paddle/fluid/inference/anakin/test_anakin_engine.cc index 422f415a5db..3c8a33ec60f 100644 --- a/paddle/fluid/inference/anakin/test_anakin_engine.cc +++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc @@ -22,7 +22,6 @@ limitations under the License. */ using anakin::AK_FLOAT; using anakin::Precision; using anakin::saber::NV; -using anakin::saber::X86; using anakin::saber::Shape; using anakin::PBlock; using anakin::PTuple; diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 05a3d7ddfdb..ddadbc6df4a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -15,7 +15,7 @@ if (WITH_GPU AND TENSORRT_FOUND) set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") endif() -if (ANAKIN_FOUND) +if (ANAKIN_SUBGRAPH) cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller) set(analysis_deps ${analysis_deps} diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index 9586ce3e6b0..a6c6f33cf77 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -226,7 +226,6 @@ void AnakinSubgraphPass::CreateAnakinEngine( auto max_batch_size = Get("max_batch_size"); auto max_input_shape = Get>>("max_input_shape"); - bool auto_config_layout = Get("auto_config_layout"); if (use_gpu) { #ifdef PADDLE_WITH_CUDA inference::Singleton< @@ -235,11 +234,14 @@ void AnakinSubgraphPass::CreateAnakinEngine( max_input_shape, program_inputs, false, engine_key); #endif } else { +#ifdef ANAKIN_X86_PLACE + bool auto_config_layout = Get("auto_config_layout"); inference::Singleton< anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global() .Create(true, Get("gpu_device_id"), max_batch_size, max_input_shape, program_inputs, auto_config_layout, engine_key); +#endif } auto *scope = param_scope(); @@ -258,6 +260,7 @@ void AnakinSubgraphPass::CreateAnakinEngine( param_set, output_mapping, anakin_engine); #endif } else { +#ifdef ANAKIN_X86_PLACE auto *anakin_engine = inference::Singleton>::Global() @@ -268,6 +271,7 @@ void AnakinSubgraphPass::CreateAnakinEngine( &block_desc_temp, scope, std::vector(input_names.begin(), input_names.end()), param_set, output_mapping, anakin_engine); +#endif } } diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 65b9443c212..1921e419383 
100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -27,7 +27,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -if (ANAKIN_FOUND) +if (ANAKIN_SUBGRAPH) set(inference_deps ${inference_deps} anakin_op_converter anakin_engine) endif() @@ -38,9 +38,9 @@ endif() add_subdirectory(details) if(WITH_MKLDNN) - set(mkldnn_quantizer_src mkldnn_quantizer.cc) - set(mkldnn_quantizer_cfg mkldnn_quantizer_config) - cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) + set(mkldnn_quantizer_src mkldnn_quantizer.cc) + set(mkldnn_quantizer_cfg mkldnn_quantizer_config) + cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) endif() cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder) @@ -56,9 +56,7 @@ cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS paddle_pass_builder zero_copy_tensor reset_tensor_array) -cc_test(test_paddle_inference_api - SRCS api_tester.cc - DEPS paddle_inference_api) +cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) if(WITH_TESTING) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} @@ -69,13 +67,21 @@ endif() cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) -if (WITH_ANAKIN AND WITH_MKL) # only needed in CI - # compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy device_context) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy device_context) - function(anakin_target target_name) - target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - endfunction() - anakin_target(inference_anakin_api) - anakin_target(inference_anakin_api_shared) +if(ANAKIN_FOUND) + if (ANAKIN_MLU AND NOT WITH_GPU AND NOT ANAKIN_X86) + message(STATUS "Compile with anakin mlu place.") + add_definitions(-DANAKIN_MLU_PLACE) + elseif(ANAKIN_X86) + message(STATUS "Compile with anakin x86 place.") + add_definitions(-DANAKIN_X86_PLACE) + endif() + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) + target_link_libraries(inference_anakin_api anakin anakin_saber_common) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) + target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) + function(anakin_target target_name) + target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endfunction() + anakin_target(inference_anakin_api) + anakin_target(inference_anakin_api_shared) endif() diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index 2c4894fd887..63d23321ab4 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -12,19 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/api/api_anakin_engine.h" - -#ifdef PADDLE_WITH_CUDA -#include -#endif - -#include -#include #include #include #include #include +#include "paddle/fluid/inference/api/api_anakin_engine.h" +#include "paddle/fluid/inference/api/paddle_api.h" + #include "framework/core/net/net.h" #include "framework/operators/ops.h" #include "saber/funcs/timer.h" @@ -32,209 +27,346 @@ namespace paddle { using paddle::contrib::AnakinConfig; +template +extern std::mutex PaddleInferenceAnakinPredictor::mutex_; +template +extern std::once_flag PaddleInferenceAnakinPredictor::init_anakin_; -template -PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( - const contrib::AnakinConfig &config) { - CHECK(Init(config)); +template +void PaddleInferenceAnakinPredictor::InitEnv() { + anakin::TargetWrapper::set_device(this->config_.device_id); + std::call_once(this->init_anakin_, [this]() { + anakin::Env::env_init(this->config_.max_stream); + }); } -template <> -PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( - const contrib::AnakinConfig &config) { - omp_set_dynamic(0); - omp_set_num_threads(1); - mkl_set_num_threads(1); - CHECK(Init(config)); +template +void PaddleInferenceAnakinPredictor::InitNet() { + std::unique_lock lock(this->mutex_); + this->executor_p_ = new anakin::Net(*this->graph_p_, true); } -template -bool PaddleInferenceAnakinPredictor::Init( - const contrib::AnakinConfig &config) { - if (!(graph_.load(config.model_file))) { - VLOG(3) << "fail to load graph from " << config.model_file; - return false; +template +void PaddleInferenceAnakinPredictor::SetContext() { + this->ctx_p_ = std::make_shared>( + this->config_.device_id, this->config_.data_stream_id, + this->config_.compute_stream_id); +} +template +void PaddleInferenceAnakinPredictor::InitGraph() { + this->graph_p_ = + std::make_shared>(); + if (!(this->graph_p_->load(this->config_.model_file))) { + LOG(FATAL) << "fail to load graph from " << this->config_.model_file; } - auto inputs = graph_.get_ins(); + auto inputs = this->graph_p_->get_ins(); for (auto &input_str : inputs) { - graph_.ResetBatchSize(input_str, config.max_batch_size); - max_batch_size_ = config.max_batch_size; + if (this->config_.init_inputs_shape.find(input_str) == + this->config_.init_inputs_shape.end()) { + LOG(FATAL) << input_str << " is not implemented."; + } + std::vector shape = + this->config_.init_inputs_shape.find(input_str)->second; + this->graph_p_->Reshape(input_str, shape); } - // optimization for graph - if (!(graph_.Optimize())) { - return false; +} +template +void PaddleInferenceAnakinPredictor::OptimizeGraph() { + if (!this->graph_p_->Optimize()) { + LOG(FATAL) << "Graph optimization error."; } - // construct executer - if (executor_p_ == nullptr) { - executor_p_ = new anakin::Net(graph_, true); +} +template +void PaddleInferenceAnakinPredictor::InitPredictor() { + this->InitEnv(); + this->SetContext(); + this->InitGraph(); + this->OptimizeGraph(); + this->InitNet(); +} +template +void PaddleInferenceAnakinPredictor::Predict() { + anakin::TargetWrapper::device_sync(); + this->executor_p_->prediction(); + anakin::TargetWrapper::device_sync(); +} +template +bool PaddleInferenceAnakinPredictor::Run( + const std::vector &inputs, + std::vector *output_data, int batch_size) { + if (this->config_.re_allocable) { + return this->RunImpl(inputs, output_data); + } else { + // Run inputs data that 
exceeds batch size in batches. + // 1. Reassign the batch size. + if (batch_size == -1) { + if (!inputs[0].lod.empty()) { + batch_size = inputs[0].lod[0].size() - 1; + } else { + batch_size = inputs[0].shape[0]; + } + } + // 2. If the data don't need to be batched, run it directly. + if (batch_size <= this->config_.init_batch_size) { + return this->RunImpl(inputs, output_data); + } + // 3. Check the batch size and define temporary variables. + std::vector cur_inputs; + std::vector outputs_master; + std::vector> outputs_vec; + for (const auto &input : inputs) { + if (!input.lod.empty()) { + if (input.lod.size() != 1) { + return false; + } + if (input.lod[0].size() - 1 != batch_size) { + return false; + } + } else { + LOG(INFO) << "Non-lod mode to be implemented."; + return false; + } + PaddleTensor tensor; + tensor.name = input.name; + tensor.dtype = PaddleDType::FLOAT32; + cur_inputs.push_back(tensor); + } + for (auto output : *output_data) { + PaddleTensor tensor; + tensor.name = output.name; + outputs_master.push_back(tensor); + } + // 4. Batch execution. + for (size_t start_batch = 0; start_batch < batch_size;) { + auto end_batch = start_batch + this->config_.init_batch_size; + if (end_batch > batch_size) { + end_batch = batch_size; + } + auto cur_outputs = outputs_master; + for (size_t i = 0; i < inputs.size(); i++) { + auto start = inputs[i].lod[0][start_batch]; + auto end = inputs[i].lod[0][end_batch]; + std::vector offsets; + for (size_t j = start_batch; j <= end_batch; j++) { + offsets.push_back(inputs[i].lod[0][j] - + inputs[i].lod[0][start_batch]); + } + auto mem_start = static_cast(inputs[i].data.data()) + start; + cur_inputs[i].data = + PaddleBuf(mem_start, (end - start) * sizeof(float)); + cur_inputs[i].lod = std::vector>({offsets}); + cur_inputs[i].shape = + std::vector({static_cast(end - start), 1, 1, 1}); + } + if (!this->RunImpl(cur_inputs, &cur_outputs)) { + return false; + } + outputs_vec.push_back(cur_outputs); + start_batch = end_batch; + } + // 5. Copy the results to contiguous memory. + // Assume that each batch has the same final outputs size. + auto count = [](const std::vector &v) { + int cnt = 1; + for_each(v.begin(), v.end(), [&cnt](int n) { cnt *= n; }); + return cnt; + }; + for (size_t i = 0; i < output_data->size(); i++) { + std::vector shape = outputs_vec[i][0].shape; + shape[0] = batch_size; + int total_cnt = count(shape); + (*output_data)[i].shape = shape; + (*output_data)[i].data.Resize(total_cnt * sizeof(float)); + float *addr = static_cast((*output_data)[i].data.data()); + for (const auto &single_out : outputs_vec) { + int cnt = count(single_out[i].shape); + memcpy(addr, single_out[i].data.data(), cnt * sizeof(float)); + addr += cnt; + } + } } return true; } - -template -bool PaddleInferenceAnakinPredictor::Run( +template +bool PaddleInferenceAnakinPredictor::RunImpl( const std::vector &inputs, - std::vector *output_data, int batch_size) { + std::vector *output_data) { for (const auto &input : inputs) { if (input.dtype != PaddleDType::FLOAT32) { - VLOG(3) << "Only support float type inputs. " << input.name - << "'s type is not float"; - return false; + LOG(FATAL) << "Only support float type inputs. 
" << input.name + << "'s type is not float"; } - auto d_tensor_in_p = executor_p_->get_in(input.name); - auto net_shape = d_tensor_in_p->shape(); + auto d_tensor_p = this->executor_p_->get_in(input.name); + auto net_shape = d_tensor_p->shape(); if (net_shape.size() != input.shape.size()) { - VLOG(3) << " input " << input.name - << "'s shape size should be equal to that of net"; - return false; + LOG(FATAL) << " input " << input.name + << "'s shape size should be equal to that of net"; } int sum = 1; for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; }); if (sum > net_shape.count()) { - graph_.Reshape(input.name, input.shape); - delete executor_p_; - executor_p_ = new anakin::Net(graph_, true); - d_tensor_in_p = executor_p_->get_in(input.name); + if (this->config_.re_allocable) { + this->graph_p_->Reshape(input.name, input.shape); + delete this->executor_p_; + this->InitNet(); + d_tensor_p = this->executor_p_->get_in(input.name); + } else { + LOG(FATAL) + << "Run failed because Anakin was expected not to reallocate " + "memory."; + } } - - anakin::saber::Shape tmp_shape; + std::vector tmp_shape; for (auto s : input.shape) { tmp_shape.push_back(s); } - d_tensor_in_p->reshape(tmp_shape); + auto *data = static_cast(input.data.data()); + anakin::saber::Tensor::Host_type> + h_tensor(data, typename anakin::DefaultHostType::Host_type(), 0, + tmp_shape); + d_tensor_p->reshape(tmp_shape); if (input.lod.size() > 0) { if (input.lod.size() > 1) { - VLOG(3) << " input lod first dim should <=1, but you set " - << input.lod.size(); - return false; + LOG(FATAL) << " input lod first dim should <=1, but you set " + << input.lod.size(); } - std::vector offset(input.lod[0].begin(), input.lod[0].end()); - d_tensor_in_p->set_seq_offset(offset); - VLOG(3) << "offset.size(): " << offset.size(); - for (int i = 0; i < offset.size(); i++) { - VLOG(3) << offset[i]; - } - } - - float *d_data_p = d_tensor_in_p->mutable_data(); - -#ifdef PADDLE_WITH_CUDA - if (std::is_same::value) { - if (cudaMemcpy(d_data_p, static_cast(input.data.data()), - d_tensor_in_p->valid_size() * sizeof(float), - cudaMemcpyHostToDevice) != 0) { - VLOG(3) << "copy data from CPU to GPU error"; - return false; + std::vector lod(input.lod[0].begin(), input.lod[0].end()); + std::vector> offset({lod}); + d_tensor_p->set_seq_offset(offset); + VLOG(3) << "offset.size(): " << offset[0].size(); + for (int i = 0; i < offset[0].size(); i++) { + VLOG(3) << offset[0][i]; } } -#endif - if (std::is_same::value) { - memcpy(d_data_p, static_cast(input.data.data()), - d_tensor_in_p->valid_size() * sizeof(float)); - } + d_tensor_p->copy_from(h_tensor); } -#ifdef PADDLE_WITH_CUDA - cudaDeviceSynchronize(); - executor_p_->prediction(); - cudaDeviceSynchronize(); -#endif - + this->Predict(); if (output_data->empty()) { - VLOG(3) << "At least one output should be set with tensors' names."; - return false; + LOG(FATAL) << "At least one output should be set with tensors' names."; } for (auto &output : *output_data) { - auto *tensor = executor_p_->get_out(output.name); - output.shape = tensor->valid_shape(); - if (output.data.length() < tensor->valid_size() * sizeof(float)) { - output.data.Resize(tensor->valid_size() * sizeof(float)); - } - -#if PADDLE_WITH_CUDA - if (std::is_same::value) { - // Copy data from GPU -> CPU - if (cudaMemcpy(output.data.data(), tensor->mutable_data(), - tensor->valid_size() * sizeof(float), - cudaMemcpyDeviceToHost) != 0) { - VLOG(3) << "copy data from GPU to CPU error"; - return false; - } - } -#endif - if 
(std::is_same::value) { - memcpy(output.data.data(), tensor->mutable_data(), - tensor->valid_size() * sizeof(float)); + auto *d_tensor_p = this->executor_p_->get_out(output.name); + output.shape = d_tensor_p->valid_shape(); + if (output.data.length() < d_tensor_p->valid_size() * sizeof(float)) { + output.data.Resize(d_tensor_p->valid_size() * sizeof(float)); } + auto *data = static_cast(output.data.data()); + anakin::saber::Tensor::Host_type> + h_tensor(data, typename anakin::DefaultHostType::Host_type(), 0, + d_tensor_p->valid_shape()); + h_tensor.copy_from(*d_tensor_p); } return true; } - -template -anakin::Net - &PaddleInferenceAnakinPredictor::get_executer() { - return *executor_p_; +template +bool PaddleInferenceAnakinPredictor::ResetConfig( + const AnakinConfig &config) { + this->config_ = config; + return true; +} +template +anakin::Net &PaddleInferenceAnakinPredictor::ResetExecuter( + std::shared_ptr> graph_p) { + this->graph_p_ = graph_p; + this->ctx_p_ = std::make_shared>( + this->config_.device_id, this->config_.data_stream_id, + this->config_.compute_stream_id); + this->InitNet(); + return *this->executor_p_; } - // the cloned new Predictor of anakin share the same net weights from original // Predictor -template +template std::unique_ptr -PaddleInferenceAnakinPredictor::Clone() { +PaddleInferenceAnakinPredictor::Clone() { VLOG(3) << "Anakin Predictor::clone"; std::unique_ptr cls( - new PaddleInferenceAnakinPredictor()); + new PaddleInferenceAnakinPredictor()); // construct executer from other graph auto anakin_predictor_p = - dynamic_cast *>(cls.get()); + dynamic_cast *>(cls.get()); if (!anakin_predictor_p) { - VLOG(3) << "fail to call Init"; - return nullptr; + LOG(FATAL) << "fail to call Init"; } - anakin_predictor_p->get_executer().init(graph_); + anakin_predictor_p->ResetConfig(this->config_); + anakin_predictor_p->ResetExecuter(this->graph_p_); + return cls; +} - return std::move(cls); +#ifdef ANAKIN_MLU_PLACE +template +void PaddleInferenceAnakinMLUPredictor::SetContext() { + this->ctx_p_ = std::make_shared>( + this->config_.device_id, this->config_.data_stream_id, + this->config_.compute_stream_id); + this->ctx_p_->set_model_parallel(this->config_.model_parallel); + this->ctx_p_->set_fusion(this->config_.op_fuse); } +template +void PaddleInferenceAnakinMLUPredictor::OptimizeGraph() { + if (!this->graph_p_->fusion_optimize(this->config_.op_fuse)) { + LOG(FATAL) << "Graph optimization error."; + } +} +template +void PaddleInferenceAnakinMLUPredictor::InitNet() { + std::unique_lock lock(this->mutex_); + this->executor_p_ = new anakin::Net(); + this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true); +} +template +void PaddleInferenceAnakinMLUPredictor::Predict() { + anakin::TargetWrapper::device_sync(); + this->executor_p_->fusion_prediction(); + anakin::TargetWrapper::device_sync(); +} +#endif #ifdef PADDLE_WITH_CUDA -template class PaddleInferenceAnakinPredictor; +template class PaddleInferenceAnakinPredictor< + anakin::NV, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>; +#endif +#ifdef ANAKIN_X86_PLACE +template class PaddleInferenceAnakinPredictor< + anakin::X86, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>; +#endif +#ifdef ANAKIN_MLU_PLACE +template class PaddleInferenceAnakinMLUPredictor; #endif -template class PaddleInferenceAnakinPredictor; // A factory to help create difference predictor. 
template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnakinConfig &config) { - VLOG(3) << "Anakin Predictor create."; - if (config.target_type == contrib::AnakinConfig::NVGPU) { #ifdef PADDLE_WITH_CUDA - VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; - std::unique_ptr x( - new PaddleInferenceAnakinPredictor(config)); - return x; -#else - LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment"; - return nullptr; + if (config.target_type == contrib::AnakinConfig::NVGPU) { + return std::unique_ptr( + new PaddleInferenceAnakinPredictor(config)); + } #endif - } else if (config.target_type == contrib::AnakinConfig::X86) { - VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; - std::unique_ptr x( - new PaddleInferenceAnakinPredictor(config)); - return x; - } else { - VLOG(3) << "Anakin Predictor create on unknown platform."; - return nullptr; +#ifdef ANAKIN_X86_PLACE + if (config.target_type == contrib::AnakinConfig::X86) { + return std::unique_ptr( + new PaddleInferenceAnakinPredictor(config)); } +#endif +#ifdef ANAKIN_MLU_PLACE + if (config.target_type == contrib::AnakinConfig::MLU) { + return std::unique_ptr( + new PaddleInferenceAnakinMLUPredictor( + config)); + } +#endif + LOG(FATAL) << "Anakin Predictor create on unknown platform."; + return nullptr; } - +template +void DisplayOpTimer(anakin::Net *net_executor, int epoch) { #ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER -template -using executor_t = - anakin::Net; - -template -void DisplayOpTimer(executor_t *net_executor, int epoch) { std::vector op_time = net_executor->get_op_time(); auto exec_funcs = net_executor->get_exec_funcs(); auto op_param = net_executor->get_op_param(); @@ -254,16 +386,13 @@ void DisplayOpTimer(executor_t *net_executor, int epoch) { for (auto it = op_map.begin(); it != op_map.end(); ++it) { LOG(INFO) << it->first << " " << (it->second) / epoch << " ms"; } -} #endif - -template -PaddleInferenceAnakinPredictor::~PaddleInferenceAnakinPredictor() { -#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER - DisplayOpTimer(executor_p_, max_batch_size_); -#endif - delete executor_p_; - executor_p_ = nullptr; +} +template +PaddleInferenceAnakinPredictor::~PaddleInferenceAnakinPredictor() { + DisplayOpTimer(this->executor_p_, this->config_.init_batch_size); + delete this->executor_p_; + this->executor_p_ = nullptr; } } // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index e14d93de2c4..32f8def63c0 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ limitations under the License. */ #pragma once +#include #include #include "framework/core/net/net.h" @@ -30,13 +31,18 @@ limitations under the License. 
*/ namespace paddle { using contrib::AnakinConfig; +using anakin::Precision; +using anakin::OpRunType; -template +template class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: - PaddleInferenceAnakinPredictor() {} + PaddleInferenceAnakinPredictor() = default; - explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config); + explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config) + : config_(config) { + this->InitPredictor(); + } // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. @@ -45,21 +51,45 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { int batch_size = -1) override; std::unique_ptr Clone() override; - - anakin::Net& - get_executer(); + virtual bool ResetConfig(const AnakinConfig& config); + virtual anakin::Net& ResetExecuter( + std::shared_ptr> graph_p); + void InitPredictor(); ~PaddleInferenceAnakinPredictor() override; - private: - bool Init(const AnakinConfig& config); - - anakin::graph::Graph - graph_; - anakin::Net* - executor_p_{nullptr}; + static std::mutex mutex_; AnakinConfig config_; - int max_batch_size_{0}; + std::shared_ptr> ctx_p_; + std::shared_ptr> graph_p_; + anakin::Net* executor_p_{nullptr}; + + void InitEnv(); + void InitGraph(); + virtual void OptimizeGraph(); + virtual void InitNet(); + virtual void SetContext(); + virtual void Predict(); + + private: + bool RunImpl(const std::vector& inputs, + std::vector* output_data); + static std::once_flag init_anakin_; }; +#ifdef ANAKIN_MLU_PLACE +template +class PaddleInferenceAnakinMLUPredictor final + : public PaddleInferenceAnakinPredictor { + public: + explicit PaddleInferenceAnakinMLUPredictor(const AnakinConfig& config) { + this->ResetConfig(config); + this->InitPredictor(); + } + void SetContext() override; + void OptimizeGraph() override; + void InitNet() override; + void Predict() override; +}; +#endif } // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 38f754b92d4..e5820c3637b 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -64,9 +64,12 @@ static int GetUniqueId() { } static void split(const std::string &str, char sep, - std::vector *pieces) { + std::vector *pieces, bool ignore_null = true) { pieces->clear(); if (str.empty()) { + if (!ignore_null) { + pieces->push_back(str); + } return; } size_t pos = 0; diff --git a/paddle/fluid/inference/api/paddle_anakin_config.h b/paddle/fluid/inference/api/paddle_anakin_config.h index 0e91c2624be..7c0e2f06ff4 100644 --- a/paddle/fluid/inference/api/paddle_anakin_config.h +++ b/paddle/fluid/inference/api/paddle_anakin_config.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ #pragma once #include +#include #include #include #include @@ -24,11 +25,22 @@ namespace paddle { namespace contrib { // Configurations for Anakin engine. 
struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86 }; - int device; + enum TargetType { NVGPU = 0, X86, MLU }; + int device_id{0}; std::string model_file; - int max_batch_size{-1}; + std::map> init_inputs_shape; + int init_batch_size{-1}; + bool re_allocable{true}; + int max_stream{4}; + int data_stream_id{0}; + int compute_stream_id{0}; TargetType target_type; +#ifdef ANAKIN_MLU_PLACE + int model_parallel{8}; + int data_parallel{1}; + bool op_fuse{false}; + bool sparse{false}; +#endif }; } // namespace contrib diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 1785bd520a1..2906a4926f7 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -28,6 +28,6 @@ limitations under the License. */ #include "paddle_analysis_config.h" // NOLINT #include "paddle_api.h" // NOLINT -#ifdef WITH_ANAKIN +#if (defined WITH_ANAKIN) || (defined PADDLE_WITH_ANAKIN) #include "paddle_anakin_config.h" // NOLINT #endif diff --git a/paddle/fluid/inference/tests/api/anakin_mlu_tester.cc b/paddle/fluid/inference/tests/api/anakin_mlu_tester.cc new file mode 100644 index 00000000000..8094c744fef --- /dev/null +++ b/paddle/fluid/inference/tests/api/anakin_mlu_tester.cc @@ -0,0 +1,98 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(model, "", "Directory of the inference model."); + +namespace paddle { + +contrib::AnakinConfig Config() { + // Determine the use of memory here. + std::map> init_inputs_shape; + init_inputs_shape["input_0"] = std::vector({1, 3, 112, 112}); + + contrib::AnakinConfig config; + config.target_type = contrib::AnakinConfig::MLU; + config.model_file = FLAGS_model; + config.init_inputs_shape = init_inputs_shape; + + // Determine the device execution context. + config.device_id = 0; + config.data_stream_id = 0; + config.compute_stream_id = 0; + + // Set re_allocable and op_fuse TRUE. + config.re_allocable = true; + config.op_fuse = true; + + return config; +} + +void single_test() { + // 1. Defining basic data structures. + auto config = paddle::Config(); + auto predictor = + paddle::CreatePaddlePredictor(config); + + // 2. Define the data structure of the predictor inputs and outputs. + std::vector input_tensors; + std::vector output_tensors; + + // 3. Define and fill the inputs tensor. 
+ int num = 1; + int channel = 3; + int height = 112; + int width = 112; + std::vector input(num * channel * height * width, 1); + std::vector> inputs({input}); + const std::vector input_names{"input_0"}; + for (auto& name : input_names) { + paddle::PaddleTensor tensor; + tensor.name = name; + tensor.dtype = PaddleDType::FLOAT32; + input_tensors.push_back(tensor); + } + for (size_t j = 0; j < input_tensors.size(); j++) { + input_tensors[j].data = + paddle::PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float)); + // The shape of each execution can be changed. + input_tensors[j].shape = std::vector({num, channel, height, width}); + } + + // 4. Set the output placeholder of predictor. + PaddleTensor predict_out, score_out; + predict_out.name = "landmark_predict_out"; + score_out.name = "landmark_score_out"; + output_tensors.push_back(predict_out); + output_tensors.push_back(score_out); + + // 5. Execution predict. + predictor->Run(input_tensors, &output_tensors); + + // 6. Take out the output data. + for (auto out : output_tensors) { + float* data_o = static_cast(out.data.data()); + LOG(INFO) << out.name << " size = " << out.data.length() / sizeof(float); + } +} +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::single_test(); + return 0; +} diff --git a/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc index cf97f064bed..48689486af4 100644 --- a/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc @@ -27,8 +27,8 @@ contrib::AnakinConfig GetConfig() { // using AnakinConfig::X86 if you need to use cpu to do inference config.target_type = contrib::AnakinConfig::NVGPU; config.model_file = FLAGS_model; - config.device = 0; - config.max_batch_size = 1; + config.device_id = 0; + config.init_batch_size = 1; return config; } diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index da42688f29f..db01cfebcb2 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -100,8 +100,8 @@ contrib::AnakinConfig GetConfig() { // using AnakinConfig::X86 if you need to use cpu to do inference config.target_type = contrib::AnakinConfig::X86; config.model_file = FLAGS_model; - config.device = 0; - config.max_batch_size = 1000; // the max number of token + config.device_id = 0; + config.init_batch_size = 1000; // the max number of token return config; } diff --git a/paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc new file mode 100644 index 00000000000..27abaa530b3 --- /dev/null +++ b/paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc @@ -0,0 +1,261 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +#define BUFFER_SIZE (10000) +#define COMPARE_OUTPUTS (1) +#define PRINT_INPUTS (0) + +DEFINE_string(model, "", "Directory of the inference model."); +DEFINE_string(datapath, "", "Path of the dataset."); +DEFINE_string(truthpath, "", "Path of the ground-truth data."); +DEFINE_int32(batch_size, 1, "Batch size per execution."); +DEFINE_int32(repeats, 1, "Number of iterations."); +DEFINE_int32( + start_line, 0, + "The starting line of the text file read (this line will be read)."); +DEFINE_int32(end_line, 1000000, + "The ending line of the text file read (this line will be read)."); +DEFINE_int32(init_batch_size, 40, + "Max batch size for Anakin memory allocation."); +DEFINE_int32(threads_num, 2, "Number of threads for Anakin."); + +class Data { + public: + Data(std::string file_name, size_t batch_size, size_t start = 0, + size_t end = 1000000) + : _batch_size(batch_size), _total_length(0), _inputs_size(6) { + _file.open(file_name); + _file.seekg(0, _file.end); + _total_length = _file.tellg(); + _file.seekg(0, _file.beg); + read_file_to_vec(start, end); + reset_current_line(); + } + void reset_current_line(); + const std::vector<std::string>& get_lines(); + void read_file_to_vec(const size_t start, const size_t end); + int get_next_batches(std::vector<std::vector<float>>* inputs, + std::vector<std::vector<size_t>>* seq_offsets); + + private: + std::fstream _file; + int _batch_size; + size_t _total_length; + size_t _inputs_size; + std::vector<std::string> _lines; + size_t _current_line; +}; + +void Data::read_file_to_vec(const size_t start, const size_t end) { + std::string line; + size_t count = 0; + _lines.clear(); + while (std::getline(_file, line)) { + if (count >= start && count <= end) { + _lines.push_back(line); + } + count++; + } +} + +const std::vector<std::string>& Data::get_lines() { return _lines; } + +void Data::reset_current_line() { _current_line = 0; } + +int Data::get_next_batches(std::vector<std::vector<float>>* data, + std::vector<std::vector<size_t>>* offsets) { + data->clear(); + offsets->clear(); + data->resize(_inputs_size); + offsets->resize(_inputs_size); + for (auto& offset : *offsets) { + offset.push_back(0); + } + + int seq_num = -1; + int pre_query_index = -1; + while (_current_line < _lines.size()) { + int cur_query_index = -1; + std::vector<std::string> line; + paddle::inference::split(_lines[_current_line], ';', &line); + for (size_t i = 0; i < line.size(); i++) { + std::vector<float> float_v; + paddle::inference::split_to_float(line[i], ' ', &float_v); + if (i == 0) { + cur_query_index = float_v[0]; + if (pre_query_index != -1 && cur_query_index != pre_query_index) { + return seq_num; + } + seq_num++; + _current_line++; + } else { + if (float_v.size() == 0) { + float_v.push_back(-1); + } + (*data)[i - 1].insert((*data)[i - 1].end(), float_v.begin(), + float_v.end()); + (*offsets)[i - 1].push_back((*offsets)[i - 1][seq_num] + + float_v.size()); + } + } + if (seq_num + 1 >= _batch_size) { + return seq_num; + } else { + pre_query_index = cur_query_index; + } + } + return seq_num; +} + +namespace paddle { + +contrib::AnakinConfig GetConfig() { + contrib::AnakinConfig config; + + std::map<std::string, std::vector<int>> init_inputs_shape; + init_inputs_shape["q_basic"] = std::vector<int>({1000, 1, 1, 1}); + init_inputs_shape["q_bigram0"] = std::vector<int>({1000, 1, 1, 1}); + init_inputs_shape["pt_basic"] = std::vector<int>({2000, 1, 1, 1}); + init_inputs_shape["pa_basic"] = std::vector<int>({4000, 1, 1, 1}); + init_inputs_shape["pa_bigram0"] = std::vector<int>({4000, 1, 1, 1}); + init_inputs_shape["pt_bigram0"] = std::vector<int>({2000, 1, 1,
1}); + + // using AnakinConfig::X86 if you need to use cpu to do inference + config.target_type = contrib::AnakinConfig::NVGPU; + config.model_file = FLAGS_model; + config.device_id = 0; + config.init_batch_size = FLAGS_init_batch_size; + config.init_inputs_shape = init_inputs_shape; + config.re_allocable = false; + return config; +} + +void single_test(PaddlePredictor* predictor_master) { + auto predictor = predictor_master->Clone(); + + Data data(FLAGS_datapath, FLAGS_batch_size, FLAGS_start_line, FLAGS_end_line); + + std::vector<std::vector<float>> inputs; + std::vector<std::vector<size_t>> seq_offsets; + std::vector<float> compare_outputs; + + const std::vector<std::string> input_names{"q_basic", "q_bigram0", + "pt_basic", "pt_bigram0", + "pa_basic", "pa_bigram0"}; + std::vector<PaddleTensor> input_tensors; + std::vector<PaddleTensor> output_tensors; + for (auto& name : input_names) { + PaddleTensor tensor; + tensor.name = name; + tensor.dtype = PaddleDType::FLOAT32; + input_tensors.push_back(tensor); + } + + PaddleTensor tensor_out; + tensor_out.name = "save_infer_model/scale_0"; + tensor_out.shape = std::vector<int>({}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; + output_tensors.push_back(tensor_out); + + inference::Timer timer; + for (int i = 0; i < FLAGS_repeats; i++) { + data.reset_current_line(); + size_t count = 0; + float time_sum = 0; + while (data.get_next_batches(&inputs, &seq_offsets) >= 0) { +#if PRINT_INPUTS + for (size_t i = 0; i < inputs.size(); i++) { + LOG(INFO) << "data " << i; + for (size_t j = 0; j < inputs[i].size(); j++) { + LOG(INFO) << j << ": " << inputs[i][j]; + } + for (auto j : seq_offsets[i]) { + LOG(INFO) << "offsets: " << i << ": " << j; + } + } +#endif + for (size_t j = 0; j < input_tensors.size(); j++) { + input_tensors[j].data = + PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float)); + input_tensors[j].lod = + std::vector<std::vector<size_t>>({seq_offsets[j]}); + input_tensors[j].shape = + std::vector<int>({static_cast<int>(inputs[j].size()), 1, 1, 1}); + } + timer.tic(); + predictor->Run(input_tensors, &output_tensors); + float time = timer.toc(); +#if COMPARE_OUTPUTS + float* data_o = static_cast<float*>(output_tensors[0].data.data()); + LOG(INFO) << "outputs[0].data.size() = " + << output_tensors[0].data.length() / sizeof(float); + size_t sum = 1; + for_each(output_tensors[0].shape.begin(), output_tensors[0].shape.end(), + [&](int n) { sum *= n; }); + for (size_t j = 0; j < sum; ++j) { + LOG(INFO) << "output[" << j << "]: " << data_o[j]; + compare_outputs.push_back(data_o[j]); + } +#endif + LOG(INFO) << "Single Time: " << time; + count++; + if (count > 10) { + time_sum += timer.toc(); + } + } + inference::PrintTime(FLAGS_batch_size, FLAGS_repeats, 1, 0, + time_sum / (count - 10)); +#if COMPARE_OUTPUTS + Data data(FLAGS_truthpath, 1); + const std::vector<std::string> truth_vals = data.get_lines(); + for (size_t j = 0; j < truth_vals.size(); j++) { + float truth = std::atof(truth_vals[j].c_str()); + float compa = compare_outputs[j]; + float diff = std::abs(truth - compa); + LOG(INFO) << "[DIFF " << j << " ] " << diff; + if (diff > 0.0001) { + LOG(FATAL) << "The result is wrong!"; + } + } + LOG(INFO) << "The result is correct!"; +#endif + } +} +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + std::vector<std::thread> threads; + + auto config = paddle::GetConfig(); + config.data_stream_id = 0; + config.compute_stream_id = 0; + std::unique_ptr<paddle::PaddlePredictor> predictor_master = + paddle::CreatePaddlePredictor(config); + + for (int i = 0; i < FLAGS_threads_num; i++) { + threads.push_back(std::thread(paddle::single_test,
predictor_master.get())); + } + for (auto& t : threads) { + t.join(); + } + return 0; +} diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b7abc68949c..3356c1e669d 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -33,7 +33,7 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (ANAKIN_FOUND) +if (ANAKIN_SUBGRAPH) add_subdirectory(anakin) endif() diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h index 11c394c76cd..b4aaa228693 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.h +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -119,11 +119,15 @@ class AnakinEngineOp : public framework::OperatorBase { engine->Execute(inputs, outputs, stream); #endif } else { +#ifdef ANAKIN_X86_PLACE auto *engine = inference::Singleton>::Global() .Get(engine_key_); engine->Execute(inputs, outputs); +#else + LOG(FATAL) << "Unknown Platform for AnakinEngine!"; +#endif } } }; -- GitLab
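
For reference, the threading pattern exercised by anakin_rnn2_tester.cc above (one master predictor created from an AnakinConfig, then one Clone() per worker thread) can be reduced to the minimal sketch below. It is not part of the patch; it only assumes the public types in paddle_inference_api.h used by the new testers, and it calls CreatePaddlePredictor(config) the same way anakin_mlu_tester.cc does. The model path and the input name "input_0" are placeholders, and the input shape is arbitrary.

// Hypothetical usage sketch, not part of the patch.
#include <memory>
#include <thread>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace {

// Each worker thread runs on its own clone; the master predictor is shared.
void RunOnClone(paddle::PaddlePredictor* master) {
  auto predictor = master->Clone();
  std::vector<float> input(4, 1.0f);  // dummy input data
  paddle::PaddleTensor in;
  in.name = "input_0";                // placeholder input name
  in.dtype = paddle::PaddleDType::FLOAT32;
  in.shape = std::vector<int>({1, 4, 1, 1});
  in.data = paddle::PaddleBuf(input.data(), input.size() * sizeof(float));
  std::vector<paddle::PaddleTensor> inputs{in};
  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run(inputs, &outputs);
}

}  // namespace

int main() {
  paddle::contrib::AnakinConfig config;
  config.target_type = paddle::contrib::AnakinConfig::X86;  // or NVGPU / MLU
  config.model_file = "/path/to/model";                     // placeholder
  config.init_inputs_shape["input_0"] = std::vector<int>({1, 4, 1, 1});
  auto master = paddle::CreatePaddlePredictor(config);

  std::vector<std::thread> threads;
  for (int i = 0; i < 2; ++i) {
    threads.emplace_back(RunOnClone, master.get());
  }
  for (auto& t : threads) {
    t.join();
  }
  return 0;
}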