Unverified commit bce259e5, authored by 石晓伟, committed by GitHub

Update the Anakin interfaces for content-dnn and MLU (#17890)

* update anakin-engine interfaces for content-dnn

test=develop

* support only-gpu mode of Anakin

modify eltwise parse

test=develop

* modifications for thread safety

test=develop

* integrate template instances

test=develop

* increase template parameters

test=develop

* support MLU predictor

test=develop

* update anakin cmake files

test=develop

* update TargetWrapper::set_device

* update the initialization of anakin subgraph

test=develop

* use the default constructor of base class

test=develop
Parent 410907f6
if(NOT WITH_GPU)
return()
endif()
set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
find_path(ANAKIN_INCLUDE_DIR anakin_config.h
PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
......@@ -16,9 +12,7 @@ find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
DOC "Path to ANAKIN library.")
if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
if(WITH_DSO)
set(ANAKIN_FOUND ON)
endif(WITH_DSO)
else()
set(ANAKIN_FOUND OFF)
endif()
......@@ -31,3 +25,8 @@ if(ANAKIN_FOUND)
link_directories(${ANAKIN_ROOT})
add_definitions(-DPADDLE_WITH_ANAKIN)
endif()
if(ANAKIN_FOUND AND WITH_GPU AND WITH_DSO)
message(STATUS "Compile with anakin subgraph.")
set(ANAKIN_SUBGRAPH ON)
endif()
......@@ -77,7 +77,7 @@ pass_library(fillconstant_elementwisemul_fuse inference)
pass_library(shuffle_channel_detect_pass inference)
pass_library(delete_quant_dequant_op_pass inference)
if(ANAKIN_FOUND)
if(ANAKIN_SUBGRAPH)
pass_library(simplify_anakin_priorbox_detection_out_pass inference)
endif()
......
......@@ -17,7 +17,7 @@ if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
if (ANAKIN_FOUND)
if (ANAKIN_SUBGRAPH)
add_subdirectory(anakin)
endif()
......@@ -43,11 +43,15 @@ if(WITH_MKLDNN)
endif()
set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor)
if (ANAKIN_FOUND)
set(ANAKIN_SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/api_anakin_engine.cc)
endif()
set(SHARED_INFERENCE_SRCS
io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
${mkldnn_quantizer_src}
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
${ANAKIN_SHARED_INFERENCE_SRCS})
if(WIN32)
sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
......
......@@ -60,7 +60,7 @@ void ElementwiseMulOpConverter<TargetT, PrecisionT>::operator()(
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
std::string elementwise_type = "Prod";
std::string elementwise_type = "Mul";
this->engine_->template AddOpAttr<std::string>(op_name, "type",
elementwise_type);
std::vector<float> coeff = {1.0, 1.0};
......
......@@ -153,11 +153,12 @@ template class AnakinOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
template class AnakinOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
#ifdef ANAKIN_X86_PLACE
template class AnakinOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
......@@ -203,16 +204,16 @@ template class AnakinOpConverter<::anakin::saber::X86,
CPU, ::anakin::saber::X86, precision_type__, \
::anakin::Precision::precision_type__)
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE)
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8); \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
#else
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
#elif defined(PADDLE_WITH_CUDA)
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
#endif
#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \
......@@ -221,12 +222,16 @@ template class AnakinOpConverter<::anakin::saber::X86,
__attribute__((unused)) = \
Touch_anakin_##op_type__##_##place_type__##_##precision_type__();
#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE)
#define USE_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32)
#define USE_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8)
#elif defined(PADDLE_WITH_CUDA)
#define USE_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32)
#define USE_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8)
#define USE_CPU_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32)
#define USE_CPU_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8)
#endif
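For orientation, a hedged sketch of how these macros are consumed: a converter source file registers itself once, and client code pulls the registration in with the USE_* macros. The converter class name below is illustrative only; the USE_* lines mirror what the unit tests further down in this diff already do.

// In a converter implementation file (class name is hypothetical):
REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);

// In a translation unit that needs the converter linked in:
USE_ANAKIN_CONVERTER(relu);
USE_INT8_ANAKIN_CONVERTER(relu);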
......@@ -77,32 +77,6 @@ TEST(swish_op, gpu) {
}
#endif
/*
TEST(sigm_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("sigmoid", ctx, false);
}
TEST(tanh_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("tanh", ctx, false);
}
TEST(relu6_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("relu6", ctx, false);
}
TEST(swish_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("swish", ctx, false);
}
*/
} // namespace anakin
} // namespace inference
} // namespace paddle
......@@ -112,13 +86,7 @@ USE_OP(tanh);
USE_OP(relu6);
USE_OP(swish);
USE_CPU_ANAKIN_CONVERTER(sigmoid);
USE_CPU_ANAKIN_CONVERTER(tanh);
USE_CPU_ANAKIN_CONVERTER(relu6);
USE_CPU_ANAKIN_CONVERTER(swish);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(sigmoid);
USE_ANAKIN_CONVERTER(tanh);
USE_ANAKIN_CONVERTER(relu6);
USE_ANAKIN_CONVERTER(swish);
#endif
......@@ -57,19 +57,16 @@ TEST(affine_channel_op, gpu) {
test_affine_channel_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(affine_channel_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_affine_channel_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(affine_channel);
USE_CPU_ANAKIN_CONVERTER(affine_channel);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(affine_channel);
#endif
......@@ -73,19 +73,15 @@ TEST(batch_norm_op, gpu) {
test_batchnorm_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(batch_norm_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_batchnorm_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(batch_norm);
USE_CPU_ANAKIN_CONVERTER(batch_norm);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(batch_norm);
#endif
......@@ -53,19 +53,15 @@ TEST(concat_op, gpu) {
test_concat_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(concat_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_concat_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(concat);
USE_CPU_ANAKIN_CONVERTER(concat);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(concat);
#endif
......@@ -60,20 +60,16 @@ TEST(conv2d_op, gpu) {
test_conv2d_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(conv2d_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_conv2d_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(conv2d);
USE_CPU_ANAKIN_CONVERTER(conv2d);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(conv2d);
#endif
......@@ -54,19 +54,16 @@ TEST(dropout_op, gpu) {
test_dropout_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(dropout_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_dropout_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(dropout);
USE_CPU_ANAKIN_CONVERTER(dropout);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(dropout);
#endif
......@@ -59,29 +59,23 @@ TEST(elementwise_op, native_mul_gpu) {
test_elementwise_op<::anakin::saber::NV>("elementwise_mul", ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(elementwise_op, native_add_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_elementwise_op<::anakin::saber::X86>("elementwise_add", ctx, false);
}
TEST(elementwise_op, native_mul_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_elementwise_op<::anakin::saber::X86>("elementwise_mul", ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(elementwise_add);
USE_OP(elementwise_mul);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(elementwise_add);
USE_ANAKIN_CONVERTER(elementwise_mul);
#endif
USE_CPU_ANAKIN_CONVERTER(elementwise_add);
USE_CPU_ANAKIN_CONVERTER(elementwise_mul);
......@@ -49,19 +49,16 @@ TEST(mul_op, gpu) {
test_mul_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(mul_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_mul_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(mul);
USE_CPU_ANAKIN_CONVERTER(fc);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(fc);
#endif
......@@ -48,20 +48,17 @@ TEST(flatten_op, gpu) {
test_flatten_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(flatten_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_flatten_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_OP_ITSELF(flatten);
USE_CPU_ANAKIN_CONVERTER(flatten);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(flatten);
#endif
......@@ -87,7 +87,7 @@ TEST(Pool2dOpConverter, avg_ceil_test) {
test_pool2d<::anakin::saber::NV>(ctx, true, false, true, "avg");
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(Pool2dOpConverter, normal_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
......@@ -110,14 +110,10 @@ TEST(Pool2dOpConverter, avg_ceil_test_cpu) {
platform::CPUDeviceContext ctx(cpu_place);
test_pool2d<::anakin::saber::X86>(ctx, false, false, true, "avg");
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(pool2d);
USE_CPU_ANAKIN_CONVERTER(pool2d);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(pool2d);
#endif
......@@ -66,10 +66,5 @@ TEST(leaky_relu_op, gpu) {
USE_OP(relu);
USE_OP(leaky_relu);
USE_CPU_ANAKIN_CONVERTER(relu);
USE_CPU_ANAKIN_CONVERTER(leaky_relu);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(relu);
USE_ANAKIN_CONVERTER(leaky_relu);
#endif
......@@ -81,7 +81,7 @@ TEST(reshape2_op, gpu) {
test_reshape2_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(reshape1_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
......@@ -93,14 +93,10 @@ TEST(reshape2_op, cpu) {
platform::CPUDeviceContext ctx(cpu_place);
test_reshape2_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_CPU_ANAKIN_CONVERTER(reshape);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(reshape);
#endif
......@@ -48,20 +48,16 @@ TEST(softmax_op, gpu) {
test_softmax_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(relu_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_softmax_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(softmax);
USE_CPU_ANAKIN_CONVERTER(softmax);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(softmax);
#endif
......@@ -92,7 +92,7 @@ TEST(split_op, test_different_shape_axis3_batch1) {
platform::CUDADeviceContext ctx(gpu_place);
AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 3}, {2, 1});
}
#ifdef ANAKIN_X86_PLACE
TEST(split_op, test_different_shape_axis1_batch1_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
......@@ -110,13 +110,10 @@ TEST(split_op, test_different_shape_axis3_batch1_cpu) {
platform::CPUDeviceContext ctx(cpu_place);
AnakinSliceTest<::anakin::saber::X86, 3>(ctx, false, {1, 3, 2, 4}, {2, 2});
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(split);
USE_CPU_ANAKIN_CONVERTER(split);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(split);
#endif
......@@ -49,19 +49,16 @@ TEST(sum_op, gpu) {
test_sum_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(sum_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_sum_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(sum);
USE_CPU_ANAKIN_CONVERTER(sum);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(sum);
#endif
......@@ -79,7 +79,7 @@ TEST(transpose2_op, gpu) {
test_transpose2_op<::anakin::saber::NV>(ctx, true);
}
#endif
#ifdef ANAKIN_X86_PLACE
TEST(transpose1_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
......@@ -91,13 +91,10 @@ TEST(transpose2_op, cpu) {
platform::CPUDeviceContext ctx(cpu_place);
test_transpose2_op<::anakin::saber::X86>(ctx, false);
}
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(transpose);
USE_CPU_ANAKIN_CONVERTER(transpose);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(transpose);
#endif
......@@ -33,7 +33,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
using anakin::Precision;
using anakin::saber::X86;
namespace paddle {
namespace inference {
......@@ -215,13 +214,14 @@ class AnakinConvertValidation {
template class AnakinConvertValidation<::anakin::saber::NV,
::anakin::Precision::FP32>;
template class AnakinConvertValidation<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinConvertValidation<::anakin::saber::NV,
::anakin::Precision::INT8>;
#ifdef ANAKIN_X86_PLACE
template class AnakinConvertValidation<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinConvertValidation<::anakin::saber::X86,
::anakin::Precision::INT8>;
#endif
} // namespace anakin
} // namespace inference
} // namespace paddle
......@@ -46,10 +46,9 @@ AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(
max_input_shape_(max_input_shape),
program_inputs_(program_inputs),
auto_config_layout_(auto_config_layout) {
std::call_once(init_anakin_, [this]() {
::anakin::TargetWrapper<TargetT>::set_device(device_);
::anakin::Env<TargetT>::env_init();
});
::anakin::TargetWrapper<TargetT>::set_device(device_);
std::call_once(init_anakin_,
[this]() { ::anakin::Env<TargetT>::env_init(); });
graph_.reset(new AnakinGraphT<TargetT, PrecisionType>());
net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary));
}
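The constructor above now binds the device on every engine construction and keeps only the process-wide environment setup behind std::call_once. A minimal standalone sketch of that pattern, with generic names rather than the Anakin API:

#include <mutex>

class Engine {
 public:
  explicit Engine(int device) : device_(device) {
    BindDevice(device_);  // runs for every engine instance, so each creating thread binds its device
    std::call_once(env_once_, [] { InitEnv(); });  // global runtime init happens exactly once per process
  }

 private:
  static void BindDevice(int /*id*/) { /* e.g. select the accelerator for the calling thread */ }
  static void InitEnv() { /* one-time global setup */ }
  int device_;
  static std::once_flag env_once_;
};
std::once_flag Engine::env_once_;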
......@@ -194,14 +193,14 @@ template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>;
template class AnakinEngineManager<::anakin::saber::NV,
::anakin::Precision::INT8>;
#endif
#ifdef ANAKIN_X86_PLACE
template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
template class AnakinEngineManager<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>;
template class AnakinEngineManager<::anakin::saber::X86,
::anakin::Precision::INT8>;
#endif
// template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
} // namespace anakin
} // namespace inference
......
......@@ -24,7 +24,9 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
#ifdef EXIT // NOLINT
#undef EXIT // NOLINT
#endif // NOLINT
#include "framework/core/net/net.h"
#include "framework/core/types.h"
#include "framework/graph/graph.h"
......
......@@ -22,7 +22,6 @@ limitations under the License. */
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
......
......@@ -15,7 +15,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif()
if (ANAKIN_FOUND)
if (ANAKIN_SUBGRAPH)
cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller)
set(analysis_deps ${analysis_deps}
......
......@@ -226,7 +226,6 @@ void AnakinSubgraphPass::CreateAnakinEngine(
auto max_batch_size = Get<int>("max_batch_size");
auto max_input_shape =
Get<std::map<std::string, std::vector<int>>>("max_input_shape");
bool auto_config_layout = Get<bool>("auto_config_layout");
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
inference::Singleton<
......@@ -235,11 +234,14 @@ void AnakinSubgraphPass::CreateAnakinEngine(
max_input_shape, program_inputs, false, engine_key);
#endif
} else {
#ifdef ANAKIN_X86_PLACE
bool auto_config_layout = Get<bool>("auto_config_layout");
inference::Singleton<
anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global()
.Create(true, Get<int>("gpu_device_id"), max_batch_size,
max_input_shape, program_inputs, auto_config_layout,
engine_key);
#endif
}
auto *scope = param_scope();
......@@ -258,6 +260,7 @@ void AnakinSubgraphPass::CreateAnakinEngine(
param_set, output_mapping, anakin_engine);
#endif
} else {
#ifdef ANAKIN_X86_PLACE
auto *anakin_engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::X86, PrecisionT>>::Global()
......@@ -268,6 +271,7 @@ void AnakinSubgraphPass::CreateAnakinEngine(
&block_desc_temp, scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, anakin_engine);
#endif
}
}
......
......@@ -27,7 +27,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
endif()
if (ANAKIN_FOUND)
if (ANAKIN_SUBGRAPH)
set(inference_deps ${inference_deps} anakin_op_converter anakin_engine)
endif()
......@@ -38,9 +38,9 @@ endif()
add_subdirectory(details)
if(WITH_MKLDNN)
set(mkldnn_quantizer_src mkldnn_quantizer.cc)
set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
set(mkldnn_quantizer_src mkldnn_quantizer.cc)
set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
endif()
cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder)
......@@ -56,9 +56,7 @@ cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
paddle_pass_builder zero_copy_tensor
reset_tensor_array)
cc_test(test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api)
cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)
if(WITH_TESTING)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
......@@ -69,13 +67,21 @@ endif()
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy device_context)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy device_context)
function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction()
anakin_target(inference_anakin_api)
anakin_target(inference_anakin_api_shared)
if(ANAKIN_FOUND)
if (ANAKIN_MLU AND NOT WITH_GPU AND NOT ANAKIN_X86)
message(STATUS "Compile with anakin mlu place.")
add_definitions(-DANAKIN_MLU_PLACE)
elseif(ANAKIN_X86)
message(STATUS "Compile with anakin x86 place.")
add_definitions(-DANAKIN_X86_PLACE)
endif()
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction()
anakin_target(inference_anakin_api)
anakin_target(inference_anakin_api_shared)
endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -19,6 +19,7 @@ limitations under the License. */
#pragma once
#include <memory>
#include <vector>
#include "framework/core/net/net.h"
......@@ -30,13 +31,18 @@ limitations under the License. */
namespace paddle {
using contrib::AnakinConfig;
using anakin::Precision;
using anakin::OpRunType;
template <typename Target>
template <typename T, Precision P, OpRunType R>
class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public:
PaddleInferenceAnakinPredictor() {}
PaddleInferenceAnakinPredictor() = default;
explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config);
explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config)
: config_(config) {
this->InitPredictor();
}
// NOTE Unlike the native engine, the buffers of anakin engine's output_data
// should be allocated first.
......@@ -45,21 +51,45 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
int batch_size = -1) override;
std::unique_ptr<PaddlePredictor> Clone() override;
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
get_executer();
virtual bool ResetConfig(const AnakinConfig& config);
virtual anakin::Net<T, P, R>& ResetExecuter(
std::shared_ptr<anakin::graph::Graph<T, P>> graph_p);
void InitPredictor();
~PaddleInferenceAnakinPredictor() override;
private:
bool Init(const AnakinConfig& config);
anakin::graph::Graph<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
graph_;
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
executor_p_{nullptr};
static std::mutex mutex_;
AnakinConfig config_;
int max_batch_size_{0};
std::shared_ptr<anakin::Context<T>> ctx_p_;
std::shared_ptr<anakin::graph::Graph<T, P>> graph_p_;
anakin::Net<T, P, R>* executor_p_{nullptr};
void InitEnv();
void InitGraph();
virtual void OptimizeGraph();
virtual void InitNet();
virtual void SetContext();
virtual void Predict();
private:
bool RunImpl(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data);
static std::once_flag init_anakin_;
};
#ifdef ANAKIN_MLU_PLACE
template <Precision P, OpRunType R>
class PaddleInferenceAnakinMLUPredictor final
: public PaddleInferenceAnakinPredictor<anakin::MLU, P, R> {
public:
explicit PaddleInferenceAnakinMLUPredictor(const AnakinConfig& config) {
this->ResetConfig(config);
this->InitPredictor();
}
void SetContext() override;
void OptimizeGraph() override;
void InitNet() override;
void Predict() override;
};
#endif
} // namespace paddle
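A rough sketch of using the refactored predictor template directly; CreatePaddlePredictor normally hides this, and anakin::OpRunType::ASYNC is an assumed enumerator (Anakin's usual default run type), so treat the type arguments as illustrative rather than the factory's actual wiring.

#ifdef ANAKIN_MLU_PLACE
void BuildMluPredictorSketch() {
  paddle::contrib::AnakinConfig cfg;
  cfg.target_type = paddle::contrib::AnakinConfig::MLU;
  cfg.model_file = "/path/to/model";  // placeholder path
  // The constructor calls ResetConfig(cfg) and then InitPredictor().
  paddle::PaddleInferenceAnakinMLUPredictor<anakin::Precision::FP32,
                                            anakin::OpRunType::ASYNC>
      predictor(cfg);
}
#endif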
......@@ -64,9 +64,12 @@ static int GetUniqueId() {
}
static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) {
std::vector<std::string> *pieces, bool ignore_null = true) {
pieces->clear();
if (str.empty()) {
if (!ignore_null) {
pieces->push_back(str);
}
return;
}
size_t pos = 0;
......
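A small usage sketch of the extended split helper, covering only the empty-string behavior visible in the hunk above: the default ignore_null=true drops an empty input, while passing false keeps it as a single empty piece.

#include <cassert>
#include <string>
#include <vector>
// Assumes the paddle::inference::split helper declared above.
void SplitEmptyStringExample() {
  std::vector<std::string> pieces;
  paddle::inference::split("", ';', &pieces);         // ignore_null defaults to true
  assert(pieces.empty());
  paddle::inference::split("", ';', &pieces, false);  // keep the empty input as one piece
  assert(pieces.size() == 1 && pieces[0].empty());
}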
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,6 +14,7 @@
#pragma once
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>
......@@ -24,11 +25,22 @@ namespace paddle {
namespace contrib {
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
enum TargetType { NVGPU = 0, X86 };
int device;
enum TargetType { NVGPU = 0, X86, MLU };
int device_id{0};
std::string model_file;
int max_batch_size{-1};
std::map<std::string, std::vector<int>> init_inputs_shape;
int init_batch_size{-1};
bool re_allocable{true};
int max_stream{4};
int data_stream_id{0};
int compute_stream_id{0};
TargetType target_type;
#ifdef ANAKIN_MLU_PLACE
int model_parallel{8};
int data_parallel{1};
bool op_fuse{false};
bool sparse{false};
#endif
};
} // namespace contrib
......
......@@ -28,6 +28,6 @@ limitations under the License. */
#include "paddle_analysis_config.h" // NOLINT
#include "paddle_api.h" // NOLINT
#ifdef WITH_ANAKIN
#if (defined WITH_ANAKIN) || (defined PADDLE_WITH_ANAKIN)
#include "paddle_anakin_config.h" // NOLINT
#endif
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(model, "", "Directory of the inference model.");
namespace paddle {
contrib::AnakinConfig Config() {
// Set the initial input shapes, which determine the memory allocation.
std::map<std::string, std::vector<int>> init_inputs_shape;
init_inputs_shape["input_0"] = std::vector<int>({1, 3, 112, 112});
contrib::AnakinConfig config;
config.target_type = contrib::AnakinConfig::MLU;
config.model_file = FLAGS_model;
config.init_inputs_shape = init_inputs_shape;
// Determine the device execution context.
config.device_id = 0;
config.data_stream_id = 0;
config.compute_stream_id = 0;
// Set re_allocable and op_fuse TRUE.
config.re_allocable = true;
config.op_fuse = true;
return config;
}
void single_test() {
// 1. Defining basic data structures.
auto config = paddle::Config();
auto predictor =
paddle::CreatePaddlePredictor<paddle::contrib::AnakinConfig,
paddle::PaddleEngineKind::kAnakin>(config);
// 2. Define the data structure of the predictor inputs and outputs.
std::vector<paddle::PaddleTensor> input_tensors;
std::vector<paddle::PaddleTensor> output_tensors;
// 3. Define and fill the inputs tensor.
int num = 1;
int channel = 3;
int height = 112;
int width = 112;
std::vector<float> input(num * channel * height * width, 1);
std::vector<std::vector<float>> inputs({input});
const std::vector<std::string> input_names{"input_0"};
for (auto& name : input_names) {
paddle::PaddleTensor tensor;
tensor.name = name;
tensor.dtype = PaddleDType::FLOAT32;
input_tensors.push_back(tensor);
}
for (size_t j = 0; j < input_tensors.size(); j++) {
input_tensors[j].data =
paddle::PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float));
// The shape of each execution can be changed.
input_tensors[j].shape = std::vector<int>({num, channel, height, width});
}
// 4. Set the output placeholder of predictor.
PaddleTensor predict_out, score_out;
predict_out.name = "landmark_predict_out";
score_out.name = "landmark_score_out";
output_tensors.push_back(predict_out);
output_tensors.push_back(score_out);
// 5. Execute the prediction.
predictor->Run(input_tensors, &output_tensors);
// 6. Retrieve the output data.
for (auto out : output_tensors) {
float* data_o = static_cast<float*>(out.data.data());
LOG(INFO) << out.name << " size = " << out.data.length() / sizeof(float);
}
}
} // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
paddle::single_test();
return 0;
}
......@@ -27,8 +27,8 @@ contrib::AnakinConfig GetConfig() {
// using AnakinConfig::X86 if you need to use cpu to do inference
config.target_type = contrib::AnakinConfig::NVGPU;
config.model_file = FLAGS_model;
config.device = 0;
config.max_batch_size = 1;
config.device_id = 0;
config.init_batch_size = 1;
return config;
}
......
......@@ -100,8 +100,8 @@ contrib::AnakinConfig GetConfig() {
// using AnakinConfig::X86 if you need to use cpu to do inference
config.target_type = contrib::AnakinConfig::X86;
config.model_file = FLAGS_model;
config.device = 0;
config.max_batch_size = 1000; // the max number of token
config.device_id = 0;
config.init_batch_size = 1000; // the max number of token
return config;
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <cmath>
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#define BUFFER_SIZE (10000)
#define COMPARE_OUTPUTS (1)
#define PRINT_INPUTS (0)
DEFINE_string(model, "", "Directory of the inference model.");
DEFINE_string(datapath, "", "Path of the dataset.");
DEFINE_string(truthpath, "", "Path of the ground-truth file.");
DEFINE_int32(batch_size, 1, "Batch size per execution.");
DEFINE_int32(repeats, 1, "Number of iterations.");
DEFINE_int32(
start_line, 0,
"The starting line of the text file read (this line will be read).");
DEFINE_int32(end_line, 1000000,
"The ending line of the text file read (this line will be read).");
DEFINE_int32(init_batch_size, 40,
"Max batch size for Anakin memory allocation.");
DEFINE_int32(threads_num, 2, "Threads num for Anakin.");
class Data {
public:
Data(std::string file_name, size_t batch_size, size_t start = 0,
size_t end = 1000000)
: _batch_size(batch_size), _total_length(0), _inputs_size(6) {
_file.open(file_name);
_file.seekg(0, _file.end);
_total_length = _file.tellg();
_file.seekg(0, _file.beg);
read_file_to_vec(start, end);
reset_current_line();
}
void reset_current_line();
const std::vector<std::string>& get_lines();
void read_file_to_vec(const size_t start, const size_t end);
int get_next_batches(std::vector<std::vector<float>>* inputs,
std::vector<std::vector<size_t>>* seq_offsets);
private:
std::fstream _file;
int _batch_size;
size_t _total_length;
size_t _inputs_size;
std::vector<std::string> _lines;
size_t _current_line;
};
void Data::read_file_to_vec(const size_t start, const size_t end) {
std::string line;
size_t count = 0;
_lines.clear();
while (std::getline(_file, line)) {
if (count >= start && count <= end) {
_lines.push_back(line);
}
count++;
}
}
const std::vector<std::string>& Data::get_lines() { return _lines; }
void Data::reset_current_line() { _current_line = 0; }
int Data::get_next_batches(std::vector<std::vector<float>>* data,
std::vector<std::vector<size_t>>* offsets) {
data->clear();
offsets->clear();
data->resize(_inputs_size);
offsets->resize(_inputs_size);
for (auto& offset : *offsets) {
offset.push_back(0);
}
int seq_num = -1;
int pre_query_index = -1;
while (_current_line < _lines.size()) {
int cur_query_index = -1;
std::vector<std::string> line;
paddle::inference::split(_lines[_current_line], ';', &line);
for (size_t i = 0; i < line.size(); i++) {
std::vector<float> float_v;
paddle::inference::split_to_float(line[i], ' ', &float_v);
if (i == 0) {
cur_query_index = float_v[0];
if (pre_query_index != -1 && cur_query_index != pre_query_index) {
return seq_num;
}
seq_num++;
_current_line++;
} else {
if (float_v.size() == 0) {
float_v.push_back(-1);
}
(*data)[i - 1].insert((*data)[i - 1].end(), float_v.begin(),
float_v.end());
(*offsets)[i - 1].push_back((*offsets)[i - 1][seq_num] +
float_v.size());
}
}
if (seq_num + 1 >= _batch_size) {
return seq_num;
} else {
pre_query_index = cur_query_index;
}
}
return seq_num;
}
namespace paddle {
contrib::AnakinConfig GetConfig() {
contrib::AnakinConfig config;
std::map<std::string, std::vector<int>> init_inputs_shape;
init_inputs_shape["q_basic"] = std::vector<int>({1000, 1, 1, 1});
init_inputs_shape["q_bigram0"] = std::vector<int>({1000, 1, 1, 1});
init_inputs_shape["pt_basic"] = std::vector<int>({2000, 1, 1, 1});
init_inputs_shape["pa_basic"] = std::vector<int>({4000, 1, 1, 1});
init_inputs_shape["pa_bigram0"] = std::vector<int>({4000, 1, 1, 1});
init_inputs_shape["pt_bigram0"] = std::vector<int>({2000, 1, 1, 1});
// use AnakinConfig::X86 if you need to run inference on the CPU
config.target_type = contrib::AnakinConfig::NVGPU;
config.model_file = FLAGS_model;
config.device_id = 0;
config.init_batch_size = FLAGS_init_batch_size;
config.init_inputs_shape = init_inputs_shape;
config.re_allocable = false;
return config;
}
void single_test(PaddlePredictor* predictor_master) {
auto predictor = predictor_master->Clone();
Data data(FLAGS_datapath, FLAGS_batch_size, FLAGS_start_line, FLAGS_end_line);
std::vector<std::vector<float>> inputs;
std::vector<std::vector<size_t>> seq_offsets;
std::vector<float> compare_outputs;
const std::vector<std::string> input_names{"q_basic", "q_bigram0",
"pt_basic", "pt_bigram0",
"pa_basic", "pa_bigram0"};
std::vector<PaddleTensor> input_tensors;
std::vector<PaddleTensor> output_tensors;
for (auto& name : input_names) {
PaddleTensor tensor;
tensor.name = name;
tensor.dtype = PaddleDType::FLOAT32;
input_tensors.push_back(tensor);
}
PaddleTensor tensor_out;
tensor_out.name = "save_infer_model/scale_0";
tensor_out.shape = std::vector<int>({});
tensor_out.data = PaddleBuf();
tensor_out.dtype = PaddleDType::FLOAT32;
output_tensors.push_back(tensor_out);
inference::Timer timer;
for (int i = 0; i < FLAGS_repeats; i++) {
data.reset_current_line();
size_t count = 0;
float time_sum = 0;
while (data.get_next_batches(&inputs, &seq_offsets) >= 0) {
#if PRINT_INPUTS
for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "data " << i;
for (size_t j = 0; j < inputs[i].size(); j++) {
LOG(INFO) << j << ": " << inputs[i][j];
}
for (auto j : seq_offsets[i]) {
LOG(INFO) << "offsets: " << i << ": " << j;
}
}
#endif
for (size_t j = 0; j < input_tensors.size(); j++) {
input_tensors[j].data =
PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float));
input_tensors[j].lod =
std::vector<std::vector<size_t>>({seq_offsets[j]});
input_tensors[j].shape =
std::vector<int>({static_cast<int>(inputs[j].size()), 1, 1, 1});
}
timer.tic();
predictor->Run(input_tensors, &output_tensors);
float time = timer.toc();
#if COMPARE_OUTPUTS
float* data_o = static_cast<float*>(output_tensors[0].data.data());
LOG(INFO) << "outputs[0].data.size() = "
<< output_tensors[0].data.length() / sizeof(float);
size_t sum = 1;
for_each(output_tensors[0].shape.begin(), output_tensors[0].shape.end(),
[&](int n) { sum *= n; });
for (size_t j = 0; j < sum; ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j];
compare_outputs.push_back(data_o[j]);
}
#endif
LOG(INFO) << "Single Time: " << time;
count++;
if (count > 10) {
time_sum += timer.toc();
}
}
inference::PrintTime(FLAGS_batch_size, FLAGS_repeats, 1, 0,
time_sum / (count - 10));
#if COMPARE_OUTPUTS
Data data(FLAGS_truthpath, 1);
const std::vector<std::string> truth_vals = data.get_lines();
for (size_t j = 0; j < truth_vals.size(); j++) {
float truth = std::atof(truth_vals[j].c_str());
float compa = compare_outputs[j];
float diff = std::abs(truth - compa);
LOG(INFO) << "[DIFF " << j << " ] " << diff;
if (diff > 0.0001) {
LOG(FATAL) << "The result is wrong!";
}
}
LOG(INFO) << "The result is correct!";
#endif
}
}
} // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
std::vector<std::thread> threads;
auto config = paddle::GetConfig();
config.data_stream_id = 0;
config.compute_stream_id = 0;
std::unique_ptr<paddle::PaddlePredictor> predictor_master =
paddle::CreatePaddlePredictor<paddle::contrib::AnakinConfig,
paddle::PaddleEngineKind::kAnakin>(config);
for (int i = 0; i < FLAGS_threads_num; i++) {
threads.push_back(std::thread(paddle::single_test, predictor_master.get()));
}
for (auto& t : threads) {
t.join();
}
return 0;
}
......@@ -33,7 +33,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
if (ANAKIN_FOUND)
if (ANAKIN_SUBGRAPH)
add_subdirectory(anakin)
endif()
......
......@@ -119,11 +119,15 @@ class AnakinEngineOp : public framework::OperatorBase {
engine->Execute(inputs, outputs, stream);
#endif
} else {
#ifdef ANAKIN_X86_PLACE
auto *engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::X86, PrecisionT>>::Global()
.Get(engine_key_);
engine->Execute(inputs, outputs);
#else
LOG(FATAL) << "Unknown Platform for AnakinEngine!";
#endif
}
}
};
......