From e14ab180fe76b97aa33c0089f98d1cfa771905e9 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 11 Apr 2019 17:07:32 +0000 Subject: [PATCH] Cherry-pick from 1662, 16797.. : add anakin int8 support --- paddle/fluid/framework/ir/fc_fuse_pass.cc | 3 +- .../framework/ir/graph_pattern_detector.cc | 25 ++-- .../framework/ir/graph_pattern_detector.h | 3 +- .../ir/quant_conv2d_dequant_fuse_pass.cc | 28 ++-- .../inference/anakin/convert/CMakeLists.txt | 7 +- .../inference/anakin/convert/activation.cc | 49 ++++-- .../inference/anakin/convert/activation.h | 17 ++- .../anakin/convert/affine_channel.cc | 79 +++------- .../inference/anakin/convert/affine_channel.h | 4 +- .../inference/anakin/convert/batch_norm.cc | 106 +++++-------- .../inference/anakin/convert/batch_norm.h | 4 +- .../fluid/inference/anakin/convert/concat.cc | 25 +++- .../fluid/inference/anakin/convert/concat.h | 4 +- .../fluid/inference/anakin/convert/conv2d.cc | 79 +++++++--- .../fluid/inference/anakin/convert/conv2d.h | 4 +- .../inference/anakin/convert/conv2d_fusion.cc | 111 +++++++------- .../inference/anakin/convert/conv2d_fusion.h | 4 +- .../anakin/convert/density_prior_box.cc | 31 ++-- .../anakin/convert/density_prior_box.h | 5 +- .../inference/anakin/convert/detection_out.cc | 25 +++- .../inference/anakin/convert/detection_out.h | 4 +- .../fluid/inference/anakin/convert/dropout.cc | 37 +++-- .../fluid/inference/anakin/convert/dropout.h | 4 +- .../inference/anakin/convert/elementwise.cc | 46 ++++-- .../inference/anakin/convert/elementwise.h | 10 +- paddle/fluid/inference/anakin/convert/fc.cc | 140 +++++++++++------- paddle/fluid/inference/anakin/convert/fc.h | 12 +- .../fluid/inference/anakin/convert/flatten.cc | 25 +++- .../fluid/inference/anakin/convert/flatten.h | 4 +- .../fluid/inference/anakin/convert/helper.cc | 32 ++++ .../fluid/inference/anakin/convert/helper.h | 88 +++++++++++ .../inference/anakin/convert/im2sequence.cc | 21 ++- .../inference/anakin/convert/im2sequence.h | 4 +- .../inference/anakin/convert/op_converter.h | 81 +++++++--- .../fluid/inference/anakin/convert/pool2d.cc | 25 +++- .../fluid/inference/anakin/convert/pool2d.h | 4 +- paddle/fluid/inference/anakin/convert/relu.cc | 45 ++++-- paddle/fluid/inference/anakin/convert/relu.h | 8 +- .../fluid/inference/anakin/convert/reshape.cc | 24 ++- .../fluid/inference/anakin/convert/reshape.h | 4 +- .../inference/anakin/convert/roi_align.cc | 30 ++-- .../inference/anakin/convert/roi_align.h | 4 +- .../fluid/inference/anakin/convert/scale.cc | 24 ++- paddle/fluid/inference/anakin/convert/scale.h | 4 +- .../fluid/inference/anakin/convert/softmax.cc | 25 +++- .../fluid/inference/anakin/convert/softmax.h | 4 +- .../fluid/inference/anakin/convert/split.cc | 23 ++- paddle/fluid/inference/anakin/convert/split.h | 4 +- paddle/fluid/inference/anakin/convert/sum.cc | 28 +++- paddle/fluid/inference/anakin/convert/sum.h | 4 +- .../anakin/convert/test_activation_op.cc | 6 +- .../anakin/convert/test_affine_channel_op.cc | 4 +- .../anakin/convert/test_batch_norm_op.cc | 4 +- .../anakin/convert/test_concat_op.cc | 4 +- .../anakin/convert/test_conv2d_op.cc | 4 +- .../anakin/convert/test_dropout_op.cc | 4 +- .../anakin/convert/test_elementwise_op.cc | 4 +- .../inference/anakin/convert/test_fc_op.cc | 4 +- .../anakin/convert/test_flatten_op.cc | 4 +- .../anakin/convert/test_pool2d_op.cc | 4 +- .../inference/anakin/convert/test_relu_op.cc | 18 +-- .../anakin/convert/test_reshape_op.cc | 8 +- .../anakin/convert/test_softmax_op.cc | 4 +- .../inference/anakin/convert/test_split_op.cc | 4 +- 
.../inference/anakin/convert/test_sum_op.cc | 4 +- .../anakin/convert/test_transpose_op.cc | 8 +- .../inference/anakin/convert/transpose.cc | 20 ++- .../inference/anakin/convert/transpose.h | 4 +- .../inference/anakin/convert/ut_helper.h | 21 ++- paddle/fluid/inference/anakin/engine.cc | 13 +- paddle/fluid/inference/anakin/engine.h | 13 +- paddle/fluid/inference/analysis/argument.h | 6 + .../inference/analysis/ir_pass_manager.cc | 5 + .../ir_passes/anakin_subgraph_pass.cc | 54 +++++-- .../analysis/ir_passes/anakin_subgraph_pass.h | 8 + paddle/fluid/inference/api/analysis_config.cc | 15 +- .../fluid/inference/api/analysis_predictor.cc | 3 + .../inference/api/paddle_analysis_config.h | 7 +- .../inference/api/paddle_pass_builder.cc | 16 +- .../fluid/operators/anakin/anakin_engine_op.h | 28 +++- paddle/fluid/pybind/inference_api.cc | 10 +- 81 files changed, 1103 insertions(+), 589 deletions(-) create mode 100644 paddle/fluid/inference/anakin/convert/helper.cc create mode 100644 paddle/fluid/inference/anakin/convert/helper.h diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index ca008763bff..a5488eaa1b6 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -48,8 +48,9 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); + auto base_op_desc = *mul->Op()->Proto(); // Create an FC Node. - OpDesc desc; + OpDesc desc(base_op_desc, nullptr); std::string fc_x_in = subgraph.at(x)->Name(); std::string fc_Y_in = w->Name(); std::string fc_bias_in = fc_bias->Name(); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 8468f9ccc12..77f50e914b6 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1640,7 +1640,8 @@ PDNode *patterns::FillConstantElementWiseMulFuse::operator()( void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, const std::string &op_type, const std::string &weight_name, - int times) { + int times, + const std::string &quant_type) { const int kNumFields = 5; const int kQuantizedWeightOffset = 0; const int kQuantizedOpOffset = 1; @@ -1648,24 +1649,22 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, const int kDequantOpOffset = 3; const int kDequantOpOutOffset = 4; // the quant op always be one. 
- auto quant_op_in_scale = - pattern->NewNode(GetNodeName("quant_op_in_scale")) - ->assert_is_op_input("fake_quantize_range_abs_max", "InScale") - ->AsInput(); - auto quant_op = pattern->NewNode(GetNodeName("quant_op")) - ->assert_is_op("fake_quantize_range_abs_max"); + auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale")) + ->assert_is_op_input(quant_type, "InScale") + ->AsInput(); + auto quant_op = + pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type); auto quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) - ->assert_is_op_output("fake_quantize_range_abs_max", "OutScale") + ->assert_is_op_output(quant_type, "OutScale") ->assert_is_op_input("fake_dequantize_max_abs", "Scale") ->AsIntermediate(); - auto quant_op_out = - pattern->NewNode(GetNodeName("quant_op_out")) - ->assert_is_op_output("fake_quantize_range_abs_max", "Out") - ->assert_is_op_input(op_type) - ->AsIntermediate(); + auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out")) + ->assert_is_op_output(quant_type, "Out") + ->assert_is_op_input(op_type) + ->AsIntermediate(); // there are 'times' quantized and dequant op std::vector nodes; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index a5ac3a0c373..525987e0072 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -880,7 +880,8 @@ struct QuantDequantOpFuse : public PatternBase { : PatternBase(pattern, name_scope, "quant_dequant_fuse") {} void operator()(PDNode* quant_op_input, const std::string& op_name, - const std::string& weight_name, int times = 1); + const std::string& weight_name, int times, + const std::string& quant_type); std::string GetNodeName(const std::string& op_type) { return PDNodeName(name_scope_, repr_, id_, op_type); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 7cab9c353d3..017e3ef234c 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -25,7 +25,8 @@ namespace framework { namespace ir { void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, - std::string op_type) { + const std::string& op_type, + const std::string& quant_type) { const std::string pattern_name = "quant_dequant_fuse"; // FusePassBase::Init(pattern_name, graph); const int kNumFields = 5; @@ -38,7 +39,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() ->NewNode("x") - ->assert_is_op_input("fake_quantize_range_abs_max", "X") + ->assert_is_op_input(quant_type, "X") ->AsInput(); std::string quantized_op_type = ""; @@ -46,6 +47,9 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, if (op_type == "conv2d") { quantized_op_type = "conv2d"; weight_name = "Filter"; + } else if (op_type == "depthwise_conv2d") { + quantized_op_type = "depthwise_conv2d"; + weight_name = "Filter"; } else if (op_type == "conv2d_fusion") { quantized_op_type = "conv2d_fusion"; weight_name = "Filter"; @@ -62,7 +66,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, } patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name); - pattern(x, quantized_op_type, weight_name, times); + pattern(x, quantized_op_type, weight_name, times, quant_type); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, 
Graph* g) { @@ -103,7 +107,6 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, std::unordered_set delete_nodes; for (int i = 0; i < times; i++) { - // max_range = (range * range) / weight_scale float max_range = boost::get( nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range")); float weight_scale = (range * range) / max_range; @@ -118,7 +121,8 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, new_op_desc.SetType(quantized_op_type); if (quantized_op_type == "conv2d" || - quantized_op_type == "conv2d_fusion") { + quantized_op_type == "conv2d_fusion" || + quantized_op_type == "depthwise_conv2d") { new_op_desc.SetInput("Input", {new_input}); new_op_desc.SetOutput("Output", {new_output}); } else if (quantized_op_type == "fc") { @@ -156,11 +160,17 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "quant_dequant_fuse"; FusePassBase::Init(pattern_name, graph); - std::unordered_set quantized_op_types = {"conv2d", "mul"}; + std::unordered_set quant_types = { + "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"}; + + std::unordered_set quantized_op_types = {"conv2d", "mul", + "depthwise_conv2d"}; auto* scope = param_scope(); - for (auto& op_type : quantized_op_types) { - for (int i = 1; i <= 6; i++) { - RunQuantDequant(graph, scope, i, op_type); + for (auto& quant_type : quant_types) { + for (auto& op_type : quantized_op_types) { + for (int i = 6; i >= 1; i--) { + RunQuantDequant(graph, scope, i, op_type, quant_type); + } } } } diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt index 7cc75de8ee6..6546d3b855f 100644 --- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -1,4 +1,9 @@ -cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc roi_align.cc DEPS anakin_engine framework_proto scope op_registry) +cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc +elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc +batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc +detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc +roi_align.cc helper.cc DEPS anakin_engine framework_proto scope op_registry +gtest) cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL) cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL) diff --git a/paddle/fluid/inference/anakin/convert/activation.cc b/paddle/fluid/inference/anakin/convert/activation.cc index 11f92c95217..6e52357483d 100644 --- a/paddle/fluid/inference/anakin/convert/activation.cc +++ b/paddle/fluid/inference/anakin/convert/activation.cc @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -ActivationOpConverter::ActivationOpConverter( +template +ActivationOpConverter::ActivationOpConverter( const std::string &op_type) : op_type_(op_type) { auto it = anakin_op_types_.find(op_type_); @@ -30,8 +30,8 @@ ActivationOpConverter::ActivationOpConverter( anakin_op_type_ = it->second; } -template -void ActivationOpConverter::operator()( +template +void ActivationOpConverter::operator()( 
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -50,11 +50,40 @@ void ActivationOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(sigmoid, - SigmoidOpConverter<::anakin::saber::NV>); -REGISTER_CUDA_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter<::anakin::saber::NV>); +using sigmoid_nv_fp32 = + ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using sigmoid_nv_int8 = + ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +using tanh_nv_fp32 = + ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using tanh_nv_int8 = + ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; + +REGISTER_CUDA_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_int8); +REGISTER_CUDA_ANAKIN_OP_CONVERTER(tanh, tanh_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(sigmoid, - SigmoidOpConverter<::anakin::saber::X86>); -REGISTER_CPU_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter<::anakin::saber::X86>); +using sigmoid_cpu_fp32 = + ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using sigmoid_cpu_int8 = + ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +using tanh_cpu_fp32 = + ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using tanh_cpu_int8 = + ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; + +REGISTER_CPU_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_int8); + +REGISTER_CPU_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/activation.h b/paddle/fluid/inference/anakin/convert/activation.h index b3fe4748641..021ec4c7fdf 100644 --- a/paddle/fluid/inference/anakin/convert/activation.h +++ b/paddle/fluid/inference/anakin/convert/activation.h @@ -22,8 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class ActivationOpConverter : public AnakinOpConverter { +template +class ActivationOpConverter : public AnakinOpConverter { public: explicit ActivationOpConverter(const std::string &op_type); @@ -40,16 +40,17 @@ class ActivationOpConverter : public AnakinOpConverter { {"sigmoid", "Sigmoid"}}; }; -template -class TanhOpConverter : public ActivationOpConverter { +template +class TanhOpConverter : public ActivationOpConverter { public: - TanhOpConverter() : ActivationOpConverter("tanh") {} + TanhOpConverter() : ActivationOpConverter("tanh") {} }; -template -class SigmoidOpConverter : public ActivationOpConverter { +template +class SigmoidOpConverter : public ActivationOpConverter { public: - SigmoidOpConverter() : ActivationOpConverter("sigmoid") {} + SigmoidOpConverter() + : ActivationOpConverter("sigmoid") {} }; } // namespace anakin } // namespace inference diff --git a/paddle/fluid/inference/anakin/convert/affine_channel.cc b/paddle/fluid/inference/anakin/convert/affine_channel.cc index 6bf913e7ffb..074c1b26ba8 100644 --- 
a/paddle/fluid/inference/anakin/convert/affine_channel.cc +++ b/paddle/fluid/inference/anakin/convert/affine_channel.cc @@ -16,18 +16,14 @@ #include #include #include - -using anakin::graph::GraphGlobalMem; -using anakin::PTuple; -using anakin::AK_FLOAT; -using anakin::saber::Shape; +#include "paddle/fluid/inference/anakin/convert/helper.h" namespace paddle { namespace inference { namespace anakin { -template -void AffineChannelOpConverter::operator()( +template +void AffineChannelOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -35,60 +31,20 @@ void AffineChannelOpConverter::operator()( PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - auto input_name = op_desc.Input("X").front(); auto output_name = op_desc.Output("Out").front(); + this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name}); // Copy the Scale to CPUPlace and get the pointer. auto *scale_v = scope.FindVar(op_desc.Input("Scale").front()); PADDLE_ENFORCE_NOT_NULL(scale_v); - auto *scale_t = scale_v->GetMutable(); - std::unique_ptr scale_tensor( - new framework::LoDTensor()); - scale_tensor->Resize(scale_t->dims()); - TensorCopySync((*scale_t), platform::CPUPlace(), scale_tensor.get()); + auto weight1 = pblock_from_var(*scale_v); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); // Copy the Bias to CPUPlace and get the pointer. auto *bias_v = scope.FindVar(op_desc.Input("Bias").front()); PADDLE_ENFORCE_NOT_NULL(bias_v); - auto *bias_t = bias_v->GetMutable(); - std::unique_ptr bias_tensor(new framework::LoDTensor()); - bias_tensor->Resize(bias_t->dims()); - TensorCopySync((*bias_t), platform::CPUPlace(), bias_tensor.get()); - - this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name}); - - // Generate the Scale parameter of Anakin. - auto scale_shape = framework::vectorize2int(scale_t->dims()); - while (scale_shape.size() < 4) { - scale_shape.insert(scale_shape.begin(), 1); - } - Shape anakin_scale_shape(scale_shape); - auto *weight1 = - GraphGlobalMem::Global().template new_block( - anakin_scale_shape); - float *scale_cpu_data = - static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(scale_tensor->data(), scale_tensor->numel(), - scale_cpu_data); - weight1->d_tensor().set_shape(anakin_scale_shape); - weight1->d_tensor().copy_from(weight1->h_tensor()); - this->engine_->AddOpAttr(op_name, "weight_1", *weight1); - - // Generate the Bias parameter of Anakin. 
- auto bias_shape = framework::vectorize2int(bias_t->dims()); - while (bias_shape.size() < 4) { - bias_shape.insert(bias_shape.begin(), 1); - } - Shape anakin_bias_shape(bias_shape); - auto *weight2 = - GraphGlobalMem::Global().template new_block( - anakin_bias_shape); - float *bias_cpu_data = - static_cast(weight2->h_tensor().mutable_data()); - std::copy_n(bias_tensor->data(), bias_tensor->numel(), bias_cpu_data); - weight2->d_tensor().set_shape(anakin_bias_shape); - weight2->d_tensor().copy_from(weight2->h_tensor()); + auto weight2 = pblock_from_var(*bias_v); this->engine_->AddOpAttr(op_name, "weight_2", *weight2); } @@ -97,8 +53,21 @@ void AffineChannelOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER( - affine_channel, AffineChannelOpConverter<::anakin::saber::NV>); +using affine_channel_nv_fp32 = + ::paddle::inference::anakin::AffineChannelOpConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using affine_channel_nv_int8 = + ::paddle::inference::anakin::AffineChannelOpConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER( - affine_channel, AffineChannelOpConverter<::anakin::saber::X86>); + +using affine_channel_cpu_fp32 = + ::paddle::inference::anakin::AffineChannelOpConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using affine_channel_cpu_int8 = + ::paddle::inference::anakin::AffineChannelOpConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/affine_channel.h b/paddle/fluid/inference/anakin/convert/affine_channel.h index 5da4a736e8d..443f6101288 100644 --- a/paddle/fluid/inference/anakin/convert/affine_channel.h +++ b/paddle/fluid/inference/anakin/convert/affine_channel.h @@ -21,8 +21,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class AffineChannelOpConverter : public AnakinOpConverter { +template +class AffineChannelOpConverter : public AnakinOpConverter { public: AffineChannelOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc index 1c837e9c3df..3e1e422aea1 100644 --- a/paddle/fluid/inference/anakin/convert/batch_norm.cc +++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc @@ -18,17 +18,14 @@ #include #include #include - -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::Shape; +#include "paddle/fluid/inference/anakin/convert/helper.h" namespace paddle { namespace inference { namespace anakin { -template -void BatchNormOpConverter::operator()( +template +void BatchNormOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -36,87 +33,46 @@ void BatchNormOpConverter::operator()( std::map inputs; for (auto k : {"X", "Scale", "Bias", "Mean", "Variance"}) { PADDLE_ENFORCE_EQ(op_desc.Input(k).size(), 1UL); - auto v = op_desc.Input(k).front(); - inputs.insert({k, v}); } + auto input = op_desc.Input("X").front(); auto output = op_desc.Output("Y").front(); auto op_name = op_desc.Type() + ":" + 
op_desc.Output("Y").front(); auto epsilon = boost::get(op_desc.GetAttr("epsilon")); - // auto momentum = boost::get(op_desc.GetAttr("momentum")); auto bn_op_name = op_name + ":bn"; auto bn_output = bn_op_name + "_output"; - this->engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output}); + this->engine_->AddOp(bn_op_name, "BatchNorm", {input}, {bn_output}); this->engine_->AddOpAttr(bn_op_name, "epsilon", epsilon); this->engine_->AddOpAttr(bn_op_name, "momentum", static_cast(1.0)); auto scale_op_name = op_name + ":scale"; - auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name, - framework::LoDTensor *tensor) { - auto *v = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(v); - auto *t = v->GetMutable(); - tensor->Resize(t->dims()); - TensorCopySync(*t, platform::CPUPlace(), tensor); - }; - - framework::LoDTensor bias_t; - framework::LoDTensor mean_t; - framework::LoDTensor scale_t; - framework::LoDTensor variance_t; - get_lod_tensor(inputs["Bias"], &bias_t); - get_lod_tensor(inputs["Mean"], &mean_t); - get_lod_tensor(inputs["Scale"], &scale_t); - get_lod_tensor(inputs["Variance"], &variance_t); + this->engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output}); + this->engine_->AddOpAttr(scale_op_name, "axis", 1); + this->engine_->AddOpAttr(scale_op_name, "num_axes", 1); + this->engine_->AddOpAttr(scale_op_name, "bias_term", true); - auto fill_shape = [](size_t n, std::vector shape) { - shape.insert(shape.begin(), 1); - if (shape.size() < n) { - shape.insert(shape.end(), n - shape.size(), 1); - } - return shape; - }; - Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims()))); - Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims()))); - auto *weight1 = - GraphGlobalMem::Global().template new_block(shape1); - auto *mean_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(mean_t.data(), mean_t.numel(), mean_data); + auto *mean_v = scope.FindVar(op_desc.Input("Mean").front()); + PADDLE_ENFORCE_NOT_NULL(mean_v); + auto weight1 = pblock_from_var(*mean_v); this->engine_->AddOpAttr(bn_op_name, "weight_1", *weight1); - auto *weight2 = - GraphGlobalMem::Global().template new_block(shape2); - auto *variance_data = - static_cast(weight2->h_tensor().mutable_data()); - std::copy_n(variance_t.data(), variance_t.numel(), variance_data); + auto *variance_v = scope.FindVar(op_desc.Input("Variance").front()); + PADDLE_ENFORCE_NOT_NULL(variance_v); + auto weight2 = pblock_from_var(*variance_v); this->engine_->AddOpAttr(bn_op_name, "weight_2", *weight2); - Shape shape3(std::vector({1, 1, 1, 1})); - auto *weight3 = - GraphGlobalMem::Global().template new_block(shape3); - auto *alpha_data = static_cast(weight3->h_tensor().mutable_data()); - float weight3_data[] = {1}; - std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data); + auto *weight3 = pblock_from_vector(std::vector({1})); this->engine_->AddOpAttr(bn_op_name, "weight_3", *weight3); - Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims()))); - auto *scale = GraphGlobalMem::Global().template new_block( - scale_shape); - auto *scale_data = static_cast(scale->h_tensor().mutable_data()); - std::copy_n(scale_t.data(), scale_t.numel(), scale_data); - - Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims()))); - auto *bias = GraphGlobalMem::Global().template new_block( - bias_shape); - auto *bias_data = static_cast(bias->h_tensor().mutable_data()); - std::copy_n(bias_t.data(), bias_t.numel(), bias_data); - - 
this->engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output}); - this->engine_->AddOpAttr(scale_op_name, "axis", 1); - this->engine_->AddOpAttr(scale_op_name, "num_axes", 1); - this->engine_->AddOpAttr(scale_op_name, "bias_term", true); + auto *scale_v = scope.FindVar(op_desc.Input("Scale").front()); + PADDLE_ENFORCE_NOT_NULL(scale_v); + auto scale = pblock_from_var(*scale_v); this->engine_->AddOpAttr(scale_op_name, "weight_1", *scale); + + auto *bias_v = scope.FindVar(op_desc.Input("Bias").front()); + PADDLE_ENFORCE_NOT_NULL(bias_v); + auto bias = pblock_from_var(*bias_v); this->engine_->AddOpAttr(scale_op_name, "weight_2", *bias); } @@ -125,9 +81,17 @@ void BatchNormOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(batch_norm, - BatchNormOpConverter<::anakin::saber::NV>); +using bn_nv_fp32 = ::paddle::inference::anakin::BatchNormOpConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using bn_nv_int8 = ::paddle::inference::anakin::BatchNormOpConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(batch_norm, bn_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(batch_norm, bn_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(batch_norm, - BatchNormOpConverter<::anakin::saber::X86>); +using bn_cpu_fp32 = ::paddle::inference::anakin::BatchNormOpConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using bn_cpu_int8 = ::paddle::inference::anakin::BatchNormOpConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(batch_norm, bn_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(batch_norm, bn_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.h b/paddle/fluid/inference/anakin/convert/batch_norm.h index dc94b6ff64d..52156aeb028 100644 --- a/paddle/fluid/inference/anakin/convert/batch_norm.h +++ b/paddle/fluid/inference/anakin/convert/batch_norm.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class BatchNormOpConverter : public AnakinOpConverter { +template +class BatchNormOpConverter : public AnakinOpConverter { public: BatchNormOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc index cfd9540acf6..6655c2f047a 100644 --- a/paddle/fluid/inference/anakin/convert/concat.cc +++ b/paddle/fluid/inference/anakin/convert/concat.cc @@ -19,8 +19,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void ConcatOpConverter::operator()( +template +void ConcatOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -39,8 +39,21 @@ void ConcatOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(concat, - ConcatOpConverter<::anakin::saber::NV>); +using concat_nv_fp32 = + ::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using concat_nv_int8 = + ::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(concat, concat_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(concat, concat_nv_int8); + #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(concat, - ConcatOpConverter<::anakin::saber::X86>); +using concat_cpu_fp32 = + 
::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using concat_cpu_int8 = + ::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(concat, concat_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(concat, concat_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/concat.h b/paddle/fluid/inference/anakin/convert/concat.h index a32f8a46129..fb5514affa7 100644 --- a/paddle/fluid/inference/anakin/convert/concat.h +++ b/paddle/fluid/inference/anakin/convert/concat.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class ConcatOpConverter : public AnakinOpConverter { +template +class ConcatOpConverter : public AnakinOpConverter { public: ConcatOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc index f9ab9874751..4bd380e7bb2 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d.cc @@ -16,18 +16,16 @@ #include #include #include +#include "paddle/fluid/inference/anakin/convert/helper.h" -using anakin::graph::GraphGlobalMem; using anakin::PTuple; -using anakin::AK_FLOAT; -using anakin::saber::Shape; namespace paddle { namespace inference { namespace anakin { -template -void Conv2dOpConverter::operator()( +template +void Conv2dOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -42,11 +40,8 @@ void Conv2dOpConverter::operator()( auto *filter_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(filter_v); - auto *filter_t = filter_v->GetMutable(); - std::unique_ptr weight_tensor( - new framework::LoDTensor()); - weight_tensor->Resize(filter_t->dims()); - TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get()); + auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); @@ -69,25 +64,61 @@ void Conv2dOpConverter::operator()( this->engine_->AddOpAttr(op_name, "axis", 1); this->engine_->AddOpAttr(op_name, "bias_term", false); - auto weight_shape = framework::vectorize2int(filter_t->dims()); - Shape anakin_shape(weight_shape); - auto *weight1 = - GraphGlobalMem::Global().template new_block( - anakin_shape); - float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(weight_tensor->data(), weight_tensor->numel(), cpu_data); - weight1->d_tensor().set_shape(anakin_shape); - weight1->d_tensor().copy_from(weight1->h_tensor()); - this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + ::anakin::saber::Shape anakin_shape(weight_shape); + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + + if (enable_int8) { + const float int8_range = 127.; + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + auto *weight1 = ::anakin::graph::GraphGlobalMem::Global() + .template new_block<::anakin::AK_INT8>(anakin_shape); + float *weight_data = weight_tensor->data(); + std::vector weight_int8; + int weight_num = weight_tensor->numel(); + for (int i = 0; i < weight_tensor->numel(); i++) { + bool is_valid_int8 = + ((weight_data[i] >= -128) && (weight_data[i] <= 127)); + 
PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of conv " + "should be in range [-128, 127]"); + weight_int8.push_back(static_cast(weight_data[i])); + } + memcpy(static_cast(weight1->h_tensor().mutable_data()), + static_cast(weight_int8.data()), sizeof(char) * weight_num); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); + this->engine_->Graph()->SetWeightsScale(op_name, + {weight_scale / int8_range}, false); + this->engine_->AddTensorScale(input_name, in_scale / int8_range); + } else { + auto *weight1 = pblock_from_tensor(*weight_tensor, weight_shape); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + } } } // namespace anakin } // namespace inference } // namespace paddle -REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d, - Conv2dOpConverter<::anakin::saber::X86>); #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d, - Conv2dOpConverter<::anakin::saber::NV>); +using conv2d_nv_fp32 = + ::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using conv2d_nv_int8 = + ::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d, conv2d_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(conv2d, conv2d_nv_int8); #endif + +using conv2d_cpu_fp32 = + ::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using conv2d_cpu_int8 = + ::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d, conv2d_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(conv2d, conv2d_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/conv2d.h b/paddle/fluid/inference/anakin/convert/conv2d.h index 6ecb3284051..b22cb8ea931 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d.h +++ b/paddle/fluid/inference/anakin/convert/conv2d.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class Conv2dOpConverter : public AnakinOpConverter { +template +class Conv2dOpConverter : public AnakinOpConverter { public: Conv2dOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc index ff60771f87b..a8ef73d50f2 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc @@ -16,18 +16,16 @@ #include #include #include +#include "paddle/fluid/inference/anakin/convert/helper.h" -using anakin::graph::GraphGlobalMem; using anakin::PTuple; -using anakin::AK_FLOAT; -using anakin::saber::Shape; namespace paddle { namespace inference { namespace anakin { -template -void Conv2dFusionOpConverter::operator()( +template +void Conv2dFusionOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -43,24 +41,16 @@ void Conv2dFusionOpConverter::operator()( auto *filter_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(filter_v); - auto *filter_t = filter_v->GetMutable(); + + auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); auto *b_v = 
scope.FindVar(op_desc.Input("Bias").front()); PADDLE_ENFORCE_NOT_NULL(b_v); - auto *b_t = b_v->GetMutable(); - - std::unique_ptr weight_tensor( - new framework::LoDTensor()); - weight_tensor->Resize(filter_t->dims()); - TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get()); PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); - - // const int n_output = weight_tensor->dims()[0]; - // const int n_input = weight_tensor->dims()[1]; const int filter_h = weight_tensor->dims()[2]; const int filter_w = weight_tensor->dims()[3]; - // auto filter_num = n_input * filter_h * filter_w ; auto filter_num = weight_tensor->dims()[0]; this->engine_->template AddOpAttr(op_name, "filter_num", filter_num); this->engine_->template AddOpAttr>(op_name, "kernel_size", @@ -77,37 +67,42 @@ void Conv2dFusionOpConverter::operator()( this->engine_->AddOpAttr(op_name, "axis", 1); this->engine_->AddOpAttr(op_name, "bias_term", true); - auto weight_shape = framework::vectorize2int(filter_t->dims()); - Shape anakin_shape(weight_shape); - auto *weight1 = - GraphGlobalMem::Global().template new_block( - anakin_shape); - float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(weight_tensor->data(), weight_tensor->numel(), cpu_data); - weight1->d_tensor().set_shape(anakin_shape); - weight1->d_tensor().copy_from(weight1->h_tensor()); - this->engine_->AddOpAttr(op_name, "weight_1", *weight1); - - auto bias_shape = framework::vectorize2int(b_t->dims()); - framework::LoDTensor bias_tensor; - bias_tensor.Resize(b_t->dims()); - TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor); - auto *bias_data = bias_tensor.data(); - bias_shape.insert(bias_shape.begin(), 1); - bias_shape.insert(bias_shape.begin(), 1); - bias_shape.insert(bias_shape.begin(), 1); - // bias_shape.push_back(1); - // bias_shape.push_back(1); - Shape anakin_bias_shape(bias_shape); - - auto *weight2 = - GraphGlobalMem::Global().template new_block( - anakin_bias_shape); - float *cpu_data2 = static_cast(weight2->h_tensor().mutable_data()); - std::copy_n(bias_data, bias_tensor.numel(), cpu_data2); - weight2->d_tensor().set_shape(anakin_bias_shape); - weight2->d_tensor().copy_from(weight2->h_tensor()); - this->engine_->AddOpAttr(op_name, "weight_2", *weight2); + ::anakin::saber::Shape anakin_shape(weight_shape); + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + if (enable_int8) { + const float int8_range = 127.; + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + auto *weight1 = ::anakin::graph::GraphGlobalMem::Global() + .template new_block<::anakin::AK_INT8>(anakin_shape); + float *weight_data = weight_tensor->data(); + std::vector weight_int8; + int weight_num = weight_tensor->numel(); + for (int i = 0; i < weight_tensor->numel(); i++) { + bool is_valid_int8 = + ((weight_data[i] >= -128) && (weight_data[i] <= 127)); + PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of conv " + "should be in range [-128, 127]"); + weight_int8.push_back(static_cast(weight_data[i])); + } + memcpy(static_cast(weight1->h_tensor().mutable_data()), + static_cast(weight_int8.data()), sizeof(char) * weight_num); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); + this->engine_->Graph()->SetWeightsScale(op_name, + {weight_scale / int8_range}, 
false); + this->engine_->AddTensorScale(input_name, in_scale / int8_range); + } else { + auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + auto *weight1 = pblock_from_tensor(*weight_tensor, weight_shape); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + auto weight2 = pblock_from_var(*b_v); + this->engine_->AddOpAttr(op_name, "weight_2", *weight2); + } } } // namespace anakin @@ -115,9 +110,21 @@ void Conv2dFusionOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d_fusion, - Conv2dFusionOpConverter<::anakin::saber::NV>); +using conv2d_fusion_nv_fp32 = + ::paddle::inference::anakin::Conv2dFusionOpConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using conv2d_fusion_nv_int8 = + ::paddle::inference::anakin::Conv2dFusionOpConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_nv_int8); #endif - -REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d_fusion, - Conv2dFusionOpConverter<::anakin::saber::X86>); +using conv2d_fusion_cpu_fp32 = + ::paddle::inference::anakin::Conv2dFusionOpConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using conv2d_fusion_cpu_int8 = + ::paddle::inference::anakin::Conv2dFusionOpConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; + +REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h index abcf61a75e0..768814d3f99 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class Conv2dFusionOpConverter : public AnakinOpConverter { +template +class Conv2dFusionOpConverter : public AnakinOpConverter { public: Conv2dFusionOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc index f552e41c85f..92d147708bf 100644 --- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc @@ -23,8 +23,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void DensityPriorBoxOpConverter::operator()( +template +void DensityPriorBoxOpConverter::operator()( const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc, const framework::Scope& scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -109,13 +109,24 @@ void DensityPriorBoxOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER( - density_prior_box, DensityPriorBoxOpConverter<::anakin::saber::NV>); -REGISTER_CUDA_ANAKIN_OP_CONVERTER( - prior_box, DensityPriorBoxOpConverter<::anakin::saber::NV>); +using ds_pr_nv_fp32 = ::paddle::inference::anakin::DensityPriorBoxOpConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using ds_pr_nv_int8 = ::paddle::inference::anakin::DensityPriorBoxOpConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; + +REGISTER_CUDA_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_nv_fp32); 
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(prior_box, ds_pr_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_nv_int8); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(prior_box, ds_pr_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER( - density_prior_box, DensityPriorBoxOpConverter<::anakin::saber::X86>); -REGISTER_CPU_ANAKIN_OP_CONVERTER( - prior_box, DensityPriorBoxOpConverter<::anakin::saber::X86>); +using ds_pr_cpu_fp32 = ::paddle::inference::anakin::DensityPriorBoxOpConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using ds_pr_cpu_int8 = ::paddle::inference::anakin::DensityPriorBoxOpConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; + +REGISTER_CPU_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_cpu_fp32); +REGISTER_CPU_ANAKIN_OP_CONVERTER(prior_box, ds_pr_cpu_fp32); + +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_cpu_int8); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(prior_box, ds_pr_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.h b/paddle/fluid/inference/anakin/convert/density_prior_box.h index 29f4f6f7f9d..5714f57a04b 100644 --- a/paddle/fluid/inference/anakin/convert/density_prior_box.h +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h @@ -22,8 +22,9 @@ namespace paddle { namespace inference { namespace anakin { -template -class DensityPriorBoxOpConverter : public AnakinOpConverter { +template +class DensityPriorBoxOpConverter + : public AnakinOpConverter { public: DensityPriorBoxOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc index 4a28c604f58..c06a8860e16 100644 --- a/paddle/fluid/inference/anakin/convert/detection_out.cc +++ b/paddle/fluid/inference/anakin/convert/detection_out.cc @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void DetectionOutOpConverter::operator()( +template +void DetectionOutOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -67,8 +67,21 @@ void DetectionOutOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(detection_out, - DetectionOutOpConverter<::anakin::saber::NV>); +using detection_out_nv_fp32 = + ::paddle::inference::anakin::DetectionOutOpConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using detection_out_nv_int8 = + ::paddle::inference::anakin::DetectionOutOpConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(detection_out, detection_out_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(detection_out, detection_out_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(detection_out, - DetectionOutOpConverter<::anakin::saber::X86>); + +using detection_out_cpu_fp32 = + ::paddle::inference::anakin::DetectionOutOpConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using detection_out_cpu_int8 = + ::paddle::inference::anakin::DetectionOutOpConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(detection_out, detection_out_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(detection_out, detection_out_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/detection_out.h b/paddle/fluid/inference/anakin/convert/detection_out.h index 396d5c9554f..c34342a66c1 100644 --- 
a/paddle/fluid/inference/anakin/convert/detection_out.h +++ b/paddle/fluid/inference/anakin/convert/detection_out.h @@ -22,8 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class DetectionOutOpConverter : public AnakinOpConverter { +template +class DetectionOutOpConverter : public AnakinOpConverter { public: DetectionOutOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/dropout.cc b/paddle/fluid/inference/anakin/convert/dropout.cc index 989eafcd91e..e779aca7308 100644 --- a/paddle/fluid/inference/anakin/convert/dropout.cc +++ b/paddle/fluid/inference/anakin/convert/dropout.cc @@ -16,17 +16,14 @@ #include #include #include - -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::Shape; +#include "paddle/fluid/inference/anakin/convert/helper.h" namespace paddle { namespace inference { namespace anakin { -template -void DropoutOpConverter::operator()( +template +void DropoutOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -42,12 +39,7 @@ void DropoutOpConverter::operator()( auto dropout_prob = boost::get(op_desc.GetAttr("dropout_prob")); auto factor = 1 - dropout_prob; - Shape shape1(std::vector({1, 1, 1, 1})); - auto *weight1 = - GraphGlobalMem::Global().template new_block(shape1); - auto *factor_data = static_cast(weight1->h_tensor().mutable_data()); - float weight1_data[] = {factor}; - std::copy(std::begin(weight1_data), std::end(weight1_data), factor_data); + auto *weight1 = pblock_from_vector(std::vector({factor})); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->AddOpAttr(op_name, "axis", 0); @@ -60,8 +52,21 @@ void DropoutOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(dropout, - DropoutOpConverter<::anakin::saber::NV>); +using dropout_nv_fp32 = + ::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using dropout_nv_int8 = + ::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(dropout, dropout_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(dropout, dropout_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(dropout, - DropoutOpConverter<::anakin::saber::X86>); + +using dropout_cpu_fp32 = + ::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using dropout_cpu_int8 = + ::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(dropout, dropout_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(dropout, dropout_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/dropout.h b/paddle/fluid/inference/anakin/convert/dropout.h index c43c851fc0e..801aa3dd16f 100644 --- a/paddle/fluid/inference/anakin/convert/dropout.h +++ b/paddle/fluid/inference/anakin/convert/dropout.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class DropoutOpConverter : public AnakinOpConverter { +template +class DropoutOpConverter : public AnakinOpConverter { public: DropoutOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc index 81e1d10d82b..e3ea6b2a97d 100644 --- a/paddle/fluid/inference/anakin/convert/elementwise.cc 
+++ b/paddle/fluid/inference/anakin/convert/elementwise.cc @@ -17,17 +17,14 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::Shape; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -template -void ElementwiseAddOpConverter::operator()( +template +void ElementwiseAddOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -48,8 +45,8 @@ void ElementwiseAddOpConverter::operator()( this->engine_->template AddOpAttr>(op_name, "coeff", coeff); } -template -void ElementwiseMulOpConverter::operator()( +template +void ElementwiseMulOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -75,12 +72,31 @@ void ElementwiseMulOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER( - elementwise_add, ElementwiseAddOpConverter<::anakin::saber::NV>); -REGISTER_CUDA_ANAKIN_OP_CONVERTER( - elementwise_mul, ElementwiseMulOpConverter<::anakin::saber::NV>); +using elet_nv_fp32 = ::paddle::inference::anakin::ElementwiseAddOpConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using elet_nv_int8 = ::paddle::inference::anakin::ElementwiseAddOpConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; +using eletmul_nv_fp32 = ::paddle::inference::anakin::ElementwiseMulOpConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using eletmul_nv_int8 = ::paddle::inference::anakin::ElementwiseMulOpConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; + +REGISTER_CUDA_ANAKIN_OP_CONVERTER(elementwise_add, elet_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(elementwise_add, elet_nv_int8); +REGISTER_CUDA_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_nv_int8); + #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER( - elementwise_add, ElementwiseAddOpConverter<::anakin::saber::X86>); -REGISTER_CPU_ANAKIN_OP_CONVERTER( - elementwise_mul, ElementwiseMulOpConverter<::anakin::saber::X86>); +using elet_cpu_fp32 = ::paddle::inference::anakin::ElementwiseAddOpConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using elet_cpu_int8 = ::paddle::inference::anakin::ElementwiseAddOpConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; +using eletmul_cpu_fp32 = ::paddle::inference::anakin::ElementwiseMulOpConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using eletmul_cpu_int8 = ::paddle::inference::anakin::ElementwiseMulOpConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; + +REGISTER_CPU_ANAKIN_OP_CONVERTER(elementwise_add, elet_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(elementwise_add, elet_cpu_int8); +REGISTER_CPU_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/elementwise.h b/paddle/fluid/inference/anakin/convert/elementwise.h index f64a8c5f7f3..190a8b55f0e 100644 --- a/paddle/fluid/inference/anakin/convert/elementwise.h +++ b/paddle/fluid/inference/anakin/convert/elementwise.h @@ -20,8 +20,9 @@ namespace paddle { namespace inference { namespace anakin { -template -class ElementwiseAddOpConverter : public AnakinOpConverter { +template +class 
ElementwiseAddOpConverter + : public AnakinOpConverter { public: ElementwiseAddOpConverter() = default; @@ -34,8 +35,9 @@ class ElementwiseAddOpConverter : public AnakinOpConverter { private: }; -template -class ElementwiseMulOpConverter : public AnakinOpConverter { +template +class ElementwiseMulOpConverter + : public AnakinOpConverter { public: ElementwiseMulOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc index a04035eabac..10ceb2154b1 100644 --- a/paddle/fluid/inference/anakin/convert/fc.cc +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -16,22 +16,19 @@ #include #include #include - -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::Shape; +#include "paddle/fluid/inference/anakin/convert/helper.h" namespace paddle { namespace inference { namespace anakin { -template -void FcBaseOpConverter::operator()( +template +void FcBaseOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); auto input_names = op_desc.InputNames(); - bool with_bias = input_names.size() == 3; + bool with_bias = input_names.size() >= 3; std::string w_name = "Y"; std::string i_name = "X"; @@ -45,7 +42,12 @@ void FcBaseOpConverter::operator()( // get weights auto *y_v = scope.FindVar(op_desc.Input(w_name).front()); PADDLE_ENFORCE_NOT_NULL(y_v); - auto *y_t = y_v->GetMutable(); + auto weight_tensor = tensor_from_var(*y_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + + int out_dim = weight_shape[1]; + const int w_m = weight_shape[0]; + const int w_k = weight_shape[1]; auto input_name = op_desc.Input(i_name).front(); auto output_name = op_desc.Output("Out").front(); @@ -53,64 +55,58 @@ void FcBaseOpConverter::operator()( this->engine_->AddOp(op_name, "Dense", {input_name}, {output_name}); this->engine_->AddOpAttr(op_name, "bias_term", with_bias); this->engine_->AddOpAttr(op_name, "axis", 1); - - auto weight_shape = framework::vectorize2int(y_t->dims()); - int out_dim = weight_shape[1]; this->engine_->AddOpAttr(op_name, "out_dim", out_dim); - const int w_m = weight_shape[0]; - const int w_k = weight_shape[1]; - - if (weight_shape.size() < 4UL) { - weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1); - } - Shape anakin_shape(weight_shape); - framework::LoDTensor weight_tensor; - weight_tensor.Resize(y_t->dims()); - TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor); - auto *weight_data = weight_tensor.data(); - PADDLE_ENFORCE(w_m * w_k == weight_tensor.numel()); + auto *weight_data = weight_tensor->data(); + PADDLE_ENFORCE(w_m * w_k == weight_tensor->numel()); - std::vector trans_weight_data(weight_tensor.numel()); + std::vector trans_weight_data(weight_tensor->numel()); for (int i = 0; i < w_m; i++) { for (int j = 0; j < w_k; j++) { trans_weight_data[i + j * w_m] = weight_data[i * w_k + j]; } } - auto *weight1 = - GraphGlobalMem::Global().template new_block( - anakin_shape); - float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data); - weight1->d_tensor().set_shape(anakin_shape); - weight1->d_tensor().copy_from(weight1->h_tensor()); - this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + + int weight_num = weight_tensor->numel(); + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + if 
(enable_int8) { + if (weight_shape.size() < 4UL) { + weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1); + } + ::anakin::saber::Shape anakin_shape(weight_shape); + const float int8_range = 127.; + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + auto *weight1 = ::anakin::graph::GraphGlobalMem::Global() + .template new_block<::anakin::AK_INT8>(anakin_shape); + std::vector weight_int8; + for (int i = 0; i < weight_num; i++) { + bool is_valid_int8 = + ((trans_weight_data[i] >= -128) && (trans_weight_data[i] <= 127)); + PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of fc " + "should be in range [-128, 127]"); + weight_int8.push_back(static_cast(trans_weight_data[i])); + } + memcpy(static_cast(weight1->h_tensor().mutable_data()), + static_cast(weight_int8.data()), sizeof(char) * weight_num); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); + this->engine_->Graph()->SetWeightsScale(op_name, + {weight_scale / int8_range}, false); + this->engine_->AddTensorScale(input_name, in_scale / int8_range); + } else { + auto *weight1 = pblock_from_vector(trans_weight_data); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + } // get bias if (with_bias) { auto *b_v = scope.FindVar(op_desc.Input("Bias").front()); PADDLE_ENFORCE_NOT_NULL(b_v); - auto *b_t = b_v->GetMutable(); - - auto bias_shape = framework::vectorize2int(b_t->dims()); - framework::LoDTensor bias_tensor; - bias_tensor.Resize(b_t->dims()); - TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor); - auto *bias_data = bias_tensor.data(); - bias_shape.insert(bias_shape.begin(), 1); - bias_shape.insert(bias_shape.begin(), 1); - bias_shape.insert(bias_shape.begin(), 1); - // bias_shape.push_back(1); - // bias_shape.push_back(1); - Shape anakin_bias_shape(bias_shape); - - auto *weight2 = - GraphGlobalMem::Global().template new_block( - anakin_bias_shape); - float *cpu_data2 = static_cast(weight2->h_tensor().mutable_data()); - std::copy_n(bias_data, bias_tensor.numel(), cpu_data2); - weight2->d_tensor().set_shape(anakin_bias_shape); - weight2->d_tensor().copy_from(weight2->h_tensor()); + auto weight2 = pblock_from_var(*b_v); this->engine_->AddOpAttr(op_name, "weight_2", *weight2); } } @@ -120,9 +116,39 @@ void FcBaseOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(mul, MulOpConverter<::anakin::saber::NV>); -REGISTER_CUDA_ANAKIN_OP_CONVERTER(fc, FcOpConverter<::anakin::saber::NV>); +using mul_nv_fp32 = + ::paddle::inference::anakin::MulOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using fc_nv_fp32 = + ::paddle::inference::anakin::FcOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using mul_nv_int8 = + ::paddle::inference::anakin::MulOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +using fc_nv_int8 = + ::paddle::inference::anakin::FcOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; + +REGISTER_CUDA_ANAKIN_OP_CONVERTER(mul, mul_nv_fp32); +REGISTER_CUDA_ANAKIN_OP_CONVERTER(fc, fc_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(mul, mul_nv_int8); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(fc, fc_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(mul, MulOpConverter<::anakin::saber::X86>); 
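For context on the int8 branch of the fc converter above: it keeps the existing [m, k] row-major to column-major transpose, checks that every transposed weight already lies in [-128, 127] (the quant/dequant fuse pass has baked the quantization into the float weights), narrows the values to int8, and hands the resulting AK_INT8 block plus weight_scale / 127 and input_scale / 127 to the engine, presumably so the engine can recover the calibration ranges. A minimal standalone sketch of that data path in plain C++; TransposeAndNarrow is a hypothetical helper, not the Paddle/Anakin API:

#include <cassert>
#include <cstdint>
#include <vector>

// Transpose a row-major [m, k] weight into the column-major layout the Dense op
// expects, then narrow to int8; values are assumed to be pre-quantized floats.
std::vector<int8_t> TransposeAndNarrow(const std::vector<float>& w, int m, int k) {
  assert(static_cast<int>(w.size()) == m * k);
  std::vector<float> trans(w.size());
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < k; ++j) {
      trans[i + j * m] = w[i * k + j];  // same index swap as the converter loop
    }
  }
  std::vector<int8_t> out(trans.size());
  for (size_t i = 0; i < trans.size(); ++i) {
    assert(trans[i] >= -128.f && trans[i] <= 127.f);  // mirrors the PADDLE_ENFORCE
    out[i] = static_cast<int8_t>(trans[i]);
  }
  return out;
}

int main() {
  // 2x3 weight laid out row-major; values are already integer-valued "fake quantized" floats.
  std::vector<float> w = {1.f, -2.f, 3.f, 4.f, -5.f, 6.f};
  std::vector<int8_t> q = TransposeAndNarrow(w, 2, 3);
  return static_cast<int>(q.size()) == 6 ? 0 : 1;
}

The FP32 branch, by contrast, only wraps the transposed floats with pblock_from_vector, which is why that branch shrank to two lines in this hunk.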
-REGISTER_CPU_ANAKIN_OP_CONVERTER(fc, FcOpConverter<::anakin::saber::X86>); +using mul_cpu_fp32 = + ::paddle::inference::anakin::MulOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using fc_cpu_fp32 = + ::paddle::inference::anakin::FcOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using mul_cpu_int8 = + ::paddle::inference::anakin::MulOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +using fc_cpu_int8 = + ::paddle::inference::anakin::FcOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; + +REGISTER_CPU_ANAKIN_OP_CONVERTER(mul, mul_cpu_fp32); +REGISTER_CPU_ANAKIN_OP_CONVERTER(fc, fc_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(mul, mul_cpu_int8); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(fc, fc_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h index 10808c31575..6fe65e3ecd4 100644 --- a/paddle/fluid/inference/anakin/convert/fc.h +++ b/paddle/fluid/inference/anakin/convert/fc.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class FcBaseOpConverter : public AnakinOpConverter { +template +class FcBaseOpConverter : public AnakinOpConverter { public: FcBaseOpConverter() = default; @@ -33,15 +33,15 @@ class FcBaseOpConverter : public AnakinOpConverter { }; // with bias -template -class FcOpConverter : public FcBaseOpConverter { +template +class FcOpConverter : public FcBaseOpConverter { public: FcOpConverter() = default; }; // without bias -template -class MulOpConverter : public FcBaseOpConverter { +template +class MulOpConverter : public FcBaseOpConverter { public: MulOpConverter() = default; }; diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc index a38dec25d83..7ef9e11b091 100644 --- a/paddle/fluid/inference/anakin/convert/flatten.cc +++ b/paddle/fluid/inference/anakin/convert/flatten.cc @@ -21,8 +21,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void FlattenOpConverter::operator()( +template +void FlattenOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -46,8 +46,21 @@ void FlattenOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(flatten, - FlattenOpConverter<::anakin::saber::NV>); +using flatten_nv_fp32 = + ::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using flatten_nv_int8 = + ::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; + +REGISTER_CUDA_ANAKIN_OP_CONVERTER(flatten, flatten_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(flatten, flatten_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(flatten, - FlattenOpConverter<::anakin::saber::X86>); +using flatten_cpu_fp32 = + ::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using flatten_cpu_int8 = + ::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(flatten, flatten_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(flatten, flatten_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/flatten.h b/paddle/fluid/inference/anakin/convert/flatten.h index cd29b6e7d73..6e5e059927d 100644 --- a/paddle/fluid/inference/anakin/convert/flatten.h +++ 
b/paddle/fluid/inference/anakin/convert/flatten.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class FlattenOpConverter : public AnakinOpConverter { +template +class FlattenOpConverter : public AnakinOpConverter { public: FlattenOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/helper.cc b/paddle/fluid/inference/anakin/convert/helper.cc new file mode 100644 index 00000000000..7804619bf83 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/helper.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +std::unique_ptr tensor_from_var( + const framework::Variable& var, const platform::Place& place) { + auto& src = var.Get(); + std::unique_ptr dst(new framework::LoDTensor()); + dst->Resize(src.dims()); + TensorCopySync((src), place, dst.get()); + return dst; +} + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/helper.h b/paddle/fluid/inference/anakin/convert/helper.h new file mode 100644 index 00000000000..5581f7dd641 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/helper.h @@ -0,0 +1,88 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
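The tensor_from_var helper above copies a Variable's LoDTensor to the requested place; the pblock_from_* templates declared next wrap such data into Anakin PBlocks, left-padding the shape with 1s because Anakin blocks are always 4-D. A tiny standalone illustration of that padding rule in plain C++; PadTo4D is an illustrative name, not part of the patch:

#include <vector>

std::vector<int> PadTo4D(std::vector<int> shape) {
  while (shape.size() < 4) shape.insert(shape.begin(), 1);  // same loop as pblock_from_tensor
  return shape;
}
// PadTo4D({64, 3, 3}) -> {1, 64, 3, 3}; the 1-D vector overload instead uses {1, 1, 1, size}.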
+ +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/variable.h" + +#include "framework/core/net/net.h" +#include "framework/core/types.h" +#include "framework/graph/graph.h" +#include "framework/graph/graph_global_mem.h" +#include "saber/saber_types.h" + +using anakin::saber::Shape; +using anakin::AK_FLOAT; +using anakin::PBlock; +using anakin::graph::GraphGlobalMem; + +namespace paddle { +namespace inference { +namespace anakin { + +std::unique_ptr tensor_from_var( + const framework::Variable& var, const platform::Place& place); +template +PBlock* pblock_from_tensor(const framework::LoDTensor& tensor, + std::vector shape) { + while (shape.size() < 4) { + shape.insert(shape.begin(), 1); + } + Shape anakin_shape(shape); + auto* weight = + GraphGlobalMem::Global().template new_block(anakin_shape); + float* cpu_data = static_cast(weight->h_tensor().mutable_data()); + std::copy_n(tensor.data(), tensor.numel(), cpu_data); + weight->d_tensor().set_shape(anakin_shape); + weight->d_tensor().copy_from(weight->h_tensor()); + return weight; +} + +template +PBlock* pblock_from_vector(const std::vector& vec, + std::vector shape_vec) { + while (shape_vec.size() < 4) { + shape_vec.insert(shape_vec.begin(), 1); + } + Shape shape(shape_vec); + auto* weight = + GraphGlobalMem::Global().template new_block(shape); + auto* weight_data = static_cast(weight->h_tensor().mutable_data()); + std::copy(std::begin(vec), std::end(vec), weight_data); + weight->d_tensor().set_shape(shape); + weight->d_tensor().copy_from(weight->h_tensor()); + return weight; +} + +template +PBlock* pblock_from_vector(const std::vector& vec) { + int size = vec.size(); + return pblock_from_vector(vec, std::vector({1, 1, 1, size})); +} + +template +PBlock* pblock_from_var(const framework::Variable& var) { + auto tensor = tensor_from_var(var, platform::CPUPlace()); + auto shape = framework::vectorize2int(tensor->dims()); + return pblock_from_tensor(*tensor, shape); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.cc b/paddle/fluid/inference/anakin/convert/im2sequence.cc index bd7e9b4b63c..37f3f425a4f 100644 --- a/paddle/fluid/inference/anakin/convert/im2sequence.cc +++ b/paddle/fluid/inference/anakin/convert/im2sequence.cc @@ -23,8 +23,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void Im2SequenceConverter::operator()( +template +void Im2SequenceConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -55,5 +55,18 @@ void Im2SequenceConverter::operator()( } // namespace inference } // namespace paddle -REGISTER_CUDA_ANAKIN_OP_CONVERTER(im2sequence, - Im2SequenceConverter<::anakin::saber::NV>); +#ifdef PADDLE_WITH_CUDA +using im2sequence_nv_fp32 = ::paddle::inference::anakin::Im2SequenceConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using im2sequence_nv_int8 = ::paddle::inference::anakin::Im2SequenceConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_nv_int8); +#endif + +using im2sequence_cpu_fp32 = ::paddle::inference::anakin::Im2SequenceConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using im2sequence_cpu_int8 = 
::paddle::inference::anakin::Im2SequenceConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.h b/paddle/fluid/inference/anakin/convert/im2sequence.h index 97d1564b028..8241d4d6f9c 100644 --- a/paddle/fluid/inference/anakin/convert/im2sequence.h +++ b/paddle/fluid/inference/anakin/convert/im2sequence.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class Im2SequenceConverter : public AnakinOpConverter { +template +class Im2SequenceConverter : public AnakinOpConverter { public: Im2SequenceConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index 71631a7745c..6ff49c4a820 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -32,9 +32,9 @@ namespace paddle { namespace inference { namespace anakin { -template +template class AnakinOpConverter { - using AnakinEngineT = AnakinEngine; + using AnakinEngineT = AnakinEngine; public: AnakinOpConverter() = default; @@ -96,6 +96,13 @@ class AnakinOpConverter { engine->Graph()->RegistVar(output); } engine->Freeze(); + // Add scale for tensor in int8 mode. + auto tensor_scales = engine->GetTensorScales(); + + for (auto &item : tensor_scales) { + engine->Graph()->SetVarScale(item.first, item.second); + } + for (auto &input : inputs) { if (parameters.count(input)) continue; std::vector input_shape; @@ -136,52 +143,78 @@ class AnakinOpConverter { AnakinEngineT *engine_{nullptr}; private: - std::unordered_map *> converters_; + std::unordered_map *> + converters_; framework::Scope *scope_{nullptr}; std::mutex mutex_; }; -template class AnakinOpConverter<::anakin::saber::NV>; -template class AnakinOpConverter<::anakin::saber::X86>; +template class AnakinOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +template class AnakinOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; + +template class AnakinOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +template class AnakinOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; } // namespace anakin } // namespace inference } // namespace paddle #define REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, \ - place_type__, place_class__) \ - struct anakin_##op_type__##_##place_type__##_converter \ + place_type__, place_class__, \ + precision_type__, precision_class__) \ + struct anakin_##op_type__##_##place_type__##_##precision_type__##_converter \ : public ::paddle::framework::Registrar { \ - anakin_##op_type__##_##place_type__##_converter() { \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter() { \ LOG(INFO) << "register convert " << #op_type__ << " "; \ ::paddle::inference::Registry< \ - ::paddle::inference::anakin::AnakinOpConverter>:: \ - Global() \ - .Register<::paddle::inference::anakin::Converter__>(#op_type__); \ + ::paddle::inference::anakin::AnakinOpConverter< \ + place_class__, precision_class__>>::Global() \ + .Register(#op_type__); \ } \ }; \ - anakin_##op_type__##_##place_type__##_converter \ - anakin_##op_type__##_##place_type__##_converter__; \ - int TouchConverterRegister_anakin_##op_type__##_##place_type__() { \ - anakin_##op_type__##_##place_type__##_converter__.Touch(); \ + 
anakin_##op_type__##_##place_type__##_##precision_type__##_converter \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter__; \ + int Touch_anakin_##op_type__##_##place_type__##_##precision_type__() { \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter__ \ + .Touch(); \ return 0; \ } #define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \ - ::anakin::saber::NV) + ::anakin::saber::NV, FP32, \ + ::anakin::Precision::FP32) + +#define REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ + REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \ + ::anakin::saber::NV, INT8, \ + ::anakin::Precision::INT8) #define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \ - ::anakin::saber::X86) + ::anakin::saber::X86, FP32, \ + ::anakin::Precision::FP32) + +#define REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ + REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \ + ::anakin::saber::X86, INT8, \ + ::anakin::Precision::INT8) -#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__) \ - extern int TouchConverterRegister_anakin_##op_type__##_##place_type__(); \ - int use_op_converter_anakin_##op_type__##_##place_type__ \ - __attribute__((unused)) = \ - TouchConverterRegister_anakin_##op_type__##_##place_type__(); +#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \ + extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \ + int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__ \ + __attribute__((unused)) = \ + Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); #define USE_ANAKIN_CONVERTER(op_type__) \ - USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA) + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) +#define USE_INT8_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) #define USE_CPU_ANAKIN_CONVERTER(op_type__) \ - USE_ANAKIN_CONVERTER_BASE(op_type__, CPU) + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32) +#define USE_CPU_INT8_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8) diff --git a/paddle/fluid/inference/anakin/convert/pool2d.cc b/paddle/fluid/inference/anakin/convert/pool2d.cc index d0206a5bf9b..436741b43b7 100644 --- a/paddle/fluid/inference/anakin/convert/pool2d.cc +++ b/paddle/fluid/inference/anakin/convert/pool2d.cc @@ -23,8 +23,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void Pool2dOpConverter::operator()( +template +void Pool2dOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -72,8 +72,21 @@ void Pool2dOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(pool2d, - Pool2dOpConverter<::anakin::saber::NV>); +using pool2d_nv_float32 = + ::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using pool2d_nv_int8 = + ::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(pool2d, pool2d_nv_float32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(pool2d, pool2d_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(pool2d, - Pool2dOpConverter<::anakin::saber::X86>); + +using 
pool2d_cpu_float32 = + ::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using pool2d_cpu_int8 = + ::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(pool2d, pool2d_cpu_float32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(pool2d, pool2d_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/pool2d.h b/paddle/fluid/inference/anakin/convert/pool2d.h index 0f85ec14b33..7a06ff1b660 100644 --- a/paddle/fluid/inference/anakin/convert/pool2d.h +++ b/paddle/fluid/inference/anakin/convert/pool2d.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class Pool2dOpConverter : public AnakinOpConverter { +template +class Pool2dOpConverter : public AnakinOpConverter { public: Pool2dOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/relu.cc b/paddle/fluid/inference/anakin/convert/relu.cc index 71de3113cba..6d456ccfdcd 100644 --- a/paddle/fluid/inference/anakin/convert/relu.cc +++ b/paddle/fluid/inference/anakin/convert/relu.cc @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void ReluOpConverter::operator()( +template +void ReluOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -36,8 +36,8 @@ void ReluOpConverter::operator()( this->engine_->AddOpAttr(op_name, "alpha", 0); } -template -void LeakyReluOpConverter::operator()( +template +void LeakyReluOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -58,10 +58,35 @@ void LeakyReluOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(relu, ReluOpConverter<::anakin::saber::NV>); -REGISTER_CUDA_ANAKIN_OP_CONVERTER(leaky_relu, - LeakyReluOpConverter<::anakin::saber::NV>); +using relu_nv_fp32 = + ::paddle::inference::anakin::ReluOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using leaky_nv_fp32 = ::paddle::inference::anakin::LeakyReluOpConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using relu_nv_int8 = + ::paddle::inference::anakin::ReluOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +using leaky_nv_int8 = ::paddle::inference::anakin::LeakyReluOpConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; + +REGISTER_CUDA_ANAKIN_OP_CONVERTER(relu, relu_nv_fp32); +REGISTER_CUDA_ANAKIN_OP_CONVERTER(leaky_relu, leaky_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(relu, relu_nv_int8); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(leaky_relu, leaky_nv_int8); + #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(relu, ReluOpConverter<::anakin::saber::X86>); -REGISTER_CPU_ANAKIN_OP_CONVERTER(leaky_relu, - LeakyReluOpConverter<::anakin::saber::X86>); + +using relu_cpu_fp32 = + ::paddle::inference::anakin::ReluOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using leaky_cpu_fp32 = ::paddle::inference::anakin::LeakyReluOpConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using relu_cpu_int8 = + ::paddle::inference::anakin::ReluOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +using leaky_cpu_int8 = ::paddle::inference::anakin::LeakyReluOpConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; 
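Every converter in this patch now follows the same registration pattern shown here for relu and leaky_relu: one using-alias per (target, precision) instantiation, then one REGISTER_* call per alias. The aliases are likely not just cosmetic: a template-id with two arguments contains a top-level comma, which the preprocessor would split into separate macro arguments. A small self-contained illustration of that pitfall; the REGISTER macro below is hypothetical, not the Paddle one:

#include <iostream>

#define REGISTER(name, type) static const char* name##_tag = #type

template <int A, int B> struct Conv {};

// REGISTER(conv, Conv<1, 2>);  // error: macro REGISTER passed 3 arguments, but takes just 2
using conv_1_2 = Conv<1, 2>;    // the alias hides the comma
REGISTER(conv, conv_1_2);

int main() { std::cout << conv_tag << "\n"; }

The same shape is repeated for conv2d, pool2d, softmax, split, sum, transpose, and the other converters touched by this patch.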
+REGISTER_CPU_ANAKIN_OP_CONVERTER(relu, relu_cpu_fp32); +REGISTER_CPU_ANAKIN_OP_CONVERTER(leaky_relu, leaky_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(relu, relu_cpu_int8); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(leaky_relu, leaky_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/relu.h b/paddle/fluid/inference/anakin/convert/relu.h index 74222a7ea1b..f366f05a94a 100644 --- a/paddle/fluid/inference/anakin/convert/relu.h +++ b/paddle/fluid/inference/anakin/convert/relu.h @@ -22,8 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class ReluOpConverter : public AnakinOpConverter { +template +class ReluOpConverter : public AnakinOpConverter { public: ReluOpConverter() = default; @@ -34,8 +34,8 @@ class ReluOpConverter : public AnakinOpConverter { virtual ~ReluOpConverter() {} }; -template -class LeakyReluOpConverter : public AnakinOpConverter { +template +class LeakyReluOpConverter : public AnakinOpConverter { public: LeakyReluOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc index a6696e8e81b..b7b47e30b1c 100644 --- a/paddle/fluid/inference/anakin/convert/reshape.cc +++ b/paddle/fluid/inference/anakin/convert/reshape.cc @@ -21,8 +21,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void ReshapeOpConverter::operator()( +template +void ReshapeOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -47,9 +47,21 @@ void ReshapeOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(reshape, - ReshapeOpConverter<::anakin::saber::NV>); +using reshape_nv_fp32 = + ::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using reshape_nv_int8 = + ::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(reshape, reshape_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(reshape, reshape_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(reshape, - ReshapeOpConverter<::anakin::saber::X86>); +using reshape_cpu_fp32 = + ::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using reshape_cpu_int8 = + ::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(reshape, reshape_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(reshape, reshape_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/reshape.h b/paddle/fluid/inference/anakin/convert/reshape.h index bd0fd08c5cb..88de2641e60 100644 --- a/paddle/fluid/inference/anakin/convert/reshape.h +++ b/paddle/fluid/inference/anakin/convert/reshape.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class ReshapeOpConverter : public AnakinOpConverter { +template +class ReshapeOpConverter : public AnakinOpConverter { public: ReshapeOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/roi_align.cc b/paddle/fluid/inference/anakin/convert/roi_align.cc index 152578b50fe..68d3bffd89d 100644 --- a/paddle/fluid/inference/anakin/convert/roi_align.cc +++ b/paddle/fluid/inference/anakin/convert/roi_align.cc @@ -16,17 +16,12 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using 
anakin::saber::NV; -using anakin::saber::Shape; - namespace paddle { namespace inference { namespace anakin { -template -void RoiAlignOpConverter::operator()( +template +void RoiAlignOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -57,8 +52,21 @@ void RoiAlignOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(roi_align, - RoiAlignOpConverter<::anakin::saber::NV>); +using roi_align_nv_fp32 = + ::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using roi_align_nv_int8 = + ::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(roi_align, roi_align_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(roi_align, roi_align_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(roi_align, - RoiAlignOpConverter<::anakin::saber::X86>); + +using roi_align_cpu_fp32 = + ::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using roi_align_cpu_int8 = + ::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(roi_align, roi_align_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(roi_align, roi_align_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/roi_align.h b/paddle/fluid/inference/anakin/convert/roi_align.h index 93c28f3e055..8b5d23a0167 100644 --- a/paddle/fluid/inference/anakin/convert/roi_align.h +++ b/paddle/fluid/inference/anakin/convert/roi_align.h @@ -22,8 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class RoiAlignOpConverter : public AnakinOpConverter { +template +class RoiAlignOpConverter : public AnakinOpConverter { public: RoiAlignOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/scale.cc b/paddle/fluid/inference/anakin/convert/scale.cc index d72f9a5fa0c..cdfdf86a974 100644 --- a/paddle/fluid/inference/anakin/convert/scale.cc +++ b/paddle/fluid/inference/anakin/convert/scale.cc @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void ScaleOpConverter::operator()( +template +void ScaleOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -49,4 +49,22 @@ void ScaleOpConverter::operator()( } // namespace inference } // namespace paddle -REGISTER_CUDA_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter<::anakin::saber::NV>); +#ifdef PADDLE_WITH_CUDA +using scale_nv_fp32 = + ::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using scale_nv_int8 = + ::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(scale, scale_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(scale, scale_nv_int8); +#endif + +using scale_cpu_fp32 = + ::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using scale_cpu_int8 = + ::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(scale, scale_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(scale, scale_cpu_int8); diff --git 
a/paddle/fluid/inference/anakin/convert/scale.h b/paddle/fluid/inference/anakin/convert/scale.h index 92d936b5262..f19a9201934 100644 --- a/paddle/fluid/inference/anakin/convert/scale.h +++ b/paddle/fluid/inference/anakin/convert/scale.h @@ -22,8 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class ScaleOpConverter : public AnakinOpConverter { +template +class ScaleOpConverter : public AnakinOpConverter { public: ScaleOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/softmax.cc b/paddle/fluid/inference/anakin/convert/softmax.cc index 851dafa8bdf..eb50e17e55f 100644 --- a/paddle/fluid/inference/anakin/convert/softmax.cc +++ b/paddle/fluid/inference/anakin/convert/softmax.cc @@ -18,8 +18,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void SoftMaxOpConverter::operator()( +template +void SoftMaxOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -45,9 +45,22 @@ void SoftMaxOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(softmax, - SoftMaxOpConverter<::anakin::saber::NV>); +using sm_nv_fp32 = + ::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using sm_nv_int8 = + ::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; + +REGISTER_CUDA_ANAKIN_OP_CONVERTER(softmax, sm_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(softmax, sm_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(softmax, - SoftMaxOpConverter<::anakin::saber::X86>); +using sm_cpu_fp32 = + ::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using sm_cpu_int8 = + ::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(softmax, sm_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(softmax, sm_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/softmax.h b/paddle/fluid/inference/anakin/convert/softmax.h index c2421f9eb9d..dc431b5b867 100644 --- a/paddle/fluid/inference/anakin/convert/softmax.h +++ b/paddle/fluid/inference/anakin/convert/softmax.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class SoftMaxOpConverter : public AnakinOpConverter { +template +class SoftMaxOpConverter : public AnakinOpConverter { public: SoftMaxOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/split.cc b/paddle/fluid/inference/anakin/convert/split.cc index f99233e78b5..b84860220fb 100644 --- a/paddle/fluid/inference/anakin/convert/split.cc +++ b/paddle/fluid/inference/anakin/convert/split.cc @@ -22,8 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void SplitOpConverter::operator()( +template +void SplitOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -56,7 +56,22 @@ void SplitOpConverter::operator()( } // namespace inference } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(split, SplitOpConverter<::anakin::saber::NV>); +using split_nv_fp32 = + ::paddle::inference::anakin::SplitOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using split_nv_int8 = + 
::paddle::inference::anakin::SplitOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(split, split_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(split, split_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(split, SplitOpConverter<::anakin::saber::X86>); +using split_cpu_fp32 = + ::paddle::inference::anakin::SplitOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using split_cpu_int8 = + ::paddle::inference::anakin::SplitOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; + +REGISTER_CPU_ANAKIN_OP_CONVERTER(split, split_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(split, split_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/split.h b/paddle/fluid/inference/anakin/convert/split.h index 989d7acd500..819915315d9 100644 --- a/paddle/fluid/inference/anakin/convert/split.h +++ b/paddle/fluid/inference/anakin/convert/split.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class SplitOpConverter : public AnakinOpConverter { +template +class SplitOpConverter : public AnakinOpConverter { public: SplitOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/sum.cc b/paddle/fluid/inference/anakin/convert/sum.cc index 7fc9d764078..2bc4d124c90 100644 --- a/paddle/fluid/inference/anakin/convert/sum.cc +++ b/paddle/fluid/inference/anakin/convert/sum.cc @@ -23,11 +23,10 @@ namespace paddle { namespace inference { namespace anakin { -template -void SumOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void SumOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); @@ -49,6 +48,21 @@ void SumOpConverter::operator()(const framework::proto::OpDesc &op, } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(sum, SumOpConverter<::anakin::saber::NV>); +using sum_nv_fp32 = + ::paddle::inference::anakin::SumOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +using sum_nv_int8 = + ::paddle::inference::anakin::SumOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(sum, sum_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(sum, sum_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(sum, SumOpConverter<::anakin::saber::X86>); + +using sum_cpu_fp32 = + ::paddle::inference::anakin::SumOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +using sum_cpu_int8 = + ::paddle::inference::anakin::SumOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(sum, sum_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(sum, sum_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/sum.h b/paddle/fluid/inference/anakin/convert/sum.h index 27c15a82ebd..aefc64c623e 100644 --- a/paddle/fluid/inference/anakin/convert/sum.h +++ b/paddle/fluid/inference/anakin/convert/sum.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class SumOpConverter : public AnakinOpConverter { +template +class SumOpConverter : public AnakinOpConverter { public: SumOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc 
b/paddle/fluid/inference/anakin/convert/test_activation_op.cc index 18b8b6f3b63..67d3222d985 100644 --- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc @@ -27,8 +27,8 @@ static void test_activation_op(const std::string& op_type, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("act-X", {10, 6, 1, 1}); validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); framework::OpDesc desc; @@ -57,6 +57,7 @@ TEST(tanh_op, gpu) { } #endif +/* TEST(sigm_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); @@ -68,6 +69,7 @@ TEST(tanh_op, cpu) { platform::CPUDeviceContext ctx(cpu_place); test_activation_op<::anakin::saber::X86>("tanh", ctx, false); } +*/ } // namespace anakin } // namespace inference diff --git a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc index 123f93370b8..f6399387aa2 100644 --- a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc @@ -28,8 +28,8 @@ void test_affine_channel_op(const platform::DeviceContext& context, std::unordered_set parameters({"scale", "bias"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("x", {1, 3, 5, 2}); validator.DeclOutputVar("out", {1, 3, 5, 2}); validator.DeclParamVar("scale", {3}); diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc index 6a6675b6abf..c008ef1bd5e 100644 --- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc @@ -25,8 +25,8 @@ void test_batchnorm_op(const platform::DeviceContext& context, bool use_gpu) { {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean", "batch_norm_variance"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); std::vector param_shape{2}; validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5}); diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc index 4ea3305e466..42dfbeb5cdc 100644 --- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc @@ -25,8 +25,8 @@ template void test_concat_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters({""}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("concat_x1", {1, 2, 1, 1}); validator.DeclInputVar("concat_x2", {1, 3, 1, 1}); validator.DeclInputVar("concat_x3", {1, 1, 1, 1}); diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc index fa1b319bc1c..e95e11c4f96 100644 --- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc +++ 
b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc @@ -25,8 +25,8 @@ template void test_conv2d_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters({"conv2d-Y"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("conv2d-X", {1, 3, 3, 3}); validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1}); validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3}); diff --git a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc index a252dc74c0b..ae27e27ded5 100644 --- a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc @@ -25,8 +25,8 @@ template void test_dropout_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("x", {1, 1, 2, 2}); validator.DeclOutputVar("out", {1, 1, 2, 2}); validator.DeclOutputVar("mask", {1, 1, 2, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc index ee1bedcfb25..bff75294908 100644 --- a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc @@ -27,8 +27,8 @@ static void test_elementwise_op(const std::string& op_type, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("x", {1, 1, 2, 2}); validator.DeclInputVar("y", {1, 1, 2, 2}); validator.DeclOutputVar("out", {1, 1, 2, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc index 5510008d3c4..a24c809c022 100644 --- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc @@ -25,8 +25,8 @@ void test_mul_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters({"mul_y"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("mul_x", {1, 1, 2, 2}); validator.DeclParamVar("mul_y", {4, 2}); validator.DeclOutputVar("mul_out", {1, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc index 86bc1d810f8..5765f5ebd1f 100644 --- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc @@ -24,8 +24,8 @@ template void test_flatten_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("flatten-X", {3, 10, 10, 4}); validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1}); framework::OpDesc desc; diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc 
b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc index b1be7f93c67..90503b1fbba 100644 --- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc @@ -25,8 +25,8 @@ void test_pool2d(const platform::DeviceContext& context, bool use_gpu, std::string pool_type = "max") { framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc index 369f1920f24..3f224796519 100644 --- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc @@ -27,8 +27,8 @@ static void test_activation_op(const std::string& op_type, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("act-X", {10, 6, 1, 1}); validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); framework::OpDesc desc; @@ -60,20 +60,6 @@ TEST(leaky_relu_op, gpu) { } #endif -/* seems bug here -TEST(relu_op, cpu) { - platform::CPUPlace cpu_place; - platform::CPUDeviceContext ctx(cpu_place); - test_activation_op<::anakin::saber::X86>("relu", ctx, false); -} - -TEST(leaky_relu_op, cpu) { - platform::CPUPlace cpu_place; - platform::CPUDeviceContext ctx(cpu_place); - test_activation_op<::anakin::saber::X86>("leaky_relu", ctx, false); -} -*/ - } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc index 3facdbe9c69..e102bd3ac3e 100644 --- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc @@ -24,8 +24,8 @@ template void test_reshape1_op(const platform::DeviceContext& context, bool use_gpu) { framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); // validator.DeclInputVar("reshape-X", {2, 3, 3, 1}); // validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3}); @@ -49,8 +49,8 @@ template void test_reshape2_op(const platform::DeviceContext& context, bool use_gpu) { framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("reshape-X", {1, 2, 4}); validator.DeclOutputVar("reshape-Out", {1, 4, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc index e15d19135b4..de0b18fdbfd 100644 --- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc @@ -24,8 +24,8 @@ template void test_softmax_op(const platform::DeviceContext& context, bool use_gpu) { framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + 
AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("softmax-X", {1, 10, 2}); validator.DeclOutputVar("softmax-Out", {1, 10, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_split_op.cc b/paddle/fluid/inference/anakin/convert/test_split_op.cc index 7131b07558d..9a42ffd853b 100644 --- a/paddle/fluid/inference/anakin/convert/test_split_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc @@ -27,8 +27,8 @@ void AnakinSliceTest(const platform::DeviceContext &context, bool use_gpu, const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("split_input", in_shape); std::vector output_vars; diff --git a/paddle/fluid/inference/anakin/convert/test_sum_op.cc b/paddle/fluid/inference/anakin/convert/test_sum_op.cc index 8714890666c..65f67ebd129 100644 --- a/paddle/fluid/inference/anakin/convert/test_sum_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_sum_op.cc @@ -26,8 +26,8 @@ template static void test_sum_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("sum_x1", {1, 2, 1, 2}); validator.DeclInputVar("sum_x2", {1, 2, 1, 2}); validator.DeclOutputVar("sum_out", {1, 2, 1, 2}); diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc index 6b2f1ed1566..51b69dfbb08 100644 --- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc @@ -24,8 +24,8 @@ template void test_transpose1_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("transpose-X", {2, 3, 4, 5}); validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3}); @@ -47,8 +47,8 @@ template void test_transpose2_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope, context, - use_gpu); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("transpose-X", {3, 4, 5}); validator.DeclOutputVar("transpose-Out", {3, 5, 4}); diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc index cffc526065f..849bfc9ea3e 100644 --- a/paddle/fluid/inference/anakin/convert/transpose.cc +++ b/paddle/fluid/inference/anakin/convert/transpose.cc @@ -23,8 +23,8 @@ namespace paddle { namespace inference { namespace anakin { -template -void TransposeOpConverter::operator()( +template +void TransposeOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -50,9 +50,17 @@ void TransposeOpConverter::operator()( } // namespace paddle #ifdef PADDLE_WITH_CUDA -REGISTER_CUDA_ANAKIN_OP_CONVERTER(transpose, - 
TransposeOpConverter<::anakin::saber::NV>); +using transpose_nv_fp32 = ::paddle::inference::anakin::TransposeOpConverter< + ::anakin::saber::NV, ::anakin::Precision::FP32>; +using transpose_nv_int8 = ::paddle::inference::anakin::TransposeOpConverter< + ::anakin::saber::NV, ::anakin::Precision::INT8>; +REGISTER_CUDA_ANAKIN_OP_CONVERTER(transpose, transpose_nv_fp32); +REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(transpose, transpose_nv_int8); #endif -REGISTER_CPU_ANAKIN_OP_CONVERTER(transpose, - TransposeOpConverter<::anakin::saber::X86>); +using transpose_cpu_fp32 = ::paddle::inference::anakin::TransposeOpConverter< + ::anakin::saber::X86, ::anakin::Precision::FP32>; +using transpose_cpu_int8 = ::paddle::inference::anakin::TransposeOpConverter< + ::anakin::saber::X86, ::anakin::Precision::INT8>; +REGISTER_CPU_ANAKIN_OP_CONVERTER(transpose, transpose_cpu_fp32); +REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(transpose, transpose_cpu_int8); diff --git a/paddle/fluid/inference/anakin/convert/transpose.h b/paddle/fluid/inference/anakin/convert/transpose.h index 54090468ae1..b7b0a0f209e 100644 --- a/paddle/fluid/inference/anakin/convert/transpose.h +++ b/paddle/fluid/inference/anakin/convert/transpose.h @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -template -class TransposeOpConverter : public AnakinOpConverter { +template +class TransposeOpConverter : public AnakinOpConverter { public: TransposeOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index 140a33a7cbb..2f8f953892c 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -61,7 +61,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, auto* temp_data = temp_tensor.mutable_data(cpu_place); for (size_t i = 0; i < num_elements; i++) { - *(temp_data + i) = random(-128., 128.); + *(temp_data + i) = random(0., 1.); } TensorCopySync(temp_tensor, place, tensor); @@ -72,9 +72,9 @@ void RandomizeTensor(framework::LoDTensor* tensor, * anakin * layer. */ -template +template class AnakinConvertValidation { - using AnakinNvEngineT = AnakinEngine; + using AnakinNvEngineT = AnakinEngine; public: AnakinConvertValidation() = delete; @@ -84,7 +84,7 @@ class AnakinConvertValidation { const platform::DeviceContext& ctx, bool use_gpu = true) : parameters_(parameters), scope_(scope), ctx_(ctx), use_gpu_(use_gpu) { - engine_.reset(new AnakinEngine(true)); + engine_.reset(new AnakinEngine(true)); } // Declare a Variable as input with random initialization. @@ -127,7 +127,7 @@ class AnakinConvertValidation { // should init anakin engine here. 
auto& block_desc = program_desc_.Block(framework::kRootBlockIndex); - Singleton>::Global().ConvertOp( + Singleton>::Global().ConvertOp( desc, block_desc, parameters_, *scope_, engine_.get(), true /*test_mode*/); engine_->Freeze(); @@ -213,8 +213,15 @@ class AnakinConvertValidation { bool use_gpu_{true}; }; -template class AnakinConvertValidation<::anakin::saber::NV>; -template class AnakinConvertValidation<::anakin::saber::X86>; +template class AnakinConvertValidation<::anakin::saber::NV, + ::anakin::Precision::FP32>; +template class AnakinConvertValidation<::anakin::saber::X86, + ::anakin::Precision::FP32>; + +template class AnakinConvertValidation<::anakin::saber::NV, + ::anakin::Precision::INT8>; +template class AnakinConvertValidation<::anakin::saber::X86, + ::anakin::Precision::INT8>; } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index 17e66122243..90bc9c2514c 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -172,11 +172,20 @@ AnakinEngine::Clone() { #ifdef PADDLE_WITH_CUDA template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>; -template class AnakinEngineManager<::anakin::saber::NV>; +template class AnakinEngineManager<::anakin::saber::NV, + ::anakin::Precision::FP32>; + +template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>; +template class AnakinEngineManager<::anakin::saber::NV, + ::anakin::Precision::INT8>; #endif template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; -template class AnakinEngineManager<::anakin::saber::X86>; +template class AnakinEngineManager<::anakin::saber::X86, + ::anakin::Precision::FP32>; +template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>; +template class AnakinEngineManager<::anakin::saber::X86, + ::anakin::Precision::INT8>; // template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; } // namespace anakin diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h index 215c8a6c614..ade15537db8 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -93,6 +93,12 @@ class AnakinEngine { void Save(std::string path) { graph_->save(path); } bool IsInit() { return initialized_; } int GetDevice() { return device_; } + void AddTensorScale(const std::string &tensor_name, float scale) { + tensor_scales_[tensor_name] = scale; + } + std::unordered_map GetTensorScales() { + return tensor_scales_; + } void Execute(const std::map &inputs, const std::map &outputs); #ifdef PADDLE_WITH_CUDA @@ -112,11 +118,12 @@ class AnakinEngine { std::unique_ptr graph_; std::unique_ptr net_; std::vector program_inputs_; + std::unordered_map tensor_scales_; }; -template +template class AnakinEngineManager { - using AnakinEngineT = AnakinEngine; + using AnakinEngineT = AnakinEngine; public: bool HasEngine(const std::string &name) const { @@ -132,7 +139,7 @@ class AnakinEngineManager { std::vector program_inputs, std::string engine_name) { std::unique_lock lk(mut_); - auto *p = new AnakinEngine( + auto *p = new AnakinEngine( need_summary, device, max_batch_size, max_input_shape, program_inputs); engines_[engine_name].reset(p); return p; diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 37b7583fde2..0e6374201f4 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ 
@@ -169,7 +169,13 @@ struct Argument {
                       anakin_max_shape_t);
   DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int);
+  DECL_ARGUMENT_FIELD(anakin_precision_mode, AnakinPrecisionMode,
+                      AnalysisConfig::Precision);
   DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
+  DECL_ARGUMENT_FIELD(anakin_passes_filter, AnakinPassesFilter,
+                      std::vector<std::string>);
+  DECL_ARGUMENT_FIELD(anakin_ops_filter, AnakinOpsFilter,
+                      std::vector<std::string>);
 
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index bbc3938969a..25db3346cff 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -123,6 +123,11 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
                                        argument->anakin_max_input_shape()));
       pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
+      bool enable_int8 =
+          argument->anakin_precision_mode() == AnalysisConfig::Precision::kInt8;
+      pass->Set("enable_int8", new bool(enable_int8));
+      pass->Set("anakin_ops_filter",
+                new std::vector<std::string>(argument->anakin_ops_filter()));
     }
 
     pre_pass = pass_name;
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
index 658006c22cd..5f74121dc3a 100644
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
@@ -39,8 +39,14 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
     framework::ir::Graph *graph) const {
   framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph);
 
-  auto teller = [](const framework::ir::Node *node) {
-    if (!node->IsOp() || !node->Op()) return false;
+  auto &anakin_ops_filter = Get<std::vector<std::string>>("anakin_ops_filter");
+
+  auto teller = [&anakin_ops_filter](const framework::ir::Node *node) {
+    if (!node->IsOp() || !node->Op())
+      return false;
+    else if (std::find(anakin_ops_filter.begin(), anakin_ops_filter.end(),
+                       node->Op()->Type()) != anakin_ops_filter.end())
+      return false;
     return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
   };
 
@@ -191,47 +197,71 @@ void AnakinSubgraphPass::CreateAnakinOp(
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
   auto max_input_shape =
       Get<std::map<std::string, std::vector<int>>>("max_input_shape");
-  auto max_batch_size = Get<int>("max_batch_size");
   auto program_inputs = program_desc->GetFeedTargetNames();
 
   bool use_gpu = Get<bool>("use_gpu");
   SetAttr(op_desc->Proto(), "use_gpu", use_gpu);
+  bool enable_int8 = Get<bool>("enable_int8");
+  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
+  if (enable_int8) {
+    CreateAnakinEngine<::anakin::Precision::INT8>(&block_desc, params,
+                                                  input_names, output_mapping,
+                                                  program_inputs, engine_key);
+  } else {
+    CreateAnakinEngine<::anakin::Precision::FP32>(&block_desc, params,
+                                                  input_names, output_mapping,
+                                                  program_inputs, engine_key);
+  }
+}
+
+template <::anakin::Precision PrecisionT>
+void AnakinSubgraphPass::CreateAnakinEngine(
+    framework::BlockDesc *block_desc, const std::vector<std::string> &params,
+    const std::set<std::string> &input_names,
+    const std::vector<std::string> &output_mapping,
+    const std::vector<std::string> &program_inputs,
+    const std::string &engine_key) const {
+  framework::BlockDesc block_desc_temp(nullptr, block_desc->Proto());
+  bool use_gpu = Get<bool>("use_gpu");
+  auto max_batch_size = Get<int>("max_batch_size");
+  auto max_input_shape =
+      Get<std::map<std::string, std::vector<int>>>("max_input_shape");
   if (use_gpu) {
 #ifdef PADDLE_WITH_CUDA
     inference::Singleton<
-        anakin::AnakinEngineManager<::anakin::saber::NV>>::Global()
+        anakin::AnakinEngineManager<::anakin::saber::NV, PrecisionT>>::Global()
         .Create(true, Get<int>("gpu_device_id"), max_batch_size,
                 max_input_shape, program_inputs, engine_key);
 #endif
   } else {
     inference::Singleton<
-        anakin::AnakinEngineManager<::anakin::saber::X86>>::Global()
+        anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global()
         .Create(true, Get<int>("gpu_device_id"), max_batch_size,
                 max_input_shape, program_inputs, engine_key);
   }
 
   auto *scope = param_scope();
   std::unordered_set<std::string> param_set(params.begin(), params.end());
-  framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
   if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
     auto *anakin_engine =
         inference::Singleton<inference::anakin::AnakinEngineManager<
-            ::anakin::saber::NV>>::Global()
+            ::anakin::saber::NV, PrecisionT>>::Global()
             .Get(engine_key);
-    inference::Singleton<
-        inference::anakin::AnakinOpConverter<::anakin::saber::NV>>::Global()
+    inference::Singleton<inference::anakin::AnakinOpConverter<
+        ::anakin::saber::NV, PrecisionT>>::Global()
         .ConvertBlockToAnakinEngine(
             &block_desc_temp, scope,
             std::vector<std::string>(input_names.begin(), input_names.end()),
             param_set, output_mapping, anakin_engine);
+#endif
   } else {
     auto *anakin_engine =
         inference::Singleton<inference::anakin::AnakinEngineManager<
-            ::anakin::saber::X86>>::Global()
+            ::anakin::saber::X86, PrecisionT>>::Global()
            .Get(engine_key);
-    inference::Singleton<
-        inference::anakin::AnakinOpConverter<::anakin::saber::X86>>::Global()
+    inference::Singleton<inference::anakin::AnakinOpConverter<
+        ::anakin::saber::X86, PrecisionT>>::Global()
         .ConvertBlockToAnakinEngine(
             &block_desc_temp, scope,
             std::vector<std::string>(input_names.begin(), input_names.end()),
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
index e80b8bb6120..4ab2297b2d4 100644
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 #include
 #include
+#include
 #include
 #include
 #include "paddle/fluid/framework/ir/pass.h"
@@ -36,6 +37,13 @@ class AnakinSubgraphPass : public framework::ir::FusePassBase {
                             const std::vector<std::string> &graph_params,
                             std::vector<std::string> *repetitive_params) const;
   void CleanIntermediateOutputs(framework::ir::Node *node);
+  template <::anakin::Precision PrecisionT>
+  void CreateAnakinEngine(framework::BlockDesc *block_desc,
+                          const std::vector<std::string> &params,
+                          const std::set<std::string> &input_names,
+                          const std::vector<std::string> &output_mapping,
+                          const std::vector<std::string> &program_inputs,
+                          const std::string &engine_key) const;
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 4f9e0b63956..228d80bf9f7 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -116,6 +116,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(anakin_max_batchsize_);
   CP_MEMBER(anakin_max_input_shape_);
   CP_MEMBER(anakin_min_subgraph_size_);
+  CP_MEMBER(anakin_precision_mode_);
+  CP_MEMBER(anakin_passes_filter_);
+  CP_MEMBER(anakin_ops_filter_);
 
   // Ir related.
   CP_MEMBER(enable_ir_optim_);
@@ -276,7 +279,10 @@ void AnalysisConfig::Update() {
       pass_builder()->ClearPasses();
       for (const auto &pass : kAnakinSubgraphPasses) {
-        pass_builder()->AppendPass(pass);
+        if (std::find(anakin_passes_filter_.begin(), anakin_passes_filter_.end(),
+                      pass) == anakin_passes_filter_.end()) {
+          pass_builder()->AppendPass(pass);
+        }
       }
     }
 
@@ -391,11 +397,16 @@ void AnalysisConfig::SwitchIrDebug(int x) {
 }
 void AnalysisConfig::EnableAnakinEngine(
     int max_batch_size, std::map<std::string, std::vector<int>> max_input_shape,
-    int min_subgraph_size) {
+    int min_subgraph_size, AnalysisConfig::Precision precision_mode,
+    std::vector<std::string> passes_filter,
+    std::vector<std::string> ops_filter) {
   anakin_max_batchsize_ = max_batch_size;
   anakin_max_input_shape_ = max_input_shape;
   anakin_min_subgraph_size_ = min_subgraph_size;
+  anakin_passes_filter_ = passes_filter;
+  anakin_ops_filter_ = ops_filter;
   use_anakin_ = true;
+  anakin_precision_mode_ = precision_mode;
   Update();
 }
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 231beab641a..e1709fe2e67 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -386,6 +386,9 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
     argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
     argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_);
+    argument_.SetAnakinPrecisionMode(config_.anakin_precision_mode_);
+    argument_.SetAnakinPassesFilter(config_.anakin_passes_filter_);
+    argument_.SetAnakinOpsFilter(config_.anakin_ops_filter_);
     LOG(INFO) << "Anakin subgraph engine is enabled";
   }
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index c67c4b5bd0b..0f1c42c3602 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -152,7 +152,9 @@ struct AnalysisConfig {
   void EnableAnakinEngine(
       int max_batch_size = 1,
       std::map<std::string, std::vector<int>> max_input_shape = {},
-      int min_subgraph_size = 6);
+      int min_subgraph_size = 6, Precision precision = Precision::kFloat32,
+      std::vector<std::string> passes_filter = {},
+      std::vector<std::string> ops_filter = {});
 
   /** A boolean state indicating whether the Anakin sub-graph engine is used.
    */
@@ -291,6 +293,9 @@ struct AnalysisConfig {
   int anakin_max_batchsize_;
   int anakin_min_subgraph_size_{6};
   std::map<std::string, std::vector<int>> anakin_max_input_shape_;
+  Precision anakin_precision_mode_;
+  std::vector<std::string> anakin_passes_filter_;
+  std::vector<std::string> anakin_ops_filter_;
   std::map<std::string, std::string> engine_opt_info_;
 
   bool use_mkldnn_quantizer_{false};
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 3d72295be4b..a3259f5321f 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -73,15 +73,21 @@ void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
 
 // The following passes works for Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
   "infer_clean_graph_pass",                       //
+  "graph_viz_pass",                               //
+  "quant_conv2d_dequant_fuse_pass",               //
+  "graph_viz_pass",                               //
   "simplify_anakin_priorbox_detection_out_pass",  //
   "fillconstant_elementwisemul_fuse",             //
   "fc_fuse_pass",                                 //
   "conv_elementwise_add_fuse_pass",               //
-  "conv_bn_fuse_pass",                            //
-  "conv_elementwise_add_fuse_pass",               //
-  "fc_gru_fuse_pass",                             //
-  "quant_conv2d_dequant_fuse_pass",               //
-  "anakin_subgraph_pass",
+  // "conv_bn_fuse_pass",                         //
+  // "conv_elementwise_add_fuse_pass",            //
+  "fc_gru_fuse_pass",                             //
+  "graph_viz_pass",                               //
+  "anakin_subgraph_pass",                         //
+  "graph_viz_pass",                               //
+  "fc_gru_fuse_pass",                             //
+  "graph_viz_pass",                               //
 });
 
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h
index 99c5a6dc84a..11c394c76cd 100644
--- a/paddle/fluid/operators/anakin/anakin_engine_op.h
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.h
@@ -44,6 +44,7 @@ class AnakinEngineOp : public framework::OperatorBase {
   std::string engine_key_;
   std::string engine_serialized_data_;
   bool use_gpu_;
+  bool enable_int8_;
 
  public:
  AnakinEngineOp(const std::string &type,
@@ -55,6 +56,7 @@ class AnakinEngineOp : public framework::OperatorBase {
     engine_key_ = Attr<std::string>("engine_key");
     auto params = Attr<std::vector<std::string>>("parameters");
     use_gpu_ = Attr<bool>("use_gpu");
+    enable_int8_ = Attr<bool>("enable_int8");
     for (const auto &param : params) {
       param_names_.insert(param);
     }
@@ -68,11 +70,6 @@ class AnakinEngineOp : public framework::OperatorBase {
 
   void RunAnakin(const framework::Scope &scope,
                  const platform::Place &dev_place) const {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
-
     PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
 
     std::vector<std::string> output_maps =
@@ -96,18 +93,35 @@ class AnakinEngineOp : public framework::OperatorBase {
       outputs.insert({output_maps[output_index], fluid_t});
       output_index += 1;
     }
+    if (enable_int8_) {
+      Execute<::anakin::Precision::INT8>(inputs, outputs, dev_place);
+    } else {
+      Execute<::anakin::Precision::FP32>(inputs, outputs, dev_place);
+    }
+  }
+
+  template <::anakin::Precision PrecisionT>
+  void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
+               const std::map<std::string, framework::LoDTensor *> &outputs,
+               const platform::Place &dev_place) const {
     if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(dev_place);
+      auto stream =
+          reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx)
+              .stream();
       auto *engine =
           inference::Singleton<inference::anakin::AnakinEngineManager<
-              ::anakin::saber::NV>>::Global()
+              ::anakin::saber::NV, PrecisionT>>::Global()
              .Get(engine_key_);
       engine->Execute(inputs, outputs, stream);
 #endif
     } else {
       auto *engine =
           inference::Singleton<inference::anakin::AnakinEngineManager<
-              ::anakin::saber::X86>>::Global()
+              ::anakin::saber::X86, PrecisionT>>::Global()
              .Get(engine_key_);
       engine->Execute(inputs, outputs);
     }
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index ace385ec60f..8385e6331d7 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "paddle/fluid/inference/api/analysis_predictor.h"
@@ -230,8 +231,13 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
            py::arg("use_static") = true)
       .def("enable_anakin_engine", &AnalysisConfig::EnableAnakinEngine,
-           py::arg("max_batch_size") = 1, py::arg("max_input_shape") = {},
-           py::arg("min_subgraph_size") = 6)
+           py::arg("max_batch_size") = 1,
+           py::arg("max_input_shape") =
+               std::map<std::string, std::vector<int>>(),
+           py::arg("min_subgraph_size") = 6,
+           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
+           py::arg("passes_filter") = std::vector<std::string>(),
+           py::arg("ops_filter") = std::vector<std::string>())
       .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
       .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
            py::arg("x") = true)
--
GitLab
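
For reference, a minimal sketch (not part of the patch) of how the extended EnableAnakinEngine overload introduced above might be called from C++ user code; the model path, feed name, and shape values are hypothetical placeholders, and the empty filter lists simply accept the defaults.

// Hypothetical caller of the extended AnalysisConfig API (illustrative only).
#include <map>
#include <string>
#include <vector>

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model_dir");  // placeholder model location

  // Maximum shape per feed tensor, keyed by feed name, as the Anakin
  // sub-graph engine requires fixed upper bounds on input shapes.
  std::map<std::string, std::vector<int>> max_input_shape{
      {"image", {1, 3, 224, 224}}};  // hypothetical feed name and shape

  config.EnableAnakinEngine(
      /*max_batch_size=*/1, max_input_shape,
      /*min_subgraph_size=*/6,
      paddle::AnalysisConfig::Precision::kInt8,  // run fused subgraphs in INT8
      /*passes_filter=*/{},                      // subgraph passes to skip
      /*ops_filter=*/{});                        // ops to keep out of Anakin
  return 0;
}

Passing Precision::kFloat32 instead keeps the previous FP32 behaviour, since the new arguments all have defaults that match the pre-patch signature.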