From 77c208358681ae334b825dd45c6fdc09673c8b88 Mon Sep 17 00:00:00 2001
From: "joanna.wozna.intel"
Date: Fri, 8 Nov 2019 15:04:36 +0100
Subject: [PATCH] Add transpose2 INT8 for mkl-dnn (#19424)

* Add transpose2 INT8 for mkl-dnn

test=develop

* Fix test_transpose_int8_mkldnn

test=develop

* Revert "Merge branch 'develop' into transpose_int8_mkldnn_2"

This reverts commit 34011bdba4c859abb945e062ab13124f70508054, reversing
changes made to 2ce6473f144da298aba4a43d46918f27d463cf7c.

* Revert "Revert "Merge branch 'develop' into transpose_int8_mkldnn_2""

This reverts commit 23754dd78ca47ae56881161172b2aacd349aba90.

* Add template to TransposeMKLDNNHandler

test=develop

* Resolve conflict

test=develop

* Restore get_size and refactor

test=develop
---
 cmake/operators.cmake                          |   5 +-
 .../framework/ir/graph_pattern_detector.cc     |  21 +++
 .../framework/ir/graph_pattern_detector.h      |  15 ++
 .../framework/ir/mkldnn/cpu_quantize_pass.cc   |  55 +++++++
 .../framework/ir/mkldnn/cpu_quantize_pass.h    |   2 +
 .../ir/mkldnn/cpu_quantize_pass_tester.cc      | 152 ++++++++++++------
 .../inference/api/mkldnn_quantizer_config.cc   |   3 +
 .../analyzer_int8_object_detection_tester.cc   |   2 +-
 .../operators/mkldnn/dequantize_mkldnn_op.cc   |   6 +-
 .../operators/mkldnn/transpose_mkldnn_op.cc    |  44 +++--
 paddle/fluid/operators/transpose_op.cc         |  27 +++-
 paddle/fluid/operators/transpose_op.h          |   2 +
 paddle/fluid/platform/mkldnn_reuse.h           |  23 ++-
 .../mkldnn/test_transpose_int8_mkldnn_op.py    |  15 +-
 14 files changed, 296 insertions(+), 76 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 3d3ce56d89d..7d75690f6b7 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -186,7 +186,10 @@ function(op_library TARGET)
       file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
       file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
       file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
-
+    elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op")
+      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n")
+      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n")
+      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, U8);\n")
     else()
       file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
     endif()
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index d10394f4f31..bc0284e3fcd 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1143,6 +1143,27 @@ PDNode *patterns::Conv::operator()() {
   return output_var;
 }
 
+PDNode *patterns::Transpose::operator()() {
+  auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
+
+  auto transpose_op =
+      pattern->NewNode(transpose_op_repr())->assert_is_op("transpose2");
+
+  auto transpose_in = pattern->NewNode(transpose_in_repr())
+                          ->AsInput()
+                          ->assert_is_op_input("transpose2");
+  auto transpose_out = pattern->NewNode(transpose_out_repr())
+                           ->AsOutput()
+                           ->assert_is_op_output("transpose2", "Out");
+
+  auto next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+
+  prev_op->LinksTo({transpose_in});
+  transpose_op->LinksFrom({transpose_in}).LinksTo({transpose_out});
+  next_op->LinksFrom({transpose_out});
+  return transpose_out;
+}
+
 PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
   auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 5fea6523657..e6748cd34cb 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -733,6 +733,21 @@ struct ElementwiseAdd : public PatternBase {
   PATTERN_DECL_NODE(elementwise_add_out);
 };
 
+// Transpose op
+// Forward pass for transpose.
+// transpose_out is a result of the operator.
+struct Transpose : public PatternBase {
+  Transpose(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "transpose2") {}
+
+  PDNode* operator()();
+  PATTERN_DECL_NODE(prev_op);
+  PATTERN_DECL_NODE(transpose_in);
+  PATTERN_DECL_NODE(transpose_op);
+  PATTERN_DECL_NODE(transpose_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
 // Concat op
 // Forward pass for concat.
 // concat_out is a result of the operator.
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index 9cf55ee3254..b743dee8b18 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -351,6 +351,60 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const {
                   quantize_prior_box_count);
 }
 
+void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::Transpose transpose_pattern{pattern, name_scope_};
+  transpose_pattern();
+
+  int quantize_transpose_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize transpose op";
+    GET_IR_NODE_FROM_SUBGRAPH(transpose_op, transpose_op, transpose_pattern);
+    auto* transpose_op_desc = transpose_op->Op();
+
+    // skip if the op should not be quantized
+    if (!transpose_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      return;
+    }
+    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, transpose_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, transpose_pattern);
+
+    // skip if prev op is not quantized;
+    // in the future we should also check whether next_op is quantized, since
+    // transpose INT8 should be used only between INT8 operators
+    if (!(prev_op->Op()->Type() == "dequantize" ||
+          (prev_op->Op()->GetAttrIfExists<bool>("use_quantizer")))) {
+      return;
+    }
+
+    GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose_out, transpose_out, transpose_pattern);
+
+    // get scales calculated after warmup; they scale variables to MAX=1.0
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+
+    auto input_scale = scales[transpose_in->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[transpose_in->Name()].first;
+    QuantizeInput(g, transpose_op, transpose_in, "X", input_scale,
+                  is_input_unsigned);
+
+    auto output_scale = scales[transpose_out->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[transpose_out->Name()].first;
+    DequantizeOutput(g, transpose_op, transpose_out, "Out", output_scale,
+                     is_output_unsigned);
+
+    ++quantize_transpose_count;
+  };
+
+  gpd(graph, handler);
+  AddStatis(quantize_transpose_count);
+
+  PrettyLogDetail("--- quantized %d transpose ops",
+                  quantize_transpose_count);
+}
+
 void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Quantizing the graph.";
   PADDLE_ENFORCE(graph);
@@ -363,6 +417,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   QuantizePool(graph);
   QuantizeConcat(graph);
   QuantizePriorBox(graph);
+  QuantizeTranspose(graph);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
index ec4db66240c..d1b23227b68 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -52,6 +52,8 @@ class CPUQuantizePass : public FusePassBase {
 
   void QuantizePriorBox(Graph* graph) const;
 
+  void QuantizeTranspose(Graph* graph) const;
+
   void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
                      double scale_to_one, bool is_unsigned,
                      std::string scale_attr_name = "") const;
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 947f4fd37a9..00208c9bba2 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -50,7 +50,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetAttr("Scale_in", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
     op->SetAttr("Scale_weights", std::vector<float>{1.0f});
-  } else if (type == "pool2d") {
+  } else if (type == "pool2d" || type == "transpose2") {
     op->SetInput("X", {inputs[0]});
     op->SetOutput("Out", {outputs[0]});
     op->SetAttr("use_quantizer", use_quantizer);
@@ -115,19 +115,14 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
   tensor->mutable_data(place, proto::VarType::FP32, 1);
 }
 
-void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
-              int quant_count, int dequant_count, int added_nodes_count,
-              float scale) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  // Init scope, as it is used in pass
+void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
+                 const std::initializer_list<std::string> variable_names,
+                 int* original_nodes_num, int* current_nodes_num) {
   auto place = paddle::platform::CPUPlace();
   NaiveExecutor exe{place};
   Scope scope;
   exe.CreateVariables(prog, 0, true, &scope);
 
-  auto* scales = new VarQuantScale();
-
+  auto* scales = new VarQuantScale();
   for (auto& v : variable_names) {
     InitTensorHolder(&scope, place, v.c_str());
     LoDTensor tensor;
@@ -138,16 +133,23 @@ void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
     (*scales)[v] = std::make_pair(false, std::move(tensor));
   }
 
-  graph->SetNotOwned(kParamScopeAttr, &scope);
-
-  auto pass = PassRegistry::Instance().Get("cpu_quantize_pass");
+  (*graph)->SetNotOwned(kParamScopeAttr, &scope);
+  std::unique_ptr<Pass> pass =
+      PassRegistry::Instance().Get("cpu_quantize_pass");
   pass->Set("quant_var_scales", scales);
 
-  int original_nodes_num = graph->Nodes().size();
-
-  graph.reset(pass->Apply(graph.release()));
+  *original_nodes_num = (*graph)->Nodes().size();
+  (*graph).reset(pass->Apply((*graph).release()));
+  *current_nodes_num = (*graph)->Nodes().size();
+}
 
-  int current_nodes_num = graph->Nodes().size();
+void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
+              int quant_count, int dequant_count, int added_nodes_count,
+              float scale) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  int original_nodes_num, current_nodes_num;
+  PreparePass(&graph, prog, variable_names, &original_nodes_num,
+              &current_nodes_num);
 
   int quantize_nodes_count = 0;
   int dequantize_nodes_count = 0;
@@ -234,35 +236,9 @@ ProgramDesc BuildProgramDescConcat() {
 void MainTestConcat(const ProgramDesc& prog, int pool_count, int concat_count,
                     int quant_count, int dequant_count, int added_nodes_count) {
   std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  // Init scope, as it is used in pass
-  auto place = paddle::platform::CPUPlace();
-  NaiveExecutor exe{place};
-  Scope scope;
-  exe.CreateVariables(prog, 0, true, &scope);
-
-  auto* scales = new VarQuantScale();
-
-  for (auto& v : variable_names_concat) {
-    InitTensorHolder(&scope, place, v.c_str());
-    LoDTensor tensor;
-    tensor.Resize({1});
-    auto* ptr = tensor.mutable_data<double>(place);
-    ptr[0] = 2.0;
-
-    (*scales)[v] = std::make_pair(false, std::move(tensor));
-  }
-
-  graph->SetNotOwned(kParamScopeAttr, &scope);
-
-  auto pass = PassRegistry::Instance().Get("cpu_quantize_pass");
-  pass->Set("quant_var_scales", scales);
-
-  int original_nodes_num = graph->Nodes().size();
-
-  graph.reset(pass->Apply(graph.release()));
-
-  int current_nodes_num = graph->Nodes().size();
+  int original_nodes_num, current_nodes_num;
+  PreparePass(&graph, prog, variable_names_concat, &original_nodes_num,
+              &current_nodes_num);
 
   int quantize_nodes_count = 0;
   int dequantize_nodes_count = 0;
@@ -302,9 +278,93 @@ TEST(CpuQuantizePass, concat) {
   MainTestConcat(BuildProgramDescConcat(), pool_count, concat_count,
                  quant_count, dequant_count, added_nodes_count);
 }
-
 }  // namespace
 
+namespace {
+static const std::initializer_list<std::string> variable_names_transpose = {
+    "a", "w1", "b", "c", "w2", "d", "e", "f"};
+
+// a->Conv1->b
+// b->Transpose1->c
+// c->Conv2->d
+// d->Transpose2->e
+// e->Dropout->f
+ProgramDesc BuildProgramDescTranspose() {
+  ProgramDesc prog;
+  for (auto& v : variable_names_transpose) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    if (v.find("w") == 0) {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"b"}, true, true);
+  SetOp(&prog, "transpose2", "Transpose1", {"b"}, {"c"}, true, true);
+  SetOp(&prog, "conv2d", "Conv2", {"c", "w2"}, {"d"}, true, true);
+  SetOp(&prog, "transpose2", "Transpose2", {"d"}, {"e"}, true, true);
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+
+  return prog;
+}
+
+void MainTestTranspose(const ProgramDesc& prog, int conv_count,
+                       int transpose_count, int quant_count, int dequant_count,
+                       int added_nodes_count, float scale) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  int original_nodes_num, current_nodes_num;
+  PreparePass(&graph, prog, variable_names_transpose, &original_nodes_num,
+              &current_nodes_num);
+
+  int quantize_nodes_count = 0;
+  int dequantize_nodes_count = 0;
+  int transpose_nodes_count = 0;
+  int conv_nodes_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "transpose2") {
+        transpose_nodes_count++;
+      } else if (op->Type() == "conv2d") {
+        conv_nodes_count++;
+        auto op_name = boost::get<std::string>(op->GetAttr("name"));
+        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_in")), scale)
+            << "Scale_in for node '" + op_name + "'.";
+        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
+            << "Scale_out for node '" + op_name + "'.";
+        EXPECT_EQ(
+            boost::get<std::vector<float>>(op->GetAttr("Scale_weights"))[0],
+            scale)
+            << "Scale_weights for node '" + op_name + "'.";
+      } else if (op->Type() == "quantize") {
+        quantize_nodes_count++;
+      } else if (op->Type() == "dequantize") {
+        dequantize_nodes_count++;
+      }
+    }
+  }
+  EXPECT_EQ(transpose_nodes_count, transpose_count);
+  EXPECT_EQ(conv_nodes_count, conv_count);
+  EXPECT_EQ(quantize_nodes_count, quant_count);
+  EXPECT_EQ(dequantize_nodes_count, dequant_count);
+  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
+}
+
+TEST(CpuQuantizePass, transpose) {
+  // a1->Quant->a2->Conv1->b1->Dequant->b2
+  // b2->Quant->b3->Transpose->c1->Dequant->c2
+  // c2->Quant->c3->Conv2->d1->Dequant->d2
+  // d2->Quant->d3->Transpose->e1->Dequant->e2
+  // e2->Dropout->f
+  int conv_count = 2;
+  int transpose_count = 2;
+  int quant_count = 4;
+  int dequant_count = 4;
+  // 4 Quant + 4 IN + 4 DeQuant + 4 OUT
+  int added_nodes_count = 16;
+  MainTestTranspose(BuildProgramDescTranspose(), conv_count, transpose_count,
+                    quant_count, dequant_count, added_nodes_count, 2.0f * 127);
+}
+}  // namespace
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
index c2b2ba0b60a..b7d6c87fd99 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
@@ -34,6 +34,9 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
   rules_["prior_box"]["Image"] = ScaleAlgo::NONE;
   rules_["prior_box"]["Boxes"] = ScaleAlgo::NONE;
   rules_["prior_box"]["Variances"] = ScaleAlgo::NONE;
+
+  rules_["transpose2"]["X"] = ScaleAlgo::KL;
+  rules_["transpose2"]["Out"] = ScaleAlgo::NONE;
 }
 
 ScaleAlgo MkldnnQuantizerConfig::scale_algo(
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
index 72da7c48b25..b13e454876f 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
@@ -269,7 +269,7 @@ TEST(Analyzer_int8_mobilenet_ssd, quantization) {
   q_cfg.EnableMkldnnQuantizer();
   q_cfg.mkldnn_quantizer_config();
   std::unordered_set<std::string> quantize_operators(
-      {"conv2d", "depthwise_conv2d", "prior_box"});
+      {"conv2d", "depthwise_conv2d", "prior_box", "transpose2"});
   q_cfg.mkldnn_quantizer_config()->SetEnabledOpTypes(quantize_operators);
   q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
   q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size);
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 4353c621365..1d451915a15 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -75,8 +75,10 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
     std::shared_ptr<primitive::at> src_memory_p =
         std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
 
-    auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32,
-                                          MKLDNNMemoryFormat::nchw);
+    auto dst_md = platform::MKLDNNMemDesc(
+        {dst_tz}, memory::data_type::f32,
+        platform::MKLDNNFormatForSize(dst_tz.size(), memory::format::nchw));
+
     auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
     dst_memory = std::make_shared<mkldnn::memory>(
         dst_pd, to_void_cast<float>(output_data));
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index a091122c5c1..6c4206f9103 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
@@ -48,8 +49,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const std::string key =
         platform::CreateKey(nchw_tz, ctx.op().Output("Out"));
ctx.op().Output("Out")); - platform::TransposeMKLDNNHandler handler(nchw_tz, axis, dev_ctx, - mkldnn_engine, key); + platform::TransposeMKLDNNHandler handler(nchw_tz, axis, dev_ctx, + mkldnn_engine, key); auto transpose_src_memory_p = handler.AcquireSrcMemory( input->format(), platform::to_void_cast(input_data)); @@ -77,7 +78,6 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { ctx.Input(framework::GradVarName("Out")); auto* x_grad = ctx.Output(framework::GradVarName("X")); if (!x_grad) return; - auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -101,8 +101,8 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key = platform::CreateKey( nchw_tz, ctx.op().Output(framework::GradVarName("X"))); - platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, - mkldnn_engine, key); + platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, + mkldnn_engine, key); auto transpose_src_memory_p = handler.AcquireSrcMemory( out_grad->format(), platform::to_void_cast(out_grad_data)); @@ -122,11 +122,35 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, - ops::TransposeMKLDNNOpKernel); - -REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, - ops::TransposeMKLDNNOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kTransposeMKLDNNFP32, + ops::TransposeMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, + ::paddle::platform::CPUPlace, U8, + ops::kTransposeMKLDNNINT8, + ops::TransposeMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, + ::paddle::platform::CPUPlace, S8, + ops::kTransposeMKLDNNINT8, + ops::TransposeMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(transpose, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kTransposeMKLDNNFP32, + ops::TransposeMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(transpose, MKLDNN, + ::paddle::platform::CPUPlace, U8, + ops::kTransposeMKLDNNINT8, + ops::TransposeMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(transpose, MKLDNN, + ::paddle::platform::CPUPlace, S8, + ops::kTransposeMKLDNNINT8, + ops::TransposeMKLDNNOpKernel); REGISTER_OP_KERNEL(transpose_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index de22faa2d2d..9444bb44ad7 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -71,16 +71,24 @@ class TransposeOp : public framework::OperatorWithKernel { framework::LibraryType library_{framework::LibraryType::kPlain}; std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; layout_ = framework::DataLayout::kMKLDNN; + using framework::proto::VarType; + auto input_data_type = ctx.Input("X")->type(); + customized_type_value = (input_data_type == VarType::INT8 || + input_data_type == VarType::UINT8) + ? 
+                                  : kTransposeMKLDNNFP32;
     }
 #endif
     return framework::OpKernelType(
         OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
-        layout_, library_);
+        layout_, library_, customized_type_value);
   }
 };
 
@@ -106,6 +114,13 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
         "Defaults to \"NHWC\". Specify the data format of the output data, "
         "the input will be transformed automatically. ")
         .SetDefault("AnyLayout");
+    /* int8 parameters */
+    AddAttr<bool>("use_quantizer",
+                  "(bool, default false) "
+                  "Set to true for operators that should be quantized and use "
+                  "int8 kernel. "
+                  "Only used on CPU.")
+        .SetDefault(false);
   AddComment(R"DOC(
 Transpose Operator.
 
@@ -203,17 +218,25 @@ class Transpose2Op : public TransposeOp {
       const framework::ExecutionContext &ctx) const override {
     framework::LibraryType library_{framework::LibraryType::kPlain};
     std::string data_format = ctx.Attr<std::string>("data_format");
+    int customized_type_value =
+        framework::OpKernelType::kDefaultCustomizedTypeValue;
     framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
 #ifdef PADDLE_WITH_MKLDNN
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
       layout_ = framework::DataLayout::kMKLDNN;
+      using framework::proto::VarType;
+      auto input_data_type = ctx.Input<framework::LoDTensor>("X")->type();
+      customized_type_value = (input_data_type == VarType::INT8 ||
+                               input_data_type == VarType::UINT8)
+                                  ? kTransposeMKLDNNINT8
+                                  : kTransposeMKLDNNFP32;
     }
 #endif
     return framework::OpKernelType(
         OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
-        layout_, library_);
+        layout_, library_, customized_type_value);
   }
 };
diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
index 895d1ce2cca..9ed76d066fd 100644
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -21,6 +21,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+enum { kTransposeMKLDNNFP32 = 1, kTransposeMKLDNNINT8 = 2 };
+
 template <typename DeviceContext, typename T>
 inline void TransCompute(const int dim, const DeviceContext& dev_ctx,
                          const framework::Tensor& in, framework::Tensor* out,
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 024b8ef56fa..6fe8332c6c8 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -633,6 +633,7 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT
 
+template <typename T>
 class TransposeMKLDNNHandler : public MKLDNNHandler {
  public:
   TransposeMKLDNNHandler(std::vector<int>& dims,  // NOLINT
@@ -655,9 +656,10 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
       for (size_t i = 0; i < logical_axis_.size(); ++i) {
         logical_axis_[i] = i;
       }
-      auto src_md = fmt != MKLDNNMemoryFormat::nchw
+
+      auto src_md = fmt != mkldnn::memory::format::nchw
                         ? platform::MKLDNNMemDesc(
-                              dims_, platform::MKLDNNGetDataType<float>(), fmt)
+                              dims_, platform::MKLDNNGetDataType<T>(), fmt)
                         : Axis2MemoryDesc(dims_, logical_axis_);
       mem_p = std::make_shared<mkldnn::memory>(
           mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
@@ -677,12 +679,12 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
       auto dst_mdp = mkldnn::memory::primitive_desc{
           Axis2MemoryDesc(dims_, axis_), engine_};
 
-      auto dst_data = output->mutable_data<float>(place, dst_mdp.get_size());
+      auto dst_data = output->mutable_data<T>(place, dst_mdp.get_size());
 
       mem_p = std::make_shared<mkldnn::memory>(dst_mdp, dst_data);
       dev_ctx_.SetBlob(local_key, mem_p);
     } else {
-      auto dst_data = output->mutable_data<float>(place);
+      auto dst_data = output->mutable_data<T>(place);
       mem_p->set_data_handle(dst_data);
     }
     return mem_p;
@@ -703,9 +705,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
   }
 
  protected:
-  mkldnn_memory_desc_t Axis2MemoryDesc(std::vector<int>& nchw_tz,  // NOLINT
-                                       std::vector<int>& axis      // NOLINT
-                                       ) {
+  mkldnn_memory_desc_t Axis2MemoryDesc(
+      const std::vector<int>& nchw_tz,  // NOLINT
+      const std::vector<int>& axis) {
     mkldnn_memory_desc_t mem_fmt;
 
     mem_fmt.primitive_kind = mkldnn_memory;
@@ -714,7 +716,12 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
       mem_fmt.dims[i] = nchw_tz[i];  // logical dimensions (nchw format,
                                      // regardless physical layout)
     }
-    mem_fmt.data_type = mkldnn_f32;
+    if (platform::MKLDNNGetDataType<T>() == mkldnn::memory::data_type::s8)
+      mem_fmt.data_type = mkldnn_s8;
+    else if (platform::MKLDNNGetDataType<T>() == mkldnn::memory::data_type::u8)
+      mem_fmt.data_type = mkldnn_u8;
+    else
+      mem_fmt.data_type = mkldnn_f32;
     mem_fmt.format = mkldnn_blocked;
 
     unsigned int total_stride = 1;
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
index a8127bcc781..3a7f620a100 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
 from mkldnn_op_test import format_reorder
 
@@ -26,10 +27,11 @@ class TestTransposeOp(OpTest):
         self.initTestCase()
         self.initInputData()
         self.use_mkldnn = True
+        self._cpu_only = True
         self.axis = (0, 2, 3, 1)
 
         self.inputs = {
-            'X': format_reorder(self.input_data, self.shape)
+            'X': format_reorder(self.input_data, self.shape).astype(np.int8)
         }  #transform data format to 'NHWC' for INT8 transpose specially.
         self.attrs = {
@@ -38,7 +40,7 @@
         }
 
         self.outputs = {
-            'XShape': np.random.random(self.shape).astype('int8'),
+            'XShape': np.random.random(self.shape).astype(np.int8),
             'Out': self.inputs['X'].transpose(self.axis)
         }
 
@@ -46,14 +48,15 @@
         self.op_type = "transpose2"
 
     def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
+        self.check_output_with_place(
+            core.CPUPlace(), 1e-5, no_check_set=['XShape'])
 
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
 
     def initInputData(self):
         self.input_data = (
-            np.random.randint(0, 100, self.shape) - 50).astype('int8')
+            np.random.randint(0, 100, self.shape) - 50).astype(np.int8)
 
 
 class TestINT8Case(TestTransposeOp):
@@ -62,7 +65,7 @@ class TestINT8Case(TestTransposeOp):
 
     def initInputData(self):
         self.input_data = (
-            np.random.randint(0, 100, self.shape) - 50).astype('int8')
+            np.random.randint(0, 100, self.shape) - 50).astype(np.int8)
 
 
 class TestUINT8Case(TestTransposeOp):
@@ -71,7 +74,7 @@ class TestUINT8Case(TestTransposeOp):
 
     def initDataType(self):
         self.input_data = (np.random.randint(0, 100,
-                                             self.shape)).astype('uint8')
+                                             self.shape)).astype(np.uint8)
 
 
 if __name__ == '__main__':
-- 
GitLab
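
Usage note: the analyzer_int8_object_detection_tester.cc change above is also the
recipe for enabling this feature from the C++ inference API. A minimal sketch,
assuming an AnalysisConfig whose model path and warmup data are configured
elsewhere (the helper name below is illustrative, not part of the patch):

    #include <string>
    #include <unordered_set>
    #include "paddle/fluid/inference/api/paddle_analysis_config.h"

    // Turn on MKL-DNN INT8 quantization and include the newly supported
    // transpose2 op in the set of quantizable operators.
    void EnableInt8WithTranspose(paddle::AnalysisConfig* cfg) {
      cfg->EnableMKLDNN();
      cfg->EnableMkldnnQuantizer();
      std::unordered_set<std::string> quantize_operators(
          {"conv2d", "depthwise_conv2d", "prior_box", "transpose2"});
      cfg->mkldnn_quantizer_config()->SetEnabledOpTypes(quantize_operators);
    }

Per the rules added in mkldnn_quantizer_config.cc, the quantizer then computes
the transpose2 input scale with the KL algorithm and uses ScaleAlgo::NONE for
its output.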