diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index e6abde83498f8437550bcba2c64bd82553e83eaa..2a72642b17d23b590ec5a35d7b9680f740f1ec21 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2645,6 +2645,20 @@ PDNode *patterns::MultiGruSeq::operator()() {
   return h2;
 }
 
+PDNode *patterns::MultiGru::operator()() {
+  auto x = pattern->NewNode(x_repr())->AsInput()->assert_is_op_input(
+      "multi_gru", "X");
+  auto gru = pattern->NewNode(gru_repr())->assert_is_op("multi_gru");
+  auto wx = pattern->NewNode(wx_repr())->AsInput()->assert_is_op_nth_input(
+      "multi_gru", "WeightX", 0);
+  auto wh = pattern->NewNode(wh_repr())->AsInput()->assert_is_op_nth_input(
+      "multi_gru", "WeightH", 0);
+  auto h = pattern->NewNode(h_repr())->AsOutput()->assert_is_op_output(
+      "multi_gru", "Hidden");
+  gru->LinksFrom({x, wx, wh}).LinksTo({h});
+  return h;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 491e896db483e9174bed515dc8d7f1c5926168d4..a1e7435523c6ce7ddaba2a3e178eeedf7f46264e 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1490,6 +1490,21 @@ struct MultiGruSeq : public PatternBase {
   PATTERN_DECL_NODE(h2);
 };
 
+// multi_gru op
+// Quantization pass for multi_gru op.
+// Hidden of the multi_gru op is a result of the operator().
+struct MultiGru : public PatternBase {
+  MultiGru(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "multi_gru") {}
+
+  PDNode* operator()();
+  PATTERN_DECL_NODE(x);
+  PATTERN_DECL_NODE(gru);
+  PATTERN_DECL_NODE(wx);
+  PATTERN_DECL_NODE(wh);
+  PATTERN_DECL_NODE(h);
+};
+
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
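For orientation: a pattern declared this way is consumed by instantiating it on a GraphPatternDetector, calling its operator() to build the subgraph description, and then reading the matched nodes back out of each match inside a handler. That is exactly what the new CPUQuantizePass::QuantizeMultiGru further down in this patch does; the stripped-down sketch below (handler body illustrative only, RunMultiGruPattern is a hypothetical name) shows the general flow.

```cpp
// Minimal sketch of driving patterns::MultiGru from an IR pass.
// The real quantization logic lives in CPUQuantizePass::QuantizeMultiGru below;
// this handler only logs the match.
void RunMultiGruPattern(ir::Graph* graph, const std::string& name_scope) {
  GraphPatternDetector gpd;
  patterns::MultiGru pattern{gpd.mutable_pattern(), name_scope};
  pattern();  // builds the (X, WeightX, WeightH) -> multi_gru -> Hidden pattern

  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     ir::Graph*) {
    GET_IR_NODE_FROM_SUBGRAPH(gru, gru, pattern);  // the multi_gru op node
    GET_IR_NODE_FROM_SUBGRAPH(x, x, pattern);      // activation input X
    GET_IR_NODE_FROM_SUBGRAPH(h, h, pattern);      // Hidden output
    VLOG(4) << "Matched " << gru->Name() << ": " << x->Name() << " -> "
            << h->Name();
  };
  gpd(graph, handler);
}
```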
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index 58931f3ed38725579c4c04882319baaca3efcc60..c7c4a1cf2384828284c3509867ac5ca499f918f7 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -26,6 +26,8 @@ namespace framework {
 namespace ir {
 
 using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
+using EigenVectorArrayMapFloat =
+    Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
 using string::PrettyLogDetail;
 
 namespace {
@@ -45,9 +47,12 @@ void LogCannotQuantizeOp(Node* op, const char* details = nullptr) {
   PrettyLogDetail(msg_ss.str().c_str());
 }
 
-void LogScaleIsMissingForVar(Node* var) {
-  VLOG(4) << "Quantization scale for the variable " << var->Name()
-          << " is missing.";
+void LogScaleIsMissingForVarName(const std::string& name) {
+  VLOG(4) << "Quantization scale for the variable " << name << " is missing.";
+}
+
+void LogScaleIsMissingForVarNode(Node* node) {
+  LogScaleIsMissingForVarName(node->Name());
 }
 
 void LogQuantizationDisabled(Node* op) {
@@ -202,23 +207,45 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
   if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
 }
 
+bool CPUQuantizePass::AreScalesPresentForVarNames(
+    std::vector<std::string> names) const {
+  auto& scales = Get<VarQuantScale>("quant_var_scales");
+  bool present = true;
+  for (auto name : names) {
+    if (scales.find(name) == scales.end()) {
+      present = false;
+      LogScaleIsMissingForVarName(name);
+    }
+  }
+  return present;
+}
+
 bool CPUQuantizePass::AreScalesPresentForNodes(
-    const Node* op_node, std::initializer_list<Node*> nodes) const {
+    std::initializer_list<Node*> nodes) const {
   auto& scales = Get<VarQuantScale>("quant_var_scales");
   bool present = true;
   for (auto node : nodes) {
     if (scales.count(node->Name()) == 0) {
       present = false;
-      LogScaleIsMissingForVar(node);
+      LogScaleIsMissingForVarNode(node);
     }
   }
   return present;
 }
 
+std::pair<bool, LoDTensor> CPUQuantizePass::GetScaleDataByName(
+    const std::string& name) const {
+  auto& scales = Get<VarQuantScale>("quant_var_scales");
+  return scales.at(name);
+}
+
 std::pair<bool, LoDTensor> CPUQuantizePass::GetScaleDataForNode(
     const Node* node) const {
-  auto& scales = Get<VarQuantScale>("quant_var_scales");
-  return scales[node->Name()];
+  return GetScaleDataByName(node->Name());
+}
+
+LoDTensor CPUQuantizePass::GetScaleTensorByName(const std::string& name) const {
+  return GetScaleDataByName(name).second;
 }
 
 LoDTensor CPUQuantizePass::GetScaleTensorForNode(const Node* node) const {
@@ -265,7 +292,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
     GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
 
-    auto has_output_scale = AreScalesPresentForNodes(conv_op, {conv_output});
+    auto has_output_scale = AreScalesPresentForNodes({conv_output});
     if (with_residual_data && !has_output_scale) {
       LogCannotQuantizeOp(conv_op,
                           "Conv op with ResidualData input cannot be quantized "
@@ -277,7 +304,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
       GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
                                 conv_pattern);
       if (!AreScalesPresentForNodes(
-              conv_op, {conv_input, conv_filter, conv_residual_data})) {
+              {conv_input, conv_filter, conv_residual_data})) {
         LogCannotQuantizeOp(conv_op);
         return;
       }
@@ -289,7 +316,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
       QuantizeInput(g, conv_op, conv_residual_data, "ResidualData",
                     residual_scale, is_residual_unsigned, "Scale_in_eltwise");
     } else {
-      if (!AreScalesPresentForNodes(conv_op, {conv_input, conv_filter})) {
+      if (!AreScalesPresentForNodes({conv_input, conv_filter})) {
         LogCannotQuantizeOp(conv_op);
         return;
       }
@@ -302,7 +329,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
 
     auto filter_scale_tensor = GetScaleTensorForNode(conv_filter);
     EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data<double>(),
-                                     filter_scale_tensor.numel(), 1};
+                                     filter_scale_tensor.numel()};
     eigen_tensor *= static_cast<double>(S8_MAX);
     std::vector<float> filter_scale{
         filter_scale_tensor.data<double>(),
@@ -372,7 +399,7 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(input, input, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern);
 
-    if (!AreScalesPresentForNodes(fc, {input, weights})) {
+    if (!AreScalesPresentForNodes({input, weights})) {
      LogCannotQuantizeOp(fc);
       return;
     }
@@ -384,7 +411,7 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
 
     auto weight_scale_tensor = GetScaleTensorForNode(weights);
     EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data<double>(),
-                                     weight_scale_tensor.numel(), 1};
+                                     weight_scale_tensor.numel()};
     eigen_tensor *= static_cast<double>(S8_MAX);
     std::vector<float> filter_scale{
         weight_scale_tensor.data<double>(),
@@ -393,7 +420,7 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
     fc->Op()->SetAttr("Scale_weights", filter_scale);
 
     // if quantization scale is missing for output tensor, return fp32 data
-    if (AreScalesPresentForNodes(fc, {output})) {
+    if (AreScalesPresentForNodes({output})) {
       bool is_output_unsigned{false};
       auto output_scale = GetScaleValueForNode(output, &is_output_unsigned);
       DequantizeOutput(g, fc, output, "Out", output_scale, is_output_unsigned,
@@ -434,7 +461,7 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
 
-    if (!AreScalesPresentForNodes(pool_op, {pool_input, pool_output})) {
+    if (!AreScalesPresentForNodes({pool_input, pool_output})) {
       LogCannotQuantizeOp(pool_op);
       return;
     }
@@ -477,7 +504,7 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const {
 
     GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern);
 
-    if (!AreScalesPresentForNodes(concat_op, {concat_out})) {
+    if (!AreScalesPresentForNodes({concat_out})) {
       LogCannotQuantizeOp(concat_op);
       return;
     }
@@ -523,7 +550,7 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(prior_box_input, prior_box_input,
                               prior_box_pattern);
 
-    if (!AreScalesPresentForNodes(prior_box_op, {prior_box_input})) {
+    if (!AreScalesPresentForNodes({prior_box_input})) {
       LogCannotQuantizeOp(prior_box_op);
       return;
     }
@@ -571,8 +598,7 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(transpose_out, transpose_out, transpose_pattern);
 
-    if (!AreScalesPresentForNodes(transpose_op,
-                                  {transpose_in, transpose_out})) {
+    if (!AreScalesPresentForNodes({transpose_in, transpose_out})) {
       LogCannotQuantizeOp(transpose_op);
       return;
     }
@@ -626,7 +652,7 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(reshape_in, reshape_in, reshape_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(reshape_out, reshape_out, reshape_pattern);
 
-    if (!AreScalesPresentForNodes(reshape_op, {reshape_in, reshape_out})) {
+    if (!AreScalesPresentForNodes({reshape_in, reshape_out})) {
       LogCannotQuantizeOp(reshape_op);
       return;
     }
@@ -678,7 +704,7 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern);
 
-    if (!AreScalesPresentForNodes(matmul_op, {matmul_in_x, matmul_in_y})) {
+    if (!AreScalesPresentForNodes({matmul_in_x, matmul_in_y})) {
       LogCannotQuantizeOp(matmul_op);
       return;
     }
@@ -698,7 +724,7 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
                   "Scale_y");
 
     // if quantization scale is missing for output tensor, return fp32 data
-    if (AreScalesPresentForNodes(matmul_op, {matmul_out})) {
+    if (AreScalesPresentForNodes({matmul_out})) {
       bool is_output_unsigned{false};
       auto output_scale = GetScaleValueForNode(matmul_out, &is_output_unsigned);
       DequantizeOutput(g, matmul_op, matmul_out, "Out", output_scale,
@@ -744,8 +770,7 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
                               elementwise_add_pattern);
 
-    if (!AreScalesPresentForNodes(elementwise_add_op,
-                                  {elementwise_add_x, elementwise_add_y})) {
+    if (!AreScalesPresentForNodes({elementwise_add_x, elementwise_add_y})) {
       LogCannotQuantizeOp(elementwise_add_op);
       return;
     }
@@ -769,7 +794,7 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
                   is_y_unsigned, "Scale_y");
 
     // if quantization scale is missing for output tensor, return fp32 data
-    if (AreScalesPresentForNodes(elementwise_add_op, {elementwise_add_out})) {
+    if (AreScalesPresentForNodes({elementwise_add_out})) {
       bool is_output_unsigned{false};
       auto output_scale =
           GetScaleValueForNode(elementwise_add_out, &is_output_unsigned);
@@ -810,7 +835,7 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern);
     GET_IR_NODE_FROM_SUBGRAPH(out, out, pattern);
 
-    if (!AreScalesPresentForNodes(op, {x, weight_h, weight_x})) {
+    if (!AreScalesPresentForNodes({x, weight_h, weight_x})) {
       LogCannotQuantizeOp(op);
       return;
     }
@@ -826,7 +851,7 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
 
     auto weight_scale_tensor = GetScaleTensorForNode(weight_x);
     EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data<double>(),
-                                     weight_scale_tensor.numel(), 1};
+                                     weight_scale_tensor.numel()};
     eigen_tensor *= static_cast<double>(S8_MAX);
     std::vector<float> scale_weights{
         weight_scale_tensor.data<double>(),
@@ -844,6 +869,84 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
   PrettyLogDetail("--- quantized %d fusion_gru ops", quantize_count);
 }
 
+void CPUQuantizePass::QuantizeMultiGru(Graph* graph) const {
+  GraphPatternDetector gpd;
+  patterns::MultiGru pattern{gpd.mutable_pattern(), name_scope_};
+  pattern();
+
+  int quantize_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize multi_gru op";
+    GET_IR_NODE_FROM_SUBGRAPH(gru, gru, pattern);
+
+    // skip if should not be quantized
+    if (!platform::HasOpINT8DataType(gru->Op())) {
+      LogQuantizationDisabled(gru);
+      return;
+    }
+
+    GET_IR_NODE_FROM_SUBGRAPH(x, x, pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(wx, wx, pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(h, h, pattern);
+
+    auto wx_names = gru->Op()->Input("WeightX");
+    if (!AreScalesPresentForNodes({x}) ||
+        !AreScalesPresentForVarNames(wx_names)) {
+      LogCannotQuantizeOp(gru);
+      return;
+    }
+
+    bool is_x_unsigned{false};
+    auto input_x_scale = GetScaleValueForNode(x, &is_x_unsigned);
+
+    double input_x_shift{128.};
+    if (is_x_unsigned) input_x_shift = 0.;
+
+    QuantizeInput(g, gru, x, "X", input_x_scale, is_x_unsigned, "Scale_data",
+                  input_x_shift, "Shift_data");
+
+    auto* scope = param_scope();
+    int wx_size = wx_names.size();
+    std::vector<std::string> w_scale_var_names;
+    for (int i = 0; i < wx_size; ++i) {
+      auto scale_tensor_src = GetScaleTensorByName(wx_names[i]);
+      EigenVectorArrayMap eigen_tensor_src{scale_tensor_src.data<double>(),
+                                           scale_tensor_src.numel()};
+
+      VarDesc scale_var_desc(patterns::PDNodeName("multi_gru", "w_scale"));
+
+      scale_var_desc.SetShape(framework::vectorize(scale_tensor_src.dims()));
+      scale_var_desc.SetDataType(proto::VarType::FP32);
+      scale_var_desc.SetLoDLevel(scale_tensor_src.lod().size());
+      scale_var_desc.SetPersistable(true);
+      auto* w_scale_node = g->CreateVarNode(&scale_var_desc);
+
+      auto* w_scale_tensor_dst =
+          scope->Var(w_scale_node->Name())->GetMutable<LoDTensor>();
+      w_scale_tensor_dst->Resize(scale_tensor_src.dims());
+      auto* dst_data =
+          w_scale_tensor_dst->mutable_data<float>(platform::CPUPlace());
+      EigenVectorArrayMapFloat eigen_tensor_dst{dst_data,
+                                                w_scale_tensor_dst->numel()};
+      eigen_tensor_dst =
+          eigen_tensor_src.cast<float>() * static_cast<float>(S8_MAX);
+      w_scale_var_names.push_back(w_scale_node->Name());
+      IR_NODE_LINK_TO(w_scale_node, gru);
+    }
+
+    gru->Op()->SetInput("Scale_weights", w_scale_var_names);
+    // return fp32 data
+    gru->Op()->SetAttr("force_fp32_output", true);
+
+    ++quantize_count;
+  };
+  gpd(graph, handler);
+  AddStatis(quantize_count);
+
+  PrettyLogDetail("--- quantized %d multi_gru ops", quantize_count);
+}
+
 void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Quantizing the graph.";
   PADDLE_ENFORCE_NOT_NULL(
@@ -864,6 +967,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   QuantizeMatmul(graph);
   QuantizeElementwiseAdd(graph);
   QuantizeFusionGru(graph);
+  QuantizeMultiGru(graph);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
index 0d4c4249010817da42dd3a3762c49144b14bdb35..896b31c154710cf25645e66e45248771fa31f2ba 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <utility>
+#include <vector>
 
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -58,6 +59,7 @@ class CPUQuantizePass : public FusePassBase {
   void QuantizeMatmul(Graph* graph) const;
   void QuantizeElementwiseAdd(Graph* graph) const;
   void QuantizeFusionGru(Graph* graph) const;
+  void QuantizeMultiGru(Graph* graph) const;
 
   void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
                      double scale_to_one, bool is_input_unsigned,
@@ -75,10 +77,14 @@ class CPUQuantizePass : public FusePassBase {
                         bool is_unsigned,
                         std::string scale_attr_name = "") const;
 
-  bool AreScalesPresentForNodes(const Node* op_node,
-                                std::initializer_list<Node*> nodes) const;
+  bool AreScalesPresentForVarNames(std::vector<std::string> names) const;
+  bool AreScalesPresentForNodes(std::initializer_list<Node*> nodes) const;
+  std::pair<bool, LoDTensor> GetScaleDataByName(const std::string& name) const;
   std::pair<bool, LoDTensor> GetScaleDataForNode(const Node* node) const;
+  LoDTensor GetScaleTensorByName(const std::string& name) const;
   LoDTensor GetScaleTensorForNode(const Node* node) const;
+  double GetScaleValueByName(const std::string& name,
+                             bool* is_unsigned = nullptr) const;
   double GetScaleValueForNode(const Node* node,
                               bool* is_unsigned = nullptr) const;
   bool IsOpDequantized(const Node* node) const;
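One note on the header hunk above: it declares a new GetScaleValueByName() next to the existing GetScaleValueForNode(), but the corresponding definition is not part of the .cc hunks quoted in this patch. Presumably it follows the same ByName/ForNode split as the other helpers; the following is only a hedged sketch of what that pair likely looks like, not the PR's actual code.

```cpp
// Hypothetical sketch -- the PR's real definitions sit outside the hunks shown
// above. Assumes the scale tensor stores double values, consistent with the
// EigenVectorArrayMap (double) usage in cpu_quantize_pass.cc.
double CPUQuantizePass::GetScaleValueByName(const std::string& name,
                                            bool* is_unsigned) const {
  auto scale_data = GetScaleDataByName(name);
  if (is_unsigned != nullptr) *is_unsigned = scale_data.first;
  return scale_data.second.data<double>()[0];
}

double CPUQuantizePass::GetScaleValueForNode(const Node* node,
                                             bool* is_unsigned) const {
  return GetScaleValueByName(node->Name(), is_unsigned);
}
```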
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 65be404dfef2f9d3c16518cd73a47d2f499071be..adb431fdb097f54d5d476065aa6915b66ac71299 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -112,7 +112,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
 }
 
 void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
-                 const std::initializer_list<std::string> variable_names,
+                 const std::vector<std::string> variable_names,
                  int* original_nodes_num, int* current_nodes_num,
                  std::string var_without_scale = "",
                  std::string var_signed = "") {
@@ -402,7 +402,7 @@ TEST(CpuQuantizePass, transpose) {
 static const std::initializer_list<std::string> variable_names_fusion_gru = {
     "x", "wx", "wh", "b", "h"};
 
-// x->Fusion_gru->h
+// (x, wx, wh, b)->Fusion_gru->h
 ProgramDesc BuildProgramDescFusionGru() {
   ProgramDesc prog;
   for (auto& v : variable_names_transpose) {
@@ -460,7 +460,7 @@ void MainTestFusionGru(const ProgramDesc& prog, int gru_count, int quant_count,
 }
 
 TEST(CpuQuantizePass, fusion_gru) {
-  // x->Fusion_gru->h
+  // (x, wx, wh, b)->Fusion_gru->h
   int gru_count = 1;
   int quant_count = 1;
   int dequant_count = 0;
@@ -470,6 +470,128 @@ TEST(CpuQuantizePass, fusion_gru) {
                     dequant_count, added_nodes_count, 2. * 127, 128.);
 }
 
+const std::vector<std::string> churn_out_vars(ProgramDesc* prog,
+                                              const std::string& prefix,
+                                              int number) {
+  auto v = std::vector<std::string>();
+  for (int i = 0; i < number; ++i) {
+    auto name = prefix + std::to_string(i);
+    prog->MutableBlock(0)->Var(name);
+    v.push_back(name);
+  }
+  return v;
+}
+
+void create_vars(ProgramDesc* prog,
+                 const std::initializer_list<std::string>& names) {
+  for (auto name : names) prog->MutableBlock(0)->Var(name);
+}
+
+void SetMultiGruOp(ProgramDesc* prog, const std::string x,
+                   const std::vector<std::string> wx,
+                   const std::vector<std::string> wh,
+                   const std::vector<std::string> b, const std::string h,
+                   int layers) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType("multi_gru");
+  op->SetInput("X", {x});
+  op->SetInput("WeightX", wx);
+  op->SetInput("WeightH", wh);
+  op->SetInput("Bias", b);
+  op->SetOutput("Hidden", {h});
+  op->SetAttr("layers", layers);
+  op->SetAttr("origin_mode", false);
+  op->SetAttr("use_mkldnn", true);
+  op->SetAttr("name", std::string("Multi_gru"));
+  op->SetAttr("mkldnn_data_type", std::string("int8"));
+  op->SetAttr("Scale_data", 1.0f);
+  op->SetAttr("Shift_data", 0.0f);
+}
+
+void MainTestMultiGru(int layers) {
+  ProgramDesc prog;
+
+  // Create variables
+  create_vars(&prog, {"x", "h"});
+  const std::vector<std::string> wx = churn_out_vars(&prog, "wx", 2 * layers);
+  const std::vector<std::string> wh = churn_out_vars(&prog, "wh", 2 * layers);
+  const std::vector<std::string> b = churn_out_vars(&prog, "b", 2 * layers);
+
+  std::vector<std::string> all_vars;
+  all_vars.reserve(wx.size() + wh.size() + b.size() + 2);
+  all_vars.insert(all_vars.end(), wx.begin(), wx.end());
+  all_vars.insert(all_vars.end(), wh.begin(), wh.end());
+  all_vars.insert(all_vars.end(), b.begin(), b.end());
+  all_vars.push_back("x");
+  all_vars.push_back("h");
+
+  // Prepare program descriptor
+  SetMultiGruOp(&prog, "x", wx, wh, b, "h", layers);
+
+  // Prepare and run the pass
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  int original_nodes_num, current_nodes_num;
+  PreparePass(&graph, prog, all_vars, &original_nodes_num, &current_nodes_num);
+
+  // Verify graph after quantization
+  float scale = 2 * 127;
+  float shift = 128;
+  int quantize_nodes_count = 0;
+  int dequantize_nodes_count = 0;
+  int multi_gru_nodes_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "multi_gru") {
"multi_gru") { + multi_gru_nodes_count++; + + auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name")); + EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_data")), scale) + << "Scale_data for node '" + op_name + "'."; + EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Shift_data")), shift) + << "Shift_data for node '" + op_name + "'."; + EXPECT_EQ(op->Input("Scale_weights").size(), 2u * layers) + << "Scale_weights for node '" + op_name + "'."; + EXPECT_EQ(BOOST_GET_CONST(bool, op->GetAttr("force_fp32_output")), true) + << "force_fp32_output for node '" + op_name + "'."; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } + } + + int multi_gru_count = 1; + int quant_count = 1; + int quant_out_count = 1; + int dequant_count = 0; + int dequant_out_count = 0; + int scale_weights_count = 2 * layers; + int added_nodes_count = quant_count + quant_out_count + scale_weights_count + + dequant_count + dequant_out_count; + + EXPECT_EQ(multi_gru_nodes_count, multi_gru_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, multi_gru_1) { + int layers = 1; + MainTestMultiGru(layers); +} + +TEST(CpuQuantizePass, multi_gru_2) { + int layers = 2; + MainTestMultiGru(layers); +} + +TEST(CpuQuantizePass, multi_gru_3) { + int layers = 3; + MainTestMultiGru(layers); +} + static const std::initializer_list variable_names_reshape = { "a", "w1", "b", "c", "d", "e", "f"}; diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 45df381b63183f93b350bc7e7aaf6130c01b75c2..98123a474c9bcca43b79b755b898622199fd5c64 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -66,7 +66,7 @@ class Quant2Int8MkldnnPass(object): self._fc_ops = ['fc'] self._relu_ops = ['relu', 'relu6'] self._matmul_ops = ['matmul'] - self._gru_ops = ['fusion_gru'] + self._gru_ops = ['fusion_gru', 'multi_gru'] self._weight_scales = {} # Collect the Input and Output sclaes from Fake quant models self._var_quant_scales = {} @@ -352,6 +352,8 @@ class Quant2Int8MkldnnPass(object): graph = self._apply_pass(graph, 'mul_lstm_fuse_pass') graph = self._apply_pass(graph, 'fc_gru_fuse_pass') graph = self._apply_pass(graph, 'mul_gru_fuse_pass') + graph = self._apply_pass(graph, 'multi_gru_fuse_pass') + graph = self._apply_pass(graph, 'multi_gru_seq_fuse_pass') graph = self._apply_pass(graph, 'seq_concat_fc_fuse_pass') graph = self._apply_pass(graph, 'squared_mat_sub_fuse_pass') graph = self._apply_pass(graph, 'is_test_pass') @@ -450,38 +452,46 @@ class Quant2Int8MkldnnPass(object): self._var_quant_scales[weight_var_name] = (use_unsigned_int, lod_tensor) + def _compute_single_gru_weight_scales(wx_var_name, wh_var_name): + wx = np.array(self._load_param(self._scope, wx_var_name)) + wh = np.array(self._load_param(self._scope, wh_var_name)) + OC = wh.shape[0] + scale_ur = 1.0 / np.max(np.abs( + np.concatenate( + [ + wx[:, :2 * OC], wh.flatten()[:2 * OC * OC].reshape(OC, 2 + * OC) + ], + axis=0)), + axis=0) + scale_o = 1.0 / np.max(np.abs( + np.concatenate( + [ + wx[:, 2 * OC:], wh.flatten()[2 * OC * OC:].reshape(OC, + OC) + ], + axis=0)), + axis=0) + + gru_weights_scale = np.concatenate([scale_ur, 
+                                                scale_o]).astype('float')
+
+            return self._convert_scale2tensor(gru_weights_scale)
+
         def _compute_gru_weight_scales(wx_name, wh_name):
             for op in graph.all_op_nodes():
                 if op.op().type() in self._gru_ops:
-                    wx_var_name = op.input(wx_name)[0]
-                    wh_var_name = op.input(wh_name)[0]
-                    wx = np.array(self._load_param(self._scope, wx_var_name))
-                    wh = np.array(self._load_param(self._scope, wh_var_name))
-                    OC = wh.shape[0]
-                    scale_ur = 1.0 / np.max(np.abs(
-                        np.concatenate(
-                            [
-                                wx[:, :2 * OC], wh.flatten()[:2 * OC * OC]
-                                .reshape(OC, 2 * OC)
-                            ],
-                            axis=0)),
-                                            axis=0)
-                    scale_o = 1.0 / np.max(np.abs(
-                        np.concatenate(
-                            [
-                                wx[:, 2 * OC:], wh.flatten()[2 * OC * OC:]
-                                .reshape(OC, OC)
-                            ],
-                            axis=0)),
-                                           axis=0)
-
-                    gru_weights_scale = np.concatenate(
-                        [scale_ur, scale_o]).astype('float')
-
-                    lod_tensor = self._convert_scale2tensor(gru_weights_scale)
-                    use_unsigned_int = False
-                    self._var_quant_scales[wx_var_name] = (use_unsigned_int,
-                                                           lod_tensor)
+                    assert len(op.input(wx_name)) == len(
+                        op.input(wh_name)
+                    ), 'Mismatch in number of weights inputs ({} for WeightX vs. {} for WeightH).'.format(
+                        len(op.input(wx_name)), len(op.input(wh_name)))
+                    for i, wx_var_name in enumerate(op.input(wx_name)):
+                        wh_var_name = op.input(wh_name)[i]
+                        use_unsigned_int = False
+                        lod_tensor = _compute_single_gru_weight_scales(
+                            wx_var_name, wh_var_name)
+                        self._var_quant_scales[wx_var_name] = (use_unsigned_int,
+                                                               lod_tensor)
 
         _compute_var_scales(self._conv_ops, "Filter", axis=1)
         _compute_var_scales(self._fc_ops, "W", axis=0)
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index 0f05d941a9189f29ad2871b149cc4155b978ec04..c3379a9a573c773d31b196f47338fdd48626dbe9 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -239,7 +239,7 @@ if(LINUX AND WITH_MKLDNN)
     set(QUANT2_GRU_MODEL_ARCHIVE "GRU_quant_acc.tar.gz")
     set(QUANT2_GRU_MODEL_DIR "${QUANT_INSTALL_DIR}/GRU_quant2")
     download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE})
-    set(QUANT2_GRU_OPS_TO_QUANTIZE "fusion_gru")
+    set(QUANT2_GRU_OPS_TO_QUANTIZE "multi_gru")
 
     ### Save FP32 model or INT8 model from Quant model