Unverified commit ce08fdcf, authored by joanna.wozna.intel, committed by GitHub

Add support for INT8 matmul in C-API quantization (#23463)

* Integrate matmul with cpu_quantize_pass

test=develop

* Add matmul checking scales

test=develop

* Change condition of matmul quantization

test=develop

* Remove redundant var

test=develop
Parent c068512f
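For orientation, a minimal sketch of how the pass is exercised end to end (modeled on the tester's PreparePass helper shown further down; the header paths, the VarQuantScale type, and the ownership behavior of Pass::Set are assumptions drawn from the surrounding code, not part of this commit):

#include <memory>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace fw = paddle::framework;

// Applies cpu_quantize_pass to a graph. `scales` maps variable names to
// (is_unsigned, scale-to-one tensor) pairs collected during calibration;
// Pass::Set takes ownership of the pointer (SetNotOwned would not).
void RunCpuQuantizePass(std::unique_ptr<fw::ir::Graph>* graph,
                        fw::ir::VarQuantScale* scales) {
  auto pass = fw::ir::PassRegistry::Instance().Get("cpu_quantize_pass");
  pass->Set("quant_var_scales", scales);
  graph->reset(pass->Apply(graph->release()));
}

In production the scales map is filled by the MKL-DNN quantizer during calibration, using the ScaleAlgo rules extended at the end of this diff.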
@@ -1349,6 +1349,27 @@ PDNode *patterns::Reshape::operator()() {
  return reshape_out;
}
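// Subgraph matched by this pattern (each matmul input must be produced by
// some preceding op):
//
//   prev_op_x --> matmul_in_x (X)
//                                 \
//                                  matmul_op --> matmul_out (Out)
//                                 /
//   prev_op_y --> matmul_in_y (Y)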
PDNode *patterns::Matmul::operator()() {
  auto prev_op_x = pattern->NewNode(prev_op_x_repr())->assert_is_op();
  auto prev_op_y = pattern->NewNode(prev_op_y_repr())->assert_is_op();

  auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul");
  auto matmul_in_x = pattern->NewNode(matmul_in_x_repr())
                         ->AsInput()
                         ->assert_is_op_input("matmul", "X");
  auto matmul_in_y = pattern->NewNode(matmul_in_y_repr())
                         ->AsInput()
                         ->assert_is_op_input("matmul", "Y");
  auto matmul_out = pattern->NewNode(matmul_out_repr())
                        ->AsOutput()
                        ->assert_is_op_output("matmul", "Out");

  prev_op_x->LinksTo({matmul_in_x});
  prev_op_y->LinksTo({matmul_in_y});
  matmul_op->LinksFrom({matmul_in_x, matmul_in_y}).LinksTo({matmul_out});
  return matmul_out;
}
PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
......
@@ -835,6 +835,22 @@ struct Reshape : public PatternBase {
  PATTERN_DECL_NODE(next_op);
};
// Matmul op
// Forward pass for matmul.
// matmul_out is a result of the operator.
struct Matmul : public PatternBase {
  Matmul(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "matmul") {}

  PDNode* operator()();
  PATTERN_DECL_NODE(prev_op_x);
  PATTERN_DECL_NODE(prev_op_y);
  PATTERN_DECL_NODE(matmul_in_x);
  PATTERN_DECL_NODE(matmul_in_y);
  PATTERN_DECL_NODE(matmul_op);
  PATTERN_DECL_NODE(matmul_out);
};
// Concat op
// Forward pass for concat.
// concat_out is a result of the operator.
......
@@ -190,6 +190,16 @@ double CPUQuantizePass::GetScaleValueForNode(const Node* node,
  return scale_data.second.data<double>()[0];
}
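// Returns true if the given op is a dequantize op, or is itself an op that
// this pass will quantize (use_quantizer == true).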
bool CPUQuantizePass::IsOpDequantized(const Node* node) const {
  return node->Op()->Type() == "dequantize" ||
         node->Op()->GetAttrIfExists<bool>("use_quantizer");
}
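// Returns true if the given op is a quantize op, or is itself an op that
// this pass will quantize (use_quantizer == true).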
bool CPUQuantizePass::IsOpQuantized(const Node* node) const {
  return node->Op()->Type() == "quantize" ||
         node->Op()->GetAttrIfExists<bool>("use_quantizer");
}
void CPUQuantizePass::QuantizeConv(Graph* graph,
                                   bool with_residual_data) const {
  GraphPatternDetector gpd;
@@ -449,11 +459,8 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, transpose_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, transpose_pattern);

    // skip if prev op and next op are not quantized
    if (!IsOpDequantized(prev_op) && !IsOpQuantized(next_op)) {
      return;
    }
    GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern);
@@ -500,11 +507,8 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, reshape_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, reshape_pattern);

    // skip if prev op and next op are not quantized
    if (!IsOpDequantized(prev_op) && !IsOpQuantized(next_op)) {
      return;
    }
@@ -530,6 +534,59 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
  PrettyLogDetail("--- quantized %d reshape ops", quantize_reshape_count);
}
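// Quantizes matmul ops that have use_quantizer == true and whose inputs both
// come from dequantized producers: a quantize op is inserted before X and Y,
// a dequantize op is inserted after Out, and the chosen scales are stored in
// the Scale_x, Scale_y and Scale_out attributes read by the INT8 kernel.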
void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
  GraphPatternDetector gpd;
  auto pattern = gpd.mutable_pattern();
  patterns::Matmul matmul_pattern{pattern, name_scope_};
  matmul_pattern();

  int quantize_matmul_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "Quantize matmul op";
    GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern);
    auto* matmul_op_desc = matmul_op->Op();

    // skip if the op should not be quantized
    if (!matmul_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
      return;
    }
    GET_IR_NODE_FROM_SUBGRAPH(prev_op_x, prev_op_x, matmul_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(prev_op_y, prev_op_y, matmul_pattern);

    // skip if prev ops are not quantized
    if (!IsOpDequantized(prev_op_x) || !IsOpDequantized(prev_op_y)) {
      return;
    }
    GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern);

    bool is_x_unsigned{false}, is_y_unsigned{false};
    auto input_x_scale = GetScaleValueForNode(matmul_in_x, &is_x_unsigned);
    auto input_y_scale = GetScaleValueForNode(matmul_in_y, &is_y_unsigned);
    PADDLE_ENFORCE_EQ(
        is_x_unsigned, is_y_unsigned,
        platform::errors::InvalidArgument(
            "Matmul inputs should have the same value of is_unsigned"));
    QuantizeInput(g, matmul_op, matmul_in_x, "X", input_x_scale, is_x_unsigned,
                  "Scale_x");
    QuantizeInput(g, matmul_op, matmul_in_y, "Y", input_y_scale, is_y_unsigned,
                  "Scale_y");

    bool is_output_unsigned{false};
    auto output_scale = GetScaleValueForNode(matmul_out, &is_output_unsigned);
    DequantizeOutput(g, matmul_op, matmul_out, "Out", output_scale,
                     is_output_unsigned, "Scale_out");

    ++quantize_matmul_count;
  };
  gpd(graph, handler);
  AddStatis(quantize_matmul_count);

  PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count);
}
void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
  VLOG(3) << "Quantizing the graph.";
  PADDLE_ENFORCE(graph);
@@ -545,6 +602,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
  QuantizeTranspose(graph);
  QuantizeFc(graph);
  QuantizeReshape(graph);
  QuantizeMatmul(graph);
}
}  // namespace ir
......
@@ -58,6 +58,8 @@ class CPUQuantizePass : public FusePassBase {
  void QuantizeReshape(Graph* graph) const;

  void QuantizeMatmul(Graph* graph) const;

  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
                     double scale_to_one, bool is_unsigned,
                     std::string scale_attr_name = "") const;
@@ -76,6 +78,8 @@ class CPUQuantizePass : public FusePassBase {
  LoDTensor GetScaleTensorForNode(const Node* node) const;
  double GetScaleValueForNode(const Node* node,
                              bool* is_unsigned = nullptr) const;
  bool IsOpDequantized(const Node* node) const;
  bool IsOpQuantized(const Node* node) const;

  const std::string name_scope_{"quantize"};
};
......
@@ -74,6 +74,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    op->SetInput("Input", {inputs[0]});
    op->SetOutput("Output", {outputs[0]});
    op->SetAttr("Scale", 1.0f);
} else if (type == "matmul") {
op->SetInput("X", {inputs[0]});
if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("use_quantizer", use_quantizer);
op->SetAttr("Scale_x", 1.0f);
op->SetAttr("Scale_y", 1.0f);
op->SetAttr("Scale_out", 1.0f);
} }
} }
@@ -513,6 +521,89 @@ TEST(CPUQuantizePass, check_scales) {
  MainTestCheckScales(BuildProgramDescCheckScalesConv(), var_names, "a");
}
static const std::initializer_list<std::string> variable_names_matmul = {
    "a", "b", "c", "d", "e", "f"};

ProgramDesc BuildProgramDescMatmul() {
  ProgramDesc prog;
  for (auto& v : variable_names_matmul) {
    prog.MutableBlock(0)->Var(v);
  }
  SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
  SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);

  return prog;
}

ProgramDesc BuildProgramDescMatmulNotQuantized() {
  ProgramDesc prog;
  for (auto& v : variable_names_matmul) {
    prog.MutableBlock(0)->Var(v);
  }
  SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false);
  SetOp(&prog, "dequantize", "Dequantize", {"c"}, {"d"}, true);
  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);

  return prog;
}
void MainTestMatmul(const ProgramDesc& prog, int matmul_count, int quant_count,
                    int dequant_count, int added_nodes_count, float scale) {
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  int original_nodes_num, current_nodes_num;
  PreparePass(&graph, prog, variable_names_matmul, &original_nodes_num,
              &current_nodes_num);

  int quantize_nodes_count = 0;
  int dequantize_nodes_count = 0;
  int matmul_nodes_count = 0;
  for (auto* node : graph->Nodes()) {
    if (node->IsOp()) {
      auto* op = node->Op();
      if (op->Type() == "matmul") {
        matmul_nodes_count++;
        auto op_name = boost::get<std::string>(op->GetAttr("name"));
        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_x")), scale)
            << "Scale_x for node '" + op_name + "'.";
        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_y")), scale)
            << "Scale_y for node '" + op_name + "'.";
        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
            << "Scale_out for node '" + op_name + "'.";
      } else if (op->Type() == "quantize") {
        quantize_nodes_count++;
      } else if (op->Type() == "dequantize") {
        dequantize_nodes_count++;
      }
    }
  }
  EXPECT_EQ(matmul_nodes_count, matmul_count);
  EXPECT_EQ(quantize_nodes_count, quant_count);
  EXPECT_EQ(dequantize_nodes_count, dequant_count);
  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
}
TEST(CpuQuantizePass, matmul) {
  int matmul_count = 1;
  int quant_count = 2;
  int dequant_count = 3;
  // 2 Quantize ops + 2 quantized input vars + 1 Dequantize op + 1 output var
  int added_nodes_count = 6;
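  // Expected scale attribute (an inference from the surrounding code, not
  // stated in this diff): PreparePass initializes every variable's
  // scale-to-one value at 2.0, and QuantizeInput multiplies it by the signed
  // INT8 maximum (127) for signed tensors, hence 2.0f * 127 below.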
  MainTestMatmul(BuildProgramDescMatmul(), matmul_count, quant_count,
                 dequant_count, added_nodes_count, 2.0f * 127);
}
TEST(CpuQuantizePass, matmul_not_quantized) {
  int matmul_count = 1;
  int quant_count = 0;
  int dequant_count = 1;
  // nothing changes
  int added_nodes_count = 0;
  MainTestMatmul(BuildProgramDescMatmulNotQuantized(), matmul_count,
                 quant_count, dequant_count, added_nodes_count, 1.0f);
}
}  // namespace
}  // namespace ir
......
@@ -45,6 +45,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
  rules_["fc"]["Bias"] = ScaleAlgo::NONE;
  rules_["fc"]["Out"] = ScaleAlgo::KL;
rules_["matmul"]["X"] = ScaleAlgo::KL;
rules_["matmul"]["Y"] = ScaleAlgo::KL;
rules_["matmul"]["Out"] = ScaleAlgo::KL;
  // Reshape2 does not perform calculation on the data, and shapes are not
  // changed. Scale is calculated on input data and assigned to the Quantize
  // and Dequantize scale.
......