Unverified commit 4fd4095d, authored by Wojciech Uss, committed by GitHub

Add quantization of multi_gru op and tests (#28615)

Parent 4adddcc8
@@ -2645,6 +2645,20 @@ PDNode *patterns::MultiGruSeq::operator()() {
return h2;
}
PDNode *patterns::MultiGru::operator()() {
auto x = pattern->NewNode(x_repr())->AsInput()->assert_is_op_input(
"multi_gru", "X");
auto gru = pattern->NewNode(gru_repr())->assert_is_op("multi_gru");
auto wx = pattern->NewNode(wx_repr())->AsInput()->assert_is_op_nth_input(
"multi_gru", "WeightX", 0);
auto wh = pattern->NewNode(wh_repr())->AsInput()->assert_is_op_nth_input(
"multi_gru", "WeightH", 0);
auto h = pattern->NewNode(h_repr())->AsOutput()->assert_is_op_output(
"multi_gru", "Hidden");
gru->LinksFrom({x, wx, wh}).LinksTo({h});
return h;
}
} // namespace ir
} // namespace framework
} // namespace paddle
@@ -1490,6 +1490,21 @@ struct MultiGruSeq : public PatternBase {
PATTERN_DECL_NODE(h2);
};
// multi_gru op
// Quantization pass for multi_gru op.
// The Hidden output of the multi_gru op is returned by operator().
struct MultiGru : public PatternBase {
MultiGru(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "multi_gru") {}
PDNode* operator()();
PATTERN_DECL_NODE(x);
PATTERN_DECL_NODE(gru);
PATTERN_DECL_NODE(wx);
PATTERN_DECL_NODE(wh);
PATTERN_DECL_NODE(h);
};
} // namespace patterns
// Link two ir::Nodes from each other.
...
@@ -26,6 +26,8 @@ namespace framework {
namespace ir {
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
using EigenVectorArrayMapFloat =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using string::PrettyLogDetail;
namespace {
@@ -45,9 +47,12 @@ void LogCannotQuantizeOp(Node* op, const char* details = nullptr) {
PrettyLogDetail(msg_ss.str().c_str());
}
-void LogScaleIsMissingForVar(Node* var) {
-VLOG(4) << "Quantization scale for the variable " << var->Name()
-<< " is missing.";
-}
+void LogScaleIsMissingForVarName(const std::string& name) {
+VLOG(4) << "Quantization scale for the variable " << name << " is missing.";
+}
+void LogScaleIsMissingForVarNode(Node* node) {
+LogScaleIsMissingForVarName(node->Name());
+}
void LogQuantizationDisabled(Node* op) {
@@ -202,23 +207,45 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
}
bool CPUQuantizePass::AreScalesPresentForVarNames(
std::vector<std::string> names) const {
auto& scales = Get<VarQuantScale>("quant_var_scales");
bool present = true;
for (auto name : names) {
if (scales.find(name) == scales.end()) {
present = false;
LogScaleIsMissingForVarName(name);
}
}
return present;
}
bool CPUQuantizePass::AreScalesPresentForNodes(
-const Node* op_node, std::initializer_list<Node*> nodes) const {
+std::initializer_list<Node*> nodes) const {
auto& scales = Get<VarQuantScale>("quant_var_scales");
bool present = true;
for (auto node : nodes) {
if (scales.count(node->Name()) == 0) {
present = false;
-LogScaleIsMissingForVar(node);
+LogScaleIsMissingForVarNode(node);
}
}
return present;
}
std::pair<bool, LoDTensor> CPUQuantizePass::GetScaleDataByName(
const std::string& name) const {
auto& scales = Get<VarQuantScale>("quant_var_scales");
return scales.at(name);
}
std::pair<bool, LoDTensor> CPUQuantizePass::GetScaleDataForNode(
const Node* node) const {
-auto& scales = Get<VarQuantScale>("quant_var_scales");
-return scales[node->Name()];
-}
+return GetScaleDataByName(node->Name());
+}
+LoDTensor CPUQuantizePass::GetScaleTensorByName(const std::string& name) const {
+return GetScaleDataByName(name).second;
+}
LoDTensor CPUQuantizePass::GetScaleTensorForNode(const Node* node) const {
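A note on the lookup change above: the old GetScaleDataForNode indexed the scale map with operator[], which silently default-constructs an entry for a missing key and is unavailable on a const map, while the new name-based helper uses at(), which is const-correct and throws for missing keys. A minimal sketch in plain C++ (std::map stands in for VarQuantScale; all names here are illustrative, not Paddle APIs):

```cpp
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

int main() {
  const std::map<std::string, double> scales{{"x", 0.5}};
  std::cout << scales.at("x") << "\n";  // prints 0.5
  // scales["x"];  // would not compile: operator[] is non-const
  try {
    scales.at("w");  // missing key: at() throws instead of inserting
  } catch (const std::out_of_range&) {
    std::cout << "scale missing for w\n";
  }
  return 0;
}
```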
@@ -265,7 +292,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
-auto has_output_scale = AreScalesPresentForNodes(conv_op, {conv_output});
+auto has_output_scale = AreScalesPresentForNodes({conv_output});
if (with_residual_data && !has_output_scale) {
LogCannotQuantizeOp(conv_op,
"Conv op with ResidualData input cannot be quantized "
@@ -277,7 +304,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
conv_pattern);
if (!AreScalesPresentForNodes(
-conv_op, {conv_input, conv_filter, conv_residual_data})) {
+{conv_input, conv_filter, conv_residual_data})) {
LogCannotQuantizeOp(conv_op);
return;
}
@@ -289,7 +316,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
QuantizeInput(g, conv_op, conv_residual_data, "ResidualData",
residual_scale, is_residual_unsigned, "Scale_in_eltwise");
} else {
-if (!AreScalesPresentForNodes(conv_op, {conv_input, conv_filter})) {
+if (!AreScalesPresentForNodes({conv_input, conv_filter})) {
LogCannotQuantizeOp(conv_op);
return;
}
@@ -302,7 +329,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
auto filter_scale_tensor = GetScaleTensorForNode(conv_filter);
EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data<double>(),
-filter_scale_tensor.numel(), 1};
+filter_scale_tensor.numel()};
eigen_tensor *= static_cast<double>(S8_MAX);
std::vector<float> filter_scale{
filter_scale_tensor.data<double>(),
@@ -372,7 +399,7 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(input, input, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern);
-if (!AreScalesPresentForNodes(fc, {input, weights})) {
+if (!AreScalesPresentForNodes({input, weights})) {
LogCannotQuantizeOp(fc);
return;
}
@@ -384,7 +411,7 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
auto weight_scale_tensor = GetScaleTensorForNode(weights);
EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data<double>(),
-weight_scale_tensor.numel(), 1};
+weight_scale_tensor.numel()};
eigen_tensor *= static_cast<double>(S8_MAX);
std::vector<float> filter_scale{
weight_scale_tensor.data<double>(),
@@ -393,7 +420,7 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
fc->Op()->SetAttr("Scale_weights", filter_scale);
// if quantization scale is missing for output tensor, return fp32 data
-if (AreScalesPresentForNodes(fc, {output})) {
+if (AreScalesPresentForNodes({output})) {
bool is_output_unsigned{false};
auto output_scale = GetScaleValueForNode(output, &is_output_unsigned);
DequantizeOutput(g, fc, output, "Out", output_scale, is_output_unsigned,
@@ -434,7 +461,7 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
-if (!AreScalesPresentForNodes(pool_op, {pool_input, pool_output})) {
+if (!AreScalesPresentForNodes({pool_input, pool_output})) {
LogCannotQuantizeOp(pool_op);
return;
}
@@ -477,7 +504,7 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern);
-if (!AreScalesPresentForNodes(concat_op, {concat_out})) {
+if (!AreScalesPresentForNodes({concat_out})) {
LogCannotQuantizeOp(concat_op);
return;
}
@@ -523,7 +550,7 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(prior_box_input, prior_box_input,
prior_box_pattern);
-if (!AreScalesPresentForNodes(prior_box_op, {prior_box_input})) {
+if (!AreScalesPresentForNodes({prior_box_input})) {
LogCannotQuantizeOp(prior_box_op);
return;
}
@@ -571,8 +598,7 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern);
GET_IR_NODE_FROM_SUBGRAPH(transpose_out, transpose_out, transpose_pattern);
-if (!AreScalesPresentForNodes(transpose_op,
-{transpose_in, transpose_out})) {
+if (!AreScalesPresentForNodes({transpose_in, transpose_out})) {
LogCannotQuantizeOp(transpose_op);
return;
}
@@ -626,7 +652,7 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(reshape_in, reshape_in, reshape_pattern);
GET_IR_NODE_FROM_SUBGRAPH(reshape_out, reshape_out, reshape_pattern);
-if (!AreScalesPresentForNodes(reshape_op, {reshape_in, reshape_out})) {
+if (!AreScalesPresentForNodes({reshape_in, reshape_out})) {
LogCannotQuantizeOp(reshape_op);
return;
}
@@ -678,7 +704,7 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern);
-if (!AreScalesPresentForNodes(matmul_op, {matmul_in_x, matmul_in_y})) {
+if (!AreScalesPresentForNodes({matmul_in_x, matmul_in_y})) {
LogCannotQuantizeOp(matmul_op);
return;
}
@@ -698,7 +724,7 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
"Scale_y");
// if quantization scale is missing for output tensor, return fp32 data
-if (AreScalesPresentForNodes(matmul_op, {matmul_out})) {
+if (AreScalesPresentForNodes({matmul_out})) {
bool is_output_unsigned{false};
auto output_scale = GetScaleValueForNode(matmul_out, &is_output_unsigned);
DequantizeOutput(g, matmul_op, matmul_out, "Out", output_scale,
@@ -744,8 +770,7 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
-if (!AreScalesPresentForNodes(elementwise_add_op,
-{elementwise_add_x, elementwise_add_y})) {
+if (!AreScalesPresentForNodes({elementwise_add_x, elementwise_add_y})) {
LogCannotQuantizeOp(elementwise_add_op);
return;
}
@@ -769,7 +794,7 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
is_y_unsigned, "Scale_y");
// if quantization scale is missing for output tensor, return fp32 data
-if (AreScalesPresentForNodes(elementwise_add_op, {elementwise_add_out})) {
+if (AreScalesPresentForNodes({elementwise_add_out})) {
bool is_output_unsigned{false};
auto output_scale =
GetScaleValueForNode(elementwise_add_out, &is_output_unsigned);
@@ -810,7 +835,7 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern);
GET_IR_NODE_FROM_SUBGRAPH(out, out, pattern);
-if (!AreScalesPresentForNodes(op, {x, weight_h, weight_x})) {
+if (!AreScalesPresentForNodes({x, weight_h, weight_x})) {
LogCannotQuantizeOp(op);
return;
}
@@ -826,7 +851,7 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
auto weight_scale_tensor = GetScaleTensorForNode(weight_x);
EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data<double>(),
-weight_scale_tensor.numel(), 1};
+weight_scale_tensor.numel()};
eigen_tensor *= static_cast<double>(S8_MAX);
std::vector<float> scale_weights{
weight_scale_tensor.data<double>(),
@@ -844,6 +869,84 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
PrettyLogDetail("--- quantized %d fusion_gru ops", quantize_count);
}
void CPUQuantizePass::QuantizeMultiGru(Graph* graph) const {
GraphPatternDetector gpd;
patterns::MultiGru pattern{gpd.mutable_pattern(), name_scope_};
pattern();
int quantize_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Quantize multi_gru op";
GET_IR_NODE_FROM_SUBGRAPH(gru, gru, pattern);
// skip if should not be quantized
if (!platform::HasOpINT8DataType(gru->Op())) {
LogQuantizationDisabled(gru);
return;
}
GET_IR_NODE_FROM_SUBGRAPH(x, x, pattern);
GET_IR_NODE_FROM_SUBGRAPH(wx, wx, pattern);
GET_IR_NODE_FROM_SUBGRAPH(h, h, pattern);
auto wx_names = gru->Op()->Input("WeightX");
if (!AreScalesPresentForNodes({x}) ||
!AreScalesPresentForVarNames(wx_names)) {
LogCannotQuantizeOp(gru);
return;
}
bool is_x_unsigned{false};
auto input_x_scale = GetScaleValueForNode(x, &is_x_unsigned);
double input_x_shift{128.};
if (is_x_unsigned) input_x_shift = 0.;
QuantizeInput(g, gru, x, "X", input_x_scale, is_x_unsigned, "Scale_data",
input_x_shift, "Shift_data");
auto* scope = param_scope();
int wx_size = wx_names.size();
std::vector<std::string> w_scale_var_names;
for (int i = 0; i < wx_size; ++i) {
auto scale_tensor_src = GetScaleTensorByName(wx_names[i]);
EigenVectorArrayMap eigen_tensor_src{scale_tensor_src.data<double>(),
scale_tensor_src.numel()};
VarDesc scale_var_desc(patterns::PDNodeName("multi_gru", "w_scale"));
scale_var_desc.SetShape(framework::vectorize(scale_tensor_src.dims()));
scale_var_desc.SetDataType(proto::VarType::FP32);
scale_var_desc.SetLoDLevel(scale_tensor_src.lod().size());
scale_var_desc.SetPersistable(true);
auto* w_scale_node = g->CreateVarNode(&scale_var_desc);
auto* w_scale_tensor_dst =
scope->Var(w_scale_node->Name())->GetMutable<LoDTensor>();
w_scale_tensor_dst->Resize(scale_tensor_src.dims());
auto* dst_data =
w_scale_tensor_dst->mutable_data<float>(platform::CPUPlace());
EigenVectorArrayMapFloat eigen_tensor_dst{dst_data,
w_scale_tensor_dst->numel()};
eigen_tensor_dst =
eigen_tensor_src.cast<float>() * static_cast<float>(S8_MAX);
w_scale_var_names.push_back(w_scale_node->Name());
IR_NODE_LINK_TO(w_scale_node, gru);
}
gru->Op()->SetInput("Scale_weights", w_scale_var_names);
// return fp32 data
gru->Op()->SetAttr("force_fp32_output", true);
++quantize_count;
};
gpd(graph, handler);
AddStatis(quantize_count);
PrettyLogDetail("--- quantized %d multi_gru ops", quantize_count);
}
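The per-weight loop above maps the double scale tensor and a freshly allocated float tensor as 1-D Eigen arrays, then writes src * S8_MAX into the destination. Below is a self-contained sketch of the same Eigen mechanics outside Paddle (S8_MAX is the int8 maximum, 127; the buffers and values are illustrative):

```cpp
#include <Eigen/Core>
#include <iostream>

using EigenVectorArrayMap =
    Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
using EigenVectorArrayMapFloat =
    Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;

int main() {
  double src_buf[3] = {0.5, 0.25, 1.0};  // e.g. 1/max(|w|) per output channel
  float dst_buf[3];
  EigenVectorArrayMap src{src_buf, 3};       // view, no copy
  EigenVectorArrayMapFloat dst{dst_buf, 3};  // view over the float buffer
  dst = src.cast<float>() * 127.0f;          // cast and scale by S8_MAX
  std::cout << dst.transpose() << "\n";      // 63.5 31.75 127
  return 0;
}
```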
void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "Quantizing the graph.";
PADDLE_ENFORCE_NOT_NULL(
@@ -864,6 +967,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
QuantizeMatmul(graph);
QuantizeElementwiseAdd(graph);
QuantizeFusionGru(graph);
QuantizeMultiGru(graph);
}
} // namespace ir
...
@@ -18,6 +18,7 @@
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -58,6 +59,7 @@ class CPUQuantizePass : public FusePassBase { ...@@ -58,6 +59,7 @@ class CPUQuantizePass : public FusePassBase {
void QuantizeMatmul(Graph* graph) const; void QuantizeMatmul(Graph* graph) const;
void QuantizeElementwiseAdd(Graph* graph) const; void QuantizeElementwiseAdd(Graph* graph) const;
void QuantizeFusionGru(Graph* graph) const; void QuantizeFusionGru(Graph* graph) const;
void QuantizeMultiGru(Graph* graph) const;
void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
double scale_to_one, bool is_input_unsigned,
@@ -75,10 +77,14 @@ class CPUQuantizePass : public FusePassBase {
bool is_unsigned,
std::string scale_attr_name = "") const;
-bool AreScalesPresentForNodes(const Node* op_node,
-std::initializer_list<Node*> nodes) const;
+bool AreScalesPresentForVarNames(std::vector<std::string> names) const;
+bool AreScalesPresentForNodes(std::initializer_list<Node*> nodes) const;
+std::pair<bool, LoDTensor> GetScaleDataByName(const std::string& name) const;
std::pair<bool, LoDTensor> GetScaleDataForNode(const Node* node) const;
+LoDTensor GetScaleTensorByName(const std::string& name) const;
LoDTensor GetScaleTensorForNode(const Node* node) const;
+double GetScaleValueByName(const std::string& name,
+bool* is_unsigned = nullptr) const;
double GetScaleValueForNode(const Node* node,
bool* is_unsigned = nullptr) const;
bool IsOpDequantized(const Node* node) const;
...
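The header now pairs each node-based helper with a name-based one, and the node versions just delegate; that way variables that exist only as names in an op's input list (the multi_gru weight lists) reuse the same lookups. A stripped-down sketch of this delegation pattern (the types here are stand-ins, not Paddle's):

```cpp
#include <iostream>
#include <string>

struct Node {
  std::string name;
  const std::string& Name() const { return name; }
};

struct ScaleLookup {
  double GetScaleValueByName(const std::string& name) const {
    return name == "x" ? 0.25 : 1.0;  // stand-in for the real scale-map lookup
  }
  double GetScaleValueForNode(const Node* node) const {
    return GetScaleValueByName(node->Name());  // single source of truth
  }
};

int main() {
  ScaleLookup lookup;
  Node x{"x"};
  std::cout << lookup.GetScaleValueForNode(&x) << "\n";  // 0.25
  return 0;
}
```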
@@ -112,7 +112,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
}
void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
-const std::initializer_list<std::string> variable_names,
+const std::vector<std::string> variable_names,
int* original_nodes_num, int* current_nodes_num,
std::string var_without_scale = "",
std::string var_signed = "") {
@@ -402,7 +402,7 @@ TEST(CpuQuantizePass, transpose) {
static const std::initializer_list<std::string> variable_names_fusion_gru = {
"x", "wx", "wh", "b", "h"};
-// x->Fusion_gru->h
+// (x, wx, wh, b)->Fusion_gru->h
ProgramDesc BuildProgramDescFusionGru() {
ProgramDesc prog;
for (auto& v : variable_names_transpose) {
@@ -460,7 +460,7 @@ void MainTestFusionGru(const ProgramDesc& prog, int gru_count, int quant_count,
}
TEST(CpuQuantizePass, fusion_gru) {
-// x->Fusion_gru->h
+// (x, wx, wh, b)->Fusion_gru->h
int gru_count = 1;
int quant_count = 1;
int dequant_count = 0;
@@ -470,6 +470,128 @@ TEST(CpuQuantizePass, fusion_gru) {
dequant_count, added_nodes_count, 2. * 127, 128.);
}
const std::vector<std::string> churn_out_vars(ProgramDesc* prog,
const std::string& prefix,
int number) {
auto v = std::vector<std::string>();
for (int i = 0; i < number; ++i) {
auto name = prefix + std::to_string(i);
prog->MutableBlock(0)->Var(name);
v.push_back(name);
}
return v;
}
void create_vars(ProgramDesc* prog,
const std::initializer_list<std::string>& names) {
for (auto name : names) prog->MutableBlock(0)->Var(name);
}
void SetMultiGruOp(ProgramDesc* prog, const std::string x,
const std::vector<std::string> wx,
const std::vector<std::string> wh,
const std::vector<std::string> b, const std::string h,
int layers) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType("multi_gru");
op->SetInput("X", {x});
op->SetInput("WeightX", wx);
op->SetInput("WeightH", wh);
op->SetInput("Bias", b);
op->SetOutput("Hidden", {h});
op->SetAttr("layers", layers);
op->SetAttr("origin_mode", false);
op->SetAttr("use_mkldnn", true);
op->SetAttr("name", std::string("Multi_gru"));
op->SetAttr("mkldnn_data_type", std::string("int8"));
op->SetAttr("Scale_data", 1.0f);
op->SetAttr("Shift_data", 0.0f);
}
void MainTestMultiGru(int layers) {
ProgramDesc prog;
// Create variables
create_vars(&prog, {"x", "h"});
const std::vector<std::string> wx = churn_out_vars(&prog, "wx", 2 * layers);
const std::vector<std::string> wh = churn_out_vars(&prog, "wh", 2 * layers);
const std::vector<std::string> b = churn_out_vars(&prog, "b", 2 * layers);
std::vector<std::string> all_vars;
all_vars.reserve(wx.size() + wh.size() + b.size() + 2);
all_vars.insert(all_vars.end(), wx.begin(), wx.end());
all_vars.insert(all_vars.end(), wh.begin(), wh.end());
all_vars.insert(all_vars.end(), b.begin(), b.end());
all_vars.push_back("x");
all_vars.push_back("h");
// Prepare program descriptor
SetMultiGruOp(&prog, "x", wx, wh, b, "h", layers);
// Prepare and run the pass
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int original_nodes_num, current_nodes_num;
PreparePass(&graph, prog, all_vars, &original_nodes_num, &current_nodes_num);
// Verify graph after quantization
float scale = 2 * 127;
float shift = 128;
int quantize_nodes_count = 0;
int dequantize_nodes_count = 0;
int multi_gru_nodes_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "multi_gru") {
multi_gru_nodes_count++;
auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name"));
EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_data")), scale)
<< "Scale_data for node '" + op_name + "'.";
EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Shift_data")), shift)
<< "Shift_data for node '" + op_name + "'.";
EXPECT_EQ(op->Input("Scale_weights").size(), 2u * layers)
<< "Scale_weights for node '" + op_name + "'.";
EXPECT_EQ(BOOST_GET_CONST(bool, op->GetAttr("force_fp32_output")), true)
<< "force_fp32_output for node '" + op_name + "'.";
} else if (op->Type() == "quantize") {
quantize_nodes_count++;
} else if (op->Type() == "dequantize") {
dequantize_nodes_count++;
}
}
}
int multi_gru_count = 1;
int quant_count = 1;
int quant_out_count = 1;
int dequant_count = 0;
int dequant_out_count = 0;
int scale_weights_count = 2 * layers;
int added_nodes_count = quant_count + quant_out_count + scale_weights_count +
dequant_count + dequant_out_count;
EXPECT_EQ(multi_gru_nodes_count, multi_gru_count);
EXPECT_EQ(quantize_nodes_count, quant_count);
EXPECT_EQ(dequantize_nodes_count, dequant_count);
EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
}
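For reference, the counts checked above follow directly from what QuantizeMultiGru adds to the graph: one quantize op and one quantized-input variable for X, plus 2 * layers persistable Scale_weights variables, so added_nodes_count = 2 + 2 * layers. No dequantize nodes are expected because the pass sets force_fp32_output on the op instead of dequantizing Hidden.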
TEST(CpuQuantizePass, multi_gru_1) {
int layers = 1;
MainTestMultiGru(layers);
}
TEST(CpuQuantizePass, multi_gru_2) {
int layers = 2;
MainTestMultiGru(layers);
}
TEST(CpuQuantizePass, multi_gru_3) {
int layers = 3;
MainTestMultiGru(layers);
}
static const std::initializer_list<std::string> variable_names_reshape = {
"a", "w1", "b", "c", "d", "e", "f"};
...
@@ -66,7 +66,7 @@ class Quant2Int8MkldnnPass(object):
        self._fc_ops = ['fc']
        self._relu_ops = ['relu', 'relu6']
        self._matmul_ops = ['matmul']
-        self._gru_ops = ['fusion_gru']
+        self._gru_ops = ['fusion_gru', 'multi_gru']
        self._weight_scales = {}
        # Collect the Input and Output scales from Fake quant models
        self._var_quant_scales = {}
@@ -352,6 +352,8 @@ class Quant2Int8MkldnnPass(object):
        graph = self._apply_pass(graph, 'mul_lstm_fuse_pass')
        graph = self._apply_pass(graph, 'fc_gru_fuse_pass')
        graph = self._apply_pass(graph, 'mul_gru_fuse_pass')
        graph = self._apply_pass(graph, 'multi_gru_fuse_pass')
        graph = self._apply_pass(graph, 'multi_gru_seq_fuse_pass')
        graph = self._apply_pass(graph, 'seq_concat_fc_fuse_pass')
        graph = self._apply_pass(graph, 'squared_mat_sub_fuse_pass')
        graph = self._apply_pass(graph, 'is_test_pass')
@@ -450,36 +452,44 @@ class Quant2Int8MkldnnPass(object):
            self._var_quant_scales[weight_var_name] = (use_unsigned_int,
                                                       lod_tensor)
-        def _compute_gru_weight_scales(wx_name, wh_name):
-            for op in graph.all_op_nodes():
-                if op.op().type() in self._gru_ops:
-                    wx_var_name = op.input(wx_name)[0]
-                    wh_var_name = op.input(wh_name)[0]
-                    wx = np.array(self._load_param(self._scope, wx_var_name))
-                    wh = np.array(self._load_param(self._scope, wh_var_name))
-                    OC = wh.shape[0]
-                    scale_ur = 1.0 / np.max(
-                        np.abs(np.concatenate(
-                            [wx[:, :2 * OC],
-                             wh.flatten()[:2 * OC * OC].reshape(OC, 2 * OC)],
-                            axis=0)),
-                        axis=0)
-                    scale_o = 1.0 / np.max(
-                        np.abs(np.concatenate(
-                            [wx[:, 2 * OC:],
-                             wh.flatten()[2 * OC * OC:].reshape(OC, OC)],
-                            axis=0)),
-                        axis=0)
-                    gru_weights_scale = np.concatenate(
-                        [scale_ur, scale_o]).astype('float')
-                    lod_tensor = self._convert_scale2tensor(gru_weights_scale)
-                    use_unsigned_int = False
-                    self._var_quant_scales[wx_var_name] = (use_unsigned_int,
-                                                           lod_tensor)
+        def _compute_single_gru_weight_scales(wx_var_name, wh_var_name):
+            wx = np.array(self._load_param(self._scope, wx_var_name))
+            wh = np.array(self._load_param(self._scope, wh_var_name))
+            OC = wh.shape[0]
+            scale_ur = 1.0 / np.max(
+                np.abs(np.concatenate(
+                    [wx[:, :2 * OC],
+                     wh.flatten()[:2 * OC * OC].reshape(OC, 2 * OC)],
+                    axis=0)),
+                axis=0)
+            scale_o = 1.0 / np.max(
+                np.abs(np.concatenate(
+                    [wx[:, 2 * OC:],
+                     wh.flatten()[2 * OC * OC:].reshape(OC, OC)],
+                    axis=0)),
+                axis=0)
+            gru_weights_scale = np.concatenate(
+                [scale_ur, scale_o]).astype('float')
+            return self._convert_scale2tensor(gru_weights_scale)
+
+        def _compute_gru_weight_scales(wx_name, wh_name):
+            for op in graph.all_op_nodes():
+                if op.op().type() in self._gru_ops:
+                    assert len(op.input(wx_name)) == len(op.input(wh_name)), \
+                        'Mismatch in number of weights inputs ({} for ' \
+                        'WeightX vs. {} for WeightH).'.format(
+                            len(op.input(wx_name)), len(op.input(wh_name)))
+                    for i, wx_var_name in enumerate(op.input(wx_name)):
+                        wh_var_name = op.input(wh_name)[i]
+                        use_unsigned_int = False
+                        lod_tensor = _compute_single_gru_weight_scales(
+                            wx_var_name, wh_var_name)
+                        self._var_quant_scales[wx_var_name] = (
+                            use_unsigned_int, lod_tensor)
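Restating _compute_single_gru_weight_scales in equation form (the code implies W_x has 3*OC columns and W_h packs a (OC, 2*OC) update/reset block W_h^{ur} followed by a (OC, OC) output block W_h^{o}):

$$
s^{ur}_j = \Big(\max_i \big| C_{ij} \big|\Big)^{-1},\; C = \begin{bmatrix} W_x[:, {:}2OC] \\ W_h^{ur} \end{bmatrix}, \qquad
s^{o}_j = \Big(\max_i \big| D_{ij} \big|\Big)^{-1},\; D = \begin{bmatrix} W_x[:, 2OC{:}] \\ W_h^{o} \end{bmatrix}
$$

The stored scale tensor is the concatenation $[s^{ur}, s^{o}]$, one entry per weight column (3*OC in total).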
...
@@ -239,7 +239,7 @@ if(LINUX AND WITH_MKLDNN)
set(QUANT2_GRU_MODEL_ARCHIVE "GRU_quant_acc.tar.gz")
set(QUANT2_GRU_MODEL_DIR "${QUANT_INSTALL_DIR}/GRU_quant2")
download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE})
-set(QUANT2_GRU_OPS_TO_QUANTIZE "fusion_gru")
+set(QUANT2_GRU_OPS_TO_QUANTIZE "multi_gru")
### Save FP32 model or INT8 model from Quant model
...