Unverified commit 7ef04da6 authored by joanna.wozna.intel, committed by GitHub

Add fusion_lstm INT8 PTQ (#35334)

* Add fusion_lstm INT8 PTQ

* Correct mkldnn_cache_capacity and enable fc_lstm_fuse_pass only for this test

* Change mkldnn_cache_capacity
Parent: 97798f9a
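For orientation, the PTQ INT8 path added by this commit is driven entirely through AnalysisConfig. Below is a minimal sketch of the configuration the new Python test builds; the helper name is illustrative, and the pass index 5 and cache capacity 0 are the test's own choices, not general defaults.

from paddle.fluid.core import AnalysisConfig

def build_int8_lstm_config(model_path, warmup_data, warmup_batch_size=1,
                           num_threads=1):
    # Sketch of set_config() from the quant2_int8_lstm_model.py test below.
    config = AnalysisConfig(model_path)
    config.disable_gpu()
    config.switch_ir_optim(True)
    config.set_cpu_math_library_num_threads(num_threads)
    # fc_lstm_fuse_pass is still commented out in the default CPU pass list,
    # so the test inserts it manually; it must run before fc_fuse_pass.
    config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass")
    config.enable_mkldnn()
    # 0 means an unlimited oneDNN primitive cache; the Python default of 15
    # can slow down INT8 models.
    config.set_mkldnn_cache_capacity(0)
    # PTQ: collect scales from warmup batches, then quantize the graph.
    config.enable_quantizer()
    config.quantizer_config().set_quant_data(warmup_data)
    config.quantizer_config().set_quant_batch_size(warmup_batch_size)
    return config

# Usage: predictor = create_paddle_predictor(build_int8_lstm_config(...))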
@@ -2249,9 +2249,10 @@ PDNode *patterns::MultipleQuantize::operator()() {
 PDNode *patterns::QuantizePlacement::operator()(
     const std::unordered_set<std::string> &quantize_enabled_op_types) {
   std::unordered_set<std::string> supported_op_types =
-      std::unordered_set<std::string>(
-          {"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d",
-           "prior_box", "reshape2", "transpose2", "fusion_gru", "multi_gru"});
+      std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
+                                       "fc", "matmul", "pool2d", "prior_box",
+                                       "reshape2", "transpose2", "fusion_gru",
+                                       "fusion_lstm", "multi_gru"});
   if (!quantize_enabled_op_types.empty()) {
     supported_op_types = quantize_enabled_op_types;
   }
@@ -2723,6 +2724,26 @@ PDNode *patterns::FusionGru::operator()() {
  return out;
}
PDNode *patterns::FusionLSTM::operator()() {
auto op = pattern->NewNode(op_repr())->assert_is_op("fusion_lstm");
auto x = pattern->NewNode(x_repr())->AsInput()->assert_is_op_input(
"fusion_lstm", "X");
auto weight_h = pattern->NewNode(weight_h_repr())
->AsInput()
->assert_is_op_input("fusion_lstm", "WeightH");
auto weight_x = pattern->NewNode(weight_x_repr())
->AsInput()
->assert_is_op_input("fusion_lstm", "WeightX");
auto hidden = pattern->NewNode(hidden_repr())
->AsOutput()
->assert_is_op_output("fusion_lstm", "Hidden");
auto cell = pattern->NewNode(cell_repr())
->AsOutput()
->assert_is_op_output("fusion_lstm", "Cell");
op->LinksFrom({x, weight_h, weight_x}).LinksTo({hidden, cell});
return hidden;
}
PDNode *patterns::TwoFusionGruConcat::operator()() {
  auto x = pattern->NewNode(x_repr())->AsInput()->assert_is_op_input(
      "fusion_gru", "X");
...
@@ -1562,6 +1562,28 @@ struct FusionGru : public PatternBase {
  PATTERN_DECL_NODE(out);
};
// fusion_lstm op
// Forward pass for fusion_lstm.
// hidden is a result of the operator().
struct FusionLSTM : public PatternBase {
FusionLSTM(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "fusion_lstm") {}
  // TODO(lidanqing): Is it enough to detect fusion_lstm with just these nodes?
PDNode* operator()();
// declare op
PATTERN_DECL_NODE(op);
  // declare inputs
PATTERN_DECL_NODE(x);
PATTERN_DECL_NODE(weight_h);
PATTERN_DECL_NODE(weight_x);
  // declare outputs
PATTERN_DECL_NODE(hidden);
PATTERN_DECL_NODE(cell);
};
// two concatenated fusion_gru ops
// Forward pass for fusion of two concatenated fusion_gru ops.
// concat_out is a result of the operator().
...
@@ -944,6 +944,64 @@ void CPUQuantizePass::QuantizeMultiGru(Graph* graph) const {
  PrettyLogDetail("--- quantized %d multi_gru ops", quantize_count);
}
void CPUQuantizePass::QuantizeFusionLSTM(Graph* graph) const {
GraphPatternDetector gpd;
patterns::FusionLSTM pattern{gpd.mutable_pattern(), name_scope_};
pattern();
int quantize_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Quantize fusion_lstm op";
GET_IR_NODE_FROM_SUBGRAPH(op, op, pattern);
// skip if should not be quantized
if (!platform::HasOpINT8DataType(op->Op())) {
LogQuantizationDisabled(op);
return;
}
GET_IR_NODE_FROM_SUBGRAPH(x, x, pattern);
GET_IR_NODE_FROM_SUBGRAPH(weight_h, weight_h, pattern);
GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern);
GET_IR_NODE_FROM_SUBGRAPH(hidden, hidden, pattern);
GET_IR_NODE_FROM_SUBGRAPH(cell, cell, pattern);
    // Starting from here there may be issues
if (!AreScalesPresentForNodes({x, weight_x})) {
LogCannotQuantizeOp(op);
return;
}
bool is_x_unsigned{false};
auto input_x_scale = GetScaleValueForNode(x, &is_x_unsigned);
double input_x_shift{128.};
if (is_x_unsigned) input_x_shift = 0.;
QuantizeInput(g, op, x, "X", input_x_scale, is_x_unsigned, "Scale_data",
input_x_shift, "Shift_data");
auto weight_scale_tensor = GetScaleTensorForNode(weight_x);
EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data<double>(),
weight_scale_tensor.numel()};
eigen_tensor *= static_cast<double>(S8_MAX);
std::vector<float> scale_weights{
weight_scale_tensor.data<double>(),
weight_scale_tensor.data<double>() + weight_scale_tensor.numel()};
op->Op()->SetAttr("Scale_weights", scale_weights);
// return fp32 data
op->Op()->SetAttr("force_fp32_output", true);
++quantize_count;
};
gpd(graph, handler);
AddStatis(quantize_count);
PrettyLogDetail("--- quantized %d fusion_lstm ops", quantize_count);
}
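A note on the activation handling above: oneDNN's INT8 LSTM consumes unsigned 8-bit input, so when the collected scale says X is signed, the pass passes Shift_data = 128 so the kernel can map the signed range into u8; unsigned inputs need no shift. The numpy snippet below is only a rough illustration of that scale-and-shift idea under those assumptions, not the exact kernel arithmetic.

import numpy as np

S8_MAX = 127.0

def quantize_with_shift(x, scale_to_one, shift):
    # scale_to_one is the 1/max(|x|) factor gathered during warmup
    # (GetScaleValueForNode); shift is 128.0 for signed data and 0.0
    # otherwise, mirroring input_x_shift in QuantizeFusionLSTM above.
    q = np.round(x * scale_to_one * S8_MAX) + shift
    return np.clip(q, 0, 255).astype(np.uint8)

x = np.array([-0.5, 0.0, 0.25, 0.5], dtype=np.float32)
print(quantize_with_shift(x, scale_to_one=1.0 / 0.5, shift=128.0))
# -> [  1 128 192 255]: the signed range lands in the unsigned u8 range.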
void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
  VLOG(3) << "Quantizing the graph.";
  PADDLE_ENFORCE_NOT_NULL(
@@ -965,6 +1023,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
  QuantizeElementwiseAdd(graph);
  QuantizeFusionGru(graph);
  QuantizeMultiGru(graph);
QuantizeFusionLSTM(graph);
}
}  // namespace ir
...
@@ -60,6 +60,7 @@ class CPUQuantizePass : public FusePassBase {
  void QuantizeElementwiseAdd(Graph* graph) const;
  void QuantizeFusionGru(Graph* graph) const;
  void QuantizeMultiGru(Graph* graph) const;
void QuantizeFusionLSTM(Graph* graph) const;
  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
                     double scale_to_one, bool is_input_unsigned,
...
@@ -61,8 +61,8 @@ static void check_tensor(const LoDTensor& tensor) {
                          "Tensor dimension is empty."));
}
-void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForGRUWeights(
-    const paddle::framework::OpDesc* op) {
+void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForRNNWeights(
+    const paddle::framework::OpDesc* op, bool gru) {
  const auto& wx_names = op->Input("WeightX");
  const auto& wh_names = op->Input("WeightH");
  for (size_t i = 0; i < wx_names.size(); ++i) {
@@ -74,14 +74,20 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForGRUWeights(
    check_var(wh_var, wh_name);
    LoDTensor* wx_tensor = wx_var->GetMutable<LoDTensor>();
    LoDTensor* wh_tensor = wh_var->GetMutable<LoDTensor>();
-    scales_[wx_name] = GetMaxChGRUScalingFactor(*wx_tensor, *wh_tensor);
+    if (gru) {
+      scales_[wx_name] = GetMaxChGRUScalingFactor(*wx_tensor, *wh_tensor);
+    } else {
+      scales_[wx_name] = GetMaxChLSTMScalingFactor(*wx_tensor, *wh_tensor);
+    }
  }
}

void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpInputs(
    const paddle::framework::OpDesc* op) {
if (op->Type() == "fusion_gru" || op->Type() == "multi_gru") { if (op->Type() == "fusion_gru" || op->Type() == "multi_gru") {
CalculateScalesForGRUWeights(op); CalculateScalesForRNNWeights(op, true);
} else if (op->Type() == "fusion_lstm") {
CalculateScalesForRNNWeights(op, false);
  }
  for (auto const& input : op->Inputs()) {
    for (const auto& var_name : input.second) {
@@ -464,6 +470,41 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxChGRUScalingFactor(
  return std::make_pair(is_unsigned, scale_tensor);
}
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetMaxChLSTMScalingFactor(
const LoDTensor& wx_tensor, const LoDTensor& wh_tensor) const {
check_tensor(wx_tensor);
check_tensor(wh_tensor);
std::vector<float> scale(wx_tensor.dims()[1]);
for (int row_id = 0; row_id < wx_tensor.dims()[0]; row_id++) {
for (int col_id = 0; col_id < wx_tensor.dims()[1]; col_id++) {
int idx = (row_id * wx_tensor.dims()[1]) + col_id;
auto abs_value = std::abs(wx_tensor.data<float>()[idx]);
if (row_id == 0) {
scale[col_id] = abs_value;
} else {
if (abs_value > scale[col_id]) scale[col_id] = abs_value;
}
}
}
for (int row_id = 0; row_id < wh_tensor.dims()[0]; row_id++) {
for (int col_id = 0; col_id < wh_tensor.dims()[1]; col_id++) {
int idx = (row_id * wh_tensor.dims()[1]) + col_id;
auto abs_value = std::abs(wh_tensor.data<float>()[idx]);
if (abs_value > scale[col_id]) scale[col_id] = abs_value;
}
}
transform(scale.begin(), scale.end(), scale.begin(),
[](float& c) { return 1 / c; });
LoDTensor scale_tensor = CreateScaleTensor(scale.size());
auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
std::copy(scale.begin(), scale.end(), scale_ptr);
bool is_unsigned = false;
return std::make_pair(is_unsigned, scale_tensor);
}
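For reference, GetMaxChLSTMScalingFactor above computes, for each output channel (column), the reciprocal of the largest absolute value found in that column of either WeightX or WeightH; CPUQuantizePass later multiplies these values by S8_MAX (127) to form the Scale_weights attribute. The numpy sketch below reproduces the same per-channel computation; the helper name is illustrative.

import numpy as np

def max_ch_lstm_scaling_factor(wx, wh):
    # scale[j] = 1 / max(max_i |wx[i, j]|, max_i |wh[i, j]|)
    col_max = np.maximum(np.abs(wx).max(axis=0), np.abs(wh).max(axis=0))
    return (1.0 / col_max).astype(np.float64)

wx = np.array([[0.1, -0.3], [0.2, 0.05]], dtype=np.float32)
wh = np.array([[-0.4, 0.2], [0.1, 0.1]], dtype=np.float32)
print(max_ch_lstm_scaling_factor(wx, wh))  # [2.5, 3.3333...]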
std::pair<std::vector<int>, float>
AnalysisPredictor::MkldnnQuantizer::Histogram(
    const framework::LoDTensor& var_tensor, float min_val, float max_val,
...
@@ -69,7 +69,8 @@ class AnalysisPredictor::MkldnnQuantizer {
                           bool is_unsigned);
  void CalculateSingleGRUWeightsScale(const std::string& var_name,
                                      const framework::LoDTensor& var_tensor);
-  void CalculateScalesForGRUWeights(const paddle::framework::OpDesc* op);
+  void CalculateScalesForRNNWeights(const paddle::framework::OpDesc* op,
+                                    bool gru);
  void CalculateScalesForOpOutputs(const paddle::framework::OpDesc* op);
  void CalculateScalesForOpInputs(const paddle::framework::OpDesc* op);
  void PrepareArgument() const;
@@ -91,6 +92,10 @@ class AnalysisPredictor::MkldnnQuantizer {
      const framework::LoDTensor& wx_tensor,
      const framework::LoDTensor& wh_tensor) const;
std::pair<bool, framework::LoDTensor> GetMaxChLSTMScalingFactor(
const framework::LoDTensor& wx_tensor,
const framework::LoDTensor& wh_tensor) const;
  std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
      const framework::LoDTensor& var_tensor, bool is_unsigned) const;
...
@@ -85,6 +85,25 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
  rules_["multi_gru"]["WeightH"] = ScaleAlgo::NONE;  // separately
  rules_["multi_gru"]["Scale_weights"] = ScaleAlgo::NONE;
  rules_["multi_gru"]["Hidden"] = ScaleAlgo::KL;
rules_["fusion_lstm"]["X"] = ScaleAlgo::KL;
rules_["fusion_lstm"]["H0"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["C0"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["Bias"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["WeightX"] =
ScaleAlgo::NONE; // Weights will be handled separately
rules_["fusion_lstm"]["WeightH"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["XX"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["Cell"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["BatchedInput"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["BatchedHidden"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["BatchedCell"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["BatchedGate"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["BatchedCellPreAct"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["ReorderedH0"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["ReorderedC0"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["CheckedCell"] = ScaleAlgo::NONE;
rules_["fusion_lstm"]["Hidden"] = ScaleAlgo::KL;
}

ScaleAlgo MkldnnQuantizerConfig::scale_algo(
...
@@ -62,6 +62,12 @@ class MkldnnQuantizerTest : public testing::Test {
    return mkldnn_quantizer->GetMaxChGRUScalingFactor(wx_tensor, wh_tensor);
  }
std::pair<bool, framework::LoDTensor> GetMaxChLSTMScalingFactor(
const framework::LoDTensor& wx_tensor,
const framework::LoDTensor& wh_tensor) const {
return mkldnn_quantizer->GetMaxChLSTMScalingFactor(wx_tensor, wh_tensor);
}
 protected:
  std::unique_ptr<PaddlePredictor> predictor;
  std::unique_ptr<AnalysisPredictor::MkldnnQuantizer> mkldnn_quantizer;
@@ -297,4 +303,33 @@ TEST_F(MkldnnQuantizerTest, max_ch_gru_scaling_factor) {
    ASSERT_NEAR(lod_tensor.data<double>()[i], scales[i], abs_error);
  }
}
TEST_F(MkldnnQuantizerTest, max_ch_lstm_scaling_factor) {
framework::LoDTensor wx_tensor, wh_tensor, lod_tensor;
wx_tensor.Resize(framework::make_dim(wx.size(), wx[0].size()));
for (size_t i = 0; i < wx.size(); i++)
std::copy(
begin(wx[i]), end(wx[i]),
wx_tensor.mutable_data<float>(platform::CPUPlace()) + i * wx[0].size());
wh_tensor.Resize(framework::make_dim(wh.size(), wh[0].size()));
for (size_t i = 0; i < wh.size(); i++)
std::copy(
begin(wh[i]), end(wh[i]),
wh_tensor.mutable_data<float>(platform::CPUPlace()) + i * wh[0].size());
bool is_unsigned;
std::tie(is_unsigned, lod_tensor) =
GetMaxChLSTMScalingFactor(wx_tensor, wh_tensor);
std::vector<double> scales = {2.35381475, 1.10797026, 1.00151656,
1.19001095, 1.09045166, 1.01785819};
ASSERT_EQ(is_unsigned, false);
ASSERT_EQ(lod_tensor.numel(), static_cast<int64_t>(scales.size()));
for (int64_t i = 0; i < lod_tensor.numel(); i++) {
ASSERT_NEAR(lod_tensor.data<double>()[i], scales[i], abs_error);
}
}
}  // namespace paddle
@@ -192,7 +192,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
        "seqpool_cvm_concat_fuse_pass",  //
        // "embedding_fc_lstm_fuse_pass",  //
        // TODO(wilber): fix correctness problem.
        // "fc_lstm_fuse_pass",  //
        "mul_lstm_fuse_pass",  //
        "fc_gru_fuse_pass",    //
        "mul_gru_fuse_pass",   //
...
@@ -78,12 +78,12 @@ class LSTMMKLDNNHandler
    auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
                                 MKLDNNMemoryFormat::ldgo);
    auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
-                                   MKLDNNMemoryFormat::tnc);
+                                   MKLDNNMemoryFormat::any);
    auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
-                               MKLDNNMemoryFormat::ldnc);
+                               MKLDNNMemoryFormat::any);
    auto c0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<float>(),
-                               MKLDNNMemoryFormat::ldnc);
+                               MKLDNNMemoryFormat::any);
    // Create LSTM oneDNN primitive
    const auto direction =
...
@@ -43,6 +43,12 @@ function(download_quant_fp32_model install_dir data_file check_sum)
    endif()
endfunction()
function(download_lstm_model install_dir data_file check_sum)
if (NOT EXISTS ${install_dir}/${data_file})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/lstm ${data_file} ${check_sum})
endif()
endfunction()
function(inference_quant_int8_image_classification_test target quant_model_dir dataset_path)
    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant_int8_image_classification_comparison.py"
            ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
@@ -86,6 +92,20 @@ function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir da
                 --ops_to_quantize ${ops_to_quantize})
endfunction()
function(inference_quant2_int8_lstm_model_test target fp32_model dataset_path)
py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_lstm_model.py"
ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
FLAGS_use_mkldnn=true
ARGS --fp32_model ${fp32_model}
--infer_data ${dataset_path}
--num_threads 4
--mkldnn_cache_capacity 100
--warmup_iter 100
--warmup_batch_size 1
--acc_diff_threshold 0.11)
endfunction()
function(download_quant_data install_dir data_file check_sum)
    if (NOT EXISTS ${install_dir}/${data_file})
        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum})
@@ -260,6 +280,16 @@ if(LINUX AND WITH_MKLDNN)
        set(QUANT2_INT8_ERNIE_DOT_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8_dot_file")
        convert_model2dot_test(convert_model2dot_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_INT8_ERNIE_DOT_SAVE_PATH} "Ernie_quant2_int8")
### PTQ INT8
# PTQ int8 lstm model
set(LSTM_DATA_ARCHIVE "unittest_model_data/quant_lstm_input_data.tar.gz")
set(QUANT2_INT8_LSTM_SAVE_PATH "${QUANT_INSTALL_DIR}/lstm_quant2")
download_quant_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_DATA_ARCHIVE} add84c754e9b792fea1fbd728d134ab7)
set(QUANT2_FP32_LSTM_MODEL_ARCHIVE "lstm_fp32_model.tar.gz")
download_lstm_model(${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_FP32_LSTM_MODEL_ARCHIVE} eecd9f44d69a84acc1cf2235c4b8b743)
inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data)
endif()

# Since the tests for Quant & INT8 comparison support only testing on Linux
@@ -323,4 +353,5 @@ if(LINUX AND WITH_MKLDNN)
    set_tests_properties(test_quant2_int8_ernie_mkldnn PROPERTIES TIMEOUT 120)
    set_tests_properties(test_quant_int8_googlenet_mkldnn PROPERTIES TIMEOUT 120)
    set_tests_properties(test_quant2_int8_resnet50_mkldnn PROPERTIES TIMEOUT 120)
set_tests_properties(test_quant2_int8_lstm_mkldnn PROPERTIES TIMEOUT 120)
endif()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import struct
import sys
import time
import unittest
from paddle import fluid
from paddle.fluid.core import AnalysisConfig, create_paddle_predictor
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
'--fp32_model', type=str, default='', help='A path to a FP32 model.')
parser.add_argument('--infer_data', type=str, default='', help='Data file.')
parser.add_argument(
'--num_threads', type=int, default=1, help='Number of threads.')
parser.add_argument(
'--warmup_iter',
type=int,
default=1,
help='Number of the first iterations to skip in performance statistics.')
parser.add_argument(
'--warmup_batch_size',
type=int,
default=1,
help='Number of batches to use in PTQ warmup. Default: 1.')
parser.add_argument(
'--acc_diff_threshold',
type=float,
default=0.01,
help='Accepted accuracy difference threshold.')
parser.add_argument(
'--mkldnn_cache_capacity',
type=int,
default=0,
        help='MKL-DNN cache capacity. The default value in the Python API is 15, which can slow down INT8 models. 0 means an unlimited cache.'
)
test_args, args = parser.parse_known_args(namespace=unittest)
return test_args, sys.argv[:1] + args
class TestLstmModelPTQ(unittest.TestCase):
def get_warmup_tensor(self, data_path, place, warmup_batch_size):
data = []
with open(data_path, 'rb') as in_f:
while True:
plen = in_f.read(4)
if plen is None or len(plen) != 4:
break
alllen = struct.unpack('i', plen)[0]
label_len = alllen & 0xFFFF
seq_len = (alllen >> 16) & 0xFFFF
label = in_f.read(4 * label_len)
label = np.frombuffer(
label, dtype=np.int32).reshape([len(label) // 4])
feat = in_f.read(4 * seq_len * 8)
feat = np.frombuffer(
feat, dtype=np.float32).reshape([len(feat) // 4 // 8, 8])
lod_feat = [feat.shape[0]]
minputs = fluid.create_lod_tensor(feat, [lod_feat], place)
infer_data = fluid.core.PaddleTensor()
infer_data.lod = minputs.lod()
infer_data.data = fluid.core.PaddleBuf(np.array(minputs))
infer_data.shape = minputs.shape()
infer_data.dtype = fluid.core.PaddleDType.FLOAT32
infer_label = fluid.core.PaddleTensor()
infer_label.data = fluid.core.PaddleBuf(np.array(label))
infer_label.shape = label.shape
infer_label.dtype = fluid.core.PaddleDType.INT32
data.append([infer_data, infer_label])
warmup_data = data[:warmup_batch_size]
inputs = data[warmup_batch_size:]
return warmup_data, inputs
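The reader above expects a simple binary layout: a 4-byte int packing the label length in the low 16 bits and the sequence length in the high 16 bits, followed by label_len int32 labels and seq_len * 8 float32 features. Below is a hedged sketch of a writer for that layout, inferred from the reader; the helper name and file name are illustrative and not part of the dataset tooling.

import struct
import numpy as np

def write_record(out_f, feat, label):
    # One record in the layout read by get_warmup_tensor above:
    # a 4-byte int packing (seq_len << 16) | label_len, then label_len
    # int32 labels, then seq_len * 8 float32 features.
    seq_len, width = feat.shape
    assert width == 8, "the reader assumes 8 features per frame"
    label = np.asarray(label, dtype=np.int32)
    out_f.write(struct.pack('i', (seq_len << 16) | len(label)))
    out_f.write(label.tobytes())
    out_f.write(feat.astype(np.float32).tobytes())

# Example: one 10-frame utterance with a single class label.
with open('toy_lstm_data.bin', 'wb') as f:
    write_record(f, np.random.rand(10, 8).astype(np.float32), [3])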
def set_config(self,
model_path,
num_threads,
mkldnn_cache_capacity,
warmup_batch_size,
warmup_data=None,
enable_int8=False):
config = AnalysisConfig(model_path)
config.disable_gpu()
config.switch_use_feed_fetch_ops(True)
config.switch_ir_optim(True)
config.set_cpu_math_library_num_threads(num_threads)
        # For fc_lstm_fuse_pass to work properly, it must be inserted before fc_fuse_pass
config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass")
config.enable_mkldnn()
config.set_mkldnn_cache_capacity(mkldnn_cache_capacity)
if enable_int8:
config.enable_quantizer()
config.quantizer_config().set_quant_data(warmup_data)
config.quantizer_config().set_quant_batch_size(warmup_batch_size)
return config
def run_program(self,
model_path,
data_path,
num_threads,
mkldnn_cache_capacity,
warmup_iter,
warmup_batch_size,
enable_ptq_int8=False):
place = fluid.CPUPlace()
warmup_data, inputs = self.get_warmup_tensor(data_path, place,
warmup_batch_size)
warmup_data = [item[0] for item in warmup_data]
config = self.set_config(model_path, num_threads, mkldnn_cache_capacity,
warmup_batch_size, warmup_data,
enable_ptq_int8)
predictor = create_paddle_predictor(config)
data = [item[0] for item in inputs]
label = np.array([item[1] for item in inputs])
all_hz_num = 0
ok_hz_num = 0
all_ctc_num = 0
ok_ctc_num = 0
dataset_size = len(data)
start = time.time()
for i in range(dataset_size):
if i == warmup_iter:
start = time.time()
hz_out, ctc_out = predictor.run([data[i]])
np_hz_out = np.array(hz_out.data.float_data()).reshape(-1)
np_ctc_out = np.array(ctc_out.data.int64_data()).reshape(-1)
out_hz_label = np.argmax(np_hz_out)
this_label = label[i]
this_label_data = np.array(this_label.data.int32_data()).reshape(-1)
if this_label.shape[0] == 1:
all_hz_num += 1
best = this_label_data[0]
if out_hz_label == best:
ok_hz_num += 1
if this_label_data[0] <= 6350:
all_ctc_num += 1
if np_ctc_out.shape[0] == 1 and np_ctc_out.all(
) == this_label_data.all():
ok_ctc_num += 1
else:
all_ctc_num += 1
if np_ctc_out.shape[0] == this_label.shape[
0] and np_ctc_out.all() == this_label_data.all():
ok_ctc_num += 1
if all_ctc_num > 1000 or all_hz_num > 1000:
break
end = time.time()
fps = (dataset_size - warmup_iter) / (end - start)
hx_acc = ok_hz_num / all_hz_num
ctc_acc = ok_ctc_num / all_ctc_num
return hx_acc, ctc_acc, fps
def test_lstm_model(self):
if not fluid.core.is_compiled_with_mkldnn():
return
fp32_model = test_case_args.fp32_model
assert fp32_model, 'The FP32 model path cannot be empty. Please, use the --fp32_model option.'
infer_data = test_case_args.infer_data
assert infer_data, 'The dataset path cannot be empty. Please, use the --infer_data option.'
num_threads = test_case_args.num_threads
mkldnn_cache_capacity = test_case_args.mkldnn_cache_capacity
warmup_iter = test_case_args.warmup_iter
warmup_batch_size = test_case_args.warmup_batch_size
acc_diff_threshold = test_case_args.acc_diff_threshold
(fp32_hx_acc, fp32_ctc_acc, fp32_fps) = self.run_program(
fp32_model, infer_data, num_threads, mkldnn_cache_capacity,
warmup_iter, warmup_batch_size, False)
(int8_hx_acc, int8_ctc_acc, int8_fps) = self.run_program(
fp32_model, infer_data, num_threads, mkldnn_cache_capacity,
warmup_iter, warmup_batch_size, True)
print("FP32: fps {0}, hx_acc {1}, ctc_acc {2}.".format(
fp32_fps, fp32_hx_acc, fp32_ctc_acc))
print("PTQ INT8: fps {0}, hx_acc {1}, ctc_acc {2}.".format(
int8_fps, int8_hx_acc, int8_ctc_acc))
sys.stdout.flush()
hx_delta_value = fp32_hx_acc - int8_hx_acc
ctc_delta_value = fp32_ctc_acc - int8_ctc_acc
self.assertLess(hx_delta_value, acc_diff_threshold)
self.assertLess(ctc_delta_value, acc_diff_threshold)
if __name__ == "__main__":
global test_case_args
test_case_args, remaining_args = parse_args()
unittest.main(argv=remaining_args)