Unverified commit fed0ed34 authored by Sylwester Fraczek, committed by GitHub

add fc-residual quantization (#46917)

* add fc-residual quantization

* revert removal of check for use_mkldnn

* fix bug

* add disable_logs

* review fix

call AreScalesPresentForNodes twice instead of using if-else

* rewrite residual input to output

* revert fc mkldnn taking residual data

* format fix

* fix LoDTensor->DenseTensor

* LoDTensor->DenseTensor

* output->input

* revert changes to unsupported script

* remove fc residualdata from output blocklist in cpu_bfloat16_pass.cc
Parent 41483383
......@@ -1163,21 +1163,12 @@ PDNode *patterns::FCMKLDNN::operator()(bool with_residual_data) {
if (with_residual_data) {
auto res_fc_var = pattern->NewNode(residual_data_repr())
->AsInput()
->assert_is_op_input("fc")
// assert_is_op_input with two arguments doesn't work
// because ResidualData in FC is set as output with
// SetOutput so we do custom assert output
->assert_more([&](Node *x) {
for (auto *op : x->outputs)
if (IsNthOutput(x, op, "ResidualData", 0))
return true;
return false;
});
->assert_is_op_input("fc", "ResidualData");
links_from.push_back(res_fc_var);
} else {
fc_op->assert_more([&](Node *x) {
if (!HasOutput(x, "ResidualData") ||
x->Op()->Output("ResidualData").size() == 0)
if (!HasInput(x, "ResidualData") ||
x->Op()->Input("ResidualData").size() == 0)
return true;
return false;
});
......
......@@ -200,7 +200,6 @@ class DeQuantizer final : public Quanter {
std::unordered_map<std::string, std::vector<std::string>> block_list{
{"layer_norm",
{"Mean", "Variance"}}, // not used in inference in MKLDNN
{"fc", {"ResidualData"}}, // artifical output, already dequantized
{"matmul", {"ResidualData"}}, // artifical output, already dequantized
{"matmul_v2",
{"ResidualData"}}}; // artifical output, already dequantized
......
......@@ -515,16 +515,17 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
((with_residual_data) ? "with residual connection" : ""));
}
void CPUQuantizePass::QuantizeFc(Graph* graph) const {
void CPUQuantizePass::QuantizeFc(Graph* graph, bool with_residual_data) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::FCMKLDNN fc_pattern{pattern, name_scope_};
fc_pattern(false /* with_residual */);
fc_pattern(with_residual_data);
int quantize_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Quantize fc op";
VLOG(4) << "Quantize fc op " << (with_residual_data ? "with" : "without")
<< " residual data";
GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern);
// skip if should not be quantized
......@@ -532,6 +533,7 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
LogQuantizationDisabled(fc);
return;
}
if (!fc->Op()->GetAttrIfExists<bool>("use_mkldnn")) {
MarkAndLogCannotQuantizeOp(fc, "use_mkldnn attribute set to false");
return;
......@@ -546,6 +548,26 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
return;
}
if (with_residual_data) {
GET_IR_NODE_FROM_SUBGRAPH(residual_data, residual_data, fc_pattern);
if (!AreScalesPresentForNodes({residual_data})) {
MarkAndLogCannotQuantizeOp(fc, "No scale available for the operator");
return;
}
bool is_residual_unsigned{false};
auto residual_scale =
GetScaleValueForNode(residual_data, &is_residual_unsigned);
QuantizeInput(g,
fc,
residual_data,
"ResidualData",
residual_scale,
is_residual_unsigned,
"Scale_in_eltwise");
}
bool is_input_unsigned{false};
auto input_scale = GetScaleValueForNode(input, &is_input_unsigned);
QuantizeInput(
......@@ -576,7 +598,9 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
gpd(graph, handler);
AddStatis(quantize_fc_count);
LogQuantizedOpsCounter("fc", quantize_fc_count);
LogQuantizedOpsCounter("fc",
quantize_fc_count,
with_residual_data ? "with residual connection" : "");
}
void CPUQuantizePass::QuantizePool(Graph* graph) const {
......@@ -1228,7 +1252,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
QuantizePool(graph);
QuantizeConcat(graph);
QuantizePriorBox(graph);
QuantizeFc(graph);
QuantizeFc(graph, false /* with_residual_data */);
QuantizeFc(graph, true /* with_residual_data */);
QuantizeMatmul(graph, false /* with_residual_data */);
QuantizeMatmul(graph, true /* with_residual_data */);
QuantizeImmutable(graph, "reshape2", "X");
......
......@@ -49,8 +49,8 @@ class CPUQuantizePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
void QuantizeFc(Graph* graph) const;
void QuantizeConv(Graph* graph, bool with_residual_data) const;
void QuantizeFc(Graph* graph, bool with_residual_data) const;
void QuantizePool(Graph* graph) const;
void QuantizeConcat(Graph* graph) const;
void QuantizePriorBox(Graph* graph) const;
......
......@@ -337,7 +337,8 @@ void CPUQuantizeSquashPass::OpDequantSquash(Graph* graph) const {
if (dequant_in->outputs.size() == 1) {
if (any_op->Op()->Type() == "conv2d" ||
any_op->Op()->Type() == "conv2d_transpose") {
any_op->Op()->Type() == "conv2d_transpose" ||
any_op->Op()->Type() == "fc") {
// do not squash if fuse residual connection is true
// because residual fusion does not support force output with fp32
if (any_op->Op()->GetAttrIfExists<bool>("fuse_residual_connection"))
......@@ -418,8 +419,8 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const {
last_op_names.begin(), last_op_names.end(), quant_out->Name()),
last_op_names.end());
last_op_names.push_back(first_quant_out->Name());
last_op->Op()->SetInput(last_op_input_name,
std::vector<std::string>(last_op_names));
last_op_op->SetInput(last_op_input_name,
std::vector<std::string>(last_op_names));
IR_NODE_LINK_TO(first_quant_out, last_op);
GraphSafeRemoveNodes(graph, {quant_op, quant_out});
......
......@@ -119,7 +119,7 @@ GraphWithStats FCResidualConnectionMKLDNNFusePass::FuseFC(
return;
}
fc_op->Op()->SetOutput("ResidualData", {residual_data->Name()});
fc_op->Op()->SetInput("ResidualData", {residual_data->Name()});
fc_op->Op()->SetOutput("Out", {elementwise_out->Name()});
fc_op->Op()->SetAttr("fuse_residual_connection", true);
......
......@@ -29,18 +29,16 @@ namespace ir {
class Graph;
namespace {
void LogEnabledOps(const int counter, const std::string& details) {
std::string msg_ss{"--- enabled FC MKL-DNN for "};
msg_ss += counter + " fc ops " + details;
string::PrettyLogDetail(msg_ss.c_str());
}
} // namespace
void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(graph,
platform::errors::InvalidArgument(
"Pointer to graph argument should not be NULL."));
Init("fc_mkldnn_pass", graph);
void FCMKLDNNPass::ApplyPass(ir::Graph* graph, bool with_residual) const {
GraphPatternDetector gpd;
patterns::FCMKLDNN fc_pattern(gpd.mutable_pattern(), "fc_mkldnn_pass");
fc_pattern(with_residual);
// searching for fc+residual doesn't make sense at this stage
fc_pattern(false /*with_residual*/);
int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
......@@ -79,19 +77,12 @@ void FCMKLDNNPass::ApplyPass(ir::Graph* graph, bool with_residual) const {
AddStatis(found_fc_count);
LogEnabledOps(found_fc_count,
(with_residual ? "with residual connection"
: "without residual connection"));
}
void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(graph,
platform::errors::InvalidArgument(
"Pointer to graph argument should not be NULL."));
Init("fc_mkldnn_pass", graph);
ApplyPass(graph, true);
ApplyPass(graph, false);
if ((!Has("disable_logs") || !Get<bool>("disable_logs")) &&
(found_fc_count > 0)) {
std::string msg_ss = "--- enabled FC MKL-DNN for " +
std::to_string(found_fc_count) + " fc ops ";
string::PrettyLogDetail(msg_ss.c_str());
}
}
} // namespace ir
......
......@@ -34,7 +34,6 @@ class FCMKLDNNPass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const;
void ApplyPass(ir::Graph* graph, bool with_residual) const;
};
} // namespace ir
......
......@@ -439,6 +439,7 @@ void CpuPassStrategy::EnableMkldnnInt8() {
passes_.push_back("repeated_fc_relu_fuse_pass");
passes_.push_back("fc_mkldnn_pass");
passes_.push_back("fc_act_mkldnn_fuse_pass");
passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass");
passes_.push_back("matmul_transpose_reshape_mkldnn_fuse_pass");
passes_.push_back("batch_norm_act_fuse_pass");
passes_.push_back("softplus_activation_mkldnn_fuse_pass");
......
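A usage-level sketch of where the pass list above takes effect, assuming the public Paddle Inference C++ API (paddle_infer::Config with EnableMkldnnInt8); the model directory is hypothetical. Enabling MKL-DNN int8 selects this CpuPassStrategy, so the newly scheduled fc_elementwise_add_mkldnn_fuse_pass runs right after fc_mkldnn_pass and before the quantization passes.

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config("./model_dir");  // hypothetical model directory
  config.EnableMKLDNN();
  config.EnableMkldnnInt8();  // selects the int8 CpuPassStrategy pass list above
  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor != nullptr ? 0 : 1;
}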
......@@ -103,15 +103,16 @@ class FCMKLDNNHandler
dnnl::primitive_attr attributes;
dnnl::post_ops post_operations;
std::vector<float> output_shift_scale;
float scale = 1.0f;
float sum_scale = 1.0f;
float activation_scale = 1.0f;
if (phi::funcs::is_int8<T_w>()) {
std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx);
std::vector<float> output_shift_scale;
std::tie(output_shift_scale, sum_scale, activation_scale) =
GetOutputScales(ctx);
int mask = CreateMask(1, output_shift_scale.size() > 1);
attributes.set_output_scales(mask, output_shift_scale);
}
float sum_scale = 1.0f;
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
post_operations.append_sum(sum_scale);
......@@ -120,9 +121,9 @@ class FCMKLDNNHandler
// ReLU from "fc_fuse_pass"
if (ctx.Attr<std::string>("activation_type") == "relu") {
post_operations.append_eltwise(
scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
activation_scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
}
platform::AppendActivation(ctx, post_operations, scale);
platform::AppendActivation(ctx, post_operations, activation_scale);
if (ctx.HasAttr("fused_output_scale")) {
float scale_alpha = ctx.Attr<float>("fused_output_scale");
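A minimal standalone sketch, assuming the oneDNN 2.x C++ API used throughout this handler, of how the pieces above compose into the primitive attributes: per-output-channel output scales, a sum post-op scaled by sum_scale for the fused residual connection, and an eltwise post-op scaled by activation_scale. The function and parameter names are illustrative, not part of the patch.

#include <vector>
#include "dnnl.hpp"

dnnl::primitive_attr MakeQuantizedFcAttrs(
    const std::vector<float>& output_shift_scale,
    float sum_scale,
    float activation_scale,
    bool fuse_residual,
    bool fuse_relu) {
  dnnl::primitive_attr attrs;
  dnnl::post_ops post_operations;

  // Mask 1 << 1 enables per-output-channel scaling when more than one scale
  // is given (mirrors CreateMask(1, ...) in the handler).
  const int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
  attrs.set_output_scales(mask, output_shift_scale);

  // Residual fusion: the destination already holds ResidualData, so the
  // primitive accumulates dst = sum_scale * dst + fc(x, w, b).
  if (fuse_residual) post_operations.append_sum(sum_scale);

  // Fused ReLU stays in the quantized domain via activation_scale.
  if (fuse_relu)
    post_operations.append_eltwise(
        activation_scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);

  attrs.set_post_ops(post_operations);
  return attrs;
}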
......@@ -136,18 +137,22 @@ class FCMKLDNNHandler
// Compute the bias scales so that its values correspond to the
// scale of data being an output of weights and input multiplication
std::vector<float> ComputeBiasScales(
const float scale_in, const std::vector<float>& scale_weights) {
std::vector<float> bias_scales(scale_weights.size());
for (size_t i = 0; i < bias_scales.size(); ++i) {
if (scale_weights[i] == 0.0)
bias_scales[i] = 1.0f;
else
bias_scales[i] = scale_in * scale_weights[i];
std::vector<float> GetBiasScales(const framework::ExecutionContext& ctx) {
if (ctx.HasAttr("Bias_scales")) {
return ctx.Attr<std::vector<float>>("Bias_scales");
} else {
const float scale_in = ctx.Attr<float>("Scale_in");
const auto& scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
std::vector<float> bias_scales(scale_weights.size());
for (size_t i = 0; i < bias_scales.size(); ++i) {
if (scale_weights[i] == 0.0)
bias_scales[i] = 1.0f;
else
bias_scales[i] = scale_in * scale_weights[i];
}
return bias_scales;
}
return bias_scales;
}
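A self-contained sketch of the bias-scale arithmetic in GetBiasScales(), with made-up scale values: the fp32 bias must be rescaled by Scale_in * Scale_weights[i] so it lands in the same integer domain as the input-times-weights accumulator, and a zero weight scale falls back to 1.0f.

#include <iostream>
#include <vector>

int main() {
  // Hypothetical scales; in the op they come from the "Scale_in" and
  // "Scale_weights" attributes (or the precomputed "Bias_scales" attribute).
  const float scale_in = 127.0f / 2.5f;                        // 50.8
  const std::vector<float> scale_weights = {50.f, 0.f, 80.f};  // per channel

  std::vector<float> bias_scales(scale_weights.size());
  for (size_t i = 0; i < bias_scales.size(); ++i) {
    // A zero weight scale would make the product meaningless, so fall back to 1.
    bias_scales[i] =
        scale_weights[i] == 0.0f ? 1.0f : scale_in * scale_weights[i];
  }

  for (float s : bias_scales) std::cout << s << " ";  // prints: 2540 1 4064
  std::cout << "\n";
  return 0;
}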
// Correct output scale, to take into account scaling of input and weights
......@@ -155,32 +160,44 @@ class FCMKLDNNHandler
// scaled with its own scales, this data needs to be divided by
// those scales to normalise them back to what their floating-point range
// was. Then we multiply them by desired output scale we want on the output.
std::tuple<std::vector<float>, float> ComputeOutputShiftScale(
std::tuple<std::vector<float>, float, float> GetOutputScales(
const ExecutionContext& ctx) {
auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
bool has_activation = !ctx.Attr<std::string>("activation_type").empty();
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
// If the output will be in floats, we don't multiply by scale_out.
float scale = (!force_fp32_output && has_activation)
? ctx.Attr<float>("Scale_out")
: 1.0f;
float inner_scale = (force_fp32_output || has_activation)
? 1.0f
: ctx.Attr<float>("Scale_out");
const size_t weight_scales_num = scale_weights_data.size();
for (size_t i = 0; i < weight_scales_num; ++i) {
if (scale_weights_data[i] == 0.0)
scale_weights_data[i] = inner_scale;
else
scale_weights_data[i] =
inner_scale / (scale_in_data * scale_weights_data[i]);
if (ctx.HasAttr("Sum_scale")) {
return std::make_tuple(ctx.Attr<std::vector<float>>("Output_shift_scale"),
ctx.Attr<float>("Sum_scale"),
ctx.Attr<float>("Activation_scale"));
} else {
auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
bool has_activation = !ctx.Attr<std::string>("activation_type").empty();
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
bool fuse_residual_conn = ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection");
auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise")
? ctx.Attr<float>("Scale_in_eltwise")
: 1.0f;
// If the output will be in floats, we don't multiply by scale_out.
float activation_scale = (!force_fp32_output && has_activation)
? ctx.Attr<float>("Scale_out")
: 1.0f;
float scale_out_data = (force_fp32_output || has_activation)
? 1.0f
: ctx.Attr<float>("Scale_out");
float sum_scale =
fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;
const size_t weight_scales_num = scale_weights_data.size();
for (size_t i = 0; i < weight_scales_num; ++i) {
if (scale_weights_data[i] == 0.0)
scale_weights_data[i] = scale_out_data;
else
scale_weights_data[i] =
scale_out_data / (scale_in_data * scale_weights_data[i]);
}
return std::make_tuple(scale_weights_data, sum_scale, activation_scale);
}
return make_tuple(scale_weights_data, scale);
}
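A worked example of GetOutputScales() with hypothetical attribute values, showing how the three returned scales relate: output_shift_scale renormalises each output channel by Scale_out / (Scale_in * Scale_weights[i]), sum_scale = Scale_out / Scale_in_eltwise aligns the residual tensor with the output range, and activation_scale carries Scale_out only when an activation is fused and the output stays quantized.

#include <iostream>
#include <vector>

int main() {
  // Hypothetical attribute values (Scale_in, Scale_out, Scale_in_eltwise,
  // Scale_weights); the flags mirror the branches in GetOutputScales().
  const float scale_in = 60.0f;
  const float scale_out = 30.0f;
  const float scale_in_eltwise = 45.0f;  // scale of the residual input
  std::vector<float> scale_weights = {100.0f, 0.0f};
  const bool force_fp32_output = false;
  const bool has_activation = false;
  const bool fuse_residual = true;

  const float activation_scale =
      (!force_fp32_output && has_activation) ? scale_out : 1.0f;
  const float scale_out_data =
      (force_fp32_output || has_activation) ? 1.0f : scale_out;
  const float sum_scale =
      fuse_residual ? scale_out_data / scale_in_eltwise : 1.0f;

  for (auto& sw : scale_weights)
    sw = (sw == 0.0f) ? scale_out_data : scale_out_data / (scale_in * sw);

  std::cout << "output_shift_scale: " << scale_weights[0] << ' '
            << scale_weights[1] << '\n';                         // 0.005 30
  std::cout << "sum_scale: " << sum_scale << '\n';               // 0.666667
  std::cout << "activation_scale: " << activation_scale << '\n'; // 1
  return 0;
}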
// Computing MKL-DNN's scaling mask which determines along which dimension
......@@ -240,9 +257,7 @@ class FCMKLDNNHandler
}
std::shared_ptr<dnnl::memory> AcquireBiasMemoryWithReorder(
const phi::DenseTensor* bias,
const float scale_in,
const std::vector<float>& scale_weights) {
const framework::ExecutionContext& ctx, const phi::DenseTensor* bias) {
const float* bias_data = bias->data<float>();
if (phi::funcs::is_int8<T_w>() == false) {
......@@ -255,7 +270,7 @@ class FCMKLDNNHandler
this->dev_ctx_.GetBlob(bias_key));
if (!memory_p) {
const auto& scale_data = ComputeBiasScales(scale_in, scale_weights);
const auto& scale_data = GetBiasScales(ctx);
dnnl::primitive_attr attrs;
int mask = CreateMask(0, scale_data.size() > 1);
......@@ -316,7 +331,7 @@ class FCMKLDNNHandler
const ExecutionContext& ctx, phi::DenseTensor* out) {
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
auto* residual_param = ctx.Output<phi::DenseTensor>("ResidualData");
auto* residual_param = ctx.Input<phi::DenseTensor>("ResidualData");
PADDLE_ENFORCE_EQ(
out->dims(),
......@@ -393,7 +408,6 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
const auto* bias = ctx.Input<phi::DenseTensor>("Bias");
auto out = ctx.Output<LoDTensor>("Out");
const float scale_in = ctx.Attr<float>("Scale_in");
const auto& scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
std::shared_ptr<dnnl::inner_product_forward> fc_p;
......@@ -430,7 +444,7 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
std::make_shared<dnnl::memory>(inner_product_cache->dst_mem);
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
auto* residual_param = ctx.Output<phi::DenseTensor>("ResidualData");
auto* residual_param = ctx.Input<phi::DenseTensor>("ResidualData");
out->ShareDataWith(*residual_param);
}
auto out_ptr = out->mutable_data<T_out>(
......@@ -460,8 +474,7 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
dst_memory_p = handler.AcquireCustomDstMemory(ctx, out);
if (bias) {
bias_memory_p =
handler.AcquireBiasMemoryWithReorder(bias, scale_in, scale_weights);
bias_memory_p = handler.AcquireBiasMemoryWithReorder(ctx, bias);
}
fc_p = handler.AcquireForwardPrimitive();
......