From 269db0d1d16c0079a0f7039a5717b27e2c139af6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Gallus?= <michal.gallus@intel.com>
Date: Fri, 31 Jan 2020 14:53:16 +0100
Subject: [PATCH] [DNNL] Fix accuracy in INT8 FC (#22404)

* Enable quantize to reorder to nchw as well

* Correct FC MKL-DNN input dim requirements to accept 3D

* Improve DNNL FC format, error and 3D input handling

test=develop

* Improve error checking in FC

test=develop

* Improve PADDLE_ENFORCE messages in fc-related files

* Remove data layout attribute from obligatory pass args

test=develop

* Fix message in fc_mkldnn_pass to be logically correct

test=develop
---
 .../framework/ir/mkldnn/cpu_quantize_pass.cc  |  3 +
 .../framework/ir/mkldnn/fc_mkldnn_pass.cc     |  6 +-
 paddle/fluid/operators/fc_op.cc               |  8 +-
 paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 76 +++++++++++++------
 .../operators/mkldnn/quantize_mkldnn_op.cc    |  9 ++-
 paddle/fluid/operators/quantize_op.cc         |  3 +
 paddle/fluid/platform/mkldnn_reuse.h          |  5 +-
 7 files changed, 74 insertions(+), 36 deletions(-)
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index a0324279a3..da9a28baa1 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -66,6 +66,9 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
                    std::vector<std::string>({quantize_out_node->Name()}));
   q_desc.SetAttr("Scale", scale);
   q_desc.SetAttr("is_negative_input", !is_unsigned);
+
+  q_desc.SetAttr("output_format",
+                 Has("data_layout") ? Get<std::string>("data_layout") : "NHWC");
   auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
 
   // update op's input
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
index 9b71e2abd7..95afc54837 100644
--- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
@@ -56,14 +56,14 @@ void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
     OpDesc* desc = fc->Op();
     auto dims = fc->inputs[0]->Var()->GetShape();
     auto dim_num = dims.size();
-    bool are_dims_supported = dim_num == 2 || dim_num == 4;
+    bool are_dims_supported = dim_num >= 2 && dim_num <= 4;
     constexpr size_t height_axis = 2;
     constexpr size_t width_axis = 3;
     bool is_size_supported =
         dim_num == 4 ? (dims[width_axis] == 1 && dims[height_axis] == 1) : true;
     if (!are_dims_supported || !is_size_supported) {
-      VLOG(3) << "Do not enable FC MKL-DNN for dimensions different than 2 & 4";
-      VLOG(3) << "Or when width and height are different than one";
+      VLOG(3) << "Do not enable FC MKL-DNN for dimensions different than "
+                 "2, 3 & 4, or when width or height is different than one.";
       return;
     }
     desc->SetAttr("use_mkldnn", true);
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 38af314986..f81ed30962 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -69,11 +69,13 @@ class FCOp : public framework::OperatorWithKernel {
                         activation_type.c_str());
     }
     if (ctx->Attrs().Get<bool>("use_mkldnn")) {
-      PADDLE_ENFORCE_EQ(in_dims.size() == 2 || in_dims.size() == 4, true,
-                        "Fully Connected input should be 2-D or 4-D tensor.");
+      PADDLE_ENFORCE_EQ(
+          in_dims.size() >= 2 && in_dims.size() <= 4, true,
+          platform::errors::Unimplemented(
+              "Fully Connected input should be 2D, 3D or 4D tensor."));
     }
     PADDLE_ENFORCE_EQ(w_dims.size(), 2,
-                      "Fully Connected input should be 2-D tensor.");
+                      "Fully Connected weights should be 2-D tensor.");
     int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
     PADDLE_ENFORCE_GT(
         in_dims.size(), in_num_col_dims,
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index edc14add80..dcf0b996bd 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -54,6 +54,25 @@ class FCPrimitiveFactory {
       return;
     }  // Otherwise, create a new one.
 
+    auto in_col_dims = ctx.Attr<int>("in_num_col_dims");
+    PADDLE_ENFORCE_LE(in_col_dims, 2,
+                      platform::errors::Unimplemented(
+                          "DNNL FC doesn't support in_num_col_dims parameter "
+                          "to be higher than "
+                          "2."));
+    if (in_col_dims == 2) {
+      PADDLE_ENFORCE_EQ(
+          input->dims().size(), 3,
+          platform::errors::Unimplemented(
+              "DNNL FC only supports in_num_col_dims equal to 2 when "
+              "3 dim input is provided."));
+      PADDLE_ENFORCE_EQ(
+          input->format(), MKLDNNMemoryFormat::ncw,
+          platform::errors::Unimplemented(
+              "DNNL FC only supports in_num_col_dims equal to 2 when "
+              "input format is equal to ncw."));
+    }
+
     // Transform weights to default MKL-DNN format
     weights_ = TransposeWeights(weights);
     // Since MKL-DNN has a lot of limitations on what the input/weights/output
@@ -121,6 +140,33 @@ class FCPrimitiveFactory {
   }
 
  private:
+  // DNNL always returns a 2-dimensional data block as the result of an
+  // inner product, so its output primitive always uses the 'nc' format.
+  // SetOutputFormat therefore picks the format stored on the `out` tensor
+  // from the number of dimensions of `out` and from `in_format`, the
+  // format of the input tensor.
+  void SetOutputFormat(MKLDNNMemoryFormat in_format, Tensor* out) {
+    int dim_num = out->dims().size();
+    // 2 dims: nc is the only possible format.
+    if (dim_num == 2) {
+      out->set_format(MKLDNNMemoryFormat::nc);
+      // 3 dims (branch below): nwc/nhwc inputs get the 3-dim format derived
+      // from nhwc, every other input format gets the one derived from nchw.
+    } else if (dim_num == 3) {
+      if (in_format == MKLDNNMemoryFormat::nwc ||
+          in_format == MKLDNNMemoryFormat::nhwc) {
+        out->set_format(
+            platform::MKLDNNFormatForSize(dim_num, MKLDNNMemoryFormat::nhwc));
+      } else {
+        out->set_format(
+            platform::MKLDNNFormatForSize(dim_num, MKLDNNMemoryFormat::nchw));
+      }
+      // Any other number of dims (branch below): reuse the input format.
+    } else {
+      out->set_format(in_format);
+    }
+  }
+
   void UpdateDataPointers(const ExecutionContext& ctx, Tensor* out,
                           const Tensor* in) {
     input_->set_data_handle(to_void_cast(in->data<T_in>()));
@@ -129,17 +175,7 @@ class FCPrimitiveFactory {
     // variable, update its format to what has been determined in first
     // call to CreateFcPrimitive method.
     if (out->format() == MKLDNNMemoryFormat::undef) {
-      MKLDNNMemoryFormat format;
-      auto data_type = input_->get_desc().data.data_type;
-      if (data_type == mkldnn_f32)
-        format = MKLDNNMemoryFormat::nchw;
-      else
-        format = MKLDNNMemoryFormat::nhwc;
-
-      MKLDNNMemoryFormat selected = platform::MKLDNNFormatForSize(
-          framework::vectorize<int>(out->dims()).size(), format);
-
-      out->set_format(selected);
+      SetOutputFormat(in->format(), out);
     }
   }
 
@@ -168,8 +204,8 @@ class FCPrimitiveFactory {
       const LoDTensor* input, const Tensor* weights, const Tensor* bias,
       LoDTensor* output, const ExecutionContext& ctx) {
     auto input_dims = framework::vectorize(input->dims());
-    std::vector<int64_t> new_input_dims = {input_dims[0] * input_dims[1], 1,
-                                           input_dims[2]};
+    std::vector<int64_t> new_input_dims = {input_dims[0] * input_dims[1],
+                                           input_dims[2], 1};
     auto src_desc = CreateMemDescriptor<T_in>(new_input_dims, input->format());
 
     auto weight_dims = Get3DWeightDimsForDNNL(weights);
@@ -187,7 +223,7 @@ class FCPrimitiveFactory {
 
   std::vector<int64_t> Get3DWeightDimsForDNNL(const Tensor* weights) {
     auto paddle_w_dims = framework::vectorize(weights->dims());
-    return {paddle_w_dims[1], 1, paddle_w_dims[0]};
+    return {paddle_w_dims[1], paddle_w_dims[0], 1};
   }
 
   memory::desc Create3DUserWeightsDesc(const Tensor* weights) {
@@ -405,18 +441,8 @@ class FCPrimitiveFactory {
     T_out* output_data =
         output->mutable_data<T_out>(ctx.GetPlace(), buffer_size);
     memory dst_mem(dst_desc, engine_, to_void_cast<T_out>(output_data));
+    SetOutputFormat(ctx.Input<LoDTensor>("Input")->format(), output);
 
-    MKLDNNMemoryFormat format;
-    auto data_type = input_->get_desc().data.data_type;
-    if (data_type == mkldnn_f32)
-      format = MKLDNNMemoryFormat::nchw;
-    else
-      format = MKLDNNMemoryFormat::nhwc;
-
-    MKLDNNMemoryFormat selected = platform::MKLDNNFormatForSize(
-        framework::vectorize<int>(output->dims()).size(), format);
-
-    output->set_format(selected);
     return dst_mem;
   }
 
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index be5c639829..55bd683f8f 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -60,6 +60,9 @@ class QuantOpKernel : public framework::OpKernel<T> {
     reorder_p = std::static_pointer_cast<reorder>(dev_ctx.GetBlob(key_prim));
 
     if (reorder_p == nullptr) {
+      std::string out_layout = ctx.Attr<std::string>("output_format");
+      MKLDNNMemoryFormat out_format =
+          platform::data_format_to_memory_format(out_layout);
       mkldnn::primitive_attr attri;
       int mask = 0;
       attri.set_output_scales(mask, {scale_data});
@@ -72,10 +75,10 @@ class QuantOpKernel : public framework::OpKernel<T> {
       std::shared_ptr<mkldnn::memory::desc> dst_md;
       if (is_negative) {
         platform::SetDstMemoryQuantized<int8_t>(ctx, output, dst_tz, engine,
-                                                dst_md, dst_memory);
+                                                dst_md, dst_memory, out_format);
       } else {
-        platform::SetDstMemoryQuantized<uint8_t>(ctx, output, dst_tz, engine,
-                                                 dst_md, dst_memory);
+        platform::SetDstMemoryQuantized<uint8_t>(
+            ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
       }
       auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
           new reorder::primitive_desc(*src_memory, *dst_memory, attri));
diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc
index 69264e3a45..8924e21b46 100644
--- a/paddle/fluid/operators/quantize_op.cc
+++ b/paddle/fluid/operators/quantize_op.cc
@@ -37,6 +37,9 @@ void QuantOpMaker::Make() {
                 "(bool, default false) Only used in mkldnn INT8 kernel")
       .SetDefault(false);
   AddAttr<float>("Scale", "scale data").SetDefault({1.0f});
+  AddAttr<std::string>("output_format",
+                       "Convert format to NHWC or NCHW during quantization.")
+      .SetDefault("NHWC");
   AddComment(R"DOC(This op will quantize data from FP32 to INT8)DOC");
 }
 
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 27756ed011..f8ee9b9639 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -1143,13 +1143,14 @@ static void SetDstMemoryQuantized(
     const framework::ExecutionContext& ctx, framework::Tensor* output,
     std::vector<int64_t> dst_tz, const mkldnn::engine& engine,
     std::shared_ptr<mkldnn::memory::desc>& dst_md,  // NOLINT
-    std::shared_ptr<mkldnn::memory>& dst_memory) {  // NOLINT
+    std::shared_ptr<mkldnn::memory>& dst_memory,    // NOLINT
+    MKLDNNMemoryFormat output_format) {
   T* output_data = output->mutable_data<T>(ctx.GetPlace());
   const size_t dst_dims = dst_tz.size();
   MKLDNNMemoryFormat dst_fmt;
   PADDLE_ENFORCE_LE(dst_dims, 5,
                     "Dst memory for quantization can not have dims > 5");
-  dst_fmt = platform::MKLDNNFormatForSize(dst_dims, MKLDNNMemoryFormat::nhwc);
+  dst_fmt = platform::MKLDNNFormatForSize(dst_dims, output_format);
 
   auto tmp_dst_md = platform::MKLDNNMemDesc(
       {dst_tz}, paddle::framework::ToMKLDNNDataType(
-- 
GitLab