From 02cf676445af390fa625ebeca4d89d1d69137773 Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Thu, 31 Mar 2022 15:25:06 +0800
Subject: [PATCH] [new-exec] fit mkldnn op (#41058)

* fix bug where some ops have no op_role attr

* add mkldnn support for new executor

* fit for mkldnn data_transfer

* fit for mkldnn data_transfer
---
 .../framework/new_executor/data_transfer.cc   | 163 ++++++++++++++----
 .../framework/new_executor/data_transfer.h    |  13 +-
 .../framework/new_executor/interpretercore.cc |  18 ++
 .../new_executor/interpretercore_util.cc      |   8 +
 .../new_executor/standalone_executor.cc       |   3 +-
 .../new_executor/standalone_executor.h        |   2 +-
 .../fluid/operators/controlflow/fetch_op.cc   |   1 +
 paddle/fluid/operators/transfer_layout_op.cc  |  22 ++-
 paddle/fluid/operators/transfer_layout_op.h   |  33 +++-
 paddle/fluid/platform/device_context.cc       |   1 +
 paddle/fluid/platform/mkldnn_helper.h         |   1 +
 11 files changed, 213 insertions(+), 52 deletions(-)
diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc
index d9dcef62237..1d0727b80ba 100644
--- a/paddle/fluid/framework/new_executor/data_transfer.cc
+++ b/paddle/fluid/framework/new_executor/data_transfer.cc
@@ -24,7 +24,7 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
                               const std::string& var_name,
                               std::string* new_var_name,
                               std::vector<OpFuncNode>* op_func_nodes,
-                              bool use_local_scope) {
+                              bool use_local_scope, bool is_fetch_v2) {
   bool is_transferred = false;
   auto* src_var_name = &var_name;
 
@@ -35,8 +35,11 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
   if (need_layout_transform(kernel_type_for_var, expected_kernel_key)) {
     auto op = TransferLayout(
         *src_var_name, new_var_name, kernel_type_for_var.data_layout_,
-        expected_kernel_key.data_layout_, var_scope_, local_scope);
-    RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
+        expected_kernel_key.data_layout_, var_scope_, local_scope, is_fetch_v2);
+    if (op) {
+      RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name,
+                                op_func_nodes);
+    }
     // update src_var_name
     src_var_name = new_var_name;
     is_transferred = true;
@@ -46,7 +49,10 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
     auto op = TransferDtype(
         *src_var_name, new_var_name, kernel_type_for_var.data_type_,
         expected_kernel_key.data_type_, var_scope_, local_scope);
-    RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
+    if (op) {
+      RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name,
+                                op_func_nodes);
+    }
     // update src_var_name
     src_var_name = new_var_name;
     is_transferred = true;
@@ -55,9 +61,13 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
   if (need_device_transform(kernel_type_for_var, expected_kernel_key)) {
     auto src_place = kernel_type_for_var.place_;
     auto dst_place = expected_kernel_key.place_;
+
     auto op = TransferDevice(*src_var_name, new_var_name, src_place, dst_place,
                              var_scope_, local_scope);
-    RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
+    if (op) {
+      RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name,
+                                op_func_nodes);
+    }
     is_transferred = true;
   }
   return is_transferred;
@@ -128,17 +138,44 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
   new_op_func_nodes->emplace_back(std::move(new_op_func_node));
 }
 
-std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
-                                             std::string* new_var_name,
-                                             DataLayout in_layout,
-                                             DataLayout out_layout,
-                                             VariableScope* var_scope,
-                                             framework::Scope* local_scope) {
+// Returns true iff var is initialized and its (first) tensor is initialized.
+bool IsTensorOfVarInitialized(Variable* var) {
+  if (var->IsInitialized()) {
+    if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
+      return GetLoDTensorOrSelectedRowsValueFromVar(*var)->IsInitialized();
+    } else if (var->IsType<LoDTensorArray>()) {
+      const auto& arr = var->Get<LoDTensorArray>();  // may hold no tensor
+      return !arr.empty() && arr[0].IsInitialized();  // never index empty
+    }
+  }
+  return false;
+}
+
+std::shared_ptr<OperatorBase> TransferLayout(
+    const std::string& var_name, std::string* new_var_name,
+    DataLayout in_layout, DataLayout out_layout, VariableScope* var_scope,
+    framework::Scope* local_scope, bool is_fetch_v2) {
+#ifdef PADDLE_WITH_MKLDNN
+  // NOTE(zhiqiu): hot fix, follow the same logic in DataCopy() in fetch_op.cc
+  if (in_layout == framework::DataLayout::kMKLDNN &&
+      var_name == framework::GradVarName("Filter") && is_fetch_v2) {
+    out_layout = framework::DataLayout::kNCHW;
+  }
+#endif
+
   // 1. Generate new_var_name and Initialize it
-  *new_var_name =
-      var_name + "_layout_" + std::to_string(var_scope->VarSize() + 1);
-  auto* ptr = local_scope->Var(*new_var_name);
+  *new_var_name = var_name + "_layout_" +
+                  std::to_string(static_cast<int>(in_layout)) + "_" +
+                  std::to_string(static_cast<int>(out_layout));
+
+  if (var_scope->HasVar(*new_var_name) &&
+      IsTensorOfVarInitialized(var_scope->Var(*new_var_name))) {
+    // already has same var
+    VLOG(4) << "Use cached variable: " << *new_var_name;
+    return nullptr;
+  }
 
+  auto* ptr = local_scope->Var(*new_var_name);
   auto var_type = var_scope->Var(var_name)->Type();
   InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
   VLOG(3) << "Create Variable " << *new_var_name
@@ -171,10 +208,17 @@ std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
                                             VariableScope* var_scope,
                                             framework::Scope* local_scope) {
   // 1. Generate new_var_name and Initialize it
-  *new_var_name =
-      var_name + "_dtype_" + std::to_string(var_scope->VarSize() + 1);
-  auto* ptr = local_scope->Var(*new_var_name);
+  *new_var_name = var_name + "_dtype_" +
+                  std::to_string(static_cast<int>(in_dtype)) + "_" +
+                  std::to_string(static_cast<int>(out_dtype));
+  if (var_scope->HasVar(*new_var_name) &&
+      IsTensorOfVarInitialized(var_scope->Var(*new_var_name))) {
+    // already has same var
+    VLOG(4) << "Use cached variable: " << *new_var_name;
+    return nullptr;
+  }
 
+  auto* ptr = local_scope->Var(*new_var_name);
   auto var_type = var_scope->Var(var_name)->Type();
   InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
 
@@ -211,10 +255,17 @@ std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
                                              VariableScope* var_scope,
                                              framework::Scope* local_scope) {
   // 1. Generate new_var_name and Initialize it
-  *new_var_name =
-      var_name + "_device_" + std::to_string(var_scope->VarSize() + 1);
-  auto* ptr = local_scope->Var(*new_var_name);
+  *new_var_name = var_name + "_device_" + src_place.DebugString() + "_" +
+                  dst_place.DebugString();
+
+  if (var_scope->HasVar(*new_var_name) &&
+      IsTensorOfVarInitialized(var_scope->Var(*new_var_name))) {
+    // already has same var
+    VLOG(4) << "Use cached variable: " << *new_var_name;
+    return nullptr;
+  }
 
+  auto* ptr = local_scope->Var(*new_var_name);
   auto var_type = var_scope->Var(var_name)->Type();
   InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
   VLOG(3) << "Create Variable " << *new_var_name
@@ -258,12 +309,28 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
   // record the no need transform variable index.
   std::unordered_set<int> no_data_transform_index;
 
+  const std::unordered_set<std::string>* no_buffer_ins = nullptr;
+  auto& no_buffer_inferer = op_base->Info().NoNeedBufferVarsInferer();
+  if (no_buffer_inferer) {
+    no_buffer_ins = &(no_buffer_inferer(op_base->Inputs(), op_base->Outputs(),
+                                        op_base->Attrs()));
+    if (no_buffer_ins->empty()) {
+      no_buffer_ins = nullptr;
+    }
+  }
+
   DataTranferHelper data_transfer_helper(place, var_scope);
   for (auto& var_name_item : *ins_map_temp) {
+    bool should_skip_input =
+        no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0;
+
     for (size_t i = 0; i < var_name_item.second.size(); ++i) {
       auto var = var_name_item.second[i];
       auto var_name = new_ins[var_name_item.first].at(i);
       const Tensor* tensor_in;
+      std::string new_var_name;
+      bool is_transferred = false;
+
       if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
         tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
       } else if (var->IsType<LoDTensorArray>()) {
@@ -272,18 +339,54 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
       } else {
         continue;
       }
+      // special case
       if (!tensor_in->IsInitialized()) {
-        continue;
+        if (should_skip_input == true) {
+#ifdef PADDLE_WITH_MKLDNN
+          // A Var without a buffer may still be needed
+          // in some situations, such as InferShape().
+          // In that case we cannot skip Var analysis, because the
+          // MKL-DNN shape of the Var may differ from the kNHWC Var.
+          // The corresponding resized Var then
+          // has to be created and registered.
+          if ((tensor_in->layout() == DataLayout::kMKLDNN) &&
+              (var->IsType<LoDTensor>() == true) &&
+              (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) &&
+              (paddle::platform::MKLDNNDeviceContext::tls()
+                   .get_cur_paddle_data_layout() == DataLayout::kNHWC)) {
+            VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , "
+                       "but kNHWC layout"
+                    << var_name_item.first << " in Operator "
+                    << op_base->Type();
+            Scope* local_scope = use_local_scope
+                                     ? var_scope->GetMutableLocalScope()
+                                     : var_scope->GetMutableScope();
+            auto op = TransferLayout(
+                var_name, &new_var_name, tensor_in->layout(), DataLayout::kNHWC,
+                var_scope, local_scope, op_base->Type() == "fetch_v2");
+            if (op) {
+              data_transfer_helper.RunAndConstructOpFuncNode(
+                  op, var_name, new_var_name, new_op_func_nodes);
+            }
+            is_transferred = true;
+          } else {
+            VLOG(7) << "Skip scanning input " << var_name_item.first
+                    << " in Operator " << op_base->Type();
+          }
+#endif
+        } else {
+          continue;
+        }
+      } else {
+        auto kernel_type_for_var =
+            static_cast<const framework::OperatorWithKernel*>(op_base)
+                ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
+                                      expected_kernel_key);
+        // apply data transform
+        is_transferred = data_transfer_helper.apply(
+            kernel_type_for_var, expected_kernel_key, var_name, &new_var_name,
+            new_op_func_nodes, use_local_scope, op_base->Type() == "fetch_v2");
       }
-      auto kernel_type_for_var =
-          static_cast<const framework::OperatorWithKernel*>(op_base)
-              ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
-                                    expected_kernel_key);
-      // apply data transform
-      std::string new_var_name;
-      bool is_transferred = data_transfer_helper.apply(
-          kernel_type_for_var, expected_kernel_key, var_name, &new_var_name,
-          new_op_func_nodes, use_local_scope);
 
       if (is_transferred) {
         // update RuntimeContext.inputs and original op_func_node inputs
diff --git a/paddle/fluid/framework/new_executor/data_transfer.h b/paddle/fluid/framework/new_executor/data_transfer.h
index 1c480189279..9525ba5bc8f 100644
--- a/paddle/fluid/framework/new_executor/data_transfer.h
+++ b/paddle/fluid/framework/new_executor/data_transfer.h
@@ -35,7 +35,8 @@ class DataTranferHelper {
   bool apply(const OpKernelType& kernel_type_for_var,
              const OpKernelType& expected_kernel_key,
              const std::string& var_name, std::string* new_var_name,
-             std::vector<OpFuncNode>* new_op_func_nodes, bool use_local_scope);
+             std::vector<OpFuncNode>* new_op_func_nodes, bool use_local_scope,
+             bool is_fetch_v2);
 
   void RunAndConstructShareNode(const std::string& src_var_name,
                                 const std::string& dst_var_name,
@@ -94,12 +95,10 @@ inline bool need_layout_transform(const OpKernelType& kernel_type_for_var,
                                         expected_kernel_key.data_layout_);
 }
 
-std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
-                                             std::string* new_var_name,
-                                             DataLayout in_layout,
-                                             DataLayout out_layout,
-                                             VariableScope* var_scope,
-                                             framework::Scope* local_scope);
+std::shared_ptr<OperatorBase> TransferLayout(
+    const std::string& var_name, std::string* new_var_name,
+    DataLayout in_layout, DataLayout out_layout, VariableScope* var_scope,
+    framework::Scope* local_scope, bool is_fetch_v2);
 
 std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
                                             std::string* new_var_name,
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index d956f23242d..e30dd21fc5c 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -22,6 +22,9 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true,
                             "Use inplace in new executor");
@@ -55,6 +58,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
       block_(block),
       global_scope_(global_scope),
       stream_analyzer_(place) {
+  VLOG(4) << "InterpreterCore(): " << this << " on " << place_;
   is_build_ = false;
   async_work_queue_.reset(new interpreter::AsyncWorkQueue(
       kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_));
@@ -92,6 +96,14 @@ InterpreterCore::~InterpreterCore() {
   gc_.reset(nullptr);
 
   async_work_queue_.reset(nullptr);
+  VLOG(4) << "~InterpreterCore(): " << this;
+  VLOG(4) << " on" << place_;
+
+#ifdef PADDLE_WITH_MKLDNN
+  // Clear mkl-dnn cache,
+  // this is needed to have mkl-dnn unit tests working
+  platform::ClearMKLDNNCache(place_, this);
+#endif
 }
 
 void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
@@ -101,6 +113,9 @@ void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
 paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names,
     const std::vector<framework::LoDTensor>& feed_tensors) {
+#ifdef PADDLE_WITH_MKLDNN
+  platform::AttachPointerHashToMKLDNNKey(this, place_);
+#endif
   bool is_build = is_build_;
   global_scope_->SetLocalScope(local_scope_);
   Prepare(feed_names, feed_tensors, is_build);
@@ -120,6 +135,9 @@ paddle::framework::FetchList InterpreterCore::Run(
 
 paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names) {
+#ifdef PADDLE_WITH_MKLDNN
+  platform::AttachPointerHashToMKLDNNKey(this, place_);
+#endif
   if (!is_build_) {
     if (create_local_scope_ &&
         global_scope_->GetMutableLocalScope() !=
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index b9470cd3736..d56082a91a6 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -21,6 +21,10 @@
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/phi/core/kernel_factory.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 PADDLE_DEFINE_EXPORTED_bool(
     new_executor_sequential_run, false,
     "Enable sequential execution for standalone executor, used for debug");
@@ -312,6 +316,10 @@ void build_op_func_list(const platform::Place& place,
   operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
       main_program, block.ID(), ops_unique);
 
+#ifdef PADDLE_WITH_MKLDNN
+  platform::RegisterModelLayout(ops_unique, place);
+#endif
+
   // its elements will be moved to vec_func_list
   std::vector<std::shared_ptr<OperatorBase>> ops;
   for (auto& op_unique : ops_unique) {
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index 50770b6c4a7..a2250231475 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -112,7 +112,8 @@ std::shared_ptr<InterpreterCore> StandaloneExecutor::GetInterpreterCore(
   auto iter = interpretercores_.find(oss.str());
 
   if (iter == interpretercores_.end()) {
-    VLOG(3) << "create interpreter_core for " << oss.str();
+    VLOG(3) << "create interpreter_core for " << oss.str() << " on place "
+            << place_;
     VLOG(3) << "add fetch op: " << add_fetch_op;
     std::shared_ptr<InterpreterCore> core = nullptr;
     if (add_fetch_op) {
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h
index e84df2abb36..0b9e348ab76 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.h
+++ b/paddle/fluid/framework/new_executor/standalone_executor.h
@@ -63,7 +63,7 @@ class StandaloneExecutor : public ExecutorBase {
       const std::vector<std::string>& feed_names,
       const std::vector<std::string>& fetch_names, bool add_fetch_op);
 
-  const platform::Place& place_;
+  platform::Place place_;
   const ProgramDesc& startup_prog_;
   const ProgramDesc& main_prog_;
   VariableScope global_scope_;
diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc
index de3d8bd9961..111ca9c63c6 100644
--- a/paddle/fluid/operators/controlflow/fetch_op.cc
+++ b/paddle/fluid/operators/controlflow/fetch_op.cc
@@ -33,6 +33,7 @@ static void DataCopy(const framework::LoDTensor &src_item,
       framework::Tensor out;
       // Convert to desired Paddle layout, apart from grads of filter
       // as params are not a subject to paddle's data_format
+      VLOG(4) << "innerTransDataLayoutFromMKLDNN";
       framework::innerTransDataLayoutFromMKLDNN(
           src_item.layout(), fetch_var_name == framework::GradVarName("Filter")
                                  ? framework::DataLayout::kNCHW
diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc
index 96946ee15f4..f26bcdca4a7 100644
--- a/paddle/fluid/operators/transfer_layout_op.cc
+++ b/paddle/fluid/operators/transfer_layout_op.cc
@@ -67,19 +67,25 @@ class TransferLayoutOp : public framework::OperatorWithKernel {
     // kernel's device type is decided by input tensor place
     auto *in = ctx.InputVar("X");
     auto *in_tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in);
-    PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(), true,
-                      platform::errors::PreconditionNotMet(
-                          "The tensor of Input(X) is not initialized."));
+    // NOTE(zhiqiu): hot fix, allow empty tensor of kMKLDNN layout to run this
+    // op
+    if (in_tensor->layout() != DataLayout::kMKLDNN) {
+      PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(), true,
+                        platform::errors::PreconditionNotMet(
+                            "The tensor of Input(X) is not initialized."));
+    }
+    auto place =
+        in_tensor->IsInitialized() ? in_tensor->place() : platform::CPUPlace();
+
     // dtype is not important
-    return framework::OpKernelType(framework::proto::VarType::FP32,
-                                   in_tensor->place());
+    return framework::OpKernelType(framework::proto::VarType::FP32, place);
   }
 
   framework::OpKernelType GetKernelTypeForVar(
       const std::string &var_name, const framework::Tensor &tensor,
       const framework::OpKernelType &expected_kernel_type) const override {
     return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(),
+                                   expected_kernel_type.place_,
                                    expected_kernel_type.data_layout_);
   }
 };
@@ -99,7 +105,9 @@ class TransferLayoutKernel {
     auto &dev_ctx = ctx.device_context();
     auto src_layout = ctx.Attr<int>("src_layout");
     auto dst_layout = ctx.Attr<int>("dst_layout");
-    TransferLayoutFunctor(x, out, dev_ctx, src_layout, dst_layout)();
+    auto input_name = ctx.InputName("X");
+    TransferLayoutFunctor(x, out, dev_ctx, src_layout, dst_layout,
+                          input_name)();
   }
 };
 
diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h
index 06bf54e998c..c90a44dc494 100644
--- a/paddle/fluid/operators/transfer_layout_op.h
+++ b/paddle/fluid/operators/transfer_layout_op.h
@@ -39,12 +39,14 @@ class TransferLayoutFunctor {
  public:
   TransferLayoutFunctor(const framework::Variable *in, framework::Variable *out,
                         const platform::DeviceContext &dev_ctx,
-                        const int src_layout, const int dst_layout)
+                        const int src_layout, const int dst_layout,
+                        std::string in_name)
       : in_(in),
         out_(out),
         dev_ctx_(dev_ctx),
         src_layout_(src_layout),
-        dst_layout_(dst_layout) {}
+        dst_layout_(dst_layout),
+        in_name_(in_name) {}
 
   void operator()() const {
     auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_);
@@ -54,8 +56,18 @@ class TransferLayoutFunctor {
     out_tensor.set_layout(out_layout);
 
 #ifdef PADDLE_WITH_MKLDNN
+    // NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in
+    // data_transfer.cc
     auto in_layout = static_cast<DataLayout>(src_layout_);
+    auto *tensor_out = out_->GetMutable<framework::LoDTensor>();
     VLOG(4) << in_layout << "->" << out_layout << " " << in_tensor.layout();
+    if (!in_tensor.IsInitialized() && in_layout == DataLayout::kMKLDNN &&
+        out_layout == DataLayout::kNHWC) {
+      tensor_out->Resize(in_tensor.dims());
+      tensor_out->set_layout(out_layout);
+      platform::MatchShapeToLayout(tensor_out, in_layout, out_layout);
+      return;
+    }
     if (in_layout == DataLayout::kMKLDNN || out_layout == DataLayout::kMKLDNN) {
       PADDLE_ENFORCE_NE(
           in_layout, out_layout,
@@ -81,13 +93,21 @@ class TransferLayoutFunctor {
         out_tensor.set_layout(DataLayout::kMKLDNN);
         out_tensor.set_format(out_format);
       } else {
-        VLOG(4) << "kNCHW";
+        auto target_layout = paddle::platform::MKLDNNDeviceContext::tls()
+                                 .get_cur_paddle_data_layout();
+        // NOTE(zhiqiu): hot fix, follow the same logic in DataCopy() in
+        // fetch_op.cc
+        if (out_layout == DataLayout::kNCHW &&
+            in_name_ == framework::GradVarName("Filter")) {
+          target_layout = out_layout;
+        }
+        VLOG(4) << "innerTransDataLayoutFromMKLDNN: " << in_layout << "->"
+                << target_layout;
         // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
         // Do transform via MKLDNN lib
         paddle::framework::innerTransDataLayoutFromMKLDNN(
-            in_layout, paddle::platform::MKLDNNDeviceContext::tls()
-                           .get_cur_paddle_data_layout(),
-            in_tensor, &out_tensor, dev_ctx_.GetPlace());
+            in_layout, target_layout, in_tensor, &out_tensor,
+            dev_ctx_.GetPlace());
       }
     } else {
       // Case3 - transfrom between Non-MKLDNN OPKernels
@@ -132,6 +152,7 @@ class TransferLayoutFunctor {
   const platform::DeviceContext &dev_ctx_;
   const int src_layout_;
   const int dst_layout_;
+  std::string in_name_;
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 9aa362546ec..5ee54b1c865 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -742,6 +742,7 @@ dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) {
 }
 
 void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
+  VLOG(4) << tls().get_curr_exec() << " " << ptr;
   std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
   if (!block_next_cache_clearing_) {
     VLOG(3) << "Clearing DNNL cache.";
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 4001fd744e6..d2e48c11138 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -563,6 +563,7 @@ inline void RegisterModelLayout(
     std::vector<std::unique_ptr<framework::OperatorBase>>& ops,
     const platform::Place& place) {
   if (platform::is_cpu_place(place)) {
+    VLOG(4) << "RegisterModelLayout for mkldnn";
     auto check_attrib = [](std::unique_ptr<framework::OperatorBase>& op,
                            const std::string& attrib_name) -> bool {
       if (op->HasAttr(attrib_name)) {
-- 
GitLab