Unverified commit 2a1f009e authored by Aurelius84, committed by GitHub

[NewExe] Support layout/dtype transform by adding transfer_layout/transfer_dtype op (#37299)

* Add transfer_layout/dtype op

* clean useless codes

* fix unused var

* add optest in white.txt

* split into data_transfer.cc

* fix cmake

* modify according to reviewer comments

* replace cast_op with transfer_dtype_op
Parent 684de4b3
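Both transfer_dtype and transfer_layout are registered as ordinary operators, so they can also be constructed and run outside the new executor. Below is a minimal, hedged sketch (not part of this diff) of invoking transfer_dtype standalone; it assumes a CPU scope that already holds an initialized float32 LoDTensor named "x", and the attribute names follow the TransferDtype() helper added in data_transfer.cc further down. The variable and function names are illustrative only.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"
// Hedged sketch: build and run a standalone transfer_dtype (alias of cast) op.
void RunTransferDtypeSketch(paddle::framework::Scope* scope) {
  namespace fw = paddle::framework;
  scope->Var("x_fp64");  // output slot must exist before Run()
  auto op = fw::OpRegistry::CreateOp(
      "transfer_dtype",
      {{"X", {"x"}}},         // assumed: an initialized float32 tensor
      {{"Out", {"x_fp64"}}},
      fw::AttributeMap{
          {"in_dtype", static_cast<int>(fw::proto::VarType::FP32)},
          {"out_dtype", static_cast<int>(fw::proto::VarType::FP64)},
          {"use_mkldnn", false}});
  op->Run(*scope, paddle::platform::CPUPlace());
}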
......@@ -37,30 +37,19 @@ std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to) {
}
}
struct CastDataLayout {
CastDataLayout(const platform::DeviceContext* ctx,
const std::vector<int>& axis, const framework::Tensor& in,
framework::Tensor* out)
: in_(in), out_(out), ctx_(ctx), axis_(axis) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
const std::vector<int> axis_;
template <typename T>
void apply() {
auto place = ctx_->GetPlace();
if (platform::is_cpu_place(place)) {
operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans4(*context, in_, out_, axis_);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Unsupported data layout cast from CPU to GPU."));
}
template <typename T>
void CastDataLayout::apply() {
auto place = ctx_->GetPlace();
if (platform::is_cpu_place(place)) {
operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans4(*context, in_, out_, axis_);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Unsupported data layout cast from CPU to GPU."));
}
};
}
void TransDataLayout(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type, const Tensor& in,
......
......@@ -36,6 +36,21 @@ class Tensor;
namespace paddle {
namespace framework {
struct CastDataLayout {
CastDataLayout(const platform::DeviceContext* ctx,
const std::vector<int>& axis, const framework::Tensor& in,
framework::Tensor* out)
: in_(in), out_(out), ctx_(ctx), axis_(axis) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
const std::vector<int> axis_;
template <typename T>
void apply();
};
#ifdef PADDLE_WITH_MKLDNN
using MKLDNNDataType = dnnl::memory::data_type;
......
......@@ -2,10 +2,11 @@ set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_f
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor nan_inf_utils)
cc_library(data_transfer SRCS data_transfer.cc DEPS enforce scope glog)
cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce)
cc_library(new_executor_defs SRCS new_executor_defs.cc DEPS enforce glog scope)
cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS} executor_gc_helper)
cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs)
cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer)
cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs)
cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs)
cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_garbage_collector stream_analyzer event_manager)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/new_executor/data_transfer.h"
namespace paddle {
namespace framework {
namespace interpreter {
bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key,
const std::string& var_name,
std::string* new_var_name,
std::vector<OpFuncNode>* op_func_nodes,
bool use_local_scope) {
bool is_transferred = false;
auto* src_var_name = &var_name;
Scope* local_scope = use_local_scope ? var_scope_->GetMutableLocalScope()
: var_scope_->GetMutableScope();
// 1. layout transform
if (need_layout_transform(kernel_type_for_var, expected_kernel_key)) {
auto op = TransferLayout(
*src_var_name, new_var_name, kernel_type_for_var.data_layout_,
expected_kernel_key.data_layout_, var_scope_, local_scope);
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
// update src_var_name
src_var_name = new_var_name;
is_transferred = true;
}
// 2. dtype transform
if (need_dtype_transform(kernel_type_for_var, expected_kernel_key)) {
auto op = TransferDtype(
*src_var_name, new_var_name, kernel_type_for_var.data_type_,
expected_kernel_key.data_type_, var_scope_, local_scope);
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
// update src_var_name
src_var_name = new_var_name;
is_transferred = true;
}
// 3. device transform
if (need_device_transform(kernel_type_for_var, expected_kernel_key)) {
auto src_place = kernel_type_for_var.place_;
auto dst_place = expected_kernel_key.place_;
auto op = TransferDevice(*src_var_name, new_var_name, src_place, dst_place,
var_scope_, local_scope);
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
is_transferred = true;
}
return is_transferred;
}
void DataTranferHelper::RunAndConstructOpFuncNode(
const std::shared_ptr<OperatorBase>& op, const std::string& var_name,
const std::string& new_var_name,
std::vector<OpFuncNode>* new_op_func_nodes) {
auto& op_type = op->Type();
// 1. Construct RuntimeContext
RuntimeContext runtime_context({}, {});
runtime_context.inputs["X"] = {var_scope_->Var(var_name)};
runtime_context.outputs["Out"] = {var_scope_->Var(new_var_name)};
InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
// 2. Execute infer shape and choose kernel
auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
static_cast<const framework::OperatorWithKernel*>(op.get())->InferShape(
&infer_shape_ctx);
auto kernels_iter = all_op_kernels.find(op_type);
PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(),
platform::errors::Unavailable(
"There are no kernels which are registered in "
"the %s operator.",
op_type));
OpKernelMap& kernels = kernels_iter->second;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place_);
Scope scope;
auto exec_ctx = ExecutionContext(*op, scope, *dev_ctx, runtime_context);
auto expected_kernel_key =
dynamic_cast<const framework::OperatorWithKernel*>(op.get())
->GetExpectedKernelType(exec_ctx);
auto kernel_iter = kernels.find(expected_kernel_key);
// 3. Execute transfer op and construct OpFuncNode
OpFuncNode new_op_func_node;
new_op_func_node.input_index["X"] = {var_scope_->VarId(var_name)};
new_op_func_node.output_index["Out"] = {var_scope_->VarId(new_var_name)};
new_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
new_op_func_node.kernel_func_(exec_ctx);
// NOTE(Aurelius84): data transfer ops are expensive operations, so we tag them
// as kQueueSync and execute them in the thread pool.
new_op_func_node.type_ = OpFuncType::kQueueSync;
new_op_func_node.dev_ctx_ = dev_ctx;
new_op_func_node.operator_base_ = op;
VLOG(3) << "Run " << op_type << " done.";
new_op_func_nodes->emplace_back(std::move(new_op_func_node));
}
std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
std::string* new_var_name,
DataLayout in_layout,
DataLayout out_layout,
VariableScope* var_scope,
framework::Scope* local_scope) {
// 1. Generate new_var_name and Initialize it
*new_var_name =
var_name + "_layout_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(new_var_name);
auto var_type = var_scope->Var(var_name)->Type();
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
VLOG(3) << "Create Variable " << var_name << " locally, which pointer is "
<< ptr << "Variable Type " << var_type;
var_scope->SetVarDesc(var_name, nullptr);
// 2. Construct VariableNameMap
VariableNameMap in_name_map = {{"X", {var_name}}};
VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
AttributeMap attr_map = {{"dst_layout", static_cast<int>(out_layout)}};
// 3. Create transfer_op
std::string op_type("transfer_layout");
auto& op_info = OpInfoMap::Instance().Get(op_type);
auto op = std::shared_ptr<OperatorBase>(
op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
VLOG(3) << string::Sprintf("Insert %s(%s) with %s -> %s(%s).", op_type,
var_name, in_layout, *new_var_name, out_layout);
return op;
}
std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
std::string* new_var_name,
proto::VarType::Type in_dtype,
proto::VarType::Type out_dtype,
VariableScope* var_scope,
framework::Scope* local_scope) {
// 1. Generate new_var_name and Initialize it
*new_var_name =
var_name + "_dtype_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(new_var_name);
auto var_type = var_scope->Var(var_name)->Type();
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
VLOG(3) << "Create Variable " << var_name << " locally, which pointer is "
<< ptr << "Variable Type " << var_type;
var_scope->SetVarDesc(var_name, nullptr);
// 2. Construct VariableNameMap
VariableNameMap in_name_map = {{"X", {var_name}}};
VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
AttributeMap attr_map;
attr_map["in_dtype"] = static_cast<int>(in_dtype);
attr_map["out_dtype"] = static_cast<int>(out_dtype);
// NOTE(Aurelius84): In which case is use_mkldnn = true?
attr_map["use_mkldnn"] = false;
// 3. Create transfer_op
std::string op_type("transfer_dtype");
auto& op_info = OpInfoMap::Instance().Get(op_type);
auto op = std::shared_ptr<OperatorBase>(
op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", op_type,
var_name, DataTypeToString(in_dtype),
*new_var_name, DataTypeToString(out_dtype));
return op;
}
std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
std::string* new_var_name,
const platform::Place& src_place,
const platform::Place& dst_place,
VariableScope* var_scope,
framework::Scope* local_scope) {
// 1. Generate new_var_name and Initialize it
*new_var_name =
var_name + "_device_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(new_var_name);
auto var_type = var_scope->Var(var_name)->Type();
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
VLOG(3) << "Create Variable " << var_name << " locally, which pointer is "
<< ptr << "Variable Type " << var_type;
var_scope->SetVarDesc(var_name, nullptr);
// 2. Construct VariableNameMap
VariableNameMap in_name_map = {{"X", {var_name}}};
VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
int dst_place_type = platform::is_cpu_place(dst_place)
? 0
: platform::is_gpu_place(dst_place) ? 1 : -1;
AttributeMap attr_map = {{"dst_place_type", dst_place_type}};
// 3. Create transfer_op
std::string op_type = get_memcpy_type(src_place, dst_place);
auto& op_info = OpInfoMap::Instance().Get(op_type);
auto op = std::shared_ptr<OperatorBase>(
op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", op_type,
var_name, src_place, *new_var_name, dst_place);
return op;
}
void ApplyDataTransform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* new_op_func_nodes,
bool use_local_scope) {
auto op_base = op_func_node->operator_base_.get();
PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
"op_base is null, please pass a valid "
"op_base in apply_data_transform."));
VariableNameMap new_ins(op_base->Inputs());
// record the ids of input variables that need no data transform.
std::unordered_set<int> no_data_transform_index;
DataTranferHelper data_transfer_helper(place, var_scope);
for (auto& var_name_item : *ins_map_temp) {
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto var = var_name_item.second[i];
if (!(var->IsType<LoDTensor>() || var->IsType<SelectedRows>())) {
continue;
}
auto& var_name = new_ins[var_name_item.first].at(i);
auto tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
if (!tensor_in->IsInitialized()) {
continue;
}
auto kernel_type_for_var =
static_cast<const framework::OperatorWithKernel*>(op_base)
->GetKernelTypeForVar(var_name_item.first, *tensor_in,
expected_kernel_key);
// apply data transform
std::string new_var_name;
bool is_transferred = data_transfer_helper.apply(
kernel_type_for_var, expected_kernel_key, var_name, &new_var_name,
new_op_func_nodes, use_local_scope);
if (is_transferred) {
// update RuntimeContext.inputs and original op_func_node inputs
op_func_node->input_index[var_name_item.first][i] =
var_scope->VarId(new_var_name);
var_name_item.second[i] = var_scope->Var(new_var_name);
new_ins[var_name_item.first][i] = new_var_name;
// NOTE(Aurelius84): avoid deep-copying twice if we have already inserted a
// data transfer op.
if (op_base->Type() == "fetch_v2") {
op_base->SetAttr("deepcopy", false);
}
} else {
// record no need data transformer input var_id
VLOG(3) << op_base->Type()
<< " found no data_transform var: " << var_name
<< " with id: " << var_scope->VarId(var_name);
no_data_transform_index.emplace(var_scope->VarId(var_name));
}
}
}
// NOTE(zhiqiu): UPDATE the corresponding OperatorBase to make it consistent
// with the instruction. (hot fix; this is not a good design)
op_func_node->operator_base_ =
std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
op_base->Type(), new_ins, op_base->Outputs(), op_base->Attrs()));
op_func_node->no_data_transform_index = std::move(no_data_transform_index);
}
std::string get_memcpy_type(const platform::Place& src_place,
const platform::Place& dst_place) {
PADDLE_ENFORCE_EQ(platform::is_same_place(src_place, dst_place), false,
platform::errors::PreconditionNotMet(
"Required src_place shall be different with dst_place, "
"but received same place: %s",
src_place));
if (platform::is_gpu_place(dst_place)) {
return kMemcpyH2D;
} else if (platform::is_gpu_place(src_place)) {
return kMemcpyD2H;
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not support Memcpy typ : %s -> %s", src_place, dst_place));
}
}
} // namespace interpreter
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/framework/op_kernel_type.h"
namespace paddle {
namespace framework {
namespace interpreter {
/*
* A helper class to implement data transform operations.
* It applies layout, dtype, and device transfers in turn.
*/
class DataTranferHelper {
public:
DataTranferHelper(const platform::Place& place, VariableScope* var_scope)
: place_(place), var_scope_(var_scope) {}
bool apply(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key,
const std::string& var_name, std::string* new_var_name,
std::vector<OpFuncNode>* new_op_func_nodes, bool use_local_scope);
private:
platform::Place place_;
VariableScope* var_scope_;
void RunAndConstructOpFuncNode(const std::shared_ptr<OperatorBase>& op,
const std::string& var_name,
const std::string& new_var_name,
std::vector<OpFuncNode>* op_func_nodes);
};
void ApplyDataTransform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* op_func_nodes,
bool use_local_scope = true);
std::string get_memcpy_type(const platform::Place& src_place,
const platform::Place& dst_place);
inline bool need_device_transform(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
auto& src_place = kernel_type_for_var.place_;
auto& dst_place = expected_kernel_key.place_;
if (platform::is_same_place(src_place, dst_place) ||
(platform::is_cuda_pinned_place(src_place) &&
platform::is_cpu_place(dst_place))) {
return false;
}
return true;
}
inline bool need_dtype_transform(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
return framework::NeedTransformDataType(kernel_type_for_var,
expected_kernel_key);
}
inline bool need_layout_transform(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
return framework::NeedTransformLayout(kernel_type_for_var.data_layout_,
expected_kernel_key.data_layout_);
}
std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
std::string* new_var_name,
DataLayout in_layout,
DataLayout out_layout,
VariableScope* var_scope,
framework::Scope* local_scope);
std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
std::string* new_var_name,
proto::VarType::Type in_dtype,
proto::VarType::Type out_dtype,
VariableScope* var_scope,
framework::Scope* local_scope);
std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
std::string* new_var_name,
const platform::Place& src_place,
const platform::Place& dst_place,
VariableScope* var_scope,
framework::Scope* local_scope);
} // namespace interpreter
} // namespace framework
} // namespace paddle
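For reference, a small usage sketch of the helper declared above (again, not part of the diff): it mirrors how ApplyDataTransform drives DataTranferHelper for each input, with the variable name "x" and the surrounding kernel types assumed to be prepared exactly as in interpretercore_util.cc.
#include <string>
#include <vector>
#include "paddle/fluid/framework/new_executor/data_transfer.h"
// Hedged sketch: apply layout/dtype/device transfer for one input variable.
std::vector<paddle::framework::OpFuncNode> TransferOneVarSketch(
    const paddle::platform::Place& place,
    paddle::framework::VariableScope* var_scope,
    const paddle::framework::OpKernelType& kernel_type_for_var,
    const paddle::framework::OpKernelType& expected_kernel_key) {
  namespace interp = paddle::framework::interpreter;
  interp::DataTranferHelper helper(place, var_scope);
  std::string new_var_name;
  std::vector<paddle::framework::OpFuncNode> transfer_nodes;
  bool transferred = helper.apply(kernel_type_for_var, expected_kernel_key,
                                  /*var_name=*/"x", &new_var_name,
                                  &transfer_nodes, /*use_local_scope=*/true);
  if (transferred) {
    // The consuming op should now read new_var_name instead of "x"; the
    // returned nodes are tagged kQueueSync and must run before that op.
  }
  return transfer_nodes;
}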
......@@ -15,6 +15,7 @@
#include <algorithm>
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/new_executor/data_transfer.h"
namespace paddle {
namespace framework {
......@@ -114,23 +115,6 @@ get_unused_vars(const BlockDesc& block,
return result;
}
std::string get_memcpy_type(const platform::Place& src_place,
const platform::Place& dst_place) {
PADDLE_ENFORCE_EQ(platform::is_same_place(src_place, dst_place), false,
platform::errors::PreconditionNotMet(
"Required src_place shall be different with dst_place, "
"but received same place: %s",
src_place));
if (platform::is_gpu_place(dst_place)) {
return kMemcpyH2D;
} else if (platform::is_gpu_place(src_place)) {
return kMemcpyD2H;
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not support Memcpy typ : %s -> %s", src_place, dst_place));
}
}
void build_variable_scope(const framework::BlockDesc& block,
VariableScope* var_scope, bool use_local_scope) {
VLOG(3) << "Creating Variables";
......@@ -269,195 +253,6 @@ void deal_operator_base(const platform::Place& place,
op_func_node->dev_ctx_ = dev_ctx;
}
// the return value is whether data transformer is needed for this var
bool need_place_transform_for_var(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
if (platform::is_same_place(kernel_type_for_var.place_,
expected_kernel_key.place_) ||
(is_cuda_pinned_place(kernel_type_for_var.place_) &&
is_cpu_place(expected_kernel_key.place_))) {
return false;
} else {
return true;
}
}
bool need_dtype_transform_for_var(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
return false; // TODO(@xiongkun) add dtype judgement here
}
bool need_layout_transform_for_var(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
return false; // TODO(@xiongkun) add layout judgement here
}
// NOTE(@xiongkun03)
// the difference between var_name and outer_name :
// if "X": ["var1", "var2"], then X is the outer name,
// var1 and var2 is the var_name
std::tuple<std::string, OpFuncNode> apply_place_transform_for_var(
const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key, const platform::Place& place,
const std::string& var_name, const std::string& outer_name,
const OpFuncNode& op_func_node, Variable* var, VariableScope* var_scope,
bool use_local_scope = true) {
Scope* local_scope = use_local_scope ? var_scope->GetMutableLocalScope()
: var_scope->GetMutableScope();
auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
std::string new_var_name =
var_name + "_copy_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(new_var_name);
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var->Type()));
VLOG(3) << "Create Variable " << var_name << " locally, which pointer is "
<< ptr << "Variable Type " << var->Type();
var_scope->SetVarDesc(var_name, nullptr);
VariableNameMap copy_in_map;
copy_in_map["X"] = {var_name};
VariableNameMap copy_out_map;
copy_out_map["Out"] = {new_var_name};
AttributeMap attr_map;
attr_map["dst_place_type"] =
is_cpu_place(expected_kernel_key.place_)
? 0
: is_gpu_place(expected_kernel_key.place_) ? 1 : -1;
std::map<std::string, std::vector<int>> copy_ins_name2id;
copy_ins_name2id["X"] = {var_scope->VarId(var_name)};
std::map<std::string, std::vector<int>> copy_out_name2id;
copy_out_name2id["Out"] = {var_scope->VarId(new_var_name)};
VariableValueMap copy_ins_value_map;
copy_ins_value_map["X"] = {var};
VariableValueMap copy_outs_value_map;
copy_outs_value_map["Out"] = {var_scope->Var(new_var_name)};
// memcpy_d2h, memcpy_h2d
auto memcpy_op_type =
get_memcpy_type(kernel_type_for_var.place_, expected_kernel_key.place_);
VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", memcpy_op_type,
var_name, kernel_type_for_var.place_, new_var_name,
expected_kernel_key.place_);
auto& copy_info = OpInfoMap::Instance().Get(memcpy_op_type);
auto copy_op = std::shared_ptr<OperatorBase>(
copy_info.Creator()(memcpy_op_type, copy_in_map, copy_out_map, attr_map));
OpFuncNode copy_op_func_node;
copy_op_func_node.input_index = copy_ins_name2id;
copy_op_func_node.output_index = copy_out_name2id;
RuntimeContext copy_runtime_context({}, {});
copy_runtime_context.inputs.swap(copy_ins_value_map);
copy_runtime_context.outputs.swap(copy_outs_value_map);
InterpretercoreInferShapeContext copy_infer_shape_ctx(*copy_op.get(),
copy_runtime_context);
static_cast<const framework::OperatorWithKernel*>(copy_op.get())
->InferShape(&copy_infer_shape_ctx);
auto kernels_iter = all_op_kernels.find(memcpy_op_type);
PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(),
platform::errors::Unavailable(
"There are no kernels which are registered in "
"the memcpy operator."));
OpKernelMap& kernels = kernels_iter->second;
auto* dev_ctx = pool.Get(place);
Scope scope;
auto copy_exec_ctx =
ExecutionContext(*copy_op, scope, *dev_ctx, copy_runtime_context);
auto copy_expected_kernel_key =
dynamic_cast<const framework::OperatorWithKernel*>(copy_op.get())
->GetExpectedKernelType(copy_exec_ctx);
auto kernel_iter = kernels.find(copy_expected_kernel_key);
copy_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
copy_op_func_node.kernel_func_(copy_exec_ctx);
VLOG(3) << "Run " << memcpy_op_type << " done.";
// NOTE(Aurelius84): memcpy_op is expensive operation, so we tag them
// as kQueueSync and execute them in thread pool.
copy_op_func_node.type_ = OpFuncType::kQueueSync;
copy_op_func_node.dev_ctx_ = dev_ctx;
copy_op_func_node.operator_base_ = copy_op;
return std::make_pair(new_var_name, copy_op_func_node);
}
void apply_data_transform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* copy_func_nodes,
bool use_local_scope = true) {
auto op_base = op_func_node->operator_base_.get();
PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
"op_base is null, please pass a valid "
"op_base in apply_data_transform."));
VariableNameMap new_ins(op_base->Inputs());
std::unordered_set<int>
no_data_transform_index; // record the no need transform variable index.
for (auto& var_name_item : *ins_map_temp) {
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto var = var_name_item.second[i];
if (!(var->IsType<LoDTensor>() || var->IsType<SelectedRows>())) {
continue;
}
auto& var_name = new_ins[var_name_item.first].at(i);
auto tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
if (!tensor_in->IsInitialized()) {
continue;
}
auto kernel_type_for_var = // the true kernel type for op_base
static_cast<const framework::OperatorWithKernel*>(op_base)
->GetKernelTypeForVar(var_name_item.first, *tensor_in,
expected_kernel_key);
if (need_place_transform_for_var(kernel_type_for_var,
expected_kernel_key)) {
if (op_base->Type() == "fetch_v2") {
op_base->SetAttr("deepcopy", false);
}
std::string new_var_name;
OpFuncNode copy_op_func_node;
std::tie(new_var_name, copy_op_func_node) =
apply_place_transform_for_var(kernel_type_for_var,
expected_kernel_key, place, var_name,
var_name_item.first, *op_func_node,
var, var_scope, use_local_scope);
op_func_node->input_index[var_name_item.first][i] =
var_scope->VarId(new_var_name);
copy_func_nodes->emplace_back(copy_op_func_node);
var_name_item.second[i] = var_scope->Var(new_var_name);
new_ins[var_name_item.first][i] = new_var_name;
} else if (need_dtype_transform_for_var(kernel_type_for_var,
expected_kernel_key)) {
// TODO(@xiongkun) add dtype judgement here
} else if (need_layout_transform_for_var(kernel_type_for_var,
expected_kernel_key)) {
// TODO(@xiongkun) add layout judgement here
} else {
// record no need data transformer input var_id
VLOG(3) << op_base->Type()
<< " found no data_transform var: " << var_name
<< " with id: " << var_scope->VarId(var_name);
no_data_transform_index.emplace(var_scope->VarId(var_name));
}
}
}
// NOTE(zhiqiu): UPDATE the corresponding OeratorBase to make it consistent
// with instruction
// hot fix, it is not good design here
op_func_node->operator_base_ =
std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
op_base->Type(), new_ins, op_base->Outputs(), op_base->Attrs()));
op_func_node->no_data_transform_index = std::move(no_data_transform_index);
}
void build_op_func_list(const platform::Place& place,
const framework::BlockDesc& block,
std::vector<OpFuncNode>* vec_func_list,
......@@ -498,6 +293,7 @@ void build_op_func_list(const platform::Place& place,
// step 2: build OpFuncNode
OpFuncNode op_func_node;
op_func_node.operator_base_ = ops[i];
op_func_node.input_index = ins_name2id;
op_func_node.output_index = outs_name2id;
......@@ -538,16 +334,13 @@ void build_op_func_list(const platform::Place& place,
&expected_kernel_key); // change device by the device_guard()
VLOG(3) << "expected_kernel_key : " << expected_kernel_key;
// step 3. apply data transforms and insert memory ops
// step 3. apply data transforms and insert data transfer ops
VariableValueMap& ins_map_temp = runtime_context.inputs;
std::vector<OpFuncNode> copy_op_to_insert;
// NOTE(xiongkun03): assign op_base here to reduce parameter number of
// apply_data_transform.
op_func_node.operator_base_ = ops[i];
apply_data_transform(expected_kernel_key, place, &ins_map_temp, var_scope,
&op_func_node, &copy_op_to_insert, use_local_scope);
for (auto& item : copy_op_to_insert) {
vec_func_list->push_back(item);
std::vector<OpFuncNode> new_op_func_nodes;
ApplyDataTransform(expected_kernel_key, place, &ins_map_temp, var_scope,
&op_func_node, &new_op_func_nodes, use_local_scope);
for (auto& item : new_op_func_nodes) {
vec_func_list->emplace_back(std::move(item));
}
// step 4. Run op kernel
VLOG(3) << op->Type()
......@@ -660,12 +453,13 @@ void update_var_min_rw_op(const std::map<int, std::set<int>>& op2dependences,
int cur_op, int rw_var) {
// rw_var is inputs or outputs of cur_op
// this function update the var2min_rw_op set .
if (var2min_rw_op->find(rw_var) == var2min_rw_op->end())
if (var2min_rw_op->find(rw_var) == var2min_rw_op->end()) {
(*var2min_rw_op)[rw_var] = std::list<int>();
}
for (auto dep_op : op2dependences.at(cur_op)) {
(*var2min_rw_op)[rw_var].remove(dep_op);
var2min_rw_op->at(rw_var).remove(dep_op);
}
(*var2min_rw_op)[rw_var].push_back(cur_op);
var2min_rw_op->at(rw_var).push_back(cur_op);
}
std::map<int, std::list<int>> get_downstream_map(
......
......@@ -94,9 +94,6 @@ class AsyncWorkQueue {
AtomicVectorSizeT atomic_var_ref_;
};
std::string get_memcpy_type(const platform::Place& src_place,
const platform::Place& dst_place);
void build_variable_scope(const framework::BlockDesc& block,
VariableScope* var_scope,
bool use_local_scope = true);
......
......@@ -112,16 +112,23 @@ class CastOp : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(cast, ops::CastOp,
ops::CastOpGradMaker<paddle::framework::OpDesc>,
ops::CastOpGradMaker<paddle::imperative::OpBase>,
ops::CastOpProtoMaker);
REGISTER_OP_CPU_KERNEL(
cast, ops::CastOpKernel<CPU, float>, ops::CastOpKernel<CPU, double>,
ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int64_t>,
ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int16_t>,
ops::CastOpKernel<CPU, bool>, ops::CastOpKernel<CPU, uint8_t>,
ops::CastOpKernel<CPU, paddle::platform::float16>,
ops::CastOpKernel<CPU, paddle::platform::bfloat16>,
ops::CastOpKernel<CPU, paddle::platform::complex<float>>,
ops::CastOpKernel<CPU, paddle::platform::complex<double>>);
#define REGISTER_CAST_CPU_BASE(op_name, ...) \
REGISTER_OPERATOR(op_name, ops::CastOp, \
ops::CastOpGradMaker<paddle::framework::OpDesc>, \
ops::CastOpGradMaker<paddle::imperative::OpBase>, \
ops::CastOpProtoMaker); \
REGISTER_OP_CPU_KERNEL( \
op_name, ops::CastOpKernel<CPU, float>, ops::CastOpKernel<CPU, double>, \
ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int64_t>, \
ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int16_t>, \
ops::CastOpKernel<CPU, bool>, ops::CastOpKernel<CPU, uint8_t>, \
ops::CastOpKernel<CPU, paddle::platform::float16>, \
ops::CastOpKernel<CPU, paddle::platform::bfloat16>, \
ops::CastOpKernel<CPU, paddle::platform::complex<float>>, \
ops::CastOpKernel<CPU, paddle::platform::complex<double>>);
REGISTER_CAST_CPU_BASE(cast)
// [ why register transfer_dtype_op alias with cast_op? ]
// In the case of InterpreterCore, if we reuse cast_op, we cannot distinguish
// which cast_op was inserted by the new executor when profiling.
REGISTER_CAST_CPU_BASE(transfer_dtype)
......@@ -107,6 +107,9 @@ namespace plat = paddle::platform;
#if !defined(PADDLE_WITH_HIP)
REGISTER_CAST_CUDA_BASE(cast, ops::CastCUDAOpKernel<plat::bfloat16>)
// See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastCUDAOpKernel<plat::bfloat16>)
#else
REGISTER_CAST_CUDA_BASE(cast)
REGISTER_CAST_CUDA_BASE(transfer_dtype)
#endif
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/transfer_layout_op.h"
#include <string>
namespace paddle {
namespace framework {
class OpDesc;
class InferShapeContext;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace operators {
class TransferLayoutOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "TransferLayout");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TransferLayout");
auto dst_layout = ctx->Attrs().Get<int>("dst_layout");
auto low_bound = static_cast<int>(framework::DataLayout::kNHWC);
auto upper_bound = static_cast<int>(framework::DataLayout::kMKLDNN);
PADDLE_ENFORCE_GE(
dst_layout, low_bound,
platform::errors::PreconditionNotMet(
"Required dst_layout >= %d, but received dst_layout = %d",
low_bound, dst_layout));
PADDLE_ENFORCE_LE(
dst_layout, upper_bound,
platform::errors::PreconditionNotMet(
"Required dst_layout <= %d, but received dst_layout = %d",
upper_bound, dst_layout));
// TODO(Aurelius84): Out's ddim differs from X's because they have
// different layouts.
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
// kernel's device type is decided by input tensor place
auto *in = ctx.InputVar("X");
auto *in_tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in);
PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"The tensor of Input(X) is not initialized."));
// dtype is not important
return framework::OpKernelType(framework::proto::VarType::FP32,
in_tensor->place());
}
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(),
expected_kernel_type.data_layout_);
}
};
class TransferLayoutInferVarType : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
ctx->SyncTypeAndDataType("X", "Out");
}
};
class TransferLayoutKernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
auto *x = ctx.InputVar("X");
auto *out = ctx.OutputVar("Out");
auto &dev_ctx = ctx.device_context();
auto dst_layout = ctx.Attr<int>("dst_layout");
TransferLayoutFunctor(x, out, dev_ctx, dst_layout)();
}
};
class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(LoDTensor) The input Tensor");
AddOutput("Out", "(LoDTensor) The Output Tensor with desired layout");
AddAttr<int>("dst_layout",
"kNHWC = 0, kNCHW = 1, kAnyLayout = 2, kMKLDNN = 3");
AddComment(R"DOC(
TransferLayout Operator)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
transfer_layout, ops::TransferLayoutOp, ops::TransferLayoutOpProtoMaker,
ops::TransferLayoutInferVarType,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
// dtype is not important
REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, float,
ops::TransferLayoutKernel);
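Like transfer_dtype, the new transfer_layout op can be exercised directly. A hedged sketch (not part of the diff), assuming a CPU scope with an initialized 4-D float32 LoDTensor named "x" and using the dst_layout encoding from the proto maker above (kNHWC = 0); names are illustrative only.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"
// Hedged sketch: build and run a standalone transfer_layout op.
void RunTransferLayoutSketch(paddle::framework::Scope* scope) {
  namespace fw = paddle::framework;
  scope->Var("x_nhwc");  // output slot must exist before Run()
  auto op = fw::OpRegistry::CreateOp(
      "transfer_layout",
      {{"X", {"x"}}},      // assumed: an initialized 4-D tensor (NCHW)
      {{"Out", {"x_nhwc"}}},
      fw::AttributeMap{{"dst_layout", 0 /* kNHWC */}});
  op->Run(*scope, paddle::platform::CPUPlace());
}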
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class LoDTensor;
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
using DataLayout = framework::DataLayout;
class TransferLayoutFunctor {
public:
TransferLayoutFunctor(const framework::Variable *in, framework::Variable *out,
const platform::DeviceContext &dev_ctx,
const int dst_layout)
: in_(in), out_(out), dev_ctx_(dev_ctx), dst_layout_(dst_layout) {}
void operator()() const {
auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_);
framework::LoDTensor out_tensor;
auto out_layout = static_cast<DataLayout>(dst_layout_);
out_tensor.set_layout(out_layout);
#ifdef PADDLE_WITH_MKLDNN
auto in_layout = in_tensor.layout();
if (in_layout == DataLayout::kMKLDNN || out_layout == DataLayout::kMKLDNN) {
PADDLE_ENFORCE_NE(
in_layout, out_layout,
platform::errors::PreconditionNotMet(
"No layout transform needed between two MKLDNN OPKernels."));
if (in_layout != DataLayout::kMKLDNN &&
out_layout == DataLayout::kMKLDNN) {
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occurs
auto out_format = platform::MKLDNNFormatForSize(
in_tensor.dims().size(), ToMKLDNNFormat(in_layout));
out_tensor.ShareDataWith(in_tensor);
// For NHWC data we need to reshape the tensor, as MKL-DNN
// expects the NHWC dims description order
platform::MatchShapeToLayout(&out_tensor, in_layout, out_layout);
paddle::platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout(
in_layout);
out_tensor.set_layout(DataLayout::kMKLDNN);
out_tensor.set_format(out_format);
} else {
// Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
// Do transform via MKLDNN lib
innerTransDataLayoutFromMKLDNN(
in_layout, paddle::platform::MKLDNNDeviceContext::tls()
.get_cur_paddle_data_layout(),
in_tensor, &out_tensor, dev_ctx_.GetPlace());
}
} else {
// Case3 - transform between Non-MKLDNN OPKernels
TransDataLayout(dev_ctx_, in_tensor, &out_tensor);
}
#else
// Case3 - transform between Non-MKLDNN OPKernels
TransDataLayout(dev_ctx_, in_tensor, &out_tensor);
#endif
framework::SetTensorToVariable(*in_, out_tensor, out_);
}
private:
void TransDataLayout(const platform::DeviceContext &dev_ctx,
const framework::Tensor &in,
framework::Tensor *out) const {
PADDLE_ENFORCE_EQ(
framework::arity(in.dims()), 4,
platform::errors::InvalidArgument(
"Input dimension arity only can be 4, the input dimension is %s.",
in.dims()));
auto src_dim = in.dims();
std::vector<int64_t> dst_dim;
auto axis = framework::GetAxis(in.layout(), out->layout());
dst_dim.resize(axis.size());
for (size_t i = 0; i < axis.size(); i++) {
dst_dim[i] = src_dim[axis[i]];
}
out->Resize(framework::make_ddim(dst_dim));
out->mutable_data(in.place(), in.type());
framework::VisitDataType(
in.type(), framework::CastDataLayout(&dev_ctx, axis, in, out));
}
const framework::Variable *in_;
framework::Variable *out_;
const platform::DeviceContext &dev_ctx_;
const int dst_layout_;
};
} // namespace operators
} // namespace paddle
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from op_test import OpTest, convert_uint16_to_float, convert_float_to_uint16
class TestTransferDtypeOpFp32ToFp64(OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10])
self.inputs = {'X': ipt.astype('float32')}
self.outputs = {'Out': ipt.astype('float64')}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.FP64),
'in_dtype': int(core.VarDesc.VarType.FP32)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output()
class TestTransferDtypeOpFp16ToFp32(OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10])
self.inputs = {'X': ipt.astype('float16')}
self.outputs = {'Out': ipt.astype('float32')}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.FP32),
'in_dtype': int(core.VarDesc.VarType.FP16)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output(atol=1e-3)
class TestTransferDtypeOpFp32ToFp16(OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10])
self.inputs = {'X': ipt.astype('float32')}
self.outputs = {'Out': ipt.astype('float16')}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.FP16),
'in_dtype': int(core.VarDesc.VarType.FP32)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output(atol=1e-3)
class TestTransferDtypeOpBf16ToFp32(OpTest):
def setUp(self):
ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16')
self.inputs = {'X': ipt}
self.outputs = {'Out': convert_uint16_to_float(ipt)}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.FP32),
'in_dtype': int(core.VarDesc.VarType.BF16)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output()
class TestTransferDtypeFp32ToBf16(OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10]).astype('float32')
self.inputs = {'X': ipt}
self.outputs = {'Out': convert_float_to_uint16(ipt)}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.BF16),
'in_dtype': int(core.VarDesc.VarType.FP32)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from op_test import OpTest
# default kNCHW
class TestTransferLayoutOpkNCHWTokNHWC(OpTest):
def setUp(self):
ipt = np.random.random(size=[2, 3, 10, 10])
self.inputs = {'X': ipt.astype('float32')}
self.outputs = {'Out': ipt.transpose([0, 2, 3, 1])}
self.attrs = {
'dst_layout': 0 # kNHWC
}
self.op_type = 'transfer_layout'
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -14,6 +14,8 @@
STATIC_MODE_TESTING_LIST = [
'test_affine_channel_op',
'test_transfer_dtype_op',
'test_transfer_layout_op',
'test_concat_op',
'test_elementwise_add_op',
'test_elementwise_sub_op',
......