diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc
index 51cf78d1cdf27250e4cf83cbba54895f7dfeab07..581701c1e119c876464fd62c25ab2815c20cf1e9 100644
--- a/paddle/fluid/framework/new_executor/data_transfer.cc
+++ b/paddle/fluid/framework/new_executor/data_transfer.cc
@@ -149,7 +149,8 @@ std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
   // 2. Construct VariableNameMap
   VariableNameMap in_name_map = {{"X", {var_name}}};
   VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
-  AttributeMap attr_map = {{"dst_layout", static_cast<int>(out_layout)}};
+  AttributeMap attr_map = {{"src_layout", static_cast<int>(in_layout)},
+                           {"dst_layout", static_cast<int>(out_layout)}};
 
   // 3. Create transfer_layout_op
   std::string op_type("transfer_layout");
@@ -157,8 +158,9 @@ std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
   auto op = std::shared_ptr<OperatorBase>(
       op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
 
-  VLOG(3) << string::Sprintf("Insert %s(%s) with %s -> %s(%s).", op_type,
-                             var_name, in_layout, *new_var_name, out_layout);
+  VLOG(3) << string::Sprintf("Insert %s for variable %s(%s) -> %s(%s).",
+                             op_type, var_name, in_layout, *new_var_name,
+                             out_layout);
 
   return op;
 }
@@ -242,6 +244,7 @@ std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
 void ApplyDataTransform(const OpKernelType& expected_kernel_key,
                         const platform::Place& place,
                         VariableValueMap* ins_map_temp,
+                        VariableValueMap* outs_map_temp,
                         VariableScope* var_scope, OpFuncNode* op_func_node,
                         std::vector<OpFuncNode>* new_op_func_nodes,
                         bool use_local_scope) {
@@ -251,6 +254,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
                           "op_base in apply_data_transform."));
 
   VariableNameMap new_ins(op_base->Inputs());
+  VariableNameMap new_outs(op_base->Outputs());
   // record the no need transform variable index.
   std::unordered_set<int> no_data_transform_index;
@@ -258,7 +262,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
   for (auto& var_name_item : *ins_map_temp) {
     for (size_t i = 0; i < var_name_item.second.size(); ++i) {
       auto var = var_name_item.second[i];
-      auto& var_name = new_ins[var_name_item.first].at(i);
+      auto var_name = new_ins[var_name_item.first].at(i);
       const Tensor* tensor_in;
       if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
         tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
@@ -287,6 +291,28 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
             var_scope->VarId(new_var_name);
         var_name_item.second[i] = var_scope->Var(new_var_name);
         new_ins[var_name_item.first][i] = new_var_name;
+        for (auto& pair : new_outs) {
+          for (size_t j = 0; j < pair.second.size(); ++j) {
+            VLOG(4) << pair.second[j] << " " << var_name;
+            if (pair.second[j] == var_name) {
+              VLOG(4) << "Found inplace between input(" << var_name_item.first
+                      << ") and output(" << pair.first
+                      << "), the variable name is " << var_name;
+              (*outs_map_temp)[pair.first][j] = var_scope->Var(new_var_name);
+              new_outs[pair.first][j] = new_var_name;
+              op_func_node
+                  ->inplace_back_map[var_scope->GetIdByName(new_var_name)] =
+                  var_scope->GetIdByName(var_name);
+              op_func_node->output_index[pair.first][j] =
+                  var_scope->VarId(new_var_name);
+              // NOTE(zhiqiu): an inplace op with a data transfer also changes
+              // its original output afterwards, so record the original
+              // output as well
+              op_func_node->output_index[pair.first].push_back(
+                  var_scope->VarId(var_name));
+            }
+          }
+        }
         // NOTE(Aurelius84): avoid deepcopy twice if we already insert data
         // transfer op.
         if (op_base->Type() == "fetch_v2") {
@@ -306,7 +332,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
     // with instruction. (hot fix, it is not good design here)
     op_func_node->operator_base_ =
         std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
-            op_base->Type(), new_ins, op_base->Outputs(), op_base->Attrs()));
+            op_base->Type(), new_ins, new_outs, op_base->Attrs()));
   op_func_node->no_data_transform_index = std::move(no_data_transform_index);
 }
diff --git a/paddle/fluid/framework/new_executor/data_transfer.h b/paddle/fluid/framework/new_executor/data_transfer.h
index 7744e955c857874f7a01190092cb299f93175e43..1c48018927934cf5987f9bee232ecf9d9ba38bd1 100644
--- a/paddle/fluid/framework/new_executor/data_transfer.h
+++ b/paddle/fluid/framework/new_executor/data_transfer.h
@@ -54,6 +54,7 @@ class DataTranferHelper {
 void ApplyDataTransform(const OpKernelType& expected_kernel_key,
                         const platform::Place& place,
                         VariableValueMap* ins_map_temp,
+                        VariableValueMap* outs_map_temp,
                         VariableScope* var_scope, OpFuncNode* op_func_node,
                         std::vector<OpFuncNode>* op_func_nodes,
                         bool use_local_scope = true);
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index b36ff519ce18722160f1e98a4d64167f4945ae22..d956f23242d4b52c9def4f9a5daa244cc6cb9523 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -457,6 +457,21 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 
   VLOG(4) << "End run " << place << " " << op->DebugStringEx(global_scope_);
 
+  if (!instr_node.InplaceBackMap().empty()) {
+    auto& m = instr_node.InplaceBackMap();
+    // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
+    for (auto& p : m) {
+      auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
+          global_scope_->Var(p.first));
+      auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
+          global_scope_->Var(p.second));
+      original_tensor->ShareDataWith(*transformed_tensor);
+      VLOG(4) << "Transfer inplace variable back from "
+              << global_scope_->GetNameById(p.first) << " to "
+              << global_scope_->GetNameById(p.second);
+    }
+  }
+
   /*For profiling/benchmark only*/
   if (FLAGS_benchmark) {
     instr_node.DeviceContext().Wait();
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index a045d6c7f4a65fdda83578f319b75788d0f68f95..b89683dc4399167e3c702dadb1bc9e25f31c0677 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -138,7 +138,9 @@ get_unused_vars(const BlockDesc& block,
     size_t op_idx = name_op_idx_pair.second;
 
     result[ops[op_idx].get()].emplace_back(name);
+    VLOG(4) << ops[op_idx].get()->Type() << " " << name;
   }
+  VLOG(4) << "gc map size:" << result.size();
   return result;
 }
 
@@ -311,8 +313,8 @@ void build_op_func_list(const platform::Place& place,
   operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
       main_program, block.ID(), ops_unique);
 
-  std::vector<std::shared_ptr<OperatorBase>>
-      ops;  // its elements will be moved to vec_func_list
+  // its elements will be moved to vec_func_list
+  std::vector<std::shared_ptr<OperatorBase>> ops;
   for (auto& op_unique : ops_unique) {
     ops.emplace_back(std::move(op_unique));
   }
@@ -348,34 +350,28 @@ void build_op_func_list(const platform::Place& place,
     op_func_node.operator_base_ = ops[i];
     op_func_node.input_index = ins_name2id;
     op_func_node.output_index = outs_name2id;
+    VLOG(4)
+        << "Start run " << place << " " << op->DebugStringEx(local_scope);
 
-    if (dynamic_cast<const framework::OperatorWithKernel*>(op) == nullptr) {
+    if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
+      // op is not an OperatorWithKernel, so directly run OperatorBase::Run()
       deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope);
+      VLOG(4) << "End run " << place << " "
+              << op_func_node.operator_base_->DebugStringEx(local_scope);
     } else {
-      auto op_with_kernel =
-          static_cast<const framework::OperatorWithKernel*>(op);
+      auto op_with_kernel = const_cast<framework::OperatorWithKernel*>(
+          static_cast<const framework::OperatorWithKernel*>(op));
       // construct RuntimeContext and analysis KernelType
       RuntimeContext runtime_context({}, {});
       runtime_context.inputs.swap(ins_map);
       runtime_context.outputs.swap(outs_map);
-      // see OperatorWithKernel::RunImpl in operator.cc for why
-      if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
-            op->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
-        InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
-        // TODO(Aurelius84): In case of control flow ops, they are NOT
-        // inheritted
-        // from OperatorWithKernel.
-        op_with_kernel->Info().infer_shape_(&infer_shape_ctx);
-      }
-
       platform::DeviceContextPool& pool =
           platform::DeviceContextPool::Instance();
       auto* dev_ctx = pool.Get(place);
       Scope scope;
 
       auto expected_kernel_key = op_with_kernel->GetExpectedKernelType(
           ExecutionContext(*op, scope, *dev_ctx, runtime_context));
+      op_with_kernel->ResetKernelType(new OpKernelType(expected_kernel_key));
 
       // change device by the device_guard()
       apply_device_guard(op, place, &expected_kernel_key);
@@ -383,13 +379,16 @@ void build_op_func_list(const platform::Place& place,
 
       // step 3. apply data transforms and insert data transfer ops
       VariableValueMap& ins_map_temp = runtime_context.inputs;
+      VariableValueMap& outs_map_temp = runtime_context.outputs;
 
       // NOTE(zhiqiu): op_func_node->operator_base_ maybe changed in
       // ApplyDataTransform
-      ApplyDataTransform(expected_kernel_key, place, &ins_map_temp, var_scope,
-                         &op_func_node, vec_func_list, use_local_scope);
-      op_with_kernel = static_cast<const framework::OperatorWithKernel*>(
-          op_func_node.operator_base_.get());
+      ApplyDataTransform(expected_kernel_key, place, &ins_map_temp,
+                         &outs_map_temp, var_scope, &op_func_node,
+                         vec_func_list, use_local_scope);
+      op_with_kernel = const_cast<framework::OperatorWithKernel*>(
+          static_cast<const framework::OperatorWithKernel*>(
+              op_func_node.operator_base_.get()));
 
       // step 4. Run op kernel
       VLOG(3) << op_with_kernel->Type()
@@ -412,6 +411,16 @@ void build_op_func_list(const platform::Place& place,
       auto exec_ctx =
           ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context);
 
+      // see OperatorWithKernel::RunImpl in operator.cc for why
+      if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
+            op->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
+        InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
+        // TODO(Aurelius84): In case of control flow ops, they are NOT
+        // inherited from OperatorWithKernel.
+        op_with_kernel->Info().infer_shape_(&infer_shape_ctx);
+      }
+
       auto run_phi_kernel = false;
       if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(
               op_with_kernel->Type())) {
@@ -476,9 +485,28 @@ void build_op_func_list(const platform::Place& place,
             op_func_node, place, outputs_names, &runtime_context.outputs,
             var_scope, vec_func_list, local_scope);
       }
+      if (!op_func_node.inplace_back_map.empty()) {
+        auto& m = op_func_node.inplace_back_map;
+        // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
+        for (auto& p : m) {
+          auto* transformed_tensor =
+              GetMutableLoDTensorOrSelectedRowsValueFromVar(
+                  var_scope->Var(p.first));
+          auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
+              var_scope->Var(p.second));
+          original_tensor->ShareDataWith(*transformed_tensor);
+          VLOG(4) << "Transfer inplace variable back from "
+                  << var_scope->GetNameById(p.first) << " to "
+                  << var_scope->GetNameById(p.second);
+        }
+      }
     }
+
+    VLOG(4) << "End run " << place << " "
+            << op_func_node.operator_base_->DebugStringEx(local_scope);
+
     vec_func_list->emplace_back(op_func_node);
+
+    // gc---------------------------------------------------------------------------
     auto iter = unused_var_map.find(op);
     if (iter == unused_var_map.end()) {
@@ -514,10 +542,7 @@ void build_op_func_list(const platform::Place& place,
                 framework::ToTypeName(var->Type()), var_name));
       }
     }
-    delete garbages;  // free mem
-
-    VLOG(3) << "run " << op->Type() << " done.";
   }
 }
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc
index 35bac4393170331486298a29f1b6be26065ad864..ccdd9dc9d50ced8d1fb0ec57b24ee878637dd5a4 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.cc
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc
@@ -692,6 +692,10 @@ phi::Kernel* Instruction::PhiKernel() const { return op_func_node_.pt_kernel_; }
 
 OpFuncType Instruction::KernelType() const { return op_func_node_.type_; }
 
+const std::map<int, int>& Instruction::InplaceBackMap() const {
+  return op_func_node_.inplace_back_map;
+}
+
 OperatorBase* Instruction::OpBase() const {
   auto op_base = op_func_node_.operator_base_;
   PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h
index dc34bd2c69411837b6130b87dba1753687cf82f8..5704fa414bbb2b195c66a7d85e0cd587403e04fc 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.h
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -297,6 +297,8 @@ struct OpFuncNode {
   std::map<std::string, std::vector<int>> output_index;
   std::unordered_set<int> no_data_transform_index;
 
+  std::map<int, int> inplace_back_map;
+
   OpKernelComputeFunc kernel_func_;
   platform::DeviceContext* dev_ctx_;  // not owned
 
@@ -325,6 +327,8 @@ class Instruction {
 
   OpFuncType KernelType() const;
 
+  const std::map<int, int>& InplaceBackMap() const;
+
   OperatorBase* OpBase() const;
 
   NextInstruction& NextInstructions();
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 4048995a44c8e745629116471a36b8d50f9a8551..71fc059728956b6178572a0dd8dbae85327c34fd 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -664,6 +664,10 @@ class OperatorWithKernel : public OperatorBase {
 
   const OpKernelType* kernel_type() const { return kernel_type_.get(); }
 
+  void ResetKernelType(OpKernelType* kernel_type) {
+    kernel_type_.reset(kernel_type);
+  }
+
  private:
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
 
   void RunImpl(const Scope& scope, const platform::Place& place,
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 5194c8772e47bca5ec728079b4b2dce883e39c22..36a0d53e052453f97a4d510cad5587614c9796c7 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -94,7 +94,8 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
           "must smaller than or equal to 5. But received: the shape of input X "
           "= [%s], the dimension of input X = [%d]",
           x_dims, x_dims.size()));
-
+  VLOG(4) << ctx->IsRunMKLDNNKernel();
+  VLOG(4) << data_layout;
   const int64_t C = ((ctx->IsRunMKLDNNKernel() == true) ||
                              (data_layout == DataLayout::kNCHW)
                          ? x_dims[1]
@@ -136,6 +137,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
                           C, bias_dim[0]));
   }
   ctx->SetOutputDim("Y", x_dims);
+  VLOG(4) << x_dims;
   ctx->SetOutputDim("MeanOut", {C});
   ctx->SetOutputDim("VarianceOut", {C});
   ctx->SetOutputDim("SavedMean", {C});
diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
index 41dc774c1111955cde4bd7ad8d68b46469edf1cd..900d3e54c797184697daede09d674ffdef7d96bd 100644
--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
@@ -203,14 +203,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto *y = ctx.Output<Tensor>("Y");
     auto *batch_mean = ctx.Output<Tensor>("SavedMean");
     auto *batch_variance = ctx.Output<Tensor>("SavedVariance");
-
     BatchNormMKLDNNHandler<T> handler(ctx, mkldnn_engine, x, global_stats,
                                       test_mode);
 
     auto src_memory = handler.AcquireSrcMemory(x);
     auto scaleshift_memory = handler.AcquireScaleShiftMemory(scale, shift);
     auto dst_memory = handler.AcquireDstMemory(y);
-
     auto batch_norm_p = handler.AcquireForwardPrimitive();
 
     std::shared_ptr<dnnl::memory> mean_memory;
@@ -300,7 +298,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto diff_src_memory = handler.AcquireDiffSrcMemory(diff_x);
     auto diff_scaleshift_memory =
         handler.AcquireDiffScaleShiftMemory(diff_scaleshift_data.data());
-
     // finally create batch_norm backward primitive
     auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive();
diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc
index bf3a985923f87db35702bb443ab1651a4db86001..96946ee15f41a5d70484562ec2cb490e0ed681c0 100644
--- a/paddle/fluid/operators/transfer_layout_op.cc
+++ b/paddle/fluid/operators/transfer_layout_op.cc
@@ -16,6 +16,8 @@
 
 #include <string>
 
+#include "paddle/fluid/framework/op_version_registry.h"
+
 namespace paddle {
 namespace framework {
 class OpDesc;
@@ -95,8 +97,9 @@ class TransferLayoutKernel {
     auto *x = ctx.InputVar("X");
     auto *out = ctx.OutputVar("Out");
     auto &dev_ctx = ctx.device_context();
+    auto src_layout = ctx.Attr<int>("src_layout");
     auto dst_layout = ctx.Attr<int>("dst_layout");
-    TransferLayoutFunctor(x, out, dev_ctx, dst_layout)();
+    TransferLayoutFunctor(x, out, dev_ctx, src_layout, dst_layout)();
   }
 };
 
@@ -105,6 +108,14 @@ class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "(LoDTensor) The input Tensor");
     AddOutput("Out", "(LoDTensor) The Output Tensor with desired layout");
+    // NOTE(zhiqiu): in most cases, the src_layout is not needed, as the op
+    // can use the layout of input X.
+    // However, in some mkldnn kernels, the src layout computed by
+    // GetKernelTypeForVar differs from the layout of tensor X.
+    AddAttr<int>("src_layout",
+                 "kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3, default "
+                 "-1 means unspecified and use the tensor's layout.")
+        .SetDefault(-1);
     AddAttr<int>("dst_layout",
                  "kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3");
     AddComment(R"DOC(
@@ -126,3 +137,8 @@ REGISTER_OPERATOR(
 // dtype is not important
 REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, float,
                                ops::TransferLayoutKernel);
+REGISTER_OP_VERSION(transfer_layout)
+    .AddCheckpoint(
+        R"ROC(refine transfer_layout, add src_layout attribute)ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "src_layout", "(int) the layout of the input tensor", -1));
diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h
index f2d75b4194adf4a80d15d7ff6be18e1c5d14fc49..06bf54e998cc2f080f828c0a723f8e6631eb70e2 100644
--- a/paddle/fluid/operators/transfer_layout_op.h
+++ b/paddle/fluid/operators/transfer_layout_op.h
@@ -39,8 +39,12 @@ class TransferLayoutFunctor {
  public:
   TransferLayoutFunctor(const framework::Variable *in, framework::Variable *out,
                         const platform::DeviceContext &dev_ctx,
-                        const int dst_layout)
-      : in_(in), out_(out), dev_ctx_(dev_ctx), dst_layout_(dst_layout) {}
+                        const int src_layout, const int dst_layout)
+      : in_(in),
+        out_(out),
+        dev_ctx_(dev_ctx),
+        src_layout_(src_layout),
+        dst_layout_(dst_layout) {}
 
   void operator()() const {
     auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_);
@@ -50,7 +54,8 @@ class TransferLayoutFunctor {
     out_tensor.set_layout(out_layout);
 
 #ifdef PADDLE_WITH_MKLDNN
-    auto in_layout = in_tensor.layout();
+    auto in_layout = static_cast<DataLayout>(src_layout_);
+    VLOG(4) << in_layout << "->" << out_layout << " " << in_tensor.layout();
     if (in_layout == DataLayout::kMKLDNN || out_layout == DataLayout::kMKLDNN) {
       PADDLE_ENFORCE_NE(
           in_layout, out_layout,
@@ -68,6 +73,7 @@ class TransferLayoutFunctor {
       // For NHWC data we need reshape of tensors as MKL-DNN
       // is expecting NHWC dims description order
       if (in_layout == DataLayout::kNHWC) {
+        VLOG(4) << "kNHWC";
         platform::MatchShapeToLayout(&out_tensor, in_layout, out_layout);
         paddle::platform::MKLDNNDeviceContext::tls()
             .set_cur_paddle_data_layout(in_layout);
@@ -75,6 +81,7 @@ class TransferLayoutFunctor {
         out_tensor.set_layout(DataLayout::kMKLDNN);
         out_tensor.set_format(out_format);
       } else {
+        VLOG(4) << "kNCHW";
         // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
         // Do transform via MKLDNN lib
         paddle::framework::innerTransDataLayoutFromMKLDNN(
@@ -123,6 +130,7 @@ class TransferLayoutFunctor {
   const framework::Variable *in_;
   framework::Variable *out_;
   const platform::DeviceContext &dev_ctx_;
+  const int src_layout_;
   const int dst_layout_;
 };
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 5605d326f2cfa53e5f3f8aba1b65d1a2cd3e8893..9aa362546ec23e978fff618c3b5d07796e02aaf0 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -531,6 +531,7 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
 }
 
 void CUDADeviceContext::Wait() const {
+  VLOG(4) << "CUDA context(" << this << ") Wait";
   if (thread_ctx_.count(this)) {
     context()->Stream()->Wait();
     return;
diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
index 1e856a0fe900fca423333f2d859af40db49e8f24..cff4f7f41d02b9b9b47fbb359e455cbb5c9ae27d 100644
--- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
+++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
@@ -352,5 +352,23 @@ class TestException(unittest.TestCase):
                 self.fetch_vars.name))
 
 
+class TestInplaceApiWithDataTransform(unittest.TestCase):
+    def test_increment(self):
+        if paddle.fluid.core.is_compiled_with_cuda():
+            with paddle.fluid.device_guard("gpu:0"):
+                x = paddle.fluid.layers.fill_constant([1], "float32", 0)
+            with paddle.fluid.device_guard("cpu"):
+                x = paddle.increment(x)
+            exe = paddle.static.Executor(paddle.CUDAPlace(0))
+            os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'
+
+            for i in range(10):
+                a, = exe.run(paddle.static.default_main_program(),
+                             fetch_list=[x])
+                self.assertEqual(a[0], 1)
+
+            del os.environ['FLAGS_USE_STANDALONE_EXECUTOR']
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
index 0f64f7f5d8d10078b9743231f597ec0bc46d2966..94644cf2fec1d5357bf9b6ab23443cbbfec5cce4 100644
--- a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
@@ -30,6 +30,7 @@ class TestTransferLayoutOpkNCHWTokNHWC(OpTest):
         self.inputs = {'X': ipt.astype('float32')}
         self.outputs = {'Out': ipt.transpose([0, 2, 3, 1])}
         self.attrs = {
+            'src_layout': 0,
             'dst_layout': 1  # kNHWC
        }
        self.op_type = 'transfer_layout'
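
To make the `inplace_back_map` mechanism in this change concrete: when a data-transfer op replaces an inplace op's input with a transferred copy (e.g. the GPU->CPU case exercised by `TestInplaceApiWithDataTransform`), the kernel writes into the copy, and the executor then walks the map (transferred var id -> original var id) and shares the result back, mirroring `TransferInplaceVarsBack()`. The following is a minimal standalone C++ sketch of that idea; the `ToyTensor` type and two-slot scope are illustrative stand-ins, not Paddle's `Tensor`/`VariableScope`:

```cpp
// Minimal standalone sketch (toy types, NOT Paddle's Tensor/VariableScope).
#include <iostream>
#include <map>
#include <memory>
#include <vector>

// A tensor stand-in whose storage can be shared, like Tensor::ShareDataWith.
struct ToyTensor {
  std::shared_ptr<std::vector<float>> data;
  void ShareDataWith(const ToyTensor& other) { data = other.data; }
};

int main() {
  // scope[0] is the original variable x; scope[1] is the transferred copy
  // that an inserted data-transfer op would create.
  std::vector<ToyTensor> scope(2);
  scope[0].data = std::make_shared<std::vector<float>>(1, 0.0f);
  scope[1].data = std::make_shared<std::vector<float>>(*scope[0].data);

  // inplace_back_map: id of transferred variable -> id of original variable.
  std::map<int, int> inplace_back_map{{1, 0}};

  // The inplace kernel (think paddle.increment) runs on the transferred copy.
  (*scope[1].data)[0] += 1.0f;

  // Share the result back so the original variable observes the update,
  // same as the loop added to RunInstruction()/build_op_func_list() above.
  for (const auto& p : inplace_back_map) {
    scope[p.second].ShareDataWith(scope[p.first]);
  }

  std::cout << (*scope[0].data)[0] << std::endl;  // prints 1
  return 0;
}
```

Without the write-back step, the increment would land only in the transferred copy and the fetched `x` would stay 0, which is exactly the behavior the new unit test guards against.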