Unverified commit afa0e82c authored by Leo Chen, committed by GitHub

[new-exec] fit for mkldnn and inplace op (#40955)

* fit for mkldnn and inplace op

* fix compile

* refine ut

* register op version

* fix inplace op

* fix transfer_layout
Parent: de8962bd
......@@ -149,7 +149,8 @@ std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
// 2. Construct VariableNameMap
VariableNameMap in_name_map = {{"X", {var_name}}};
VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
AttributeMap attr_map = {{"dst_layout", static_cast<int>(out_layout)}};
AttributeMap attr_map = {{"src_layout", static_cast<int>(in_layout)},
{"dst_layout", static_cast<int>(out_layout)}};
// 3. Create transfer_layout_op
std::string op_type("transfer_layout");
......@@ -157,8 +158,9 @@ std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
auto op = std::shared_ptr<OperatorBase>(
op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
VLOG(3) << string::Sprintf("Insert %s(%s) with %s -> %s(%s).", op_type,
var_name, in_layout, *new_var_name, out_layout);
VLOG(3) << string::Sprintf("Insert %s for variable %s(%s) -> %s(%s).",
op_type, var_name, in_layout, *new_var_name,
out_layout);
return op;
}
......@@ -242,6 +244,7 @@ std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
void ApplyDataTransform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableValueMap* outs_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* new_op_func_nodes,
bool use_local_scope) {
......@@ -251,6 +254,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
"op_base in apply_data_transform."));
VariableNameMap new_ins(op_base->Inputs());
VariableNameMap new_outs(op_base->Outputs());
// record the indices of variables that need no data transform.
std::unordered_set<int> no_data_transform_index;
......@@ -258,7 +262,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
for (auto& var_name_item : *ins_map_temp) {
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto var = var_name_item.second[i];
auto& var_name = new_ins[var_name_item.first].at(i);
auto var_name = new_ins[var_name_item.first].at(i);
const Tensor* tensor_in;
if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
......@@ -287,6 +291,28 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
var_scope->VarId(new_var_name);
var_name_item.second[i] = var_scope->Var(new_var_name);
new_ins[var_name_item.first][i] = new_var_name;
for (auto& pair : new_outs) {
for (size_t j = 0; j < pair.second.size(); ++j) {
VLOG(4) << pair.second[j] << " " << var_name;
if (pair.second[j] == var_name) {
VLOG(4) << "Found inplace between input(" << var_name_item.first
<< ") and output(" << pair.first
<< "), the variable name is " << var_name;
(*outs_map_temp)[pair.first][j] = var_scope->Var(new_var_name);
new_outs[pair.first][j] = new_var_name;
op_func_node
->inplace_back_map[var_scope->GetIdByName(new_var_name)] =
var_scope->GetIdByName(var_name);
op_func_node->output_index[pair.first][j] =
var_scope->VarId(new_var_name);
// NOTE(zhiqiu): an inplace op whose input has been transferred also
// changes the original output afterwards, so add the original
// output as well
op_func_node->output_index[pair.first].push_back(
var_scope->VarId(var_name));
}
}
}
// NOTE(Aurelius84): avoid deep-copying twice if a data transfer op has
// already been inserted.
if (op_base->Type() == "fetch_v2") {
......@@ -306,7 +332,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
// with instruction. (hot fix, it is not good design here)
op_func_node->operator_base_ =
std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
op_base->Type(), new_ins, op_base->Outputs(), op_base->Attrs()));
op_base->Type(), new_ins, new_outs, op_base->Attrs()));
op_func_node->no_data_transform_index = std::move(no_data_transform_index);
}
......
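The loop added above is the heart of the inplace handling: when a data-transfer op renames an input variable, every output entry that carries the same name (an inplace op) is rewired to the new variable, and the pair of variable ids is recorded in inplace_back_map so the result can be shared back to the original variable after the kernel runs. A minimal sketch of that rewiring with simplified types (plain string maps instead of VariableNameMap/VariableScope; the helper name is illustrative, not part of the diff):

#include <map>
#include <string>
#include <vector>

using NameMap = std::map<std::string, std::vector<std::string>>;

// Rewire outputs that alias a renamed input and record new->old pairs so the
// transformed result can be shared back to the original variable afterwards.
std::map<std::string, std::string> RewireInplaceOutputs(
    const std::string& old_name, const std::string& new_name, NameMap* outs) {
  std::map<std::string, std::string> inplace_back_map;  // new_name -> old_name
  for (auto& pair : *outs) {
    for (auto& out_name : pair.second) {
      if (out_name == old_name) {
        out_name = new_name;                    // the kernel now writes here
        inplace_back_map[new_name] = old_name;  // remember where to copy back
      }
    }
  }
  return inplace_back_map;
}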
......@@ -54,6 +54,7 @@ class DataTranferHelper {
void ApplyDataTransform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableValueMap* outs_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* op_func_nodes,
bool use_local_scope = true);
......
......@@ -457,6 +457,21 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
VLOG(4) << "End run " << place << " " << op->DebugStringEx(global_scope_);
if (!instr_node.InplaceBackMap().empty()) {
auto& m = instr_node.InplaceBackMap();
// NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
for (auto& p : m) {
auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
global_scope_->Var(p.first));
auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
global_scope_->Var(p.second));
original_tensor->ShareDataWith(*transformed_tensor);
VLOG(4) << "Transfer inplace variable back form "
<< global_scope_->GetNameById(p.first) << " to "
<< global_scope_->GetNameById(p.second);
}
}
/*For profiling/benchmark only*/
if (FLAGS_benchmark) {
instr_node.DeviceContext().Wait();
......
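The runtime counterpart mirrors TransferInplaceVarsBack() in operator.cc: after the kernel has run, each (transformed id, original id) pair in InplaceBackMap() is resolved to tensors and the original tensor is made to alias the transformed one via ShareDataWith. A small sketch of what that sharing means, using plain framework tensors (illustrative only, not part of the diff):

// After ShareDataWith, both tensors reference the same allocation, so the
// values the kernel wrote through the transformed variable are visible
// through the original variable without a copy.
framework::LoDTensor transformed, original;
transformed.Resize({4});
transformed.mutable_data<float>(platform::CPUPlace());
original.ShareDataWith(transformed);  // original now aliases transformed's buffer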
......@@ -138,7 +138,9 @@ get_unused_vars(const BlockDesc& block,
size_t op_idx = name_op_idx_pair.second;
result[ops[op_idx].get()].emplace_back(name);
VLOG(4) << ops[op_idx].get()->Type() << " " << name;
}
VLOG(4) << "gc map size:" << result.size();
return result;
}
......@@ -311,8 +313,8 @@ void build_op_func_list(const platform::Place& place,
operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
main_program, block.ID(), ops_unique);
std::vector<std::shared_ptr<OperatorBase>>
ops; // its elements will be moved to vec_func_list
// its elements will be moved to vec_func_list
std::vector<std::shared_ptr<OperatorBase>> ops;
for (auto& op_unique : ops_unique) {
ops.emplace_back(std::move(op_unique));
}
......@@ -348,34 +350,28 @@ void build_op_func_list(const platform::Place& place,
op_func_node.operator_base_ = ops[i];
op_func_node.input_index = ins_name2id;
op_func_node.output_index = outs_name2id;
VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
if (dynamic_cast<const framework::OperatorWithKernel*>(op) == nullptr) {
if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
// op is not an OperatorWithKernel, so directly run OperatorBase::Run()
deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope);
VLOG(4) << "End run " << place << " "
<< op_func_node.operator_base_->DebugStringEx(local_scope);
} else {
auto op_with_kernel =
static_cast<const framework::OperatorWithKernel*>(op);
auto op_with_kernel = const_cast<framework::OperatorWithKernel*>(
static_cast<const framework::OperatorWithKernel*>(op));
// construct RuntimeContext and analyze KernelType
RuntimeContext runtime_context({}, {});
runtime_context.inputs.swap(ins_map);
runtime_context.outputs.swap(outs_map);
// see OperatorWithKernel::RunImpl in operator.cc for why
if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
op->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
// TODO(Aurelius84): In case of control flow ops, they are NOT inherited
// from OperatorWithKernel.
op_with_kernel->Info().infer_shape_(&infer_shape_ctx);
}
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
Scope scope;
auto expected_kernel_key = op_with_kernel->GetExpectedKernelType(
ExecutionContext(*op, scope, *dev_ctx, runtime_context));
op_with_kernel->ResetKernelType(new OpKernelType(expected_kernel_key));
// change device by the device_guard()
apply_device_guard(op, place, &expected_kernel_key);
......@@ -383,13 +379,16 @@ void build_op_func_list(const platform::Place& place,
// step 3. apply data transforms and insert data transfer ops
VariableValueMap& ins_map_temp = runtime_context.inputs;
VariableValueMap& outs_map_temp = runtime_context.outputs;
// NOTE(zhiqiu): op_func_node->operator_base_ may be changed in
// ApplyDataTransform
ApplyDataTransform(expected_kernel_key, place, &ins_map_temp, var_scope,
&op_func_node, vec_func_list, use_local_scope);
op_with_kernel = static_cast<const framework::OperatorWithKernel*>(
op_func_node.operator_base_.get());
ApplyDataTransform(expected_kernel_key, place, &ins_map_temp,
&outs_map_temp, var_scope, &op_func_node,
vec_func_list, use_local_scope);
op_with_kernel = const_cast<framework::OperatorWithKernel*>(
static_cast<const framework::OperatorWithKernel*>(
op_func_node.operator_base_.get()));
// step 4. Run op kernel
VLOG(3) << op_with_kernel->Type()
......@@ -412,6 +411,16 @@ void build_op_func_list(const platform::Place& place,
auto exec_ctx =
ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context);
// see OperatorWithKernel::RunImpl in operator.cc for why
if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
op->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
// TODO(Aurelius84): In case of control flow ops, they are NOT inherited
// from OperatorWithKernel.
op_with_kernel->Info().infer_shape_(&infer_shape_ctx);
}
auto run_phi_kernel = false;
if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(
op_with_kernel->Type())) {
......@@ -476,9 +485,28 @@ void build_op_func_list(const platform::Place& place,
op_func_node, place, outputs_names, &runtime_context.outputs,
var_scope, vec_func_list, local_scope);
}
if (!op_func_node.inplace_back_map.empty()) {
auto& m = op_func_node.inplace_back_map;
// NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
for (auto& p : m) {
auto* transformed_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(
var_scope->Var(p.first));
auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
var_scope->Var(p.second));
original_tensor->ShareDataWith(*transformed_tensor);
VLOG(4) << "Transfer inplace variable back form "
<< var_scope->GetNameById(p.first) << " to "
<< var_scope->GetNameById(p.second);
}
}
}
VLOG(4) << "End run " << place << " "
<< op_func_node.operator_base_->DebugStringEx(local_scope);
vec_func_list->emplace_back(op_func_node);
// gc---------------------------------------------------------------------------
auto iter = unused_var_map.find(op);
if (iter == unused_var_map.end()) {
......@@ -514,10 +542,7 @@ void build_op_func_list(const platform::Place& place,
framework::ToTypeName(var->Type()), var_name));
}
}
delete garbages; // free mem
VLOG(3) << "run " << op->Type() << " done.";
}
}
......
......@@ -692,6 +692,10 @@ phi::Kernel* Instruction::PhiKernel() const { return op_func_node_.pt_kernel_; }
OpFuncType Instruction::KernelType() const { return op_func_node_.type_; }
const std::map<int, int>& Instruction::InplaceBackMap() const {
return op_func_node_.inplace_back_map;
}
OperatorBase* Instruction::OpBase() const {
auto op_base = op_func_node_.operator_base_;
PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
......
......@@ -297,6 +297,8 @@ struct OpFuncNode {
std::map<std::string, std::vector<int>> output_index;
std::unordered_set<int> no_data_transform_index;
std::map<int, int> inplace_back_map;
OpKernelComputeFunc kernel_func_;
platform::DeviceContext* dev_ctx_; // not owned
......@@ -325,6 +327,8 @@ class Instruction {
OpFuncType KernelType() const;
const std::map<int, int>& InplaceBackMap() const;
OperatorBase* OpBase() const;
NextInstruction& NextInstructions();
......
......@@ -664,6 +664,10 @@ class OperatorWithKernel : public OperatorBase {
const OpKernelType* kernel_type() const { return kernel_type_.get(); }
void ResetKernelType(OpKernelType* kernel_type) {
kernel_type_.reset(kernel_type);
}
private:
void RunImpl(const Scope& scope, const platform::Place& place) const final;
void RunImpl(const Scope& scope, const platform::Place& place,
......
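ResetKernelType is the other half of the caching added in build_op_func_list: the expected kernel type computed while building the op func list is stored on the operator, presumably so it does not have to be derived again later. Its use in this patch (copied from the interpretercore_util.cc hunk above, with the surrounding context abbreviated) looks like this:

// Compute the kernel type once per op while building the func list and cache
// it on the operator; the operator takes ownership of the heap-allocated copy.
auto expected_kernel_key = op_with_kernel->GetExpectedKernelType(
    ExecutionContext(*op, scope, *dev_ctx, runtime_context));
op_with_kernel->ResetKernelType(new OpKernelType(expected_kernel_key));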
......@@ -94,7 +94,8 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
"must smaller than or equal to 5. But received: the shape of input X "
"= [%s], the dimension of input X = [%d]",
x_dims, x_dims.size()));
VLOG(4) << ctx->IsRunMKLDNNKernel();
VLOG(4) << data_layout;
const int64_t C =
((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW)
? x_dims[1]
......@@ -136,6 +137,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
C, bias_dim[0]));
}
ctx->SetOutputDim("Y", x_dims);
VLOG(4) << x_dims;
ctx->SetOutputDim("MeanOut", {C});
ctx->SetOutputDim("VarianceOut", {C});
ctx->SetOutputDim("SavedMean", {C});
......
......@@ -203,14 +203,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto *y = ctx.Output<Tensor>("Y");
auto *batch_mean = ctx.Output<Tensor>("SavedMean");
auto *batch_variance = ctx.Output<Tensor>("SavedVariance");
BatchNormMKLDNNHandler<T> handler(ctx, mkldnn_engine, x, global_stats,
test_mode);
auto src_memory = handler.AcquireSrcMemory(x);
auto scaleshift_memory = handler.AcquireScaleShiftMemory(scale, shift);
auto dst_memory = handler.AcquireDstMemory(y);
auto batch_norm_p = handler.AcquireForwardPrimitive();
std::shared_ptr<memory> mean_memory;
......@@ -300,7 +298,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto diff_src_memory = handler.AcquireDiffSrcMemory(diff_x);
auto diff_scaleshift_memory =
handler.AcquireDiffScaleShiftMemory(diff_scaleshift_data.data());
// finally create batch_norm backward primitive
auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive();
......
......@@ -16,6 +16,8 @@
#include <string>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
class OpDesc;
......@@ -95,8 +97,9 @@ class TransferLayoutKernel {
auto *x = ctx.InputVar("X");
auto *out = ctx.OutputVar("Out");
auto &dev_ctx = ctx.device_context();
auto src_layout = ctx.Attr<int>("src_layout");
auto dst_layout = ctx.Attr<int>("dst_layout");
TransferLayoutFunctor(x, out, dev_ctx, dst_layout)();
TransferLayoutFunctor(x, out, dev_ctx, src_layout, dst_layout)();
}
};
......@@ -105,6 +108,14 @@ class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
AddInput("X", "(LoDTensor) The input Tensor");
AddOutput("Out", "(LoDTensor) The Output Tensor with desired layout");
// NOTE(zhiqiu): in most cases src_layout is not needed; the op can use the
// layout of input X. However, for some mkldnn kernels the src layout computed
// by GetKernelTypeForVar differs from the layout of tensor X.
AddAttr<int>("src_layout",
"kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3, default "
"-1 means unspecified and use the tensor's layout.")
.SetDefault(-1);
AddAttr<int>("dst_layout",
"kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3");
AddComment(R"DOC(
......@@ -126,3 +137,8 @@ REGISTER_OPERATOR(
// dtype is not important
REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, float,
ops::TransferLayoutKernel);
REGISTER_OP_VERSION(transfer_layout)
.AddCheckpoint(
R"ROC(refine transfer_layout, add src_layout attribute)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"src_layout", "(int, the layout of the input tensor", -1));
......@@ -39,8 +39,12 @@ class TransferLayoutFunctor {
public:
TransferLayoutFunctor(const framework::Variable *in, framework::Variable *out,
const platform::DeviceContext &dev_ctx,
const int dst_layout)
: in_(in), out_(out), dev_ctx_(dev_ctx), dst_layout_(dst_layout) {}
const int src_layout, const int dst_layout)
: in_(in),
out_(out),
dev_ctx_(dev_ctx),
src_layout_(src_layout),
dst_layout_(dst_layout) {}
void operator()() const {
auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_);
......@@ -50,7 +54,8 @@ class TransferLayoutFunctor {
out_tensor.set_layout(out_layout);
#ifdef PADDLE_WITH_MKLDNN
auto in_layout = in_tensor.layout();
auto in_layout = static_cast<DataLayout>(src_layout_);
VLOG(4) << in_layout << "->" << out_layout << " " << in_tensor.layout();
if (in_layout == DataLayout::kMKLDNN || out_layout == DataLayout::kMKLDNN) {
PADDLE_ENFORCE_NE(
in_layout, out_layout,
......@@ -68,6 +73,7 @@ class TransferLayoutFunctor {
// For NHWC data we need reshape of tensors as MKL-DNN
// is expecting NHWC dims description order
if (in_layout == DataLayout::kNHWC) {
VLOG(4) << "kNHWC";
platform::MatchShapeToLayout(&out_tensor, in_layout, out_layout);
paddle::platform::MKLDNNDeviceContext::tls()
.set_cur_paddle_data_layout(in_layout);
......@@ -75,6 +81,7 @@ class TransferLayoutFunctor {
out_tensor.set_layout(DataLayout::kMKLDNN);
out_tensor.set_format(out_format);
} else {
VLOG(4) << "kNCHW";
// Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
// Do transform via MKLDNN lib
paddle::framework::innerTransDataLayoutFromMKLDNN(
......@@ -123,6 +130,7 @@ class TransferLayoutFunctor {
const framework::Variable *in_;
framework::Variable *out_;
const platform::DeviceContext &dev_ctx_;
const int src_layout_;
const int dst_layout_;
};
......
......@@ -531,6 +531,7 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
}
void CUDADeviceContext::Wait() const {
VLOG(4) << "CUDA context(" << this << ") Wait";
if (thread_ctx_.count(this)) {
context()->Stream()->Wait();
return;
......
......@@ -352,5 +352,23 @@ class TestException(unittest.TestCase):
self.fetch_vars.name))
class TestInplaceApiWithDataTransform(unittest.TestCase):
def test_increment(self):
if paddle.fluid.core.is_compiled_with_cuda():
with paddle.fluid.device_guard("gpu:0"):
x = paddle.fluid.layers.fill_constant([1], "float32", 0)
with paddle.fluid.device_guard("cpu"):
x = paddle.increment(x)
exe = paddle.static.Executor(paddle.CUDAPlace(0))
os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'
for i in range(10):
a, = exe.run(paddle.static.default_main_program(),
fetch_list=[x])
self.assertEqual(a[0], 1)
del os.environ['FLAGS_USE_STANDALONE_EXECUTOR']
if __name__ == "__main__":
unittest.main()
......@@ -30,6 +30,7 @@ class TestTransferLayoutOpkNCHWTokNHWC(OpTest):
self.inputs = {'X': ipt.astype('float32')}
self.outputs = {'Out': ipt.transpose([0, 2, 3, 1])}
self.attrs = {
'src_layout': 0,
'dst_layout': 1 # kNHWC
}
self.op_type = 'transfer_layout'
......