diff --git a/lite/core/memory.h b/lite/core/memory.h
index bc382d8c36d87735bbd3028ff06b05f3394bf377..5a56f73b0de0fce64905f483ded88eda9ceffd52 100644
--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -90,11 +90,6 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
     case TARGET(kBM):
       TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
       break;
-#endif
-#ifdef LITE_WITH_MLU
-    case TARGET(kMLU):
-      TargetWrapperMlu::MemcpySync(dst, src, size, dir);
-      break;
 #endif
     default:
       LOG(FATAL)
diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc
index f3658fe484822509eb2a9e3d3652bc9e7ed457a0..727189db77427a01a6f4d4477c053112e99e8103 100644
--- a/lite/core/mir/mlu_postprocess_pass.cc
+++ b/lite/core/mir/mlu_postprocess_pass.cc
@@ -413,6 +413,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
     auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
     UpdateOutputTo(
         sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
+    /* graph like this
+     *        subgraph_op_0
+     *          /       \
+     *         /         \
+     * subgraph_op_1   host_op
+     */
+    UpdateInputTo(
+        sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
   }
 
   // recreate the op
diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc
index 91aa04d99505eac5fa9abc50a5008ec7b5de4fbf..454682043b5199a32e56ed7c8bbea1752942c212 100644
--- a/lite/core/mir/subgraph/subgraph_detector.cc
+++ b/lite/core/mir/subgraph/subgraph_detector.cc
@@ -450,6 +450,9 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
   for (auto &var_node : output_var_nodes) {
     output_var_names.push_back(var_node->AsArg().name);
   }
+  for (auto &var_node : local_var_nodes) {
+    output_var_names.push_back(var_node->AsArg().name);
+  }
   subgraph_op_desc.SetAttr<std::vector<std::string>>("input_data_names",
                                                      input_var_names);
   subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names",
@@ -491,9 +494,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
   for (auto &var_node : weight_var_nodes) {
     input_var_names.push_back(var_node->AsArg().name);
   }
-  for (auto &var_node : local_var_nodes) {
-    output_var_names.push_back(var_node->AsArg().name);
-  }
   for (auto &var_node : unused_var_nodes) {
     output_var_names.push_back(var_node->AsArg().name);
   }
diff --git a/lite/kernels/mlu/bridges/act_op.cc b/lite/kernels/mlu/bridges/act_op.cc
index 5d6e2f9ca0932c03ade897b416420e41019d775a..286195d9d5f961288dd0156db31ff8aacae58227 100644
--- a/lite/kernels/mlu/bridges/act_op.cc
+++ b/lite/kernels/mlu/bridges/act_op.cc
@@ -37,7 +37,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
   auto output_dims = output->dims().Vectorize();
   auto output_tensor = graph->AddNode(
-      out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, fp_type);
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type);
   CHECK(graph->HasNode(x_var_name));
   auto input_tensor = graph->GetNode(x_var_name);
   cnmlBaseOp_t activation_op;
diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc
index d95a5115c96c10a8881f50c44fee9881c6a9e218..7353a685dd5fd3a5bcc8c88def8ffb8b96fdde55 100644
--- a/lite/kernels/mlu/bridges/batch_norm_op.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op.cc
@@ -42,7 +42,7 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>();
   auto output_dims = output->dims().Vectorize();
   auto output_tensor = graph->AddNode(
-      y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      y_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
   CHECK(graph->HasNode(x_var_name));
diff --git a/lite/kernels/mlu/bridges/batch_norm_op_test.cc b/lite/kernels/mlu/bridges/batch_norm_op_test.cc
index 706b0421b32e25a8ceaea465c57e60b52202f104..65b24a0a72a48a306b6a8976efd8839679d58038 100644
--- a/lite/kernels/mlu/bridges/batch_norm_op_test.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op_test.cc
@@ -137,9 +137,7 @@ void test_batch_norm(
             {bs, ic, ih, iw},
             {0, 2, 3, 1});
 
-  out->Resize({bs, ih, iw, ic});
   x->CopyDataFrom(input_trans);
-  x->Resize({bs, ih, iw, ic});
 
   LaunchOp(op, {x_var_name}, {out_var_name});
 
diff --git a/lite/kernels/mlu/bridges/concat_op.cc b/lite/kernels/mlu/bridges/concat_op.cc
index e2986b964853ab90e5d7317638ee8c2c2969a4d0..14f0da746a00c1ea10ffae824217dbb2df84df55 100644
--- a/lite/kernels/mlu/bridges/concat_op.cc
+++ b/lite/kernels/mlu/bridges/concat_op.cc
@@ -32,60 +32,33 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto x_var_name = op_info->Input("X");
   auto out_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
+  auto output_dims = output->dims().Vectorize();
   auto param_axis = op_info->GetAttr<int>("axis");
 
-  // auto x = scope->FindVar(x_var_name[0])->GetMutable<Tensor>();
-
-  auto input_num = x_var_name.size();
   std::vector<cnmlTensor_t> input_tensor;
-  std::vector<std::vector<int64_t>> input_dims;
   for (auto x_name : x_var_name) {
     CHECK(graph->HasNode(x_name));
     input_tensor.push_back(graph->GetNode(x_name)->mlu_tensor());
-    auto x = scope->FindVar(x_name)->GetMutable<Tensor>();
-    input_dims.push_back(x->dims().Vectorize());
   }
 
-  auto dims = input_dims[0].size();
+  auto dims = output_dims.size();
   int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
-  int nhwc_axis = -1;
-  if (dims == 4) {
-    int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
-    nhwc_axis = nchw_to_nhwc_axis_map[axis];
-  } else if (dims == 3) {
-    int nchw_to_nhwc_axis_map[3] = {0, 2, 1};
-    nhwc_axis = nchw_to_nhwc_axis_map[axis];
-  } else {
-    CHECK(0) << "Unsupport dims in mlu concat";
-  }
-
-  std::vector<int64_t> output_dims;
-  output_dims.assign(dims, 0);
-
-  /* std::cout << string_format("concat axis: %d(NCHW), %d(NHWC)", axis,
-   * nhwc_axis) << std::endl; */
-
-  for (int i = 0; i < output_dims.size(); ++i) {
-    if (i == nhwc_axis) {
-      for (auto& dim : input_dims) output_dims[i] += dim[i];
-    } else {
-      output_dims[i] = input_dims[0][i];
-    }
-  }
-
-  /* std::cout << string_format("concat output dim: %ld, %ld, %ld, %ld") <<
-   * std::endl; */
+  CHECK_LE(axis, 4) << "Unsupported dims in mlu concat";
+  int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
+  int nhwc_axis = nchw_to_nhwc_axis_map[axis];
 
-  auto* output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
-  output->Resize(output_dims);
   auto output_tensor = graph->AddNode(
-      out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
 
   cnmlBaseOp_t concat_op;
-  cnmlTensor_t outputs[1];
-  outputs[0] = output_tensor->mlu_tensor();
-  CNML_CALL(cnmlCreateNdConcatOp(
-      &concat_op, nhwc_axis, input_tensor.data(), input_num, outputs, 1));
+  cnmlTensor_t outputs = output_tensor->mlu_tensor();
+  CNML_CALL(cnmlCreateNdConcatOp(&concat_op,
+                                 nhwc_axis,
+                                 input_tensor.data(),
+                                 x_var_name.size(),
+                                 &outputs,
+                                 1));
   graph->FuseOp(concat_op);
   return SUCCESS;
 }
diff --git a/lite/kernels/mlu/bridges/concat_op_test.cc b/lite/kernels/mlu/bridges/concat_op_test.cc
index b75ebc0951af91799f40ef8782a5bca43d0a87e1..c4b48a9ef45430ec5867d231bbc2d0a798ec66d0 100644
--- a/lite/kernels/mlu/bridges/concat_op_test.cc
+++ b/lite/kernels/mlu/bridges/concat_op_test.cc
@@ -113,21 +113,8 @@ void test_concat(std::vector<std::vector<int64_t>> input, int axis) {
              static_cast<int>(input[1][2]),
              static_cast<int>(input[1][3])},
             {0, 2, 3, 1});
-  auto os = out->dims();
-  out->Resize({static_cast<int64_t>(os[0]),
-               static_cast<int64_t>(os[2]),
-               static_cast<int64_t>(os[3]),
-               static_cast<int64_t>(os[1])});
   x->CopyDataFrom(input_x);
   y->CopyDataFrom(input_y);
-  x->Resize({static_cast<int64_t>(input[0][0]),
-             static_cast<int64_t>(input[0][2]),
-             static_cast<int64_t>(input[0][3]),
-             static_cast<int64_t>(input[0][1])});
-  y->Resize({static_cast<int64_t>(input[1][0]),
-             static_cast<int64_t>(input[1][2]),
-             static_cast<int64_t>(input[1][3]),
-             static_cast<int64_t>(input[1][1])});
 
   LaunchOp(op, {x_var_name, y_var_name}, {out_var_name});
 
@@ -136,6 +123,7 @@ void test_concat(std::vector<std::vector<int64_t>> input, int axis) {
 
   Tensor output_trans;
   output_trans.Resize(out->dims());
+  auto os = out->dims();
   transpose(out_data,
             output_trans.mutable_data<float>(),
             {static_cast<int>(os[0]),
diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc
index e4f672e06e38c0212d1887de5cebed6a35bd0e0d..6a7ef408eb7432950d5a0985dd6e174236e937e0 100644
--- a/lite/kernels/mlu/bridges/conv_op.cc
+++ b/lite/kernels/mlu/bridges/conv_op.cc
@@ -33,13 +33,14 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   // get input, filter and op attributes
   const auto input_var_name = op_info->Input("Input").front();
-  const auto& input_dims_nhwc =
+  const auto& input_dims =
       scope->FindVar(input_var_name)->GetMutable<Tensor>()->dims();
-  const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
   const auto filter_var_name = op_info->Input("Filter").front();
   auto* filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
   const auto& filter_dims = filter->dims();
   const auto output_var_name = op_info->Output("Output").front();
+  auto* output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
+  const auto output_shape = output->dims().Vectorize();
   const auto bs = input_dims[0];
   const auto oc = filter_dims[0];
   CHECK_EQ(input_dims.size(), 4);
@@ -70,24 +71,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                 input_dims,
                                 filter_dims);
 
-  std::vector<int64_t> output_shape({bs, oc});
-  for (size_t i = 0; i < 2; i++) {
-    const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1;
-    output_shape.push_back(
-        (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) /
-            strides[i] +
-        1);
-  }
-
-  const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
-  const auto output_tensor = graph->AddNode(output_var_name,
-                                            output_shape_nhwc,
-                                            CNML_TENSOR,
-                                            CNML_NHWC,
-                                            graph->FPType());
-  scope->FindVar(output_var_name)
-      ->GetMutable<::paddle::lite::Tensor>()
-      ->Resize(output_shape_nhwc);
+  const auto output_tensor = graph->AddNode(
+      output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
 
   // Create filter node
   const auto filter_tensor = graph->AddNode(filter_var_name,
@@ -156,7 +141,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   const auto input_scale = op_info->GetAttr<float>("input_scale");
 
   bool use_first_conv = false;
-  if (lite::DeviceInfo::Global().UseFirstConv() && input_dims_nhwc[3] == 3) {
+  if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) {
     use_first_conv = true;
   }
diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc
index 6155a75018382482118ddbe8878bcdb69214bcd6..e34dd7c2a85dbda62596b6e82d820fc437bfd194 100644
--- a/lite/kernels/mlu/bridges/conv_op_test.cc
+++ b/lite/kernels/mlu/bridges/conv_op_test.cc
@@ -244,10 +244,6 @@ void test_conv(int bs,
     }
   }
-  input->Resize({bs, ih, iw, ic});
-  output->Resize(
-      {output_shape[0], output_shape[2], output_shape[3], output_shape[1]});
-
   // create and convert op to MLU model, then run it on MLU
   auto op = CreateOp<operators::ConvOpLite>(opdesc_mlu, &scope);
   LaunchOp(op, {input_var_name}, {output_var_name});
diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc
index 4ef949925d20e0a2cb1c7f25d840e2041d79dd7a..41526a0100ba71be9eda25983cb96aa888d6cf4d 100644
--- a/lite/kernels/mlu/bridges/elementwise_ops.cc
+++ b/lite/kernels/mlu/bridges/elementwise_ops.cc
@@ -77,7 +77,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto output_tensor = graph->AddNode(out_var_name,
                                       x->dims().Vectorize(),
                                       CNML_TENSOR,
-                                      CNML_NHWC,
+                                      CNML_NCHW,
                                       graph->FPType());
 
   cnmlBaseOp_t elementwise_op;
@@ -90,7 +90,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     auto mid_tensor = graph->AddNode(out_var_name + "_mid",
                                      x->dims().Vectorize(),
                                      CNML_TENSOR,
-                                     CNML_NHWC,
+                                     CNML_NCHW,
                                      graph->FPType());
     CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op,
                                        x_tensor->mlu_tensor(),
diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc
index f480a9110790406ddb2aa7464221c7062b26268e..286feec8d4d44eaa025f333d559c32ca72f042ff 100644
--- a/lite/kernels/mlu/bridges/fc_op.cc
+++ b/lite/kernels/mlu/bridges/fc_op.cc
@@ -37,6 +37,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   // int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
   auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
   auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
+  auto output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
   auto x_dims = x->dims();
   auto w_dims = w->dims();
 
@@ -50,15 +51,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 
   auto input_scale = op_info->GetAttr<float>("input_scale");
 
-  std::vector<int64_t> output_shape_nhwc({x_dims[0], 1, 1, w_dims[1]});
   auto output_tensor = graph->AddNode(output_var_name,
-                                      output_shape_nhwc,
+                                      output->dims().Vectorize(),
                                       CNML_TENSOR,
-                                      CNML_NHWC,
+                                      CNML_NCHW,
                                       graph->FPType());
-  scope->FindVar(output_var_name)
-      ->GetMutable<::paddle::lite::Tensor>()
-      ->Resize(output_shape_nhwc);
 
   std::string bias_var_name;
   std::shared_ptr<MLUTensor> bias_tensor;
diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc
index 79bee35501051e7b46f9c3dd3fa8cea6222abec4..8f92b6abad97650100d0862d49550abaf62daac9 100644
--- a/lite/kernels/mlu/bridges/fc_op_test.cc
+++ b/lite/kernels/mlu/bridges/fc_op_test.cc
@@ -139,15 +139,34 @@ void test_fc(const std::vector<int64_t>& input_shape,
   }
 
   auto fc_op_mlu = CreateOp<operators::FcOpLite>(fc_op_desc_mlu, &scope);
-  input->Resize({static_cast<int64_t>(input_shape[0]),
-                 static_cast<int64_t>(input_shape[2]),
-                 static_cast<int64_t>(input_shape[3]),
-                 static_cast<int64_t>(input_shape[1])});
-  out->Resize({static_cast<int64_t>(input_shape[0]), static_cast<int64_t>(w_shape[1])});
+
+  Tensor input_tmp, out_tmp;
+  input_tmp.Resize(input_shape);
+  transpose(input->mutable_data<float>(),
+            input_tmp.mutable_data<float>(),
+            {static_cast<int>(input_shape[0]),
+             static_cast<int>(input_shape[1]),
+             static_cast<int>(input_shape[2]),
+             static_cast<int>(input_shape[3])},
+            {0, 2, 3, 1});
+  input->CopyDataFrom(input_tmp);
+
   LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name});
 
-  // compare results
+  auto os = out->dims();
+  out_tmp.Resize(os);
   auto* out_data = out->mutable_data<float>();
+  // transpose(out_data,
+  //           out_tmp.mutable_data<float>(),
+  //           {static_cast<int>(os[0]),
+  //            static_cast<int>(os[2]),
+  //            static_cast<int>(os[3]),
+  //            static_cast<int>(os[1])},
+  //           {0, 3, 1, 2});
+  //
+  // out_data = out_tmp.mutable_data<float>();
+
+  // compare results
   auto* out_ref_data = out_ref->mutable_data<float>();
   for (int i = 0; i < out->dims().production(); i++) {
     EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc
index 27c6ab2597fa6930b14c4c4e34750030608167b6..65c2f8214c13ee8d004dbe4b2e706523d007469c 100644
--- a/lite/kernels/mlu/bridges/graph.cc
+++ b/lite/kernels/mlu/bridges/graph.cc
@@ -25,12 +25,12 @@ namespace mlu {
 std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name,
                                           std::vector<int64_t> shape,
                                           cnmlTensorType_t tensor_type,
-                                          cnmlDataOrder_t data_order,
+                                          cnmlDataOrder_t shape_order,
                                           cnmlDataType_t mlu_dtype,
                                           void* raw_ptr) {
   CHECK(!HasNode(name));
   auto node = std::shared_ptr<MLUTensor>(
-      new MLUTensor(shape, tensor_type, data_order, mlu_dtype));
+      new MLUTensor(shape, tensor_type, shape_order, mlu_dtype));
   node->set_mlu_ptr(raw_ptr);
   nodes_.insert(std::make_pair(name, node));
   return node;
diff --git a/lite/kernels/mlu/bridges/interpolate_op.cc b/lite/kernels/mlu/bridges/interpolate_op.cc
index 77a6722e03a63c4ee06a0159916509aa0ca36139..e201199824d8042abd6002ccbe5bb659a9ca2898 100644
--- a/lite/kernels/mlu/bridges/interpolate_op.cc
+++ b/lite/kernels/mlu/bridges/interpolate_op.cc
@@ -45,8 +45,8 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK(graph->HasNode(x_var_name));
   auto input_tensor = graph->GetNode(x_var_name);
 
-  auto in_h = x_dims[1];
-  auto in_w = x_dims[2];
+  auto in_h = x_dims[2];
+  auto in_w = x_dims[3];
 
   // Priority: SizeTensor > OutSize > Scale > scale > out_h/out_w
   if (HasInputArg(op_info, scope, "SizeTensor")) {
@@ -69,25 +69,13 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     }
   }
 
-  out->Resize({x_dims[0], out_h, out_w, x_dims[3]});
-
   auto output_tensor = graph->AddNode(out_var_name,
                                       out->dims().Vectorize(),
                                       CNML_TENSOR,
-                                      CNML_NHWC,
+                                      CNML_NCHW,
                                       graph->FPType());
 
   cnmlBaseOp_t interp_op;
-  /* if (interp_method == "bilinear") { */
-  /*   cnmlInterpOpParam_t interp_param; */
-  /*   CNML_CALL(cnmlCreateInterpOpParam(&interp_param, out_w, out_h,
-   * align_corners)); */
-  /*   CNML_CALL(cnmlCreateInterpOp(&interp_op, */
-  /*                                input_tensor->mlu_tensor(), */
-  /*                                output_tensor->mlu_tensor(), */
-  /*                                interp_param)); */
-  /*   CNML_CALL(cnmlDestroyInterpOpParam(&interp_param)); */
-  /* } else if (interp_method == "nearest") { */
   cnmlNearestNeighborOpParam_t nn_param;
   CNML_CALL(cnmlCreateNearestNeighborOpParam(&nn_param, out_w, out_h));
   CNML_CALL(cnmlSetNearestNeighborAlignCorner(&nn_param, align_corners));
@@ -96,11 +84,6 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                        output_tensor->mlu_tensor(),
                                        nn_param));
   CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param));
-  /* } else { */
-  /*   LOG(WARNING) << "[MLU] Unsupported interpolate method: " <<
-   * interp_method; */
-  /*   return FAILED; */
-  /* } */
 
   graph->FuseOp(interp_op);
   return SUCCESS;
diff --git a/lite/kernels/mlu/bridges/interpolate_op_test.cc b/lite/kernels/mlu/bridges/interpolate_op_test.cc
index 29abff819afed2471f301ed11582b3dabf708e21..0e99da64358e6590af0b8e57dc3ddec142c8d0f0 100644
--- a/lite/kernels/mlu/bridges/interpolate_op_test.cc
+++ b/lite/kernels/mlu/bridges/interpolate_op_test.cc
@@ -237,7 +237,6 @@ class InterpComputeTester {
     /* printf("----output tensor dims: %ld, %d, %d, %ld\n", dims_[0], out_h,
      * out_w, dims_[1]); */
     std::vector<int64_t> out_shape_nchw = {dims_[0], dims_[1], out_h, out_w};
-    out->Resize(DimNCHW2NHWC(out_shape_nchw));
     outref->Resize(out_shape_nchw);
     outsize->Resize({2});
 
@@ -283,7 +282,6 @@ class InterpComputeTester {
               {in, ic, ih, iw},
               {0, 2, 3, 1});
     x->CopyDataFrom(input_trans);
-    x->Resize(DimNCHW2NHWC(dims_.Vectorize()));
     if (use_outsize_) {
       LaunchOp(op, {x_var_name, outsize_var_name}, {out_var_name});
     } else {
diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc
index 3119b6c77dca10641c7c7c32072969fedb1ecef6..f77c8084c76fc52c39938e723f02bde9b3cac41b 100644
--- a/lite/kernels/mlu/bridges/pool_op.cc
+++ b/lite/kernels/mlu/bridges/pool_op.cc
@@ -47,9 +47,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   // Get input, and attributes
   auto x_var_name = op_info->Input("X").front();
   auto x = scope->FindTensor(x_var_name);
-  auto input_dims_nhwc = x->dims();
-  const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
   auto output_var_name = op_info->Output("Out").front();
+  auto output_shape = scope->FindTensor(output_var_name)->dims().Vectorize();
   auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
   auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
   auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
@@ -81,23 +80,17 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                 strides,
                                 ksize);
 
-  std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
-  for (size_t i = 0; i < 2; i++) {
-    output_shape.push_back(
-        (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[0]) /
-            strides[i] +
-        1);
-  }
+  // std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
+  // for (size_t i = 0; i < 2; i++) {
+  //   output_shape.push_back(
+  //       (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] -
+  //        ksize[0]) /
+  //           strides[i] +
+  //       1);
+  // }
 
-  auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
-  auto output_tensor = graph->AddNode(output_var_name,
-                                      output_shape_nhwc,
-                                      CNML_TENSOR,
-                                      CNML_NHWC,
-                                      graph->FPType());
-  scope->FindVar(output_var_name)
-      ->GetMutable<::paddle::lite::Tensor>()
-      ->Resize(output_shape_nhwc);
+  auto output_tensor = graph->AddNode(
+      output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
 
   cnmlPoolOpParam_t pool_param;
   CNML_CALL(
diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc
index 90e43987e481fdfcc22da847937aa18a5149568d..8cee8dbe86109b14cff49f329d71074a9b3bfb61 100644
--- a/lite/kernels/mlu/bridges/pool_op_test.cc
+++ b/lite/kernels/mlu/bridges/pool_op_test.cc
@@ -180,12 +180,7 @@ void test_pool(int bs,
             {0, 2, 3, 1});
 
   auto os = out->dims();
-  out->Resize({static_cast<int64_t>(os[0]),
-               static_cast<int64_t>(os[2]),
-               static_cast<int64_t>(os[3]),
-               static_cast<int64_t>(os[1])});
   x->CopyDataFrom(input_trans);
-  x->Resize({bs, ih, iw, ic});
 
   LaunchOp(op, {x_var_name}, {out_var_name});
 
diff --git a/lite/kernels/mlu/bridges/scale_op.cc b/lite/kernels/mlu/bridges/scale_op.cc
index d500786006286884af0843967410fbc907923e56..5557602bd7576ccd71c51f52a538a45fe27f7ada 100644
--- a/lite/kernels/mlu/bridges/scale_op.cc
+++ b/lite/kernels/mlu/bridges/scale_op.cc
@@ -36,7 +36,7 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
   auto output_dims = output->dims().Vectorize();
   auto output_tensor = graph->AddNode(
-      out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
   auto bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
   auto scale = op_info->GetAttr<float>("scale");
   auto bias = op_info->GetAttr<float>("bias");
diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc
index b9e2b1116dc95ec276f8d85a5669cec45d98ea39..17c911675718a15c7ede4888b268ffcd62b4d8ed 100644
--- a/lite/kernels/mlu/bridges/softmax_op.cc
+++ b/lite/kernels/mlu/bridges/softmax_op.cc
@@ -45,11 +45,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
       axis = output_dims.size() + axis;
     }
   }
-
   int nhwc_axis = nchw_to_nhwc_aixs_map[axis];
   auto output_tensor = graph->AddNode(
-      out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
 
   cnmlBaseOp_t softmax_op;
   CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op,
                                   nhwc_axis,
diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc
index 87f8f589bc6e610071235eac25554353122fa085..a5251ed43c9187fc2874f9b01853b45b8abf7f1c 100644
--- a/lite/kernels/mlu/bridges/softmax_op_test.cc
+++ b/lite/kernels/mlu/bridges/softmax_op_test.cc
@@ -110,9 +110,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
             {bs, ic, ih, iw},
             {0, 2, 3, 1});
 
-  out->Resize({bs, ih, iw, ic});
   x->CopyDataFrom(input_trans);
-  x->Resize({bs, ih, iw, ic});
 
   LaunchOp(op, {x_var_name}, {out_var_name});
 
diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc
index c6c87e42b40abcac49c41a35c95e893c6f70fb8c..377a00689ef3a27f78ae008072578ab3701cd337 100644
--- a/lite/kernels/mlu/bridges/test_helper.cc
+++ b/lite/kernels/mlu/bridges/test_helper.cc
@@ -58,7 +58,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
     graph.AddNode(input_name,
                   input_tensor->dims().Vectorize(),
                   CNML_TENSOR,
-                  CNML_NHWC,
+                  CNML_NCHW,
                   graph.FPType(),
                   reinterpret_cast<void*>(
                       input_tensor->mutable_data<float>(TARGET(kMLU))));
@@ -68,6 +68,8 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
                            sizeof(float) * input_tensor->dims().production(),
                            CNRT_MEM_TRANS_DIR_HOST2DEV));
   }
+  op->CheckShape();
+  op->InferShape();
   bridges.Select(op_type, TARGET(kMLU))(
       reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr);
diff --git a/lite/kernels/mlu/bridges/transpose_op.cc b/lite/kernels/mlu/bridges/transpose_op.cc
index 74af692b6f5b8834e29b8e008a4e48801a1e8820..5e5c5b79ebff4e4ae06e99e4a18f22ebabd4ceb5 100644
--- a/lite/kernels/mlu/bridges/transpose_op.cc
+++ b/lite/kernels/mlu/bridges/transpose_op.cc
@@ -21,8 +21,8 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
-std::vector<int> axis_to_nhwc4d(const std::vector<int>& axis) {
-  CHECK_EQ(axis.size(), 4);
+std::vector<int> axis_to_nhwc(const std::vector<int>& axis) {
+  CHECK_EQ(axis.size(), 4) << "Unsupported dim in mlu transpose";
   std::vector<int> new_axis(4, 0);
   const std::vector<int> axis_map1 = {0, 2, 3, 1};
   const std::vector<int> axis_map2 = {0, 3, 1, 2};
@@ -32,26 +32,6 @@ std::vector<int> axis_to_nhwc4d(const std::vector<int>& axis) {
   return new_axis;
 }
 
-std::vector<int> axis_to_nhw3d(const std::vector<int>& axis) {
-  CHECK_EQ(axis.size(), 3);
-  std::vector<int> new_axis(3, 0);
-  const std::vector<int> axis_map = {0, 2, 1};
-  for (size_t i = 0; i < new_axis.size(); ++i) {
-    new_axis[i] = axis_map[axis[axis_map[i]]];
-  }
-  new_axis.push_back(3);
-  return new_axis;
-}
-
-std::vector<int64_t> infer_shape(const std::vector<int64_t>& x_dims,
-                                 const std::vector<int>& axis_nhwc) {
-  std::vector<int64_t> out_dims(x_dims);
-  for (size_t i = 0; i < out_dims.size(); ++i) {
-    out_dims[i] = x_dims[axis_nhwc[i]];
-  }
-  return out_dims;
-}
-
 int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK(ctx != nullptr);
   CHECK(op != nullptr);
@@ -71,21 +51,13 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto output_dims = output->dims().Vectorize();
 
   auto axis = op_info->GetAttr<std::vector<int>>("axis");
-
-  std::vector<int> axis_nhwc;
-  if (axis.size() == 4) {
-    axis_nhwc = axis_to_nhwc4d(axis);
-  } else if (axis.size() == 3) {
-    axis_nhwc = axis_to_nhw3d(axis);
-  } else {
-    CHECK(0) << "Unsupport dim in mlu transpose";
+  while (axis.size() < 4) {
+    axis.push_back(axis.size());
   }
-
-  auto output_dims_nhwc = infer_shape(x_dims, axis_nhwc);
-  output->Resize(output_dims_nhwc);
+  std::vector<int> axis_nhwc = axis_to_nhwc(axis);
 
   auto output_tensor = graph->AddNode(
-      out_var_name, output_dims_nhwc, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
 
   CHECK(graph->HasNode(x_var_name));
   auto input_tensor = graph->GetNode(x_var_name);
@@ -113,7 +85,6 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 REGISTER_SUBGRAPH_BRIDGE(transpose,
                          kMLU,
                          paddle::lite::subgraph::mlu::TransposeConverter);
-
 REGISTER_SUBGRAPH_BRIDGE(transpose2,
                          kMLU,
                          paddle::lite::subgraph::mlu::TransposeConverter);
diff --git a/lite/kernels/mlu/bridges/transpose_op_test.cc b/lite/kernels/mlu/bridges/transpose_op_test.cc
index 0b2c015975740eac7a3e07783292bc17c132ef58..f10801fbc6844769342223f9ab15da88e748e0c0 100644
--- a/lite/kernels/mlu/bridges/transpose_op_test.cc
+++ b/lite/kernels/mlu/bridges/transpose_op_test.cc
@@ -115,6 +115,7 @@ void test_transpose(const std::vector<int64_t>& input_shape,
   }
 }
 
+// TODO(pmshst): fix the transpose test
 TEST(MLUBridges, transpose) {
   std::vector<int64_t> input_shape = {2, 3, 4, 5};
   test_transpose(input_shape, std::vector<int>{0, 1, 3, 2});
diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h
index 2d355e5ddf590b55c22a103d3d2a24ad4357da4c..5e87e3526417573f2e0f01280b1d86ccb5691093 100644
--- a/lite/kernels/mlu/layout_compute.h
+++ b/lite/kernels/mlu/layout_compute.h
@@ -67,6 +67,8 @@ class LayoutNchwToNhwcCompute
     auto x_dims = param.x->dims().size();
     auto& context = this->ctx_->template As<X86Context>();
 
+    const auto origin_dims = out->dims().Vectorize();
+
     std::vector<int> axis;
     switch (x_dims) {
       case 2:
@@ -88,6 +90,10 @@ class LayoutNchwToNhwcCompute
 
     LayoutTransCompute<lite::TargetType::kX86, float>(
         x_dims, context, *x, out, axis);
+
+    if (x_dims > 2) {
+      out->Resize(origin_dims);
+    }
   }
 
   std::string doc() const override {
@@ -109,20 +115,22 @@ class LayoutNhwcToNchwCompute
     auto x_dims = param.x->dims().size();
     auto& context = this->ctx_->template As<X86Context>();
 
+    const auto origin_dims = out->dims().Vectorize();
+
     std::vector<int> axis;
     switch (x_dims) {
       case 2:
         axis = {0, 1};
         break;
       case 3:
-        axis = {0, 2, 1};
         out->Resize(std::vector<int64_t>{
             out->dims()[0], out->dims()[2], out->dims()[1]});
+        axis = {0, 2, 1};
         break;
       case 4:
-        axis = {0, 3, 1, 2};
         out->Resize(std::vector<int64_t>{
            out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]});
+        axis = {0, 3, 1, 2};
         break;
       default:
         CHECK(0) << "Unsupport dim in mlu layout nhwc to nchw";
@@ -130,6 +138,10 @@ class LayoutNhwcToNchwCompute
 
     LayoutTransCompute<lite::TargetType::kX86, float>(
         x_dims, context, *x, out, axis);
+
+    if (x_dims > 2) {
+      out->Resize(origin_dims);
+    }
   }
 
   std::string doc() const override {
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index 0e79e54eb2888fa9c2d6867d16de81c2f334af29..51a9c0ffe05232bd807017e79c490d947e26c0f7 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -83,7 +83,7 @@ class SubgraphEngine : public subgraph::Engine {
         graph_.AddNode(input_name,
                        input_tensor->dims().Vectorize(),
                        CNML_TENSOR,
-                       CNML_NHWC,
+                       CNML_NCHW,
                        graph_.FPType(),
                        const_cast<void*>(input_tensor->raw_data()));
     CHECK(input_node);
@@ -99,9 +99,7 @@
       CHECK(op);
       std::string op_type = op->op_info()->Type();
       op->CheckShape();
-      if (op_type != "concat") {
-        op->InferShape();
-      }
+      op->InferShape();
       if (!bridges.Exists(op_type, TARGET(kMLU))) {
         LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
         return subgraph::FAILED;