From 95e7f6f3ff9e736212d0ca2cf078ad1ea2106881 Mon Sep 17 00:00:00 2001 From: zhangshijin Date: Fri, 6 Mar 2020 16:22:07 +0800 Subject: [PATCH] [MLU] resnet50 supported on MLU,test=develop (#3087) * [MLU] support resnet50 on MLU * [MLU] support resnet50 on MLU --- cmake/mlu.cmake | 61 +++ lite/backends/mlu/CMakeLists.txt | 7 + lite/backends/mlu/mlu_utils.h | 67 +++ lite/backends/mlu/target_wrapper.cc | 91 ++++ lite/backends/mlu/target_wrapper.h | 54 ++ lite/core/mir/mlu_postprocess_pass.cc | 499 ++++++++++++++++++ lite/core/mir/mlu_postprocess_pass.h | 114 ++++ lite/core/mir/subgraph_cast_display_pass.cc | 111 ++++ lite/kernels/mlu/CMakeLists.txt | 8 + lite/kernels/mlu/bridges/CMakeLists.txt | 41 ++ lite/kernels/mlu/bridges/act_op.cc | 57 ++ lite/kernels/mlu/bridges/act_op_test.cc | 156 ++++++ lite/kernels/mlu/bridges/batch_norm_op.cc | 94 ++++ .../kernels/mlu/bridges/batch_norm_op_test.cc | 186 +++++++ lite/kernels/mlu/bridges/conv_op.cc | 200 +++++++ lite/kernels/mlu/bridges/conv_op_test.cc | 350 ++++++++++++ lite/kernels/mlu/bridges/elementwise_ops.cc | 153 ++++++ .../mlu/bridges/elementwise_ops_test.cc | 198 +++++++ lite/kernels/mlu/bridges/fc_op.cc | 127 +++++ lite/kernels/mlu/bridges/fc_op_test.cc | 173 ++++++ lite/kernels/mlu/bridges/graph.cc | 42 ++ lite/kernels/mlu/bridges/graph.h | 166 ++++++ lite/kernels/mlu/bridges/paddle_use_bridges.h | 24 + lite/kernels/mlu/bridges/pool_op.cc | 134 +++++ lite/kernels/mlu/bridges/pool_op_test.cc | 280 ++++++++++ lite/kernels/mlu/bridges/softmax_op.cc | 69 +++ lite/kernels/mlu/bridges/softmax_op_test.cc | 176 ++++++ lite/kernels/mlu/bridges/tensor.cc | 271 ++++++++++ lite/kernels/mlu/bridges/tensor.h | 66 +++ lite/kernels/mlu/bridges/test_helper.cc | 111 ++++ lite/kernels/mlu/bridges/test_helper.h | 66 +++ lite/kernels/mlu/bridges/utility.cc | 111 ++++ lite/kernels/mlu/bridges/utility.h | 93 ++++ lite/kernels/mlu/calib_compute.cc | 90 ++++ lite/kernels/mlu/calib_compute.h | 51 ++ lite/kernels/mlu/io_copy_compute.cc | 154 ++++++ lite/kernels/mlu/subgraph_compute.cc | 52 ++ lite/kernels/mlu/subgraph_compute.h | 168 ++++++ lite/tools/build_mlu.sh | 122 +++++ 39 files changed, 4993 insertions(+) create mode 100644 cmake/mlu.cmake create mode 100644 lite/backends/mlu/CMakeLists.txt create mode 100644 lite/backends/mlu/mlu_utils.h create mode 100644 lite/backends/mlu/target_wrapper.cc create mode 100644 lite/backends/mlu/target_wrapper.h create mode 100644 lite/core/mir/mlu_postprocess_pass.cc create mode 100644 lite/core/mir/mlu_postprocess_pass.h create mode 100644 lite/core/mir/subgraph_cast_display_pass.cc create mode 100644 lite/kernels/mlu/CMakeLists.txt create mode 100644 lite/kernels/mlu/bridges/CMakeLists.txt create mode 100644 lite/kernels/mlu/bridges/act_op.cc create mode 100644 lite/kernels/mlu/bridges/act_op_test.cc create mode 100644 lite/kernels/mlu/bridges/batch_norm_op.cc create mode 100644 lite/kernels/mlu/bridges/batch_norm_op_test.cc create mode 100644 lite/kernels/mlu/bridges/conv_op.cc create mode 100644 lite/kernels/mlu/bridges/conv_op_test.cc create mode 100644 lite/kernels/mlu/bridges/elementwise_ops.cc create mode 100644 lite/kernels/mlu/bridges/elementwise_ops_test.cc create mode 100644 lite/kernels/mlu/bridges/fc_op.cc create mode 100644 lite/kernels/mlu/bridges/fc_op_test.cc create mode 100644 lite/kernels/mlu/bridges/graph.cc create mode 100644 lite/kernels/mlu/bridges/graph.h create mode 100644 lite/kernels/mlu/bridges/paddle_use_bridges.h create mode 100644 lite/kernels/mlu/bridges/pool_op.cc create mode 100644 
lite/kernels/mlu/bridges/pool_op_test.cc create mode 100644 lite/kernels/mlu/bridges/softmax_op.cc create mode 100644 lite/kernels/mlu/bridges/softmax_op_test.cc create mode 100644 lite/kernels/mlu/bridges/tensor.cc create mode 100644 lite/kernels/mlu/bridges/tensor.h create mode 100644 lite/kernels/mlu/bridges/test_helper.cc create mode 100644 lite/kernels/mlu/bridges/test_helper.h create mode 100644 lite/kernels/mlu/bridges/utility.cc create mode 100644 lite/kernels/mlu/bridges/utility.h create mode 100644 lite/kernels/mlu/calib_compute.cc create mode 100644 lite/kernels/mlu/calib_compute.h create mode 100644 lite/kernels/mlu/io_copy_compute.cc create mode 100644 lite/kernels/mlu/subgraph_compute.cc create mode 100644 lite/kernels/mlu/subgraph_compute.h create mode 100755 lite/tools/build_mlu.sh diff --git a/cmake/mlu.cmake b/cmake/mlu.cmake new file mode 100644 index 0000000000..b73ab16462 --- /dev/null +++ b/cmake/mlu.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_MLU) + return() +endif() + +if(NOT DEFINED NEUWARE_HOME) + set(NEUWARE_HOME $ENV{NEUWARE_HOME}) + if(NOT NEUWARE_HOME) + message(FATAL_ERROR "Must set NEUWARE_HOME or env NEUWARE_HOME when LITE_WITH_MLU=ON") + endif() +endif() + +message(STATUS "LITE_WITH_MLU: ${LITE_WITH_MLU}") +find_path(CNML_INC NAMES cnml.h + PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH) +if(NOT CNML_INC) + message(FATAL_ERROR "Can not find cnml.h in ${NEUWARE_HOME}/include") +endif() + +find_path(CNRT_INC NAMES cnrt.h + PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH) +if(NOT CNRT_INC) + message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include") +endif() + +include_directories("${NEUWARE_HOME}/include") + +find_library(CNML_LIB_FILE NAMES cnml + PATHS ${NEUWARE_HOME}/lib64) + +if(NOT CNML_LIB_FILE) + message(FATAL_ERROR "Can not find CNML Library in ${NEUWARE_HOME}/lib64") +else() + message(STATUS "Found CNML Library: ${CNML_LIB_FILE}") + add_library(cnml_lib SHARED IMPORTED GLOBAL) + set_property(TARGET cnml_lib PROPERTY IMPORTED_LOCATION ${CNML_LIB_FILE}) +endif() + +find_library(CNRT_LIB_FILE NAMES cnrt + PATHS ${NEUWARE_HOME}/lib64) + +if(NOT CNRT_LIB_FILE) + message(FATAL_ERROR "Can not find CNRT Library in ${NEUWARE_HOME}/lib64") +else() + message(STATUS "Found CNRT Library: ${CNRT_LIB_FILE}") + add_library(cnrt_lib SHARED IMPORTED GLOBAL) + set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE}) +endif() diff --git a/lite/backends/mlu/CMakeLists.txt b/lite/backends/mlu/CMakeLists.txt new file mode 100644 index 0000000000..29c90b4220 --- /dev/null +++ b/lite/backends/mlu/CMakeLists.txt @@ -0,0 +1,7 @@ +if(NOT LITE_WITH_MLU) + return() +endif() + +message (STATUS "Lite with mlu backend") + +lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib) diff --git a/lite/backends/mlu/mlu_utils.h b/lite/backends/mlu/mlu_utils.h new file mode 100644 index 0000000000..08dd355e81 --- /dev/null +++ 
b/lite/backends/mlu/mlu_utils.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cnml.h>
+#include <cnrt.h>
+#include <glog/logging.h>
+
+/*
+ * This file contains some MLU specific utils.
+ */
+
+#define CNRT_CALL(msg)                                    \
+  CHECK_EQ(static_cast<cnrtRet_t>(msg), CNRT_RET_SUCCESS) \
+      << (msg)                                            \
+      << " MLU CNRT: " << cnrtGetErrorStr(static_cast<cnrtRet_t>(msg))
+
+#define CNML_CALL(msg)                                          \
+  CHECK_EQ(static_cast<cnmlStatus_t>(msg), CNML_STATUS_SUCCESS) \
+      << (msg) << " MLU CNML: "                                 \
+      << ::paddle::lite::mlu::CnmlErrorInfo(static_cast<int>(msg))
+
+namespace paddle {
+namespace lite {
+namespace mlu {
+
+static const char* CnmlErrorInfo(int error) {
+  switch (error) {
+#define LITE_CNML_ERROR_INFO(xx) \
+  case xx:                       \
+    return #xx;                  \
+    break;
+    LITE_CNML_ERROR_INFO(CNML_STATUS_NODEVICE);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_SUCCESS);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_DOMAINERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDARG);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_LENGTHERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_OUTOFRANGE);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_RANGEERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_OVERFLOWERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_UNDERFLOWERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDPARAM);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_BADALLOC);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_BADTYPEID);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_BADCAST);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_UNSUPPORT);
+#undef LITE_CNML_ERROR_INFO
+    default:
+      return "unknown error";
+      break;
+  }
+}
+
+}  // namespace mlu
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/mlu/target_wrapper.cc b/lite/backends/mlu/target_wrapper.cc
new file mode 100644
index 0000000000..2385f69246
--- /dev/null
+++ b/lite/backends/mlu/target_wrapper.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
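// A minimal usage sketch of the CNRT_CALL/CNML_CALL macros from mlu_utils.h
// (illustrative only; the buffer and size are hypothetical):
//
//   void* dev_ptr = nullptr;
//   CNRT_CALL(cnrtMalloc(&dev_ptr, 256)) << " scratch malloc failed";
//   CNRT_CALL(cnrtFree(dev_ptr)) << " scratch free failed";
//
// On a non-success status the CHECK_EQ fires and the decoded error string
// (cnrtGetErrorStr / CnmlErrorInfo) is appended to the fatal log message.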
+
+#include "lite/backends/mlu/target_wrapper.h"
+
+#include <memory>
+
+#include "lite/backends/mlu/mlu_utils.h"
+
+namespace paddle {
+namespace lite {
+namespace mlu {
+
+void cnrtMemcpyHtoD(void* dst, const void* src, size_t size) {
+  CNRT_CALL(cnrtMemcpy(
+      dst, const_cast<void*>(src), size, CNRT_MEM_TRANS_DIR_HOST2DEV))
+      << " cnrt memcpy htod failed";
+}
+
+void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) {
+  CNRT_CALL(cnrtMemcpy(
+      dst, const_cast<void*>(src), size, CNRT_MEM_TRANS_DIR_DEV2HOST))
+      << " cnrt memcpy dtoh failed";
+}
+
+}  // namespace mlu
+
+size_t TargetWrapperMlu::num_devices() {
+  uint32_t dev_count = 0;
+  CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed";
+  LOG(INFO) << "Current MLU device count: " << dev_count;
+  return dev_count;
+}
+
+void* TargetWrapperMlu::Malloc(size_t size) {
+  void* ptr{};
+  CNRT_CALL(cnrtMalloc(&ptr, size)) << " cnrt malloc failed";
+  // LOG(INFO) << "Malloc mlu ptr: " << ptr << " with size: " << size;
+  return ptr;
+}
+
+void TargetWrapperMlu::Free(void* ptr) {
+  CNRT_CALL(cnrtFree(ptr)) << " cnrt free failed";
+}
+
+void TargetWrapperMlu::MemcpySync(void* dst,
+                                  const void* src,
+                                  size_t size,
+                                  IoDirection dir) {
+  // LOG(INFO) << "dst: " << dst << " src: " << src << " size: " << size
+  //           << " dir: " << (int)dir;
+  switch (dir) {
+    case IoDirection::DtoD: {
+      std::unique_ptr<char[]> cpu_tmp_ptr(new char[size]);
+      mlu::cnrtMemcpyDtoH(cpu_tmp_ptr.get(), src, size);
+      mlu::cnrtMemcpyHtoD(dst, cpu_tmp_ptr.get(), size);
+      break;
+    }
+    case IoDirection::HtoD:
+      mlu::cnrtMemcpyHtoD(dst, src, size);
+      break;
+    case IoDirection::DtoH:
+      mlu::cnrtMemcpyDtoH(dst, src, size);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
+  }
+}
+
+// void TargetWrapperMlu::MemcpyAsync(void* dst,
+//                                    const void* src,
+//                                    size_t size,
+//                                    IoDirection dir,
+//                                    const stream_t& stream) {
+//   LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync.";
+//   MemcpySync(dst, src, size, dir);
+// }
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/mlu/target_wrapper.h b/lite/backends/mlu/target_wrapper.h
new file mode 100644
index 0000000000..2d9e10806f
--- /dev/null
+++ b/lite/backends/mlu/target_wrapper.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/backends/mlu/mlu_utils.h"
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+
+using TargetWrapperMlu = TargetWrapper<TARGET(kMLU)>;
+
+template <>
+class TargetWrapper<TARGET(kMLU)> {
+ public:
+  using queue_t = cnrtQueue_t;
+
+  static size_t num_devices();
+  static size_t maximum_queue() { return 0; }  // TODO(zhangshijin): figure this out.
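  // A rough host<->device round trip through this wrapper (sketch; the
  // buffers host_src/host_dst and the element count n are illustrative):
  //
  //   void* dev = TargetWrapperMlu::Malloc(n * sizeof(float));
  //   TargetWrapperMlu::MemcpySync(dev, host_src, n * sizeof(float),
  //                                IoDirection::HtoD);
  //   TargetWrapperMlu::MemcpySync(host_dst, dev, n * sizeof(float),
  //                                IoDirection::DtoH);
  //   TargetWrapperMlu::Free(dev);
  //
  // Note that DtoD copies in MemcpySync bounce through a temporary host
  // buffer; there is no direct device-to-device path yet.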
+ + static size_t GetCurDevice() { return 0; } + + static void CreateQueue(queue_t* queue) {} + static void DestroyQueue(const queue_t& queue) {} + + static void QueueSync(const queue_t& queue) {} + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); + // static void MemcpyAsync(void* dst, + // const void* src, + // size_t size, + // IoDirection dir, + // const queue_t& queue); +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc new file mode 100644 index 0000000000..d6240888d0 --- /dev/null +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -0,0 +1,499 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/mlu_postprocess_pass.h" +#include +#include +#include +#include +#include +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type) { + // create the arg node + auto* cast_arg = graph->NewArgumentNode(cast_arg_name); + cast_arg->AsArg().type = cast_type; + inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + + // create the stmt node + auto* cast_inst = graph->NewInstructNode(); + // create op + auto cast_op = LiteOpRegistry::Global().Create(op_type); + CHECK(cast_op) << "create op [" << op_type << "] failed"; + cpp::OpDesc op_desc; + op_desc.SetType(op_type); + if (op_type == "cast") { + op_desc.SetAttr("in_dtype", 5); // FP32 + op_desc.SetAttr("out_dtype", 4); // FP16 + op_desc.SetInput("X", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else if (op_type == "transpose") { + // NCHW -> NHWC + op_desc.SetAttr>("axis", {0, 2, 3, 1}); + op_desc.SetInput("X", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else if (op_type == "io_copy") { + op_desc.SetInput("Input", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else { + CHECK(0) << "Unsupport cast type"; + } + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + // create kernels + auto kernels = cast_op->CreateKernels(graph->valid_places()); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + if (op_type == "cast") { + const Type* in_arg_ty = kernel->GetInputDeclType("X"); + if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } + } else if (op_type == "transpose") { + is_found = true; + } else if (op_type == "io_copy") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if 
(TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) &&
+          TargetCompatibleTo(*out_arg_ty, *cast_type)) {
+        is_found = true;
+      }
+    } else {
+      CHECK(0) << "Unsupported cast type";
+    }
+    if (is_found) {
+      selected_kernels.emplace_back(std::move(kernel));
+      // we pick the kernel
+      cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
+      auto& stmt = cast_inst->AsStmt();
+      stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
+          stmt.picked_kernel().target()));
+      break;
+    }
+  }
+  CHECK(is_found) << "Can't find a Cast kernel for Cast op: "
+                  << cur_node->AsArg().name << "->" << op_type;
+  // modify links
+  DirectedLink(cur_node, cast_inst);
+  DirectedLink(cast_inst, cast_arg);
+  return cast_arg;
+}
+
+Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
+                                          const std::string& cast_arg_name,
+                                          SSAGraph* graph,
+                                          Node* cur_node,
+                                          Node* inst_node,
+                                          const Type* cast_type) {
+  // create the arg node
+  auto* cast_arg = graph->NewArgumentNode(cast_arg_name);
+  cast_arg->AsArg().type = cast_type;
+  auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
+  // for CastAfter manually set the tensor's type
+  var->GetMutable<::paddle::lite::Tensor>();
+
+  // create the stmt node
+  auto* cast_inst = graph->NewInstructNode();
+  // create op
+  auto cast_op = LiteOpRegistry::Global().Create(op_type);
+  CHECK(cast_op) << "create op [" << op_type << "] failed";
+  cpp::OpDesc op_desc;
+  op_desc.SetType(op_type);
+  if (op_type == "cast") {
+    op_desc.SetAttr("in_dtype", 4);   // FP16
+    op_desc.SetAttr("out_dtype", 5);  // FP32
+    op_desc.SetInput("X", {cast_arg_name});
+    op_desc.SetOutput("Out", {cur_node->AsArg().name});
+  } else if (op_type == "transpose") {
+    // NHWC -> NCHW
+    op_desc.SetAttr<std::vector<int>>("axis", {0, 3, 1, 2});
+    op_desc.SetInput("X", {cast_arg_name});
+    op_desc.SetOutput("Out", {cur_node->AsArg().name});
+  } else if (op_type == "io_copy") {
+    op_desc.SetInput("Input", {cast_arg_name});
+    op_desc.SetOutput("Out", {cur_node->AsArg().name});
+  } else {
+    CHECK(0) << "Unsupported cast type";
+  }
+
+  cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
+
+  // create kernels
+  auto kernels = cast_op->CreateKernels(graph->valid_places());
+  std::vector<std::unique_ptr<KernelBase>> selected_kernels;
+  bool is_found = false;
+  for (auto& kernel : kernels) {
+    if (op_type == "cast") {
+      const Type* in_arg_ty = kernel->GetInputDeclType("X");
+      if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) {
+        is_found = true;
+      }
+    } else if (op_type == "transpose") {
+      is_found = true;
+    } else if (op_type == "io_copy") {
+      const Type* in_arg_ty = kernel->GetInputDeclType("Input");
+      const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
+      if (TargetCompatibleTo(*in_arg_ty, *cast_type) &&
+          TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) {
+        is_found = true;
+      }
+    } else {
+      CHECK(0) << "Unsupported cast type";
+    }
+    if (is_found) {
+      selected_kernels.emplace_back(std::move(kernel));
+      // we pick the kernel
+      cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
+      auto& stmt = cast_inst->AsStmt();
+      stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
+          stmt.picked_kernel().target()));
+      break;
+    }
+  }
+  CHECK(is_found) << "Can't find a Cast kernel for Cast op: "
+                  << cur_node->AsArg().name << "->" << op_type;
+  // modify links
+  DirectedLink(cast_arg, cast_inst);
+  DirectedLink(cast_inst, cur_node);
+  return cast_arg;
+}
+
+void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
+                                      Node* head_node,
+                                      Node* inst_node,
+                                      const Type* inst_type) {
+  const auto*
head_type = head_node->AsArg().type; + + // break original link + RemoveDirectedLink(head_node, inst_node); + + auto* cur_node = head_node; + const auto name_prefix = + head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + + // layout cast node + if (head_type->layout() != inst_type->layout()) { + cur_node = InsertCastBefore( + "transpose", + name_prefix + "transpose", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + head_type->target(), head_type->precision(), inst_type->layout())); + } + + // precision cast node + if (head_type->precision() != inst_type->precision()) { + cur_node = InsertCastBefore( + "cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + head_type->target(), inst_type->precision(), inst_type->layout())); + } + + // io copy + cur_node = InsertCastBefore( + "io_copy", + name_prefix + "io_copy", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + inst_type->target(), inst_type->precision(), inst_type->layout())); + + // connect cur_node to inst_node + DirectedLink(cur_node, inst_node); + + // reset opdesc and update kernel information + UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), + head_node->AsArg().name, + cur_node->AsArg().name); + // for subgraph op, modify the BlockDesc + auto* sub_block_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetSubBlock(); + for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { + auto* sub_block_op_desc = sub_block_desc->GetOp(i); + UpdateInputTo( + sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name); + } + + // recreate the op + RecreateOp(inst_node, graph); + + graph->CheckValid(); +} + +void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, + const Type** arg_type, + SSAGraph* graph) { + CHECK(inst_node->IsStmt()); + constexpr auto subgraph_target = TARGET(kMLU); + constexpr auto subgraph_layout = DATALAYOUT(kNHWC); + + // get subgraph's valid precision + const auto& places = graph->valid_places(); + std::set<::paddle::lite_api::PrecisionType> prec_set; + for (const auto& place : places) { + if (place.target == TARGET(kMLU)) { + prec_set.insert(place.precision); + } + } + + // get subgraph op's type info + size_t kernel_size = inst_node->AsStmt().kernels().size(); + CHECK_GT(kernel_size, 0); + VLOG(4) << "subgraph kernel size: " << kernel_size; + + for (size_t i = 0; i < kernel_size; ++i) { + auto* kernel = inst_node->AsStmt().kernels()[i].get(); + VLOG(4) << i << "th kernel: " << TargetToStr(kernel->target()) << ", " + << PrecisionToStr(kernel->precision()) << ", " + << DataLayoutToStr(kernel->layout()); + } + + for (size_t i = 0; i < kernel_size; ++i) { + auto* kernel = inst_node->AsStmt().kernels()[i].get(); + CHECK(kernel->target() == subgraph_target); + CHECK(kernel->layout() == subgraph_layout); + if (prec_set.count(kernel->precision()) == 1) { + const auto subgraph_precision = kernel->precision(); + CHECK(subgraph_precision == PRECISION(kFloat) || + subgraph_precision == PRECISION(kFP16)) + << "Mlu node has unsupport precision"; + VLOG(4) << "picked kernel precision: " + << PrecisionToStr(subgraph_precision); + *arg_type = LiteType::GetTensorTy( + subgraph_target, subgraph_precision, subgraph_layout); + break; + } + } +} + +bool MLUPostprocessPass::NeedInsert(Node* node, const Type* inst_type) { + CHECK(node->IsArg()); + + // some op, for example batch_norm, has output nodes useless + if (node->outlinks.size() == 0) { + return false; + } + + // check if node is weight or persistent + bool is_persist = 
node->AsArg().is_weight || node->AsArg().is_persist; + if (is_persist) { + VLOG(4) << "Persistent arg name: " << node->AsArg().name + << " is_weight: " << node->AsArg().is_weight + << " is_persist: " << node->AsArg().is_persist; + return false; + } + + const auto target = node->AsArg().type->target(); + const auto precision = node->AsArg().type->precision(); + const auto layout = node->AsArg().type->layout(); + VLOG(4) << "arg name: " << node->AsArg().name + << " type: " << TargetToStr(target) << ", " + << PrecisionToStr(precision) << ", " << DataLayoutToStr(layout); + + // do not insert nodes if previous node is on mlu already + if (target == inst_type->target()) { + CHECK(layout == inst_type->layout()) << "Mlu node has wrong layout"; + return false; + } + + return true; +} + +void MLUPostprocessPass::InsertAfter(SSAGraph* graph, + Node* tail_node, + Node* inst_node, + const Type* inst_type) { + const auto* tail_type = tail_node->AsArg().type; + + // break original link + RemoveDirectedLink(inst_node, tail_node); + + auto* cur_node = tail_node; + const auto name_prefix = + tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + + // layout cast node + if (tail_type->layout() != inst_type->layout()) { + cur_node = InsertCastAfter( + "transpose", + name_prefix + "transpose", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + tail_type->target(), tail_type->precision(), inst_type->layout())); + } + + // precision cast node + if (tail_type->precision() != inst_type->precision()) { + cur_node = InsertCastAfter( + "cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + tail_type->target(), inst_type->precision(), inst_type->layout())); + } + + // io copy + cur_node = InsertCastAfter( + "io_copy", + name_prefix + "io_copy", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + inst_type->target(), inst_type->precision(), inst_type->layout())); + + // connect cur_node to inst_node + DirectedLink(inst_node, cur_node); + + // reset opdesc and update kernel information + UpdateOutputTo(inst_node->AsStmt().op()->mutable_op_info(), + tail_node->AsArg().name, + cur_node->AsArg().name); + // for subgraph op, modify the BlockDesc + auto* sub_block_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetSubBlock(); + for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { + auto* sub_block_op_desc = sub_block_desc->GetOp(i); + UpdateOutputTo( + sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + } + + // recreate the op + RecreateOp(inst_node, graph); + + graph->CheckValid(); +} + +void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { + auto original_selected_kernel = + std::move(inst_node->AsStmt().kernels().front()); + auto updated_op_info = *inst_node->AsStmt().mutable_op_info(); + + inst_node->AsStmt().ResetOp(updated_op_info, graph->valid_places()); + inst_node->AsStmt().kernels().clear(); + inst_node->AsStmt().kernels().emplace_back( + std::move(original_selected_kernel)); + for (auto& kernel : inst_node->AsStmt().kernels()) { + VLOG(4) << "kernel info: " << kernel->name(); + inst_node->AsStmt().op()->AttachKernel(kernel.get()); + } +} + +void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (!node.IsStmt()) continue; + if (node.AsStmt().op_type() == "feed") { + for (auto& out : node.outlinks) { + bool change = true; + for (auto& inst : out->outlinks) { + if (inst->AsStmt().op_type() != "subgraph") { + change = false; + break; + } + 
}
+        if (change) {
+          const auto* old_type = out->AsArg().type;
+          out->AsArg().type =
+              LiteType::GetTensorTy(old_type->target(),
+                                    old_type->precision(),
+                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    old_type->device());
+        }
+      }
+    }
+    if (node.AsStmt().op_type() == "fetch") {
+      for (auto& inp : node.inlinks) {
+        bool change = true;
+        for (auto& inst : inp->inlinks) {
+          if (inst->AsStmt().op_type() != "subgraph") {
+            change = false;
+            break;
+          }
+        }
+        if (change) {
+          const auto* old_type = inp->AsArg().type;
+          inp->AsArg().type =
+              LiteType::GetTensorTy(old_type->target(),
+                                    old_type->precision(),
+                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    old_type->device());
+        }
+      }
+    }
+  }
+}
+
+void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  // currently, for non-persistent input and output args, the mlu subgraph op
+  // only supports float16/float32 data types
+
+  // in the two situations below:
+  // 1: feed->arg_in->subgraph->...   2: ...->subgraph->arg_out->fetch;
+  // arg_in and arg_out are assumed to be NHWC, which the user should be
+  // aware of, so here we change these args' layout to NHWC
+  ModifyLayout(graph.get());
+
+  // insert io_copy, layout and precision casts at the subgraph's inputs
+  // and outputs
+  for (auto& node : graph->mutable_nodes()) {
+    if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
+      const Type* subgraph_arg_type = nullptr;
+      GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get());
+
+      auto links_tmp = node.inlinks;
+      for (auto p_in : links_tmp) {
+        if (NeedInsert(p_in, subgraph_arg_type)) {
+          InsertBefore(graph.get(), p_in, &node, subgraph_arg_type);
+        }
+      }
+      links_tmp.assign(node.outlinks.begin(), node.outlinks.end());
+      for (auto p_out : links_tmp) {
+        if (NeedInsert(p_out, subgraph_arg_type)) {
+          InsertAfter(graph.get(), p_out, &node, subgraph_arg_type);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(mlu_postprocess_pass, paddle::lite::mir::MLUPostprocessPass)
+    .BindTargets({TARGET(kMLU)});
diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h
new file mode 100644
index 0000000000..8ffcbc952a
--- /dev/null
+++ b/lite/core/mir/mlu_postprocess_pass.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
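// What the pass declared below inserts at a subgraph input, sketched
// (the output side mirrors it via InsertCastAfter; the layouts/precisions
// shown assume an FP16 NHWC MLU kernel was picked):
//
//   before: feed -> x -> subgraph
//   after:  feed -> x -> transpose(NCHW->NHWC) -> cast(FP32->FP16)
//                     -> io_copy(host->MLU) -> subgraph
//
// Each inserted stmt writes to a fresh arg node named
// "<arg>_<inst-ptr>/trans_<op>", matching the name_prefix built in
// InsertCastBefore/InsertCastAfter above.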
+ +#pragma once + +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +static void UpdateInputTo(cpp::OpDesc* desc, + const std::string& from, + const std::string& to) { + for (auto& item : *desc->mutable_inputs()) { + for (auto& input : item.second) { + if (input == from) { + input = to; + } + } + } + if (desc->Type() != "subgraph") return; + auto input_names = + desc->GetAttr>("input_data_names"); + for (size_t i = 0; i < input_names.size(); ++i) { + if (input_names[i] == from) { + input_names[i] = to; + } + } + desc->SetAttr>("input_data_names", input_names); +} + +static void UpdateOutputTo(cpp::OpDesc* desc, + const std::string& from, + const std::string& to) { + for (auto& item : *desc->mutable_outputs()) { + for (auto& output : item.second) { + if (output == from) { + output = to; + } + } + } + if (desc->Type() != "subgraph") return; + auto output_names = + desc->GetAttr>("output_data_names"); + for (size_t i = 0; i < output_names.size(); ++i) { + if (output_names[i] == from) { + output_names[i] = to; + } + } + desc->SetAttr>("output_data_names", output_names); +} + +/* + * The pass changes the node's target to mlu which follows a mlu subgraph op + * */ +class MLUPostprocessPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + void GetSubgraphOpArgType(Node* inst_node, + const Type** arg_type, + SSAGraph* graph); + + void ModifyLayout(SSAGraph* graph); + + bool NeedInsert(Node* node, const Type* inst_type); + + void InsertBefore(SSAGraph* graph, + Node* head_node, + Node* inst_node, + const Type* type); + + void InsertAfter(SSAGraph* graph, + Node* tail_node, + Node* inst_node, + const Type* type); + + Node* InsertCastBefore(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type); + + Node* InsertCastAfter(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type); + + void RecreateOp(Node* inst_node, SSAGraph* graph); +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/subgraph_cast_display_pass.cc b/lite/core/mir/subgraph_cast_display_pass.cc new file mode 100644 index 0000000000..3a2c94d232 --- /dev/null +++ b/lite/core/mir/subgraph_cast_display_pass.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
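// Roughly the kind of trace the debug pass below emits around a subgraph op,
// pieced together from its VLOG calls (names and types are illustrative):
//
//   == SubgraphOp Debug Info ==
//   FOUND SUBGRAPH OP
//   * ARG[IN] x_0x.../trans_io_copy type: ... is_weight: 0 is_persist: 0 ...
//   ** END with op type: cast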
+ +#include "lite/core/mir/pass.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +class SubgraphCastDisplayPass : public DebugPass { + public: + void Apply(const std::unique_ptr& graph) override { + VLOG(3) << "== Argument types =="; + for (auto& node : graph->mutable_nodes()) { + if (!node.IsArg()) continue; + + auto* type = node.AsArg().type; + if (type) { + VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type; + } else { + VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK"; + } + } + VLOG(3) << "---------------------"; + + // + VLOG(0) << "== SubgraphOp Debug Info =="; + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + VLOG(0) << "FOUND SUBGRAPH OP"; + display_debug_info(node, "subgraph"); + break; + } + } + VLOG(0) << "---------------------"; + } + + void display_debug_info(const Node& node, + std::string op_type, + bool display_in_nodes = true, + bool display_out_nodes = true) { + CHECK(node.IsStmt()); + VLOG(0) << node.AsStmt(); + if (display_in_nodes) { + for (auto p_in_arg_node : node.inlinks) { + CHECK(p_in_arg_node->IsArg()); + VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name + << " type: " << *p_in_arg_node->AsArg().type + << " is_weight: " << p_in_arg_node->AsArg().is_weight + << " is_persist: " << p_in_arg_node->AsArg().is_persist + << " input_count: " << p_in_arg_node->inlinks.size(); + if (p_in_arg_node->inlinks.size() == 0) { + VLOG(0) << "** END with No Op"; + } + for (auto p_in_stmt_node : p_in_arg_node->inlinks) { + CHECK(p_in_stmt_node->IsStmt()); + std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type(); + if (stmt_op_type == "cast" || stmt_op_type == "transpose" || + stmt_op_type == "io_copy") { + display_debug_info(*p_in_stmt_node, stmt_op_type, true, false); + } else { + VLOG(0) << "** END with op type: " << stmt_op_type; + } + } + } + } + if (display_out_nodes) { + for (auto p_out_arg_node : node.outlinks) { + CHECK(p_out_arg_node->IsArg()); + VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name + << " type: " << *p_out_arg_node->AsArg().type + << " is_weight: " << p_out_arg_node->AsArg().is_weight + << " is_persist: " << p_out_arg_node->AsArg().is_persist + << " output_count: " << p_out_arg_node->outlinks.size(); + if (p_out_arg_node->outlinks.size() == 0) { + VLOG(0) << "** END with No Op"; + } + for (auto p_out_stmt_node : p_out_arg_node->outlinks) { + CHECK(p_out_stmt_node->IsStmt()); + std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type(); + if (stmt_op_type == "cast" || stmt_op_type == "transpose" || + stmt_op_type == "io_copy") { + display_debug_info(*p_out_stmt_node, stmt_op_type, false, true); + } else { + VLOG(0) << "** END with op type: " << stmt_op_type; + } + } + } + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(subgraph_cast_display_pass, + paddle::lite::mir::SubgraphCastDisplayPass) + .BindTargets({TARGET(kAny)}); diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt new file mode 100644 index 0000000000..1c41f05ca0 --- /dev/null +++ b/lite/kernels/mlu/CMakeLists.txt @@ -0,0 +1,8 @@ +if(NOT LITE_WITH_MLU) + return() +endif() + +add_subdirectory(bridges) +add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges}) +add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) +add_kernel(calib_compute_mlu MLU basic SRCS 
calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt
new file mode 100644
index 0000000000..302d580ee1
--- /dev/null
+++ b/lite/kernels/mlu/bridges/CMakeLists.txt
@@ -0,0 +1,41 @@
+if(NOT LITE_WITH_MLU)
+  return()
+endif()
+
+lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor)
+lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs})
+lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu)
+
+set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu)
+
+lite_cc_library(subgraph_bridge_act_op_mlu SRCS act_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_batch_norm_op_mlu SRCS batch_norm_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_conv_op_mlu SRCS conv_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${mlu_subgraph_bridge_deps})
+set(mlu_subgraph_bridges
+    subgraph_bridge_registry
+    subgraph_bridge_utility_mlu
+    subgraph_bridge_graph_mlu
+    subgraph_bridge_act_op_mlu
+    subgraph_bridge_conv_op_mlu
+    subgraph_bridge_elementwise_ops_mlu
+    subgraph_bridge_pool_op_mlu
+    subgraph_bridge_softmax_op_mlu
+    subgraph_bridge_fc_op_mlu
+    subgraph_bridge_batch_norm_op_mlu
+    CACHE INTERNAL "mlu_subgraph_bridges")
+
+
+# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
+# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+
+message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
diff --git a/lite/kernels/mlu/bridges/act_op.cc b/lite/kernels/mlu/bridges/act_op.cc
new file mode 100644
index 0000000000..50291ec297
--- /dev/null
+++
b/lite/kernels/mlu/bridges/act_op.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type); + cnmlBaseOp_t activation_op; + CNML_CALL(cnmlCreateActiveOp(&activation_op, + act_type, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + graph->FuseOp(activation_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter); diff --git a/lite/kernels/mlu/bridges/act_op_test.cc b/lite/kernels/mlu/bridges/act_op_test.cc new file mode 100644 index 0000000000..51cdc52dc6 --- /dev/null +++ b/lite/kernels/mlu/bridges/act_op_test.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
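// The converter above shows the minimal shape every MLU bridge follows:
// register an MLU tensor for the op's output, look up the already-converted
// input node, create one CNML op, and hand it to the graph for later fusion.
// Condensed (same identifiers as in ActConverter):
//
//   auto output_tensor = graph->AddNode(out_var_name, output_dims,
//                                       CNML_TENSOR, CNML_NHWC,
//                                       graph->FPType());
//   auto input_tensor = graph->GetNode(x_var_name);
//   CNML_CALL(cnmlCreateActiveOp(&activation_op, act_type,
//                                input_tensor->mlu_tensor(),
//                                output_tensor->mlu_tensor()));
//   graph->FuseOp(activation_op);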
+
+#include <gtest/gtest.h>
+#include <cmath>
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/operators/activation_ops.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int ActConverter(void* ctx, OpLite* op, KernelBase* kernel);
+
+template <typename dtype>
+void FillTensor(Tensor* x,
+                float lower = -2,
+                float upper = -2);
+
+void act_ref(const std::shared_ptr<lite::OpLite> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto x = scope->FindTensor("x");
+  auto out = scope->FindMutableTensor("out");
+  auto out_ref = scope->FindMutableTensor("out_ref");
+  out->Resize(x->dims());
+  out_ref->Resize(x->dims());
+  auto x_data = x->data<float>();
+  auto out_data = out->mutable_data<float>();
+  CHECK_EQ(x->numel(), out->numel());
+
+  // "sigmoid","relu","tanh","relu_clipped","leaky_relu","softsign","hard_sigmoid"
+  if (op_type == "sigmoid") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = 1.f / (1.f + std::exp(-x_data[i]));
+    }
+  } else if (op_type == "relu") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::max(0.f, x_data[i]);
+    }
+  } else if (op_type == "tanh") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = (std::exp(x_data[i]) - std::exp(-x_data[i])) /
+                    (std::exp(x_data[i]) + std::exp(-x_data[i]));
+    }
+  } else if (op_type == "relu_clipped") {
+    auto relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::min(std::max(0.f, x_data[i]), relu_clipped_coef);
+    }
+  } else if (op_type == "relu6") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::min(std::max(0.f, x_data[i]), 6.f);
+    }
+  } else if (op_type == "leaky_relu") {
+    auto alpha = op_info->GetAttr<float>("alpha");
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::max(x_data[i], x_data[i] * alpha);
+    }
+  } else if (op_type == "softsign") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = x_data[i] / (1 + std::abs(x_data[i]));
+    }
+  } else if (op_type == "hard_sigmoid") {
+    auto slope = op_info->GetAttr<float>("slope");
+    auto offset = op_info->GetAttr<float>("offset");
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::min(1.f, slope * x_data[i] + offset);
+      out_data[i] = std::max(0.f, out_data[i]);
+    }
+  } else {
+    LOG(FATAL) << "unsupported activation type: " << op_type;
+  }
+}
+
+void test_act(std::vector<int64_t> x_shape, std::string op_type) {
+  // prepare input&output variables
+  Scope scope;
+  std::string x_var_name("x");
+  std::string out_var_name("out");
+  std::string out_ref_var_name("out_ref");
+  auto* x = scope.NewTensor(x_var_name);
+  auto* out = scope.NewTensor(out_var_name);
+  auto* out_ref = scope.NewTensor(out_ref_var_name);
+  x->Resize(x_shape);
+
+  // initialize input&output data
+  FillTensor<float>(x, 2, 8);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType(op_type);
+  opdesc.SetInput("X", {x_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  if (op_type == "relu_clipped") {
+    opdesc.SetAttr("Relu_clipped_coef", 3.f);
+  } else if (op_type == "relu6") {
+    opdesc.SetAttr("Relu_clipped_coef", 6.f);
+  } else if (op_type == "leaky_relu") {
+    opdesc.SetAttr("alpha", 0.02f);
+  } else if (op_type == "hard_sigmoid") {
+    opdesc.SetAttr("slope", 0.2f);
+    opdesc.SetAttr("offset", 0.5f);
+  }
+
+  // create and convert op to MLU model, then run it on MLU
+  auto op = CreateOp<operators::ActivationOp>(opdesc, &scope);
+  // execute reference
implementation and save to output tensor + act_ref(op); + out_ref->CopyDataFrom(*out); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, activation) { + std::vector> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; + std::vector types{"sigmoid", "relu", "tanh"}; + for (auto x_shape : shapes) { + for (auto op_type : types) { + test_act(x_shape, op_type); + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, + sigmoid, + paddle::lite::subgraph::mlu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter); diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc new file mode 100644 index 0000000000..d95a5115c9 --- /dev/null +++ b/lite/kernels/mlu/bridges/batch_norm_op.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
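// Note on the parameter folding done by the converter below: with
//   alpha = scale / sqrt(variance + epsilon)
// batch norm
//   y = alpha * (x - mean) + bias
// can be rewritten as
//   y = alpha * (x - (mean - bias / alpha))
// so the per-channel loop overwrites variance[i] with alpha and mean[i]
// with the folded mean, and only these two const tensors are bound and
// passed to cnmlCreateBatchNormOpForward.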
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto scale_var_name = op_info->Input("Scale").front(); + auto bias_var_name = op_info->Input("Bias").front(); + auto mean_var_name = op_info->Input("Mean").front(); + auto variance_var_name = op_info->Input("Variance").front(); + auto y_var_name = op_info->Output("Y").front(); + auto epsilon = op_info->GetAttr("epsilon"); + + auto output = scope->FindVar(y_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); + + CHECK(graph->HasNode(x_var_name)); + + auto mean = scope->FindVar(mean_var_name)->GetMutable(); + auto mean_dims = mean->dims().Vectorize(); + auto mean_tensor = graph->AddNode( + mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + + auto variance = scope->FindVar(variance_var_name)->GetMutable(); + auto variance_dims = variance->dims().Vectorize(); + auto variance_tensor = graph->AddNode( + variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + + auto scale = scope->FindVar(scale_var_name)->GetMutable(); + auto bias = scope->FindVar(bias_var_name)->GetMutable(); + + int co = static_cast(mean_dims[0]); + + for (int i = 0; i < co; ++i) { + variance->mutable_data()[i] = + scale->data()[i] / sqrtf(variance->data()[i] + epsilon); + mean->mutable_data()[i] = + mean->data()[i] - + bias->data()[i] / variance->data()[i]; + } + + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t bn_op; + CNML_CALL(cnmlCreateBatchNormOpForward(&bn_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + mean_tensor->mlu_tensor(), + variance_tensor->mlu_tensor())); + + graph->BindConstData(variance_var_name, variance); + graph->BindConstData(mean_var_name, mean); + graph->FuseOp(bn_op); + + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(batch_norm, + kMLU, + paddle::lite::subgraph::mlu::BatchNormConverter); diff --git a/lite/kernels/mlu/bridges/batch_norm_op_test.cc b/lite/kernels/mlu/bridges/batch_norm_op_test.cc new file mode 100644 index 0000000000..47e291bf3d --- /dev/null +++ b/lite/kernels/mlu/bridges/batch_norm_op_test.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
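// Layout round trip used by the test below (sketch; buffer names are
// illustrative): the reference result is computed in NCHW, while the MLU
// subgraph consumes NHWC, so the input is transposed before LaunchOp and
// the output is transposed back before comparison:
//
//   transpose(x_nchw, x_nhwc, {bs, ic, ih, iw}, {0, 2, 3, 1});  // to NHWC
//   LaunchOp(op, {x_var_name}, {out_var_name});
//   transpose(y_nhwc, y_nchw, {bs, ih, iw, ic}, {0, 3, 1, 2});  // to NCHW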
+ +#include "lite/operators/batch_norm_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int BatchNormConverter(void* ctx, OpLite* op); + +template +void batch_norm_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable(); + auto bias = + scope->FindVar(op_info->Input("Bias").front())->GetMutable(); + auto scale = + scope->FindVar(op_info->Input("Scale").front())->GetMutable(); + auto mean = + scope->FindVar(op_info->Input("Mean").front())->GetMutable(); + auto variance = + scope->FindVar(op_info->Input("Variance").front())->GetMutable(); + + auto x_data = x->data(); + auto y_data = y->mutable_data(); + auto scale_data = scale->mutable_data(); + auto bias_data = bias->mutable_data(); + auto mean_data = mean->mutable_data(); + auto variance_data = variance->mutable_data(); + DDim x_dims = x->dims(); + + float epsilon = op_info->GetAttr("epsilon"); + // float momentum = op_info->GetAttr("momentum"); + auto data_layout = op_info->GetAttr("data_layout"); + + bool global_stats = op_info->GetAttr("use_global_stats"); + if (global_stats) { + int64_t outer_size = 0; + int64_t channel_size = 0; + int64_t inner_size = 0; + if (data_layout == "NCHW") { + outer_size = x_dims[0]; + channel_size = x_dims[1]; + inner_size = x_dims.Slice(2, x_dims.size()).production(); + } else { + LOG(FATAL) << "Unknown storage order: " << data_layout; + } + auto x_ptr = x_data; + auto y_ptr = y_data; + for (int o = 0; o < outer_size; o++) { + for (int c = 0; c < channel_size; c++) { + for (int i = 0; i < inner_size; i++) { + dtype norm_x = + (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); + *y_ptr = norm_x * scale_data[c] + bias_data[c]; + x_ptr++; + y_ptr++; + } + } + } + } +} + +void test_batch_norm( + int bs, int ic, int ih, int iw, float epsilon, float momentum) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + std::string scale_var_name = "scale"; + std::string bias_var_name = "bias"; + std::string mean_var_name = "mean"; + std::string variance_var_name = "variance"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* scale = scope.Var(scale_var_name)->GetMutable(); + auto* bias = scope.Var(bias_var_name)->GetMutable(); + auto* mean = scope.Var(mean_var_name)->GetMutable(); + auto* variance = scope.Var(variance_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + scale->Resize({ic}); + bias->Resize({ic}); + mean->Resize({ic}); + variance->Resize({ic}); + + // initialize input&output data + FillTensor(x, -100, 100); + FillTensor(scale, -6.7, 13.78); + FillTensor(bias, -12.11, 12.94); + FillTensor(mean, -23.45, 67.89); + // variance > 0 + FillTensor(variance, 1.5f, 76.78f); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("batch_norm"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetInput("Scale", {scale_var_name}); + opdesc.SetInput("Bias", {bias_var_name}); + opdesc.SetInput("Mean", {mean_var_name}); + opdesc.SetInput("Variance", {variance_var_name}); + opdesc.SetOutput("Y", 
{out_var_name}); + opdesc.SetAttr("is_test", 1); + opdesc.SetAttr("use_global_stats", true); + opdesc.SetAttr("epsilon", epsilon); + opdesc.SetAttr("momentum", momentum); + opdesc.SetAttr("data_layout", std::string("NCHW")); + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + // execute reference implementation and save to output tensor + batch_norm_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + + out->Resize({bs, ih, iw, ic}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize({bs, ic, ih, iw}); + transpose(out_data, + output_trans.mutable_data(), + {bs, ih, iw, ic}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, batch_norm) { + for (auto bs : {1, 4, 7}) { + for (auto ic : {1, 4, 7}) { + for (auto ih : {1, 4, 7}) { + for (auto iw : {1, 4, 7}) { + for (auto epsilon : {1e-4f, 1e-5f}) { + for (auto momentum : {0.9f, 0.99f}) { + test_batch_norm(bs, ic, ih, iw, epsilon, momentum); + } + } + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, + batch_norm, + paddle::lite::subgraph::mlu::BatchNormConverter); diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc new file mode 100644 index 0000000000..e9fdacdca9 --- /dev/null +++ b/lite/kernels/mlu/bridges/conv_op.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/conv_op.h" +#include +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto* graph = static_cast(ctx); + const auto* op_info = op->op_info(); + const auto* scope = op->scope(); + VLOG(3) << "[MLU] Converting " << op_info->Type() << "... 
"; + + // Get input, filter and op attributes + const auto input_var_name = op_info->Input("Input").front(); + const auto& input_dims_nhwc = + scope->FindVar(input_var_name)->GetMutable()->dims(); + const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); + const auto filter_var_name = op_info->Input("Filter").front(); + auto* filter = scope->FindVar(filter_var_name)->GetMutable(); + const auto& filter_dims = filter->dims(); + const auto output_var_name = op_info->Output("Output").front(); + const auto bs = input_dims[0]; + const auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4); + CHECK_EQ(filter_dims.size(), 4); + const auto strides = op_info->GetAttr>("strides"); + auto dilations = op_info->GetAttr>("dilations"); + auto paddings = op_info->GetAttr>("paddings"); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + const std::string padding_algorithm = + op_info->HasAttr("padding_algorithm") + ? op_info->GetAttr("padding_algorithm") + : ""; + + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + std::vector output_shape({bs, oc}); + for (size_t i = 0; i < 2; i++) { + const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; + output_shape.push_back( + (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1); + } + + const auto output_shape_nhwc = DimNCHW2NHWC(output_shape); + const auto output_tensor = graph->AddNode(output_var_name, + output_shape_nhwc, + CNML_TENSOR, + CNML_NHWC, + graph->FPType()); + scope->FindVar(output_var_name) + ->GetMutable<::paddle::lite::Tensor>() + ->Resize(output_shape_nhwc); + + // Create filter node + const auto filter_tensor = graph->AddNode(filter_var_name, + filter_dims.Vectorize(), + CNML_FILTER, + CNML_NCHW, + graph->FPType()); + const auto weight_scale = + op_info->GetAttr>("weight_scale"); + + if (filter->precision() == PrecisionType::kUnk || + filter->precision() == PrecisionType::kInt8) { + std::vector filter_dequant(filter->data_size()); + dequant(filter_dequant.data(), + filter->mutable_data(), + 1, + filter_dims[0], + filter_dims[1] * filter_dims[2] * filter_dims[3], + weight_scale); + transpose(filter_dequant.data(), + filter->mutable_data(), + {static_cast(filter_dims[0]), + static_cast(filter_dims[1]), + static_cast(filter_dims[2]), + static_cast(filter_dims[3])}, + {0, 2, 3, 1}); + filter->set_precision(PrecisionType::kFloat); + } else if (filter->precision() != PrecisionType::kFloat) { + LOG(FATAL) << "UnSupported weight precision!"; + } + + cnmlConvOpParam_t conv_param; + CNML_CALL(cnmlCreateConvOpParam(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[0] * 2, + paddings[2] * 2)); + std::string bias_var_name; + std::shared_ptr bias_tensor; + if (HasInputArg(op_info, scope, "Bias")) { + const DDim output_dims(output_shape); + bias_var_name = op_info->Input("Bias").front(); + auto* bias = scope->FindVar(bias_var_name)->GetMutable(); + const auto& bias_dims = bias->dims(); + const auto bias_data_size = bias_dims.production(); + const auto output_data_size = output_dims.production(); + std::vector bias_shape; + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {oc}; + } else if 
(bias_data_size == output_data_size / bs) { + LOG(FATAL) << "Unsupported ... ..."; + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + LOG(FATAL) << "Unsupported ... ..."; + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(ERROR) << "[MLU] Bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + } + bias_tensor = graph->AddNode(bias_var_name, + bias_dims.Vectorize(), + CNML_CONST, + CNML_CNHW, + graph->FPType()); + graph->BindConstData(bias_var_name, bias); + } + cnmlBaseOp_t conv_op; + const auto input_scale = op_info->GetAttr("input_scale"); + CNML_CALL(cnmlCreateConvOpForward( + &conv_op, + conv_param, + graph->GetNode(input_var_name)->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); + + graph->SetComputingDataType( + conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); + graph->SetComputingDataType( + conv_op, + filter_tensor->mlu_tensor(), + 1 / *min_element(weight_scale.begin(), weight_scale.end())); + CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC)); + if (HasInputArg(op_info, scope, "Bias")) { + auto* bias = scope->FindVar(bias_var_name)->GetMutable(); + graph->BindConstData(bias_var_name, bias); + } + graph->BindConstData(filter_var_name, filter); + graph->FuseOp(conv_op); + CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kMLU, + paddle::lite::subgraph::mlu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kMLU, + paddle::lite::subgraph::mlu::ConvConverter); diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc new file mode 100644 index 0000000000..e8ef9ba04f --- /dev/null +++ b/lite/kernels/mlu/bridges/conv_op_test.cc @@ -0,0 +1,350 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
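+// NOTE: this test quantizes the filter to int8, runs the float reference
+// convolution on the CPU, then feeds NHWC-transposed inputs to the MLU
+// bridge and transposes the MLU output back to NCHW before comparison.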
+ +#include "lite/operators/conv_op.h" +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ConvConverter(void* ctx, OpLite* op); + +void conv_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto input = + scope->FindVar(op_info->Input("Input").front())->GetMutable(); + auto filter = + scope->FindVar(op_info->Input("Filter").front())->GetMutable(); + auto output = + scope->FindVar(op_info->Output("Output").front())->GetMutable(); + std::vector strides = + op_info->GetAttr>("strides"); + std::vector paddings = + op_info->GetAttr>("paddings"); + int32_t groups = op_info->GetAttr("groups"); + std::vector dilations = + op_info->GetAttr>("dilations"); + bool fuse_relu = op_info->GetAttr("fuse_relu"); + auto input_dims = input->dims(); + auto filter_dims = filter->dims(); + auto output_dims = output->dims(); + auto input_data = input->mutable_data(); + auto filter_data = filter->mutable_data(); + auto output_data = output->mutable_data(); + int kernel_w = filter_dims[3]; + int kernel_h = filter_dims[2]; + int stride_w = strides[1]; + int stride_h = strides[0]; + int dila_w = dilations[1]; + int dila_h = dilations[0]; + int pad_w = paddings[2]; + int pad_h = paddings[0]; + int batch_size = input_dims[0]; + int in_ch_size = input_dims[1]; + int in_h = input_dims[2]; + int in_w = input_dims[3]; + int out_ch_size = output_dims[1]; + int out_h = output_dims[2]; + int out_w = output_dims[3]; + int out_c_group = out_ch_size / groups; + int in_c_group = in_ch_size / groups; + Tensor* bias = nullptr; + float* bias_data = nullptr; + bool is_channel_bias = false; + if (op_info->HasInput("Bias")) { + auto bias_var_names = op_info->Input("Bias"); + if (bias_var_names.size() > 0) { + auto bias_var_name = bias_var_names.front(); + bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_dims = bias->dims(); + is_channel_bias = bias_dims.production() == out_ch_size; + bias_data = bias->mutable_data(); + } + } + for (int n = 0; n < batch_size; ++n) { + for (int g = 0; g < groups; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * groups * out_c_group * out_h * out_w + + g * out_c_group * out_h * out_w + oc * out_h * out_w + + oh * out_w + ow; + float out_value = + bias_data != nullptr + ? (is_channel_bias ? bias_data[g * out_c_group + oc] + : bias_data[out_idx]) + : 0; + // + out_value *= beta; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dila_w); + int ih = oh * stride_h - pad_h + kh * (dila_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + int in_idx = n * in_ch_size * in_h * in_w + + g * in_c_group * in_h * in_w + ic * in_h * in_w + + ih * in_w + iw; + int filter_idx = + g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + kh * kernel_w + kw; + out_value += input_data[in_idx] * filter_data[filter_idx]; + } + } + } + if (fuse_relu) { + out_value = out_value > 0 ? 
out_value : 0; + } + output_data[out_idx] = out_value; + } + } + } + } + } +} + +void test_conv(int bs, + int ic, + int oc, + int ih, + int iw, + bool has_bias, + bool is_channel_bias, + bool fuse_relu, + bool depthwise, + int dilation, + int stride, + int padding, + int kernel) { + // prepare input&output variables + Scope scope; + std::string input_var_name("input"); + std::string filter_var_name("filter"); + std::string filter_int_var_name("filter_int"); + std::string bias_var_name("bias"); + std::string output_var_name("output"); + std::string output_ref_var_name("output_ref"); + auto* input = scope.Var(input_var_name)->GetMutable(); + auto* filter = scope.Var(filter_var_name)->GetMutable(); + auto* filter_int = scope.Var(filter_int_var_name)->GetMutable(); + auto* bias = scope.Var(bias_var_name)->GetMutable(); + auto* output = scope.Var(output_var_name)->GetMutable(); + auto* output_ref = scope.Var(output_ref_var_name)->GetMutable(); + + // get group size and input&filter shape + int groups = 1; + if (depthwise) { // depthwise convolution ? + groups = oc = ic; + } + std::vector input_shape = {bs, ic, ih, iw}; + std::vector filter_shape = {oc, ic / groups, kernel, kernel}; + std::vector output_shape({bs, oc}); + for (size_t i = 0; i < 2; i++) { + const int dkernel = dilation * (kernel - 1) + 1; + int output_size = (input_shape[i + 2] + 2 * padding - dkernel) / stride + 1; + output_shape.push_back(output_size); + } + input->Resize(input_shape); + filter->Resize(filter_shape); + filter_int->Resize(filter_shape); + // initialize input&output data + FillTensor(filter_int, -4, 4); + float filter_scale = 1. / 16; + float input_scale = 1. / 8; + + Tensor input_int; + input_int.Resize(input_shape); + FillTensor(&input_int, -127, 127); + for (int i = 0; i < input->data_size(); i++) { + input->mutable_data()[i] = input_int.data()[i] * input_scale; + } + for (int i = 0; i < filter->data_size(); i++) { + filter->mutable_data()[i] = + filter_int->data()[i] * filter_scale; + } + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType(depthwise ? "depthwise_conv2d" : "conv2d"); + opdesc.SetInput("Input", {input_var_name}); + opdesc.SetInput("Filter", {filter_var_name}); + opdesc.SetOutput("Output", {output_var_name}); + opdesc.SetAttr("dilations", std::vector({dilation, dilation})); + opdesc.SetAttr("strides", std::vector({stride, stride})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); + opdesc.SetAttr("groups", groups); + opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); + if (has_bias) { + if (is_channel_bias) { + bias->Resize({oc}); + } else { + bias->Resize({output_shape}); + } + FillTensor(bias); + opdesc.SetInput("Bias", {bias_var_name}); + } + + auto op_cpu = CreateOp(opdesc, &scope); + // execute reference implementation and save to output tensor('out') + conv_ref(op_cpu); + output_ref->CopyDataFrom(*output); + + // initialize op desc + cpp::OpDesc opdesc_mlu; + opdesc_mlu.SetType(depthwise ? 
"depthwise_conv2d" : "conv2d"); + opdesc_mlu.SetInput("Input", {input_var_name}); + opdesc_mlu.SetInput("Filter", {filter_int_var_name}); + opdesc_mlu.SetOutput("Output", {output_var_name}); + opdesc_mlu.SetAttr("dilations", std::vector({dilation, dilation})); + opdesc_mlu.SetAttr("strides", std::vector({stride, stride})); + opdesc_mlu.SetAttr( + "paddings", std::vector({padding, padding, padding, padding})); + opdesc_mlu.SetAttr("groups", groups); + opdesc_mlu.SetAttr("fuse_relu", static_cast(fuse_relu)); + + opdesc_mlu.SetAttr("weight_scale", std::vector(oc, filter_scale)); + opdesc_mlu.SetAttr("input_scale", input_scale); + + if (has_bias) { + if (is_channel_bias) { + bias->Resize({oc}); + } else { + bias->Resize({output_shape}); + } + FillTensor(bias); + opdesc_mlu.SetInput("Bias", {bias_var_name}); + } + + for (int i = 0; i < bs; i++) { + for (int j = 0; j < ic; j++) { + for (int k = 0; k < ih * iw; k++) { + input->mutable_data()[i * ic * ih * iw + k * ic + j] = + input_int.data()[i * ic * ih * iw + j * ih * iw + k] * + input_scale; + } + } + } + + input->Resize({bs, ih, iw, ic}); + output->Resize( + {output_shape[0], output_shape[2], output_shape[3], output_shape[1]}); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc_mlu, &scope); + LaunchOp(op, {input_var_name}, {output_var_name}); + // compare results + auto* output_data = output->mutable_data(); + auto* output_ref_data = output_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize({output_shape}); + transpose(output_data, + output_trans.mutable_data(), + {static_cast(output_shape[0]), + static_cast(output_shape[2]), + static_cast(output_shape[3]), + static_cast(output_shape[1])}, + {0, 3, 1, 2}); + output_data = output_trans.mutable_data(); + for (int i = 0; i < output->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, conv) { +#if 1 + for (auto bs : {1}) { + for (auto ic : {3}) { + for (auto oc : {32}) { + for (auto ih : {13}) { + for (auto iw : {13}) { + for (auto has_bias : {false}) { + for (auto is_channel_bias : {true}) { + for (auto fuse_relu : {false}) { + for (auto depthwise : {false}) { + for (auto dilation : {1}) { + for (auto stride : {1}) { + for (auto kernel : {3}) { + // std::vector paddings = {kernel / 2}; + std::vector paddings = {0}; + if (kernel / 2 != 0) { + // paddings.push_back(0); + } + for (auto padding : paddings) { + VLOG(3) << "bs: " << bs << " ic: " << ic + << " oc: " << oc << " ih: " << ih + << " iw: " << iw + << " has_bias: " << has_bias + << " is_channel_bias: " << is_channel_bias + << " fuse_relu: " << fuse_relu + << " depthwise: " << depthwise + << " dilation: " << dilation + << " stride: " << stride + << " padding: " << padding + << " kernel: " << kernel; + test_conv(bs, + ic, + oc, + ih, + iw, + has_bias, + is_channel_bias, + fuse_relu, + depthwise, + dilation, + stride, + padding, + kernel); + } + } + } + } + } + } + } + } + } + } + } + } + } +#else + test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 1, 3); + test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 3); + test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 2, 5); + test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 5); +#endif +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, + conv2d, + paddle::lite::subgraph::mlu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, + depthwise_conv2d, + 
paddle::lite::subgraph::mlu::ConvConverter);
diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc
new file mode 100644
index 0000000000..4ef949925d
--- /dev/null
+++ b/lite/kernels/mlu/bridges/elementwise_ops.cc
@@ -0,0 +1,153 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
+  auto x_dims = x.dims();
+  CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
+  auto y_dims = y->dims();
+  CHECK_GE(x_dims.size(), y_dims.size());
+
+  if (axis < 0) {
+    axis += x_dims.size();
+  }
+
+  std::vector<int64_t> y_new_shape(y_dims.Vectorize());
+  if (y_new_shape.size() == 4UL) {
+    return y_new_shape;
+  }
+  for (int i = 0; i < axis; i++) {
+    y_new_shape.insert(y_new_shape.begin(), 1);
+  }
+  while (y_new_shape.size() < 4) {
+    y_new_shape.push_back(1);
+  }
+  CHECK_EQ(y_new_shape.size(), 4UL);
+  return y_new_shape;
+}
+
+int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  auto x_var_name = op_info->Input("X").front();
+  auto y_var_name = op_info->Input("Y").front();
+  auto out_var_name = op_info->Output("Out").front();
+  auto axis = op_info->GetAttr<int>("axis");
+
+  auto x_tensor = graph->GetNode(x_var_name);
+  auto x = scope->FindTensor(x_var_name);
+  std::shared_ptr<MLUTensor> y_tensor;
+  if (graph->HasNode(y_var_name)) {
+    y_tensor = graph->GetNode(y_var_name);
+  } else {
+    auto y = scope->FindMutableTensor(y_var_name);
+    auto y_new_shape = CvtYShape(*x, y, axis);
+    // All subgraph input tensors are created up front, so a tensor that is
+    // not found in the graph must be a const tensor.
+    y_tensor = graph->AddNode(
+        y_var_name, y_new_shape, CNML_CONST, CNML_NCHW, graph->FPType());
+    graph->BindConstData(y_var_name, y);
+  }
+
+  auto output_tensor = graph->AddNode(out_var_name,
+                                      x->dims().Vectorize(),
+                                      CNML_TENSOR,
+                                      CNML_NHWC,
+                                      graph->FPType());
+
+  cnmlBaseOp_t elementwise_op;
+  if (op_type == "elementwise_add") {
+    CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op,
+                                       x_tensor->mlu_tensor(),
+                                       y_tensor->mlu_tensor(),
+                                       output_tensor->mlu_tensor()));
+  } else if (op_type == "fusion_elementwise_add_activation") {
+    auto mid_tensor = graph->AddNode(out_var_name + "_mid",
+                                     x->dims().Vectorize(),
+                                     CNML_TENSOR,
+                                     CNML_NHWC,
+                                     graph->FPType());
+    CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op,
+                                       x_tensor->mlu_tensor(),
+                                       y_tensor->mlu_tensor(),
+                                       mid_tensor->mlu_tensor()));
+  } else if (op_type == "elementwise_sub") {
+    CNML_CALL(cnmlCreateBroadcastSubOp(&elementwise_op,
x_tensor->mlu_tensor(), + y_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + } else if (op_type == "elementwise_mul") { + CNML_CALL(cnmlCreateBroadcastMultOp(&elementwise_op, + x_tensor->mlu_tensor(), + y_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + } else if (op_type == "elementwise_div") { + CNML_CALL(cnmlCreateRealDivOp(&elementwise_op, + x_tensor->mlu_tensor(), + y_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + } else { + LOG(WARNING) << "[MLU] Unsupported op type: " << op_type; + return FAILED; + } + + graph->FuseOp(elementwise_op); + cnmlBaseOp_t act_op; + if (op_type == "fusion_elementwise_add_activation") { + auto mid_tensor = graph->GetNode(out_var_name + "_mid"); + auto type_string = op_info->GetAttr("act_type"); + cnmlActiveFunction_t act_type = OpTypeToCNMLActType(type_string); + CNML_CALL(cnmlCreateActiveOp(&act_op, + act_type, + mid_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + graph->FuseOp(act_op); + } + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_sub, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_div, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); diff --git a/lite/kernels/mlu/bridges/elementwise_ops_test.cc b/lite/kernels/mlu/bridges/elementwise_ops_test.cc new file mode 100644 index 0000000000..388aa68600 --- /dev/null +++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/elementwise_ops.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ElementwiseConverter(void* ctx, OpLite* op); + +template +void elementwise_add_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindTensor("x"); + auto y = scope->FindTensor("y"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + int axis = op_info->GetAttr("axis"); + + if (axis < 0) { + axis += x_dims.size(); + } + int batch = 1; + int channels = y->numel(); + int num = x->numel() / channels / batch; + // do elementwise add/sub/max... 
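+  // NOTE: the reference implementation below assumes Y broadcasts along the
+  // channel axis: channels = y->numel(), and each y element is applied to a
+  // contiguous span of x values.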
+ std::string op_type = op_info->Type(); + if (op_type == "elementwise_add") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_sub") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr - diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_mul") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr * diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_div") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr / diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_max") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = std::max(*din_ptr, diny_data); + dout_ptr++; + din_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Elementwise type: " << op_type; + } +} + +void test_elementwise_add(const std::vector& x_shape, + const std::vector& y_shape, + int axis, + std::string elt_type) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string y_var_name = "y"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* y = scope.Var(y_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(x_shape); + y->Resize(y_shape); + + // initialize input&output data + FillTensor(x, 1, 3); + FillTensor(y, 1, 3); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("elementwise_" + elt_type); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetInput("Y", {y_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + elementwise_add_ref(op); + out_ref->CopyDataFrom(*out); + + LaunchOp(op, {x_var_name, y_var_name}, {out_var_name}); + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, elementwise_add) { + for (auto elt_type : {"add", "sub", "mul", 
"div"}) { + // test_elementwise_add({1, 2, 3, 4}, {2}, 1, elt_type); + // test_elementwise_add({1, 2, 3, 4}, {1, 2, 1, 1}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 3, 4}, 3, elt_type); + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, + elementwise_add, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, + elementwise_sub, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, + elementwise_mul, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, + elementwise_div, + paddle::lite::subgraph::mlu::ElementwiseConverter); diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc new file mode 100644 index 0000000000..43a75daa2b --- /dev/null +++ b/lite/kernels/mlu/bridges/fc_op.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("Input").front(); + auto w_var_name = op_info->Input("W").front(); + auto output_var_name = op_info->Output("Out").front(); + + // int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto w = scope->FindVar(w_var_name)->GetMutable(); + auto x_dims = x->dims(); + auto w_dims = w->dims(); + + CHECK_GE(x_dims.size(), 2UL); + CHECK_EQ(w_dims.size(), 2UL); + + // Create w node + std::vector w_shape{w_dims[1], w_dims[0]}; + auto w_tensor = graph->AddNode( + w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); + + auto input_scale = op_info->GetAttr("input_scale"); + + std::vector output_shape_nhwc({1, 1, 1, w_dims[1]}); + auto output_tensor = graph->AddNode(output_var_name, + output_shape_nhwc, + CNML_TENSOR, + CNML_NHWC, + graph->FPType()); + scope->FindVar(output_var_name) + ->GetMutable<::paddle::lite::Tensor>() + ->Resize(output_shape_nhwc); + + std::string bias_var_name; + std::shared_ptr bias_tensor; + // Add bias node if bias tensor exists + if (HasInputArg(op_info, scope, "Bias")) { + bias_var_name = op_info->Input("Bias").front(); + auto bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_dims = bias->dims(); + CHECK(!graph->HasNode(bias_var_name)); + // CHECK_EQ(bias_dims.production(), n); + + bias_tensor = graph->AddNode(bias_var_name, + bias_dims.Vectorize(), + CNML_CONST, + CNML_CNHW, + graph->FPType()); + 
graph->BindConstData(bias_var_name, bias); + } + cnmlBaseOp_t fc_op; + CNML_CALL(cnmlCreateMlpOp(&fc_op, + graph->GetNode(x_var_name)->mlu_tensor(), + output_tensor->mlu_tensor(), + w_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); + graph->SetComputingDataType( + fc_op, graph->GetNode(x_var_name)->mlu_tensor(), 1 / input_scale); + auto weight_scale = op_info->GetAttr>("weight_scale"); + + // LOG(INFO) << "W precision " << int(w->precision()); + if (w->precision() == PrecisionType::kUnk || + w->precision() == PrecisionType::kInt8) { + std::vector w_dequant(w->data_size()); + dequant(w_dequant.data(), + w->mutable_data(), + 1, + w_dims[1], + w_dims[0], + weight_scale); + for (int i = 0; i < w_dims[1]; i++) { + for (int j = 0; j < w_dims[0]; j++) { + w->mutable_data()[i * w_dims[0] + j] = + w_dequant[i + j * w_dims[1]]; + } + } + w->set_precision(PrecisionType::kFloat); + } else if (w->precision() != PrecisionType::kFloat) { + LOG(FATAL) << "UnSupported weight precision!"; + } + // graph->BindConstData(w_var_name, w_dequant.data()); + graph->BindConstData(w_var_name, w); + + graph->SetComputingDataType( + fc_op, + w_tensor->mlu_tensor(), + 1 / *min_element(weight_scale.begin(), weight_scale.end())); + + graph->FuseOp(fc_op); + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(fc, kMLU, paddle::lite::subgraph::mlu::FCConverter); diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc new file mode 100644 index 0000000000..7e5cfdb32e --- /dev/null +++ b/lite/kernels/mlu/bridges/fc_op_test.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
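+// NOTE: this test dequantizes int8 weights on the CPU, computes a float
+// reference FC, then compares it with the MLU bridge output; weight_scale is
+// a vector with one (identical) entry per output column of W.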
+ +#include "lite/operators/fc_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int FCConverter(void* ctx, OpLite* op); + +void fc_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto input = + scope->FindVar(op_info->Input("Input").front())->GetMutable(); + auto w = scope->FindVar(op_info->Input("W").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int32_t in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + Tensor* bias = nullptr; + float* bias_data = nullptr; + if (op_info->HasInput("Bias")) { + auto bias_var_names = op_info->Input("Bias"); + if (bias_var_names.size() > 0) { + auto bias_var_name = bias_var_names.front(); + bias = scope->FindVar(bias_var_name)->GetMutable(); + bias_data = bias->mutable_data(); + } + } + auto input_data = input->data(); + auto w_data = w->mutable_data(); + auto out_data = out->mutable_data(); + auto in_mat_dims = input->dims().Flatten2D(in_num_col_dims); + int out_num_classes = w->dims()[1]; + const int M = in_mat_dims[0]; + const int K = in_mat_dims[1]; + const int N = out_num_classes; + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + out_data[m * N + n] = 0; + for (int k = 0; k < K; ++k) { + out_data[m * N + n] += input_data[m * K + k] * w_data[k * N + n]; + } + } + } + if (bias_data != nullptr) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + out_data[m * N + n] += bias_data[n]; + } + } + } +} + +void test_fc(const std::vector& input_shape, + const std::vector& w_shape, + int in_num_col_dims, + bool has_bias) { + CHECK_EQ(w_shape.size(), 2UL); + + Scope scope; + std::string input_var_name("Input"); + std::string w_var_name("W"); + std::string w_int_var_name("W_int"); + std::string bias_var_name("Bias"); + std::string out_var_name("Out"); + std::string out_ref_var_name("out_ref"); + auto* input = scope.Var(input_var_name)->GetMutable(); + auto* w = scope.Var(w_var_name)->GetMutable(); + auto* w_int = scope.Var(w_int_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + input->Resize(input_shape); + w->Resize(w_shape); + w_int->Resize(w_shape); + + FillTensor(w_int, -127, 127); + float w_scale = 1. / 1024; + float input_scale = 1. 
/ 8; + + Tensor input_int; + input_int.Resize(input_shape); + FillTensor(&input_int, -127, 127); + for (int i = 0; i < input->data_size(); i++) { + input->mutable_data()[i] = input_int.data()[i] * input_scale; + } + + for (int i = 0; i < w->data_size(); i++) { + w->mutable_data()[i] = w_int->data()[i] * w_scale; + } + + // create fc op + cpp::OpDesc fc_op_desc; + fc_op_desc.SetType("fc"); + fc_op_desc.SetInput("Input", {input_var_name}); + fc_op_desc.SetInput("W", {w_var_name}); + fc_op_desc.SetOutput("Out", {out_var_name}); + fc_op_desc.SetAttr("in_num_col_dims", static_cast(in_num_col_dims)); + if (has_bias) { + auto* bias = scope.Var(bias_var_name)->GetMutable(); + bias->Resize({w_shape[1]}); + FillTensor(bias); + fc_op_desc.SetInput("Bias", {bias_var_name}); + } + + auto fc_op = CreateOp(fc_op_desc, &scope); + fc_ref(fc_op); + out_ref->CopyDataFrom(*out); + + // create fc imlu op + cpp::OpDesc fc_op_desc_mlu; + fc_op_desc_mlu.SetType("fc"); + fc_op_desc_mlu.SetInput("Input", {input_var_name}); + fc_op_desc_mlu.SetInput("W", {w_int_var_name}); + fc_op_desc_mlu.SetOutput("Out", {out_var_name}); + fc_op_desc_mlu.SetAttr("in_num_col_dims", static_cast(in_num_col_dims)); + + fc_op_desc_mlu.SetAttr("weight_scale", + std::vector(w_shape[1], w_scale)); + fc_op_desc_mlu.SetAttr("input_scale", input_scale); + if (has_bias) { + fc_op_desc_mlu.SetInput("Bias", {bias_var_name}); + } + + auto fc_op_mlu = CreateOp(fc_op_desc_mlu, &scope); + input->Resize({static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[1])}); + out->Resize({static_cast(input_shape[0]), static_cast(w_shape[1])}); + LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, fc) { + for (bool use_bias : {true, false}) { + // test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); + // test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); + // test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); + test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias); + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter); diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc new file mode 100644 index 0000000000..27c6ab2597 --- /dev/null +++ b/lite/kernels/mlu/bridges/graph.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
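+// Graph::AddNode creates an MLUTensor node, attaches an optional raw MLU
+// buffer, and registers it under a unique name so converters can look it up.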
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include +#include +#include "lite/kernels/mlu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +std::shared_ptr Graph::AddNode(const std::string& name, + std::vector shape, + cnmlTensorType_t tensor_type, + cnmlDataOrder_t data_order, + cnmlDataType_t mlu_dtype, + void* raw_ptr) { + CHECK(!HasNode(name)); + auto node = std::shared_ptr( + new MLUTensor(shape, tensor_type, data_order, mlu_dtype)); + node->set_mlu_ptr(raw_ptr); + nodes_.insert(std::make_pair(name, node)); + return node; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h new file mode 100644 index 0000000000..140900a2dd --- /dev/null +++ b/lite/kernels/mlu/bridges/graph.h @@ -0,0 +1,166 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "lite/kernels/mlu/bridges/tensor.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +// The Context of the converters which used for converting the ops of subgraph +// to the MLU IR graph +class Graph { + public: + Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); } + + ~Graph() { + CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); + for (auto op : ops_) { + CNML_CALL(cnmlDestroyBaseOp(&op)); + } + } + + // Data node + std::shared_ptr AddNode( + const std::string& name, + std::vector shape, + cnmlTensorType_t tensor_type = CNML_TENSOR, + cnmlDataOrder_t data_order = CNML_NCHW, + cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32, + void* raw_ptr = nullptr); + + std::shared_ptr GetNode(const std::string& name) { + CHECK(HasNode(name)) << "[MLU] Node " << name << " not found."; + return nodes_.at(name); + } + + bool HasNode(const std::string& name) { + return nodes_.find(name) != nodes_.end(); + } + + void AddInput(std::shared_ptr tensor) { + inputs_.push_back(tensor->mlu_tensor()); + input_tensors_.push_back(tensor); + } + + void AddOutput(std::shared_ptr tensor) { + outputs_.push_back(tensor->mlu_tensor()); + output_tensors_.push_back(tensor); + } + + void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); } + + void Compile(cnmlCoreVersion_t core_version, int core_number) { + CNML_CALL(cnmlSetFusionIO(fusion_op_, + inputs_.data(), + inputs_.size(), + outputs_.data(), + outputs_.size())); + CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number)); + CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version)); + CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_)); + for (auto in : input_tensors_) { + input_addrs_.push_back(in->mlu_data()); + } + for (auto out : output_tensors_) { + output_addrs_.push_back(out->mlu_data()); + } + } + + void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { + 
CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
+                                            input_addrs_.data(),
+                                            input_addrs_.size(),
+                                            output_addrs_.data(),
+                                            output_addrs_.size(),
+                                            &forward_param,
+                                            que));
+    CNRT_CALL(cnrtSyncQueue(que));
+  }
+
+  void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
+    const float* data = tensor->data<float>();
+    size_t len = tensor->data_size();
+    if (fp_type_ == CNML_DATA_FLOAT32) {
+      CNML_CALL(cnmlBindConstData_V2(
+          nodes_[tensor_name]->mlu_tensor(),
+          const_cast<void*>(static_cast<const void*>(data)),
+          false));
+    } else if (fp_type_ == CNML_DATA_FLOAT16) {
+      auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>();
+      for (size_t i = 0; i < len; ++i) {
+        data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]);
+      }
+      CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(),
+                                     static_cast<void*>(data_fp16),
+                                     false));
+    } else {
+      CHECK(0);
+    }
+  }
+
+  void SetComputingDataType(cnmlBaseOp_t op,
+                            cnmlTensor_t tensor,
+                            float scale,
+                            cnmlDataType_t data_type = CNML_DATA_INT8) {
+    cnmlQuantizedParam_t quant_param;
+    CNML_CALL(
+        cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0));
+    CNML_CALL(
+        cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param));
+    CNML_CALL(cnmlDestroyQuantizedParam(&quant_param));
+  }
+
+  void SetFPType(::paddle::lite_api::PrecisionType type) {
+    switch (type) {
+      case ::paddle::lite_api::PrecisionType::kFP16:
+        fp_type_ = CNML_DATA_FLOAT16;
+        break;
+      case ::paddle::lite_api::PrecisionType::kFloat:
+        fp_type_ = CNML_DATA_FLOAT32;
+        break;
+      default:
+        CHECK(0);
+    }
+  }
+
+  cnmlDataType_t FPType() { return fp_type_; }
+
+ private:
+  cnmlDataType_t fp_type_{CNML_DATA_FLOAT32};
+  std::unordered_map<std::string, std::shared_ptr<MLUTensor>> nodes_;
+  std::vector<cnmlTensor_t> inputs_;
+  std::vector<cnmlTensor_t> outputs_;
+  std::vector<void*> input_addrs_;
+  std::vector<void*> output_addrs_;
+  std::vector<std::shared_ptr<MLUTensor>> input_tensors_;
+  std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
+  std::vector<cnmlBaseOp_t> ops_;
+  cnmlFusionOp_t fusion_op_;
+};
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h
new file mode 100644
index 0000000000..1b12970afa
--- /dev/null
+++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+USE_SUBGRAPH_BRIDGE(relu, kMLU);
+USE_SUBGRAPH_BRIDGE(conv2d, kMLU);
+USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU);
+USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU);
+USE_SUBGRAPH_BRIDGE(pool2d, kMLU);
+USE_SUBGRAPH_BRIDGE(softmax, kMLU);
+USE_SUBGRAPH_BRIDGE(batch_norm, kMLU);
+USE_SUBGRAPH_BRIDGE(fc, kMLU);
diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc
new file mode 100644
index 0000000000..3119b6c77d
--- /dev/null
+++ b/lite/kernels/mlu/bridges/pool_op.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/pool_op.h" +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +inline cnmlPoolMode_t ToCnmlPoolMode(const std::string& pool_mode) { + cnmlPoolMode_t cnml_pool_mode; + if (pool_mode == "max") { + cnml_pool_mode = CNML_POOL_MAX; + } else if (pool_mode == "avg") { + cnml_pool_mode = CNML_POOL_AVG; + } else { + CHECK(false) << "Unexpected pool mode " << pool_mode; + } + + return cnml_pool_mode; +} + +int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input, and attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindTensor(x_var_name); + auto input_dims_nhwc = x->dims(); + const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); + auto output_var_name = op_info->Output("Out").front(); + auto pooling_type = op_info->GetAttr("pooling_type"); + auto ceil_mode = op_info->GetAttr("ceil_mode"); + auto paddings = op_info->GetAttr>("paddings"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto strides = op_info->GetAttr>("strides"); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + int pad_height = paddings[0]; + int pad_width = paddings[2]; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } + lite::operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + + std::vector output_shape({input_dims[0], input_dims[1]}); + for (size_t i = 0; i < 2; i++) { + output_shape.push_back( + (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[0]) / + strides[i] + + 1); + } + + auto output_shape_nhwc = DimNCHW2NHWC(output_shape); + auto output_tensor = graph->AddNode(output_var_name, + output_shape_nhwc, + CNML_TENSOR, + CNML_NHWC, + graph->FPType()); + scope->FindVar(output_var_name) + ->GetMutable<::paddle::lite::Tensor>() + ->Resize(output_shape_nhwc); + + cnmlPoolOpParam_t pool_param; + CNML_CALL( + cnmlCreatePoolOpParam_V2(&pool_param, + ksize[0], + ksize[1], + strides[0], + strides[1], + pad_height, + pad_width, + 1, // dilation + 1, + ToCnmlPoolMode(pooling_type), + ceil_mode ? 
CNML_POOL_KVALID : CNML_POOL_KFULL, + true, /* real */ + 1 /* blend factor */)); + cnmlBaseOp_t pool_op; + CNML_CALL(cnmlCreatePoolOp(&pool_op, + pool_param, + graph->GetNode(x_var_name)->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyPoolOpParam(&pool_param)); + graph->FuseOp(pool_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kMLU, + paddle::lite::subgraph::mlu::PoolConverter); diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc new file mode 100644 index 0000000000..29ef68781f --- /dev/null +++ b/lite/kernels/mlu/bridges/pool_op_test.cc @@ -0,0 +1,280 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/pool_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int PoolConverter(void* ctx, OpLite* op); + +void pool_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto& in_dims = x->dims(); + auto& out_dims = out->dims(); + + const float* src_ptr = x->data(); + float* dst_ptr = out->mutable_data(); + + std::vector ksize = op_info->GetAttr>("ksize"); + std::vector strides = op_info->GetAttr>("strides"); + std::vector paddings = op_info->GetAttr>("paddings"); + bool exclusive = op_info->GetAttr("exclusive"); + std::string pooling_type = op_info->GetAttr("pooling_type"); + bool global_pooling = op_info->GetAttr("global_pooling"); + + int in_n = in_dims[0]; + int in_c = in_dims[1]; + int in_h = in_dims[2]; + int in_w = in_dims[3]; + int size_in_n = in_c * in_h * in_w; + int size_in_c = in_h * in_w; + + int out_h = out_dims[2]; + int out_w = out_dims[3]; + int size_out_n = in_c * out_h * out_w; + int size_out_c = out_h * out_w; + + int window_h = ksize[0]; + int window_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; + + if (global_pooling == true) { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + const float* src = src_ptr + n * size_in_n + c * size_in_c; + float res = src[0]; + if (pooling_type == "max") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res = cur_val > res ? 
cur_val : res; + } + } else if (pooling_type == "avg") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res += cur_val; + } + res /= size_in_c; + } + dst_ptr[n * size_out_n + c] = res; + } + } + } else { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + for (int h = 0; h < out_h; ++h) { + int sh = h * stride_h; + int eh = sh + window_h; + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; + for (int w = 0; w < out_w; ++w) { + int sw = w * stride_w; + int ew = sw + window_w; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; + int pooling_size = (ew - sw) * (eh - sh); + if (pooling_size == 0) continue; + float res = 0.f; + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; + if (kh == sh && kw == sw) { + res = src_ptr[src_idx]; + } else { + if (pooling_type == "max") { + res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx]; + } + if (pooling_type == "avg") { + res += src_ptr[src_idx]; + } + } + } + } + if (pooling_type == "avg") { + if (exclusive) { + res /= pooling_size; + } else { + res /= window_h * window_w; + } + } + dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; + } + } + } + } + } +} + +void test_pool(int bs, + int ic, + int ih, + int iw, + std::string pooling_type, + bool ceil_mode, + bool global_pooling, + bool exclusive, + int ksize, + int stride, + int padding) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("pool2d"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("pooling_type", pooling_type); + opdesc.SetAttr("ksize", std::vector({ksize, ksize})); + opdesc.SetAttr("global_pooling", global_pooling); + opdesc.SetAttr("exclusive", exclusive); + opdesc.SetAttr("ceil_mode", ceil_mode); + opdesc.SetAttr("strides", std::vector({stride, stride})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + // execute reference implementation and save to output tensor + pool_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + + auto os = out->dims(); + out->Resize({static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize(out->dims()); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 
1e-2); + } +} + +TEST(MLUBridges, pool) { + // for (auto pooling_type : {"max", "avg"}) { + // for (auto ceil_mode : {true, false}) { + // for (auto global_pooling : {/*true, */ false}) { + // for (auto exclusive : {true /*, false*/}) { + // for (auto ksize : {2, 3}) { + // for (auto stride : {1, 2}) { + // for (auto padding : {0, 1}) { + // for (auto bs : {1, 3}) { + // for (auto ic : {1, 3}) { + // for (auto ih : {3, 7}) { + // for (auto iw : {3, 7}) { + // test_pool(bs, + // ic, + // ih, + // iw, + // pooling_type, + // ceil_mode, + // global_pooling, + // exclusive, + // ksize, + // stride, + // padding); + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + + for (auto pooling_type : {"max", "avg"}) { + for (auto ceil_mode : {true, false}) { + bool global_pooling = false; + bool exclusive = true; + int ksize = 2; + int stride = 1; + int padding = 0; + int bs = 6; + int ic = 6; + int ih = 6; + int iw = 6; + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + global_pooling, + exclusive, + ksize, + stride, + padding); + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, + pool2d, + paddle::lite::subgraph::mlu::PoolConverter); diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc new file mode 100644 index 0000000000..b9e2b1116d --- /dev/null +++ b/lite/kernels/mlu/bridges/softmax_op.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
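+// NOTE: inputs reaching the MLU subgraph are laid out as NHWC, so the
+// converter below remaps the Paddle (NCHW) softmax axis to its NHWC
+// position before creating the CNML softmax op.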
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get op's attributes + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // nchw axis to nhwc aixs + int nchw_to_nhwc_aixs_map[4] = {0, 3, 1, 2}; + int axis = 1; + if (op_info->HasAttr("axis")) { + axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis = output_dims.size() + axis; + } + } + + int nhwc_axis = nchw_to_nhwc_aixs_map[axis]; + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); + cnmlBaseOp_t softmax_op; + CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op, + nhwc_axis, + graph->GetNode(x_var_name)->mlu_tensor(), + output_tensor->mlu_tensor())); + graph->FuseOp(softmax_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(softmax, + kMLU, + paddle::lite::subgraph::mlu::SoftmaxConverter); diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc new file mode 100644 index 0000000000..7ceb050d80 --- /dev/null +++ b/lite/kernels/mlu/bridges/softmax_op_test.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/softmax_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SoftmaxConverter(void* ctx, OpLite* op); + +template +void softmax_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + + auto x_rank = x_dims.size(); + int axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis += x_rank; + } + int axis_size = x_dims[axis]; + int outer_num = x_dims.Slice(0, axis).production(); + int inner_num = x_dims.Slice(axis + 1, x_rank).production(); + int compute_size = outer_num * inner_num; + for (int i = 0; i < compute_size; i++) { + int idx_inner = i % inner_num; + int idx_outer = (i / inner_num) * axis_size; + int start = idx_outer * inner_num + idx_inner; + int offset; + + offset = start; + dtype max_data = std::numeric_limits::lowest(); + for (int j = 0; j < axis_size; j++) { + max_data = x_data[offset] > max_data ? x_data[offset] : max_data; + offset += inner_num; + } + + offset = start; + dtype sum_data = (dtype)0; + for (int j = 0; j < axis_size; j++) { + out_data[offset] = exp(x_data[offset] - max_data); + sum_data += out_data[offset]; + offset += inner_num; + } + + offset = start; + for (int j = 0; j < axis_size; j++) { + out_data[offset] /= sum_data; + offset += inner_num; + } + } +} + +void test_softmax(const std::vector& input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("softmax"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + // execute reference implementation and save to output tensor + softmax_ref(op); + out_ref->CopyDataFrom(*out); + + int bs = x->dims()[0]; + int ic = x->dims()[1]; + int ih = x->dims()[2]; + int iw = x->dims()[3]; + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + + out->Resize({bs, ih, iw, ic}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize({bs, ic, ih, iw}); + transpose(out_data, + output_trans.mutable_data(), + {bs, ih, iw, ic}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, softmax) { + // test_softmax({1, 4}, -1); + // // Bug exists in HiAI DDK when the number of items > 16500 + // 
test_softmax({1, 16500}, -1); + // test_softmax({1, 4}, 0); + // test_softmax({1, 4}, 1); + // test_softmax({3, 4}, -1); + // test_softmax({3, 4}, 0); + // test_softmax({3, 4}, 1); + // test_softmax({1, 4, 7}, -1); + // test_softmax({1, 4, 7}, 0); + // // Bug exists in HiAI DDK when axis is 1 and iw > 1 + // // test_softmax({1, 4, 7}, 1); + // test_softmax({1, 4, 1}, 1); + // test_softmax({1, 4, 7}, 2); + // test_softmax({3, 4, 7}, -1); + // test_softmax({3, 4, 7}, 0); + // test_softmax({3, 4, 1}, 1); + // test_softmax({3, 4, 7}, 2); + test_softmax({1, 4, 7, 9}, -1); + test_softmax({1, 4, 7, 9}, 0); + test_softmax({1, 4, 7, 9}, 1); + // Bug exists in HiAI DDK when axis is 2 and iw > 1 + // test_softmax({1, 4, 7, 9}, 2); + test_softmax({1, 4, 7, 1}, 2); + test_softmax({1, 4, 7, 9}, 3); + test_softmax({3, 4, 7, 9}, -1); + test_softmax({3, 4, 7, 9}, 0); + test_softmax({3, 4, 7, 9}, 1); + test_softmax({3, 4, 7, 1}, 2); + test_softmax({3, 4, 7, 9}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(softmax, + kMLU, + paddle::lite::subgraph::mlu::SoftmaxConverter); diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc new file mode 100644 index 0000000000..be7e1f09be --- /dev/null +++ b/lite/kernels/mlu/bridges/tensor.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/tensor.h" +#include <climits> +#include <vector> + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +MLUTensor::MLUTensor(const std::vector<int64_t>& shape, + cnmlTensorType_t tensor_type, + cnmlDataOrder_t data_order, + cnmlDataType_t mlu_dtype) + : mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) { + std::vector<int> int_shape; + for (auto i : shape) { + if (i <= INT_MAX) { + int_shape.push_back(i); + } else { + LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!"; + } + } + remember(int_shape, tensor_type, mlu_dtype, data_order); +} + +void MLUTensor::remember(const std::vector<int>& shape, + cnmlTensorType_t tensor_type, + cnmlDataType_t mlu_dtype, + cnmlDataOrder_t shape_order) { + tensor_type_ = tensor_type; + mlu_dtype_ = mlu_dtype; + + int size = 4; + if (shape.size() > 4 || shape_order == CNML_ARRAY) { + size = shape.size(); + } + shape_.resize(size); + if (shape.size() <= 4) { + switch (shape_order) { + case CNML_NCHW: + shape_[0] = shape.size() > 0 ? shape[0] : 1; + shape_[3] = shape.size() > 1 ? shape[1] : 1; + shape_[1] = shape.size() > 2 ? shape[2] : 1; + shape_[2] = shape.size() > 3 ? shape[3] : 1; + break; + case CNML_NCWH: + shape_[0] = shape.size() > 0 ? shape[0] : 1; + shape_[3] = shape.size() > 1 ? shape[1] : 1; + shape_[2] = shape.size() > 3 ? shape[3] : 1; + shape_[1] = shape.size() > 2 ? shape[2] : 1; + break; + case CNML_NHWC: + shape_[0] = shape.size() > 0 ? shape[0] : 1; + shape_[3] = shape.size() > 3 ?
shape[3] : 1; + shape_[1] = shape.size() > 1 ? shape[1] : 1; + shape_[2] = shape.size() > 2 ? shape[2] : 1; + break; + case CNML_NHCW: + shape_[0] = shape.size() > 0 ? shape[0] : 1; + shape_[3] = shape.size() > 2 ? shape[2] : 1; + shape_[1] = shape.size() > 1 ? shape[1] : 1; + shape_[2] = shape.size() > 3 ? shape[3] : 1; + break; + case CNML_NWCH: + shape_[0] = shape.size() > 0 ? shape[0] : 1; + shape_[3] = shape.size() > 2 ? shape[2] : 1; + shape_[1] = shape.size() > 3 ? shape[3] : 1; + shape_[2] = shape.size() > 1 ? shape[1] : 1; + break; + case CNML_NWHC: + shape_[0] = shape.size() > 0 ? shape[0] : 1; + shape_[3] = shape.size() > 3 ? shape[3] : 1; + shape_[1] = shape.size() > 2 ? shape[2] : 1; + shape_[2] = shape.size() > 1 ? shape[1] : 1; + break; + case CNML_CNHW: + shape_[0] = shape.size() > 1 ? shape[1] : 1; + shape_[3] = shape.size() > 0 ? shape[0] : 1; + shape_[1] = shape.size() > 2 ? shape[2] : 1; + shape_[2] = shape.size() > 3 ? shape[3] : 1; + break; + case CNML_CNWH: + shape_[0] = shape.size() > 1 ? shape[1] : 1; + shape_[3] = shape.size() > 0 ? shape[0] : 1; + shape_[1] = shape.size() > 3 ? shape[3] : 1; + shape_[2] = shape.size() > 2 ? shape[2] : 1; + break; + case CNML_CHWN: + shape_[0] = shape.size() > 3 ? shape[3] : 1; + shape_[3] = shape.size() > 0 ? shape[0] : 1; + shape_[1] = shape.size() > 1 ? shape[1] : 1; + shape_[2] = shape.size() > 2 ? shape[2] : 1; + break; + case CNML_CHNW: + shape_[0] = shape.size() > 2 ? shape[2] : 1; + shape_[3] = shape.size() > 0 ? shape[0] : 1; + shape_[1] = shape.size() > 1 ? shape[1] : 1; + shape_[2] = shape.size() > 3 ? shape[3] : 1; + break; + case CNML_CWNH: + shape_[0] = shape.size() > 2 ? shape[2] : 1; + shape_[3] = shape.size() > 0 ? shape[0] : 1; + shape_[1] = shape.size() > 3 ? shape[3] : 1; + shape_[2] = shape.size() > 1 ? shape[1] : 1; + break; + case CNML_CWHN: + shape_[0] = shape.size() > 3 ? shape[3] : 1; + shape_[3] = shape.size() > 0 ? shape[0] : 1; + shape_[1] = shape.size() > 2 ? shape[2] : 1; + shape_[2] = shape.size() > 1 ? shape[1] : 1; + break; + case CNML_HNCW: + shape_[0] = shape.size() > 1 ? shape[1] : 1; + shape_[3] = shape.size() > 2 ? shape[2] : 1; + shape_[1] = shape.size() > 0 ? shape[0] : 1; + shape_[2] = shape.size() > 3 ? shape[3] : 1; + break; + case CNML_HNWC: + shape_[0] = shape.size() > 1 ? shape[1] : 1; + shape_[3] = shape.size() > 3 ? shape[3] : 1; + shape_[1] = shape.size() > 0 ? shape[0] : 1; + shape_[2] = shape.size() > 2 ? shape[2] : 1; + break; + case CNML_HCWN: + shape_[0] = shape.size() > 3 ? shape[3] : 1; + shape_[3] = shape.size() > 1 ? shape[1] : 1; + shape_[1] = shape.size() > 0 ? shape[0] : 1; + shape_[2] = shape.size() > 2 ? shape[2] : 1; + break; + case CNML_HCNW: + shape_[0] = shape.size() > 2 ? shape[2] : 1; + shape_[3] = shape.size() > 1 ? shape[1] : 1; + shape_[1] = shape.size() > 0 ? shape[0] : 1; + shape_[2] = shape.size() > 3 ? shape[3] : 1; + break; + case CNML_HWNC: + shape_[0] = shape.size() > 2 ? shape[2] : 1; + shape_[3] = shape.size() > 3 ? shape[3] : 1; + shape_[1] = shape.size() > 0 ? shape[0] : 1; + shape_[2] = shape.size() > 1 ? shape[1] : 1; + break; + case CNML_HWCN: + shape_[0] = shape.size() > 3 ? shape[3] : 1; + shape_[3] = shape.size() > 2 ? shape[2] : 1; + shape_[1] = shape.size() > 0 ? shape[0] : 1; + shape_[2] = shape.size() > 1 ? shape[1] : 1; + break; + case CNML_WNCH: + shape_[0] = shape.size() > 1 ? shape[1] : 1; + shape_[3] = shape.size() > 2 ? shape[2] : 1; + shape_[1] = shape.size() > 3 ? shape[3] : 1; + shape_[2] = shape.size() > 0 ? 
shape[0] : 1; + break; + case CNML_WNHC: + shape_[0] = shape.size() > 1 ? shape[1] : 1; + shape_[3] = shape.size() > 3 ? shape[3] : 1; + shape_[1] = shape.size() > 2 ? shape[2] : 1; + shape_[2] = shape.size() > 0 ? shape[0] : 1; + break; + case CNML_WCHN: + shape_[0] = shape.size() > 3 ? shape[3] : 1; + shape_[3] = shape.size() > 1 ? shape[1] : 1; + shape_[1] = shape.size() > 2 ? shape[2] : 1; + shape_[2] = shape.size() > 0 ? shape[0] : 1; + break; + case CNML_WCNH: + shape_[0] = shape.size() > 2 ? shape[2] : 1; + shape_[3] = shape.size() > 1 ? shape[1] : 1; + shape_[1] = shape.size() > 3 ? shape[3] : 1; + shape_[2] = shape.size() > 0 ? shape[0] : 1; + break; + case CNML_WHNC: + shape_[0] = shape.size() > 2 ? shape[2] : 1; + shape_[3] = shape.size() > 3 ? shape[3] : 1; + shape_[1] = shape.size() > 1 ? shape[1] : 1; + shape_[2] = shape.size() > 0 ? shape[0] : 1; + break; + case CNML_WHCN: + shape_[0] = shape.size() > 3 ? shape[3] : 1; + shape_[3] = shape.size() > 2 ? shape[2] : 1; + shape_[1] = shape.size() > 1 ? shape[1] : 1; + shape_[2] = shape.size() > 0 ? shape[0] : 1; + break; + case CNML_ARRAY: + shape_ = shape; + break; + default: + LOG(FATAL) << "Unsupported mluDataOrder! " << int(shape_order); + break; + } + } else { + switch (shape_order) { + case CNML_NCDHW: + shape_[0] = shape[0]; + shape_[4] = shape[1]; + shape_[1] = shape[2]; + shape_[2] = shape[3]; + shape_[3] = shape[4]; + break; + case CNML_NDHWC: + shape_[0] = shape[0]; + shape_[4] = shape[4]; + shape_[1] = shape[1]; + shape_[2] = shape[2]; + shape_[3] = shape[3]; + break; + case CNML_DHWCN: + shape_[0] = shape[4]; + shape_[4] = shape[3]; + shape_[1] = shape[0]; + shape_[2] = shape[1]; + shape_[3] = shape[2]; + break; + case CNML_ARRAY: + shape_ = shape; + break; + default: + shape_[0] = shape[0]; + shape_[4] = shape[1]; + shape_[1] = shape[2]; + shape_[2] = shape[3]; + shape_[3] = shape[4]; + break; + } + } + dim_ = shape_.size(); +} + +void MLUTensor::Create() { + if (mlu_tensor_ == nullptr) { + CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_)); + std::vector<int> dim_shape(shape_); + int* dim_strides = nullptr; + CNML_CALL(cnmlSetTensorShape_V2( + mlu_tensor_, dim_, dim_shape.data(), dim_strides)); + CNML_CALL(cnmlSetTensorDataType(mlu_tensor_, mlu_dtype_)); + } +} + +cnmlTensor_t MLUTensor::mlu_tensor() { + Create(); + return mlu_tensor_; +} + +MLUTensor::~MLUTensor() { + if (mlu_tensor_ != nullptr) { + CNML_CALL(cnmlDestroyTensor(&mlu_tensor_)); + mlu_tensor_ = nullptr; + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h new file mode 100644 index 0000000000..7bb2e1b203 --- /dev/null +++ b/lite/kernels/mlu/bridges/tensor.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
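[Editor's note on the case list above: every 4-D order lands in the same internal slot convention — shape_[0]=N, shape_[1]=H, shape_[2]=W, shape_[3]=C — which is why the NCHW case reads {s[0], s[2], s[3], s[1]}. A compact sketch of that canonicalization, under that reading; CanonicalizeNchw is a hypothetical helper, not the patch's API:]

#include <cassert>
#include <vector>

// Reorder an NCHW shape into the N/H/W/C slots used by MLUTensor::remember().
std::vector<int> CanonicalizeNchw(const std::vector<int>& s) {
  return {s[0], s[2], s[3], s[1]};  // slots: N, H, W, C
}

int main() {
  // NCHW {N=2, C=16, H=8, W=4} is stored internally as {2, 8, 4, 16}
  assert((CanonicalizeNchw({2, 16, 8, 4}) == std::vector<int>{2, 8, 4, 16}));
  return 0;
}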
+ +#pragma once + +#include <vector> +#include "lite/kernels/mlu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +class MLUTensor { + public: + MLUTensor() + : mlu_tensor_(nullptr), + tensor_type_(CNML_TENSOR), + mlu_dtype_(CNML_DATA_FLOAT32), + mlu_ptr_(nullptr) {} + + void set_mlu_ptr(void* mlu_data) { mlu_ptr_ = mlu_data; } + + MLUTensor(const std::vector<int64_t>& shape, + cnmlTensorType_t tensor_type = CNML_TENSOR, + cnmlDataOrder_t data_order = CNML_NCHW, + cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32); + + void remember(const std::vector<int>& shape, + cnmlTensorType_t tensor_type, + cnmlDataType_t mlu_dtype, + cnmlDataOrder_t shape_order); + void Create(); + cnmlTensor_t mlu_tensor(); + void* mlu_data() { + CHECK(mlu_ptr_ != nullptr); + return mlu_ptr_; + } + + ~MLUTensor(); + + private: + cnmlTensor_t mlu_tensor_; + + std::vector<int> shape_; + cnmlTensorType_t tensor_type_; + cnmlDataType_t mlu_dtype_; + int dim_{0}; + cnmlDataOrder_t data_order_; + void* mlu_ptr_; +}; + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc new file mode 100644 index 0000000000..cf2d7bd6c1 --- /dev/null +++ b/lite/kernels/mlu/bridges/test_helper.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "lite/kernels/mlu/bridges/test_helper.h" +#include +#include "lite/core/device_info.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/mlu/subgraph_compute.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void LaunchOp(const std::shared_ptr op, + const std::vector& input_var_names, + const std::vector& output_var_names) { + CNRT_CALL(cnrtInit(0)); + SetMluDevice(0); + cnrtQueue_t queue_; + cnrtInvokeFuncParam_t forward_param; + u32_t affinity = 1; + int data_param = 1; + forward_param.data_parallelism = &data_param; + forward_param.affinity = &affinity; + forward_param.end = CNRT_PARAM_END; + CNRT_CALL(cnrtCreateQueue(&queue_)); + cnrtDev_t dev_handle; + CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0)); + CNRT_CALL(cnrtSetCurrentDevice(dev_handle)); + auto scope = op->scope(); + auto op_type = op->op_info()->Type(); + paddle::lite::subgraph::mlu::Graph graph; + // convert op to IR graph + const auto& bridges = subgraph::Registry::Instance(); + CHECK(bridges.Exists(op_type, TARGET(kMLU))); + + // Convert all of input data vars and added into the MLU IR graph + for (auto& input_name : input_var_names) { + auto input_tensor = scope->FindMutableTensor(input_name); + CHECK(input_tensor); + Tensor temp_input; + temp_input.Resize(input_tensor->dims().Vectorize()); + temp_input.CopyDataFrom(*input_tensor); + auto input_node = + graph.AddNode(input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NHWC, + graph.FPType(), + reinterpret_cast( + input_tensor->mutable_data(TARGET(kMLU)))); + CHECK(input_node); + CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data(), + temp_input.mutable_data(), + sizeof(float) * input_tensor->dims().production(), + CNRT_MEM_TRANS_DIR_HOST2DEV)); + } + bridges.Select(op_type, TARGET(kMLU))( + reinterpret_cast(&graph), const_cast(op.get()), nullptr); + + for (auto& output_name : output_var_names) { + if (graph.HasNode(output_name)) { + graph.AddOutput(graph.GetNode(output_name)); + } + auto output_tensor = scope->FindMutableTensor(output_name); + void* p_data = + static_cast(output_tensor->mutable_data(TARGET(kMLU))); + auto node = graph.GetNode(output_name); + CHECK(p_data); + node->set_mlu_ptr(p_data); + } + for (auto& input_name : input_var_names) { + graph.AddInput(graph.GetNode(input_name)); + } + + graph.Compile(CNML_MLU270, 1); + + graph.Compute(forward_param, queue_); + for (auto& output_name : output_var_names) { + auto output_tensor = scope->FindMutableTensor(output_name); + Tensor temp_out; + temp_out.Resize(output_tensor->dims().Vectorize()); + CNRT_CHECK(cnrtMemcpy(temp_out.mutable_data(TARGET(kHost)), + output_tensor->mutable_data(), + sizeof(float) * output_tensor->dims().production(), + CNRT_MEM_TRANS_DIR_DEV2HOST)); + output_tensor->mutable_data(TARGET(kHost)); + output_tensor->CopyDataFrom(temp_out); + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +// USE_LITE_OP(graph_op); +// USE_LITE_KERNEL(graph_op, kMLU, kFloat, kNHWC, def); diff --git a/lite/kernels/mlu/bridges/test_helper.h b/lite/kernels/mlu/bridges/test_helper.h new file mode 100644 index 0000000000..4da9e72dfc --- /dev/null +++ b/lite/kernels/mlu/bridges/test_helper.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <memory> +#include <random> +#include <string> +#include <vector> +#include "lite/core/op_lite.h" +#include "lite/kernels/mlu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template <typename T> +std::shared_ptr<T> CreateOp(const cpp::OpDesc& opdesc, lite::Scope* scope) { + auto op = std::make_shared<T>(opdesc.Type()); + op->SetValidPlaces( + {Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)}}); + CHECK(op->Attach(opdesc, scope)); + CHECK(op->CheckShape()); + CHECK(op->InferShape()); + return op; +} + +// T is the target data type +// R is the range data type, e.g. int, half +template <typename T, typename R = T> +void FillTensor(Tensor* x, + T lower = static_cast<T>(-2), + T upper = static_cast<T>(2)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution<double> uniform_dist(0, 1); + + T* x_data = x->mutable_data<T>(); + for (int i = 0; i < x->dims().production(); ++i) { + auto r = uniform_dist(rng) * (upper - lower) + lower; + x_data[i] = static_cast<T>(static_cast<R>(r)); + } +} + +void LaunchOp(const std::shared_ptr<lite::OpLite> op, + const std::vector<std::string>& input_var_names, + const std::vector<std::string>& output_var_names); + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/bridges/utility.cc b/lite/kernels/mlu/bridges/utility.cc new file mode 100644 index 0000000000..f18a46518c --- /dev/null +++ b/lite/kernels/mlu/bridges/utility.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
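[Editor's note on FillTensor's two template parameters above: values are drawn uniformly from [lower, upper), but each one round-trips through the range type R before being stored as T, so FillTensor<float, int> produces integer-valued float data — convenient for quantization-style tests. A standalone sketch of the same recipe; Fill is an invented stand-in, not the patch's helper:]

#include <random>
#include <vector>

template <typename T, typename R = T>
void Fill(std::vector<T>* buf, T lower = static_cast<T>(-2), T upper = static_cast<T>(2)) {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> dist(0, 1);
  for (auto& v : *buf) {
    // stretch [0,1) to [lower, upper), then truncate through R before storing as T
    v = static_cast<T>(static_cast<R>(dist(rng) * (upper - lower) + lower));
  }
}

int main() {
  std::vector<float> x(8);
  Fill<float, int>(&x);  // only whole values in (-2, 2) end up in x
  return 0;
}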
+ +#include "lite/kernels/mlu/bridges/utility.h" +#include + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void transpose(float* input_data, + float* output_data, + std::vector input_shape, + std::vector axis) { + int old_index = -1; + int new_index = -1; + int dim[4] = {0}; + std::vector shape = input_shape; + for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] + + dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3]; + new_index = + dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; + output_data[new_index] = input_data[old_index]; + } + } + } + } +} + +int scale2position(float scale) { return static_cast(-std::log2(scale)); } + +void dequant(float* dst, int8_t* src, size_t size, float scale) { + for (size_t i = 0; i < size; ++i) { + dst[i] = static_cast(src[i]) * scale; + } +} + +void dequant(float* dst, + int8_t* src, + size_t size_o, + size_t size, + size_t size_in, + std::vector scales) { + for (int out = 0; out < size_o; ++out) { + for (int s = 0; s < size; ++s) { + auto scale = scales[s]; + for (int in = 0; in < size_in; ++in) { + int idx = in + s * size_in + out * size_in * size; + dst[idx] = static_cast(src[idx]) * scale; + } + } + } +} + +cnmlActiveFunction_t OpTypeToCNMLActType(std::string op_type) { + if (op_type == "relu") { + return CNML_ACTIVE_RELU; + } else if (op_type == "sigmoid") { + return CNML_ACTIVE_SIGMOID; + } else if (op_type == "tanh") { + return CNML_ACTIVE_TANH; + } else if (op_type == "relu1") { + return CNML_ACTIVE_RELU1; + } else if (op_type == "relu6") { + return CNML_ACTIVE_RELU6; + } else if (op_type == "hard_sigmoid") { + return CNML_ACTIVE_HARD_SIGMOID; + } + LOG(FATAL) << "CNML Unspoorted op type " << op_type; + return CNML_ACTIVE_NONE; +} + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h new file mode 100644 index 0000000000..2af8274e07 --- /dev/null +++ b/lite/kernels/mlu/bridges/utility.h @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <memory> +#include <string> +#include <vector> +#include "lite/backends/mlu/mlu_utils.h" +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void transpose(float* input_data, + float* output_data, + std::vector<int> input_shape, + std::vector<int> axis); +int scale2position(float scale); +void dequant(float* dst, int8_t* src, size_t size, float scale); + +void dequant(float* dst, + int8_t* src, + size_t size_o, + size_t size, + size_t size_in, + std::vector<float> scales); + +template <typename T> +std::vector<T> recip(std::vector<T> x); +// Type/tensor converters for converting Paddle type/tensor to MLU type/tensor +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +cnmlActiveFunction_t OpTypeToCNMLActType(std::string op_type); + +inline const ::paddle::lite::DDimLite DimNHWC2NCHW( + const ::paddle::lite::DDimLite& dim) { + return ::paddle::lite::DDimLite( + std::vector<int64_t>({dim[0], dim[3], dim[1], dim[2]})); +} + +inline const ::paddle::lite::DDimLite DimNCHW2NHWC( + const ::paddle::lite::DDimLite& dim) { + return ::paddle::lite::DDimLite( + std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]})); +} + +inline const std::vector<int64_t> DimNHWC2NCHW( + const std::vector<int64_t>& dim) { + return std::vector<int64_t>({dim[0], dim[3], dim[1], dim[2]}); +} + +inline const std::vector<int64_t> DimNCHW2NHWC( + const std::vector<int64_t>& dim) { + return std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]}); +} + +template <paddle::lite_api::PrecisionType> +struct FPTypeTraits {}; + +template <> +struct FPTypeTraits<PRECISION(kFloat)> { + typedef float T; +}; + +template <> +struct FPTypeTraits<PRECISION(kFP16)> { + typedef ::paddle::lite::fluid::float16 T; +}; + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/calib_compute.cc b/lite/kernels/mlu/calib_compute.cc new file mode 100644 index 0000000000..a3be9968bd --- /dev/null +++ b/lite/kernels/mlu/calib_compute.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
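[Editor's note: the DimNCHW2NHWC/DimNHWC2NCHW helpers above are inverse permutations of each other; a small standalone check of that property, with stand-in functions mirroring the header's vector overloads:]

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> ToNHWC(const std::vector<int64_t>& d) { return {d[0], d[2], d[3], d[1]}; }
std::vector<int64_t> ToNCHW(const std::vector<int64_t>& d) { return {d[0], d[3], d[1], d[2]}; }

int main() {
  std::vector<int64_t> nchw = {1, 3, 224, 224};  // a typical ResNet50 input
  assert((ToNHWC(nchw) == std::vector<int64_t>{1, 224, 224, 3}));
  assert(ToNCHW(ToNHWC(nchw)) == nchw);  // round trip restores the original
  return 0;
}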
+ +#include "lite/kernels/mlu/calib_compute.h" +#include +#include "lite/backends/arm/math/type_trans.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace mlu { + +void CalibComputeFp32ToInt8::Run() { + // auto& param = this->Param(); + // std::vector scale = {param.scale}; + // const auto* din = param.input->data(); + // auto* dout = param.output->mutable_data(); + // lite::arm::math::fp32_to_int8( + // din, dout, scale.data(), 1, 1, param.input->numel()); + // return; +} + +void CalibComputeInt8ToFp32::Run() { + // auto& param = this->Param(); + // const auto* din = param.input->data(); + // std::vector scale = {param.scale}; + // auto* dout = param.output->mutable_data(); + // lite::arm::math::int8_to_fp32( + // din, dout, scale.data(), 1, 1, param.input->numel()); + // return; +} + +} // namespace mlu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(calib, + kMLU, + kInt8, + kNCHW, + paddle::lite::kernels::mlu::CalibComputeFp32ToInt8, + fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))}) + .Finalize(); + +REGISTER_LITE_KERNEL(calib, + kMLU, + kInt8, + kNCHW, + paddle::lite::kernels::mlu::CalibComputeInt8ToFp32, + int8_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))}) + .Finalize(); +REGISTER_LITE_KERNEL(calib_once, + kMLU, + kInt8, + kNCHW, + paddle::lite::kernels::mlu::CalibComputeFp32ToInt8, + fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))}) + .Finalize(); + +REGISTER_LITE_KERNEL(calib_once, + kMLU, + kInt8, + kNCHW, + paddle::lite::kernels::mlu::CalibComputeInt8ToFp32, + int8_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/mlu/calib_compute.h b/lite/kernels/mlu/calib_compute.h new file mode 100644 index 0000000000..3c5988c165 --- /dev/null +++ b/lite/kernels/mlu/calib_compute.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "lite/core/kernel.h" +#include "lite/operators/calib_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace mlu { + +class CalibComputeFp32ToInt8 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeFp32ToInt8() override{}; + + private: +}; + +class CalibComputeInt8ToFp32 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt8ToFp32() override{}; + + private: +}; + +} // namespace mlu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc new file mode 100644 index 0000000000..bc6e1838d7 --- /dev/null +++ b/lite/kernels/mlu/io_copy_compute.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2019 Cambricon Authors. All Rights Reserved. + +#include +#include "lite/backends/mlu/target_wrapper.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace mlu { + +using TargetW = TargetWrapper; + +// Host to MLU memory. +void CopyFromHostSync(void* target, const void* source, size_t size) { + TargetW::MemcpySync(target, source, size, IoDirection::HtoD); +} + +// MLU to Host memory. +void CopyToHostSync(void* target, const void* source, size_t size) { + TargetW::MemcpySync(target, source, size, IoDirection::DtoH); +} + +/* + * This kernel copies a tensor from host to MLU space. + */ +template +class IoCopyHostToMluCompute + : public KernelLite { + public: + using handler_t = KernelBase::type_infer_handler_t; + using param_t = operators::IoCopyParam; + + void Run() override { + auto& param = this->template Param(); + CHECK(param.x->target() == TARGET(kHost) || + param.x->target() == TARGET(kX86)); + auto mem_size = param.x->memory_size(); + // LOG(INFO) << "copy size " << mem_size; + auto* data = param.y->mutable_data(TARGET(kMLU), mem_size); + CopyFromHostSync(data, param.x->raw_data(), mem_size); + } + + std::unique_ptr GetTypeInferHandler() override { + std::unique_ptr res(new handler_t); + *res = [](const std::map& inputs, + const std::string& out) -> const Type* { + CHECK(!inputs.empty()); + auto* type = inputs.at("Input"); + CHECK(type->target() == TARGET(kHost)); + + auto out_place = type->place(); + out_place.target = TARGET(kMLU); + auto* out_type = Type::Get(type->id(), + out_place.target, + out_place.precision, + out_place.layout, + out_place.device); + return out_type; + }; + return res; + } + + std::string doc() const override { return "Copy IO from HOST to MLU"; } +}; + +/* + * This kernel copies a tensor from MLU to host space. 
+ */ +template <lite_api::PrecisionType Precision> +class IoCopyMluToHostCompute + : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> { + public: + void Run() override { + auto& param = this->template Param<operators::IoCopyParam>(); + CHECK(param.x->target() == TARGET(kMLU)); + auto mem_size = param.x->memory_size(); + auto* data = param.y->mutable_data(TARGET(kHost), mem_size); + CopyToHostSync(data, param.x->raw_data(), mem_size); + } + + std::string doc() const override { return "Copy IO from MLU to HOST"; } +}; + +} // namespace mlu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kFloat, + kNHWC, + paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFloat)>, + host_to_device_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kFP16, + kNHWC, + paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFP16)>, + host_to_device_kFP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kFloat, + kNHWC, + paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFloat)>, + device_to_host_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kFP16, + kNHWC, + paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFP16)>, + device_to_host_kFP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + +// REGISTER_LITE_KERNEL(io_copy_once, +// kMLU, +// kFloat, +// kNHWC, +// paddle::lite::kernels::mlu::IoCopyHostToMluCompute, +// host_to_device) +// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) +// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) +// .Finalize(); +// +// REGISTER_LITE_KERNEL(io_copy_once, +// kMLU, +// kFloat, +// kNHWC, +// paddle::lite::kernels::mlu::IoCopyMluToHostCompute, +// device_to_host) +// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) +// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) +// .Finalize(); diff --git a/lite/kernels/mlu/subgraph_compute.cc b/lite/kernels/mlu/subgraph_compute.cc new file mode 100644 index 0000000000..73ca9dcc20 --- /dev/null +++ b/lite/kernels/mlu/subgraph_compute.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
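[Editor's note: taken together, the io_copy kernels above are just synchronous, direction-tagged copies through the target wrapper. A host-only stand-in showing the round trip — plain memcpy in place of the device copies; every name here is illustrative:]

#include <cstring>
#include <vector>

enum class IoDirection { HtoD, DtoH };

// Stand-in for TargetWrapper<kMLU>::MemcpySync: same call shape, host memory only.
void MemcpySync(void* dst, const void* src, size_t size, IoDirection /*dir*/) {
  std::memcpy(dst, src, size);
}

int main() {
  std::vector<float> host = {1, 2, 3, 4}, device(4), back(4);
  MemcpySync(device.data(), host.data(), 4 * sizeof(float), IoDirection::HtoD);
  MemcpySync(back.data(), device.data(), 4 * sizeof(float), IoDirection::DtoH);
  return back == host ? 0 : 1;  // round trip preserves the payload
}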
+ +#include "lite/kernels/mlu/subgraph_compute.h" +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/kernels/mlu/bridges/paddle_use_bridges.h" +#include "lite/kernels/mlu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace mlu {} // namespace mlu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + subgraph, + kMLU, + kFloat, + kNHWC, + paddle::lite::kernels::mlu::SubgraphCompute, + def_kFloat) + .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + subgraph, + kMLU, + kFP16, + kNHWC, + paddle::lite::kernels::mlu::SubgraphCompute, + def_FP16) + .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .Finalize(); diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h new file mode 100644 index 0000000000..06fc791fe7 --- /dev/null +++ b/lite/kernels/mlu/subgraph_compute.h @@ -0,0 +1,168 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/core/types.h" +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/npu/bridges/engine.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace mlu { + +template +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext* ctx, + int block_idx, + cpp::BlockDesc* block_desc, + const std::vector& input_names, + const std::vector& output_names, + Scope* scope, + ::paddle::lite_api::PrecisionType type) + : subgraph::Engine( + ctx, block_idx, block_desc, input_names, output_names, scope) { + graph_.SetFPType(type); + } + + protected: + int BuildDeviceProgram() override { + int status = 0; + // Convert all of input data vars and added into the MLU IR graph + for (auto& input_name : input_names_) { + auto input_tensor = scope_->FindMutableTensor(input_name); + CHECK(input_tensor); + auto input_node = + graph_.AddNode(input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NHWC, + graph_.FPType(), + const_cast(input_tensor->raw_data())); + CHECK(input_node); + // MLU doesn't support dynamic dimensions/shapes, so need to rebuild + // the program when the shape of any input tensor is changed. 
+ status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; + } + LOG(INFO) << "START TO CONVERT "; + // Convert all ops and their weights and add them into the MLU IR graph + const auto& bridges = subgraph::Registry::Instance(); + for (auto& inst : origin_program_) { + auto op = inst.op(); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kMLU))) { + LOG(INFO) << "MLU bridges don't support op_type: " << op_type; + return subgraph::FAILED; + } + auto kernel = inst.kernel(); + status |= bridges.Select(op_type, TARGET(kMLU))( + reinterpret_cast<void*>(&graph_), + const_cast<OpLite*>(op), + const_cast<KernelBase*>(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return subgraph::FAILED; + } + } + // Obtain the output nodes of the MLU IR graph and build the graph to MLU + // runtime + std::vector<std::string> valid_output_names; + for (auto& output_name : output_names_) { + if (graph_.HasNode(output_name)) { + graph_.AddOutput(graph_.GetNode(output_name)); + auto output_tensor = scope_->FindMutableTensor(output_name); + void* p_data = static_cast<void*>( + output_tensor->mutable_data< + typename subgraph::mlu::FPTypeTraits<Precision>::T>( + TARGET(kMLU))); + auto node = graph_.GetNode(output_name); + CHECK(p_data); + node->set_mlu_ptr(p_data); + valid_output_names.push_back(output_name); + } + } + for (auto& input_name : input_names_) { + graph_.AddInput(graph_.GetNode(input_name)); + } + CHECK(!valid_output_names.empty()) << "[MLU] no valid output names"; + // auto& mlu_context = this->ctx_->template As<MLUContext>(); + // auto core_version = mlu_context.MLUCoreVersion(); + // auto core_number = mlu_context.MLUCoreNumber(); + // graph_.Compile(core_version, core_number); + return status; + } + + int LaunchDeviceProgram() override { + // auto& mlu_context = this->ctx_->template As<MLUContext>(); + // auto exec_queue = mlu_context.exec_queue(); + // u32_t affinity = mlu_context.affinity(); + // cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); + // int data_param = 1; + // forward_param.data_parallelism = &data_param; + // forward_param.affinity = &affinity; + // forward_param.end = CNRT_PARAM_END; + // graph_.Compute(forward_param, exec_queue); + return 0; + } + + paddle::lite::subgraph::mlu::Graph graph_; +}; + +template <lite_api::PrecisionType Precision> +class SubgraphCompute + : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> { + public: + using param_t = operators::SubgraphParam; + + void PrepareForRun() override { + auto& param = this->template Param<param_t>(); + // LOG(INFO) << "SUBGRAPH Prepare RUN index " << param.sub_block_idx; + engine_.reset(new SubgraphEngine<Precision>(this->ctx_.get(), + param.sub_block_idx, + param.sub_block_desc, + param.input_data_names, + param.output_data_names, + param.scope, + this->precision())); + CHECK(engine_); + engine_->Build(); + } + + void Run() override { + CHECK(engine_); + engine_->Launch(); + } + + virtual ~SubgraphCompute() = default; + + private: + std::unique_ptr<SubgraphEngine<Precision>> engine_; +}; + +} // namespace mlu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/tools/build_mlu.sh b/lite/tools/build_mlu.sh new file mode 100755 index 0000000000..1912efda5e --- /dev/null +++ b/lite/tools/build_mlu.sh @@ -0,0 +1,122 @@ +#!/bin/bash +set -ex + +# global variables with default value +NEUWARE_HOME="${NEUWARE_HOME}" # MLU SDK (Neuware) install dir +TARGET_NAME="all" # default target +BUILD_EXTRA=OFF # ON(with sequence ops)/OFF +WITH_TESTING=OFF # ON/OFF + +function print_usage { + echo -e "\nUSAGE:" + echo + echo "----------------------------------------" + echo -e "--neuware_home=<NEUWARE_HOME dir>" + echo -e "--target_name=<build target, default all>" + echo
"----------------------------------------" + echo +} + +# readonly variables with default value +readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ + -DWITH_PYTHON=OFF \ + -DLITE_WITH_ARM=OFF" + +readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1} + +readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz +readonly workspace=$(pwd) + +function prepare_thirdparty { + if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then + rm -rf $workspace/third-party + + if [ ! -f $workspace/third-party-05b862.tar.gz ]; then + wget $THIRDPARTY_TAR + fi + tar xzf third-party-05b862.tar.gz + else + # git submodule update --init --recursive + echo "third-party is in ready" + fi +} + +# for code gen, a source file is generated after a test, but is dependended by some targets in cmake. +# here we fake an empty file to make cmake works. +function prepare_workspace { + # in build directory + # 1. Prepare gen_code file + GEN_CODE_PATH_PREFIX=lite/gen_code + mkdir -p ./${GEN_CODE_PATH_PREFIX} + touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc + + # 2.Prepare debug tool + DEBUG_TOOL_PATH_PREFIX=lite/tools/debug + mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} + # cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ + + # clone submodule + # git submodule update --init --recursive + prepare_thirdparty +} + +function build_mlu { + build_dir=${workspace}/build.lite.mlu + mkdir -p $build_dir + cd $build_dir + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + prepare_workspace + cmake .. \ + ${CMAKE_COMMON_OPTIONS} \ + -DWITH_GPU=OFF \ + -DWITH_MKLDNN=OFF \ + -DLITE_WITH_X86=ON \ + -DWITH_MKL=ON \ + -DLITE_WITH_MLU=ON \ + -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \ + -DWITH_TESTING=${WITH_TESTING} \ + -DMLU_SDK_ROOT=${XPU_SDK_ROOT} + + make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE + + cd - + echo "Done" +} + +function main { + # Parse command line. + for i in "$@"; do + case $i in + --target_name=*) + TARGET_NAME="${i#*=}" + shift + ;; + --build_extra=*) + BUILD_EXTRA="${i#*=}" + shift + ;; + --neuware_home=*) + NEUWARE_HOME="${i#*=}" + shift + ;; + build) + build_mlu + shift + ;; + full_publish) + TARGET_NAME=publish_inference + build_mlu + shift + ;; + *) + # unknown option + print_usage + exit 1 + ;; + esac + done +} + +main $@ -- GitLab