fix some error when compiling (#6)

* fix some error when compiling with mlu-sdk1.2.5

fix some error when compiling (#6)
* fix some error when compiling with mlu-sdk1.2.5
809f7fc3 · jackzhang235 · jackzhang235 · ce58801f · 809f7fc3 · 809f7fc3
50 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
 lite_option(LITE_WITH_X86  "Enable X86 in lite mode"  ON)
 lite_option(LITE_WITH_ARM  "Enable ARM in lite mode"  OFF)
 lite_option(LITE_WITH_NPU  "Enable NPU in lite mode"  OFF)
+lite_option(LITE_WITH_MLU  "Enable MLU in lite mode"  OFF)
 lite_option(LITE_WITH_XPU  "Enable XPU in lite mode"  OFF)
 lite_option(LITE_WITH_BM   "Enable BM in lite mode"   OFF)
 lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
@@ -177,6 +178,10 @@ if(LITE_WITH_XPU)
    include(device/xpu)
 endif()

+if(LITE_WITH_MLU)
+    include(mlu)
+endif()
+
 include(external/mklml)     # download mklml package
 include(external/xbyak)     # download xbyak package
 include(external/libxsmm)   # download, build, install libxsmm

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -150,6 +150,10 @@ if (LITE_WITH_BM)
 add_definitions("-DLITE_WITH_BM")
 endif()

+if (LITE_WITH_MLU)
+add_definitions("-DLITE_WITH_MLU")
+endif()
+
 if (LITE_WITH_PROFILE)
    add_definitions("-DLITE_WITH_PROFILE")
    if (LITE_WITH_PRECISION_PROFILE)

--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  set(deps ${lite_deps_DEPS})
@@ -100,6 +100,12 @@ function (lite_deps TARGET)
    endforeach(var)
  endif()

+  # MLU-only dependencies are appended to the resolved list solely when the
+  # MLU backend is switched on.
+  if (LITE_WITH_MLU)
+    list(APPEND deps ${lite_deps_MLU_DEPS})
+  endif()
+
  set(${TARGET} ${deps} PARENT_SCOPE)
 endfunction()

@@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
    set(options SHARED shared STATIC static MODULE module)
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
      HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -144,6 +150,7 @@ function(lite_cc_library TARGET)
            PROFILE_DEPS ${args_PROFILE_DEPS}
            LIGHT_DEPS ${args_LIGHT_DEPS}
            HVY_DEPS ${args_HVY_DEPS}
+            MLU_DEPS ${args_MLU_DEPS}
            )

    if (args_SHARED OR ARGS_shared)
@@ -170,7 +177,7 @@ function(lite_cc_binary TARGET)
        set(options " -g ")
    endif()
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -189,6 +196,7 @@ function(lite_cc_binary TARGET)
            LIGHT_DEPS ${args_LIGHT_DEPS}
            HVY_DEPS ${args_HVY_DEPS}
            CV_DEPS ${CV_DEPS}
+            MLU_DEPS ${args_MLU_DEPS}
            )
    cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
    target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -218,7 +226,7 @@ function(lite_cc_test TARGET)
    endif()
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
        LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
        ARGS
        COMPILE_LEVEL # (basic|extra)
@@ -245,6 +253,7 @@ function(lite_cc_test TARGET)
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}
              CV_DEPS ${args_CV_DEPS}
+              MLU_DEPS ${args_MLU_DEPS}
              )
    _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
    # strip binary target to reduce size
@@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
+set(mlu_kernels CACHE INTERNAL "mlu kernels")
 set(bm_kernels CACHE INTERNAL "bm kernels")
 set(opencl_kernels CACHE INTERNAL "opencl kernels")
 set(host_kernels CACHE INTERNAL "host kernels")
@@ -280,12 +290,12 @@ if(LITE_BUILD_TAILOR)
  file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
 endif()
 # add a kernel for some specific device
-# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
+# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
 # level: one of (basic, extra)
 function(add_kernel TARGET device level)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
        LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
        ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -357,6 +367,12 @@ function(add_kernel TARGET device level)
        endif()
        set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
    endif()
+    if ("${device}" STREQUAL "MLU")
+        if (NOT LITE_WITH_MLU)
+            return()
+        endif()
+        set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
+    endif()
    if ("${device}" STREQUAL "OPENCL")
        if (NOT LITE_WITH_OPENCL)
            return()
@@ -391,6 +407,7 @@ function(add_kernel TARGET device level)
              NPU_DEPS ${args_NPU_DEPS}
              XPU_DEPS ${args_XPU_DEPS}
 	      BM_DEPS ${args_BM_DEPS}
+              MLU_DEPS ${args_MLU_DEPS}
              PROFILE_DEPS ${args_PROFILE_DEPS}
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}
@@ -409,7 +426,7 @@ endif()
 function(add_operator TARGET level)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
        LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
        ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -442,6 +459,7 @@ function(add_operator TARGET level)
              NPU_DEPS ${args_NPU_DEPS}
              XPU_DEPS ${args_XPU_DEPS}
 	      BM_DEPS ${args_BM_DEPS}
+              MLU_DEPS ${args_MLU_DEPS}
              PROFILE_DEPS ${args_PROFILE_DEPS}
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}

--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
 message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
 message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
 message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
+message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
 message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
 message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
 message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")

--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -65,7 +65,8 @@ if (WITH_TESTING)
      CUDA_DEPS ${cuda_kernels}
      X86_DEPS ${x86_kernels}
      XPU_DEPS ${xpu_kernels}
-      BM_DEPS ${bm_kernels})
+      BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels})
 endif()
 if(LITE_WITH_FPGA)
    set(light_api_deps ${light_api_deps} ${fpga_deps})
@@ -87,6 +88,7 @@ message(STATUS "get NPU kernels ${npu_kernels}")
 message(STATUS "get XPU kernels ${xpu_kernels}")
 message(STATUS "get FPGA kernels ${fpga_kernels}")
 message(STATUS "get BM kernels ${bm_kernels}")
+message(STATUS "get MLU kernels ${mlu_kernels}")

 # for full api
 if (NOT LITE_ON_TINY_PUBLISH)
@@ -124,7 +126,8 @@ lite_cc_library(light_api SRCS light_api.cc
        XPU_DEPS ${xpu_kernels}
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
-        BM_DEPS ${bm_kernels})
+        BM_DEPS ${bm_kernels}
+        MLU_DEPS ${mlu_kernels})

 include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -143,6 +146,7 @@ if(WITH_TESTING)
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels}
       BM_DEPS ${bm_kernels}
+       MLU_DEPS ${mlu_kernels}
       EXCLUDE_COMPILE_DEPS "ON"
       ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
            --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -288,6 +292,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
        XPU_DEPS ${xpu_kernels}
        FPGA_DEPS ${fpga_kernels}
        BM_DEPS ${bm_kernels}
+        MLU_DEPS ${mlu_kernels}
        ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
        --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)

@@ -320,6 +325,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
  X86_DEPS ${x86_kernels}
  FPGA_DEPS ${fpga_kernels}
  BM_DEPS ${bm_kernels}
+  MLU_DEPS ${mlu_kernels}
  ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
 if (WITH_TESTING)
    add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
@@ -333,6 +339,7 @@ if(NOT IOS)
        CV_DEPS paddle_cv_arm
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
+        MLU_DEPS ${mlu_kernels}
        CL_DEPS ${opencl_kernels}
        BM_DEPS ${bm_kernels}
        FPGA_DEPS ${fpga_kernels}
@@ -345,6 +352,7 @@ if(NOT IOS)
        CV_DEPS paddle_cv_arm
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
+        MLU_DEPS ${mlu_kernels}
        CL_DEPS ${opencl_kernels}
        BM_DEPS ${bm_kernels}
        FPGA_DEPS ${fpga_kernels}
@@ -357,6 +365,7 @@ if(NOT IOS)
        CV_DEPS paddle_cv_arm
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
+        MLU_DEPS ${mlu_kernels}
        CL_DEPS ${opencl_kernels}
        BM_DEPS ${bm_kernels}
        FPGA_DEPS ${fpga_kernels}
@@ -369,6 +378,7 @@ if(NOT IOS)
        CV_DEPS paddle_cv_arm
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
+        MLU_DEPS ${mlu_kernels}
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
@@ -380,6 +390,7 @@ if(NOT IOS)
        CV_DEPS paddle_cv_arm
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
+        MLU_DEPS ${mlu_kernels}
        CL_DEPS ${opencl_kernels}
 	BM_DEPS ${bm_kernels}
        FPGA_DEPS ${fpga_kernels}

--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -34,6 +34,11 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
 #ifdef LITE_WITH_CUDA
  Env<TARGET(kCUDA)>::Init();
 #endif
+#ifdef LITE_WITH_MLU
+  Env<TARGET(kMLU)>::Init();
+  mlu_core_version_ = config.mlu_core_version();
+  mlu_core_number_ = config.mlu_core_number();
+#endif  // LITE_WITH_MLU
  auto places = config.valid_places();
  std::vector<std::string> passes{};
  auto use_layout_preprocess_pass =
@@ -82,6 +87,9 @@ std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
 void CxxPaddleApiImpl::Run() {
 #ifdef LITE_WITH_ARM
  lite::DeviceInfo::Global().SetRunMode(mode_, threads_);
+#endif
+#ifdef LITE_WITH_MLU
+  lite::DeviceInfo::Global().SetMLURunMode(mlu_core_version_, mlu_core_number_);
 #endif
  raw_predictor_.Run();
 }

--- a/lite/api/opt.cc
+++ b/lite/api/opt.cc
@@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() {
      valid_places.emplace_back(TARGET(kNPU));
    } else if (target_repr == "xpu") {
      valid_places.emplace_back(TARGET(kXPU));
+    } else if (target_repr == "mlu") {
+      valid_places.emplace_back(TARGET(kMLU));
    } else {
      LOG(FATAL) << lite::string_format(
          "Wrong target '%s' found, please check the command flag "

--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -106,6 +106,8 @@ class LITE_API PaddlePredictor {
 protected:
  int threads_{1};
  lite_api::PowerMode mode_{lite_api::LITE_POWER_NO_BIND};
+  lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLU_270};
+  int mlu_core_number_{1};
 };

 /// Base class for all the configs.
@@ -136,6 +138,11 @@ class LITE_API CxxConfig : public ConfigBase {
 #ifdef LITE_WITH_X86
  int x86_math_library_math_threads_ = 1;
 #endif
+  bool use_firstconv_{false};
+  std::vector<float> mean_ = {0.0f};
+  std::vector<float> std_ = {1.0f};
+  lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
+  int mlu_core_number_{1};

 public:
  void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -163,6 +170,20 @@ class LITE_API CxxConfig : public ConfigBase {
    return x86_math_library_math_threads_;
  }
 #endif
+  void set_use_firstconv(const bool firstconv) { use_firstconv_ = firstconv; }
+  void set_mean(const std::vector<float>& mean) { mean_ = mean; }  // one copy
+  void set_std(const std::vector<float>& std) { std_ = std; }  // one copy
+  void set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
+    mlu_core_version_ = core_version;
+  }
+  void set_mlu_core_number(int core_number) { mlu_core_number_ = core_number; }
+  bool use_first_conv() const { return use_firstconv_; }
+  std::vector<float> mean() const { return mean_; }  // returns a copy
+  std::vector<float> std() const { return std_; }  // returns a copy
+  lite_api::MLUCoreVersion mlu_core_version() const {
+    return mlu_core_version_;
+  }
+  int mlu_core_number() const { return mlu_core_number_; }
 };

 /// MobileConfig is the config for the light weight predictor, it will skip

--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
@@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) {
                                              "fpga",
                                              "npu",
                                              "xpu",
-                                              "bm"};
+                                              "bm",
+                                              "mlu"};
  auto x = static_cast<int>(target);
  CHECK_LT(x, static_cast<int>(TARGET(NUM)));
  return target2string[x];
@@ -111,6 +112,7 @@ const std::string& TargetRepr(TargetType target) {
                                              "kFPGA",
                                              "kNPU",
                                              "kXPU",
-                                              "kBM"};
+                                              "kBM",  // index 10 == kBM
+                                              "kMLU"};  // index 11 == kMLU
  auto x = static_cast<int>(target);
  CHECK_LT(x, static_cast<int>(TARGET(NUM)));
@@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                               TARGET(kNPU),
                                               TARGET(kXPU),
                                               TARGET(kBM),
+                                               TARGET(kMLU),
                                               TARGET(kFPGA)});
  if (target == TARGET(kAny)) {
    return valid_set;

--- a/lite/api/paddle_place.h
+++ b/lite/api/paddle_place.h
@@ -53,8 +53,9 @@ enum class TargetType : int {
  kNPU = 8,
  kXPU = 9,
  kBM = 10,
+  kMLU = 11,
  kAny = 6,  // any target
-  NUM = 11,  // number of fields.
+  NUM = 12,  // number of fields.
 };
 enum class PrecisionType : int {
  kUnk = 0,
@@ -88,6 +89,8 @@ typedef enum {
  LITE_POWER_RAND_LOW = 5
 } PowerMode;

+typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
+
 enum class ActivationType : int {
  kIndentity = 0,
  kRelu = 1,

--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -45,5 +45,8 @@ USE_MIR_PASS(memory_optimize_pass);
 USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
 USE_MIR_PASS(npu_subgraph_pass);
 USE_MIR_PASS(xpu_subgraph_pass);
+USE_MIR_PASS(mlu_subgraph_pass);
+USE_MIR_PASS(mlu_postprocess_pass);
+USE_MIR_PASS(subgraph_cast_display_pass);
 USE_MIR_PASS(weight_quantization_preprocess_pass);
 USE_MIR_PASS(quantized_op_attributes_inference_pass);
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -109,6 +109,11 @@ void BindLiteCxxConfig(py::module *m) {
      .def("set_power_mode", &CxxConfig::set_power_mode)
      .def("power_mode", &CxxConfig::power_mode);
 #endif
+#ifdef LITE_WITH_MLU
+  cxx_config.def("set_use_firstconv", &CxxConfig::set_use_firstconv)
+      .def("set_mean", &CxxConfig::set_mean)
+      .def("set_std", &CxxConfig::set_std);  // terminates the chained statement
+#endif
 }

 // TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
@@ -150,6 +155,9 @@ void BindLitePlace(py::module *m) {
      .value("OpenCL", TargetType::kOpenCL)
      .value("FPGA", TargetType::kFPGA)
      .value("NPU", TargetType::kNPU)
+#ifdef LITE_WITH_MLU
+      .value("MLU", TargetType::kMLU)
+#endif
      .value("Any", TargetType::kAny);

  // PrecisionType
@@ -230,6 +238,20 @@ void BindLiteTensor(py::module *m) {
  DO_GETTER_ONCE(data_type__, name__##_data)

  DATA_GETTER_SETTER_ONCE(int8_t, int8);
+#ifdef LITE_WITH_MLU
+  tensor.def("set_uint8_data",
+             [](Tensor &self,
+                const std::vector<uint8_t> &data,
+                TargetType type = TargetType::kHost) {
+               if (type == TargetType::kHost) {  // no-op for other targets
+                 self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data());
+               }
+             },
+             py::arg("data"),
+             py::arg("type") = TargetType::kHost);
+  // Pass a bare token, as DATA_GETTER_SETTER_ONCE does (name__##_data).
+  DO_GETTER_ONCE(uint8_t, uint8_data);
+#endif
  DATA_GETTER_SETTER_ONCE(int32_t, int32);
  DATA_GETTER_SETTER_ONCE(float, float);
 #undef DO_GETTER_ONCE

--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -6,4 +6,5 @@ add_subdirectory(fpga)
 add_subdirectory(host)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(mlu)
 add_subdirectory(bm)
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -7,7 +7,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
  CUDA_DEPS target_wrapper_cuda
  CL_DEPS cl_target_wrapper
  FPGA_DEPS fpga_target_wrapper
-  BM_DEPS target_wrapper_bm)
+  BM_DEPS target_wrapper_bm
+  MLU_DEPS target_wrapper_mlu)

 lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)


--- a/lite/core/arena/CMakeLists.txt
+++ b/lite/core/arena/CMakeLists.txt
@@ -6,5 +6,5 @@ endif()
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)

 if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -24,6 +24,11 @@
 #include "lite/backends/opencl/cl_context.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
+#ifdef LITE_WITH_MLU
+#include <cnml.h>
+#include <cnrt.h>
+#include "lite/backends/mlu/mlu_utils.h"
+#endif

 #include <map>
 #include <memory>
@@ -52,6 +57,7 @@ using XPUContext = Context<TargetType::kXPU>;
 using OpenCLContext = Context<TargetType::kOpenCL>;
 using FPGAContext = Context<TargetType::kFPGA>;
 using BMContext = Context<TargetType::kBM>;
+using MLUContext = Context<TargetType::kMLU>;

 template <>
 class Context<TargetType::kHost> {
@@ -171,6 +177,85 @@ class Context<TargetType::kFPGA> {
 };
 #endif

+#ifdef LITE_WITH_MLU
+template <>
+class Context<TargetType::kMLU> {  // MLU context: device id plus its queues
+ public:
+  typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global();
+  // Nothing needs to be initialised once for this target.
+  void InitOnce() {}
+  // Assignment re-runs Init() with the source's device and queue indices.
+  MLUContext& operator=(const MLUContext& ctx) {
+    this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
+    return *this;
+  }
+  // Selects the device and its queues; out-of-range indices fall back to 0.
+  void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
+    CHECK_GT(devs.size(), 0UL)
+        << "Env is not initialized or current target is not exit!";
+    if (dev_id < 0 || dev_id >= static_cast<int>(devs.size())) {
+      LOG(WARNING) << "device index exceeds the number of devices, set to "
+                      "default device(0)!";
+      device_id_ = 0;
+    } else {
+      device_id_ = dev_id;
+    }
+    SetMluDevice(device_id_);  // use the clamped id from here on
+    if (io_queue_id < 0 || io_queue_id >= devs[device_id_].max_queue()) {
+      LOG(WARNING) << "data queue index exceeds the maximum queue number, "
+                      "set to default qeueu(0)!";
+      io_queue_id = 0;
+    }
+    if (exec_queue_id < 0 || exec_queue_id >= devs[device_id_].max_queue()) {
+      LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
+                      "set to default qeueu(0)!";
+      exec_queue_id = 0;
+    }
+    io_queue_ = devs[device_id_].io_queues()[io_queue_id];
+    exec_queue_ = devs[device_id_].exec_queues()[exec_queue_id];
+
+    exec_queue_id_ = exec_queue_id;
+    io_queue_id_ = io_queue_id;
+  }
+  // Shares only the forward parameter with another context.
+  void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
+
+  const cnrtQueue_t& exec_queue() const { return exec_queue_; }
+  void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; }
+
+  const cnrtQueue_t& io_queue() const { return io_queue_; }
+  void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
+  // Core version and core number are read from the global DeviceInfo.
+  cnmlCoreVersion_t MLUCoreVersion() {
+    return DeviceInfo::Global().MLUCoreVersion();
+  }
+
+  int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
+
+  u32_t affinity() { return affinity_; }
+
+  cnrtInvokeFuncParam_t forward_param() { return forward_param_; }
+
+  int device_id() { return device_id_; }
+
+  std::string name() const { return "MLUContext"; }
+
+ private:
+  int device_id_{0};  // defaults keep reads before Init() well-defined
+  // overall information
+  int exec_queue_id_{0};
+  int io_queue_id_{0};
+  cnrtQueue_t io_queue_;
+  cnrtQueue_t exec_queue_;
+
+  std::vector<cnrtNotifier_t> input_notifiers_;
+  std::vector<cnrtNotifier_t> output_notifiers_;
+
+  cnrtInvokeFuncParam_t forward_param_;
+  u32_t affinity_ = 0x01;
+};
+#endif  // LITE_WITH_MLU
+
 #ifdef LITE_WITH_CUDA
 // Only works with CUDA kernels.
 template <>
@@ -393,6 +478,16 @@ class ContextScheduler {
        kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
            &ctx->As<BMContext>());
        break;
+#endif
+#ifdef LITE_WITH_MLU
+      case TARGET(kMLU): {
+        int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
+        auto& context = ctx->As<MLUContext>();
+        context.Init(dev_id);
+        kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
+            &context);
+        LOG(INFO) << "New Context for MLU";
+      } break;
 #endif
      default:
 #ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
@@ -434,6 +529,9 @@ class ContextScheduler {
 #endif
 #ifdef LITE_WITH_BM
    InitContext<TargetType::kBM, BMContext>();
+#endif
+#ifdef LITE_WITH_MLU
+    InitContext<TargetType::kMLU, MLUContext>();
 #endif
  }


--- a/lite/core/device_info.cc
+++ b/lite/core/device_info.cc
@@ -58,7 +58,7 @@
 namespace paddle {
 namespace lite {

-#ifdef LITE_WITH_ARM
+#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))  // as device_info.h
 thread_local lite_api::PowerMode DeviceInfo::mode_;
 thread_local ARMArch DeviceInfo::arch_;
 thread_local int DeviceInfo::mem_size_;
@@ -66,6 +66,11 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
 thread_local TensorLite DeviceInfo::workspace_;
 thread_local int64_t DeviceInfo::count_ = 0;

+#ifdef LITE_WITH_MLU
+thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
+thread_local int DeviceInfo::mlu_core_number_{1};
+#endif
+
 #ifdef TARGET_IOS
 const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
 const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
@@ -1080,6 +1085,28 @@ int DeviceInfo::Setup() {
  return 0;
 }

+#ifdef LITE_WITH_MLU
+void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
+                               int core_number) {
+  // Map the public lite_api core version onto the matching CNML constant.
+  // MLU_270 and any value other than MLU_220 both select CNML_MLU270.
+  if (core_version == lite_api::MLUCoreVersion::MLU_220) {
+    mlu_core_version_ = CNML_MLU220;
+  } else {
+    mlu_core_version_ = CNML_MLU270;
+  }
+
+  // Both settings live in thread_local statics, so the values recorded
+  // here are the ones later returned by MLUCoreVersion()/MLUCoreNumber()
+  // on the same thread.
+  mlu_core_number_ = core_number;
+}
+
+cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
+// Both accessors return the values last stored by SetMLURunMode().
+int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
+#endif  // LITE_WITH_MLU
+
 void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
 #ifdef ARM_WITH_OMP
  thread_num = std::min(thread_num, core_num_);
@@ -1159,6 +1186,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) {

 #endif  // LITE_WITH_ARM

+#ifdef LITE_WITH_MLU
+void SetMluDevice(int device_id) {  // makes `device_id` the current CNRT device
+  LOG(INFO) << "Set mlu device " << device_id;
+  cnrtDev_t dev_handle;
+  CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id));  // ordinal -> handle
+  CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
+}
+
+void Device<TARGET(kMLU)>::Init() {  // select device idx_, then build queues
+  SetMluDevice(idx_);
+  GetInfo();
+  CreateQueue();
+}
+
+void Device<TARGET(kMLU)>::GetInfo() {}  // currently a no-op
+
+void Device<TARGET(kMLU)>::CreateQueue() {  // (re)builds both queue lists
+  exec_queue_.clear();
+  io_queue_.clear();
+  for (int i = 0; i < max_queue_; ++i) {  // int index: max_queue_ is an int
+    cnrtQueue_t exec_queue;
+    cnrtQueue_t io_queue;
+    CNRT_CALL(cnrtCreateQueue(&exec_queue));  // check results, as elsewhere
+    CNRT_CALL(cnrtCreateQueue(&io_queue));
+    exec_queue_.push_back(exec_queue);
+    io_queue_.push_back(io_queue);
+    // NOTE(review): a second exec queue is added per iteration -- confirm.
+    CNRT_CALL(cnrtCreateQueue(&exec_queue));
+    exec_queue_.push_back(exec_queue);
+  }
+}
+#endif  // LITE_WITH_MLU
+
 #ifdef LITE_WITH_CUDA

 void Device<TARGET(kCUDA)>::Init() {

--- a/lite/core/device_info.h
+++ b/lite/core/device_info.h
@@ -19,11 +19,14 @@
 #include <vector>
 #include "lite/core/tensor.h"
 #include "lite/utils/cp_logging.h"
+#ifdef LITE_WITH_MLU
+#include "lite/backends/mlu/mlu_utils.h"
+#endif

 namespace paddle {
 namespace lite {

-#ifdef LITE_WITH_ARM
+#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))

 typedef enum {
  kAPPLE = 0,
@@ -52,6 +55,11 @@ class DeviceInfo {
  int Setup();

  void SetRunMode(lite_api::PowerMode mode, int thread_num);
+#ifdef LITE_WITH_MLU
+  void SetMLURunMode(lite_api::MLUCoreVersion core_version, int core_number);
+  cnmlCoreVersion_t MLUCoreVersion();
+  int MLUCoreNumber();
+#endif
  void SetCache(int l1size, int l2size, int l3size);
  void SetArch(ARMArch arch) { arch_ = arch; }

@@ -103,6 +111,11 @@ class DeviceInfo {
  static thread_local TensorLite workspace_;
  static thread_local int64_t count_;

+#ifdef LITE_WITH_MLU
+  static thread_local cnmlCoreVersion_t mlu_core_version_;
+  static thread_local int mlu_core_number_;
+#endif
+
  void SetDotInfo(int argc, ...);
  void SetFP16Info(int argc, ...);
  void SetFP32Info(int argc, ...);
@@ -134,6 +147,9 @@ class Env {
    return *devs;
  }
  static void Init(int max_stream = 4) {
+#ifdef LITE_WITH_MLU
+    CNRT_CALL(cnrtInit(0));
+#endif
    Devs& devs = Global();
    if (devs.size() > 0) {
      return;
@@ -156,6 +172,41 @@ class Env {
  }
 };

+#ifdef LITE_WITH_MLU
+void SetMluDevice(int device_id);
+
+template <>
+class Device<TARGET(kMLU)> {  // one MLU device plus its CNRT queue lists
+ public:
+  Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {}
+  void Init();
+
+  int id() { return idx_; }
+  int max_queue() { return max_queue_; }
+  void SetId(int idx) { idx_ = idx; }
+  std::string name() { return "MLU"; }
+  int core_num() { return 16; }  // hard-coded, not queried
+  float max_memory() { return 16 * 1024; }  // hard-coded, not queried
+  const std::vector<cnrtQueue_t>& io_queues() const { return io_queue_; }
+  const std::vector<cnrtQueue_t>& exec_queues() const { return exec_queue_; }
+
+ private:
+  void CreateQueue();
+  void GetInfo();
+
+ private:
+  int idx_{0};
+  int max_queue_{1};
+  std::string device_name_;
+  float max_memory_{0.f};
+
+  std::vector<cnrtQueue_t> io_queue_;
+  std::vector<cnrtQueue_t> exec_queue_;
+};
+
+template class Env<TARGET(kMLU)>;
+#endif  // LITE_WITH_MLU
+
 #ifdef LITE_WITH_CUDA
 template <>
 class Device<TARGET(kCUDA)> {

--- a/lite/core/kernel.h
+++ b/lite/core/kernel.h
@@ -83,6 +83,9 @@ class KernelBase {
 #if defined(LITE_WITH_CUDA)
    WorkSpace::Global_CUDA().AllocReset();
 #endif
+#if defined(LITE_WITH_MLU)
+    WorkSpace::Global_MLU().AllocReset();
+#endif
 #ifdef LITE_WITH_PROFILE
    profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
    profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());

--- a/lite/core/memory.cc
+++ b/lite/core/memory.cc
@@ -45,6 +45,11 @@ void* TargetMalloc(TargetType target, size_t size) {
      data = TargetWrapper<TARGET(kBM)>::Malloc(size);
      break;
 #endif
+#ifdef LITE_WITH_MLU
+    case TargetType::kMLU:
+      data = TargetWrapper<TARGET(kMLU)>::Malloc(size);
+      break;
+#endif  // LITE_WITH_MLU
    default:
      LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
  }
@@ -79,6 +84,11 @@ void TargetFree(TargetType target, void* data) {
      TargetWrapper<TARGET(kBM)>::Free(data);
      break;
 #endif
+#ifdef LITE_WITH_MLU
+    case TargetType::kMLU:
+      TargetWrapper<TARGET(kMLU)>::Free(data);
+      break;
+#endif  // LITE_WITH_MLU
    default:
      LOG(FATAL) << "Unknown type";
  }
@@ -110,6 +120,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
      TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
      break;
 #endif
+#ifdef LITE_WITH_MLU
+    case TargetType::kMLU:  // NOTE(review): BM uses DtoD; confirm HtoD
+      TargetWrapper<TARGET(kMLU)>::MemcpySync(
+          dst, src, size, IoDirection::HtoD);
+      break;
+#endif
 #ifdef LITE_WITH_OPENCL
    case TargetType::kOpenCL:
      TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);

--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -30,6 +30,10 @@
 #include "lite/backends/bm/target_wrapper.h"
 #endif  // LITE_WITH_BM

+#ifdef LITE_WITH_MLU
+#include "lite/backends/mlu/target_wrapper.h"
+#endif  // LITE_WITH_MLU
+
 namespace paddle {
 namespace lite {

@@ -81,6 +85,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
    case TARGET(kBM):
      TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
      break;
+#endif
+#ifdef LITE_WITH_MLU
+    case TARGET(kMLU):
+      TargetWrapperMlu::MemcpySync(dst, src, size, dir);
+      break;
 #endif
    default:
      LOG(FATAL)

--- a/lite/core/mir/CMakeLists.txt
+++ b/lite/core/mir/CMakeLists.txt
@@ -35,6 +35,8 @@ lite_cc_library(mir_passes
      demo_pass.cc
      runtime_context_assign_pass.cc
      memory_optimize_pass.cc
+      mlu_postprocess_pass.cc
+      subgraph_cast_display_pass.cc
      weight_quantization_preprocess_pass.cc
      quantized_op_attributes_inference_pass.cc
  DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})

--- a/lite/core/mir/ssa_graph.cc
+++ b/lite/core/mir/ssa_graph.cc
@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() {
  return adj_list;
 }

+// Builds the node inlink edge table: every node returned by mutable_nodes()
+// maps to the set of nodes found in its `inlinks`. NodeTopologicalOrder()
+// consumes the result.
+std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildNodeAdjList() {
+  std::map<mir::Node *, std::set<mir::Node *>> adj_list;
+
+  for (auto &n : mutable_nodes()) {
+    // operator[] default-constructs the entry, so a node without inlinks is
+    // still present in the table, mapped to an empty set.
+    auto &predecessors = adj_list[&n];
+
+    // std::set keeps its elements in its own order, so the inlinks can be
+    // inserted directly: sorting them beforehand, or wrapping the raw pointers
+    // in move iterators, would not change the resulting table.
+    predecessors.insert(n.inlinks.begin(), n.inlinks.end());
+  }
+
+  return adj_list;
+}
+
 void SSAGraph::SortHelper(
    const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
    mir::Node *node,
@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() {
  return res;
 }

+// Orders the nodes by running SortHelper() over the node inlink table.
+std::vector<mir::Node *> SSAGraph::NodeTopologicalOrder() {
+  CheckBidirectionalConnection();
+
+  std::set<mir::Node *> visited;
+  std::vector<mir::Node *> res;
+  auto adj_list = BuildNodeAdjList();
+
+  // Bind by const reference: each entry owns a std::set, which iterating by
+  // value would copy once per node.
+  for (const auto &adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &res);
+    }
+  }
+  return res;
+}
+
 Node *SSAGraph::GraphCreateInstructNode(
    const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) {
  node_storage_.emplace_back();

--- a/lite/core/mir/ssa_graph.h
+++ b/lite/core/mir/ssa_graph.h
@@ -42,6 +42,8 @@ class SSAGraph : GraphBase {

  std::vector<mir::Node *> StmtTopologicalOrder();

+  std::vector<mir::Node *> NodeTopologicalOrder();
+
  // The inputs of the graph.
  std::vector<mir::Node *> inputs();

@@ -86,6 +88,9 @@ class SSAGraph : GraphBase {
  // Build operator inlink edge table.
  std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();

+  // Build node inlink edge table.
+  std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList();
+
  void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
                  mir::Node *node,
                  std::set<mir::Node *> *visited,

--- a/lite/core/mir/subgraph/subgraph_detector.cc
+++ b/lite/core/mir/subgraph/subgraph_detector.cc
@@ -313,8 +313,9 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) {

 std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
    node_map_t *nodes) {
-  for (auto &it : *nodes) {
-    node_dat_t *node = it.second;
+  for (auto &n_tpo : graph_->NodeTopologicalOrder()) {
+    CHECK(nodes->find(n_tpo) != nodes->end());
+    node_dat_t *node = (*nodes)[n_tpo];
    if (!node->marked) {
      continue;
    }

--- a/lite/core/mir/subgraph/subgraph_pass.cc
+++ b/lite/core/mir/subgraph/subgraph_pass.cc
@@ -67,6 +67,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  fuser();
 }

+// Runs SubgraphFuser over stmt nodes whose op type is in paddle_use_bridges.h.
+void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
+#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    return node->IsStmt() &&
+           supported_lists.count(node->AsStmt().op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
@@ -77,3 +91,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
    .BindTargets({TARGET(kXPU)});
 REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
    .BindTargets({TARGET(kBM)});
+REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
+    .BindTargets({TARGET(kMLU)});
--- a/lite/core/mir/subgraph/subgraph_pass.h
+++ b/lite/core/mir/subgraph/subgraph_pass.h
@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
 };

+class MLUSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
--- a/lite/core/mir/subgraph_cast_display_pass.cc
+++ b/lite/core/mir/subgraph_cast_display_pass.cc
@@ -22,29 +22,15 @@ namespace mir {
 class SubgraphCastDisplayPass : public DebugPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
-    VLOG(3) << "== Argument types ==";
-    for (auto& node : graph->mutable_nodes()) {
-      if (!node.IsArg()) continue;
-
-      auto* type = node.AsArg().type;
-      if (type) {
-        VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
-      } else {
-        VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
-      }
-    }
-    VLOG(3) << "---------------------";
-
-    //
-    VLOG(0) << "== SubgraphOp Debug Info ==";
+    VLOG(4) << "== SubgraphOp Debug Info ==";
    for (auto& node : graph->mutable_nodes()) {
      if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
-        VLOG(0) << "FOUND SUBGRAPH OP";
+        VLOG(4) << "FOUND SUBGRAPH OP";
        display_debug_info(node, "subgraph");
        break;
      }
    }
-    VLOG(0) << "---------------------";
+    VLOG(4) << "---------------------";
  }

  void display_debug_info(const Node& node,
@@ -52,17 +38,17 @@ class SubgraphCastDisplayPass : public DebugPass {
                          bool display_in_nodes = true,
                          bool display_out_nodes = true) {
    CHECK(node.IsStmt());
-    VLOG(0) << node.AsStmt();
+    // VLOG(4) << node.AsStmt();
    if (display_in_nodes) {
      for (auto p_in_arg_node : node.inlinks) {
        CHECK(p_in_arg_node->IsArg());
-        VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
+        VLOG(4) << "* ARG[IN] " << p_in_arg_node->AsArg().name
                << " type: " << *p_in_arg_node->AsArg().type
                << " is_weight: " << p_in_arg_node->AsArg().is_weight
                << " is_persist: " << p_in_arg_node->AsArg().is_persist
                << " input_count: " << p_in_arg_node->inlinks.size();
        if (p_in_arg_node->inlinks.size() == 0) {
-          VLOG(0) << "** END with No Op";
+          VLOG(4) << "** END with No Op";
        }
        for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
          CHECK(p_in_stmt_node->IsStmt());
@@ -71,7 +57,7 @@ class SubgraphCastDisplayPass : public DebugPass {
              stmt_op_type == "io_copy") {
            display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
          } else {
-            VLOG(0) << "** END with op type: " << stmt_op_type;
+            VLOG(4) << "** END with op type: " << stmt_op_type;
          }
        }
      }
@@ -79,13 +65,13 @@ class SubgraphCastDisplayPass : public DebugPass {
    if (display_out_nodes) {
      for (auto p_out_arg_node : node.outlinks) {
        CHECK(p_out_arg_node->IsArg());
-        VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
+        VLOG(4) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
                << " type: " << *p_out_arg_node->AsArg().type
                << " is_weight: " << p_out_arg_node->AsArg().is_weight
                << " is_persist: " << p_out_arg_node->AsArg().is_persist
                << " output_count: " << p_out_arg_node->outlinks.size();
        if (p_out_arg_node->outlinks.size() == 0) {
-          VLOG(0) << "** END with No Op";
+          VLOG(4) << "** END with No Op";
        }
        for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
          CHECK(p_out_stmt_node->IsStmt());
@@ -94,7 +80,7 @@ class SubgraphCastDisplayPass : public DebugPass {
              stmt_op_type == "io_copy") {
            display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
          } else {
-            VLOG(0) << "** END with op type: " << stmt_op_type;
+            VLOG(4) << "** END with op type: " << stmt_op_type;
          }
        }
      }
@@ -108,4 +94,4 @@ class SubgraphCastDisplayPass : public DebugPass {

 REGISTER_MIR_PASS(subgraph_cast_display_pass,
                  paddle::lite::mir::SubgraphCastDisplayPass)
-    .BindTargets({TARGET(kAny)});
+    .BindTargets({TARGET(kMLU)});
--- a/lite/core/op_registry.cc
+++ b/lite/core/op_registry.cc
@@ -107,6 +107,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
    case TARGET(kBM): {
      CREATE_KERNEL(kBM);
    } break;
+    case TARGET(kMLU): {
+      CREATE_KERNEL(kMLU);
+    } break;
    default:
      CHECK(false) << "not supported kernel target " << TargetToStr(target);
  }
@@ -139,6 +142,15 @@ KernelRegistry::KernelRegistry()
  INIT_FOR(kCUDA, kInt64, kNCHW);
  INIT_FOR(kCUDA, kInt64, kNHWC);

+  INIT_FOR(kMLU, kFloat, kNHWC);
+  INIT_FOR(kMLU, kFloat, kNCHW);
+  INIT_FOR(kMLU, kFP16, kNHWC);
+  INIT_FOR(kMLU, kFP16, kNCHW);
+  INIT_FOR(kMLU, kInt8, kNHWC);
+  INIT_FOR(kMLU, kInt8, kNCHW);
+  INIT_FOR(kMLU, kInt16, kNHWC);
+  INIT_FOR(kMLU, kInt16, kNCHW);
+
  INIT_FOR(kHost, kFloat, kNCHW);
  INIT_FOR(kHost, kAny, kNCHW);
  INIT_FOR(kHost, kFloat, kNHWC);

--- a/lite/core/op_registry.h
+++ b/lite/core/op_registry.h
@@ -268,7 +268,32 @@ class KernelRegistry final {
                                      DATALAYOUT(kAny)> *,  //
              KernelRegistryForTarget<TARGET(kFPGA),
                                      PRECISION(kAny),
-                                      DATALAYOUT(kAny)> *  //
+                                      DATALAYOUT(kAny)> *,  //
+
+              KernelRegistryForTarget<TARGET(kMLU),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNHWC)> *,  //
+              KernelRegistryForTarget<TARGET(kMLU),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kMLU),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC)> *,  //
+              KernelRegistryForTarget<TARGET(kMLU),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kMLU),
+                                      PRECISION(kInt8),
+                                      DATALAYOUT(kNHWC)> *,  //
+              KernelRegistryForTarget<TARGET(kMLU),
+                                      PRECISION(kInt8),
+                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kMLU),
+                                      PRECISION(kInt16),
+                                      DATALAYOUT(kNHWC)> *,  //
+              KernelRegistryForTarget<TARGET(kMLU),
+                                      PRECISION(kInt16),
+                                      DATALAYOUT(kNCHW)> *  //
              >;

  KernelRegistry();

--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -115,9 +115,15 @@ class Optimizer {
           "variable_place_inference_pass",  //
           "argument_type_display_pass",

+           "mlu_subgraph_pass",
+           "mlu_postprocess_pass",
+           // subgraph_cast_display_pass
+
           "runtime_context_assign_pass",
           "argument_type_display_pass",
+
           "memory_optimize_pass"}};
+
      if (passes.size() == 1) {
        passes_local.push_back(passes[0]);
      }

--- a/lite/core/workspace.h
+++ b/lite/core/workspace.h
@@ -69,6 +69,13 @@ class WorkSpace {
  }
 #endif

+#if defined(LITE_WITH_MLU)
+  static WorkSpace& Global_MLU() {  // per-thread, created on first use
+    thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kMLU)));
+    return *x;
+  }
+#endif
+
 private:
  explicit WorkSpace(TargetType x) : target_(x) {}


--- a/lite/kernels/CMakeLists.txt
+++ b/lite/kernels/CMakeLists.txt
@@ -10,4 +10,5 @@ add_subdirectory(opencl)
 add_subdirectory(fpga)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(mlu)
 add_subdirectory(bm)
--- a/lite/kernels/mlu/bridges/CMakeLists.txt
+++ b/lite/kernels/mlu/bridges/CMakeLists.txt
@@ -29,13 +29,13 @@ set(mlu_subgraph_bridges
        CACHE INTERNAL "mlu_subgraph_bridges")


-# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
-# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
+lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)

 message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
--- a/lite/kernels/mlu/bridges/act_op.cc
+++ b/lite/kernels/mlu/bridges/act_op.cc
@@ -54,4 +54,8 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 }  // namespace lite
 }  // namespace paddle

+REGISTER_SUBGRAPH_BRIDGE(sigmoid,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::ActConverter);
 REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
--- a/lite/kernels/mlu/bridges/act_op_test.cc
+++ b/lite/kernels/mlu/bridges/act_op_test.cc
@@ -25,8 +25,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int ActConverter(void* ctx, OpLite* op);
-
 template void FillTensor<float, int>(Tensor* x,
                                     float lower = -2,
                                     float upper = -2);
@@ -149,8 +147,6 @@ TEST(MLUBridges, activation) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         sigmoid,
-                         paddle::lite::subgraph::mlu::ActConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
+USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
+USE_SUBGRAPH_BRIDGE(relu, kMLU)
+USE_SUBGRAPH_BRIDGE(tanh, kMLU)
--- a/lite/kernels/mlu/bridges/batch_norm_op_test.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op_test.cc
@@ -23,8 +23,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int BatchNormConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) {
  Scope* scope = op->scope();
@@ -181,6 +179,4 @@ TEST(MLUBridges, batch_norm) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         batch_norm,
-                         paddle::lite::subgraph::mlu::BatchNormConverter);
+USE_SUBGRAPH_BRIDGE(batch_norm, kMLU)
--- a/lite/kernels/mlu/bridges/conv_op_test.cc
+++ b/lite/kernels/mlu/bridges/conv_op_test.cc
@@ -25,8 +25,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int ConvConverter(void* ctx, OpLite* op);
-
 void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) {
  Scope* scope = op->scope();
  const OpInfo* op_info = op->op_info();
@@ -342,9 +340,5 @@ TEST(MLUBridges, conv) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         conv2d,
-                         paddle::lite::subgraph::mlu::ConvConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         depthwise_conv2d,
-                         paddle::lite::subgraph::mlu::ConvConverter);
+USE_SUBGRAPH_BRIDGE(conv2d, kMLU)
+USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU)
--- a/lite/kernels/mlu/bridges/elementwise_ops_test.cc
+++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc
@@ -24,8 +24,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int ElementwiseConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) {
  Scope* scope = op->scope();
@@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         elementwise_add,
-                         paddle::lite::subgraph::mlu::ElementwiseConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         elementwise_sub,
-                         paddle::lite::subgraph::mlu::ElementwiseConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         elementwise_mul,
-                         paddle::lite::subgraph::mlu::ElementwiseConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         elementwise_div,
-                         paddle::lite::subgraph::mlu::ElementwiseConverter);
+USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU)
+USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU)
+USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU)
+USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU)
--- a/lite/kernels/mlu/bridges/fc_op_test.cc
+++ b/lite/kernels/mlu/bridges/fc_op_test.cc
@@ -24,8 +24,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int FCConverter(void* ctx, OpLite* op);
-
 void fc_ref(const std::shared_ptr<operators::FcOpLite> op) {
  Scope* scope = op->scope();
  const OpInfo* op_info = op->op_info();
@@ -170,4 +168,4 @@ TEST(MLUBridges, fc) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter);
+USE_SUBGRAPH_BRIDGE(fc, kMLU)
--- a/lite/kernels/mlu/bridges/pool_op_test.cc
+++ b/lite/kernels/mlu/bridges/pool_op_test.cc
@@ -24,8 +24,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int PoolConverter(void* ctx, OpLite* op);
-
 void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
  Scope* scope = op->scope();
  const OpInfo* op_info = op->op_info();
@@ -275,6 +273,4 @@ TEST(MLUBridges, pool) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         pool2d,
-                         paddle::lite::subgraph::mlu::PoolConverter);
+USE_SUBGRAPH_BRIDGE(pool2d, kMLU)
--- a/lite/kernels/mlu/bridges/softmax_op_test.cc
+++ b/lite/kernels/mlu/bridges/softmax_op_test.cc
@@ -23,8 +23,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int SoftmaxConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
  Scope* scope = op->scope();
@@ -171,6 +169,4 @@ TEST(MLUBridges, softmax) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         softmax,
-                         paddle::lite::subgraph::mlu::SoftmaxConverter);
+USE_SUBGRAPH_BRIDGE(softmax, kMLU)
--- a/lite/kernels/mlu/bridges/test_helper.cc
+++ b/lite/kernels/mlu/bridges/test_helper.cc
@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
              const std::vector<std::string>& input_var_names,
              const std::vector<std::string>& output_var_names) {
  CNRT_CALL(cnrtInit(0));
-  SetMluDevice(0);
+  ::paddle::lite::SetMluDevice(0);
  cnrtQueue_t queue_;
  cnrtInvokeFuncParam_t forward_param;
  u32_t affinity = 1;

--- a/lite/kernels/mlu/io_copy_compute.cc
+++ b/lite/kernels/mlu/io_copy_compute.cc
@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL(
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
    .Finalize();
-
-//                     kMLU,
-//                     kFloat,
-//                     kNHWC,
-//                     paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
-//                     host_to_device)
-//    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
-//    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
-//    .Finalize();
-//
-//
-//                     kMLU,
-//                     kFloat,
-//                     kNHWC,
-//                     paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
-//                     device_to_host)
-//    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
-//    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
-//    .Finalize();
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -46,6 +46,32 @@ class SubgraphEngine : public subgraph::Engine {
    graph_.SetFPType(type);
  }

+  int Build() {
+    // Build the original program first so that every op of the block desc is
+    // attached before the device program is generated.
+    BuildOriginProgram();
+    // Run InferShape() on all ops and convert the Paddle ops to an MLU IR graph.
+    build_device_program_status_ = BuildDeviceProgram();
+    return build_device_program_status_;
+  }
+
+  int Launch() {
+    // Rebuild the device program only if the last build succeeded, requested a
+    // rebuild on shape change, and the input tensor shapes have changed.
+    if (subgraph::CHECK_SUCCESS(build_device_program_status_) &&
+        subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(
+            build_device_program_status_) &&
+        InputShapeChanged()) {
+      Build();
+    }
+    if (subgraph::CHECK_FAILED(build_device_program_status_)) {  // fall back
+      LaunchOriginProgram();
+    } else {
+      LaunchDeviceProgram();
+    }
+    return 0;
+  }
+
 protected:
  int BuildDeviceProgram() override {
    int status = 0;
@@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine {
      graph_.AddInput(graph_.GetNode(input_name));
    }
    CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
-    // auto& mlu_context = this->ctx_->template As<MLUContext>();
-    // auto core_version = mlu_context.MLUCoreVersion();
-    // auto core_number = mlu_context.MLUCoreNumber();
-    // graph_.Compile(core_version, core_number);
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    auto core_version = mlu_context.MLUCoreVersion();
+    auto core_number = mlu_context.MLUCoreNumber();
+    graph_.Compile(core_version, core_number);
    return status;
  }

  int LaunchDeviceProgram() override {
-    // auto& mlu_context = this->ctx_->template As<MLUContext>();
-    // auto exec_queue = mlu_context.exec_queue();
-    // u32_t affinity = mlu_context.affinity();
-    // cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
-    // int data_param = 1;
-    // forward_param.data_parallelism = &data_param;
-    // forward_param.affinity = &affinity;
-    // forward_param.end = CNRT_PARAM_END;
-    // graph_.Compute(forward_param, exec_queue);
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    auto exec_queue = mlu_context.exec_queue();
+    u32_t affinity = mlu_context.affinity();
+    cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
+    int data_param = 1;
+    forward_param.data_parallelism = &data_param;
+    forward_param.affinity = &affinity;
+    forward_param.end = CNRT_PARAM_END;
+    graph_.Compute(forward_param, exec_queue);
    return 0;
  }


--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
-if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM)
+if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
  return()
 endif()


--- a/lite/kernels/x86/cast_compute.cc
+++ b/lite/kernels/x86/cast_compute.cc
@@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast,
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    cast,
+    kX86,
+    kFloat,
+    kNCHW,
+    paddle::lite::kernels::x86::CastCompute<::paddle::lite::fluid::float16>,
+    fp16_to_any)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/lite/tests/cv/CMakeLists.txt
+++ b/lite/tests/cv/CMakeLists.txt
-if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
+if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
    lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
 endif()
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
    lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

--- a/lite/tests/math/CMakeLists.txt
+++ b/lite/tests/math/CMakeLists.txt
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
    lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})