Support Bitmain backend,test=develop (#2761)

* Support Bitmain backend

Support Bitmain backend,test=develop (#2761)
* Support Bitmain backend
c4a87224 · myq406450149 · GitHub · 13945aed · c4a87224 · c4a87224
48 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,7 @@ lite_option(LITE_WITH_X86  "Enable X86 in lite mode"  ON)
 lite_option(LITE_WITH_ARM  "Enable ARM in lite mode"  OFF)
 lite_option(LITE_WITH_NPU  "Enable NPU in lite mode"  OFF)
 lite_option(LITE_WITH_XPU  "Enable XPU in lite mode"  OFF)
+lite_option(LITE_WITH_BM   "Enable BM in lite mode"   OFF)
 lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
 lite_option(LITE_WITH_OPENCL   "Enable OpenCL support in lite" OFF)
 lite_option(LITE_WITH_FPGA   "Enable FPGA support in lite" OFF)
@@ -192,6 +193,9 @@ if(LITE_WITH_CUDA)
  include(cuda)
 endif()

+if(LITE_WITH_BM)
+  include(bm)
+endif()
 include(generic)            # simplify cmake module
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs

--- a/cmake/bm.cmake
+++ b/cmake/bm.cmake
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Nothing to configure unless the BM backend was requested.
+if(NOT LITE_WITH_BM)
+  return()
+endif()
+
+# Locate the BM SDK: a non-empty BM_SDK_ROOT CMake variable takes precedence,
+# otherwise the BM_SDK_ROOT environment variable (read at configure time) is used.
+if(NOT BM_SDK_ROOT)
+  set(BM_SDK_ROOT "$ENV{BM_SDK_ROOT}")
+endif()
+# Validate outside the fallback so that an empty -DBM_SDK_ROOT= is rejected too,
+# instead of silently producing search paths rooted at "/".
+if(NOT BM_SDK_ROOT)
+  message(FATAL_ERROR "Must set BM_SDK_ROOT or env BM_SDK_ROOT when LITE_WITH_BM=ON")
+endif()
+
+message(STATUS "BM_SDK_ROOT: ${BM_SDK_ROOT}")
+
+# Sanity-check the SDK layout by probing for the runtime interface header.
+# NO_DEFAULT_PATH restricts the search to the SDK directory given in PATHS.
+find_path(BM_SDK_INC NAMES bmruntime_interface.h
+  PATHS "${BM_SDK_ROOT}/include/bmruntime" NO_DEFAULT_PATH)
+if(NOT BM_SDK_INC)
+  # Report the directory that was actually searched.
+  message(FATAL_ERROR "Can not find bmruntime_interface.h in ${BM_SDK_ROOT}/include/bmruntime")
+endif()
+
+# Add the header directory of every SDK component (directory-scoped, so it
+# applies to targets created in this directory scope after this point).
+foreach(bm_sdk_component bmruntime bmlib bmcompiler bmcpu bmlog)
+  include_directories("${BM_SDK_ROOT}/include/${bm_sdk_component}")
+endforeach()
+
+# Locate one shared library of the BM SDK and expose it as a GLOBAL imported
+# target. Configuration stops with a fatal error when the library is missing.
+#   target - library base name; also the name of the imported target
+#   var    - cache variable that receives the resolved library path
+#   subdir - directory below BM_SDK_ROOT that holds the library
+#
+# HINTS is used instead of PATHS on purpose: HINTS are consulted before the
+# system default locations, PATHS only after them. With PATHS a system-wide
+# copy of the library would win over the SDK copy, while the headers are
+# always taken from the SDK.
+function(bm_import_sdk_library target var subdir)
+  find_library(${var} NAMES ${target}
+    HINTS "${BM_SDK_ROOT}/${subdir}")
+  if(NOT ${var})
+    message(FATAL_ERROR "Can not find ${target} Library in ${BM_SDK_ROOT}")
+  endif()
+  message(STATUS "Found ${target} Library: ${${var}}")
+  add_library(${target} SHARED IMPORTED GLOBAL)
+  set_property(TARGET ${target} PROPERTY IMPORTED_LOCATION "${${var}}")
+endfunction()
+
+bm_import_sdk_library(bmrt       BM_SDK_RT_LIB       lib/bmnn/pcie)
+bm_import_sdk_library(bmlib      BM_SDK_BM_LIB       lib/bmnn/pcie)
+bm_import_sdk_library(bmcompiler BM_SDK_COMPILER_LIB lib/bmcompiler)
+bm_import_sdk_library(bmcpu      BM_SDK_CPU_LIB      lib/bmnn/pcie)
+
+# Dependency lists consumed by other CMake files; both currently name the same
+# four imported targets.
+set(bm_runtime_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm runtime libs")
+set(bm_builder_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm builder libs")
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -143,6 +143,10 @@ if (LITE_WITH_FPGA)
 add_definitions("-DLITE_WITH_FPGA")
 endif()

+if (LITE_WITH_BM)
+add_definitions("-DLITE_WITH_BM")
+endif()
+
 if (LITE_WITH_PROFILE)
    add_definitions("-DLITE_WITH_PROFILE")
    if (LITE_WITH_PRECISION_PROFILE)

--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  set(deps ${lite_deps_DEPS})
@@ -94,6 +94,12 @@ function (lite_deps TARGET)
    endforeach(var)
  endif()

+  if (LITE_WITH_BM)
+    foreach(var ${lite_deps_BM_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
+
  set(${TARGET} ${deps} PARENT_SCOPE)
 endfunction()

@@ -119,7 +125,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
    set(options SHARED shared STATIC static MODULE module)
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
      HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -129,6 +135,7 @@ function(lite_cc_library TARGET)
            X86_DEPS ${args_X86_DEPS}
            CUDA_DEPS ${args_CUDA_DEPS}
            CL_DEPS ${args_CL_DEPS}
+            BM_DEPS ${args_BM_DEPS}
            ARM_DEPS ${args_ARM_DEPS}
            CV_DEPS ${args_CV_DEPS}
            FPGA_DEPS ${args_FPGA_DEPS}
@@ -163,7 +170,7 @@ function(lite_cc_binary TARGET)
        set(options " -g ")
    endif()
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -177,6 +184,7 @@ function(lite_cc_binary TARGET)
            FPGA_DEPS ${args_FPGA_DEPS}
            NPU_DEPS ${args_NPU_DEPS}
            XPU_DEPS ${args_XPU_DEPS}
+	    BM_DEPS ${args_BM_DEPS}
            PROFILE_DEPS ${args_PROFILE_DEPS}
            LIGHT_DEPS ${args_LIGHT_DEPS}
            HVY_DEPS ${args_HVY_DEPS}
@@ -210,7 +218,7 @@ function(lite_cc_test TARGET)
    endif()
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
        LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
        ARGS
        COMPILE_LEVEL # (basic|extra)
@@ -232,6 +240,7 @@ function(lite_cc_test TARGET)
              FPGA_DEPS ${args_FPGA_DEPS}
              NPU_DEPS ${args_NPU_DEPS}
              XPU_DEPS ${args_XPU_DEPS}
+	      BM_DEPS ${args_BM_DEPS}
              PROFILE_DEPS ${args_PROFILE_DEPS}
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}
@@ -260,6 +269,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
+set(bm_kernels CACHE INTERNAL "bm kernels")
 set(opencl_kernels CACHE INTERNAL "opencl kernels")
 set(host_kernels CACHE INTERNAL "host kernels")

@@ -270,12 +280,12 @@ if(LITE_BUILD_TAILOR)
  file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
 endif()
 # add a kernel for some specific device
-# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA)
+# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
 # level: one of (basic, extra)
 function(add_kernel TARGET device level)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
        LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
        ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -341,6 +351,12 @@ function(add_kernel TARGET device level)
        endif()
        set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
    endif()
+    if ("${device}" STREQUAL "BM")
+        if (NOT LITE_WITH_BM)
+            return()
+        endif()
+        set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
+    endif()
    if ("${device}" STREQUAL "OPENCL")
        if (NOT LITE_WITH_OPENCL)
            return()
@@ -374,6 +390,7 @@ function(add_kernel TARGET device level)
              FPGA_DEPS ${args_FPGA_DEPS}
              NPU_DEPS ${args_NPU_DEPS}
              XPU_DEPS ${args_XPU_DEPS}
+	      BM_DEPS ${args_BM_DEPS}
              PROFILE_DEPS ${args_PROFILE_DEPS}
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}
@@ -392,7 +409,7 @@ endif()
 function(add_operator TARGET level)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
        LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
        ARGS)
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -424,6 +441,7 @@ function(add_operator TARGET level)
              FPGA_DEPS ${args_FPGA_DEPS}
              NPU_DEPS ${args_NPU_DEPS}
              XPU_DEPS ${args_XPU_DEPS}
+	      BM_DEPS ${args_BM_DEPS}
              PROFILE_DEPS ${args_PROFILE_DEPS}
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}

--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
 message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
 message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
 message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
+message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
 message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
 message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")

@@ -66,6 +67,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
    if (LITE_WITH_FPGA)
        set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga")
    endif(LITE_WITH_FPGA)
+    if (LITE_WITH_BM)
+        set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm")
+    endif(LITE_WITH_BM)
 else()
    set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
 endif()

--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -61,13 +61,19 @@ if (WITH_TESTING)
           ${ops} ${host_kernels}
      CUDA_DEPS ${cuda_kernels}
      X86_DEPS ${x86_kernels}
-      XPU_DEPS ${xpu_kernels})
+      XPU_DEPS ${xpu_kernels}
+      BM_DEPS ${bm_kernels})
 endif()
 if(LITE_WITH_FPGA)
    set(light_api_deps ${light_api_deps} ${fpga_deps})
    set(cxx_api_deps ${cxx_api_deps} ${fpga_deps})
 endif()

+if(LITE_WITH_BM)
+    set(light_api_deps ${light_api_deps} ${bm_deps})
+    set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
+endif()
+
 message(STATUS "get ops ${ops}")
 message(STATUS "get X86 kernels ${x86_kernels}")
 message(STATUS "get CUDA kernels ${cuda_kernels}")
@@ -76,6 +82,7 @@ message(STATUS "get ARM kernels ${arm_kernels}")
 message(STATUS "get NPU kernels ${npu_kernels}")
 message(STATUS "get XPU kernels ${xpu_kernels}")
 message(STATUS "get FPGA kernels ${fpga_kernels}")
+message(STATUS "get BM kernels ${bm_kernels}")

 # for full api
 if (NOT LITE_ON_TINY_PUBLISH)
@@ -90,6 +97,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
                        CV_DEPS paddle_cv_arm
                        NPU_DEPS ${npu_kernels}
                        XPU_DEPS ${xpu_kernels}
+                        BM_DEPS ${bm_kernels}
                        CL_DEPS ${opencl_kernels}
                        FPGA_DEPS ${fpga_kernels})
 endif()
@@ -111,7 +119,8 @@ lite_cc_library(light_api SRCS light_api.cc
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels})
+        FPGA_DEPS ${fpga_kernels}
+        BM_DEPS ${bm_kernels})

 include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -129,6 +138,7 @@ if(WITH_TESTING)
       XPU_DEPS ${xpu_kernels}
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels}
+       BM_DEPS ${bm_kernels}
       EXCLUDE_COMPILE_DEPS "ON"
       ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
            --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -164,6 +174,12 @@ if(WITH_TESTING)
           ${ops} ${host_kernels} ${x86_kernels}
           ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn)
        add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz)
+        if(LITE_WITH_BM)
+           lite_cc_test(test_resnet50_lite_bm SRCS test_resnet50_lite_bm.cc
+              DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
+              ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
+              ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+        endif()
    endif()
 endif()

@@ -254,6 +270,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
        DEPS light_api program mir_passes paddle_api_light
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
+        BM_DEPS ${bm_kernels}
        ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)

 lite_cc_test(test_apis SRCS apis_test.cc
@@ -262,6 +279,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
        X86_DEPS ${x86_kernels}
        XPU_DEPS ${xpu_kernels}
        FPGA_DEPS ${fpga_kernels}
+        BM_DEPS ${bm_kernels}
        ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
        --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)

@@ -293,6 +311,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
  CL_DEPS ${opencl_kernels}
  X86_DEPS ${x86_kernels}
  FPGA_DEPS ${fpga_kernels}
+  BM_DEPS ${bm_kernels}
  ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
 if (WITH_TESTING)
    add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
@@ -307,6 +326,7 @@ if(NOT IOS)
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
        CL_DEPS ${opencl_kernels}
+        BM_DEPS ${bm_kernels}
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
        CUDA_DEPS ${cuda_kernels})
@@ -328,6 +348,7 @@ if(NOT IOS)
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
        CL_DEPS ${opencl_kernels}
+	BM_DEPS ${bm_kernels}
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
        CUDA_DEPS ${cuda_kernels})

--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
@@ -55,7 +55,8 @@ const std::string& TargetToStr(TargetType target) {
                                              "any",
                                              "fpga",
                                              "npu",
-                                              "xpu"};
+                                              "xpu",
+                                              "bm"};
  auto x = static_cast<int>(target);
  CHECK_LT(x, static_cast<int>(TARGET(NUM)));
  return target2string[x];
@@ -94,7 +95,8 @@ const std::string& TargetRepr(TargetType target) {
                                              "kAny",
                                              "kFPGA",
                                              "kNPU",
-                                              "kXPU"};
+                                              "kXPU",
+                                              "kBM"};
  auto x = static_cast<int>(target);
  CHECK_LT(x, static_cast<int>(TARGET(NUM)));
  return target2string[x];
@@ -135,6 +137,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                               TARGET(kOpenCL),
                                               TARGET(kNPU),
                                               TARGET(kXPU),
+                                               TARGET(kBM),
                                               TARGET(kFPGA)});
  if (target == TARGET(kAny)) {
    return valid_set;

--- a/lite/api/paddle_place.h
+++ b/lite/api/paddle_place.h
@@ -52,8 +52,9 @@ enum class TargetType : int {
  kFPGA = 7,
  kNPU = 8,
  kXPU = 9,
+  kBM = 10,
  kAny = 6,  // any target
-  NUM = 10,  // number of fields.
+  NUM = 11,  // number of fields.
 };
 enum class PrecisionType : int {
  kUnk = 0,

--- a/lite/api/test_resnet50_lite_bm.cc
+++ b/lite/api/test_resnet50_lite_bm.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <fstream>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(input_img_txt_path,
+              "",
+              "if set input_img_txt_path, read the img filename as input.");
+
+namespace paddle {
+namespace lite {
+
+// Builds the model in FLAGS_model_dir for `valid_places` with the
+// "bm_subgraph_pass" pass, fills input 0 (1x3x224x224 floats) either with ones
+// or with the values read from FLAGS_input_img_txt_path, runs FLAGS_warmup
+// untimed and FLAGS_repeats timed iterations, logs the average latency,
+// checks that output 0 has shape 1x1000 and writes it to ./result.txt.
+void TestModel(const std::vector<Place>& valid_places) {
+  lite::Predictor predictor;
+  std::vector<std::string> passes;
+  passes.push_back("bm_subgraph_pass");
+  predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  if (FLAGS_input_img_txt_path.empty()) {
+    for (int i = 0; i < item_size; i++) {
+      data[i] = 1;
+    }
+  } else {
+    std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
+    if (!fs.is_open()) {
+      LOG(FATAL) << "open input_img_txt error.";
+    }
+    for (int i = 0; i < item_size; i++) {
+      // A short or malformed file would otherwise leave the remaining input
+      // elements unfilled and the model would run on meaningless data.
+      if (!(fs >> data[i])) {
+        LOG(FATAL) << "read input_img_txt error at element " << i << ".";
+      }
+    }
+  }
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor.Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor.Run();
+  }
+  // Sample the clock right after the timed loop so that the logging below is
+  // not counted as inference time.
+  auto elapsed_us = GetCurrentUS() - start;
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << elapsed_us / FLAGS_repeats / 1000.0
+            << " ms in average.";
+
+  auto* out = predictor.GetOutput(0);
+  ASSERT_EQ(out->dims().size(), 2);
+  ASSERT_EQ(out->dims()[0], 1);
+  ASSERT_EQ(out->dims()[1], 1000);
+
+  auto* out_data = out->data<float>();
+  FILE* fp = fopen("result.txt", "wb");
+  // Passing a null FILE* to fprintf/fclose is undefined behavior; fail the
+  // test cleanly when the result file cannot be created.
+  ASSERT_TRUE(fp != nullptr) << "failed to open result.txt for writing";
+  for (int i = 0; i < out->numel(); i++) {
+    fprintf(fp, "%f\n", out_data[i]);
+  }
+  fclose(fp);
+}
+
+// Runs TestModel with two valid places, both in float precision: BM listed
+// first, X86 second.
+TEST(ResNet50, test_bm) {
+  std::vector<Place> valid_places;
+  valid_places.push_back(Place{TARGET(kBM), PRECISION(kFloat)});
+  valid_places.push_back(Place{TARGET(kX86), PRECISION(kFloat)});
+
+  TestModel(valid_places);
+}
+
+}  // namespace lite
+}  // namespace paddle
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -6,3 +6,4 @@ add_subdirectory(fpga)
 add_subdirectory(host)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(bm)
--- a/lite/backends/bm/CMakeLists.txt
+++ b/lite/backends/bm/CMakeLists.txt
+# The BM target wrapper is only built when the BM backend is enabled; it links
+# against the imported SDK targets listed in bm_runtime_libs.
+if (LITE_WITH_BM)
+    lite_cc_library(target_wrapper_bm
+                    SRCS target_wrapper.cc
+                    DEPS ${bm_runtime_libs})
+endif()
--- a/lite/backends/bm/target_wrapper.cc
+++ b/lite/backends/bm/target_wrapper.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/backends/bm/target_wrapper.h"
+#include <bmcompiler_if.h>
+#include <bmlib_runtime.h>
+#include <utility>
+
+namespace paddle {
+namespace lite {
+
+int TargetWrapperBM::device_id_ = 0;
+std::map<int, void*> TargetWrapperBM::bm_hds_;
+
+// Returns the number of BM devices reported by bmlib, or 0 when the query
+// fails or reports a negative count (which would otherwise wrap around to a
+// huge size_t).
+size_t TargetWrapperBM::num_devices() {
+  int count = 0;
+  bm_status_t ret = bm_dev_getcount(&count);
+  if (ret != BM_SUCCESS || count < 0) {
+    return 0;
+  }
+  return static_cast<size_t>(count);
+}
+
+// Makes `id` the current device. The first time a given id is selected, a
+// bmlib handle is requested for it and cached in bm_hds_; a failed request
+// trips the CHECK below.
+void TargetWrapperBM::SetDevice(int id) {
+  // Range validation of `id` is currently disabled:
+  //   if (id < 0 || (size_t)id >= num_devices()) {
+  //     LOG(FATAL) << "Failed with invalid device id " << id;
+  //   }
+  device_id_ = id;
+  if (bm_hds_.count(id) != 0) {
+    return;  // a handle for this device is already cached
+  }
+
+  bm_handle_t bm_handle;
+  bm_status_t ret = bm_dev_request(&bm_handle, id);
+  CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
+                            << static_cast<int>(ret);
+  bm_hds_.insert(std::make_pair(id, static_cast<void*>(bm_handle)));
+}
+
+void* TargetWrapperBM::GetHandle() {
+  if (bm_hds_.find(device_id_) == bm_hds_.end()) {
+    LOG(FATAL) << "device not initialized " << device_id_;
+  }
+  return bm_hds_.at(device_id_);
+}
+
+// Allocates `size` bytes of device memory on the current device and returns a
+// heap-allocated bm_device_mem_t descriptor for it as an opaque pointer. The
+// descriptor (not a raw device address) is what Free and MemcpySync expect.
+// The current device is initialized on demand. Allocation failures are fatal.
+void* TargetWrapperBM::Malloc(size_t size) {
+  if (bm_hds_.find(device_id_) == bm_hds_.end()) {
+    SetDevice(device_id_);
+  }
+  bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
+
+  // Host-side descriptor; released with free() in TargetWrapperBM::Free.
+  bm_device_mem_t* p_mem =
+      static_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t)));
+  if (p_mem == nullptr) {
+    LOG(FATAL) << "Failed to allocate host descriptor for BM device memory";
+    return nullptr;
+  }
+
+  bm_status_t ret = bm_malloc_device_byte(bm_handle, p_mem, size);
+  if (ret != BM_SUCCESS) {
+    // Do not hand out a descriptor that refers to no device memory.
+    free(p_mem);
+    LOG(FATAL) << "Failed to allocate " << size << " bytes on BM device "
+               << device_id_
+               << ", error code: " << static_cast<int>(ret);
+    return nullptr;
+  }
+  return p_mem;
+}
+
+// Releases a buffer obtained from Malloc: frees the device memory through the
+// current device's handle, then the host-side descriptor. A null pointer is
+// ignored.
+void TargetWrapperBM::Free(void* ptr) {
+  if (ptr == NULL) {
+    return;
+  }
+  bm_device_mem_t* mem = static_cast<bm_device_mem_t*>(ptr);
+  bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
+  bm_free_device(bm_handle, *mem);
+  free(ptr);
+}
+
+// Synchronously copies `size` bytes between host memory and a BM device buffer
+// using the current device's handle.
+//   HtoD: `dst` is a bm_device_mem_t descriptor (as returned by Malloc),
+//         `src` is host memory.
+//   DtoH: `dst` is host memory, `src` is a bm_device_mem_t descriptor.
+// Any other direction is reported with LOG(FATAL). A copy that bmlib reports
+// as failed trips the CHECK at the end instead of passing unnoticed.
+void TargetWrapperBM::MemcpySync(void* dst,
+                                 const void* src,
+                                 size_t size,
+                                 IoDirection dir) {
+  // NOTE(review): the copy is skipped without any diagnostic when no handle
+  // exists for the current device - confirm that callers rely on this.
+  if (bm_hds_.find(device_id_) == bm_hds_.end()) {
+    return;
+  }
+
+  bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
+  bm_status_t ret = BM_SUCCESS;
+
+  switch (dir) {
+    case IoDirection::HtoD: {
+      bm_device_mem_t* dev_dst = static_cast<bm_device_mem_t*>(dst);
+      ret = bm_memcpy_s2d_partial_offset(
+          bm_handle, *dev_dst, const_cast<void*>(src), size, 0);
+      break;
+    }
+    case IoDirection::DtoH: {
+      const bm_device_mem_t* dev_src =
+          static_cast<const bm_device_mem_t*>(src);
+      ret = bm_memcpy_d2s_partial_offset(bm_handle, dst, *dev_src, size, 0);
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
+      break;
+  }
+  CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
+                            << static_cast<int>(ret);
+}
+
+}  // namespace lite
+}  // namespace paddle
--- a/lite/backends/bm/target_wrapper.h
+++ b/lite/backends/bm/target_wrapper.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+
+using TargetWrapperBM = TargetWrapper<TARGET(kBM)>;
+
+// TargetWrapper specialization for the BM target. Device selection, memory
+// management and the synchronous copy are declared here and defined out of
+// line; the stream, event, async-copy and memset entry points have empty
+// inline bodies.
+template <>
+class TargetWrapper<TARGET(kBM)> {
+ public:
+  using stream_t = int;
+  using event_t = int;
+
+  // ---- devices ----
+  static size_t num_devices();
+  static size_t maximum_stream() { return 0; }
+
+  static void SetDevice(int id);
+
+  // ---- streams / events: empty bodies ----
+  static void CreateStream(stream_t* /*stream*/) {}
+  static void DestroyStream(const stream_t& /*stream*/) {}
+
+  static void CreateEvent(event_t* /*event*/) {}
+  static void DestroyEvent(const event_t& /*event*/) {}
+
+  static void RecordEvent(const event_t& /*event*/) {}
+  static void SyncEvent(const event_t& /*event*/) {}
+
+  static void StreamSync(const stream_t& /*stream*/) {}
+
+  // ---- memory ----
+  static void* Malloc(size_t size);
+  static void Free(void* ptr);
+
+  static void* GetHandle();
+
+  static void MemcpySync(void* dst,
+                         const void* src,
+                         size_t size,
+                         IoDirection dir);
+
+  // Empty body: no copy is performed.
+  static void MemcpyAsync(void* /*dst*/,
+                          const void* /*src*/,
+                          size_t /*size*/,
+                          IoDirection /*dir*/,
+                          const stream_t& /*stream*/) {}
+
+  // Empty bodies: no memory is written.
+  static void MemsetSync(void* /*devPtr*/, int /*value*/, size_t /*count*/) {}
+
+  static void MemsetAsync(void* /*devPtr*/,
+                          int /*value*/,
+                          size_t /*count*/,
+                          const stream_t& /*stream*/) {}
+
+ private:
+  // Id most recently passed to SetDevice (initially 0).
+  static int device_id_;
+  // Device id -> handle obtained for that device, stored as void*.
+  static std::map<int, void*> bm_hds_;
+};
+}  // namespace lite
+}  // namespace paddle
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -6,7 +6,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
  X86_DEPS target_wrapper_x86
  CUDA_DEPS target_wrapper_cuda
  CL_DEPS cl_target_wrapper
-  FPGA_DEPS fpga_target_wrapper)
+  FPGA_DEPS fpga_target_wrapper
+  BM_DEPS target_wrapper_bm)

 lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)


--- a/lite/core/arena/CMakeLists.txt
+++ b/lite/core/arena/CMakeLists.txt
@@ -6,5 +6,5 @@ endif()
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)

 if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -55,6 +55,7 @@ using NPUContext = Context<TargetType::kNPU>;
 using XPUContext = Context<TargetType::kXPU>;
 using OpenCLContext = Context<TargetType::kOpenCL>;
 using FPGAContext = Context<TargetType::kFPGA>;
+using BMContext = Context<TargetType::kBM>;

 template <>
 class Context<TargetType::kHost> {
@@ -82,6 +83,23 @@ class Context<TargetType::kNPU> {
 };
 #endif

+#ifdef LITE_WITH_BM
+template <>
+class Context<TargetType::kBM> {
+ public:
+  Context() {}
+  explicit Context(const BMContext& ctx);
+
+  // Selects BM device `dev_id` through TargetWrapperBM.
+  void Init(int dev_id) { TargetWrapperBM::SetDevice(dev_id); }
+  // NOTE: InitOnce should only be used by ContextScheduler
+  void InitOnce() { Init(0); }
+  void CopySharedTo(BMContext* ctx) {}  // empty body: nothing is copied
+  void* GetHandle() { return TargetWrapperBM::GetHandle(); }
+  std::string name() const { return "BMContext"; }
+};
+#endif
+
 #ifdef LITE_WITH_XPU
 template <>
 class Context<TargetType::kXPU> {
@@ -374,6 +392,12 @@ class ContextScheduler {
        kernel_contexts_[TargetType::kFPGA].As<FPGAContext>().CopySharedTo(
            &ctx->As<FPGAContext>());
        break;
+#endif
+#ifdef LITE_WITH_BM
+      case TARGET(kBM):
+        kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
+            &ctx->As<BMContext>());
+        break;
 #endif
      default:
 #ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
@@ -412,6 +436,9 @@ class ContextScheduler {
 #endif
 #ifdef LITE_WITH_XPU
    InitContext<TargetType::kXPU, XPUContext>();
+#endif
+#ifdef LITE_WITH_BM
+    InitContext<TargetType::kBM, BMContext>();
 #endif
  }


--- a/lite/core/memory.cc
+++ b/lite/core/memory.cc
@@ -40,6 +40,11 @@ void* TargetMalloc(TargetType target, size_t size) {
      data = TargetWrapper<TARGET(kFPGA)>::Malloc(size);
      break;
 #endif  // LITE_WITH_OPENCL
+#ifdef LITE_WITH_BM
+    case TargetType::kBM:
+      data = TargetWrapper<TARGET(kBM)>::Malloc(size);
+      break;
+#endif
    default:
      LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
  }
@@ -69,6 +74,11 @@ void TargetFree(TargetType target, void* data) {
      TargetWrapper<TARGET(kFPGA)>::Free(data);
      break;
 #endif  // LITE_WITH_CUDA
+#ifdef LITE_WITH_BM
+    case TargetType::kBM:
+      TargetWrapper<TARGET(kBM)>::Free(data);
+      break;
+#endif
    default:
      LOG(FATAL) << "Unknown type";
  }
@@ -95,6 +105,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
          dst, src, size, IoDirection::DtoD);
      break;
 #endif
+#ifdef LITE_WITH_BM
+    case TargetType::kBM:
+      TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
+      break;
+#endif
 #ifdef LITE_WITH_OPENCL
    case TargetType::kOpenCL:
      TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);

--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -25,6 +25,10 @@
 #include "lite/backends/cuda/target_wrapper.h"
 #endif  // LITE_WITH_CUDA

+#ifdef LITE_WITH_BM
+#include "lite/backends/bm/target_wrapper.h"
+#endif  // LITE_WITH_BM
+
 namespace paddle {
 namespace lite {

@@ -71,6 +75,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
    case TARGET(kFPGA):
      TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
      break;
+#endif
+#ifdef LITE_WITH_BM
+    case TARGET(kBM):
+      TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
+      break;
 #endif
  }
 }

--- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc
@@ -46,4 +46,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {

 REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass)
    .BindTargets({TARGET(kAny)})
-    .ExcludeTargets({TARGET(kX86), TARGET(kXPU)});
+    .ExcludeTargets({TARGET(kX86), TARGET(kXPU), TARGET(kBM)});
--- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc
@@ -47,4 +47,4 @@ void ConvElementwiseFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass,
                  paddle::lite::mir::ConvElementwiseFusePass)
    .BindTargets({TARGET(kAny)})
-    .ExcludeTargets({TARGET(kXPU)});
+    .ExcludeTargets({TARGET(kXPU), TARGET(kBM)});
--- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc
+++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc
@@ -36,5 +36,6 @@ REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass,
                  paddle::lite::mir::ElementwiseAddActivationFusePass)
    .BindTargets({TARGET(kAny)})
    .ExcludeTargets({TARGET(kXPU)})
+    .ExcludeTargets({TARGET(kBM)})
    .ExcludeTargets({TARGET(kX86)})
    .BindKernel("fusion_elementwise_add_activation");
--- a/lite/core/mir/fusion/fc_fuse_pass.cc
+++ b/lite/core/mir/fusion/fc_fuse_pass.cc
@@ -39,5 +39,6 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
    .BindTargets({TARGET(kAny)})
    .ExcludeTargets({TARGET(kXPU)})
+    .ExcludeTargets({TARGET(kBM)})
    .ExcludeTargets({TARGET(kCUDA)})
    .BindKernel("fc");
--- a/lite/core/mir/memory_optimize_pass.cc
+++ b/lite/core/mir/memory_optimize_pass.cc
@@ -256,4 +256,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {

 REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
    .BindTargets({TARGET(kARM)})
-    .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU)});
+    .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), TARGET(kBM)});
--- a/lite/core/mir/subgraph/subgraph_pass.cc
+++ b/lite/core/mir/subgraph/subgraph_pass.cc
@@ -53,6 +53,20 @@ void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  fuser();
 }

+void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  // Collect the op types that have a BM bridge listed in the header below.
+  std::unordered_set<std::string> bridged_ops;
+#define USE_SUBGRAPH_BRIDGE(op_type, target) bridged_ops.insert(#op_type);
+#include "lite/kernels/bm/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    // Only statement nodes whose op type is bridged may be fused.
+    return node->IsStmt() && bridged_ops.count(node->AsStmt().op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
@@ -61,3 +75,5 @@ REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
    .BindTargets({TARGET(kNPU)});
 REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
    .BindTargets({TARGET(kXPU)});
+REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
+    .BindTargets({TARGET(kBM)});
--- a/lite/core/mir/subgraph/subgraph_pass.h
+++ b/lite/core/mir/subgraph/subgraph_pass.h
@@ -32,6 +32,11 @@ class XPUSubgraphPass : public ProgramPass {
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
 };

+class BMSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
--- a/lite/core/op_registry.cc
+++ b/lite/core/op_registry.cc
@@ -100,6 +100,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
    case TARGET(kFPGA): {
      CREATE_KERNEL(kFPGA);
    } break;
+    case TARGET(kBM): {
+      CREATE_KERNEL(kBM);
+    } break;
    default:
      CHECK(false) << "not supported kernel target " << TargetToStr(target);
  }
@@ -186,6 +189,11 @@ KernelRegistry::KernelRegistry()
  INIT_FOR(kFPGA, kFloat, kNHWC);
  INIT_FOR(kFPGA, kAny, kNHWC);
  INIT_FOR(kFPGA, kAny, kAny);
+
+  INIT_FOR(kBM, kFloat, kNCHW);
+  INIT_FOR(kBM, kInt8, kNCHW);
+  INIT_FOR(kBM, kAny, kNCHW);
+  INIT_FOR(kBM, kAny, kAny);
 #undef INIT_FOR
 }


--- a/lite/core/op_registry.h
+++ b/lite/core/op_registry.h
@@ -230,6 +230,16 @@ class KernelRegistry final {
                                      PRECISION(kInt8),
                                      DATALAYOUT(kNCHW)> *,  //

+              KernelRegistryForTarget<TARGET(kBM),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny)> *,  //
+              KernelRegistryForTarget<TARGET(kBM),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kBM),
+                                      PRECISION(kInt8),
+                                      DATALAYOUT(kNCHW)> *,  //
+
              KernelRegistryForTarget<TARGET(kFPGA),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW)> *,  //

--- a/lite/kernels/CMakeLists.txt
+++ b/lite/kernels/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory(opencl)
 add_subdirectory(fpga)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(bm)
--- a/lite/kernels/bm/CMakeLists.txt
+++ b/lite/kernels/bm/CMakeLists.txt
+# The BM kernels are only built when the BM backend is enabled.
+if(NOT LITE_WITH_BM)
+  return()
+endif()
+
+# The bridges directory defines bm_subgraph_bridges, which is consumed below.
+add_subdirectory(bridges)
+add_kernel(subgraph_compute_bm BM basic
+  SRCS subgraph_compute.cc
+  DEPS ${lite_kernel_deps} ${bm_subgraph_bridges})
--- a/lite/kernels/bm/bridges/CMakeLists.txt
+++ b/lite/kernels/bm/bridges/CMakeLists.txt
+# Bridges that convert Paddle ops into BM compiler layers. Built only when the
+# BM backend is enabled.
+if(NOT LITE_WITH_BM)
+  return()
+endif()
+
+# Helpers shared by every op bridge.
+lite_cc_library(subgraph_bridge_utility_bm SRCS utility.cc DEPS)
+lite_cc_library(subgraph_bridge_graph_bm SRCS graph.cc DEPS subgraph_bridge_utility_bm)
+
+# Common dependency list for the per-op bridge libraries below.
+set(bm_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_bm subgraph_bridge_graph_bm)
+
+# Every bridge must use ${bm_subgraph_bridge_deps}; a misspelled variable name
+# expands to nothing and silently drops the dependencies.
+lite_cc_library(subgraph_bridge_act_op_bm SRCS act_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_conv_op_bm SRCS conv_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_elementwise_ops_bm SRCS elementwise_ops.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_pool_op_bm SRCS pool_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_softmax_op_bm SRCS softmax_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_mul_op_bm SRCS mul_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_batch_norm_op_bm SRCS batch_norm_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_scale_op_bm SRCS scale_op.cc DEPS ${bm_subgraph_bridge_deps})
+
+# Exported through the cache so that lite/kernels/bm/CMakeLists.txt can list
+# the bridges as dependencies of the subgraph kernel.
+set(bm_subgraph_bridges
+        subgraph_bridge_registry
+        subgraph_bridge_engine
+        subgraph_bridge_graph_bm
+        subgraph_bridge_act_op_bm
+        subgraph_bridge_conv_op_bm
+        subgraph_bridge_elementwise_ops_bm
+        subgraph_bridge_pool_op_bm
+        subgraph_bridge_softmax_op_bm
+        subgraph_bridge_mul_op_bm
+        subgraph_bridge_batch_norm_op_bm
+        subgraph_bridge_scale_op_bm
+        CACHE INTERNAL "bm_subgraph_bridges")
--- a/lite/kernels/bm/bridges/act_op.cc
+++ b/lite/kernels/bm/bridges/act_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Converts a Paddle `relu` op into a BM compiler relu layer.
+//
+// ctx    - the subgraph::bm::Graph being built (passed as void*).
+// op     - the op to convert; its scope and op_info supply the tensors.
+// kernel - not used by this converter.
+// Returns SUCCESS once the layer is added and the output name is recorded.
+int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  // Input tensor.
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  // Output tensor.
+  auto output_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  // The BM compiler takes shapes as int arrays, so narrow the int64 dims.
+  std::vector<int32_t> in_shape(x_dims.size());
+  for (size_t d = 0; d < in_shape.size(); ++d) {
+    in_shape[d] = static_cast<int>(x_dims[d]);
+  }
+  std::vector<int32_t> out_shape(output_dims.size());
+  for (size_t d = 0; d < out_shape.size(); ++d) {
+    out_shape[d] = static_cast<int>(output_dims[d]);
+  }
+  // Only relu is registered for this converter.
+  CHECK_EQ(op_type, "relu");
+  const int* in_shape_ptr = &in_shape[0];
+  const int* out_shape_ptr = &out_shape[0];
+  add_relu_layer(graph->GetCompilerHandle(),
+                 in_shape_ptr,
+                 x_dims.size(),
+                 x_var_name.c_str(),
+                 out_shape_ptr,
+                 output_dims.size(),
+                 output_var_name.c_str(),
+                 0.f,
+                 -1.f);
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(relu, kBM, paddle::lite::subgraph::bm::ActConverter);
--- a/lite/kernels/bm/bridges/batch_norm_op.cc
+++ b/lite/kernels/bm/bridges/batch_norm_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <bmcompiler_if.h>
+#include <cmath>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Converts a Paddle `batch_norm` op into a BM compiler scale layer.
+//
+// The normalization is folded into a per-channel affine transform:
+//   scale' = scale / sqrt(variance + epsilon)
+//   bias'  = bias - scale' * mean
+// and the folded values are handed to add_scale_layer().
+//
+// ctx    - the subgraph::bm::Graph being built (passed as void*).
+// op     - the op to convert; its scope and op_info supply the tensors.
+// kernel - not used by this converter.
+// Returns SUCCESS once the layer is added and the output name is recorded.
+int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
+  // input
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  // The channel count is read from dim 1, so the input needs at least 2 dims.
+  CHECK_GE(x_dims.size(), 2);
+  std::vector<int32_t> i_x_shape_data(x_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int>(x_dims[i]);
+  }
+  int channel_size = static_cast<int>(x_dims[1]);
+  auto scale_var_name = op_info->Input("Scale").front();
+  auto scale = scope->FindVar(scale_var_name)->GetMutable<lite::Tensor>();
+  auto bias_var_name = op_info->Input("Bias").front();
+  auto bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
+  auto mean_var_name = op_info->Input("Mean").front();
+  auto mean = scope->FindVar(mean_var_name)->GetMutable<lite::Tensor>();
+  auto variance_var_name = op_info->Input("Variance").front();
+  auto variance = scope->FindVar(variance_var_name)->GetMutable<lite::Tensor>();
+  // output
+  auto output_var_name = op_info->Output("Y").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int>(output_dims[i]);
+  }
+  auto epsilon = op_info->GetAttr<float>("epsilon");
+  auto* scale_data = scale->mutable_data<float>();
+  auto* bias_data = bias->mutable_data<float>();
+  auto* mean_data = mean->mutable_data<float>();
+  auto* variance_data = variance->mutable_data<float>();
+  // Fold mean/variance into scale and bias.
+  // NOTE(review): the fold overwrites the Scale and Bias tensors in the scope,
+  // so converting the same op twice would fold twice - confirm that each op
+  // is converted only once, or fold into separate buffers.
+  for (int c = 0; c < channel_size; c++) {
+    float inv_scale = 1.f / (std::sqrt(variance_data[c] + epsilon));
+    bias_data[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c];
+    scale_data[c] = inv_scale * scale_data[c];
+  }
+
+  // Single-input argument arrays for add_scale_layer(). Stack storage is used
+  // so nothing has to be released on any exit path.
+  const int input_num = 1;
+  int* shape[1];
+  int dim[1];
+  const char* name[1];
+  name[0] = x_var_name.c_str();
+  dim[0] = static_cast<int>(x_dims.size());
+  shape[0] = &i_x_shape_data[0];
+  add_scale_layer(graph->GetCompilerHandle(),
+                  input_num,
+                  shape,
+                  dim,
+                  name,
+                  const_cast<const int*>(&i_output_shape_data[0]),
+                  output_dims.size(),
+                  static_cast<const char*>(output_var_name.c_str()),
+                  static_cast<const char*>(unique_op_name.c_str()),
+                  static_cast<const float*>(scale_data),
+                  static_cast<const float*>(bias_data),
+                  1,
+                  1,
+                  1);
+
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(batch_norm,
+                         kBM,
+                         paddle::lite::subgraph::bm::BatchNormConverter);
--- a/lite/kernels/bm/bridges/conv_op.cc
+++ b/lite/kernels/bm/bridges/conv_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/conv_op.h"
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Converts a Paddle `conv2d` op into a BM compiler convolution layer.
+//
+// Input, output and filter must all be 4-D. The optional "Bias" input is
+// forwarded when present. The "paddings" attribute may hold either two values
+// (one per spatial axis, applied to both sides) or four values (one per side).
+//
+// ctx    - the subgraph::bm::Graph being built (passed as void*).
+// op     - the op to convert; its scope and op_info supply the tensors.
+// kernel - not used by this converter.
+// Returns SUCCESS once the layer is added and the output name is recorded.
+int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
+  auto input_var_name = op_info->Input("Input").front();
+  auto input = scope->FindVar(input_var_name)->GetMutable<lite::Tensor>();
+  auto input_dims = input->dims();
+  auto output_var_name = op_info->Output("Output").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  auto filter_var_name = op_info->Input("Filter").front();
+  auto filter = scope->FindVar(filter_var_name)->GetMutable<lite::Tensor>();
+  auto filter_dims = filter->dims();
+  CHECK_EQ(input_dims.size(), 4);
+  CHECK_EQ(output_dims.size(), 4);
+  CHECK_EQ(filter_dims.size(), 4);
+  bool has_bias = lite::subgraph::bm::HasInputArg(op_info, scope, "Bias");
+  float* bias_data = nullptr;
+  if (has_bias) {
+    auto bias_var_name = op_info->Input("Bias").front();
+    auto* bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
+    bias_data = static_cast<float*>(bias->mutable_data<float>());
+  }
+  // The BM compiler takes shapes as int arrays, so narrow the int64 dims.
+  std::vector<int32_t> i_input_shape_data(input_dims.size());
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < input_dims.size(); i++) {
+    i_input_shape_data[i] = static_cast<int>(input_dims[i]);
+  }
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int>(output_dims[i]);
+  }
+  const float* filter_data =
+      const_cast<const float*>(filter->mutable_data<float>());
+  auto groups = op_info->GetAttr<int>("groups");
+  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  auto strides = op_info->GetAttr<std::vector<int>>("strides");
+  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+  // Validate the attribute lengths before indexing into them.
+  CHECK_GE(strides.size(), 2);
+  CHECK_GE(dilations.size(), 2);
+  CHECK(paddings.size() == 2 || paddings.size() == 4)
+      << "conv2d paddings must have 2 or 4 elements, got " << paddings.size();
+  // Two-element paddings are {h, w}, each used for both sides of its axis.
+  // Four-element paddings are taken as one value per side, in the order the
+  // layer expects them.
+  // NOTE(review): the per-side order is assumed to be
+  // {h_begin, h_end, w_begin, w_end} - confirm against the BM compiler API.
+  const bool per_side_pads = paddings.size() == 4;
+  const int pad_h_begin = paddings[0];
+  const int pad_h_end = per_side_pads ? paddings[1] : paddings[0];
+  const int pad_w_begin = per_side_pads ? paddings[2] : paddings[1];
+  const int pad_w_end = per_side_pads ? paddings[3] : paddings[1];
+
+  add_conv_layer(graph->GetCompilerHandle(),
+                 const_cast<const int*>(&i_input_shape_data[0]),
+                 input_dims.size(),
+                 static_cast<const char*>(input_var_name.c_str()),
+                 const_cast<const int*>(&i_output_shape_data[0]),
+                 output_dims.size(),
+                 static_cast<const char*>(output_var_name.c_str()),
+                 static_cast<const char*>(unique_op_name.c_str()),
+                 filter_data,
+                 bias_data,
+                 static_cast<int>(filter_dims[2]),
+                 static_cast<int>(filter_dims[3]),
+                 groups,
+                 pad_h_begin,
+                 pad_h_end,
+                 pad_w_begin,
+                 pad_w_end,
+                 strides[0],
+                 strides[1],
+                 dilations[0],
+                 dilations[1],
+                 static_cast<int>(has_bias));
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(conv2d,
+                         kBM,
+                         paddle::lite::subgraph::bm::ConvConverter);
--- a/lite/kernels/bm/bridges/elementwise_ops.cc
+++ b/lite/kernels/bm/bridges/elementwise_ops.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_defs.h>
+#include <bmcompiler_if.h>
+#include <bmcompiler_if_lite.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Converts a Paddle elementwise op into a BM compiler layer.
+//
+// When Y was produced by an earlier node of this graph, an eltwise layer is
+// added (elementwise_mul / elementwise_add / elementwise_sub). Otherwise Y is
+// treated as a constant: it is registered as a const tensor and combined with
+// X through a binary layer; only elementwise_add is accepted on that path.
+//
+// ctx    - the subgraph::bm::Graph being built (passed as void*).
+// op     - the op to convert; its scope and op_info supply the tensors.
+// kernel - not used by this converter.
+// Returns SUCCESS once the layer is added and the output name is recorded.
+int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  // input
+  // Per-input argument arrays (index 0 is X, index 1 is Y). Stack storage is
+  // used so nothing has to be released on any exit path.
+  const int input_num = 2;
+  int* shape[2];
+  int dim[2];
+  const char* name[2];
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  name[0] = static_cast<const char*>(x_var_name.c_str());
+  dim[0] = static_cast<int>(x_dims.size());
+  std::vector<int32_t> i_x_shape_data(x_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int>(x_dims[i]);
+  }
+  shape[0] = &i_x_shape_data[0];
+  auto y_var_name = op_info->Input("Y").front();
+  auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
+  auto y_dims = y->dims();
+  name[1] = static_cast<const char*>(y_var_name.c_str());
+  dim[1] = static_cast<int>(y_dims.size());
+  std::vector<int32_t> i_y_shape_data(y_dims.size());
+  for (size_t i = 0; i < y_dims.size(); i++) {
+    i_y_shape_data[i] = static_cast<int>(y_dims[i]);
+  }
+  shape[1] = &i_y_shape_data[0];
+  // Y is a constant when no earlier node of this graph produced it.
+  bool y_is_const = !graph->HasNode(y_var_name);
+  // output
+  auto output_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int>(output_dims[i]);
+  }
+  if (y_is_const) {
+    CHECK_EQ(op_type, "elementwise_add");
+  }
+  // Eltwise layer op code and per-input coefficients; subtraction is
+  // expressed as an addition with a -1 coefficient on Y.
+  int op_code{-1};
+  float coeff[2] = {1.f, 1.f};
+  if (op_type == "elementwise_mul") {
+    op_code = 0;
+  } else if (op_type == "elementwise_add") {
+    op_code = 1;
+  } else if (op_type == "elementwise_sub") {
+    op_code = 1;
+    coeff[1] = -1.f;
+  } else {
+    LOG(FATAL) << "UNSUPPORTED ELTWISE OPERATION: " << op_type;
+  }
+  if (!y_is_const) {
+    add_eltwise_layer(graph->GetCompilerHandle(),
+                      input_num,
+                      shape,
+                      dim,
+                      name,
+                      const_cast<const int*>(&i_output_shape_data[0]),
+                      output_dims.size(),
+                      static_cast<const char*>(output_var_name.c_str()),
+                      op_code,
+                      coeff);
+  } else {
+    const float* y_data = const_cast<const float*>(y->mutable_data<float>());
+    const float* x_data = const_cast<const float*>(x->mutable_data<float>());
+    // Register Y with its own shape and rank; describing it with X's shape
+    // would make the compiler read past the end of Y when Y is smaller.
+    bm_add_const_tensor(graph->GetCompilerHandle(),
+                        name[1],
+                        shape[1],
+                        dim[1],
+                        static_cast<bm_data_type_t>(DTYPE_FP32),
+                        static_cast<const void*>(y_data));
+
+    // NOTE(review): the trailing binary op code is hard-coded to 0 -
+    // presumably "add", matching the CHECK above; confirm against the SDK.
+    add_binary_layer_v2(graph->GetCompilerHandle(),
+                        name[0],
+                        shape[0],
+                        dim[0],
+                        0,
+                        static_cast<const float*>(x_data),
+                        name[1],
+                        shape[1],
+                        dim[1],
+                        0,
+                        static_cast<const float*>(y_data),
+                        static_cast<const char*>(output_var_name.c_str()),
+                        0);
+  }
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
+                         kBM,
+                         paddle::lite::subgraph::bm::ElementwiseConverter);
--- a/lite/kernels/bm/bridges/graph.cc
+++ b/lite/kernels/bm/bridges/graph.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/bm/bridges/graph.h"
+#include <bmcompiler_if.h>
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Records `name` in the node table; an already-present name is left as is.
+void Graph::AddNode(const std::string& name) {
+  nodes_.emplace(name, name);
+}
+
+// Creates the BM compiler handle used by the op converters and stores it in
+// compiler_handle_. The chip name is fixed to "BM1684". Fails the CHECK when
+// the compiler returns a null handle.
+// NOTE(review): no release of a previously created handle is visible here -
+// confirm this is called at most once per Graph.
+void Graph::CreateCompilerHandle() {
+  compiler_handle_ = create_bmcompiler("BM1684");
+  CHECK(compiler_handle_ != nullptr);
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/bm/bridges/graph.h
+++ b/lite/kernels/bm/bridges/graph.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Graph to collect all of converted BM IR nodes
+// Graph to collect all of converted BM IR nodes
+class Graph {
+ public:
+  // Records `name` as the output of a converted node.
+  void AddNode(const std::string& name);
+  // Returns true if `name` was previously recorded through AddNode().
+  bool HasNode(const std::string& name) const {
+    return nodes_.find(name) != nodes_.end();
+  }
+  // Creates the BM compiler handle; must be called before converters use
+  // GetCompilerHandle().
+  void CreateCompilerHandle();
+  // Returns the compiler handle, or nullptr if none has been created yet.
+  void* GetCompilerHandle() { return compiler_handle_; }
+
+ private:
+  // Names of the tensors produced by converted nodes (key and value are the
+  // same name).
+  std::unordered_map<std::string, std::string> nodes_;
+  // Initialized to nullptr so that a missing CreateCompilerHandle() call is
+  // detectable and never yields an indeterminate pointer.
+  void* compiler_handle_{nullptr};
+};
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/bm/bridges/mul_op.cc
+++ b/lite/kernels/bm/bridges/mul_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Converts a Paddle `mul` op into a BM reshape layer followed by an fc layer.
+//
+// X is first reshaped to 2-D as {x_dims[0], product of the remaining dims}
+// (reshape parameter {0, -1}); the result is then multiplied by Y, which is
+// passed to the fc layer as its weight data.
+//
+// ctx    - the subgraph::bm::Graph being built (passed as void*).
+// op     - the op to convert; its scope and op_info supply the tensors.
+// kernel - not used by this converter.
+// Returns SUCCESS once both layers are added and the output name is recorded.
+int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
+  // only support y is const
+  // input
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  // Two dims are read below, so X needs at least rank 2.
+  CHECK_GE(x_dims.size(), 2);
+  std::vector<int> i_x_shape_data(x_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int>(x_dims[i]);
+  }
+  // add reshape layer
+  // The reshape parameter {0, -1} keeps dim 0 and folds everything else into
+  // dim 1, so the declared output shape must carry the product of dims 1..n,
+  // not just x_dims[1].
+  int flattened_cols = 1;
+  for (size_t i = 1; i < x_dims.size(); i++) {
+    flattened_cols *= i_x_shape_data[i];
+  }
+  int i_x_reshape_shape_data[2] = {i_x_shape_data[0], flattened_cols};
+  int reshape_param[] = {0, -1};
+  auto unique_op_reshape_name =
+      lite::subgraph::bm::UniqueName(op_type + "_reshape");
+  add_reshape_layer(graph->GetCompilerHandle(),
+                    const_cast<const int*>(&i_x_shape_data[0]),
+                    x_dims.size(),
+                    static_cast<const char*>(x_var_name.c_str()),
+                    const_cast<const int*>(&i_x_reshape_shape_data[0]),
+                    2,
+                    static_cast<const char*>(unique_op_reshape_name.c_str()),
+                    const_cast<const int*>(reshape_param));
+
+  auto y_var_name = op_info->Input("Y").front();
+  auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
+  // output
+  auto output_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  // Dim 1 of the output is passed to the fc layer below.
+  CHECK_GE(output_dims.size(), 2);
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int>(output_dims[i]);
+  }
+  add_fc_layer(graph->GetCompilerHandle(),
+               const_cast<const int*>(&i_x_reshape_shape_data[0]),
+               2,
+               static_cast<const char*>(unique_op_reshape_name.c_str()),
+               const_cast<const int*>(&i_output_shape_data[0]),
+               output_dims.size(),
+               static_cast<const char*>(output_var_name.c_str()),
+               static_cast<const char*>(unique_op_name.c_str()),
+               i_x_reshape_shape_data[1],
+               i_output_shape_data[1],
+               static_cast<const float*>(y->mutable_data<float>()),
+               nullptr,
+               0,
+               0);
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(mul, kBM, paddle::lite::subgraph::bm::MulConverter);
--- a/lite/kernels/bm/bridges/paddle_use_bridges.h
+++ b/lite/kernels/bm/bridges/paddle_use_bridges.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+USE_SUBGRAPH_BRIDGE(relu, kBM);
+USE_SUBGRAPH_BRIDGE(conv2d, kBM);
+USE_SUBGRAPH_BRIDGE(elementwise_add, kBM);
+USE_SUBGRAPH_BRIDGE(pool2d, kBM);
+USE_SUBGRAPH_BRIDGE(softmax, kBM);
+USE_SUBGRAPH_BRIDGE(mul, kBM);
+USE_SUBGRAPH_BRIDGE(batch_norm, kBM);
+USE_SUBGRAPH_BRIDGE(scale, kBM);
--- a/lite/kernels/bm/bridges/pool_op.cc
+++ b/lite/kernels/bm/bridges/pool_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Converts a Paddle `pool2d` op into a pooling layer of the BM compiler graph.
+//
+// ctx    - opaque pointer to the subgraph::bm::Graph under construction.
+// op     - op being converted; its scope and op_info provide the tensors and
+//          attributes.
+// kernel - not used by this converter.
+//
+// Returns SUCCESS once the layer has been added and the output variable name
+// has been recorded on the graph. Only the "max" and "avg" pooling types are
+// accepted; anything else trips a CHECK.
+int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto* bm_graph = static_cast<Graph*>(ctx);
+  auto op_scope = op->scope();
+  auto info = op->op_info();
+  auto type_name = info->Type();
+  auto layer_name = lite::subgraph::bm::UniqueName(type_name);
+
+  // Input: the compiler takes 32-bit shapes, so narrow the int64 dims.
+  auto in_name = info->Input("X").front();
+  auto in_dims =
+      op_scope->FindVar(in_name)->GetMutable<lite::Tensor>()->dims();
+  std::vector<int32_t> in_shape;
+  in_shape.reserve(in_dims.size());
+  for (size_t d = 0; d < in_dims.size(); ++d) {
+    in_shape.push_back(static_cast<int>(in_dims[d]));
+  }
+
+  // Output: same narrowing; the layer API takes arrays of outputs, and this
+  // op has exactly one.
+  auto out_name = info->Output("Out").front();
+  auto out_dims =
+      op_scope->FindVar(out_name)->GetMutable<lite::Tensor>()->dims();
+  std::vector<int32_t> out_shape;
+  out_shape.reserve(out_dims.size());
+  for (size_t d = 0; d < out_dims.size(); ++d) {
+    out_shape.push_back(static_cast<int>(out_dims[d]));
+  }
+  int32_t* out_shapes[1] = {&out_shape[0]};
+  int32_t out_ranks[1] = {static_cast<int32_t>(out_dims.size())};
+  const char* out_names[1] = {out_name.c_str()};
+
+  // Attributes.
+  auto pooling_type = info->GetAttr<std::string>("pooling_type");
+  CHECK(pooling_type == "max" || pooling_type == "avg");
+  auto ksize = info->GetAttr<std::vector<int>>("ksize");
+  auto paddings = info->GetAttr<std::vector<int>>("paddings");
+  auto strides = info->GetAttr<std::vector<int>>("strides");
+  auto global_pooling = info->GetAttr<bool>("global_pooling");
+  auto ceil_mode = info->GetAttr<bool>("ceil_mode");
+  // "exclusive" is only read for average pooling.
+  bool average_exclusive =
+      pooling_type == "avg" ? info->GetAttr<bool>("exclusive") : false;
+
+  // NOTE(review): max pooling is only requested (flag 0) when both kernel
+  // extents exceed 1; every other case, including a "max" pool with a
+  // 1xN / Nx1 kernel, is passed as 1. Confirm this is intended.
+  const bool use_max_pooling =
+      ksize[0] > 1 && ksize[1] > 1 && pooling_type == "max";
+
+  add_pooling_layer(bm_graph->GetCompilerHandle(),
+                    in_shape.data(),
+                    in_dims.size(),
+                    in_name.c_str(),
+                    1,
+                    out_shapes,
+                    out_ranks,
+                    out_names,
+                    ksize[0],
+                    ksize[1],
+                    paddings[0],
+                    paddings[0],
+                    paddings[1],
+                    paddings[1],
+                    strides[0],
+                    strides[1],
+                    use_max_pooling ? 0 : 1,
+                    static_cast<int>(average_exclusive),
+                    static_cast<int>(global_pooling),
+                    static_cast<int>(ceil_mode),
+                    layer_name.c_str(),
+                    nullptr);
+  bm_graph->AddNode(out_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+// Registers PoolConverter as the subgraph bridge for op type `pool2d` on the
+// kBM target.
+REGISTER_SUBGRAPH_BRIDGE(pool2d,
+                         kBM,
+                         paddle::lite::subgraph::bm::PoolConverter);
--- a/lite/kernels/bm/bridges/scale_op.cc
+++ b/lite/kernels/bm/bridges/scale_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <bmcompiler_if.h>
+#include <bmcompiler_op_code.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Converts a Paddle `scale` op into two constant-binary layers of the BM
+// compiler graph: a multiply by `scale` followed by an add of `bias`.
+//
+// ctx    - opaque pointer to the subgraph::bm::Graph under construction.
+// op     - op being converted; its scope and op_info provide the tensors and
+//          attributes.
+// kernel - not used by this converter.
+//
+// Returns SUCCESS once both layers have been added and the output variable
+// name has been recorded on the graph.
+int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  // input: the compiler takes 32-bit shapes, so narrow the int64 dims.
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  std::vector<int32_t> i_x_shape_data(x_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int>(x_dims[i]);
+  }
+  // output
+  auto output_var_name = op_info->Output("Out").front();
+  auto scale = op_info->GetAttr<float>("scale");
+  auto bias = op_info->GetAttr<float>("bias");
+  auto bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
+  if (!bias_after_scale) {
+    // scale * (x + bias) == scale * x + scale * bias, so folding the scale
+    // into the bias lets both modes share the multiply-then-add layers below.
+    bias *= scale;
+  }
+  // Name of the intermediate tensor that carries `x * scale` between the two
+  // layers. This is the only generated name the converter needs.
+  auto unique_op_scale_name = lite::subgraph::bm::UniqueName(op_type);
+  add_const_binary_layer(graph->GetCompilerHandle(),
+                         x_var_name.c_str(),
+                         i_x_shape_data.data(),
+                         x_dims.size(),
+                         scale,
+                         unique_op_scale_name.c_str(),
+                         BINARY_MUL,
+                         0);
+  add_const_binary_layer(graph->GetCompilerHandle(),
+                         unique_op_scale_name.c_str(),
+                         i_x_shape_data.data(),
+                         x_dims.size(),
+                         bias,
+                         output_var_name.c_str(),
+                         BINARY_ADD,
+                         0);
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+// Registers ScaleConverter as the subgraph bridge for op type `scale` on the
+// kBM target.
+REGISTER_SUBGRAPH_BRIDGE(scale,
+                         kBM,
+                         paddle::lite::subgraph::bm::ScaleConverter);
--- a/lite/kernels/bm/bridges/softmax_op.cc
+++ b/lite/kernels/bm/bridges/softmax_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Converts a Paddle `softmax` op into a softmax layer of the BM compiler
+// graph.
+//
+// ctx    - opaque pointer to the subgraph::bm::Graph under construction.
+// op     - op being converted; its scope and op_info provide the tensors and
+//          attributes.
+// kernel - not used by this converter.
+//
+// Returns SUCCESS once the layer has been added and the output variable name
+// has been recorded on the graph.
+int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto* bm_graph = static_cast<Graph*>(ctx);
+  auto op_scope = op->scope();
+  auto info = op->op_info();
+
+  // Input: the compiler takes 32-bit shapes, so narrow the int64 dims.
+  auto in_name = info->Input("X").front();
+  auto x_dims = op_scope->FindVar(in_name)->GetMutable<lite::Tensor>()->dims();
+  std::vector<int32_t> in_shape;
+  in_shape.reserve(x_dims.size());
+  for (size_t d = 0; d < x_dims.size(); ++d) {
+    in_shape.push_back(static_cast<int>(x_dims[d]));
+  }
+
+  // Output: same narrowing.
+  auto out_name = info->Output("Out").front();
+  auto out_dims =
+      op_scope->FindVar(out_name)->GetMutable<lite::Tensor>()->dims();
+  std::vector<int32_t> out_shape;
+  out_shape.reserve(out_dims.size());
+  for (size_t d = 0; d < out_dims.size(); ++d) {
+    out_shape.push_back(static_cast<int>(out_dims[d]));
+  }
+
+  // A negative axis counts from the last dimension.
+  auto axis = info->GetAttr<int>("axis");
+  if (axis < 0) {
+    axis += x_dims.size();
+  }
+  // Product of the dims before the axis and of the dims after it.
+  int outer_num = x_dims.Slice(0, axis).production();
+  int inner_num = x_dims.Slice(axis + 1, x_dims.size()).production();
+
+  add_softmax_layer(bm_graph->GetCompilerHandle(),
+                    in_shape.data(),
+                    x_dims.size(),
+                    in_name.c_str(),
+                    out_shape.data(),
+                    out_dims.size(),
+                    out_name.c_str(),
+                    inner_num,
+                    outer_num,
+                    x_dims[axis]);
+  bm_graph->AddNode(out_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+// Registers SoftmaxConverter as the subgraph bridge for op type `softmax` on
+// the kBM target.
+REGISTER_SUBGRAPH_BRIDGE(softmax,
+                         kBM,
+                         paddle::lite::subgraph::bm::SoftmaxConverter);
--- a/lite/kernels/bm/bridges/utility.cc
+++ b/lite/kernels/bm/bridges/utility.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/bm/bridges/utility.h"
+#include <mutex>  //NOLINT
+#include <unordered_map>
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Returns "<prefix>_<n>", where n counts how many times this function has
+// been called with the same prefix (the first call yields 1). The per-prefix
+// counters are function-local statics guarded by a mutex.
+std::string UniqueName(const std::string& prefix) {
+  static std::mutex guard_mtx;
+  static std::unordered_map<std::string, int> use_counts;
+  std::lock_guard<std::mutex> guard(guard_mtx);
+  // operator[] value-initializes a missing entry to 0, so the pre-increment
+  // yields 1 for a new prefix and the next number for a known one.
+  const int serial = ++use_counts[prefix];
+  return prefix + "_" + std::to_string(serial);
+}
+
+// Reports whether the op has a usable input for `argname`.
+//
+// Returns true only when all of the following hold:
+//   * `argname` is one of the op's input argument names,
+//   * that argument lists at least one variable name, and
+//   * the first listed variable can be found through `scope`.
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname) {
+  bool declared = false;
+  for (const auto& candidate : op_info->input_argnames()) {
+    if (candidate == argname) {
+      declared = true;
+      break;
+    }
+  }
+  if (!declared) {
+    return false;
+  }
+
+  auto bound_vars = op_info->Input(argname);
+  if (bound_vars.empty()) {
+    return false;
+  }
+  return scope->FindVar(bound_vars.front()) != nullptr;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/bm/bridges/utility.h
+++ b/lite/kernels/bm/bridges/utility.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Returns "<prefix>_<n>", where n is a per-prefix counter that starts at 1 and
+// grows by one on every call made with the same prefix.
+std::string UniqueName(const std::string& prefix);
+
+// Returns true when `argname` is one of the op's input argument names, lists
+// at least one variable, and the first listed variable is found via `scope`.
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname);
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/bm/subgraph_compute.cc
+++ b/lite/kernels/bm/subgraph_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/kernels/bm/subgraph_compute.h"
+#include <sys/time.h>
+#include <time.h>
+#include <string>
+#include <utility>
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/paddle_use_bridges.h"
+#include "lite/kernels/bm/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace bm {
+
+// Builds the BM device program for this subgraph.
+//
+// Steps:
+//   1. Run every op of the origin program through its kBM bridge to populate
+//      a BM compiler graph.
+//   2. Compile the graph and load the resulting bmodel into a new runtime.
+//   3. Allocate device memory for every input and output and wrap it in
+//      bm_tensor_t descriptors used later by LaunchDeviceProgram.
+//
+// Returns subgraph::FAILED when an op has no kBM bridge, a bridge reports
+// failure, or the bmodel cannot be loaded; otherwise the OR-ed status of all
+// bridge calls. Device allocation failures trip a CHECK.
+int SubgraphEngine::BuildDeviceProgram() {
+  int status = 0;
+  subgraph::bm::Graph graph;
+  const auto& bridges = subgraph::Registry::Instance();
+  graph.CreateCompilerHandle();
+  auto& ctx = this->ctx_->template As<BMContext>();
+  for (auto& inst : origin_program_) {
+    auto op = inst.op();
+    CHECK(op);
+    op->CheckShape();
+    op->InferShape();
+    std::string op_type = op->op_info()->Type();
+    if (!bridges.Exists(op_type, TARGET(kBM))) {
+      return subgraph::FAILED;
+    }
+    auto kernel = inst.kernel();
+    status |=
+        bridges.Select(op_type, TARGET(kBM))(reinterpret_cast<void*>(&graph),
+                                             const_cast<OpLite*>(op),
+                                             const_cast<KernelBase*>(kernel));
+    if (subgraph::CHECK_FAILED(status)) {
+      return subgraph::FAILED;
+    }
+  }
+  std::string net_name = "paddle_bitmain";
+  __bmcompile_opt(
+      graph.GetCompilerHandle(), const_cast<char*>(net_name.c_str()), 2);
+  // NOTE(review): ownership of `bmodel_data` is not visible here - confirm
+  // whether the compiler or the caller is expected to release it.
+  void* bmodel_data = nullptr;
+  unsigned int data_size = 0;
+  bm_hd_ = static_cast<bm_handle_t>(ctx.GetHandle());
+  finish_bmcompiler_data(graph.GetCompilerHandle(), &bmodel_data, &data_size);
+  bmrt_hd_ = bmrt_create(bm_hd_);
+  if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) {
+    return subgraph::FAILED;
+  }
+  bmrt_get_network_names(bmrt_hd_, &net_names_);
+  net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]);
+  auto& stage = net_info_->stages[0];
+  // input
+  origin_idims_.resize(input_names_.size());
+  origin_itensors_.resize(input_names_.size());
+  device_inputs_.resize(input_names_.size());
+  for (size_t i = 0; i < input_names_.size(); i++) {
+    origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
+    CHECK(origin_itensors_[i]);
+    origin_idims_[i] = origin_itensors_[i]->dims();
+    // The descriptor is handed to bmrt_tensor_with_device by value, so a
+    // stack object is enough; heap-allocating it would leak the host block.
+    bm_device_mem_t device_mem;
+    CHECK_EQ(bm_malloc_device_byte(
+                 bm_hd_, &device_mem, origin_itensors_[i]->memory_size()),
+             BM_SUCCESS);
+    bmrt_tensor_with_device(&device_inputs_[i],
+                            device_mem,
+                            net_info_->input_dtypes[i],
+                            stage.input_shapes[i]);
+  }
+  // output
+  origin_odims_.resize(output_names_.size());
+  origin_otensors_.resize(output_names_.size());
+  device_outputs_.resize(output_names_.size());
+  for (size_t i = 0; i < output_names_.size(); i++) {
+    origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
+    CHECK(origin_otensors_[i]);
+    origin_odims_[i] = origin_otensors_[i]->dims();
+    // Remember where each named output lives so that the net's own output
+    // order can be mapped back to the origin tensors.
+    output_map_.insert(std::pair<std::string, int>(output_names_[i], i));
+    origin_otensors_[i]->mutable_data<float>();
+  }
+  for (size_t i = 0; i < output_names_.size(); i++) {
+    // device_outputs_[i] follows the net's output order; size it from the
+    // origin tensor that carries the same name.
+    int mapping_index = output_map_.at(net_info_->output_names[i]);
+    bm_device_mem_t device_mem;
+    CHECK_EQ(
+        bm_malloc_device_byte(bm_hd_,
+                              &device_mem,
+                              origin_otensors_[mapping_index]->memory_size()),
+        BM_SUCCESS);
+    bmrt_tensor_with_device(&device_outputs_[i],
+                            device_mem,
+                            net_info_->output_dtypes[i],
+                            stage.output_shapes[i]);
+  }
+
+  return status;
+}
+
+// Runs the compiled network once: copies every origin input tensor to its
+// device buffer, launches the first network of the loaded bmodel, waits for
+// completion, then copies every device output back into the origin tensor
+// that carries the same name.
+//
+// Always returns 0.
+int SubgraphEngine::LaunchDeviceProgram() {
+  for (size_t i = 0; i < device_inputs_.size(); i++) {
+    bm_memcpy_s2d(bm_hd_,
+                  device_inputs_[i].device_mem,
+                  const_cast<void*>(origin_itensors_[i]->raw_data()));
+  }
+  // data() stays well-defined even when a vector is empty, unlike &v[0].
+  bmrt_launch_tensor_ex(bmrt_hd_,
+                        net_names_[0],
+                        device_inputs_.data(),
+                        net_info_->input_num,
+                        device_outputs_.data(),
+                        net_info_->output_num,
+                        true,
+                        false);
+  bm_thread_sync(bm_hd_);
+  for (size_t i = 0; i < device_outputs_.size(); i++) {
+    // device_outputs_[i] is the net's i-th output, which is not necessarily
+    // the i-th origin output; resolve it by name, exactly as the build step
+    // did when it sized the device buffer.
+    const int origin_index = output_map_.at(net_info_->output_names[i]);
+    bm_memcpy_d2s(
+        bm_hd_,
+        const_cast<void*>(origin_otensors_[origin_index]->raw_data()),
+        device_outputs_[i].device_mem);
+  }
+  return 0;
+}
+
+// Creates the subgraph engine from this kernel's SubgraphParam and builds it.
+void SubgraphCompute::PrepareForRun() {
+  auto& subgraph_param = this->Param<param_t>();
+  auto* fresh_engine = new SubgraphEngine(ctx_.get(),
+                                          subgraph_param.sub_block_idx,
+                                          subgraph_param.sub_block_desc,
+                                          subgraph_param.input_data_names,
+                                          subgraph_param.output_data_names,
+                                          subgraph_param.scope);
+  engine_.reset(fresh_engine);
+  CHECK(engine_);
+  engine_->Build();
+}
+
+// Launches the engine created by PrepareForRun; trips a CHECK when no engine
+// has been set up.
+void SubgraphCompute::Run() {
+  CHECK(engine_ != nullptr);
+  engine_->Launch();
+}
+
+}  // namespace bm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+// Registers SubgraphCompute as the `subgraph` kernel for the kBM target with
+// kFloat precision and kNCHW layout. Both the "Inputs" and the "Outputs"
+// arguments are bound to host tensors.
+REGISTER_LITE_KERNEL(subgraph,
+                     kBM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::bm::SubgraphCompute,
+                     def)
+    .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
--- a/lite/kernels/bm/subgraph_compute.h
+++ b/lite/kernels/bm/subgraph_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <bmcompiler_if.h>
+#include <bmruntime_interface.h>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/program.h"
+#include "lite/core/types.h"
+#include "lite/kernels/npu/bridges/engine.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace bm {
+
+// Subgraph engine for the BM target. BuildDeviceProgram compiles the ops of
+// the sub-block into a bmodel and prepares device tensors;
+// LaunchDeviceProgram runs that model.
+// NOTE(review): nothing in this class releases the runtime handle or the
+// device buffers created during the build - confirm who owns them.
+class SubgraphEngine : public subgraph::Engine {
+ public:
+  // Forwards every argument unchanged to subgraph::Engine.
+  SubgraphEngine(KernelContext *ctx,
+                 int block_idx,
+                 cpp::BlockDesc *block_desc,
+                 const std::vector<std::string> &input_names,
+                 const std::vector<std::string> &output_names,
+                 Scope *scope)
+      : subgraph::Engine(
+            ctx, block_idx, block_desc, input_names, output_names, scope) {}
+
+ protected:
+  int BuildDeviceProgram() override;
+  int LaunchDeviceProgram() override;
+
+ private:
+  // The raw handles below are only assigned part-way through
+  // BuildDeviceProgram, which can return early. Value-initializing them
+  // keeps them from ever holding indeterminate values.
+
+  // Runtime handle returned by bmrt_create.
+  void *bmrt_hd_{nullptr};
+  // Device tensor descriptors, one per subgraph input / net output.
+  std::vector<bm_tensor_t> device_inputs_;
+  std::vector<bm_tensor_t> device_outputs_;
+  // Output variable name -> index into the origin output tensors.
+  std::map<std::string, int> output_map_;
+  // Network names reported by the runtime for the loaded bmodel.
+  const char **net_names_{nullptr};
+  // Info of the first network in the loaded bmodel.
+  const bm_net_info_t *net_info_{nullptr};
+  // Device handle taken from the BM context.
+  bm_handle_t bm_hd_{};
+};
+
+// `subgraph` kernel for the BM target (float precision). It owns a
+// SubgraphEngine that is created and built in PrepareForRun and launched on
+// every Run.
+class SubgraphCompute : public KernelLite<TARGET(kBM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SubgraphParam;
+  // Creates the engine from the kernel's SubgraphParam and builds it.
+  void PrepareForRun() override;
+  // Launches the engine; requires PrepareForRun to have created it.
+  void Run() override;
+  virtual ~SubgraphCompute() = default;
+
+ private:
+  // Engine created by PrepareForRun; empty until then.
+  std::unique_ptr<SubgraphEngine> engine_;
+};
+
+}  // namespace bm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
-if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU)
+if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM)
  return()
 endif()


--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
    lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
@@ -36,36 +36,36 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
    lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

 if(LITE_BUILD_EXTRA)
-    lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
-    lite_cc_test(test_kernel_pad2d_compute SRCS  pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_prior_box_compute SRCS  prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_pad2d_compute SRCS  pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_prior_box_compute SRCS  prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels}  ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels}  ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
--- a/lite/tools/build_bm.sh
+++ b/lite/tools/build_bm.sh
+#!/bin/bash
+set -ex
+
+# global variables with default value
+BM_SDK_ROOT="$(pwd)/../BM_SDK"     # BM SDK
+TARGET_NAME="BM1682"     # default target
+BUILD_EXTRA=OFF                     # ON(with sequence ops)/OFF
+WITH_TESTING=ON                    # ON/OFF
+
+# Print the command-line help for this script to stdout.
+# Besides the two options it also documents the `bm` action, because that is
+# the only argument that actually starts a build.
+function print_usage {
+    echo -e "\nUSAGE:"
+    echo
+    echo "  $0 [--bm_sdk_root=<bm sdk directory>] [--target_name=<target name>] bm"
+    echo
+    echo "----------------------------------------"
+    echo -e "--bm_sdk_root=<bm sdk directory>"
+    echo -e "--target_name=<target name>"
+    echo -e "bm    configure and build with LITE_WITH_BM=ON (pass it after the options)"
+    echo "----------------------------------------"
+    echo
+}
+
+# readonly variables with default value
+readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
+                               -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
+                               -DWITH_PYTHON=OFF \
+                               -DLITE_WITH_ARM=OFF"
+
+readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THRLITE_BUILD_THREADSEADS:-1}
+
+readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
+readonly workspace=$(pwd)
+
+function prepare_thirdparty {
+    if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
+        rm -rf $workspace/third-party
+
+        if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
+            wget $THIRDPARTY_TAR
+        fi
+        tar xzf third-party-05b862.tar.gz
+    else
+        git submodule update --init --recursive
+    fi
+}
+
+# For code generation, a source file is produced by a test but is also a
+# dependency of some cmake targets, so an empty placeholder is created up
+# front to let cmake configure.
+# All relative paths below are resolved against the current directory.
+function prepare_workspace {
+    # Step 1: placeholder for the generated source file.
+    GEN_CODE_PATH_PREFIX=lite/gen_code
+    mkdir -p "./${GEN_CODE_PATH_PREFIX}"
+    touch "./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc"
+
+    # Step 2: copy the debug analysis tool from the parent directory.
+    DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
+    mkdir -p "./${DEBUG_TOOL_PATH_PREFIX}"
+    cp "../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py" "./${DEBUG_TOOL_PATH_PREFIX}/"
+
+    # Step 3: fetch or refresh the third-party sources.
+    prepare_thirdparty
+}
+
+function build_bm {
+    build_dir=${workspace}/build.lite.bm
+    mkdir -p $build_dir
+    cd $build_dir
+
+    prepare_workspace
+    cmake .. \
+        ${CMAKE_COMMON_OPTIONS} \
+        -DWITH_GPU=OFF \
+        -DWITH_MKLDNN=OFF \
+        -DLITE_WITH_X86=ON \
+        -DWITH_MKL=ON \
+        -DLITE_BUILD_EXTRA=ON \
+        -DLITE_WITH_XPU=OFF \
+        -DLITE_WITH_BM=ON \
+        -DWITH_TESTING=${WITH_TESTING} \
+        -DBM_SDK_ROOT=${BM_SDK_ROOT}
+
+    make -j$NUM_CORES_FOR_COMPILE
+
+    cd -
+    echo "Done"
+}
+
+function main {
+    # Parse command line.
+    for i in "$@"; do
+        case $i in
+            --target_name=*)
+                TARGET_NAME="${i#*=}"
+                shift
+                ;;
+            --bm_sdk_root=*)
+                BM_SDK_ROOT="${i#*=}"
+                shift
+                ;;
+            bm)
+                build_bm
+                shift
+                ;;
+            *)
+                # unknown option
+                print_usage
+                exit 1
+                ;;
+        esac
+    done
+}
+
+main $@