diff --git a/CMakeLists.txt b/CMakeLists.txt index 199b3bda17f4ac22c1d657b6794446832d448440..1ec5352fa4009144b9f572ecbe061aba11e884d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) @@ -184,6 +185,10 @@ if(LITE_WITH_CUDA) include(cuda) endif() +if(LITE_WITH_XPU) + include(xpu) +endif() + include(generic) # simplify cmake module include(ccache) # set ccache for compilation include(util) # set unittest and link libs diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 67830fe2e0ec3c35064acb4c00ec152989ddb655..5dbb7f3fca4a2ecdab943cd49f34ee97f9bac9b0 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -127,6 +127,10 @@ if (LITE_WITH_NPU) add_definitions("-DLITE_WITH_NPU") endif() +if (LITE_WITH_XPU) + add_definitions("-DLITE_WITH_XPU") +endif() + if (LITE_WITH_OPENCL) add_definitions("-DLITE_WITH_OPENCL") endif() diff --git a/cmake/cross_compiling/npu.cmake b/cmake/cross_compiling/npu.cmake index 863200986c93ea09d3fa3049fe684b32c2fb52dd..25aa4d2bc8c1c145e7a103c9164e1c9e231a8f9e 100644 --- a/cmake/cross_compiling/npu.cmake +++ b/cmake/cross_compiling/npu.cmake @@ -50,9 +50,6 @@ find_library(NPU_DDK_IR_FILE NAMES hiai_ir find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) -find_library(NPU_DDK_PROTO_FILE NAMES protobuf-lite - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - if(NOT NPU_DDK_HIAI_FILE) message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") else() @@ -77,14 +74,8 @@ else() set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE}) endif() -if(NOT NPU_DDK_PROTO_FILE) - message(FATAL_ERROR "Can not find NPU_DDK_PROTO_FILE in ${NPU_DDK_ROOT}") -else() - message(STATUS "Found NPU_DDK Protobuf Library: ${NPU_DDK_PROTO_FILE}") - add_library(npu_ddk_proto SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_proto PROPERTY IMPORTED_LOCATION ${NPU_DDK_PROTO_FILE}) -endif() +set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs") +set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs") -set(npu_ddk_libs npu_ddk_hiai npu_ddk_ir npu_ddk_ir_build npu_ddk_proto CACHE INTERNAL "npu ddk libs") diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 7d8641d96da86cf9a2be442b797507ac79058efa..9b6fab3f6261ff13361bda35cfa9cd681075c77d 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -83,6 +83,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_XPU) + foreach(var ${lite_deps_XPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} 
PARENT_SCOPE) endfunction() @@ -107,7 +113,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS XPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -118,6 +124,7 @@ function(lite_cc_library TARGET) CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} @@ -236,6 +243,7 @@ set(arm_kernels CACHE INTERNAL "arm kernels") set(x86_kernels CACHE INTERNAL "x86 kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") +set(xpu_kernels CACHE INTERNAL "xpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -305,6 +313,12 @@ function(add_kernel TARGET device level) endif() set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "XPU") + if (NOT LITE_WITH_XPU) + return() + endif() + set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "FPGA") if (NOT LITE_WITH_FPGA) return() @@ -338,6 +352,7 @@ function(add_kernel TARGET device level) lite_cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} + XPU_DEPS ${args_XPU_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} @@ -386,6 +401,7 @@ function(add_operator TARGET level) lite_cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} + XPU_DEPS ${args_XPU_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} diff --git a/cmake/xpu.cmake b/cmake/xpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7bf63f93d3646a2a1f009bd51b369e6bc014091a --- /dev/null +++ b/cmake/xpu.cmake @@ -0,0 +1,103 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT LITE_WITH_XPU) + return() +endif() + +if(NOT DEFINED XPU_SDK_ROOT) + set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT}) + if(NOT XPU_SDK_ROOT) + message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") + endif() +endif() + +message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}") +find_path(XPU_SDK_INC NAMES xtcl.h + PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH) +if(NOT XPU_SDK_INC) + message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") +endif() + +include_directories("${XPU_SDK_ROOT}/XTCL/include") +include_directories("${XPU_SDK_ROOT}/XTDK/include") + +find_library(XPU_SDK_XTCL_FILE NAMES xtcl + PATHS ${XPU_SDK_ROOT}/XTCL/so) + +if(NOT XPU_SDK_XTCL_FILE) + message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") + add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) +endif() + +find_library(XPU_SDK_TVM_FILE NAMES tvm + PATHS ${XPU_SDK_ROOT}/XTCL/so) + +if(NOT XPU_SDK_TVM_FILE) + message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") + add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) +endif() + +find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi + PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + +if(NOT XPU_SDK_XPU_API_FILE) + message(FATAL_ERROR "Can not find XPU API Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU API Library: ${XPU_SDK_XPU_API_FILE}") + add_library(xpu_sdk_xpu_api SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xpu_api PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_API_FILE}) +endif() + +find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt + PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + +if(NOT XPU_SDK_XPU_RT_FILE) + message(FATAL_ERROR "Can not find XPU RT Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU RT Library: ${XPU_SDK_XPU_RT_FILE}") + add_library(xpu_sdk_xpu_rt SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE}) +endif() + +find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc + PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + +if(NOT XPU_SDK_XPU_JITC_FILE) + message(FATAL_ERROR "Can not find XPU JITC Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU JITC Library: ${XPU_SDK_XPU_JITC_FILE}") + add_library(xpu_sdk_xpu_jitc SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xpu_jitc PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_JITC_FILE}) +endif() + +find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 + PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + +if(NOT XPU_SDK_LLVM_FILE) + message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") + add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) +endif() + +set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") +set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index ecd6f6634251246b3a759f52dbb538e66505025c..bff5a231388b62772c20d194ec140518d9765b27 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -6,6 +6,7 @@ 
message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") +message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index a4ea1b8ff9d09268d76e63f0d032a17d390f90a2..4e768731d295452f424e69b80cb6ef167e6b013f 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -26,11 +26,21 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and DEPS ${light_lib_DEPS} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels}) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) + if (LITE_WITH_NPU) + # Strips the symbols of our protobuf functions to fix the conflicts during + # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) + set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set_target_properties(paddle_light_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + if (LITE_WITH_NPU) + # Need to add HIAI runtime libs (libhiai.so) dependency + target_link_libraries(paddle_light_api_shared ${npu_runtime_libs}) + endif() endif() endif() @@ -39,7 +49,8 @@ if (WITH_TESTING) DEPS scope optimizer target_wrapper_host model_parser program ${ops} ${host_kernels} CUDA_DEPS ${cuda_kernels} - X86_DEPS ${x86_kernels}) + X86_DEPS ${x86_kernels} + XPU_DEPS ${xpu_kernels}) endif() if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) @@ -51,6 +62,7 @@ message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") +message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") # for full api @@ -63,6 +75,7 @@ if (NOT LITE_ON_TINY_PUBLISH) X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass + XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass CL_DEPS ${opencl_kenrels} FPGA_DEPS ${fpga_kenrels}) endif() @@ -82,6 +95,7 @@ lite_cc_library(light_api SRCS light_api.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kenrels} FPGA_DEPS ${fpga_kenrels}) @@ -96,6 +110,7 @@ if(WITH_TESTING) X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} EXCLUDE_COMPILE_DEPS "ON" @@ -223,6 +238,7 @@ lite_cc_test(test_apis SRCS apis_test.cc DEPS cxx_api light_api ${ops} paddle_api_light CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} + XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -250,6 +266,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle ${ops} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} X86_DEPS 
${x86_kernels} FPGA_DEPS ${fpga_kernels} @@ -264,6 +281,7 @@ if(NOT IOS) ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels}) @@ -271,6 +289,7 @@ if(NOT IOS) ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels}) diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index afe051a437f4de83931bdaa3f2d03427b78d13ad..3efa980332f25d786d5c880fab9b3ba5af0a1013 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -17,10 +17,20 @@ if (NOT LITE_ON_TINY_PUBLISH) # Unlike static library, module library has to link target to be able to work # as a single .so lib. target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) + if (LITE_WITH_NPU) + # Strips the symbols of our protobuf functions to fix the conflicts during + # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) + set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set_target_properties(paddle_lite_jni PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() else() add_library(paddle_lite_jni SHARED "") target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) add_dependencies(paddle_lite_jni op_list_h kernel_list_h) + if (LITE_WITH_NPU) + # Need to add HIAI runtime libs (libhiai.so) dependency + target_link_libraries(paddle_lite_jni ${npu_runtime_libs}) + endif() endif() if (APPLE) diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index dbdf9ff269b372cd3dcd59769b15526b7631a5e5..ccacb027d682b5388e44b05075b66f436c3e2668 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -46,8 +46,16 @@ std::string Place::DebugString() const { } const std::string& TargetToStr(TargetType target) { - static const std::string target2string[] = { - "unk", "host", "x86", "cuda", "arm", "opencl", "any", "fpga", "npu"}; + static const std::string target2string[] = {"unk", + "host", + "x86", + "cuda", + "arm", + "opencl", + "any", + "fpga", + "npu", + "xpu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -84,7 +92,8 @@ const std::string& TargetRepr(TargetType target) { "kOpenCL", "kAny", "kFPGA", - "kNPU"}; + "kNPU", + "kXPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index 5e4f2ed21c8298ac15a912672e3d15633d0a3ecb..19ec5c6e8b5e39d1c68f9a20968472cbc66e89a2 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -50,8 +50,9 @@ enum class TargetType : int { kOpenCL = 5, kFPGA = 7, kNPU = 8, + kXPU = 9, kAny = 6, // any target - NUM = 9, // number of fields. + NUM = 10, // number of fields. 
}; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index 70b4f0bbf794ed7ca537177f48fee34a5955aba5..dec63e6efa0e4c4548646ebdd6f6de24f046d6d0 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(cuda) add_subdirectory(fpga) add_subdirectory(host) add_subdirectory(npu) +add_subdirectory(xpu) diff --git a/lite/backends/npu/CMakeLists.txt b/lite/backends/npu/CMakeLists.txt index 370f620b919d9cdb7458a704b205951caf4bf8af..426ff5698146c773c818b2bfd598d6bbbdf7867f 100644 --- a/lite/backends/npu/CMakeLists.txt +++ b/lite/backends/npu/CMakeLists.txt @@ -2,4 +2,5 @@ if(NOT LITE_WITH_NPU) return() endif() -lite_cc_library(npu_runtime SRCS runtime.cc DEPS npu_ddk_hiai) +lite_cc_library(npu_runtime SRCS runtime.cc DEPS ${npu_runtime_libs}) +lite_cc_library(npu_builder SRCS builder.cc DEPS ${npu_builder_libs} npu_runtime tensor op scope) diff --git a/lite/kernels/npu/bridges/utils.cc b/lite/backends/npu/builder.cc similarity index 92% rename from lite/kernels/npu/bridges/utils.cc rename to lite/backends/npu/builder.cc index 933d8188c99a36bd537ac4a5ee5f584a2b79956a..80ab6e486b6cd9a67f4162ffb11d7bdac959eca9 100644 --- a/lite/kernels/npu/bridges/utils.cc +++ b/lite/backends/npu/builder.cc @@ -12,21 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/npu/bridges/utils.h" +#include "lite/backends/npu/builder.h" #include // NOLINT #include -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" // for ge::op::Data -#include "ai_ddk_lib/include/graph/tensor.h" // for ge::TensorUtils -#include "ai_ddk_lib/include/hiai_ir_build.h" #include "lite/backends/npu/runtime.h" namespace paddle { namespace lite { -namespace kernels { namespace npu { -namespace bridges { // Build HIAI IR graph to om model, and store om model data into lite tensor bool BuildModel(std::vector& inputs, // NOLINT @@ -165,8 +158,6 @@ bool HasInputArg(const OpInfo* op_info, } } -} // namespace bridges } // namespace npu -} // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h new file mode 100644 index 0000000000000000000000000000000000000000..a245a8517b1c8e20a4630d370da5ca0b203adb71 --- /dev/null +++ b/lite/backends/npu/builder.h @@ -0,0 +1,254 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
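# Illustration (not part of this patch): with the kXPU entry added to TargetType above,
# a predictor can request XPU kernels through the public API. The kX86/kHost places are
# assumed fallbacks here for ops that have no XPU bridge.
#
#   #include "lite/api/paddle_api.h"
#   #include "lite/api/paddle_place.h"
#
#   // Hypothetical helper: prefer XPU kernels, fall back to X86/Host.
#   void SetXpuValidPlaces(paddle::lite_api::CxxConfig* config) {
#     config->set_valid_places({
#         paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
#         paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
#         paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
#     });
#   }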
+ +#pragma once + +#include +#include +#include +#include +#include "ai_ddk_lib/include/graph/buffer.h" +#include "ai_ddk_lib/include/graph/graph.h" +#include "ai_ddk_lib/include/graph/model.h" +#include "ai_ddk_lib/include/graph/op/all_ops.h" +#include "ai_ddk_lib/include/graph/operator.h" +#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "ai_ddk_lib/include/hiai_ir_build.h" +#include "lite/core/op_lite.h" +#include "lite/core/target_wrapper.h" +#include "lite/core/tensor.h" + +// Extended Ops of HIAI DDK +namespace ge { +/** + * Multiply the matrix x1 by the matrix x2 to generate x1 * x2. + * The inputs must be two-dimensional matrices and the inner dimension of "x1" + * (after being transposed if transpose_x1 is true) must match the outer + * dimension of "x2" (after being transposed if transposed_x2 is true). + * x : the first input tensor, must be non const op. + * w : the second input tensor, must be const op. + * bias: the optional bias tensor, must be const op. + * + * y : the output tensor. + * + * has_bias: If true, enable input bias. + */ +REG_OP(MatMul) + .INPUT(x, TensorType({DT_FLOAT})) + .INPUT(w, TensorType({DT_FLOAT})) + .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT})) // bias must be const input + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(has_bias, AttrValue::BOOL{false}) // when has input::bias,set true + .OP_END(); + +/** + * Computes the gradients of convolution with respect to the input. + * + * input_sizes : An integer vector representing the shape of input, + * where input is a 4-D [batch, height, width, channels] tensor. + * filter : the filter tensor, with shape [H , W, filter_channel, + * filter_number], filter_channel must be same as x channel. + * x : The input tensor. + * + * y : The output tensor. + * + * format: 0: NCHW. 1: NHWC + * group : 1: default + * num_output : 0: default, num_output must be equal to + * (filter_channel * group) + * pad : Padding for the beginning and ending along each axis + * stride : Stride along each axis. + * dilation : dilation value along each axis of the filter. + * pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET + * bias_term : 0: default + * kernel : The shape of the convolution kernel + */ +REG_OP(Deconvolution) + .INPUT(input_sizes, TensorType({DT_UINT8})) + .INPUT(filter, TensorType({DT_FLOAT})) + .INPUT(x, TensorType({DT_FLOAT})) + .OPTIONAL_INPUT(b, TensorType({DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(mode, AttrValue::INT{1}) + .ATTR(format, AttrValue::INT{1}) + .ATTR(group, AttrValue::INT{1}) + .ATTR(num_output, AttrValue::INT{0}) + .ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0})) + .ATTR(stride, AttrValue::LIST_INT({1, 1})) + .ATTR(dilation, AttrValue::LIST_INT({1, 1})) + .ATTR(pad_mode, AttrValue::INT{0}) + .ATTR(bias_term, AttrValue::INT{0}) + .ATTR(kernel, AttrValue::LIST_INT({0, 0})) + .OP_END(); + +/** + * Resize images to size using bilinear interpolation. + * + * x : The tensor of 4-D + * w : A int32 Tensor of 2 elements: [height, width]. + * + * y : the output tensor + * + * align_corners : If true, the centers of the 4 corner pixels of the + * input and output tensors are aligned, preserving the values at the corner + * pixels. + * output_dim_mode : Defaults 2, including 0: zoom_factor , 1: + * shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is + * controled by the [height, width] of w. + * shrink_factor : shrink factor. + * zoom_factor : zoom factor. + * pad_begin : begin of pad. + * pad_end : end of pad. 
+ */ +REG_OP(ResizeBilinear) + .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) + .INPUT(w, TensorType({DT_FLOAT, DT_INT32})) + .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32})) + .ATTR(align_corners, AttrValue::BOOL{false}) + .ATTR(output_dim_mode, AttrValue::INT{2}) + .ATTR(shrink_factor, AttrValue::INT{1}) + .ATTR(zoom_factor, AttrValue::INT{1}) + .ATTR(pad_begin, AttrValue::INT{0}) + .ATTR(pad_end, AttrValue::INT{0}) + .OP_END(); + +/** + * Resize images to size using nearest neighbor interpolation. + * + * image : Resize images to size using nearest neighbor interpolation. + * size : Must be one dimension and two elements + * + * output : the output tensor + * + * align_corners : If true, the centers of the 4 corner pixels of the + * input and output tensors are aligned, preserving the values at the corner + * pixels. Defaults to false + */ +REG_OP(ResizeNearestNeighbor) + .INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) + .INPUT(size, TensorType({DT_INT32})) + .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) + .ATTR(align_corners, AttrValue::BOOL{false}) + .OP_END(); + +/** + * Pads a tensor. + * + * x : the input tensor + * padding : the input tensor must be 2-D + * constant_values : constant values must be a scalar + * + * output : the output tensor + * + * t_paddings : Default DT_INT32 , t_paddings must be the same with + * datatype of the padding + * mode : 0: CONSTANT, 1: REFLECT, 2: SYMMETRIC + * T : datatype of constant_values DT_INT32:3 DT_FLOAT:0 + */ +REG_OP(Pad) + .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) + .INPUT(padding, TensorType({DT_INT32})) + .OPTIONAL_INPUT(constant_values, TensorType({DT_INT32, DT_FLOAT})) + .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32})) + .ATTR(t_paddings, AttrValue::INT{3}) + .ATTR(mode, AttrValue::INT{0}) + .REQUIRED_ATTR(T, AttrValue::INT) + .OP_END(); + +} // namespace ge + +namespace paddle { +namespace lite { +namespace npu { + +class OpList { + public: + static OpList& Global() { + static thread_local OpList x; + return x; + } + void clear() { lists_.clear(); } + void add(std::shared_ptr p) { lists_.push_back(p); } + + private: + std::vector> lists_; +}; + +// Build HIAI IR graph to om model, and store om model data into lite tensor +bool BuildModel(std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + lite::Tensor* model_data); + +std::string UniqueName(const std::string& prefix); + +ge::DataType PrecisionConverter(PrecisionType itype); + +ge::Format DataLayoutConverter(DataLayoutType itype); + +ge::TensorPtr CvtFromLiteTensor(Tensor* in_tensor, + std::vector out_shape = {}, + PrecisionType in_ptype = PRECISION(kFloat), + DataLayoutType in_ltype = DATALAYOUT(kNCHW)); + +template +ge::TensorPtr CreateTensorAndFillData(std::vector data, + std::vector shape = {}, + ge::Format format = ge::FORMAT_NCHW) { + const std::type_info& info = typeid(T); + ge::DataType type = ge::DT_FLOAT; + if (info == typeid(float)) { + type = ge::DT_FLOAT; + } else if (info == typeid(int8_t)) { + type = ge::DT_INT8; + } else if (info == typeid(int32_t)) { + type = ge::DT_INT32; + } else { + LOG(FATAL) << "Unknow value type " << info.name(); + } + if (shape.empty()) { + shape = {static_cast(data.size())}; + } else { + int size = 1; + for (auto i : shape) { + size *= i; + } + CHECK_EQ(data.size(), size); + } + ge::TensorDesc desc(ge::Shape(shape), format, type); + ge::TensorPtr tensor = std::make_shared(); + tensor->SetTensorDesc(desc); + tensor->SetData(reinterpret_cast(data.data()), + data.size() * sizeof(T)); + return 
tensor; +} + +template +ge::TensorPtr CreateTensorAndFillData(T value, + std::vector shape = {1}, + ge::Format format = ge::FORMAT_NCHW) { + int64_t size = 1; + for (auto i : shape) { + size *= i; + } + std::vector data(size, value); + return CreateTensorAndFillData(data, shape, format); +} + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +} // namespace npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt index 5cc4a9f0770e79683197bce0bb83336b8c79c364..2dea4364d5ee2d11d6d266935fad2a1180954369 100644 --- a/lite/backends/x86/math/CMakeLists.txt +++ b/lite/backends/x86/math/CMakeLists.txt @@ -32,8 +32,8 @@ math_library(sampler) math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) -lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3) -math_library(math_function DEPS blas) +lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3 dynload_mklml) +math_library(math_function DEPS blas dynload_mklml) math_library(maxouting) math_library(pooling) math_library(selected_rows_functor DEPS selected_rows math_function blas) diff --git a/lite/backends/xpu/CMakeLists.txt b/lite/backends/xpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f911f8e0e7c61481e1d4e309bc0635718be11206 --- /dev/null +++ b/lite/backends/xpu/CMakeLists.txt @@ -0,0 +1,6 @@ +if(NOT LITE_WITH_XPU) + return() +endif() + +lite_cc_library(xpu_runtime SRCS runtime.cc DEPS ${xpu_runtime_libs}) +lite_cc_library(xpu_builder SRCS builder.cc DEPS ${xpu_builder_libs} xpu_runtime tensor op scope) diff --git a/lite/backends/xpu/builder.cc b/lite/backends/xpu/builder.cc new file mode 100644 index 0000000000000000000000000000000000000000..796eaf9c46ceb3d29f1ffdc4c86ac45509f07ba1 --- /dev/null +++ b/lite/backends/xpu/builder.cc @@ -0,0 +1,189 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
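# Usage sketch (assumption, for illustration only) of the CreateTensorAndFillData helpers
# declared above in lite/backends/npu/builder.h; an NPU bridge would typically attach the
# resulting constant tensors to a const op via set_attr_value().
#
#   #include <vector>
#   #include "lite/backends/npu/builder.h"
#
#   // Hypothetical bridge snippet: build constant HIAI tensors.
#   void FillConstInputs() {
#     // Vector overload: the shape defaults to {data.size()} when omitted.
#     std::vector<float> scale_data = {0.5f, 0.25f, 0.125f};
#     ge::TensorPtr scale =
#         paddle::lite::npu::CreateTensorAndFillData(scale_data, {1, 3, 1, 1});
#     // Scalar overload: broadcasts a single value over the given shape.
#     ge::TensorPtr bias =
#         paddle::lite::npu::CreateTensorAndFillData(0.0f, {1, 3, 1, 1});
#     (void)scale;
#     (void)bias;
#   }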
+ +#include "lite/backends/xpu/builder.h" +#include // NOLINT +#include +#include "lite/backends/xpu/runtime.h" + +namespace paddle { +namespace lite { +namespace xpu { + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +std::string UniqueName(const std::string& prefix) { + static std::mutex counter_mtx; + static std::unordered_map counter_map; + std::unique_lock counter_lck(counter_mtx); + int counter = 1; + auto it = counter_map.find(prefix); + if (it == counter_map.end()) { + counter_map[prefix] = counter; + } else { + counter = ++(it->second); + } + return prefix + "_" + std::to_string(counter); +} + +xtcl::DataType CvtPrecisionType(PrecisionType in_type) { + xtcl::DataType out_type = ::xtcl::Float(32); + switch (in_type) { + case PRECISION(kFloat): + out_type = ::xtcl::Float(32); + break; + case PRECISION(kInt8): + out_type = ::xtcl::Int(8); + break; + case PRECISION(kInt32): + out_type = ::xtcl::Int(32); + break; + default: + LOG(FATAL) << "Can not convert precision type(" << PrecisionToStr(in_type) + << ") from Lite to XPU"; + break; + } + return out_type; +} + +DLDataType CvtDataType(PrecisionType in_type) { + DLDataType out_type = {kDLFloat, 32, 1}; + switch (in_type) { + case PRECISION(kFloat): + out_type = {kDLFloat, 32, 1}; + break; + case PRECISION(kInt8): + out_type = {kDLInt, 8, 1}; + break; + case PRECISION(kInt32): + out_type = {kDLInt, 32, 1}; + break; + default: + LOG(FATAL) << "Can not convert data type(" << PrecisionToStr(in_type) + << ") from Lite to XPU"; + break; + } + return out_type; +} + +xtcl::Array CvtShape(const std::vector& in_shape) { + xtcl::Array out_shape; + for (auto dim : in_shape) { + out_shape.push_back(dim); + } + return out_shape; +} + +xtcl::Array CvtShape(const std::vector& in_shape) { + return CvtShape(std::vector(in_shape.begin(), in_shape.end())); +} + +xtcl::Array CvtShape(const DDim& in_dims) { + return CvtShape(in_dims.Vectorize()); +} + +std::shared_ptr CvtTensor(lite::Tensor* in_tensor, + std::vector out_shape, + PrecisionType in_ptype, + DataLayoutType in_ltype) { + uint8_t* in_data = nullptr; + auto in_size = in_tensor->dims().production(); + auto in_shape = in_tensor->dims().Vectorize(); + if (out_shape.empty()) { + out_shape = in_shape; + } + int in_bytes; + if (in_ptype == PRECISION(kFloat)) { + in_data = reinterpret_cast(in_tensor->mutable_data()); + in_bytes = in_size * sizeof(float); + } else if (in_ptype == PRECISION(kInt32)) { + in_data = reinterpret_cast(in_tensor->mutable_data()); + in_bytes = in_size * sizeof(int32_t); + } else if (in_ptype == PRECISION(kInt8)) { + in_data = reinterpret_cast(in_tensor->mutable_data()); + in_bytes = in_size * sizeof(int8_t); + } else { + LOG(FATAL) << "Unknow precision type " << PrecisionToStr(in_ptype); + } + auto out_tensor = std::make_shared( + xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0})); + auto out_data = + reinterpret_cast(out_tensor->ToDLPack()->dl_tensor.data); + std::memcpy(out_data, in_data, in_bytes); + return out_tensor; +} + +// Build the XPU subgraph to the XPU model, store the model data into the +// weight tensor of the graph op, and the model data will be 
loaded again +// by the graph computing kernel when the graph op is executed for inference. +// Due to the lack of XPU APIs for building and outputing the model data, +// the compiled XPU runtime object will be managed by the global variable +// 'DeviceInfo' and the key name for finding the runtime object will be +// stored in the weight tensor of graph op. +// TODO(hong19860320) Compile the XPU subgraph and output the compiled model +// data to the weight tensor of graph op. +bool BuildModel( + std::shared_ptr builder, + std::shared_ptr params, + std::vector>* outputs, + lite::Tensor* model) { + LOG(INFO) << "[XPU] Build Model."; + CHECK(builder != nullptr); + CHECK(outputs != nullptr); + CHECK_GT(outputs->size(), 0); + CHECK(model != nullptr); + + // build graph and fill all of constant params + xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0])); + auto target = xtcl::Target::Create("llvm"); + auto compiler = xtcl::network::xTensorCompiler(network, target); + compiler.SetParams(*params); // set the data of constant tensors + compiler.Build(); + + // create and register runtime + auto runtime = std::make_shared( + compiler.CreateRuntimeInstance()); + if (runtime == nullptr) { + LOG(WARNING) << "[XPU] Build Model failed!"; + return false; + } + std::string name = UniqueName("xpu"); + LOG(INFO) << "[XPU] Model Name: " << name; + DeviceInfo::Global().Insert(name, runtime); + model->Resize({static_cast(name.length() + 1)}); + memcpy(model->mutable_data(), + reinterpret_cast(name.c_str()), + name.length() + 1); + return true; +} + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/builder.h b/lite/backends/xpu/builder.h new file mode 100644 index 0000000000000000000000000000000000000000..f0ac2b303aac7fa7f827e6e2f8f0fdf614b604b5 --- /dev/null +++ b/lite/backends/xpu/builder.h @@ -0,0 +1,60 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/target_wrapper.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace xpu { + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +std::string UniqueName(const std::string& prefix); + +xtcl::DataType CvtPrecisionType(PrecisionType in_type); + +DLDataType CvtDataType(PrecisionType in_type); + +xtcl::Array CvtShape(const std::vector& in_shape); + +xtcl::Array CvtShape(const std::vector& in_shape); + +xtcl::Array CvtShape(const DDim& in_dims); + +std::shared_ptr CvtTensor( + Tensor* in_tensor, + std::vector out_shape = {}, + PrecisionType in_ptype = PRECISION(kFloat), + DataLayoutType in_ltype = DATALAYOUT(kNCHW)); + +bool BuildModel( + std::shared_ptr builder, + std::shared_ptr params, + std::vector>* outputs, + lite::Tensor* model); + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/runtime.cc b/lite/backends/xpu/runtime.cc new file mode 100644 index 0000000000000000000000000000000000000000..a2c34b95758e8abf81c8294507d0ca60aad7c021 --- /dev/null +++ b/lite/backends/xpu/runtime.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/runtime.h" +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace xpu { + +// Extract the model data and recover the XPU model for inference, the function +// is called by the graph computing kernel when the graph op is executed. +// Due to the lack of XPU APIs for loading and recovering the XPU model from +// memory, the key name is obtained from the weight tensor of graph op, to get +// the runtime object for inference from the global variable 'DeviceInfo'. +// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op. +bool LoadModel(const lite::Tensor &model, + std::shared_ptr *runtime) { + LOG(INFO) << "[XPU] Load Model."; + CHECK_GT(model.dims().production(), 0); + std::string name(reinterpret_cast(model.data())); + LOG(INFO) << "[XPU] Model Name: " << name; + CHECK(runtime != nullptr); + *runtime = DeviceInfo::Global().Find(name); + if (*runtime == nullptr) { + LOG(WARNING) << "[XPU] Load Model failed!"; + return false; + } + return true; +} + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/runtime.h b/lite/backends/xpu/runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..4ff8d75bce6156d51a4988d427058da34460443f --- /dev/null +++ b/lite/backends/xpu/runtime.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
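# Sketch (assumption) of the convention used by lite::xpu::BuildModel above: the weight
# tensor of the graph op only carries a null-terminated key, and the actual runtime is
# looked up in the DeviceInfo registry declared in lite/backends/xpu/runtime.h just below.
#
#   #include <string>
#   #include "lite/backends/xpu/runtime.h"
#
#   // Hypothetical check used by a graph kernel before running inference.
#   bool XpuRuntimeExists(const paddle::lite::Tensor& model_weight) {
#     std::string key(
#         reinterpret_cast<const char*>(model_weight.data<int8_t>()));
#     return paddle::lite::xpu::DeviceInfo::Global().Find(key) != nullptr;
#   }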
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace xpu { + +class DeviceInfo { + public: + static DeviceInfo& Global() { + static DeviceInfo x; + return x; + } + DeviceInfo() {} + + void Insert(const std::string& name, + std::shared_ptr runtime) { + if (runtimes_.find(name) != runtimes_.end()) { + LOG(WARNING) << "[XPU] Model " << name << " already exists."; + return; + } + runtimes_.emplace(std::make_pair(name, runtime)); + } + + void Clear() { runtimes_.clear(); } + + std::shared_ptr Find( + const std::string& name) const { + if (runtimes_.find(name) != runtimes_.end()) { + return runtimes_.at(name); + } else { + return nullptr; + } + } + + private: + int device_id_{0}; + std::string device_name_{"default"}; + std::unordered_map> + runtimes_; +}; + +bool LoadModel(const lite::Tensor& model, + std::shared_ptr* runtime); + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index a5b581335047ff18c31ea9d1c03a9785e4ddf2ed..5eecf1d815d30fe0ef10a55c6b6b351795fe63ae 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -35,7 +35,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor) if (LITE_WITH_ARM) lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags NPU_DEPS npu_runtime) else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags XPU_DEPS xpu_runtime) endif() #-------------------------------------------- GET CODE META INFO ------------------------------------------ diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 127e2ea11c159217e6d943d852af5849d85a74b3..bc77afd81e0859b9492b2068ce681098a9393923 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -5,6 +5,6 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) -if(NOT LITE_WITH_OPENCL AND (LITE_WITH_X86 OR LITE_WITH_ARM)) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/context.h b/lite/core/context.h index 281a9e0d267b43b3c7a50f3172908909b362811a..f798dc3a60705828c3ea1606e76145d91216ae95 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -28,6 +28,9 @@ #ifdef LITE_WITH_NPU #include "lite/backends/npu/runtime.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/runtime.h" +#endif #include #include @@ -55,6 +58,7 @@ using X86Context = Context; using CUDAContext = Context; using ARMContext = Context; using NPUContext = Context; +using XPUContext = Context; using OpenCLContext = Context; using FPGAContext = Context; @@ -84,6 +88,20 @@ class Context { }; #endif +#ifdef LITE_WITH_XPU +template <> +class Context { 
+ public: + Context() {} + explicit Context(const NPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(XPUContext* ctx) {} + + std::string name() const { return "XPUContext"; } +}; +#endif + #ifdef LITE_WITH_ARM template <> class Context { @@ -340,6 +358,12 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + kernel_contexts_[TargetType::kXPU].As().CopySharedTo( + &ctx->As()); + break; +#endif #ifdef LITE_WITH_OPENCL case TARGET(kOpenCL): kernel_contexts_[TargetType::kOpenCL].As().CopySharedTo( @@ -386,6 +410,9 @@ class ContextScheduler { #endif #ifdef LITE_WITH_NPU InitContext(); +#endif +#ifdef LITE_WITH_XPU + InitContext(); #endif } diff --git a/lite/core/mir/pass_utils.cc b/lite/core/mir/pass_utils.cc index 804d4e1b5bc94f0e7804fa588e107a298210143b..cfa43f8d6e9dc4585a4618a003cb8e0bd9709642 100644 --- a/lite/core/mir/pass_utils.cc +++ b/lite/core/mir/pass_utils.cc @@ -53,6 +53,7 @@ void ExpandPlaces(std::set* places, const Place& place) { TARGET(kARM), TARGET(kOpenCL), TARGET(kNPU), + TARGET(kXPU), TARGET(kFPGA)}); static const Types precision_set( {PRECISION(kFloat), PRECISION(kInt8), PRECISION(kFP16), PRECISION(kAny)}); diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt index 76588b7027764a6afd54c33158a37589525ba8c0..95b5fe5ae13e03940bda8d83fcfc252b4ca490ab 100644 --- a/lite/core/mir/subgraph/CMakeLists.txt +++ b/lite/core/mir/subgraph/CMakeLists.txt @@ -16,7 +16,7 @@ set(subgraph_passes subgraph_pass) if(LITE_WITH_NPU) lite_cc_library(npu_pass SRCS generate_npu_program_pass.cc - DEPS mir_pass types context ${mir_fusers} ${npu_bridges} ${npu_ddk_libs} graph_op subgraph_pass) + DEPS mir_pass types context ${mir_fusers} ${npu_bridges} graph_op subgraph_pass) list(APPEND subgraph_passes npu_pass) lite_cc_test(test_npu_pass SRCS generate_npu_program_pass_test.cc DEPS npu_pass mir_passes paddle_api_full paddle_api_light gflags @@ -30,5 +30,21 @@ if(LITE_WITH_NPU) endif() endif() +if(LITE_WITH_XPU) + lite_cc_library(xpu_pass SRCS generate_xpu_program_pass.cc + DEPS mir_pass types context ${mir_fusers} ${xpu_bridges} ${xpu_builder_libs} graph_op subgraph_pass) + list(APPEND subgraph_passes xpu_pass) + lite_cc_test(test_xpu_pass SRCS generate_xpu_program_pass_test.cc + DEPS xpu_pass mir_passes paddle_api_full gflags + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 + --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL) + if (WITH_TESTING) + add_dependencies(test_xpu_pass extern_lite_download_mobilenet_v1_tar_gz) + add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) + set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set_target_properties(test_xpu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() +endif() + set(subgraph_passes ${subgraph_passes} CACHE INTERNAL "subgraph_passes") message(STATUS "----> subgraph_passes: ${subgraph_passes}") diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc index c47ab60d634672c9092cec83d4a7bfc74cf1a747..c5465a5edaa28d3cc2cfb4a7ffe0cca2e3c1bc79 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass.cc @@ -22,14 +22,9 @@ #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pattern_matcher.h" -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "ai_ddk_lib/include/graph/graph.h" 
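# Minimal sketch (assuming the existing ContextScheduler::NewContext API) of how a kernel
# obtains the new XPUContext added above.
#
#   #include "lite/core/context.h"
#
#   // Hypothetical: create a kernel context bound to the XPU target.
#   void MakeXpuKernelContext() {
#     auto ctx =
#         paddle::lite::ContextScheduler::Global().NewContext(TARGET(kXPU));
#     auto& xpu_ctx = ctx->As<paddle::lite::XPUContext>();
#     (void)xpu_ctx;  // xpu_ctx.name() returns "XPUContext"
#   }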
-#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" // for ge::op::Data -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/paddle_use_npu_bridges.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -51,7 +46,7 @@ std::shared_ptr GenerateNPUProgramPass::CvtVarNode( auto wgt = std::make_shared(arg.name); LOG(INFO) << "in convert const:" << arg.name; VLOG(4) << dims; - wgt->set_attr_value(lite::kernels::npu::bridges::CvtFromLiteTensor(tensor)); + wgt->set_attr_value(lite::npu::CvtFromLiteTensor(tensor)); return wgt; } else { CHECK_EQ(dims.size(), 4); @@ -132,7 +127,7 @@ std::string GenerateNPUProgramPass::BuildNPUGraph( // Compiling IR graph to NPU model and store mode data into weight tensor with // persistable=true, Sothat the model parser can recognize it and save it to // param files - if (!lite::kernels::npu::bridges::BuildModel(inputs, outputs, weight)) { + if (!lite::npu::BuildModel(inputs, outputs, weight)) { LOG(WARNING) << "Build NPU failed subgraph " << sub_id; throw std::runtime_error("Build NPU failed subgraph."); } diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h index be6b1aa24c8bf6ccab9bbdac198814350195b1b1..823ca5f1f624a9e920a5f395a9d5098c5ea52929 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ b/lite/core/mir/subgraph/generate_npu_program_pass.h @@ -20,10 +20,10 @@ #include #include #include +#include "lite/backends/npu/builder.h" #include "lite/core/mir/pass.h" #include "lite/core/mir/subgraph/subgraph_program_pass.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc index 88095df502fe05a51b548dde7ce09700855ffae3..95339d6175c98f22d542db24f02d6d714ccbe2a8 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc @@ -93,11 +93,13 @@ void CompareOutputTensor( auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape()); EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size); for (size_t j = 0; j < ref_output_tensor_size; j++) { - auto diff = - std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) / - (std::fabs(ref_output_tensor_data[j]) + 1e-6); - VLOG(3) << diff; - EXPECT_LT(diff, 0.1); + auto abs_diff = + std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]); + auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6); + VLOG(3) << "val: " << tar_output_tensor_data[j] + << " ref: " << ref_output_tensor_data[j] + << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; + EXPECT_LT(rel_diff, 0.1); } } } diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.cc b/lite/core/mir/subgraph/generate_xpu_program_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..319e1e51feb917b803753807ddbb1f72c2cb7084 --- /dev/null +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/subgraph/generate_xpu_program_pass.h" +#include +#include +#include +#include +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher.h" + +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace subgraph { + +std::shared_ptr GenerateXPUProgramPass::CvtVarNode( + lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, + lite::mir::Node* var_node, + const Scope* scope) { + CHECK(var_node->IsArg()); + const auto& arg = var_node->AsArg(); + auto var_name = arg.name; + VLOG(4) << "[XPU] Convert var node " << var_name; + + auto* var = scope->FindVar(var_name); + CHECK(var); + auto* tensor = var->GetMutable(); + CHECK(tensor); + auto dims = tensor->dims(); + auto cvted_var_node = + std::make_shared(graph_ctx->builder->CreateTensor( + var_name, lite::xpu::CvtShape(dims), ::xtcl::Float(32))); + if (arg.is_weight) { + auto cvted_var_tensor = lite::xpu::CvtTensor(tensor); + graph_ctx->params->emplace(std::make_pair(var_name, *cvted_var_tensor)); + } + return cvted_var_node; +} + +void GenerateXPUProgramPass::CvtAllOpNodes( + const std::vector& op_nodes, + lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, + lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes) { + const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); + const auto& supported_lists = bridges.AllFunctions(); + // return record all converted vars + // op node's inputs must be found in converted_vars + for (auto& node : op_nodes) { + lite::kernels::xpu::bridges::node_map_type input_nodes; + auto& stmt = node->AsStmt(); + for (auto& var_node : node->inlinks) { + auto& arg = var_node->AsArg(); + // weight should be handled in the converter, so skip here + if (arg.is_weight) { + continue; + } + auto var_name = arg.name; + if (!cvted_var_nodes->count(var_name)) { + cvted_var_nodes->insert(std::make_pair( + var_name, CvtVarNode(graph_ctx, var_node, stmt.op()->scope()))); + } + input_nodes.insert(*cvted_var_nodes->find(var_name)); + } + auto output_nodes = + supported_lists.at(stmt.op_type())(stmt.op(), graph_ctx, input_nodes); + cvted_var_nodes->insert(output_nodes.begin(), output_nodes.end()); + } +} + +std::string GenerateXPUProgramPass::BuildXPUGraph( + const std::unordered_set& op_nodes, + const std::unordered_set& in_data_vars, + const std::unordered_set& out_data_vars, + int sub_id) { + auto ordered_op_nodes = GetTopologicalOrder(op_nodes); + lite::kernels::xpu::bridges::graph_ctx_type graph_ctx; + graph_ctx.builder = std::make_shared(); + graph_ctx.params = + std::make_shared(); + lite::kernels::xpu::bridges::node_map_type cvted_var_nodes; + CvtAllOpNodes(ordered_op_nodes, &graph_ctx, &cvted_var_nodes); + + std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights"; + auto any_op = (*op_nodes.begin())->AsStmt().op(); + auto weight = any_op->scope()->Var(weight_var_name)->GetMutable(); + 
weight->set_persistable(true); + weight->set_precision(PRECISION(kInt8)); + // Compiling graph to XPU model and store mode data into weight tensor with + // persistable=true, Sothat the model parser can recognize it and save it to + // param files + std::vector> ordered_cvted_var_nodes; + for (auto out_data_var : out_data_vars) { + auto var_name = out_data_var->AsArg().name; + ordered_cvted_var_nodes.push_back(cvted_var_nodes[var_name]); + } + if (!lite::xpu::BuildModel(graph_ctx.builder, + graph_ctx.params, + &ordered_cvted_var_nodes, + weight)) { + LOG(WARNING) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; + throw std::runtime_error("[XPU] Build XPU graph failed."); + } + LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; + return weight_var_name; +} + +void GenerateXPUProgramPass::GenXPUSubgraph( + const std::unique_ptr& graph, + const std::unordered_set& op_nodes, + int sub_id) { + std::unordered_set in_data_vars; + std::unordered_set in_wgt_vars; + std::unordered_set out_data_vars; + std::unordered_set out_unused_vars; + FindInputOutputVars( + op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars); + + auto weight_var_name = + BuildXPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id); + + auto any_op = (*op_nodes.begin())->AsStmt().op(); + InsertNewNode(graph, + weight_var_name, + any_op->scope(), + any_op->valid_places(), + in_data_vars, + in_wgt_vars, + out_data_vars, + out_unused_vars); + + auto nodes2rm = GetNode2rm( + op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars}); + + GraphSafeRemoveNodes(graph.get(), nodes2rm); +} + +void GenerateXPUProgramPass::Apply(const std::unique_ptr& graph) { + LOG(INFO) << "[XPU] Before XPU Pass \n" << Visualize(graph.get()); + const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); + const auto& op_map = bridges.AllFunctions(); + std::vector supported_op_types; + for (auto& i : op_map) { + LOG(INFO) << "[XPU] Supported type: " << i.first; + supported_op_types.push_back(i.first); + } + + try { + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[XPU] Converting Subgraph " << id; + GenXPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; + } + } catch (...) 
{ + LOG(WARNING) << "[XPU] Build XPU graph failed."; + throw std::runtime_error("[XPU] Build XPU graph failed."); + } + + for (auto& item : graph->StmtTopologicalOrder()) { + if (item->IsStmt()) { + auto& stmt = item->AsStmt(); + LOG(INFO) << stmt; + insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); + } + } +} + +std::unique_ptr GenerateXPUProgramPass::GenProgram() { + LOG(INFO) << "[XPU] program insts.size=" << insts_.size(); + std::unique_ptr program( + new RuntimeProgram(std::move(insts_))); + return program; +} + +} // namespace subgraph +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(generate_xpu_program_pass, + paddle::lite::mir::subgraph::GenerateXPUProgramPass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.h b/lite/core/mir/subgraph/generate_xpu_program_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..cf121ae9503201e8cf6be40fe9054ccaf6e4b172 --- /dev/null +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "lite/backends/xpu/builder.h" +#include "lite/core/mir/pass.h" +#include "lite/core/mir/subgraph/subgraph_program_pass.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace subgraph { + +class GenerateXPUProgramPass : public SubgraphProgramPass { + public: + using key2nodes_t = std::map; + + void Apply(const std::unique_ptr& graph) override; + std::unique_ptr GenProgram(); + + protected: + // nodes2cvt: op nodes to convert + // return cvted_vars: converted var nodes + void CvtAllOpNodes( + const std::vector& op_nodes, + lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, + lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes); + + std::shared_ptr CvtVarNode( + lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, + lite::mir::Node* var_node, + const Scope* scope); + + std::string BuildXPUGraph(const std::unordered_set& op_nodes, + const std::unordered_set& in_data_vars, + const std::unordered_set& out_data_vars, + int sub_id); + + void GenXPUSubgraph(const std::unique_ptr& graph, + const std::unordered_set& op_nodes, + int sub_id); + + private: + std::vector insts_; +}; + +} // namespace subgraph +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc b/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..728ecbc6b77666accd432b1ad82a03860588ab40 --- /dev/null +++ b/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +DEFINE_string(model_file, "", "model file path of combined protobuf model"); +DEFINE_string(params_file, "", "params file path of combined protobuf model"); +DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model"); +DEFINE_string(input_tensor_shape, "1,3,224,224", "shapes of input tensors"); +DEFINE_int32(output_tensor_num, 1, "number of output tensors"); + +namespace paddle { +namespace lite { + +std::vector> ParseShape(std::string txt) { + std::vector> shape; + while (!txt.empty()) { + size_t idx = txt.find_first_of(":"); + std::string dims = txt.substr(0, idx); + std::vector s; + while (!dims.empty()) { + size_t idx = dims.find_first_of(","); + int d = atoi(dims.substr(0, idx).c_str()); + VLOG(3) << d; + s.push_back(d); + if (idx == std::string::npos) { + break; + } else { + dims = dims.substr(idx + 1); + } + } + shape.push_back(s); + if (idx == std::string::npos) { + break; + } else { + txt = txt.substr(idx + 1); + } + } + return shape; +} + +int64_t ShapeProduction(std::vector shape) { + int64_t s = 1; + for (int64_t dim : shape) { + s *= dim; + } + return s; +} + +void FillInputTensor( + const std::shared_ptr& predictor, + const std::vector>& input_tensor_shape, + const float value) { + for (int i = 0; i < input_tensor_shape.size(); i++) { + auto input_tensor = predictor->GetInput(i); + input_tensor->Resize(input_tensor_shape[i]); + auto input_tensor_data = input_tensor->mutable_data(); + auto input_tensor_size = ShapeProduction(input_tensor->shape()); + for (int j = 0; j < input_tensor_size; j++) { + input_tensor_data[j] = value; + } + } +} + +void CompareOutputTensor( + const std::shared_ptr& tar_predictor, + const std::shared_ptr& ref_predictor, + const int output_tensor_num) { + for (int i = 0; i < output_tensor_num; i++) { + auto tar_output_tensor = tar_predictor->GetOutput(i); + auto ref_output_tensor = ref_predictor->GetOutput(i); + auto tar_output_tensor_data = tar_output_tensor->data(); + auto ref_output_tensor_data = ref_output_tensor->data(); + auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); + auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape()); + EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size); + for (size_t j = 0; j < ref_output_tensor_size; j++) { + auto diff = + std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) / + (std::fabs(ref_output_tensor_data[j]) + 1e-6); + VLOG(3) << diff; + EXPECT_LT(diff, 0.1); + } + } +} + +std::shared_ptr TestModel( + const std::string& model_dir, + const std::string& model_file, + const std::string& params_file, + const std::vector& valid_places, + const std::vector>& input_tensor_shape, + const std::string& optimized_model_dir) { + // generate 
optimized model + lite_api::CxxConfig cxx_config; + cxx_config.set_model_dir(model_dir); + cxx_config.set_model_file(model_file); + cxx_config.set_param_file(params_file); + cxx_config.set_valid_places(valid_places); + auto predictor = lite_api::CreatePaddlePredictor(cxx_config); + FillInputTensor(predictor, input_tensor_shape, -1); + predictor->SaveOptimizedModel(optimized_model_dir, + lite_api::LiteModelType::kNaiveBuffer); +#if 0 // TODO(hong19860320) supports light api for XPU + // load optimized model + lite_api::MobileConfig mobile_config; + mobile_config.set_model_dir(optimized_model_dir); + mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH); + mobile_config.set_threads(1); + predictor = lite_api::CreatePaddlePredictor(mobile_config); + FillInputTensor(predictor, input_tensor_shape, 1); +#endif + // run optimized model + for (int i = 0; i < FLAGS_warmup; i++) { + predictor->Run(); + } + for (int i = 0; i < FLAGS_repeats; i++) { + auto start = GetCurrentUS(); + predictor->Run(); + LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; + } + return predictor; +} + +TEST(XPUSubgraph, compare) { + // parsing input tensor shape, supported formats: "1,3,224,224" + // "1,3,224,224:1,80" + std::vector> input_tensor_shape = + ParseShape(FLAGS_input_tensor_shape); + // generate and run optimized CPU model + LOG(INFO) << " ================ CPU ================== "; + auto cpu_predictor = + TestModel(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_params_file, + {lite_api::Place{TARGET(kX86), PRECISION(kFloat)}}, + input_tensor_shape, + FLAGS_optimized_model_dir + "/CPU"); + // generate and run optimized XPU model + LOG(INFO) << " ================ XPU ================== "; + auto xpu_predictor = + TestModel(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_params_file, + {lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}}, + input_tensor_shape, + FLAGS_optimized_model_dir + "/XPU"); + // verify results + CompareOutputTensor(xpu_predictor, cpu_predictor, FLAGS_output_tensor_num); +} + +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/subgraph/subgraph_program_pass.cc b/lite/core/mir/subgraph/subgraph_program_pass.cc index 31c28ad89cd419090fd14bfc367a9ef5eeaf9b15..0cb2261a3fca7aa47119b18900d38ecfd8229299 100644 --- a/lite/core/mir/subgraph/subgraph_program_pass.cc +++ b/lite/core/mir/subgraph/subgraph_program_pass.cc @@ -207,8 +207,26 @@ void SubgraphProgramPass::InferOnce(const std::unique_ptr& graph) { if (!item->IsStmt()) continue; auto& stmt = item->AsStmt(); auto& op = stmt.op(); + auto scope = op->scope(); std::string op_type = op->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; + // check the dimension of input variables in the scope, must not be empty ! 
+ if (op_type == "feed") { + auto input_var_names = op->op_info()->output_names(); + CHECK_GE(input_var_names.size(), 1); + for (auto input_var_name : input_var_names) { + auto input_var = scope->FindVar(input_var_name); + CHECK(input_var) << "No input variable '" << input_var_name + << "' found in scope " << scope; + auto input = input_var->GetMutable(); + CHECK(!input->dims().empty()) << "The dimension of input variable '" + << input_var_name + << "' cannot be empty."; + } + continue; + } + if (op_type == "fetch") { + continue; + } op->CheckShape(); op->InferShape(); // TODO(xxx): remove Launch() eventually diff --git a/lite/core/mir/subgraph/subgraph_program_pass_test.cc b/lite/core/mir/subgraph/subgraph_program_pass_test.cc index 625c9ac92435296ddb9a9ad2b116aef7fe6ea3f8..22e20b81d831ff25df090a7565e671b9139122f7 100644 --- a/lite/core/mir/subgraph/subgraph_program_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_program_pass_test.cc @@ -46,6 +46,9 @@ TEST(SubgraphTest, models) { #endif #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, +#endif +#ifdef LITE_WITH_XPU + Place{TARGET(kXPU), PRECISION(kFloat)}, #endif }); lite::Program program(program_desc, scope, valid_places); diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index 0fdce27e3b5381cb455a346800a47e2a42e9f4ba..ad974a781c7c899428015907a4166d8d0c351c76 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -78,6 +78,9 @@ std::list> KernelRegistry::Create( case TARGET(kNPU): { CREATE_KERNEL(kNPU); } break; + case TARGET(kXPU): { + CREATE_KERNEL(kXPU); + } break; case TARGET(kFPGA): { CREATE_KERNEL(kFPGA); } break; @@ -142,6 +145,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kNPU, kAny, kNCHW); INIT_FOR(kNPU, kAny, kAny); + INIT_FOR(kXPU, kFloat, kNCHW); + INIT_FOR(kXPU, kInt8, kNCHW); + INIT_FOR(kXPU, kAny, kNCHW); + INIT_FOR(kXPU, kAny, kAny); + INIT_FOR(kFPGA, kFP16, kNHWC); INIT_FOR(kFPGA, kFP16, kAny); INIT_FOR(kFPGA, kFloat, kNHWC); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 25375b8a8f795e58194d6223f617273beac3b78e..1c67ee8f3dcafe30d9bda587d62233d0e715071e 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -178,6 +178,16 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 93d8a148c909c1d4682664eca2fe7dc172f4f280..739615e2763f509f2dec97f5ab3e536aca7acc4f 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -28,6 +28,9 @@ #ifdef LITE_WITH_NPU #include "lite/core/mir/subgraph/generate_npu_program_pass.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/core/mir/subgraph/generate_xpu_program_pass.h" +#endif namespace paddle { namespace lite { @@ -106,7 +109,8 @@ class Optimizer { "runtime_context_assign_pass", "argument_type_display_pass", // -#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) +#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \ + !defined(LITE_WITH_XPU) // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel "memory_optimize_pass", #endif @@ -121,14 +125,27 @@ class Optimizer { // Generate a new program based on the mir graph. 
std::unique_ptr GenRuntimeProgram() { +#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) + auto target_place = Place{ #ifdef LITE_WITH_NPU - if (std::find(valid_places_.begin(), - valid_places_.end(), - Place{TARGET(kNPU), PRECISION(kFloat)}) != + TARGET(kNPU), +#endif +#ifdef LITE_WITH_XPU + TARGET(kXPU), +#endif + PRECISION(kFloat)}; + if (std::find(valid_places_.begin(), valid_places_.end(), target_place) != valid_places_.end()) { +#ifdef LITE_WITH_NPU auto pass = mir::PassManager::Global() .LookUp( "generate_npu_program_pass"); +#endif +#ifdef LITE_WITH_XPU + auto pass = mir::PassManager::Global() + .LookUp( + "generate_xpu_program_pass"); +#endif try { pass->Apply(graph_); auto program = pass->GenProgram(); @@ -136,7 +153,8 @@ class Optimizer { program->set_exec_scope(exec_scope_); return program; } catch (...) { - LOG(WARNING) << "Build NPU graph failed"; + LOG(WARNING) << "Build " << TargetToStr(target_place.target) + << " program failed!"; } } #endif diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index d83657ad3e24eb5661225a4a0684c141e40a6163..40c95415546d99a66abf2d6f3595ae8695c4df86 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} EXCLUDE_COMPILE_DEPS "ON" @@ -42,6 +43,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} EXCLUDE_COMPILE_DEPS "ON" diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 1996f50133acc6f3bdf651e8c0daae5b68c96832..0bfd39ae9a0bdf6e8af606711fd4dcc6011994b5 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -9,3 +9,4 @@ add_subdirectory(x86) add_subdirectory(opencl) add_subdirectory(fpga) add_subdirectory(npu) +add_subdirectory(xpu) diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 77d0097c6955c43c12bdaac8ce0410b24c5cf526..032de819743f4aba02e442dd71c26b950d1435b6 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,7 +1,6 @@ -lite_cc_library(npu_bridge_registry SRCS registry.cc DEPS ${npu_ddk_libs}) -lite_cc_library(npu_bridge_utils SRCS utils.cc DEPS ${npu_ddk_libs} npu_runtime tensor op scope) +lite_cc_library(npu_bridge_registry SRCS registry.cc) -set(npu_bridge_deps npu_bridge_registry npu_bridge_utils op) +set(npu_bridge_deps npu_bridge_registry npu_builder op) lite_cc_library(npu_bridge_fc_op SRCS fc_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_conv_op SRCS conv_op.cc DEPS ${npu_bridge_deps}) @@ -23,7 +22,6 @@ lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps}) set(npu_bridges npu_bridge_registry - npu_bridge_utils npu_bridge_fc_op npu_bridge_conv_op npu_bridge_mul_op @@ -43,7 +41,7 @@ set(npu_bridges npu_bridge_pad2d_op CACHE INTERNAL "npu_bridges") -set(npu_bridge_test_deps ${npu_ddk_libs} ${npu_bridges} ${npu_kernels} ${ops}) +set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops}) lite_cc_test(test_npu_bridge_fc_op SRCS fc_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_conv_op SRCS conv_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) diff --git 
a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 1e8500ef28eed25cd8514846b98e7ebfacb946a2..2b3a415ad72d5629d343678f65e2e0040fafda14 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type ActConverter(const std::shared_ptr act_op, auto scope = act_op->scope(); auto op_info = act_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; // create act node and set input node from inputs_map @@ -40,8 +34,8 @@ node_map_type ActConverter(const std::shared_ptr act_op, auto act_node = std::make_shared(unique_op_type); CHECK(inputs_map.count(x_var_name)); act_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(act_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(act_node); // parse and set activation type int act_mode = 1; diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index 9f3a506d764eb9a635f46e30715b00e17b62d572..5b3cbd52133b61f0c0e37e2ba9bf2f6775f7a2b4 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type BatchNormConverter( auto scope = batch_norm_op->scope(); auto op_info = batch_norm_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr batch_norm_node = @@ -43,27 +37,27 @@ node_map_type BatchNormConverter( auto scale_var_name = op_info->Input("Scale").front(); lite::Tensor* scale = scope->FindVar(scale_var_name)->GetMutable(); auto npu_scale = std::make_shared(scale_var_name); - npu_scale->set_attr_value(CvtFromLiteTensor(scale)); - OpList::Global().add(npu_scale); + npu_scale->set_attr_value(lite::npu::CvtFromLiteTensor(scale)); + lite::npu::OpList::Global().add(npu_scale); auto bias_var_name = op_info->Input("Bias").front(); lite::Tensor* bias = scope->FindVar(bias_var_name)->GetMutable(); auto npu_bias = std::make_shared(bias_var_name); - npu_bias->set_attr_value(CvtFromLiteTensor(bias)); - OpList::Global().add(npu_bias); + npu_bias->set_attr_value(lite::npu::CvtFromLiteTensor(bias)); + lite::npu::OpList::Global().add(npu_bias); auto mean_var_name = op_info->Input("Mean").front(); lite::Tensor* mean = scope->FindVar(mean_var_name)->GetMutable(); auto npu_mean = std::make_shared(mean_var_name); - npu_mean->set_attr_value(CvtFromLiteTensor(mean)); - OpList::Global().add(npu_mean); + npu_mean->set_attr_value(lite::npu::CvtFromLiteTensor(mean)); + lite::npu::OpList::Global().add(npu_mean); auto variance_var_name = op_info->Input("Variance").front(); lite::Tensor* variance = scope->FindVar(variance_var_name)->GetMutable(); auto npu_variance = std::make_shared(variance_var_name); - npu_variance->set_attr_value(CvtFromLiteTensor(variance)); - OpList::Global().add(npu_variance); + npu_variance->set_attr_value(lite::npu::CvtFromLiteTensor(variance)); + lite::npu::OpList::Global().add(npu_variance); float npu_momentum = op_info->GetAttr("momentum"); float npu_epsilon = op_info->GetAttr("epsilon"); @@ -80,8 +74,8 @@ node_map_type BatchNormConverter( batch_norm_node->set_attr_mode(npu_mode); batch_norm_node->set_attr_use_global_stats(npu_use_global_stats); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(batch_norm_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(batch_norm_node); node_map_type outputs_map; outputs_map[op_info->Output("Y").front()] = batch_norm_node; diff --git a/lite/kernels/npu/bridges/concat_op.cc b/lite/kernels/npu/bridges/concat_op.cc index 9684031ac777cc524b5324d07e7a54ce9d954453..9be47339354c5602f98583b5163d11e037570321 100644 --- a/lite/kernels/npu/bridges/concat_op.cc +++ b/lite/kernels/npu/bridges/concat_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type ConcatConverter(const std::shared_ptr concat_op, lite::Scope* scope = concat_op->scope(); const lite::OpInfo* op_info = concat_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "converting " << op_type << " ... "; auto x_var_names = op_info->Input("X"); @@ -48,17 +42,17 @@ node_map_type ConcatConverter(const std::shared_ptr concat_op, for (auto x_var_name : x_var_names) { if (inputs_map.find(x_var_name) != inputs_map.end()) { output_node->set_dynamic_input_x(index + 1, *inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); } else { auto consty = std::make_shared(x_var_name); auto* x = scope->FindVar(x_var_name)->GetMutable(); - consty->set_attr_value(CvtFromLiteTensor(x)); + consty->set_attr_value(lite::npu::CvtFromLiteTensor(x)); output_node->set_dynamic_input_x(index + 1, *consty); - OpList::Global().add(consty); + lite::npu::OpList::Global().add(consty); } index++; } - OpList::Global().add(output_node); + lite::npu::OpList::Global().add(output_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = output_node; diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index db1f72ed69d5a9ce73625308fbac7fcb54cc137f..2a4ae56a515b8119324d944e14d20f5ad4295fd3 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto scope = conv_op->scope(); auto op_info = conv_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " << op_type << "... 
"; // get input, filter and op attributes @@ -78,13 +72,13 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, // check input CHECK(inputs_map.count(input_var_name)); - OpList::Global().add(inputs_map.at(input_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(input_var_name)); // create filter node CHECK(!inputs_map.count(filter_var_name)); auto filter_const_node = std::make_shared(filter_var_name); - filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); - OpList::Global().add(filter_const_node); + filter_const_node->set_attr_value(lite::npu::CvtFromLiteTensor(filter)); + lite::npu::OpList::Global().add(filter_const_node); // create bias node if has bias // supports the bias nodes with the following dimensions @@ -93,7 +87,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, // 2: {n, oc, oh, ow} std::shared_ptr bias_node = nullptr; bool is_channel_bias = false; - if (HasInputArg(op_info, scope, "Bias")) { + if (lite::npu::HasInputArg(op_info, scope, "Bias")) { auto bias_var_name = op_info->Input("Bias").front(); auto* bias = scope->FindVar(bias_var_name)->GetMutable(); auto bias_dims = bias->dims(); @@ -121,10 +115,11 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, } else { // bias node with const data auto bias_const_node = std::make_shared(bias_var_name); - bias_const_node->set_attr_value(CvtFromLiteTensor(bias, bias_shape)); + bias_const_node->set_attr_value( + lite::npu::CvtFromLiteTensor(bias, bias_shape)); bias_node = bias_const_node; } - OpList::Global().add(bias_node); + lite::npu::OpList::Global().add(bias_node); } // create conv node and set input, filter, bias nodes and attributes @@ -147,7 +142,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, ge::AttrValue::LIST_INT({strides[0], strides[1]})); depthwise_conv_node->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - OpList::Global().add(depthwise_conv_node); + lite::npu::OpList::Global().add(depthwise_conv_node); conv_node = depthwise_conv_node; // ConvolutionDepthwise Op doesn't support bias, so append Add node to // support bias @@ -155,7 +150,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto add_node = std::make_shared(unique_op_type + "/add"); add_node->set_input_x1(*depthwise_conv_node); add_node->set_input_x2(*bias_node); - OpList::Global().add(add_node); + lite::npu::OpList::Global().add(add_node); conv_node = add_node; } } else { @@ -174,7 +169,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, ge::AttrValue::LIST_INT({strides[0], strides[1]})); common_conv_node->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - OpList::Global().add(common_conv_node); + lite::npu::OpList::Global().add(common_conv_node); conv_node = common_conv_node; // Convolution Op only support bias with dimension {1, oc, 1, 1}, // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow) @@ -185,7 +180,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto add_node = std::make_shared(unique_op_type + "/add"); add_node->set_input_x1(*common_conv_node); add_node->set_input_x2(*bias_node); - OpList::Global().add(add_node); + lite::npu::OpList::Global().add(add_node); conv_node = add_node; } } @@ -199,7 +194,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, std::make_shared(unique_op_type + "/relu"); relu_node->set_input_x(*conv_node); relu_node->set_attr_mode(1); - OpList::Global().add(relu_node); + lite::npu::OpList::Global().add(relu_node); 
outputs_map[op_info->Output("Output").front()] = relu_node; } else { outputs_map[op_info->Output("Output").front()] = conv_node; diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 21e3c73d324a45a4bebea23368eae3542f7a5ab6..f8392ec8d9b08c86a571b47187715c5bb251570f 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type ConvTransposeConverter( auto scope = conv_transpose_op->scope(); auto op_info = conv_transpose_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " << op_type << "... "; // get input, output and op attributes @@ -70,21 +64,22 @@ node_map_type ConvTransposeConverter( } auto input_sizes_const_node = std::make_shared(unique_op_type + "/input_size"); - input_sizes_const_node->set_attr_value(CreateTensorAndFillData(output_shape)); + input_sizes_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(output_shape)); conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); - OpList::Global().add(input_sizes_const_node); + lite::npu::OpList::Global().add(input_sizes_const_node); // create filter node CHECK(!inputs_map.count(filter_var_name)); auto filter_const_node = std::make_shared(filter_var_name); - filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); + filter_const_node->set_attr_value(lite::npu::CvtFromLiteTensor(filter)); conv_transpose_node->set_input_filter(*filter_const_node); - OpList::Global().add(filter_const_node); + lite::npu::OpList::Global().add(filter_const_node); // set input node CHECK(inputs_map.count(input_var_name)); conv_transpose_node->set_input_x(*inputs_map.at(input_var_name)); - OpList::Global().add(inputs_map.at(input_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(input_var_name)); // set attributes conv_transpose_node->set_attr_mode(1); @@ -99,11 +94,11 @@ node_map_type ConvTransposeConverter( ge::AttrValue::LIST_INT({strides[0], strides[1]})); conv_transpose_node->set_attr_kernel( ge::AttrValue::LIST_INT({filter_shape[2], filter_shape[3]})); - OpList::Global().add(conv_transpose_node); + lite::npu::OpList::Global().add(conv_transpose_node); // append add node to add bias if has bias std::shared_ptr output_node = conv_transpose_node; - if (HasInputArg(op_info, scope, "Bias")) { + if (lite::npu::HasInputArg(op_info, scope, "Bias")) { // create bias node auto bias_var_name = op_info->Input("Bias").front(); CHECK(!inputs_map.count(bias_var_name)); @@ -112,13 +107,13 @@ node_map_type ConvTransposeConverter( CHECK_EQ(channel_size, filter_shape[1] * groups); auto bias_const_node = std::make_shared(bias_var_name); bias_const_node->set_attr_value( - CvtFromLiteTensor(bias, {1, channel_size, 1, 1})); - OpList::Global().add(bias_const_node); + lite::npu::CvtFromLiteTensor(bias, {1, 
channel_size, 1, 1})); + lite::npu::OpList::Global().add(bias_const_node); // append add node to add bias node auto add_node = std::make_shared(unique_op_type + "/add"); add_node->set_input_x1(*conv_transpose_node); add_node->set_input_x2(*bias_const_node); - OpList::Global().add(add_node); + lite::npu::OpList::Global().add(add_node); output_node = add_node; } @@ -129,7 +124,7 @@ node_map_type ConvTransposeConverter( std::make_shared(unique_op_type + "/relu"); relu_node->set_input_x(*output_node); relu_node->set_attr_mode(1); - OpList::Global().add(relu_node); + lite::npu::OpList::Global().add(relu_node); outputs_map[op_info->Output("Output").front()] = relu_node; } else { outputs_map[op_info->Output("Output").front()] = output_node; diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index e42a933e0972041eb835c8435188db2d47c77180..6ba7acc254c0c352fe46aeee77ac3a5d25c4582f 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type ElementwiseConverter( auto scope = elementwise_op->scope(); auto op_info = elementwise_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "converting elementwise..."; std::shared_ptr elementwise_node = @@ -47,20 +41,20 @@ node_map_type ElementwiseConverter( CHECK(inputs_map.find(x_var_name) != inputs_map.end()); elementwise_node->set_input_x1(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); if (inputs_map.find(y_var_name) != inputs_map.end()) { elementwise_node->set_input_x2(*inputs_map.at(y_var_name)); - OpList::Global().add(inputs_map.at(y_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); } else { auto consty = std::make_shared(y_var_name); auto* y = scope->FindVar(y_var_name)->GetMutable(); - consty->set_attr_value(CvtFromLiteTensor(y)); + consty->set_attr_value(lite::npu::CvtFromLiteTensor(y)); elementwise_node->set_input_x2(*consty); - OpList::Global().add(consty); + lite::npu::OpList::Global().add(consty); } - OpList::Global().add(elementwise_node); + lite::npu::OpList::Global().add(elementwise_node); // paddlelite has sum only elementwise_node->set_attr_mode(1); diff --git a/lite/kernels/npu/bridges/fc_op.cc b/lite/kernels/npu/bridges/fc_op.cc index b96d62fd27cd2d40938d6396df7276fa8c64b377..1233ccedd4086bfca36fa4f1ba996814cc68127d 100644 --- a/lite/kernels/npu/bridges/fc_op.cc +++ b/lite/kernels/npu/bridges/fc_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -29,19 +23,22 @@ namespace bridges { node_map_type FCConverter(const std::shared_ptr fc_op, const node_map_type& inputs_map) { - LOG(INFO) << "Converting fc..."; - lite::Scope* scope = fc_op->scope(); - const lite::OpInfo* op_info = fc_op->op_info(); - auto output_node = std::make_shared(UniqueName("fc")); + auto scope = fc_op->scope(); + auto op_info = fc_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "Converting " + op_type + "..."; + + auto fc_node = std::make_shared(unique_op_type); auto x_var_name = op_info->Input("Input").front(); auto w_var_name = op_info->Input("W").front(); int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* wtensor = scope->FindVar(w_var_name)->GetMutable(); - auto x_dims = xtensor->dims(); - auto w_dims = wtensor->dims(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto w = scope->FindVar(w_var_name)->GetMutable(); + auto x_dims = x->dims(); + auto w_dims = w->dims(); CHECK_GE(x_dims.size(), 2UL); CHECK_EQ(w_dims.size(), 2UL); @@ -49,65 +46,69 @@ node_map_type FCConverter(const std::shared_ptr fc_op, int m = x_dims.Slice(0, in_num_col_dims).production(); int k = x_dims.Slice(in_num_col_dims, x_dims.size()).production(); int n = w_dims[1]; + CHECK_EQ(k * n, w_dims.production()); + VLOG(3) << "x dims: " << x_dims << " w dims: " << w_dims << " m: " << m + << " k: " << k << " n: " << n; CHECK(inputs_map.count(x_var_name)); CHECK(!inputs_map.count(w_var_name)); - LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; - LOG(INFO) << "x_var_name:" << x_var_name - << ", is data: " << inputs_map.count(x_var_name); - LOG(INFO) << "w_var_name:" << w_var_name - << ", is data: " << inputs_map.count(w_var_name); - - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - OpList::Global().add(xsrc); - OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); - - auto wconst = std::make_shared(w_var_name); - ge::TensorDesc wdesc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = wdesc.GetShape().GetShapeSize(); - CHECK_EQ(size, w_dims.production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(wdesc); - auto* pdata = reinterpret_cast(wtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - wconst->set_attr_value(ptensor); - OpList::Global().add(wconst); - output_node->set_input_w(*wconst); - - if (HasInputArg(op_info, scope, "Bias")) { - auto b_var_name = op_info->Input("Bias").front(); - auto* btensor = scope->FindVar(b_var_name)->GetMutable(); - - LOG(INFO) << "b_var_name:" << b_var_name - << ", is data: " << inputs_map.count(b_var_name); - CHECK(!inputs_map.count(b_var_name)); - CHECK_EQ(btensor->numel(), n); - - auto bconst = std::make_shared(b_var_name); - ge::TensorDesc bdesc( - ge::Shape({1, n, 1, 1}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = 
bdesc.GetShape().GetShapeSize(); - CHECK_EQ(size, n); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(bdesc); - auto* pdata = reinterpret_cast(btensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - bconst->set_attr_value(ptensor); - OpList::Global().add(bconst); - output_node->set_input_bias(*bconst); - output_node->set_attr_has_bias(ge::AttrValue::BOOL{true}); + // reshape x to (m, k, 1, 1) + auto reshaped_x_node = + std::make_shared(x_var_name + "_reshape"); + reshaped_x_node->set_input_tensor(*inputs_map.at(x_var_name)); + reshaped_x_node->set_attr_shape({m, k, 1, 1}); + reshaped_x_node->set_attr_axis(0); + fc_node->set_input_x(*reshaped_x_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reshaped_x_node); + + // create w const node, set its shape to (k, n, 1, 1) and fill with + // the transposed w tensor + auto w_const_node = std::make_shared(w_var_name); + ge::TensorDesc w_const_desc( + ge::Shape({n, k, 1, 1}), ge::FORMAT_NCHW, ge::DT_FLOAT); + ge::TensorPtr w_const_tensor = std::make_shared(); + w_const_tensor->SetTensorDesc(w_const_desc); + auto w_data = w->mutable_data(); + std::vector transposed_w_data(w_dims.production()); + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + transposed_w_data[j * k + i] = w_data[i * n + j]; + } + } + w_const_tensor->SetData(reinterpret_cast(transposed_w_data.data()), + transposed_w_data.size() * sizeof(float)); + w_const_node->set_attr_value(w_const_tensor); + fc_node->set_input_w(*w_const_node); + lite::npu::OpList::Global().add(w_const_node); + + // add bias node if bias tensor exists + if (lite::npu::HasInputArg(op_info, scope, "Bias")) { + auto bias_var_name = op_info->Input("Bias").front(); + auto bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_dims = bias->dims(); + CHECK(!inputs_map.count(bias_var_name)); + CHECK_EQ(bias_dims.production(), n); + + auto bias_const_node = std::make_shared(bias_var_name); + bias_const_node->set_attr_value( + lite::npu::CvtFromLiteTensor(bias, {1, n, 1, 1})); + fc_node->set_input_b(*bias_const_node); + lite::npu::OpList::Global().add(bias_const_node); } + lite::npu::OpList::Global().add(fc_node); - OpList::Global().add(output_node); + // reshape output of fc_node from (m, n, 1, 1) to (m, n) + auto reshaped_fc_node = + std::make_shared(unique_op_type + "_reshape"); + reshaped_fc_node->set_input_tensor(*fc_node); + reshaped_fc_node->set_attr_shape({m, n}); + reshaped_fc_node->set_attr_axis(0); + lite::npu::OpList::Global().add(reshaped_fc_node); node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; + outputs_map[op_info->Output("Out").front()] = reshaped_fc_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/fc_op_test.cc b/lite/kernels/npu/bridges/fc_op_test.cc index 92936dc6bfdb73df104e93b213f26ac6eedcd4b1..77015236e2eed847d0ec0ea5c06e646e5893f29a 100644 --- a/lite/kernels/npu/bridges/fc_op_test.cc +++ b/lite/kernels/npu/bridges/fc_op_test.cc @@ -126,6 +126,7 @@ TEST(NPUBridges, fc) { test_fc({1, 8, 8, 1}, {8, 4}, 2, use_bias); test_fc({1, 5, 5, 1}, {5, 7}, 2, use_bias); test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); + test_fc({1, 1024, 1, 1}, {1024, 1000}, 1, use_bias); } } diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index 8f3e20b023621ab3a1257a7e92686ca32bcdade2..b0cfa1c28fae68ec936e8715fb25d59853d063bc 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ 
b/lite/kernels/npu/bridges/interpolate_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,13 +27,13 @@ node_map_type InterpolateConverter( auto scope = interpolate_op->scope(); auto op_info = interpolate_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; // get input, output and attributes from lite op auto x_var_name = op_info->Input("X").front(); CHECK(inputs_map.count(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); auto x = scope->FindVar(x_var_name)->GetMutable(); auto x_dims = x->dims(); @@ -64,7 +58,7 @@ node_map_type InterpolateConverter( // update out_h and out_w if has OutSize bool inputs_map_has_w = false; - if (HasInputArg(op_info, scope, "OutSize")) { + if (lite::npu::HasInputArg(op_info, scope, "OutSize")) { auto out_size_var_name = op_info->Input("OutSize").front(); if (inputs_map.count(out_size_var_name)) { inputs_map_has_w = true; @@ -83,12 +77,12 @@ node_map_type InterpolateConverter( auto interp_method = op_info->GetAttr("interp_method"); if (interp_method == "bilinear") { auto interp_node = std::make_shared(unique_op_type); - OpList::Global().add(interp_node); + lite::npu::OpList::Global().add(interp_node); interp_node->set_input_x(*inputs_map.at(x_var_name)); if (inputs_map_has_w) { auto out_size_var_name = op_info->Input("OutSize").front(); interp_node->set_input_w(*inputs_map.at(out_size_var_name)); - OpList::Global().add(inputs_map.at(out_size_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); } else { const float largest_multiple = 7.0f; float multiple = static_cast(x_h * x_w) / (out_h * out_w); @@ -99,9 +93,9 @@ node_map_type InterpolateConverter( auto w_const_node = std::make_shared(unique_op_type + "/w"); w_const_node->set_attr_value( - CreateTensorAndFillData(std::vector({out_h, out_w}))); + lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); interp_node->set_input_w(*w_const_node); - OpList::Global().add(w_const_node); + lite::npu::OpList::Global().add(w_const_node); } interp_node->set_attr_output_dim_mode( 2); // 0: zoom_factor, 1: shrink_factor, 2: height/width @@ -110,19 +104,19 @@ node_map_type InterpolateConverter( } else if (interp_method == "nearest") { auto interp_node = std::make_shared(unique_op_type); - OpList::Global().add(interp_node); + lite::npu::OpList::Global().add(interp_node); interp_node->set_input_image(*inputs_map.at(x_var_name)); if (inputs_map_has_w) { auto out_size_var_name = op_info->Input("OutSize").front(); interp_node->set_input_size(*inputs_map.at(out_size_var_name)); - OpList::Global().add(inputs_map.at(out_size_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); } else { auto w_const_node = std::make_shared(unique_op_type + "/w"); w_const_node->set_attr_value( - CreateTensorAndFillData(std::vector({out_h, 
out_w}))); + lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); interp_node->set_input_size(*w_const_node); - OpList::Global().add(w_const_node); + lite::npu::OpList::Global().add(w_const_node); } interp_node->set_attr_align_corners(align_corners); outputs_map[op_info->Output("Out").front()] = interp_node; diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index f22c0d611da90a230fbe070ea880a23994fbebfd..ce1662c71d62a6d73a7a3b9ce594b0dd80b6fec1 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -34,7 +28,8 @@ node_map_type MulConverter(const std::shared_ptr mul_op, LOG(INFO) << "converting mul..."; lite::Scope* scope = mul_op->scope(); const lite::OpInfo* op_info = mul_op->op_info(); - auto output_node = std::make_shared(UniqueName("mul")); + auto output_node = + std::make_shared(lite::npu::UniqueName("mul")); auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); @@ -66,8 +61,8 @@ node_map_type MulConverter(const std::shared_ptr mul_op, reshapex->set_input_tensor(*xsrc); reshapex->set_attr_shape({m, k}); reshapex->set_attr_axis(0); - OpList::Global().add(xsrc); - OpList::Global().add(reshapex); + lite::npu::OpList::Global().add(xsrc); + lite::npu::OpList::Global().add(reshapex); output_node->set_input_x(*reshapex); } else { auto constx = std::make_shared(x_var_name); @@ -79,7 +74,7 @@ node_map_type MulConverter(const std::shared_ptr mul_op, auto* pdata = reinterpret_cast(xtensor->mutable_data()); ptensor->SetData(pdata, size * sizeof(float)); constx->set_attr_value(ptensor); - OpList::Global().add(constx); + lite::npu::OpList::Global().add(constx); output_node->set_input_x(*constx); } @@ -89,8 +84,8 @@ node_map_type MulConverter(const std::shared_ptr mul_op, reshapey->set_input_tensor(*ysrc); reshapey->set_attr_shape({k, n}); reshapey->set_attr_axis(0); - OpList::Global().add(ysrc); - OpList::Global().add(reshapey); + lite::npu::OpList::Global().add(ysrc); + lite::npu::OpList::Global().add(reshapey); output_node->set_input_w(*reshapey); } else { auto consty = std::make_shared(y_var_name); @@ -102,11 +97,11 @@ node_map_type MulConverter(const std::shared_ptr mul_op, auto* pdata = reinterpret_cast(ytensor->mutable_data()); ptensor->SetData(pdata, size * sizeof(float)); consty->set_attr_value(ptensor); - OpList::Global().add(consty); + lite::npu::OpList::Global().add(consty); output_node->set_input_w(*consty); } - OpList::Global().add(output_node); + lite::npu::OpList::Global().add(output_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = output_node; diff --git a/lite/kernels/npu/bridges/pad2d_op.cc b/lite/kernels/npu/bridges/pad2d_op.cc index a8eefbbc9e688183d065c7e13961b79edfa85d77..acc3b6adf9a89ffc4d984082d7330c30d46362ba 100644 --- a/lite/kernels/npu/bridges/pad2d_op.cc +++ b/lite/kernels/npu/bridges/pad2d_op.cc @@ -12,14 +12,8 @@ // See the License for the specific 
language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,15 +26,15 @@ node_map_type Pad2dConverter(const std::shared_ptr pad2d_op, auto scope = pad2d_op->scope(); auto op_info = pad2d_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr pad2d_node = std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); pad2d_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(pad2d_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(pad2d_node); auto mode = op_info->GetAttr("mode"); if (mode == "constant") { @@ -59,17 +53,19 @@ node_map_type Pad2dConverter(const std::shared_ptr pad2d_op, padding.insert(padding.begin(), xds * 2 - 4, 0); auto npu_padding = std::make_shared(unique_op_type + "/padding"); - npu_padding->set_attr_value(CreateTensorAndFillData(padding, {xds, 2})); + npu_padding->set_attr_value( + lite::npu::CreateTensorAndFillData(padding, {xds, 2})); pad2d_node->set_input_padding(*npu_padding); - OpList::Global().add(npu_padding); + lite::npu::OpList::Global().add(npu_padding); if (mode == "constant") { auto pad_value = op_info->GetAttr("pad_value"); auto npu_pad_value = std::make_shared(unique_op_type + "/pad_value"); - npu_pad_value->set_attr_value(CreateTensorAndFillData({pad_value})); + npu_pad_value->set_attr_value( + lite::npu::CreateTensorAndFillData({pad_value})); pad2d_node->set_input_constant_values(*npu_pad_value); - OpList::Global().add(npu_pad_value); + lite::npu::OpList::Global().add(npu_pad_value); pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32 } diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index e4d6658432d8fbf6c7dc30ecc305b74b9bf81393..66cb27d7c34be707129f78ff15eaf4848f6878c0 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type PoolConverter(const std::shared_ptr pool_op, auto scope = pool_op->scope(); auto op_info = pool_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr pool_node = @@ -73,8 +67,8 @@ node_map_type PoolConverter(const std::shared_ptr pool_op, pool_node->set_attr_ceil_mode(npu_ceil_mode); // output_node->set_attr_data_mode(npu_data_mode); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(pool_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(pool_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = pool_node; diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index 081c49a30393ffa333e08676cc8a123307455180..50111222dd6e22ad13e675864fc4c8999ee474ff 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -13,14 +13,8 @@ // limitations under the License. #include "lite/operators/reshape_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, auto scope = reshape_op->scope(); auto op_info = reshape_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; // get input, output and op attributes @@ -45,10 +39,10 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, auto reshape_node = std::make_shared(unique_op_type); CHECK(inputs_map.count(x_var_name)); reshape_node->set_input_tensor(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); // read shape from actual shape tensor as input "w" if 'Shape' is found - if (HasInputArg(op_info, scope, "Shape")) { + if (lite::npu::HasInputArg(op_info, scope, "Shape")) { auto actual_shape_var_name = op_info->Input("Shape").front(); if (!inputs_map.count(actual_shape_var_name)) { auto actual_shape = @@ -67,13 +61,14 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, } auto actual_shape_const_node = std::make_shared(actual_shape_var_name); - actual_shape_const_node->set_attr_value(CreateTensorAndFillData( - std::vector(out_shape.begin(), out_shape.end()))); + actual_shape_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData( + std::vector(out_shape.begin(), out_shape.end()))); 
reshape_node->set_input_w(*actual_shape_const_node); - OpList::Global().add(actual_shape_const_node); + lite::npu::OpList::Global().add(actual_shape_const_node); } else { reshape_node->set_input_w(*inputs_map.at(actual_shape_var_name)); - OpList::Global().add(inputs_map.at(actual_shape_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(actual_shape_var_name)); } } else { auto shape = op_info->GetAttr>("shape"); @@ -87,7 +82,7 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, reshape_node->set_attr_shape( ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); } - OpList::Global().add(reshape_node); + lite::npu::OpList::Global().add(reshape_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = reshape_node; @@ -107,7 +102,7 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, xshape_node->set_input_tensor(*inputs_map.at(x_var_name)); xshape_node->set_attr_shape( ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end())); - OpList::Global().add(xshape_node); + lite::npu::OpList::Global().add(xshape_node); outputs_map[op_info->Output("XShape").front()] = xshape_node; } return outputs_map; diff --git a/lite/kernels/npu/bridges/scale_op.cc b/lite/kernels/npu/bridges/scale_op.cc index af45e6102b83b9f2e30c98d461a71488f8cd3d13..4e305b15f2f485317d5040be11cd92269d08baa8 100644 --- a/lite/kernels/npu/bridges/scale_op.cc +++ b/lite/kernels/npu/bridges/scale_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type ScaleConverter(const std::shared_ptr scale_op, auto scope = scale_op->scope(); auto op_info = scale_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; // get input, output and op attributes @@ -52,26 +46,26 @@ node_map_type ScaleConverter(const std::shared_ptr scale_op, auto scale_node = std::make_shared(unique_op_type); CHECK(inputs_map.count(x_var_name)); scale_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(scale_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(scale_node); // add filter node(fill with scale) auto filter_const_node = std::make_shared(unique_op_type + "/filter"); filter_const_node->set_attr_value( - CreateTensorAndFillData(scale, scale_bias_shape)); + lite::npu::CreateTensorAndFillData(scale, scale_bias_shape)); scale_node->set_input_filter(*filter_const_node); - OpList::Global().add(filter_const_node); + lite::npu::OpList::Global().add(filter_const_node); // add bias node(fill with bias) if (fabs(bias) > 1e-6f) { auto bias_const_node = std::make_shared(unique_op_type + "/bias"); bias_const_node->set_attr_value( - CreateTensorAndFillData(bias, scale_bias_shape)); + lite::npu::CreateTensorAndFillData(bias, scale_bias_shape)); scale_node->set_input_bias(*bias_const_node); 
scale_node->set_attr_has_bias_value(true); - OpList::Global().add(bias_const_node); + lite::npu::OpList::Global().add(bias_const_node); } scale_node->set_attr_axis(1); diff --git a/lite/kernels/npu/bridges/shuffle_channel_op.cc b/lite/kernels/npu/bridges/shuffle_channel_op.cc index 5c95fd53d53ab6d95e8e0f85edb54fa5b48bd637..d1e7bc83dd90f07fd1e0f2811a1492e9bfcc0660 100644 --- a/lite/kernels/npu/bridges/shuffle_channel_op.cc +++ b/lite/kernels/npu/bridges/shuffle_channel_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type ShuffleChannelConverter( auto scope = shuffle_channel_op->scope(); auto op_info = shuffle_channel_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr shuffle_channel_node = @@ -43,8 +37,8 @@ node_map_type ShuffleChannelConverter( shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name)); shuffle_channel_node->set_attr_group(op_info->GetAttr("group")); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(shuffle_channel_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(shuffle_channel_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = shuffle_channel_node; diff --git a/lite/kernels/npu/bridges/softmax_op.cc b/lite/kernels/npu/bridges/softmax_op.cc index 7473a8ea39bda7a38277da803f699b8bc94b2ede..24712315646d8d83349c47d415ab41cdfcadad88 100644 --- a/lite/kernels/npu/bridges/softmax_op.cc +++ b/lite/kernels/npu/bridges/softmax_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type SoftmaxConverter(const std::shared_ptr softmax_op, auto scope = softmax_op->scope(); auto op_info = softmax_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr softmax_node = @@ -51,8 +45,8 @@ node_map_type SoftmaxConverter(const std::shared_ptr softmax_op, softmax_node->set_input_x(*inputs_map.at(x_var_name)); softmax_node->set_attr_axis(axis); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(softmax_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(softmax_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = softmax_node; diff --git a/lite/kernels/npu/bridges/split_op.cc b/lite/kernels/npu/bridges/split_op.cc index 97b6c19156d0ca9736966cfc8018b4d600f7b807..0caa51c53035ef46b0f29be5a3047860c900a403 100644 --- a/lite/kernels/npu/bridges/split_op.cc +++ b/lite/kernels/npu/bridges/split_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type SplitConverter(const std::shared_ptr split_op, lite::Scope* scope = split_op->scope(); const lite::OpInfo* op_info = split_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " << op_type << " ... 
"; auto x_var_name = op_info->Input("X").front(); @@ -45,7 +39,7 @@ node_map_type SplitConverter(const std::shared_ptr split_op, std::make_shared(unique_op_type); CHECK(inputs_map.count(x_var_name)); output_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); output_node->set_attr_axis(static_cast(axis)); if (num > 0) { @@ -63,18 +57,18 @@ node_map_type SplitConverter(const std::shared_ptr split_op, for (auto out_var_name : out_var_names) { auto const_node = std::make_shared( unique_op_type + "/const_zero" + std::to_string(index)); - const_node->set_attr_value(CreateTensorAndFillData(0)); - OpList::Global().add(const_node); + const_node->set_attr_value(lite::npu::CreateTensorAndFillData(0)); + lite::npu::OpList::Global().add(const_node); auto add_node = std::make_shared(unique_op_type + "/add" + std::to_string(index)); add_node->set_input_x1(*output_node, "y" + std::to_string(index)); add_node->set_input_x2(*const_node); outputs_map[out_var_name] = add_node; - OpList::Global().add(add_node); + lite::npu::OpList::Global().add(add_node); index++; } - OpList::Global().add(output_node); + lite::npu::OpList::Global().add(output_node); return outputs_map; } diff --git a/lite/kernels/npu/bridges/test_helper.cc b/lite/kernels/npu/bridges/test_helper.cc index 594b2db47457cf3af4f2c4786d2bd94e57815c6e..b410a4190d86f2ddf020e7f223787acc0108a398 100644 --- a/lite/kernels/npu/bridges/test_helper.cc +++ b/lite/kernels/npu/bridges/test_helper.cc @@ -14,10 +14,9 @@ #include "lite/kernels/npu/bridges/test_helper.h" #include -#include "ai_ddk_lib/include/graph/op/all_ops.h" +#include "lite/backends/npu/builder.h" #include "lite/core/op_registry.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" #include "lite/operators/graph_op.h" namespace paddle { @@ -44,7 +43,7 @@ void LauchOp(const std::shared_ptr op, ge::Shape(input->dims().Vectorize()), ge::FORMAT_NCHW, ge::DT_FLOAT); auto input_node = std::make_shared(input_var_name); input_node->update_input_desc_x(input_desc); - OpList::Global().add(input_node); + lite::npu::OpList::Global().add(input_node); inputs_map[input_var_name] = input_node; } auto outputs_map = supported_lists.at(op_type)(op, inputs_map); @@ -63,7 +62,7 @@ void LauchOp(const std::shared_ptr op, auto weight = scope->Var(weight_var_name)->GetMutable(); weight->set_persistable(true); weight->set_precision(PRECISION(kInt8)); - CHECK(BuildModel(graph_inputs, graph_outputs, weight)); + CHECK(lite::npu::BuildModel(graph_inputs, graph_outputs, weight)); CHECK_GT(weight->numel(), 0); CHECK_NE(weight->data(), 0); @@ -94,7 +93,7 @@ void LauchOp(const std::shared_ptr op, graph_kernel->Launch(); // release all of resources of generated model - OpList::Global().clear(); + lite::npu::OpList::Global().clear(); } } // namespace bridges diff --git a/lite/kernels/npu/bridges/transpose_op.cc b/lite/kernels/npu/bridges/transpose_op.cc index ac243a060158b2edcf7354276d62998ee1dc6b31..5e9a69837b9e253845e6a1df35a897cfe342a84e 100644 --- a/lite/kernels/npu/bridges/transpose_op.cc +++ b/lite/kernels/npu/bridges/transpose_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type TransposeConverter( auto scope = transpose_op->scope(); auto op_info = transpose_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr transpose_node = @@ -50,8 +44,8 @@ node_map_type TransposeConverter( w_data[i] = 1.f; } auto npu_w = std::make_shared(w_var_name); - npu_w->set_attr_value(CvtFromLiteTensor(w)); - OpList::Global().add(npu_w); + npu_w->set_attr_value(lite::npu::CvtFromLiteTensor(w)); + lite::npu::OpList::Global().add(npu_w); auto axis = op_info->GetAttr>("axis"); auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end()); @@ -61,8 +55,8 @@ node_map_type TransposeConverter( transpose_node->set_input_w(*npu_w); transpose_node->set_attr_order(npu_axis); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(transpose_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(transpose_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = transpose_node; diff --git a/lite/kernels/npu/bridges/utils.h b/lite/kernels/npu/bridges/utils.h deleted file mode 100644 index 382879f649fd086221eef2c62ee48bcf8da48c9d..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/bridges/utils.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/core/op_lite.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -class OpList { - public: - static OpList& Global() { - static thread_local OpList x; - return x; - } - void clear() { lists_.clear(); } - void add(std::shared_ptr p) { lists_.push_back(p); } - - private: - std::vector> lists_; -}; - -// Build HIAI IR graph to om model, and store om model data into lite tensor -bool BuildModel(std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - lite::Tensor* model_data); - -std::string UniqueName(const std::string& prefix); - -ge::DataType PrecisionConverter(PrecisionType itype); - -ge::Format DataLayoutConverter(DataLayoutType itype); - -ge::TensorPtr CvtFromLiteTensor(Tensor* in_tensor, - std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -template -ge::TensorPtr CreateTensorAndFillData(std::vector data, - std::vector shape = {}, - ge::Format format = ge::FORMAT_NCHW) { - const std::type_info& info = typeid(T); - ge::DataType type = ge::DT_FLOAT; - if (info == typeid(float)) { - type = ge::DT_FLOAT; - } else if (info == typeid(int8_t)) { - type = ge::DT_INT8; - } else if (info == typeid(int32_t)) { - type = ge::DT_INT32; - } else { - LOG(FATAL) << "Unknow value type " << info.name(); - } - if (shape.empty()) { - shape = {static_cast(data.size())}; - } else { - int size = 1; - for (auto i : shape) { - size *= i; - } - CHECK_EQ(data.size(), size); - } - ge::TensorDesc desc(ge::Shape(shape), format, type); - ge::TensorPtr tensor = std::make_shared(); - tensor->SetTensorDesc(desc); - tensor->SetData(reinterpret_cast(data.data()), - data.size() * sizeof(T)); - return tensor; -} - -template -ge::TensorPtr CreateTensorAndFillData(T value, - std::vector shape = {1}, - ge::Format format = ge::FORMAT_NCHW) { - int64_t size = 1; - for (auto i : shape) { - size *= i; - } - std::vector data(size, value); - return CreateTensorAndFillData(data, shape, format); -} - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname); - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/npu/graph_compute.cc b/lite/kernels/npu/graph_compute.cc index 41a36238cc787625e7d6191d98800ba88cbef508..f2b42c658d11edfed65eea2af48a3c0202ba3114 100644 --- a/lite/kernels/npu/graph_compute.cc +++ b/lite/kernels/npu/graph_compute.cc @@ -49,8 +49,8 @@ void GraphCompute::PrepareForRun() { VLOG(3) << "npu_idims[" << i << "]: " << npu_idims_[i].GetNumber() << "," << npu_idims_[i].GetChannel() << "," << npu_idims_[i].GetHeight() << "," << npu_idims_[i].GetWidth(); - VLOG(3) << "lite_idims[" << i << "]: " << param.inputs[i]->dims(); - CHECK_EQ(param.inputs[i]->dims().production(), + VLOG(3) << "lite_idims[" << i << "]: " << param.inputs[i].second->dims(); + CHECK_EQ(param.inputs[i].second->dims().production(), npu_idims_[i].GetNumber() * npu_idims_[i].GetChannel() * npu_idims_[i].GetHeight() * npu_idims_[i].GetWidth()); npu_itensors_[i].reset(new hiai::AiTensor); @@ -61,16 +61,16 @@ void GraphCompute::PrepareForRun() { VLOG(3) << "npu_odims[" << i << "]: " << npu_odims_[i].GetNumber() << "," << npu_odims_[i].GetChannel() << "," << npu_odims_[i].GetHeight() << "," << npu_odims_[i].GetWidth(); 
- VLOG(3) << "lite_odims[" << i << "]: " << param.outputs[i]->dims(); + VLOG(3) << "lite_odims[" << i << "]: " << param.outputs[i].second->dims(); auto out_size = npu_odims_[i].GetNumber() * npu_odims_[i].GetChannel() * npu_odims_[i].GetHeight() * npu_odims_[i].GetWidth(); - if (param.outputs[i]->dims().production() != out_size) { - param.outputs[i]->Resize({npu_odims_[i].GetNumber(), - npu_odims_[i].GetChannel(), - npu_odims_[i].GetHeight(), - npu_odims_[i].GetWidth()}); + if (param.outputs[i].second->dims().production() != out_size) { + param.outputs[i].second->Resize({npu_odims_[i].GetNumber(), + npu_odims_[i].GetChannel(), + npu_odims_[i].GetHeight(), + npu_odims_[i].GetWidth()}); } - LOG(INFO) << param.outputs[i]->dims(); + LOG(INFO) << param.outputs[i].second->dims(); npu_otensors_[i].reset(new hiai::AiTensor); npu_otensors_[i]->Init(&(npu_odims_[i])); } @@ -80,7 +80,7 @@ bool GraphCompute::input_dims_changed() const { auto& param = this->Param(); CHECK_EQ(param.inputs.size(), npu_idims_.size()); for (size_t i = 0; i < param.inputs.size(); ++i) { - auto param_idims = param.inputs[i]->dims(); + auto param_idims = param.inputs[i].second->dims(); CHECK(!param_idims.empty()); CHECK_EQ(param_idims.size(), 4); std::vector idims{static_cast(npu_idims_[i].GetNumber()), @@ -105,7 +105,7 @@ void GraphCompute::Run() { CHECK_EQ(param.outputs.size(), npu_otensors_.size()); for (size_t i = 0; i < param.inputs.size(); ++i) { - auto* itensor = param.inputs[i]; + auto* itensor = param.inputs[i].second; CHECK(itensor); const auto* i_data = itensor->data(); std::memcpy( @@ -126,10 +126,10 @@ void GraphCompute::Run() { CHECK_EQ(hiai::AI_SUCCESS, model_client_->Process( model_context_, npu_itensors_, npu_otensors_, 1000, istamp)); - LOG(INFO) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; + VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; for (size_t i = 0; i < param.outputs.size(); ++i) { - auto* otensor = param.outputs[i]; + auto* otensor = param.outputs[i].second; CHECK(otensor); auto* o_data = otensor->mutable_data(); auto* npu_obuffer = static_cast(npu_otensors_[i]->GetBuffer()); diff --git a/lite/kernels/x86/CMakeLists.txt b/lite/kernels/x86/CMakeLists.txt index 60219e3b18665280ece5c0b77723bc311cb8eebd..6d47c880c8daf1ec8981dfb4083324b79c25cec1 100644 --- a/lite/kernels/x86/CMakeLists.txt +++ b/lite/kernels/x86/CMakeLists.txt @@ -1,4 +1,4 @@ -add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops) +add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops math_function) # lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps}) @@ -55,6 +55,8 @@ lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS ba lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86) lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86) lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS activation_compute_x86) +lite_cc_test(test_tanh_compute_x86 SRCS tanh_compute_test.cc DEPS activation_compute_x86) +lite_cc_test(test_gelu_compute_x86 SRCS gelu_compute_test.cc DEPS activation_compute_x86) lite_cc_test(test_sequence_expand_as_compute_x86 SRCS 
sequence_expand_as_compute_test.cc DEPS sequence_expand_as_compute_x86) lite_cc_test(test_gru_compute_x86 SRCS gru_compute_test.cc DEPS gru_compute_x86) lite_cc_test(test_matmul_compute_x86 SRCS matmul_compute_test.cc DEPS matmul_compute_x86) diff --git a/lite/kernels/x86/activation_compute.cc b/lite/kernels/x86/activation_compute.cc index 0ed09c43a5df9a087e5a21c6c9566b7b785a5afa..b4a053419c5c6f04b4b053d7bf902a57e9562518 100644 --- a/lite/kernels/x86/activation_compute.cc +++ b/lite/kernels/x86/activation_compute.cc @@ -35,3 +35,25 @@ REGISTER_LITE_KERNEL(relu, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); + +// float +REGISTER_LITE_KERNEL(tanh, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::TanhCompute<float>, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +// float +REGISTER_LITE_KERNEL(gelu, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::GeluCompute<float>, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/activation_compute.h b/lite/kernels/x86/activation_compute.h index 27752401949a30234a36854260793a33b4487eba..482684b0672c1ed7f0d571f852e134e92ddcaafa 100644 --- a/lite/kernels/x86/activation_compute.h +++ b/lite/kernels/x86/activation_compute.h @@ -13,8 +13,10 @@ // limitations under the License. #pragma once +#include #include #include +#include "lite/backends/x86/math/blas.h" #include "lite/core/kernel.h" #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -115,6 +117,76 @@ class ReluCompute : public KernelLite { virtual ~ReluCompute() = default; }; +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template <typename T> +struct TanhFunctor : public BaseActivationFunctor<T> { + template <typename Device, typename X, typename Out> + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template <typename T> +class TanhCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data<T>(); + Activate<TanhFunctor<T>>(param.X, param.Out); + } + + virtual ~TanhCompute() = default; +}; + +// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) +template <typename T> +struct GeluFunctor : public BaseActivationFunctor<T> { + template <typename Device, typename X, typename Out> + void operator()(Device d, X x, Out out) const { +// Because the execution or device context cannot be delivered here, it keeps +// the macro for NVCC.
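+// The MKLML branch below computes gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
+// in place: AXPY writes x * M_SQRT1_2 into the zeroed output buffer, VMERF
+// applies erf element-wise, the first scalar loop adds 1, VMUL multiplies by
+// x, and the final loop scales by 0.5. The Eigen branch in the #else computes
+// the same expression directly.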
+#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) + auto x_data = x.data(); + auto out_data = out.data(); + int n = std::min(x.size(), out.size()); + + std::memset(out_data, 0, n * sizeof(T)); + paddle::lite::x86::math::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, out_data, 1); + paddle::lite::x86::math::CBlas::VMERF(n, out_data, out_data, VML_LA); + for (int i = 0; i < n; i++) { + out_data[i] += static_cast(1); + } + paddle::lite::x86::math::CBlas::VMUL(n, x_data, out_data, out_data); + for (int i = 0; i < n; i++) { + out_data[i] *= static_cast(0.5); + } +#else + auto temp = (x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); +#endif + } +}; + +template +class GeluCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data(); + Activate>(param.X, param.Out); + } + + virtual ~GeluCompute() = default; +}; + } // namespace x86 } // namespace kernels } // namespace lite diff --git a/lite/kernels/x86/gelu_compute_test.cc b/lite/kernels/x86/gelu_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..20479760e916613f14745d8b7316e094950f6a46 --- /dev/null +++ b/lite/kernels/x86/gelu_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/x86/activation_compute.cc" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(gelu_x86, retrive_op) { + auto gelu = + KernelRegistry::Global().Create("gelu"); + ASSERT_FALSE(gelu.empty()); + ASSERT_TRUE(gelu.front()); +} + +TEST(gelu_x86, init) { + GeluCompute gelu; + ASSERT_EQ(gelu.precision(), PRECISION(kFloat)); + ASSERT_EQ(gelu.target(), TARGET(kX86)); +} + +TEST(gelu_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + int sign = i % 2 == 0 ? 
1 : -1; + x_data[i] = static_cast(i * sign) * 0.8f; + } + // GeluCompute gelu; + GeluCompute gelu; + operators::ActivationParam param; + + param.X = &x; + param.Out = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gelu.SetContext(std::move(ctx)); + gelu.SetParam(param); + gelu.Run(); + + LOG(INFO) << "output: "; + std::vector ref_data{0., + -0.169484, + 1.512321, + -0.019674, + 3.197801, + -0.000126719, + 4.8, + -0., + 6.4000001, + -0., + 8., + -0.}; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(gelu, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/tanh_compute_test.cc b/lite/kernels/x86/tanh_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa65ca02df27642fc0114a075ad8a4249f3b70de --- /dev/null +++ b/lite/kernels/x86/tanh_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/x86/activation_compute.cc" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(tanh_x86, retrive_op) { + auto tanh = + KernelRegistry::Global().Create("tanh"); + ASSERT_FALSE(tanh.empty()); + ASSERT_TRUE(tanh.front()); +} + +TEST(tanh_x86, init) { + TanhCompute tanh; + ASSERT_EQ(tanh.precision(), PRECISION(kFloat)); + ASSERT_EQ(tanh.target(), TARGET(kX86)); +} + +TEST(tanh_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + int sign = i % 2 == 0 ? 
1 : -1; + x_data[i] = static_cast(i * sign) * 0.08f; + } + // TanhCompute tanh; + TanhCompute tanh; + operators::ActivationParam param; + + param.X = &x; + param.Out = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + tanh.SetContext(std::move(ctx)); + tanh.SetParam(param); + tanh.Run(); + + LOG(INFO) << "output: "; + std::vector ref_data{0., + -0.079829, + 0.158648, + -0.235495, + 0.309506, + -0.379949, + 0.446243, + -0.507977, + 0.564899, + -0.616909, + 0.664036, + -0.706419}; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(tanh, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..72c48ceab079bc65e4f2363a1702de52586733d6 --- /dev/null +++ b/lite/kernels/xpu/CMakeLists.txt @@ -0,0 +1,9 @@ + +if(NOT LITE_WITH_XPU) + return () +endif() + +add_kernel(graph_compute_xpu XPU basic SRCS graph_compute.cc DEPS ${lite_kernel_deps} xpu_runtime) +# lite_cc_test(test_graph_compute_xpu SRCS graph_compute_test.cc DEPS graph_compute_xpu) + +add_subdirectory(bridges) diff --git a/lite/kernels/xpu/bridges/CMakeLists.txt b/lite/kernels/xpu/bridges/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1f7b67be3b0b1798ea50daa6638873500786912 --- /dev/null +++ b/lite/kernels/xpu/bridges/CMakeLists.txt @@ -0,0 +1,29 @@ +lite_cc_library(xpu_bridge_registry SRCS registry.cc) + +set(xpu_bridge_deps xpu_bridge_registry xpu_builder op) + +lite_cc_library(xpu_bridge_act_op SRCS act_op.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_conv_op SRCS conv_op.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_elementwise_ops SRCS elementwise_ops.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_pool_op SRCS pool_op.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_softmax_op SRCS softmax_op.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_mul_op SRCS mul_op.cc DEPS ${xpu_bridge_deps}) + +set(xpu_bridges + xpu_bridge_registry + xpu_bridge_act_op + xpu_bridge_conv_op + xpu_bridge_elementwise_ops + xpu_bridge_pool_op + xpu_bridge_softmax_op + xpu_bridge_mul_op + CACHE INTERNAL "xpu_bridges") + +set(xpu_bridge_test_deps ${xpu_bridges} ${xpu_kernels} ${ops}) + +lite_cc_test(test_xpu_bridge_act_op SRCS act_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_conv_op SRCS conv_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_elementwise_ops SRCS elementwise_ops_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_pool_op SRCS pool_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_softmax_op SRCS softmax_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_mul_op SRCS mul_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) diff --git a/lite/kernels/xpu/bridges/act_op.cc b/lite/kernels/xpu/bridges/act_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d8e11caa96fdbff3a853a192a8d16f2eccd96337 --- /dev/null +++ b/lite/kernels/xpu/bridges/act_op.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type ActConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + CHECK(input_nodes.count(x_var_name)); + std::shared_ptr act_node = nullptr; + if (op_type == "relu") { + act_node = std::make_shared( + graph_ctx->builder->CreateRelu(*input_nodes.at(x_var_name))); + } else { + // TODO(hong19860320) supports more activation ops + LOG(FATAL) << "[XPU] Unsupported activation type " << op_type; + } + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = act_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(relu, paddle::lite::kernels::xpu::bridges::ActConverter); diff --git a/lite/kernels/xpu/bridges/act_op_test.cc b/lite/kernels/xpu/bridges/act_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a3efab46e3c7caee08bf646a560a0ab9abcf5c7 --- /dev/null +++ b/lite/kernels/xpu/bridges/act_op_test.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void relu_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + DDim out_dims = out->dims(); + CHECK_EQ(x_dims.production(), out_dims.production()); + for (int i = 0; i < out_dims.production(); i++) { + out_data[i] = std::max(0.f, x_data[i]); + } +} + +void test_relu(int bs, int ic, int ih, int iw) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("relu"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to XPU model, and run it on XPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + // execute reference implementation and save to output tensor + relu_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(NPUBridges, relu) { + for (auto bs : {1, 3}) { + for (auto ic : {3, 4}) { + for (auto ih : {2, 5}) { + for (auto iw : {5, 9}) { + VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih + << " iw: " << iw; + test_relu(bs, ic, ih, iw); + } + } + } + } +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(relu); +USE_XPU_BRIDGE(relu); diff --git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c7c2f0ca5f303555eaa74ea04dad27c9de70d89a --- /dev/null +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
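+// ConvConverter below maps conv2d/depthwise_conv2d onto the XPU graph builder:
+// the filter (and optional bias) tensors are registered as constant params,
+// a Conv2D node is built from the op's strides/paddings/dilations/groups,
+// a per-channel bias is applied via CreateBiasAdd while an element-wise bias
+// falls back to a binary "add", and a Relu node is appended when fuse_relu is
+// set.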
+ +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type ConvConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " << op_type << "... "; + + // get input, filter and op attributes + auto input_var_name = op_info->Input("Input").front(); + auto input = scope->FindVar(input_var_name)->GetMutable(); + auto input_dims = input->dims(); + auto filter_var_name = op_info->Input("Filter").front(); + auto filter = scope->FindVar(filter_var_name)->GetMutable(); + auto filter_dims = filter->dims(); + auto bs = input_dims[0]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4); + CHECK_EQ(filter_dims.size(), 4); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + auto dilations = op_info->GetAttr>("dilations"); + auto fuse_relu = op_info->GetAttr("fuse_relu"); + CHECK_EQ(strides.size(), 2); + CHECK_EQ(paddings.size(), 2); + CHECK_EQ(dilations.size(), 2); + std::vector output_shape({bs, oc}); + for (size_t i = 0; i < 2; i++) { + const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; + output_shape.push_back( + (input_dims[i + 2] + 2 * paddings[i] - dkernel) / strides[i] + 1); + } + DDim output_dims(output_shape); + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // create filter node + CHECK(!input_nodes.count(filter_var_name)); + auto filter_const_node = std::make_shared( + graph_ctx->builder->CreateTensor(filter_var_name, + lite::xpu::CvtShape(filter_dims), + ::xtcl::Float(32))); + auto filter_const_tensor = lite::xpu::CvtTensor(filter); + graph_ctx->params->emplace( + std::make_pair(filter_var_name, *filter_const_tensor)); + + // create conv node and set input, filter, bias nodes and attributes + auto conv_attrs = xtcl::make_node(); + conv_attrs->strides = std::move(lite::xpu::CvtShape(strides)); + conv_attrs->padding = std::move(lite::xpu::CvtShape(paddings)); + conv_attrs->dilation = std::move(lite::xpu::CvtShape(dilations)); + conv_attrs->groups = groups; + // conv_attrs->channels = nullptr; + conv_attrs->kernel_size = std::move(xtcl::Array(nullptr)); + conv_attrs->data_layout = "NCHW"; + conv_attrs->kernel_layout = "OIHW"; + conv_attrs->out_layout = ""; + // conv_attrs->out_dtype = ""; + CHECK(input_nodes.count(input_var_name)); + auto conv_node = + std::make_shared(graph_ctx->builder->CreateConv2D( + *input_nodes.at(input_var_name), *filter_const_node, conv_attrs)); + graph_ctx->builder->SetLayer(unique_op_type); + + // create bias node if has bias + // supports the bias nodes with the following dimensions + // 0: {oc} + // 1: {1, oc, oh, ow} + // 2: {n, oc, oh, ow} + if (lite::xpu::HasInputArg(op_info, scope, "Bias")) { + auto bias_var_name = op_info->Input("Bias").front(); + auto* bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + std::vector bias_shape; + bool is_channel_bias = false; + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {oc}; + is_channel_bias = true; + } else if 
(bias_data_size == output_data_size / bs) { + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(ERROR) << "bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + } + std::shared_ptr bias_node = nullptr; + if (input_nodes.count(bias_var_name)) { + // bias node from input node + bias_node = input_nodes.at(bias_var_name); + } else { + // bias node with const tensor + auto bias_const_node = std::make_shared( + graph_ctx->builder->CreateTensor(bias_var_name, + lite::xpu::CvtShape(bias_shape), + ::xtcl::Float(32))); + auto bias_const_tensor = lite::xpu::CvtTensor(bias, bias_shape); + graph_ctx->params->emplace( + std::make_pair(bias_var_name, *bias_const_tensor)); + bias_node = bias_const_node; + } + std::shared_ptr add_node = nullptr; + if (is_channel_bias) { + add_node = std::make_shared( + graph_ctx->builder->CreateBiasAdd(*conv_node, *bias_node, 1)); + } else { + add_node = std::make_shared( + graph_ctx->builder->CreateBinaryOp("add", *conv_node, *bias_node)); + } + graph_ctx->builder->SetLayer(unique_op_type + "/add"); + conv_node = add_node; + } + + // output converted nodes + node_map_type output_nodes; + if (fuse_relu) { + // append relu node if fuse_relu is true + auto relu_node = std::make_shared( + graph_ctx->builder->CreateRelu(*conv_node)); + graph_ctx->builder->SetLayer(unique_op_type + "/relu"); + output_nodes[op_info->Output("Output").front()] = relu_node; + } else { + output_nodes[op_info->Output("Output").front()] = conv_node; + } + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(conv2d, paddle::lite::kernels::xpu::bridges::ConvConverter); +REGISTER_XPU_BRIDGE(depthwise_conv2d, + paddle::lite::kernels::xpu::bridges::ConvConverter); diff --git a/lite/kernels/xpu/bridges/conv_op_test.cc b/lite/kernels/xpu/bridges/conv_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ebdb67bd0d2801a9036696f52790f7104279b0cb --- /dev/null +++ b/lite/kernels/xpu/bridges/conv_op_test.cc @@ -0,0 +1,281 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/conv_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void conv_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto input = + scope->FindVar(op_info->Input("Input").front())->GetMutable(); + auto filter = + scope->FindVar(op_info->Input("Filter").front())->GetMutable(); + auto output = + scope->FindVar(op_info->Output("Output").front())->GetMutable(); + std::vector strides = + op_info->GetAttr>("strides"); + std::vector paddings = + op_info->GetAttr>("paddings"); + int32_t groups = op_info->GetAttr("groups"); + std::vector dilations = + op_info->GetAttr>("dilations"); + bool fuse_relu = op_info->GetAttr("fuse_relu"); + auto input_dims = input->dims(); + auto filter_dims = filter->dims(); + auto output_dims = output->dims(); + auto input_data = input->mutable_data(); + auto filter_data = filter->mutable_data(); + auto output_data = output->mutable_data(); + int kernel_w = filter_dims[3]; + int kernel_h = filter_dims[2]; + int stride_w = strides[1]; + int stride_h = strides[0]; + int dila_w = dilations[1]; + int dila_h = dilations[0]; + int pad_w = paddings[1]; + int pad_h = paddings[0]; + int batch_size = input_dims[0]; + int in_ch_size = input_dims[1]; + int in_h = input_dims[2]; + int in_w = input_dims[3]; + int out_ch_size = output_dims[1]; + int out_h = output_dims[2]; + int out_w = output_dims[3]; + int out_c_group = out_ch_size / groups; + int in_c_group = in_ch_size / groups; + Tensor* bias = nullptr; + float* bias_data = nullptr; + bool is_channel_bias = false; + if (op_info->HasInput("Bias")) { + auto bias_var_names = op_info->Input("Bias"); + if (bias_var_names.size() > 0) { + auto bias_var_name = bias_var_names.front(); + bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_dims = bias->dims(); + is_channel_bias = bias_dims.production() == out_ch_size; + bias_data = bias->mutable_data(); + } + } + for (int n = 0; n < batch_size; ++n) { + for (int g = 0; g < groups; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * groups * out_c_group * out_h * out_w + + g * out_c_group * out_h * out_w + oc * out_h * out_w + + oh * out_w + ow; + float out_value = + bias_data != nullptr + ? (is_channel_bias ? bias_data[g * out_c_group + oc] + : bias_data[out_idx]) + : 0; + // + out_value *= beta; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dila_w); + int ih = oh * stride_h - pad_h + kh * (dila_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + int in_idx = n * in_ch_size * in_h * in_w + + g * in_c_group * in_h * in_w + ic * in_h * in_w + + ih * in_w + iw; + int filter_idx = + g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + kh * kernel_w + kw; + out_value += input_data[in_idx] * filter_data[filter_idx]; + } + } + } + if (fuse_relu) { + out_value = out_value > 0 ? 
out_value : 0; + } + output_data[out_idx] = out_value; + } + } + } + } + } +} + +void test_conv(int bs, + int ic, + int oc, + int ih, + int iw, + bool has_bias, + bool is_channel_bias, + bool fuse_relu, + bool depthwise, + int dilation, + int stride, + int padding, + int kernel) { + // prepare input&output variables + Scope scope; + std::string input_var_name("input"); + std::string filter_var_name("filter"); + std::string bias_var_name("bias"); + std::string output_var_name("output"); + std::string output_ref_var_name("output_ref"); + auto* input = scope.Var(input_var_name)->GetMutable(); + auto* filter = scope.Var(filter_var_name)->GetMutable(); + auto* bias = scope.Var(bias_var_name)->GetMutable(); + auto* output = scope.Var(output_var_name)->GetMutable(); + auto* output_ref = scope.Var(output_ref_var_name)->GetMutable(); + + // get group size and input&filter shape + int groups = 1; + if (depthwise) { // depthwise convolution ? + groups = oc = ic; + } + std::vector input_shape = {bs, ic, ih, iw}; + std::vector filter_shape = {oc, ic / groups, kernel, kernel}; + std::vector output_shape({bs, oc}); + for (size_t i = 0; i < 2; i++) { + const int dkernel = dilation * (kernel - 1) + 1; + int output_size = (input_shape[i + 2] + 2 * padding - dkernel) / stride + 1; + output_shape.push_back(output_size); + } + input->Resize(input_shape); + filter->Resize(filter_shape); + + // initialize input&output data + FillTensor(input); + FillTensor(filter); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType(depthwise ? "depthwise_conv2d" : "conv2d"); + opdesc.SetInput("Input", {input_var_name}); + opdesc.SetInput("Filter", {filter_var_name}); + opdesc.SetOutput("Output", {output_var_name}); + opdesc.SetAttr("dilations", std::vector({dilation, dilation})); + opdesc.SetAttr("strides", std::vector({stride, stride})); + opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("groups", groups); + opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); + if (has_bias) { + if (is_channel_bias) { + bias->Resize({1, oc, 1, 1}); + } else { + bias->Resize({1, output_shape[1], output_shape[2], output_shape[3]}); + } + FillTensor(bias); + opdesc.SetInput("Bias", {bias_var_name}); + } + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {input_var_name}, {output_var_name}); + output_ref->CopyDataFrom(*output); + + // execute reference implementation and save to output tensor('out') + conv_ref(op); + + // compare results + auto* output_data = output->mutable_data(); + auto* output_ref_data = output_ref->mutable_data(); + for (int i = 0; i < output->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } +} + +TEST(NPUBridges, conv) { +#if 0 + for (auto bs : {1, 2}) { + for (auto ic : {3, 6}) { + for (auto oc : {6, 9}) { + for (auto ih : {14, 28}) { + for (auto iw : {14, 28}) { + for (auto has_bias : {false, true}) { + for (auto is_channel_bias : {false, true}) { + for (auto fuse_relu : {false, true}) { + for (auto depthwise : {false, true}) { + for (auto dilation : {1, 2}) { + for (auto stride : {1, 2}) { + for (auto kernel : {1, 3, 5}) { + std::vector paddings = {kernel / 2}; + if (kernel / 2 != 0) { + paddings.push_back(0); + } + for (auto padding : paddings) { + VLOG(3) << "bs: " << bs << " ic: " << ic + << " oc: " << oc << " ih: " << ih + << " iw: " << iw + << " has_bias: " << has_bias + << " is_channel_bias: " << is_channel_bias + << " fuse_relu: " << fuse_relu + << " 
depthwise: " << depthwise + << " dilation: " << dilation + << " stride: " << stride + << " padding: " << padding + << " kernel: " << kernel; + test_conv(bs, + ic, + oc, + ih, + iw, + has_bias, + is_channel_bias, + fuse_relu, + depthwise, + dilation, + stride, + padding, + kernel); + } + } + } + } + } + } + } + } + } + } + } + } + } +#else + test_conv(1, 1, 1, 4, 4, false, false, false, false, 1, 1, 1, 3); + test_conv(1, 1, 1, 4, 4, true, true, false, false, 1, 1, 1, 3); + test_conv(1, 1, 1, 4, 4, true, false, false, false, 1, 1, 1, 3); +#endif +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(conv2d); +USE_XPU_BRIDGE(conv2d); + +USE_LITE_OP(depthwise_conv2d); +USE_XPU_BRIDGE(depthwise_conv2d); diff --git a/lite/kernels/xpu/bridges/elementwise_ops.cc b/lite/kernels/xpu/bridges/elementwise_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..70906b5ec9fb155efe3edcb885926a25936f41be --- /dev/null +++ b/lite/kernels/xpu/bridges/elementwise_ops.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type ElementwiseConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // get input, and attributes + auto x_var_name = op_info->Input("X").front(); + auto y_var_name = op_info->Input("Y").front(); + auto axis = op_info->GetAttr("axis"); + auto x_tensor = scope->FindMutableTensor(x_var_name); + auto y_tensor = scope->FindMutableTensor(y_var_name); + auto x_dims = x_tensor->dims(); + auto y_dims = y_tensor->dims(); + + // create x and y node + std::shared_ptr x_node = nullptr; + if (input_nodes.count(x_var_name)) { + x_node = input_nodes.at(x_var_name); + } else { + x_node = std::make_shared(graph_ctx->builder->CreateTensor( + x_var_name, lite::xpu::CvtShape(x_dims), ::xtcl::Float(32))); + auto x_const_tensor = lite::xpu::CvtTensor(x_tensor); + graph_ctx->params->emplace(std::make_pair(x_var_name, *x_const_tensor)); + } + + std::shared_ptr y_node = nullptr; + if (input_nodes.count(y_var_name)) { + y_node = input_nodes.at(y_var_name); + } else { + y_node = std::make_shared(graph_ctx->builder->CreateTensor( + y_var_name, lite::xpu::CvtShape(y_dims), ::xtcl::Float(32))); + auto y_const_tensor = lite::xpu::CvtTensor(y_tensor); + graph_ctx->params->emplace(std::make_pair(y_var_name, *y_const_tensor)); + } + + // create elementwise node and 
set input, attributes + std::shared_ptr elementwise_node = nullptr; + if (y_dims.size() == 1) { + elementwise_node = std::make_shared( + graph_ctx->builder->CreateBiasAdd(*x_node, *y_node, axis)); + } else if (x_dims.size() == y_dims.size()) { + elementwise_node = std::make_shared( + graph_ctx->builder->CreateBinaryOp("add", *x_node, *y_node)); + } else { + LOG(ERROR) << "XPU elementwise_add only supports y of one dimension, or x " + "and y of the same dimension. But received x's dimension: " + << x_dims << ", y's dimension: " << y_dims << ", axis: " << axis; + } + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = elementwise_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(elementwise_add, + paddle::lite::kernels::xpu::bridges::ElementwiseConverter); diff --git a/lite/kernels/xpu/bridges/elementwise_ops_test.cc b/lite/kernels/xpu/bridges/elementwise_ops_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2abda822e3ae380ad376e92db99b5ad204a2a2a4 --- /dev/null +++ b/lite/kernels/xpu/bridges/elementwise_ops_test.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/elementwise_ops.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +template <typename dtype> +void elementwise_add_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + + auto x_data = x->data(); + auto y_data = y->data(); + dtype* out_data = out->mutable_data(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + int axis = op_info->GetAttr("axis"); + + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + // do elementwise add/sub/max...
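+  // Note: elt_type is fixed to "add" just below, so only the add branch runs
+  // in this test; the sub/mul/max branches are kept as reference for other
+  // elementwise ops. y is broadcast over the `channels` extent computed from
+  // `axis` above.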
+ std::string elt_type = "add"; + if (elt_type == "add") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "sub") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr - diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "mul") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr * diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "max") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = std::max(*din_ptr, diny_data); + dout_ptr++; + din_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + } +} + +void test_elementwise_add(std::vector x_dims, + std::vector y_dims, + int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string y_var_name = "y"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* y = scope.Var(y_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(x_dims); + if (y_dims.size() == 0) { + y->Resize(x_dims); + } else { + y->Resize(y_dims); + } + + // initialize input&output data + FillTensor(x); + FillTensor(y); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("elementwise_add"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetInput("Y", {y_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + // create and convert op to XPU model, then run it on XPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name, y_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + // execute reference implementation and save to output tensor + elementwise_add_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +// xpu's bias_add only support y with one dimension +TEST(XPUBridges, elementwise_add) { + test_elementwise_add({1, 2, 3, 4}, {1}, 0); + test_elementwise_add({1, 2, 3, 4}, {2}, 1); + test_elementwise_add({2, 2, 3, 4}, {3}, 2); + test_elementwise_add({2, 2, 3, 4}, {4}, 3); + test_elementwise_add({2, 2, 3, 4}, {4}, -1); + test_elementwise_add({2, 2, 3, 4}, {}, -1); +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(elementwise_add); 
+USE_XPU_BRIDGE(elementwise_add); diff --git a/lite/kernels/xpu/bridges/mul_op.cc b/lite/kernels/xpu/bridges/mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..edf44f78bbfb54cf4316d3b9d7d9be2a121669d7 --- /dev/null +++ b/lite/kernels/xpu/bridges/mul_op.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type MulConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // get input and attributes + auto x_var_name = op_info->Input("X").front(); + auto y_var_name = op_info->Input("Y").front(); + auto y_tensor = scope->FindMutableTensor(y_var_name); + auto y_dims = y_tensor->dims(); + CHECK_EQ(y_dims.size(), 2) << "xpu now only supports y_dims.size() == 2"; + + auto x_num_col_dims = op_info->GetAttr<int>("x_num_col_dims"); + CHECK_EQ(x_num_col_dims, 1) << "xpu now only supports x_num_col_dims == 1"; + auto y_num_col_dims = op_info->GetAttr<int>("y_num_col_dims"); + CHECK_EQ(y_num_col_dims, 1) << "xpu now only supports y_num_col_dims == 1"; + + // create x node + std::shared_ptr x_node = nullptr; + x_node = std::make_shared( + graph_ctx->builder->CreateBatchFlatten(*input_nodes.at(x_var_name))); + graph_ctx->builder->SetLayer(unique_op_type + "/X"); + + // transpose y + DDimLite y_dims_t(std::vector<int64_t>{1, 1}); + y_dims_t[0] = y_dims[1]; + y_dims_t[1] = y_dims[0]; + auto y_var_name_t = unique_op_type + "/Y"; + Tensor* y_tensor_t = new Tensor(); + y_tensor_t->Resize(y_dims_t); + auto y_data_t = y_tensor_t->mutable_data<float>(); + auto y_data = y_tensor->mutable_data<float>(); + for (int i = 0; i < y_dims_t[0]; i++) { + for (int j = 0; j < y_dims_t[1]; j++) { + y_data_t[i * y_dims_t[1] + j] = y_data[j * y_dims_t[0] + i]; + } + } + + // create y node + std::shared_ptr y_const_node = nullptr; + y_const_node = std::make_shared(graph_ctx->builder->CreateTensor( + y_var_name_t, lite::xpu::CvtShape(y_dims_t), ::xtcl::Float(32))); + auto y_const_tensor = lite::xpu::CvtTensor(y_tensor_t); + graph_ctx->params->emplace(std::make_pair(y_var_name_t, *y_const_tensor)); + delete y_tensor_t; + + // create mul node and set params from op + std::shared_ptr mul_node = nullptr; + mul_node = std::make_shared(graph_ctx->builder->CreateDense( + *x_node, *y_const_node, static_cast(y_dims[1]))); + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = 
mul_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(mul, paddle::lite::kernels::xpu::bridges::MulConverter); diff --git a/lite/kernels/xpu/bridges/mul_op_test.cc b/lite/kernels/xpu/bridges/mul_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cd439b68cb7286a919a8fce97371443f53ed40db --- /dev/null +++ b/lite/kernels/xpu/bridges/mul_op_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/mul_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void mul_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int32_t x_num_col_dims = op_info->GetAttr("x_num_col_dims"); + int32_t y_num_col_dims = op_info->GetAttr("y_num_col_dims"); + auto x_data = x->mutable_data(); + auto y_data = y->mutable_data(); + auto out_data = out->mutable_data(); + auto x_mat_dims = x->dims().Flatten2D(x_num_col_dims); + auto y_mat_dims = y->dims().Flatten2D(y_num_col_dims); + CHECK_EQ(x_mat_dims[1], y_mat_dims[0]); + const int M = x_mat_dims[0]; + const int K = x_mat_dims[1]; + const int N = y_mat_dims[1]; + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + out_data[m * N + n] = 0; + for (int k = 0; k < K; ++k) { + out_data[m * N + n] += x_data[m * K + k] * y_data[k * N + n]; + } + } + } +} + +void test_mul(const std::vector& x_shape, + const std::vector& y_shape, + int x_num_col_dims, + int y_num_col_dims) { + Scope scope; + std::string x_var_name("X"); + std::string y_var_name("Y"); + std::string out_var_name("Out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* y = scope.Var(y_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(x_shape); + y->Resize(y_shape); + + FillTensor(x); + FillTensor(y); + + // create mul op + cpp::OpDesc mul_op_desc; + mul_op_desc.SetType("mul"); + mul_op_desc.SetInput("X", {x_var_name}); + mul_op_desc.SetInput("Y", {y_var_name}); + mul_op_desc.SetOutput("Out", {out_var_name}); + mul_op_desc.SetAttr("x_num_col_dims", static_cast(x_num_col_dims)); + mul_op_desc.SetAttr("y_num_col_dims", static_cast(y_num_col_dims)); + + auto mul_op = CreateOp(mul_op_desc, &scope); + LauchOp(mul_op, {x_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + mul_ref(mul_op); + + // compare results + auto* out_data 
= out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(XPUBridges, mul) { + test_mul({1, 2, 3, 4}, {24, 2}, 1, 1); + test_mul({2, 2, 3, 4}, {24, 2}, 1, 1); + test_mul({2, 7}, {7, 3}, 1, 1); + // test_mul({1, 8, 8, 1}, {1, 8, 2, 2}, 2, 2); + // test_mul({1, 5, 5, 1}, {1, 5, 7, 7}, 2, 2); + // test_mul({1, 4, 1, 1}, {4, 8}, 1, 1); +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(mul); +USE_XPU_BRIDGE(mul); diff --git a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h b/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h new file mode 100644 index 0000000000000000000000000000000000000000..27e936eaaa125f26b0bdab43f5c38d60769cfd88 --- /dev/null +++ b/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/kernels/xpu/bridges/registry.h" + +USE_XPU_BRIDGE(relu); +USE_XPU_BRIDGE(conv2d); +USE_XPU_BRIDGE(depthwise_conv2d); +USE_XPU_BRIDGE(elementwise_add); +USE_XPU_BRIDGE(pool2d); +USE_XPU_BRIDGE(softmax); +USE_XPU_BRIDGE(mul); diff --git a/lite/kernels/xpu/bridges/pool_op.cc b/lite/kernels/xpu/bridges/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbc6a9919c446508afa5a3b8a1c35352f9b8ecfa --- /dev/null +++ b/lite/kernels/xpu/bridges/pool_op.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type PoolConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // get input, and attributes + auto x_var_name = op_info->Input("X").front(); + auto pooling_type = op_info->GetAttr("pooling_type"); + auto ceil_mode = op_info->GetAttr("ceil_mode"); + auto paddings = op_info->GetAttr>("paddings"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto strides = op_info->GetAttr>("strides"); + auto exclusive = op_info->GetAttr("exclusive"); + + // create pool node and set params from op + CHECK(input_nodes.count(x_var_name)); + std::shared_ptr pool_node = nullptr; + if (pooling_type == "max") { + if (global_pooling) { + pool_node = std::make_shared( + graph_ctx->builder->CreateGlobalMaxPool2D( + *input_nodes.at(x_var_name))); + } else { + pool_node = std::make_shared( + graph_ctx->builder->CreateMaxPool2D(*input_nodes.at(x_var_name), + lite::xpu::CvtShape(ksize), + lite::xpu::CvtShape(strides), + lite::xpu::CvtShape(paddings), + "NCHW", + ceil_mode)); + } + } else if (pooling_type == "avg") { + if (global_pooling) { + pool_node = std::make_shared( + graph_ctx->builder->CreateGlobalAvgPool2D( + *input_nodes.at(x_var_name))); + } else { + pool_node = std::make_shared( + // !exclusive ---> count_include_pad + graph_ctx->builder->CreateAvgPool2D(*input_nodes.at(x_var_name), + lite::xpu::CvtShape(ksize), + lite::xpu::CvtShape(strides), + lite::xpu::CvtShape(paddings), + "NCHW", + ceil_mode, + !exclusive)); + } + } else { + LOG(FATAL) << "Unsupported pooling type: " << pooling_type; + } + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = pool_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(pool2d, paddle::lite::kernels::xpu::bridges::PoolConverter); diff --git a/lite/kernels/xpu/bridges/pool_op_test.cc b/lite/kernels/xpu/bridges/pool_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed5f922d59b5ca5e387076c9a533c4b4c251cc87 --- /dev/null +++ b/lite/kernels/xpu/bridges/pool_op_test.cc @@ -0,0 +1,267 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/pool_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void pool_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto& in_dims = x->dims(); + auto& out_dims = out->dims(); + + const float* src_ptr = x->data(); + float* dst_ptr = out->mutable_data(); + + std::vector ksize = op_info->GetAttr>("ksize"); + std::vector strides = op_info->GetAttr>("strides"); + std::vector paddings = op_info->GetAttr>("paddings"); + bool exclusive = op_info->GetAttr("exclusive"); + std::string pooling_type = op_info->GetAttr("pooling_type"); + bool global_pooling = op_info->GetAttr("global_pooling"); + + int in_n = in_dims[0]; + int in_c = in_dims[1]; + int in_h = in_dims[2]; + int in_w = in_dims[3]; + int size_in_n = in_c * in_h * in_w; + int size_in_c = in_h * in_w; + + int out_h = out_dims[2]; + int out_w = out_dims[3]; + int size_out_n = in_c * out_h * out_w; + int size_out_c = out_h * out_w; + + int window_h = ksize[0]; + int window_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + + if (global_pooling == true) { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + const float* src = src_ptr + n * size_in_n + c * size_in_c; + float res = src[0]; + if (pooling_type == "max") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res = cur_val > res ? cur_val : res; + } + } else if (pooling_type == "avg") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res += cur_val; + } + res /= size_in_c; + } + dst_ptr[n * size_out_n + c] = res; + } + } + } else { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + for (int h = 0; h < out_h; ++h) { + int sh = h * stride_h; + int eh = sh + window_h; + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; + for (int w = 0; w < out_w; ++w) { + int sw = w * stride_w; + int ew = sw + window_w; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; + int pooling_size = (ew - sw) * (eh - sh); + if (pooling_size == 0) continue; + float res = 0.f; + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; + if (kh == sh && kw == sw) { + res = src_ptr[src_idx]; + } else { + if (pooling_type == "max") { + res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; + } + if (pooling_type == "avg") { + res += src_ptr[src_idx]; + } + } + } + } + if (pooling_type == "avg") { + if (exclusive) { + res /= pooling_size; + } else { + res /= window_h * window_w; + } + } + dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; + } + } + } + } + } +} + +void test_pool(int bs, + int ic, + int ih, + int iw, + std::string pooling_type, + bool ceil_mode, + bool global_pooling, + bool exclusive, + int ksize, + int stride, + int padding) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("pool2d"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("pooling_type", pooling_type); + opdesc.SetAttr("ksize", std::vector({ksize, ksize})); + opdesc.SetAttr("global_pooling", global_pooling); + opdesc.SetAttr("exclusive", exclusive); + opdesc.SetAttr("strides", std::vector({stride, stride})); + opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("ceil_mode", ceil_mode); + + // create and convert op to XPU model, then run it on XPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + // execute reference implementation and save to output tensor + pool_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(XPUBridges, pool) { + for (auto pooling_type : {"max", "avg"}) { + for (auto bs : {1, 3}) { + for (auto ic : {2}) { + for (auto ih : {3}) { + for (auto iw : {4}) { + test_pool(bs, ic, ih, iw, pooling_type, true, true, true, 0, 1, 0); + } + } + } + } + } + + for (auto pooling_type : {"max"}) { + for (auto ceil_mode : {true, false}) { + for (auto ksize : {2, 3}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1}) { + for (auto bs : {1, 3}) { + for (auto ic : {2}) { + for (auto ih : {3}) { + for (auto iw : {4}) { + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + false, + true, + ksize, + stride, + padding); + } + } + } + } + } + } + } + } + } + + for (auto pooling_type : {"avg"}) { + for (auto ceil_mode : {true, false}) { + for (auto exclusive : {true, false}) { + for (auto ksize : {2, 3}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1}) { + for (auto bs : {1, 3}) { + for (auto ic : {2}) { + for (auto ih : {3}) { + for (auto iw : {4}) { + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + false, + exclusive, + ksize, + stride, + padding); + } + } + } + } + } + } + } + } + } + } +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(pool2d); +USE_XPU_BRIDGE(pool2d); diff --git a/lite/kernels/xpu/bridges/registry.cc b/lite/kernels/xpu/bridges/registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ab1b69a25a29aeb1c1ceaff25525459ef2e94cd --- /dev/null +++ b/lite/kernels/xpu/bridges/registry.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/bridges/registry.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +Factory& Factory::Instance() { + static Factory g_xpu_bridge; + return g_xpu_bridge; +} + +bool Factory::HasType(const std::string& op_type) const { + return map_.count(op_type); +} + +void Factory::Insert(const std::string& op_type, const func_type& func_name) { + map_.insert(std::make_pair(op_type, func_name)); +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/bridges/registry.h b/lite/kernels/xpu/bridges/registry.h new file mode 100644 index 0000000000000000000000000000000000000000..c990399c1cdeb865dc214d2f1c6d1970b6d27b85 --- /dev/null +++ b/lite/kernels/xpu/bridges/registry.h @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +// xpu network builder and constant tensors +class graph_ctx_type { + public: + std::shared_ptr builder; + std::shared_ptr params; +}; + +// var_name, xpu node pointer +using node_map_type = + std::unordered_map>; + +using func_type = std::function, graph_ctx_type*, const node_map_type&)>; +using cvt_map_type = std::unordered_map; +class Factory { + public: + static Factory& Instance(); + + const cvt_map_type& AllFunctions() const { return map_; } + bool HasType(const std::string& op_type) const; + void Insert(const std::string& op_type, const func_type& func_name); + Factory() = default; + + private: + cvt_map_type map_; + DISALLOW_COPY_AND_ASSIGN(Factory); +}; + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +// some platform-independent defintion +#if defined(_WIN32) +#define UNUSED +#define __builtin_expect(EXP, C) (EXP) +#else +#define UNUSED __attribute__((unused)) +#endif + +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +#define REGISTER_XPU_BRIDGE(op_type, cvt_func_name) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_xpu_bridge_##op_type##__, \ + "REGISTER_XPU_BRIDGE must be called in global namespace only once!"); \ + int __reg_xpu_bridge_##op_type##_Insert() { \ + paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert( \ + #op_type, cvt_func_name); \ + return 0; \ + } + +#define USE_XPU_BRIDGE(op_type) \ + extern int __reg_xpu_bridge_##op_type##_Insert(); \ + static int __reg_xpu_bridge_##op_type##_Insert_return UNUSED = \ + __reg_xpu_bridge_##op_type##_Insert(); diff --git a/lite/kernels/xpu/bridges/softmax_op.cc b/lite/kernels/xpu/bridges/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3972496762a1d399ab59e7a69b0e9e18a9c28300 --- /dev/null +++ b/lite/kernels/xpu/bridges/softmax_op.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type SoftmaxConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // get op's attributes + auto x_var_name = op_info->Input("X").front(); + auto axis = op_info->GetAttr("axis"); + + // create softmax node and set params from ops + CHECK(input_nodes.count(x_var_name)); + std::shared_ptr softmax_node = nullptr; + softmax_node = std::make_shared( + graph_ctx->builder->CreateSoftmax(*input_nodes.at(x_var_name), axis)); + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = softmax_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(softmax, + paddle::lite::kernels::xpu::bridges::SoftmaxConverter); diff --git a/lite/kernels/xpu/bridges/softmax_op_test.cc b/lite/kernels/xpu/bridges/softmax_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cd12cbf4e8dc108ac43fec55a568ecec72a51ab --- /dev/null +++ b/lite/kernels/xpu/bridges/softmax_op_test.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/softmax_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +template +void softmax_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + + auto x_rank = x_dims.size(); + int axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis += x_rank; + } + int axis_size = x_dims[axis]; + int outer_num = x_dims.Slice(0, axis).production(); + int inner_num = x_dims.Slice(axis + 1, x_rank).production(); + int compute_size = outer_num * inner_num; + for (int i = 0; i < compute_size; i++) { + int idx_inner = i % inner_num; + int idx_outer = (i / inner_num) * axis_size; + int start = idx_outer * inner_num + idx_inner; + int offset; + + offset = start; + dtype max_data = std::numeric_limits::lowest(); + for (int j = 0; j < axis_size; j++) { + max_data = x_data[offset] > max_data ? x_data[offset] : max_data; + offset += inner_num; + } + + offset = start; + dtype sum_data = (dtype)0; + for (int j = 0; j < axis_size; j++) { + out_data[offset] = exp(x_data[offset] - max_data); + sum_data += out_data[offset]; + offset += inner_num; + } + + offset = start; + for (int j = 0; j < axis_size; j++) { + out_data[offset] /= sum_data; + offset += inner_num; + } + } +} + +void test_softmax(int bs, int ic, int ih, int iw, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("softmax"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + // create and convert op to XPU model, then run it on XPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + // execute reference implementation and save to output tensor + softmax_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(XPUBridges, softmax) { + for (auto bs : {2, 3}) { + for (auto ic : {4}) { + for (auto ih : {5}) { + for (auto iw : {6}) { + for (auto axis : {-3, -1, 0, 1, 2, 3}) { + test_softmax(bs, ic, ih, iw, axis); + } + } + } + } + } +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(softmax); +USE_XPU_BRIDGE(softmax); diff --git a/lite/kernels/xpu/bridges/test_helper.cc b/lite/kernels/xpu/bridges/test_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a19324b946203c008093136d7a207ffaf23fbd6 --- /dev/null +++ b/lite/kernels/xpu/bridges/test_helper.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/bridges/test_helper.h" +#include +#include "lite/backends/xpu/builder.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/operators/graph_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void LauchOp(const std::shared_ptr op, + const std::vector& input_var_names, + const std::vector& output_var_names) { + auto scope = op->scope(); + auto op_type = op->op_info()->Type(); + + // convert lite op to XPU op + const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); + const auto& supported_lists = bridges.AllFunctions(); + CHECK(bridges.HasType(op_type)); + graph_ctx_type graph_ctx; + graph_ctx.builder = std::make_shared(); + graph_ctx.params = + std::make_shared(); + node_map_type input_nodes; + for (auto input_var_name : input_var_names) { + auto input = scope->FindVar(input_var_name)->GetMutable(); + auto input_node = std::make_shared( + graph_ctx.builder->CreateTensor(input_var_name, + lite::xpu::CvtShape(input->dims()), + ::xtcl::Float(32))); + input_nodes[input_var_name] = input_node; + } + auto output_nodes = supported_lists.at(op_type)(op, &graph_ctx, input_nodes); + CHECK_GT(output_nodes.size(), 0); + + // build network graph and output model data + std::vector> ordered_output_nodes; + for (auto output_var_name : output_var_names) { + ordered_output_nodes.push_back(output_nodes.at(output_var_name)); + } + std::string weight_var_name = "weight"; + auto weight = scope->Var(weight_var_name)->GetMutable(); + weight->set_persistable(true); + weight->set_precision(PRECISION(kInt8)); + CHECK(lite::xpu::BuildModel( + graph_ctx.builder, graph_ctx.params, &ordered_output_nodes, weight)); + CHECK_GT(weight->numel(), 0); + CHECK(weight->data() != nullptr); + + // create graph op and set inputs and outputs + cpp::OpDesc graph_op_desc; + graph_op_desc.SetType("graph_op"); + graph_op_desc.SetInput("Inputs", input_var_names); + graph_op_desc.SetInput("Weight", {weight_var_name}); + graph_op_desc.SetOutput("Outputs", output_var_names); + + auto graph_op = + std::make_shared(graph_op_desc.Type()); + graph_op->SetValidPlaces({Place{TARGET(kXPU), PRECISION(kFloat)}}); + CHECK(graph_op->Attach(graph_op_desc, scope)); + CHECK(graph_op->CheckShape()); + CHECK(graph_op->InferShape()); + + // create graph op kernel and set XPU context + auto graph_kernels = + graph_op->CreateKernels({Place{TARGET(kXPU), PRECISION(kFloat)}}); + CHECK(!graph_kernels.empty()); + auto graph_kernel = + std::move(graph_kernels.front()); // use the first kernel by default + auto graph_device = ContextScheduler::Global().NewContext(TARGET(kXPU)); + graph_kernel->SetContext(std::move(graph_device)); + + // perform graph op kernel and store to output variables + graph_kernel->Launch(); + + lite::xpu::DeviceInfo::Global().Clear(); +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // 
namespace lite +} // namespace paddle + +USE_LITE_OP(graph_op); +USE_LITE_KERNEL(graph_op, kXPU, kFloat, kNCHW, def); diff --git a/lite/kernels/xpu/bridges/test_helper.h b/lite/kernels/xpu/bridges/test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..c8bba5da66550a9eccaefa8b2d9a31a233f5f706 --- /dev/null +++ b/lite/kernels/xpu/bridges/test_helper.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +template +std::shared_ptr CreateOp(const cpp::OpDesc& opdesc, lite::Scope* scope) { + auto op = std::make_shared(opdesc.Type()); + op->SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kXPU), PRECISION(kFloat)}}); + CHECK(op->Attach(opdesc, scope)); + CHECK(op->CheckShape()); + CHECK(op->InferShape()); + return op; +} + +// T is the target data type +// R is the range data type, e.g. int, half +template +void FillTensor(Tensor* x, + T lower = static_cast(-2), + T upper = static_cast(2)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + + T* x_data = x->mutable_data(); + for (int i = 0; i < x->dims().production(); ++i) { + auto r = uniform_dist(rng) * (upper - lower) + lower; + x_data[i] = static_cast(static_cast(r)); + } +} + +void LauchOp(const std::shared_ptr op, + const std::vector& input_var_names, + const std::vector& output_var_names); + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/graph_compute.cc b/lite/kernels/xpu/graph_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9e5be1a1d5c764c378f3fdf29d73148743962a4 --- /dev/null +++ b/lite/kernels/xpu/graph_compute.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/graph_compute.h" +#include +#include +#include +#include +#include "lite/backends/xpu/runtime.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void GraphCompute::PrepareForRun() { + // auto& ctx = this->ctx_->template As(); + auto& param = this->Param(); + CHECK(param.weight); + CHECK(lite::xpu::LoadModel(*param.weight, &runtime_)); + CHECK(runtime_ != nullptr); +} + +void GraphCompute::Run() { + auto& param = this->Param(); + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + auto start_time = GetCurrentUS(); + for (int i = 0; i < param.inputs.size(); i++) { + auto input_var_name = param.inputs[i].first; + auto input_tensor = param.inputs[i].second; + LOG(INFO) << "input dims[" << i << ":" << input_var_name + << "]: " << input_tensor->dims(); + auto input_tensor_data = input_tensor->data(); + for (int j = 0; j < input_tensor->dims().production(); j++) { + VLOG(3) << input_tensor_data[j]; + } + auto input_ndarray = xtcl::xNDArray::Empty( + input_tensor->dims().Vectorize(), {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto input_ndarray_data = + static_cast(input_ndarray.ToDLPack()->dl_tensor.data); + std::memcpy(input_ndarray_data, + input_tensor_data, + sizeof(float) * input_tensor->dims().production()); + runtime_->SetInputZeroCopy(input_var_name, + &input_ndarray.ToDLPack()->dl_tensor); + } + runtime_->Run(); + for (int i = 0; i < param.outputs.size(); i++) { + auto output_ndarray = runtime_->GetOutput(i); + auto output_var_name = param.outputs[i].first; + auto output_tensor = param.outputs[i].second; + output_tensor->Resize(output_ndarray.Shape()); + LOG(INFO) << "output dims[" << i << ":" << output_var_name + << "]: " << output_tensor->dims(); + auto output_ndarray_data = + static_cast(output_ndarray.ToDLPack()->dl_tensor.data); + auto output_tensor_data = output_tensor->mutable_data(); + std::memcpy(output_tensor_data, + output_ndarray_data, + sizeof(float) * output_tensor->dims().production()); + for (int j = 0; j < output_tensor->dims().production(); j++) { + VLOG(3) << output_tensor_data[j]; + } + } + LOG(INFO) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us"; +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(graph_op, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::GraphCompute, + def) + .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/lite/kernels/xpu/graph_compute.h b/lite/kernels/xpu/graph_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..5406daa8a1b757989d006f4e0ea09baedc809e33 --- /dev/null +++ b/lite/kernels/xpu/graph_compute.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class GraphCompute : public KernelLite { + public: + using param_t = operators::GraphParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~GraphCompute() = default; + + private: + std::shared_ptr runtime_{nullptr}; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/operators/graph_op.cc b/lite/operators/graph_op.cc index 8fd3fe8e6dc07a677d48dc54d330fd6568698de2..018ce264e2f18862549a4abc0444d02dcbb573ee 100644 --- a/lite/operators/graph_op.cc +++ b/lite/operators/graph_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/operators/graph_op.h" +#include #include "lite/core/op_registry.h" namespace paddle { @@ -34,7 +35,8 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { for (auto var : inputs) { CHECK(scope->FindVar(var)); - param_.inputs.push_back(scope->FindVar(var)->GetMutable()); + param_.inputs.push_back( + std::make_pair(var, scope->FindVar(var)->GetMutable())); } param_.weight = scope->FindVar(weight.front())->GetMutable(); @@ -42,7 +44,8 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { for (auto var : outputs) { CHECK(scope->FindVar(var)); - param_.outputs.push_back(scope->FindVar(var)->GetMutable()); + param_.outputs.push_back( + std::make_pair(var, scope->FindVar(var)->GetMutable())); } return true; diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 45d53f17f91ce5a7b42e9e54829640b9c94005db..097dd91163357d9fa43818c68687a48de06fe8aa 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -14,6 +14,7 @@ #pragma once #include +#include #include #include "lite/api/paddle_place.h" #include "lite/core/scope.h" @@ -69,9 +70,9 @@ struct CalibParam { }; struct GraphParam { - std::vector inputs{}; + std::vector> inputs{}; lite::Tensor* weight{}; - std::vector outputs{}; + std::vector> outputs{}; }; /// -------------------------- NN operators ------------------------------------ diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 1e5fdbb34de3fd0b986e6ec635545fd114f42e5f..f2c2c9a71666b539248c955c6e75470c5933b5c9 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,4 +1,4 @@ -if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tools/build_npu.sh b/lite/tools/build_npu.sh index 30fd812fb5f49d141786d9dab0f64788e27d07fc..03a74046f17ad03bccc7b6d5050acae9d643686c 100755 --- a/lite/tools/build_npu.sh +++ b/lite/tools/build_npu.sh @@ -5,12 +5,13 @@ set -ex 
ARM_OS="android" # android only yet ARM_ABI="armv8" # armv8, armv7 ARM_LANG="gcc" # gcc only yet -ANDROID_STL="c++_shared" # c++_shared, c++_static +ANDROID_STL="c++_static" # c++_shared, c++_static DDK_ROOT="$(pwd)/ai_ddk_lib/" # HIAI SDK from https://developer.huawei.com/consumer/cn/hiai/ TARGET_NAME="test_npu_pass" # default target BUILD_EXTRA=OFF # ON(with sequence ops)/OFF WITH_JAVA=ON # ON(build jar and jni so)/OFF WITH_TESTING=ON # ON/OFF +SHUTDOWN_LOG=OFF # ON(disable logging)/OFF ON_TINY_PUBLISH=OFF # ON(tiny publish)/OFF(full publish) function print_usage { @@ -75,6 +76,7 @@ function build_npu { fi if [[ "${ON_TINY_PUBLISH}" == "ON" ]]; then WITH_TESTING=OFF + SHUTDOWN_LOG=ON publish_dir="tiny_publish" else publish_dir="full_publish" @@ -97,6 +99,7 @@ function build_npu { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DWITH_TESTING=${WITH_TESTING} \ -DLITE_WITH_JAVA=${WITH_JAVA} \ + -DLITE_SHUTDOWN_LOG=${SHUTDOWN_LOG} \ -DLITE_WITH_NPU=ON \ -DLITE_ON_TINY_PUBLISH=${ON_TINY_PUBLISH} \ -DANDROID_API_LEVEL=24 \ diff --git a/lite/tools/build_xpu.sh b/lite/tools/build_xpu.sh new file mode 100755 index 0000000000000000000000000000000000000000..62a123c82b2945147fa8616ad8faf0af33a32302 --- /dev/null +++ b/lite/tools/build_xpu.sh @@ -0,0 +1,116 @@ +#!/bin/bash +set -ex + +# global variables with default value +XPU_SDK_ROOT="$(pwd)/../XPU_SDK" # XPU SDK +TARGET_NAME="lite_compile_deps" # default target +BUILD_EXTRA=ON # ON(with sequence ops)/OFF +WITH_TESTING=ON # ON/OFF + +function print_usage { + echo -e "\nUSAGE:" + echo + echo "----------------------------------------" + echo -e "--xpu_sdk_root=" + echo -e "--target_name=" + echo "----------------------------------------" + echo +} + +# readonly variables with default value +readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ + -DWITH_PYTHON=OFF \ + -DLITE_WITH_ARM=OFF" + +readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1} + +readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz +readonly workspace=$(pwd) + +function prepare_thirdparty { + if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then + rm -rf $workspace/third-party + + if [ ! -f $workspace/third-party-05b862.tar.gz ]; then + wget $THIRDPARTY_TAR + fi + tar xzf third-party-05b862.tar.gz + else + git submodule update --init --recursive + fi +} + +# for code gen, a source file is generated after a test, but is dependended by some targets in cmake. +# here we fake an empty file to make cmake works. +function prepare_workspace { + # in build directory + # 1. Prepare gen_code file + GEN_CODE_PATH_PREFIX=lite/gen_code + mkdir -p ./${GEN_CODE_PATH_PREFIX} + touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc + + # 2.Prepare debug tool + DEBUG_TOOL_PATH_PREFIX=lite/tools/debug + mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} + cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ + + # clone submodule + # git submodule update --init --recursive + prepare_thirdparty +} + +function build_xpu { + build_dir=${workspace}/build.lite.xpu + mkdir -p $build_dir + cd $build_dir + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + prepare_workspace + cmake .. 
\ + ${CMAKE_COMMON_OPTIONS} \ + -DWITH_GPU=OFF \ + -DWITH_MKLDNN=OFF \ + -DLITE_WITH_X86=ON \ + -DWITH_MKL=ON \ + -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_XPU=ON \ + -DWITH_TESTING=${WITH_TESTING} \ + -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + + make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE + + cd - + echo "Done" +} + +function main { + # Parse command line. + for i in "$@"; do + case $i in + --target_name=*) + TARGET_NAME="${i#*=}" + shift + ;; + --build_extra=*) + BUILD_EXTRA="${i#*=}" + shift + ;; + --xpu_sdk_root=*) + XPU_SDK_ROOT="${i#*=}" + shift + ;; + build) + build_xpu + shift + ;; + *) + # unknown option + print_usage + exit 1 + ;; + esac + done +} + +main $@ diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 0e8f75f10ace88a1fa57ebce8f158ab3416546b6..8be8e6e6b6da1e2aa38b6fcbcf95b23a8543a5be 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -248,6 +248,63 @@ function build_test_train { } +function cmake_xpu { + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + prepare_workspace + cmake .. \ + ${common_flags} \ + -DWITH_GPU=OFF \ + -DWITH_MKLDNN=OFF \ + -DLITE_WITH_X86=ON \ + -DWITH_MKL=ON \ + -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_XPU=ON \ + -DXPU_SDK_ROOT="$(pwd)/../../XPU_SDK" +} + +function build_xpu { + make lite_compile_deps -j$NUM_CORES_FOR_COMPILE +} + +# It will eagerly test all lite related unittests. +function test_xpu { + # Due to the missing of xpu kernels, we skip the following tests temporarily. + # TODO(xxx) clear the skip list latter + local skip_list=("test_paddle_api" "test_cxx_api" "test_googlenet" + "test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86" + "test_inceptionv4_lite_x86" "test_light_api" + "test_apis" "test_model_bin" + ) + local to_skip=0 + for _test in $(cat $TESTS_FILE); do + to_skip=0 + for skip_name in ${skip_list[@]}; do + if [ $skip_name = $_test ]; then + echo "to skip " $skip_name + to_skip=1 + fi + done + + if [ $to_skip -eq 0 ]; then + ctest -R $_test -V + fi + done +} + +# Build the code and run lite server tests. This is executed in the CI system. +function build_test_xpu { + cur_dir=$(pwd) + + build_dir=$cur_dir/build.lite.xpu + mkdir -p $build_dir + cd $build_dir + + cmake_xpu + build_xpu + + test_xpu +} + # test_arm_android function test_arm_android { local test_name=$1 @@ -850,6 +907,10 @@ function main { cmake_x86 shift ;; + cmake_xpu) + cmake_xpu + shift + ;; cmake_opencl) cmake_opencl $ARM_OS $ARM_ABI $ARM_LANG shift @@ -874,6 +935,10 @@ function main { test_server shift ;; + test_xpu) + test_xpu + shift + ;; test_arm) test_arm $ARM_OS $ARM_ABI $ARM_LANG $ARM_PORT shift @@ -890,6 +955,10 @@ function main { build_test_server shift ;; + build_test_xpu) + build_test_xpu + shift + ;; build_test_train) build_test_train shift diff --git a/lite/tools/debug/CMakeLists.txt b/lite/tools/debug/CMakeLists.txt index ae098b05a66668e1cd4166c4b174feec538d8b37..43c0812ab91f6ddcba02f93d2eea60f5a5268341 100644 --- a/lite/tools/debug/CMakeLists.txt +++ b/lite/tools/debug/CMakeLists.txt @@ -13,6 +13,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_ON_MODEL_OPTIMIZE_TOOL) X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} CL_DEPS ${opencl_kernels}) endif()
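For reference, a minimal usage sketch of the new lite/tools/build_xpu.sh script added above; the SDK path below is a placeholder and the target name simply repeats the script's default. Note that the flags must come before the trailing "build" argument, because the script processes its arguments in order and starts the build as soon as it sees "build":

    # assumes an XPU SDK has been unpacked to /path/to/XPU_SDK (placeholder path)
    ./lite/tools/build_xpu.sh --xpu_sdk_root=/path/to/XPU_SDK --target_name=lite_compile_deps build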