Unverified commit 75333c57, authored by TeslaZhao, committed by GitHub

Merge pull request #946 from zhangjun/arm-devel

[WIP] Support ARM and XPU
@@ -49,6 +49,9 @@ set(THIRD_PARTY_BUILD_TYPE Release)
option(WITH_AVX "Compile Paddle Serving with AVX intrinsics" OFF)
option(WITH_MKL "Compile Paddle Serving with MKL support." OFF)
option(WITH_GPU "Compile Paddle Serving with NVIDIA GPU" OFF)
option(WITH_LITE "Compile Paddle Serving with Paddle Lite Engine" OFF)
option(WITH_XPU "Compile Paddle Serving with Baidu Kunlun" OFF)
option(WITH_PYTHON "Compile Paddle Serving with Python" ON)
option(CLIENT "Compile Paddle Serving Client" OFF)
option(SERVER "Compile Paddle Serving Server" OFF)
option(APP "Compile Paddle Serving App package" OFF)
@@ -66,40 +69,40 @@ if (NOT DEFINED WITH_MKLDNN)
endif()
endif()
if (SERVER)
include(external/jsoncpp)
#include(external/rocksdb)
endif()
if (SERVER OR CLIENT)
include(external/snappy)
include(external/leveldb)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/brpc)
include(external/gflags)
include(external/glog)
if (WITH_PYTHON)
include(external/pybind11)
include(external/python)
endif()
include(generic)
include(flags)
endif()
if (APP)
include(external/zlib)
include(external/boost)
include(external/protobuf)
include(external/gflags)
include(external/glog)
include(external/pybind11)
include(external/python)
include(generic)
endif()
if (SERVER)
include(external/jsoncpp)
#include(external/rocksdb)
include(external/cudnn)
include(paddlepaddle)
endif()
message("paddle serving source dir: " ${PADDLE_SERVING_SOURCE_DIR})
@@ -125,26 +128,24 @@ set(EXTERNAL_LIBS
)
if(SERVER)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
list(APPEND EXTERNAL_LIBS paddlepaddle)
endif()
add_subdirectory(core)
if(SERVER)
add_subdirectory(paddle_inference)
endif()
if (WITH_PYTHON)
add_subdirectory(python)
endif()
@@ -22,6 +22,7 @@ set(BOOST_PROJECT "extern_boost")
# version of boost, say, 1.66.0, doesn't build on CentOS 6. We
# checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here.
set(BOOST_VER "1.74.0")
set(BOOST_TAR "boost_1_74_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
......
@@ -38,13 +38,21 @@ INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
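# Select the brpc source for this build: when WITH_LITE is on, brpc comes from a
# different fork/branch; otherwise the pinned commit used by regular builds is kept.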
if(WITH_LITE)
set(BRPC_REPO "https://github.com/zhangjun/incubator-brpc.git")
set(BRPC_TAG "master")
else()
set(BRPC_REPO "https://github.com/wangjiawei04/brpc")
set(BRPC_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47")
endif()
# If a minimal .a is needed, you can set WITH_DEBUG_SYMBOLS=OFF
ExternalProject_Add(
extern_brpc
${EXTERNAL_PROJECT_LOG_ARGS}
# TODO(gongwb): change to the newest repo when it is updated.
GIT_REPOSITORY ${BRPC_REPO}
GIT_TAG ${BRPC_TAG}
PREFIX ${BRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
@@ -93,7 +93,11 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
if(WITH_LITE OR WITH_XPU)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -fopenmp -pthread -ldl -lrt")
else()
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif()
endif(NOT APPLE)
set_property(GLOBAL PROPERTY FLUID_MODULES "")
......
@@ -39,6 +39,12 @@ if (WITH_GPU)
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
endif()
elseif (WITH_LITE)
if (WITH_XPU)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm-xpu")
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm")
endif()
else()
if (WITH_AVX)
if (WITH_MKLML)
@@ -51,7 +57,12 @@ else()
endif()
endif()
if(WITH_LITE)
SET(PADDLE_LIB_PATH "http://paddle-serving.bj.bcebos.com/inferlib/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
else()
SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
endif()
MESSAGE(STATUS "PADDLE_LIB_PATH=${PADDLE_LIB_PATH}")
if (WITH_GPU OR WITH_MKLML)
if (WITH_TRT)
@@ -117,11 +128,24 @@ ADD_LIBRARY(paddle_fluid SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.so)
if (WITH_TRT)
ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
endif()
if (WITH_LITE)
ADD_LIBRARY(paddle_api_full_bundled STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_api_full_bundled PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/lite/cxx/lib/libpaddle_api_full_bundled.a)
if (WITH_XPU)
ADD_LIBRARY(xpuapi SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET xpuapi PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xpu/lib/libxpuapi.so)
ADD_LIBRARY(xpurt SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET xpurt PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xpu/lib/libxpurt.so)
endif()
endif()
ADD_LIBRARY(xxhash STATIC IMPORTED GLOBAL)
@@ -132,7 +156,14 @@ LIST(APPEND external_project_dependencies paddle)
LIST(APPEND paddle_depend_libs
xxhash)
if(WITH_LITE)
LIST(APPEND paddle_depend_libs paddle_api_full_bundled)
if(WITH_XPU)
LIST(APPEND paddle_depend_libs xpuapi xpurt)
endif()
endif()
if(WITH_TRT)
LIST(APPEND paddle_depend_libs
nvinfer nvinfer_plugin)
endif()
@@ -27,6 +27,8 @@ install(FILES ${inc}
DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure)
endif()
if (WITH_PYTHON)
py_proto_compile(general_model_config_py_proto SRCS proto/general_model_config.proto)
add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
@@ -70,7 +72,7 @@ if (SERVER)
py_proto_compile(server_config_py_proto SRCS proto/server_configure.proto)
add_custom_target(server_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(server_config_py_proto server_config_py_proto_init)
if (NOT WITH_GPU AND NOT WITH_LITE)
add_custom_command(TARGET server_config_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
@@ -114,3 +116,5 @@ add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endif()
endif()
@@ -45,6 +45,8 @@ message EngineDesc {
optional bool force_update_static_cache = 15;
optional bool enable_ir_optimization = 16;
optional bool use_trt = 17;
optional bool use_lite = 18;
optional bool use_xpu = 19;
};
// model_toolkit conf
......
@@ -6,6 +6,11 @@ add_dependencies(serving pdcodegen fluid_cpu_engine pdserving paddle_fluid cube-
if (WITH_GPU)
add_dependencies(serving fluid_gpu_engine)
endif()
if (WITH_LITE)
add_dependencies(serving fluid_arm_engine)
endif()
target_include_directories(serving PUBLIC
${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
)
@@ -15,6 +20,11 @@ if(WITH_GPU)
-Wl,--no-whole-archive)
endif()
if(WITH_LITE)
target_link_libraries(serving -Wl,--whole-archive fluid_arm_engine
-Wl,--no-whole-archive)
endif()
target_link_libraries(serving -Wl,--whole-archive fluid_cpu_engine
-Wl,--no-whole-archive)
......
@@ -38,6 +38,8 @@ class InferEngineCreationParams {
_static_optimization = false;
_force_update_static_cache = false;
_use_trt = false;
_use_lite = false;
_use_xpu = false;
}
void set_path(const std::string& path) { _path = path; }
@@ -52,6 +54,10 @@ class InferEngineCreationParams {
void set_use_trt(bool use_trt) { _use_trt = use_trt; }
void set_use_lite(bool use_lite) { _use_lite = use_lite; }
void set_use_xpu(bool use_xpu) { _use_xpu = use_xpu; }
bool enable_memory_optimization() const {
return _enable_memory_optimization;
}
@@ -60,6 +66,10 @@ class InferEngineCreationParams {
bool use_trt() const { return _use_trt; }
bool use_lite() const { return _use_lite; }
bool use_xpu() const { return _use_xpu; }
void set_static_optimization(bool static_optimization = false) {
_static_optimization = static_optimization;
}
@@ -79,6 +89,9 @@ class InferEngineCreationParams {
<< "model_path = " << _path << ", "
<< "enable_memory_optimization = " << _enable_memory_optimization
<< ", "
<< "enable_tensorrt = " << _use_trt << ", "
<< "enable_lite = " << _use_lite << ", "
<< "enable_xpu = " << _use_xpu << ", "
<< "enable_ir_optimization = " << _enable_ir_optimization << ", " << "enable_ir_optimization = " << _enable_ir_optimization << ", "
<< "static_optimization = " << _static_optimization << ", " << "static_optimization = " << _static_optimization << ", "
<< "force_update_static_cache = " << _force_update_static_cache; << "force_update_static_cache = " << _force_update_static_cache;
...@@ -91,6 +104,8 @@ class InferEngineCreationParams { ...@@ -91,6 +104,8 @@ class InferEngineCreationParams {
bool _static_optimization; bool _static_optimization;
bool _force_update_static_cache; bool _force_update_static_cache;
bool _use_trt; bool _use_trt;
bool _use_lite;
bool _use_xpu;
};
class InferEngine {
@@ -179,6 +194,14 @@ class ReloadableInferEngine : public InferEngine {
_infer_engine_params.set_use_trt(conf.use_trt());
}
if (conf.has_use_lite()) {
_infer_engine_params.set_use_lite(conf.use_lite());
}
if (conf.has_use_xpu()) {
_infer_engine_params.set_use_xpu(conf.use_xpu());
}
if (!check_need_reload() || load(_infer_engine_params) != 0) {
LOG(ERROR) << "Failed load model_data_path" << _model_data_path;
return -1;
......
@@ -13,8 +13,13 @@
# limitations under the License
if (NOT CLIENT_ONLY)
add_subdirectory(inferencer-fluid-cpu)
if (WITH_GPU)
add_subdirectory(inferencer-fluid-gpu)
endif()
if (WITH_LITE)
add_subdirectory(inferencer-fluid-arm)
endif()
endif()
FILE(GLOB fluid_arm_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(fluid_arm_engine ${fluid_arm_engine_srcs})
target_include_directories(fluid_arm_engine PUBLIC
${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(fluid_arm_engine pdserving extern_paddle configure)
target_link_libraries(fluid_arm_engine pdserving paddle_fluid -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
install(TARGETS fluid_arm_engine
ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pthread.h>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include "core/configure/include/configure_parser.h"
#include "core/configure/inferencer_configure.pb.h"
#include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
namespace fluid_arm {
class AutoLock {
public:
explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
pthread_mutex_lock(&mutex);
}
~AutoLock() { pthread_mutex_unlock(&_mut); }
private:
pthread_mutex_t& _mut;
};
class GlobalPaddleCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalPaddleCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::Tensor;
using paddle_infer::PrecisionType;
using paddle_infer::CreatePredictor;
// data interface
class FluidFamilyCore {
public:
virtual ~FluidFamilyCore() {}
virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames();
}
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name);
}
virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames();
}
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name);
}
virtual bool Run() {
if (!_core->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
virtual int create(const predictor::InferEngineCreationParams& params) = 0;
virtual int clone(void* origin_core) {
if (origin_core == NULL) {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
Predictor* p_predictor = (Predictor*)origin_core;
_core = p_predictor->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
return -1;
}
return 0;
}
virtual void* get() { return _core.get(); }
protected:
std::shared_ptr<Predictor> _core;
};
// infer interface
class FluidArmAnalysisCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetParamsFile(data_path + "/__params__");
config.SetProgFile(data_path + "/__model__");
config.DisableGpu();
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
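// Route execution through the Paddle-Lite engine (FP32 precision, zero-copy tensors enabled).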
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
}
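// The argument to EnableXpu is the L3 cache workspace size reserved on the Kunlun card.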
if (params.use_xpu()) {
config.EnableXpu(100);
}
config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidArmAnalysisDirCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetModel(data_path);
config.DisableGpu();
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
config.SwitchIrOptim(true);
} else {
config.SwitchIrOptim(false);
}
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
}
if (params.use_xpu()) {
config.EnableXpu(100);
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class Parameter {
public:
Parameter() : _row(0), _col(0), _params(NULL) {}
~Parameter() {
VLOG(2) << "before destroy Parameter, file_name[" << _file_name << "]";
destroy();
}
int init(int row, int col, const char* file_name) {
destroy();
_file_name = file_name;
_row = row;
_col = col;
_params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
if (_params == NULL) {
LOG(ERROR) << "Load " << _file_name << " malloc error.";
return -1;
}
VLOG(2) << "Load parameter file[" << _file_name << "] success.";
return 0;
}
void destroy() {
_row = 0;
_col = 0;
if (_params != NULL) {
free(_params);
_params = NULL;
}
}
int load() {
if (_params == NULL || _row <= 0 || _col <= 0) {
LOG(ERROR) << "load parameter error [not inited].";
return -1;
}
FILE* fs = fopen(_file_name.c_str(), "rb");
if (fs == NULL) {
LOG(ERROR) << "load " << _file_name << " fopen error.";
return -1;
}
static const uint32_t MODEL_FILE_HEAD_LEN = 16;
char head[MODEL_FILE_HEAD_LEN] = {0};
if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
destroy();
LOG(ERROR) << "Load " << _file_name << " read head error.";
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
uint32_t matrix_size = _row * _col;
if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
VLOG(2) << "load " << _file_name << " read ok.";
return 0;
} else {
LOG(ERROR) << "load " << _file_name << " read error.";
destroy();
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
return 0;
}
public:
std::string _file_name;
int _row;
int _col;
float* _params;
};
} // namespace fluid_arm
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h"
#include "core/predictor/framework/factory.h"
namespace baidu {
namespace paddle_serving {
namespace fluid_arm {
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidArmAnalysisCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_ARM_ANALYSIS");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidArmAnalysisDirCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_ARM_ANALYSIS_DIR");
} // namespace fluid_arm
} // namespace paddle_serving
} // namespace baidu
@@ -7,7 +7,7 @@ if (CLIENT)
endif()
if (SERVER)
if (NOT WITH_GPU AND NOT WITH_LITE)
file(INSTALL pipeline DESTINATION paddle_serving_server)
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
else()
@@ -34,7 +34,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in
endif()
if (SERVER)
if (NOT WITH_GPU AND NOT WITH_LITE)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
else()
@@ -72,7 +72,7 @@ add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINA
endif()
if (SERVER)
if(NOT WITH_GPU AND NOT WITH_LITE)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server/ ${PADDLE_SERVING_BINARY_DIR}/python/
@@ -90,6 +90,16 @@ if (SERVER)
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
elseif(WITH_LITE)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" arm
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
else()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
......
@@ -57,6 +57,8 @@ class LocalPredictor(object):
mem_optim=True,
ir_optim=False,
use_trt=False,
use_lite=False,
use_xpu=False,
use_feed_fetch_ops=False):
"""
Load model config and set the engine config for the paddle predictor
@@ -70,6 +72,8 @@ class LocalPredictor(object):
mem_optim: memory optimization, True default.
ir_optim: open calculation chart optimization, False default.
use_trt: use nvidia TensorRT optimization, False default
use_lite: use Paddle-Lite engine, False default
use_xpu: run predict on Baidu Kunlun, False default
use_feed_fetch_ops: use feed/fetch ops, False default.
"""
client_config = "{}/serving_server_conf.prototxt".format(model_path)
@@ -80,9 +84,9 @@ class LocalPredictor(object):
config = AnalysisConfig(model_path)
logger.info("load_model_config params: model_path:{}, use_gpu:{},\
gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
use_trt:{}, use_lite:{}, use_xpu: {}, use_feed_fetch_ops:{}".format(
model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
ir_optim, use_trt, use_lite, use_xpu, use_feed_fetch_ops))
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -119,6 +123,17 @@ class LocalPredictor(object):
use_static=False,
use_calib_mode=False)
if use_lite:
config.enable_lite_engine(
precision_mode = PrecisionType.Float32,
zero_copy = True,
passes_filter = [],
ops_filter = []
)
if use_xpu:
config.enable_xpu(100 * 1024 * 1024)
self.predictor = create_paddle_predictor(config)
def predict(self, feed=None, fetch=None, batch=False, log_id=0):
......
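For reference, a minimal sketch of how the new switches might be used from the local predictor API; the import path, model directory, feed name "x" and fetch name "price" below are illustrative assumptions, not part of this change.

# Hypothetical usage of LocalPredictor with the new use_lite / use_xpu flags.
from paddle_serving_app.local_predict import LocalPredictor
import numpy as np

predictor = LocalPredictor()
predictor.load_model_config(
    "./uci_housing_model",  # directory holding serving_server_conf.prototxt and the model
    use_gpu=False,
    use_lite=True,          # run inference through the Paddle-Lite engine
    use_xpu=True)           # and place it on a Baidu Kunlun (XPU) card
fetch_map = predictor.predict(
    feed={"x": np.random.rand(1, 13).astype("float32")}, fetch=["price"])
print(fetch_map)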
@@ -77,6 +77,10 @@ def serve_args():
help="Use Multi-language-service")
parser.add_argument(
"--use_trt", default=False, action="store_true", help="Use TensorRT")
parser.add_argument(
"--use_lite", default=False, action="store_true", help="Use PaddleLite")
parser.add_argument(
"--use_xpu", default=False, action="store_true", help="Use XPU")
parser.add_argument(
"--product_name",
type=str,
@@ -210,6 +214,8 @@ class Server(object):
self.use_local_bin = False
self.gpuid = 0
self.use_trt = False
self.use_lite = False
self.use_xpu = False
self.model_config_paths = None  # for multi-model in a workflow
self.product_name = None
self.container_id = None
@@ -279,6 +285,12 @@ class Server(object):
def set_trt(self):
self.use_trt = True
def set_lite(self):
self.use_lite = True
def set_xpu(self):
self.use_xpu = True
def _prepare_engine(self, model_config_paths, device):
if self.model_toolkit_conf == None:
self.model_toolkit_conf = server_sdk.ModelToolkitConf()
@@ -299,11 +311,17 @@ class Server(object):
engine.static_optimization = False
engine.force_update_static_cache = False
engine.use_trt = self.use_trt
engine.use_lite = self.use_lite
engine.use_xpu = self.use_xpu
if device == "cpu": if device == "cpu":
engine.type = "FLUID_CPU_ANALYSIS_DIR" engine.type = "FLUID_CPU_ANALYSIS_DIR"
elif device == "gpu": elif device == "gpu":
engine.type = "FLUID_GPU_ANALYSIS_DIR" engine.type = "FLUID_GPU_ANALYSIS_DIR"
elif device == "arm":
engine.type = "FLUID_ARM_ANALYSIS_DIR"
self.model_toolkit_conf.engines.extend([engine])
@@ -405,10 +423,12 @@ class Server(object):
for line in version_file.readlines():
if re.match("cuda_version", line):
cuda_version = line.split("\"")[1]
if cuda_version == "trt":
    device_version = "serving-gpu-" + cuda_version + "-"
elif cuda_version == "arm":
    device_version = "serving-" + cuda_version + "-"
else:
    device_version = "serving-gpu-cuda" + cuda_version + "-"
folder_name = device_version + serving_server_version
tar_name = folder_name + ".tar.gz"
@@ -507,36 +527,65 @@ class Server(object):
time.sleep(1)
else:
print("Use local bin : {}".format(self.bin_path))
#self.check_cuda()
if self.use_lite:
    command = "{} " \
              "-enable_model_toolkit " \
              "-inferservice_path {} " \
              "-inferservice_file {} " \
              "-max_concurrency {} " \
              "-num_threads {} " \
              "-port {} " \
              "-reload_interval_s {} " \
              "-resource_path {} " \
              "-resource_file {} " \
              "-workflow_path {} " \
              "-workflow_file {} " \
              "-bthread_concurrency {} " \
              "-max_body_size {} ".format(
                  self.bin_path,
                  self.workdir,
                  self.infer_service_fn,
                  self.max_concurrency,
                  self.num_threads,
                  self.port,
                  self.reload_interval_s,
                  self.workdir,
                  self.resource_fn,
                  self.workdir,
                  self.workflow_fn,
                  self.num_threads,
                  self.max_body_size)
else:
    command = "{} " \
              "-enable_model_toolkit " \
              "-inferservice_path {} " \
              "-inferservice_file {} " \
              "-max_concurrency {} " \
              "-num_threads {} " \
              "-port {} " \
              "-reload_interval_s {} " \
              "-resource_path {} " \
              "-resource_file {} " \
              "-workflow_path {} " \
              "-workflow_file {} " \
              "-bthread_concurrency {} " \
              "-gpuid {} " \
              "-max_body_size {} ".format(
                  self.bin_path,
                  self.workdir,
                  self.infer_service_fn,
                  self.max_concurrency,
                  self.num_threads,
                  self.port,
                  self.reload_interval_s,
                  self.workdir,
                  self.resource_fn,
                  self.workdir,
                  self.workflow_fn,
                  self.num_threads,
                  self.gpuid,
                  self.max_body_size)
print("Going to Run Comand") print("Going to Run Comand")
print(command) print(command)
......
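A rough sketch of driving the new Server switches programmatically; the model directory is an illustrative assumption, and this simply mirrors what serve.py does when --use_lite/--use_xpu are passed.

# Hypothetical sketch: build an ARM/XPU server by hand with the new setters.
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(op_maker.create('general_reader'))
op_seq_maker.add_op(op_maker.create('general_infer'))
op_seq_maker.add_op(op_maker.create('general_response'))

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_lite()   # engine.use_lite = True; with device "arm" the FLUID_ARM_ANALYSIS_DIR engine is chosen
server.set_xpu()    # engine.use_xpu = True, so the ARM engine calls EnableXpu on its config
server.load_model_config("./uci_housing_model")
server.prepare_server(workdir="workdir", port=9292, device="arm")
server.run_server()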
@@ -38,7 +38,9 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-miss
ir_optim = args.ir_optim
max_body_size = args.max_body_size
use_multilang = args.use_multilang
workdir = args.workdir
if gpuid >= 0:
workdir = "{}_{}".format(args.workdir, gpuid)
if model == "": if model == "":
print("You must specify your serving model") print("You must specify your serving model")
...@@ -67,6 +69,13 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss ...@@ -67,6 +69,13 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
if args.use_trt: if args.use_trt:
server.set_trt() server.set_trt()
if args.use_lite:
server.set_lite()
device = "arm"
if args.use_xpu:
server.set_xpu()
if args.product_name != None:
server.set_product_name(args.product_name)
if args.container_id != None:
@@ -95,7 +104,10 @@ def start_multi_card(args):  # pylint: disable=doc-string-missing
exit(-1)
else:
env_gpus = []
if args.use_lite:
print("run arm server.")
start_gpu_card_model(-1, -1, args)
elif len(gpus) <= 0:
print("gpu_ids not set, going to run cpu service.") print("gpu_ids not set, going to run cpu service.")
start_gpu_card_model(-1, -1, args) start_gpu_card_model(-1, -1, args)
else: else:
@@ -128,7 +140,8 @@ if __name__ == "__main__":
if len(gpu_ids) > 0:
web_service.set_gpus(gpu_ids)
web_service.prepare_server(
workdir=args.workdir, port=args.port, device=args.device,
use_lite=args.use_lite, use_xpu=args.use_xpu, ir_optim=args.ir_optim)
web_service.run_rpc_service()
app_instance = Flask(__name__)
......
@@ -83,10 +83,15 @@ class WebService(object):
gpuid=0,
thread_num=2,
mem_optim=True,
use_lite=False,
use_xpu=False,
ir_optim=False):
device = "gpu"
if gpuid == -1:
if use_lite:
device = "arm"
else:
device = "cpu"
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
@@ -103,6 +108,11 @@ class WebService(object):
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
if use_lite:
server.set_lite()
if use_xpu:
server.set_xpu()
server.load_model_config(self.model_config)
if gpuid >= 0:
server.set_gpuid(gpuid)
@@ -125,9 +135,11 @@ class WebService(object):
workdir="",
port=9393,
device="gpu",
use_lite=False,
use_xpu=False,
ir_optim=False,
gpuid=0,
mem_optim=True):
print("This API will be deprecated later. Please do not use it")
self.workdir = workdir
self.port = port
@@ -150,6 +162,8 @@ class WebService(object):
-1,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
else:
for i, gpuid in enumerate(self.gpus):
@@ -160,6 +174,8 @@ class WebService(object):
gpuid,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
def _launch_web_service(self):
......
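A hedged sketch of the WebService path with the new keyword arguments; the service name and model directory are illustrative. With no GPU ids configured, gpuid stays -1 and use_lite switches the internal device to "arm".

# Hypothetical sketch: WebService serving through Paddle-Lite on a Kunlun card.
from paddle_serving_server_gpu.web_service import WebService

service = WebService(name="uci")
service.load_model_config("./uci_housing_model")
service.prepare_server(
    workdir="workdir",
    port=9393,
    device="gpu",       # default_rpc_service falls back to "arm" when gpuid == -1 and use_lite is set
    use_lite=True,
    use_xpu=True,
    ir_optim=False)
service.run_rpc_service()
service.run_web_service()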
@@ -44,6 +44,8 @@ class LocalServiceHandler(object):
ir_optim=False,
available_port_generator=None,
use_trt=False,
use_lite=False,
use_xpu=False,
use_profile=False):
"""
Initialization of localservicehandler
@@ -60,6 +62,8 @@ class LocalServiceHandler(object):
ir_optim: use calculation chart optimization, False default.
available_port_generator: generate available ports
use_trt: use nvidia tensorRt engine, False default.
use_lite: use Paddle-Lite engine, False default.
use_xpu: run predict on Baidu Kunlun, False default.
use_profile: use profiling, False default.
Returns:
@@ -74,10 +78,16 @@ class LocalServiceHandler(object):
if devices == "":
# cpu
devices = [-1]
if use_lite:
self._device_type = "arm"
self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in arm device. Port({})"
.format(model_config, self._port_list))
else:
self._device_type = "cpu"
self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in cpu device. Port({})"
.format(model_config, self._port_list))
else:
# gpu
self._device_type = "gpu"
@@ -96,6 +106,8 @@ class LocalServiceHandler(object):
self._rpc_service_list = []
self._server_pros = []
self._use_trt = use_trt
self._use_lite = use_lite
self._use_xpu = use_xpu
self._use_profile = use_profile
self.fetch_names_ = fetch_names
@@ -138,8 +150,11 @@ class LocalServiceHandler(object):
if self._local_predictor_client is None:
self._local_predictor_client = LocalPredictor()
use_gpu = False
use_lite = False
if self._device_type == "gpu": if self._device_type == "gpu":
use_gpu = True use_gpu = True
elif self._device_type == "arm":
use_lite = True
self._local_predictor_client.load_model_config(
model_path=self._model_config,
use_gpu=use_gpu,
@@ -148,7 +163,9 @@ class LocalServiceHandler(object):
thread_num=self._thread_num,
mem_optim=self._mem_optim,
ir_optim=self._ir_optim,
use_trt=self._use_trt,
use_lite=use_lite,
use_xpu=self._use_xpu)
return self._local_predictor_client
def get_client_config(self):
@@ -185,7 +202,7 @@ class LocalServiceHandler(object):
server = Server()
else:
#gpu or arm
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
......
@@ -32,7 +32,7 @@ if '${PACK}' == 'ON':
REQUIRED_PACKAGES = [
'six >= 1.10.0', 'sentencepiece', 'opencv-python', 'pillow',
'pyclipper'
]
......