Commit b0953a3f authored by ShiningZhang

support ascend_cl arm64 with lite for ascend310

Parent 6da3f883
@@ -60,6 +60,7 @@ option(WITH_TRT "Compile Paddle Serving with TRT"
option(PADDLE_ON_INFERENCE "Compile for encryption" ON)
option(WITH_OPENCV "Compile Paddle Serving with OPENCV" OFF)
option(WITH_ROCM "Compile Paddle Serving with ROCM" OFF)
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" OFF)
if(NOT DEFINED VERSION_TAG)
set(VERSION_TAG "0.0.0")
......
@@ -62,6 +62,13 @@ elseif (WITH_LITE)
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
SET(PADDLE_LIB_VERSION "arm64_gcc7.3_openblas")
endif()
elseif (WITH_ASCEND_CL)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
MESSAGE("paddle lite lib is unknown.")
SET(PADDLE_LIB_VERSION "paddle-lite-unknown")
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/ASCEND/arm64_gcc7.5_openblas_lite2.10")
endif()
else()
MESSAGE("paddle lite lib is unknown.")
SET(PADDLE_LIB_VERSION "paddle-lite-unknown")
@@ -81,7 +88,11 @@ else()
endif()
if(WITH_LITE)
if (WITH_XPU)
SET(PADDLE_LIB_PATH "https://paddle-inference-lib.bj.bcebos.com/2.2.0-rc0/cxx_c/Linux/XPU/${PADDLE_LIB_VERSION}/paddle_inference_install_dir.tar.gz ") SET(PADDLE_LIB_PATH "https://paddle-inference-lib.bj.bcebos.com/2.2.0-rc0/cxx_c/Linux/XPU/${PADDLE_LIB_VERSION}/paddle_inference_install_dir.tar.gz ")
elseif (WITH_ASCEND_CL)
SET(PADDLE_LIB_PATH "http://paddle-serving.bj.bcebos.com/inferlib/${PADDLE_LIB_VERSION}/paddle_inference_install_dir.tgz ")
endif()
else()
SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
endif()
@@ -152,6 +163,9 @@ endif()
ADD_LIBRARY(paddle_inference STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a)
if (WITH_ASCEND_CL AND WITH_LITE)
SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
endif()
if (WITH_TRT)
ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
......
@@ -47,6 +47,7 @@ message EngineDesc {
optional bool combined_model = 18;
optional bool encrypted_model = 19;
optional bool gpu_multi_stream = 20;
optional bool use_ascend_cl = 21;
/*
* "runtime_thread_num": n == 0 means don`t use Asynchronous task scheduling
......
@@ -18,7 +18,7 @@ add_executable(simple_client example/simple_client.cpp)
add_dependencies(simple_client utils sdk-cpp client)
target_link_libraries(simple_client -Wl,--whole-archive
-Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
-Wl,--no-whole-archive -Wl,--start-group -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
target_link_libraries(simple_client utils)
target_link_libraries(simple_client sdk-cpp)
......
@@ -41,6 +41,9 @@ using paddle_infer::CreatePredictor;
DECLARE_int32(gpuid);
DECLARE_string(precision);
DECLARE_bool(use_calib);
DECLARE_string(nnadapter_device_names);
DECLARE_string(nnadapter_context_properties);
DECLARE_string(nnadapter_model_cache_dir);
static const int max_batch = 32;
static const int min_subgraph_size = 3;
@@ -237,6 +240,7 @@ class PaddleInferenceEngine : public EngineCore {
if (engine_conf.has_use_lite() && engine_conf.use_lite()) {
config.EnableLiteEngine(precision_type, true);
config.SwitchIrOptim(true);
}
if ((!engine_conf.has_use_lite() && !engine_conf.has_use_gpu()) ||
@@ -269,6 +273,29 @@ class PaddleInferenceEngine : public EngineCore {
config.SetXpuDeviceId(gpu_id);
}
if (engine_conf.has_use_ascend_cl() &&
engine_conf.use_ascend_cl()) {
if (engine_conf.has_use_lite() && engine_conf.use_lite()) {
FLAGS_nnadapter_device_names = "huawei_ascend_npu";
FLAGS_nnadapter_context_properties =
"HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS=" +
std::to_string(gpu_id);
FLAGS_nnadapter_model_cache_dir = "";
config.NNAdapter()
.Enable()
.SetDeviceNames({FLAGS_nnadapter_device_names})
.SetContextProperties(FLAGS_nnadapter_context_properties)
.SetModelCacheDir(FLAGS_nnadapter_model_cache_dir);
LOG(INFO) << "Enable Lite NNAdapter for Ascend,"
<< "nnadapter_device_names="
<< FLAGS_nnadapter_device_names
<< ",nnadapter_context_properties="
<< FLAGS_nnadapter_context_properties
<< ",nnadapter_model_cache_dir="
<< FLAGS_nnadapter_model_cache_dir;
}
}
if (engine_conf.has_enable_memory_optimization() &&
engine_conf.enable_memory_optimization()) {
config.EnableMemoryOptim();
......
@@ -22,6 +22,11 @@ namespace inference {
DEFINE_int32(gpuid, 0, "GPU device id to use");
DEFINE_string(precision, "fp32", "precision to deploy, default is fp32");
DEFINE_bool(use_calib, false, "calibration mode, default is false");
DEFINE_string(nnadapter_device_names, "", "Names of nnadapter device");
DEFINE_string(nnadapter_context_properties,
"",
"Properties of nnadapter context");
DEFINE_string(nnadapter_model_cache_dir, "", "Cache dir of nnadapter model");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<PaddleInferenceEngine>,
......
@@ -14,6 +14,8 @@ if (SERVER)
set(SERVER_PACKAGE_NAME "paddle-serving-server-xpu")
elseif(WITH_ROCM)
set(SERVER_PACKAGE_NAME "paddle-serving-server-rocm")
elseif(WITH_ASCEND_CL)
set(SERVER_PACKAGE_NAME "paddle-serving-server-npu")
endif() endif()
file(INSTALL pipeline DESTINATION paddle_serving_server)
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
......
@@ -45,6 +45,8 @@ elif package_name.endswith('xpu'):
update_info("paddle_serving_server/version.py", "device_type", "2")
elif package_name.endswith('rocm'):
update_info("paddle_serving_server/version.py", "device_type", "3")
elif package_name.endswith('npu'):
update_info("paddle_serving_server/version.py", "device_type", "4")
path = "paddle_serving_" + sys.argv[1] path = "paddle_serving_" + sys.argv[1]
commit_id = subprocess.check_output(['git', 'rev-parse', 'HEAD']) commit_id = subprocess.check_output(['git', 'rev-parse', 'HEAD'])
......
@@ -86,7 +86,8 @@ class LocalPredictor(object):
mkldnn_cache_capacity=0,
mkldnn_op_list=None,
mkldnn_bf16_op_list=None,
use_feed_fetch_ops=False):
use_feed_fetch_ops=False,
use_ascend_cl=False):
""" """
Load model configs and create the paddle predictor by Paddle Inference API. Load model configs and create the paddle predictor by Paddle Inference API.
...@@ -108,6 +109,7 @@ class LocalPredictor(object): ...@@ -108,6 +109,7 @@ class LocalPredictor(object):
mkldnn_op_list: op list accelerated using MKLDNN, None default. mkldnn_op_list: op list accelerated using MKLDNN, None default.
mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default. mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default.
use_feed_fetch_ops: use feed/fetch ops, False default. use_feed_fetch_ops: use feed/fetch ops, False default.
use_ascend_cl: run predict on Huawei Ascend, False default
""" """
gpu_id = int(gpu_id) gpu_id = int(gpu_id)
client_config = "{}/serving_server_conf.prototxt".format(model_path) client_config = "{}/serving_server_conf.prototxt".format(model_path)
...@@ -146,11 +148,12 @@ class LocalPredictor(object): ...@@ -146,11 +148,12 @@ class LocalPredictor(object):
"gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{}, " "gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, precision:{}, use_calib:{}, " "use_trt:{}, use_lite:{}, use_xpu:{}, precision:{}, use_calib:{}, "
"use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, " "use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, ".format( "mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, "
"use_ascend_cl:{} ".format(
model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
ir_optim, use_trt, use_lite, use_xpu, precision, use_calib,
use_mkldnn, mkldnn_cache_capacity, mkldnn_op_list,
mkldnn_bf16_op_list, use_feed_fetch_ops))
mkldnn_bf16_op_list, use_feed_fetch_ops, use_ascend_cl))
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -215,11 +218,24 @@ class LocalPredictor(object):
zero_copy=True,
passes_filter=[],
ops_filter=[])
config.switch_ir_optim(True)
# set xpu
if use_xpu:
# 2MB l3 cache
config.enable_xpu(8 * 1024 * 1024)
config.set_xpu_device_id(gpu_id)
# set ascend cl
if use_ascend_cl:
if use_lite:
nnadapter_device_names = "huawei_ascend_npu"
nnadapter_context_properties = \
"HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={}".format(gpu_id)
nnadapter_model_cache_dir = ""
config.nnadapter() \
.enable() \
.set_device_names([nnadapter_device_names]) \
.set_context_properties(nnadapter_context_properties) \
.set_model_cache_dir(nnadapter_model_cache_dir)
# set cpu low precision
if not use_gpu and not use_lite:
if precision_type == paddle_infer.PrecisionType.Int8:
......
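The LocalPredictor change above simply forwards the new use_ascend_cl flag into Lite's NNAdapter. A minimal usage sketch, not part of this commit: the model directory and feed/fetch names are placeholders, and the import path assumes the usual paddle_serving_app package layout.

```python
import numpy as np
from paddle_serving_app.local_predict import LocalPredictor

predictor = LocalPredictor()
predictor.load_model_config(
    "serving_server",     # hypothetical model directory
    use_gpu=False,
    gpu_id=0,             # reused as the Ascend device id, per the change above
    use_lite=True,        # NNAdapter is only enabled when Lite is also on
    use_ascend_cl=True)

# Feed/fetch names are model-specific; "x" and "price" are placeholders.
result = predictor.predict(
    feed={"x": np.random.rand(1, 13).astype("float32")},
    fetch=["price"],
    batch=True)
print(result)
```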
@@ -181,6 +181,8 @@ def serve_args():
"--use_lite", default=False, action="store_true", help="Use PaddleLite")
parser.add_argument(
"--use_xpu", default=False, action="store_true", help="Use XPU")
parser.add_argument(
"--use_ascend_cl", default=False, action="store_true", help="Use Ascend CL")
parser.add_argument(
"--product_name",
type=str,
@@ -272,13 +274,15 @@ def start_gpu_card_model(gpu_mode, port, args): # pylint: disable=doc-string-mi
server.set_device(device)
if args.use_xpu:
server.set_xpu()
if args.use_ascend_cl:
server.set_ascend_cl()
if args.product_name != None:
server.set_product_name(args.product_name)
if args.container_id != None:
server.set_container_id(args.container_id)
if gpu_mode == True:
if gpu_mode == True or args.use_xpu or args.use_ascend_cl:
server.set_gpuid(args.gpu_ids)
server.load_model_config(model)
server.prepare_server(
......
@@ -88,6 +88,7 @@ class Server(object):
self.gpu_multi_stream = False
self.use_lite = False
self.use_xpu = False
self.use_ascend_cl = False
self.model_config_paths = collections.OrderedDict()
self.product_name = None
self.container_id = None
@@ -189,6 +190,9 @@ class Server(object):
def set_xpu(self):
self.use_xpu = True
def set_ascend_cl(self):
self.use_ascend_cl = True
def _prepare_engine(self, model_config_paths, device, use_encryption_model):
self.device = device
if self.model_toolkit_conf == None:
@@ -202,6 +206,8 @@ class Server(object):
if self.device == "gpu" or self.use_trt or self.gpu_multi_stream:
self.gpuid = ["0"]
self.device = "gpu"
elif self.use_xpu or self.use_ascend_cl:
self.gpuid = ["0"]
else:
self.gpuid = ["-1"]
@@ -238,6 +244,7 @@ class Server(object):
engine.gpu_multi_stream = self.gpu_multi_stream
engine.use_lite = self.use_lite
engine.use_xpu = self.use_xpu
engine.use_ascend_cl = self.use_ascend_cl
engine.use_gpu = False
if len(self.gpuid) == 0:
@@ -437,6 +444,11 @@ class Server(object):
device_version = "xpu-" + platform.machine()
elif device_type == "3":
device_version = "rocm-" + platform.machine()
elif device_type == "4":
if self.use_lite:
device_version = "ascendcl-lite-" + platform.machine()
else:
device_version = "ascendcl-" + platform.machine()
return device_version
def download_bin(self):
......
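For completeness, a rough sketch of driving the new set_ascend_cl() switch from the Python Server API; the op names and import follow the project's standard examples, while the model directory, port, and device id are placeholders. The equivalent CLI path added in serve.py is roughly `python -m paddle_serving_server.serve --model serving_server --port 9393 --use_lite --use_ascend_cl --gpu_ids 0`.

```python
from paddle_serving_server import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(op_maker.create('general_reader'))
op_seq_maker.add_op(op_maker.create('general_infer'))
op_seq_maker.add_op(op_maker.create('general_response'))

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_lite()          # Ascend CL goes through the Lite/NNAdapter path
server.set_ascend_cl()     # new switch added by this commit
server.set_gpuid(["0"])    # interpreted as the Ascend device id here
server.load_model_config("serving_server")   # hypothetical model directory
server.prepare_server(workdir="workdir", port=9393, device="arm")
server.run_server()
```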
@@ -86,6 +86,7 @@ class LocalServiceHandler(object):
self._use_trt = False
self._use_lite = False
self._use_xpu = False
self._use_ascend_cl = False
self._use_mkldnn = False
self._mkldnn_cache_capacity = 0
self._mkldnn_op_list = None
@@ -129,6 +130,12 @@ class LocalServiceHandler(object):
devices = [int(x) for x in devices.split(",")]
self._use_lite = True
self._use_xpu = True
elif device_type == 5:
# Ascend 310 ARM CPU
self._device_name = "arm"
devices = [int(x) for x in devices.split(",")]
self._use_lite = True
self._use_ascend_cl = True
else:
_LOGGER.error(
"LocalServiceHandler initialization fail. device_type={}"
@@ -163,13 +170,14 @@ class LocalServiceHandler(object):
"mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
"client_type:{}, fetch_names:{}, precision:{}, use_mkldnn:{}, "
"mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}".format(
"mkldnn_bf16_op_list:{}, use_ascend_cl:{}".format(
model_config, self._device_name, self._use_gpu, self._use_trt,
self._use_lite, self._use_xpu, device_type, self._devices,
self._mem_optim, self._ir_optim, self._use_profile,
self._thread_num, self._client_type, self._fetch_names,
self._precision, self._use_mkldnn, self._mkldnn_cache_capacity,
self._mkldnn_op_list, self._mkldnn_bf16_op_list))
self._mkldnn_op_list, self._mkldnn_bf16_op_list,
self._use_ascend_cl))
def get_fetch_list(self):
return self._fetch_names
@@ -225,7 +233,8 @@ class LocalServiceHandler(object):
use_mkldnn=self._use_mkldnn,
mkldnn_cache_capacity=self._mkldnn_cache_capacity,
mkldnn_op_list=self._mkldnn_op_list,
mkldnn_bf16_op_list=self._mkldnn_bf16_op_list)
mkldnn_bf16_op_list=self._mkldnn_bf16_op_list,
use_ascend_cl=self._use_ascend_cl)
return self._local_predictor_client
def get_client_config(self):
@@ -284,6 +293,8 @@ class LocalServiceHandler(object):
server.set_xpu()
if self._use_lite:
server.set_lite()
if self._use_ascend_cl:
server.set_ascend_cl()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
......
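Finally, a hedged sketch of the new pipeline path: device_type=5 selects arm CPU + Lite + Ascend CL in LocalServiceHandler, which then builds a LocalPredictor with use_ascend_cl=True. The constructor keywords and module path below are inferred from this diff and from the pipeline install step above, and may differ slightly between versions.

```python
from paddle_serving_server.pipeline.local_service_handler import LocalServiceHandler

handler = LocalServiceHandler(
    model_config="serving_server",   # hypothetical model directory
    client_type="local_predictor",
    device_type=5,                   # 5 -> arm CPU + Lite + Ascend CL (this commit)
    devices="0",                     # Ascend 310 device id(s), comma separated
    fetch_names=["price"])           # placeholder fetch variable

# The pipeline op normally calls this internally; shown here only to illustrate
# that it returns a LocalPredictor configured with use_ascend_cl=True.
predictor = handler.get_client(0)    # assumed signature
```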