diff --git a/CMakeLists.txt b/CMakeLists.txt
index 11fd0f09a6aa5adfa5971580ba8babe55d12d550..ad1f75f792f9cd3a4963a95f4727e2ffe8051ff5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,6 +59,8 @@ option(PACK "Compile for whl"
 option(WITH_TRT "Compile Paddle Serving with TRT" OFF)
 option(PADDLE_ON_INFERENCE "Compile for encryption" ON)
 option(WITH_OPENCV "Compile Paddle Serving with OPENCV" OFF)
+option(WITH_ROCM "Compile Paddle Serving with ROCM" OFF)
+option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" OFF)
 
 if(NOT DEFINED VERSION_TAG)
     set(VERSION_TAG "0.0.0")
@@ -163,6 +165,14 @@ if(SERVER)
     endif()
     list(APPEND EXTERNAL_LIBS paddlepaddle)
+    if(WITH_ROCM)
+        include(hip)
+    endif()
+
+    if(WITH_ASCEND_CL)
+        include(external/ascend)
+        list(APPEND EXTERNAL_LIBS ascend ascend_cl)
+    endif()
 endif()
 
diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..414b2a54be0342b3ef76d5e3a553577cb5f3e4be
--- /dev/null
+++ b/cmake/external/ascend.cmake
@@ -0,0 +1,89 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#NOTE: Logic is from
+# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt
+if(DEFINED ENV{ASCEND_CUSTOM_PATH})
+  set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
+else()
+  set(ASCEND_DIR /usr/local/Ascend)
+endif()
+
+if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
+  # It means CANN 20.2 +
+  add_definitions(-DPADDLE_WITH_ASCEND_STRING)
+endif()
+
+
+if(WITH_ASCEND OR WITH_ASCEND_CL)
+  set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
+  set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
+  set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
+  set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
+  set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
+  set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
+  set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
+
+  set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
+  set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
+  set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
+  set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
+  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
+  set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
+
+  set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
+  set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
+  set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
+  INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
+
+
+  ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
+
+  ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
+
+  ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
+
+  add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
+endif()
+
+if(WITH_ASCEND_CL)
+  set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
+
+  set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
+  set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
+  set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
+  set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
+
+  message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
+  message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
+  INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
+  INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
+
+  ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
+
+  ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
+
+  ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
+  add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
+
+endif()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9fbf667c3a2db4e609c8cc10e393b982e75e2c78
--- /dev/null
+++ b/cmake/hip.cmake
@@ -0,0 +1,89 @@
+if(NOT WITH_ROCM)
+  return()
+endif()
+
+if(NOT DEFINED ENV{ROCM_PATH})
+  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
+  set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
+  set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed")
+else()
+  set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
+  set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
+  set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed")
+endif()
+set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
+
+find_package(HIP REQUIRED)
+include_directories(${ROCM_PATH}/include)
+message(STATUS "HIP version: ${HIP_VERSION}")
+message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}")
+
+macro(find_package_and_include PACKAGE_NAME)
+  find_package("${PACKAGE_NAME}" REQUIRED)
+  include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include")
+  message(STATUS "${PACKAGE_NAME} version: ${${PACKAGE_NAME}_VERSION}")
+endmacro()
+
+find_package_and_include(miopen)
+find_package_and_include(rocblas)
+find_package_and_include(hiprand)
+find_package_and_include(rocrand)
+find_package_and_include(rccl)
+find_package_and_include(rocthrust)
+find_package_and_include(hipcub)
+find_package_and_include(rocprim)
+find_package_and_include(hipsparse)
+find_package_and_include(rocsparse)
+find_package_and_include(rocfft)
+
+# set CXX flags for HIP
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
+set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP)
+
+# define HIP_CXX_FLAGS
+list(APPEND HIP_CXX_FLAGS -fPIC)
+list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1)
+# Note(qili93): HIP has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer
+list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
+list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined)
+list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override)
+list(APPEND HIP_CXX_FLAGS -Wno-exceptions)
+list(APPEND HIP_CXX_FLAGS -Wno-shift-count-negative)
+list(APPEND HIP_CXX_FLAGS -Wno-shift-count-overflow)
+list(APPEND HIP_CXX_FLAGS -Wno-unused-command-line-argument)
+list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier)
+list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion)
+list(APPEND HIP_CXX_FLAGS -Wno-pass-failed)
+list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
+list(APPEND HIP_CXX_FLAGS -std=c++14)
+
+if(CMAKE_BUILD_TYPE MATCHES Debug)
+  list(APPEND HIP_CXX_FLAGS -g2)
+  list(APPEND HIP_CXX_FLAGS -O0)
+  list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling)
+endif(CMAKE_BUILD_TYPE MATCHES Debug)
+
+set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS})
+set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
+# Ask hcc to generate device code during compilation so we can use
+# host linker to link.
+list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
+list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
+list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
+list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
+
+
+if(HIP_COMPILER STREQUAL clang)
+  set(hip_library_name amdhip64)
+else()
+  set(hip_library_name hip_hcc)
+endif()
+message(STATUS "HIP library name: ${hip_library_name}")
+
+# set HIP link libs
+find_library(ROCM_HIPRTC_LIB ${hip_library_name} HINTS ${HIP_PATH}/lib)
+message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}")
+
+#include(thrust)
diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake
index 6c0e3261e361bbf8d3208fea21eb01ba57381719..3119c62f85a9f3616136fd0448811d76163f1195 100644
--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -62,10 +62,27 @@ elseif (WITH_LITE)
         elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
             SET(PADDLE_LIB_VERSION "arm64_gcc7.3_openblas")
         endif()
+    elseif (WITH_ASCEND_CL)
+        if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+            MESSAGE("paddle lite lib is unknown.")
+            SET(PADDLE_LIB_VERSION "paddle-lite-unknown")
+        elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+            SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/ASCEND/arm64_gcc7.5_openblas_lite2.10")
+        endif()
     else()
         MESSAGE("paddle lite lib is unknown.")
         SET(PADDLE_LIB_VERSION "paddle-lite-unknown")
     endif()
+elseif (WITH_ROCM)
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/rocm")
+elseif (WITH_ASCEND_CL)
+    message("cpu arch: ${CMAKE_SYSTEM_PROCESSOR}")
+    if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+        MESSAGE("paddle lib is unknown.")
+        SET(PADDLE_LIB_VERSION "paddle-unknown")
+    elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/ASCEND/arm64_gcc8.2_openblas")
+    endif()
 else()
     if (WITH_AVX)
         if (WITH_MKLML)
@@ -79,7 +96,13 @@ else()
 endif()
 
 if(WITH_LITE)
-    SET(PADDLE_LIB_PATH "https://paddle-inference-lib.bj.bcebos.com/2.2.0-rc0/cxx_c/Linux/XPU/${PADDLE_LIB_VERSION}/paddle_inference_install_dir.tar.gz ")
+    if (WITH_XPU)
+        SET(PADDLE_LIB_PATH "https://paddle-inference-lib.bj.bcebos.com/2.2.0-rc0/cxx_c/Linux/XPU/${PADDLE_LIB_VERSION}/paddle_inference_install_dir.tar.gz ")
+    elseif (WITH_ASCEND_CL)
+        SET(PADDLE_LIB_PATH "http://paddle-serving.bj.bcebos.com/inferlib/${PADDLE_LIB_VERSION}/paddle_inference_install_dir.tgz ")
+    endif()
+elseif(WITH_ASCEND_CL)
+    SET(PADDLE_LIB_PATH "http://paddle-serving.bj.bcebos.com/inferlib/${PADDLE_LIB_VERSION}/paddle_inference.tgz ")
 else()
     SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
 endif()
@@ -150,6 +173,9 @@ endif()
 
 ADD_LIBRARY(paddle_inference STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a)
+if (WITH_ASCEND_CL)
+    SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
+endif()
 
 if (WITH_TRT)
     ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto
index 13b9d39553b9219f0ab7f494f58ab0b7cfd3b7e8..c974f010737a8836d5de83d737ee0f9b9519462f 100755
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -47,6 +47,7 @@ message EngineDesc {
   optional bool combined_model = 18;
   optional bool encrypted_model = 19;
   optional bool gpu_multi_stream = 20;
+  optional bool use_ascend_cl = 21;
 
   /*
    * "runtime_thread_num": n == 0 means don`t use Asynchronous task scheduling
diff --git a/core/general-client/CMakeLists.txt b/core/general-client/CMakeLists.txt
index 0a7f2ee4b2899a1e6c6b4557dc26f767efe842e1..21355e4784c2888880b81c61d300958a45dc48a3 100644
--- a/core/general-client/CMakeLists.txt
+++ b/core/general-client/CMakeLists.txt
@@ -18,7 +18,7 @@ add_executable(simple_client example/simple_client.cpp)
 add_dependencies(simple_client utils sdk-cpp client)
 
 target_link_libraries(simple_client -Wl,--whole-archive
-        -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
+        -Wl,--no-whole-archive -Wl,--start-group -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
 
 target_link_libraries(simple_client utils)
 target_link_libraries(simple_client sdk-cpp)
diff --git a/core/general-server/CMakeLists.txt b/core/general-server/CMakeLists.txt
index 7875d42dc848a29b455a5c0681ab3ba60c741791..334b31263dd448ba3475a4105446e5b04ab1ff8d 100644
--- a/core/general-server/CMakeLists.txt
+++ b/core/general-server/CMakeLists.txt
@@ -31,6 +31,10 @@ target_link_libraries(serving pdserving)
 target_link_libraries(serving cube-api)
 target_link_libraries(serving utils)
 target_link_libraries(serving utf8proc)
+if(WITH_ASCEND_CL)
+    target_link_libraries(serving ascendcl acl_op_compiler)
+endif()
+
 
 if(WITH_GPU)
     target_link_libraries(serving ${CUDA_LIBRARIES})
@@ -63,3 +67,7 @@ install(FILES
         DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/demo/serving/bin)
 endif()
+
+if (WITH_ROCM)
+    target_link_libraries(serving ${ROCM_HIPRTC_LIB})
+endif()
diff --git a/doc/Serving_Configure_CN.md b/doc/Serving_Configure_CN.md
index 84ea0cb55dbf92055707d2c64ec2446120f9878b..e613293ad2a4c5ff00fe72f4b3571923f1ceb19b 100644
--- a/doc/Serving_Configure_CN.md
+++ b/doc/Serving_Configure_CN.md
@@ -98,6 +98,7 @@ workdir_9393
 | `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 |
 | `use_calib` | bool | False | Use TRT int8 calibration |
 | `gpu_multi_stream` | bool | False | EnableGpuMultiStream to get larger QPS |
+| `use_ascend_cl` | bool | False | Enable for ascend910; Use with use_lite for ascend310 |
 
 #### 当您的某个模型想使用多张GPU卡部署时.
 ```BASH
@@ -249,6 +250,7 @@ engines {
   use_gpu: false
   combined_model: false
   gpu_multi_stream: false
+  use_ascend_cl: false
   runtime_thread_num: 0
   batch_infer_size: 32
   enable_overrun: false
@@ -286,6 +288,7 @@ gpu_ids: 2
 - use_gpu:是否使用GPU
 - combined_model: 是否使用组合模型文件
 - gpu_multi_stream: 是否开启gpu多流模式
+- use_ascend_cl: 是否使用昇腾,单独开启适配昇腾910,同时开启lite适配310
 - runtime_thread_num: 若大于0, 则启用Async异步模式,并创建对应数量的predictor实例。
 - batch_infer_size: Async异步模式下的最大batch数
 - enable_overrun: Async异步模式下总是将整个任务放入任务队列
@@ -357,7 +360,7 @@ op:
 
       #Fetch结果列表,以client_config中fetch_var的alias_name为准
       fetch_list: ["concat_1.tmp_0"]
 
-      # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+      # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu, 5=arm ascend310, 6=arm ascend910
       device_type: 0
 
       #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
@@ -395,7 +398,7 @@ op:
 
       #Fetch结果列表,以client_config中fetch_var的alias_name为准
       fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
 
-      # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+      # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu, 5=arm ascend310, 6=arm ascend910
       device_type: 0
 
       #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
@@ -434,10 +437,12 @@ Python Pipeline除了支持CPU、GPU之外,还支持多种异构硬件部署
 - TensorRT : 2
 - CPU(Arm) : 3
 - XPU : 4
+- Ascend310(Arm) : 5
+- Ascend910(Arm) : 6
 
 config.yml中硬件配置:
 ```YAML
-#计算硬件类型: 空缺时由devices决定(CPU/GPU),0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+#计算硬件类型: 空缺时由devices决定(CPU/GPU),0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu, 5=arm ascend310, 6=arm ascend910
 device_type: 0
 #计算硬件ID,优先由device_type决定硬件类型。devices为""或空缺时为CPU预测;当为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
 devices: "" # "0,1"
diff --git a/doc/Serving_Configure_EN.md b/doc/Serving_Configure_EN.md
index 2c4be74a3e19affca345fddb06de11f939754e9f..acb4f99aeaf98418ba99f67f6491e513555347a5 100644
--- a/doc/Serving_Configure_EN.md
+++ b/doc/Serving_Configure_EN.md
@@ -98,6 +98,7 @@ More flags:
 | `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 |
 | `use_calib` | bool | False | Use TRT int8 calibration |
 | `gpu_multi_stream` | bool | False | EnableGpuMultiStream to get larger QPS |
+| `use_ascend_cl` | bool | False | Enable for ascend910; Use with use_lite for ascend310 |
 
 #### Serving model with multiple gpus.
 ```BASH
@@ -258,6 +259,7 @@ engines {
   use_gpu: false
   combined_model: false
   gpu_multi_stream: false
+  use_ascend_cl: false
   runtime_thread_num: 0
   batch_infer_size: 32
   enable_overrun: false
@@ -293,6 +295,7 @@ gpu_ids: 2
 - use_gpu: Enbale GPU.
 - combined_model: Enable combined model.
 - gpu_multi_stream: Enable gpu multiple stream mode.
+- use_ascend_cl: Enable Ascend, use individually for ascend910, use with lite for ascend310
 - runtime_thread_num: Enable Async mode when num greater than 0 and creating predictors.
 - batch_infer_size: The max batch size of Async mode.
 - enable_overrun: Enable over running of Async mode which means putting the whole task into the task queue.
@@ -380,7 +383,7 @@ op:
 
       #Fetch data list
       fetch_list: ["concat_1.tmp_0"]
 
-      # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+      # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu, 5=arm ascend310, 6=arm ascend910
       device_type: 0
 
       #Device ID
@@ -418,7 +421,7 @@ op:
 
       #Fetch data list
       fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
 
-      # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+      # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu, 5=arm ascend310, 6=arm ascend910
       device_type: 0
 
       #Device ID
@@ -459,10 +462,12 @@ In addition to supporting CPU and GPU, Pipeline also supports the deployment of
 - TensorRT : 2
 - CPU(Arm) : 3
 - XPU : 4
+- Ascend310(Arm) : 5
+- Ascend910(Arm) : 6
 
 Reference config.yaml:
 ```YAML
-# device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+# device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu, 5=arm ascend310, 6=arm ascend910
 device_type: 0
 devices: "" # "0,1"
 ```
diff --git a/paddle_inference/paddle/include/paddle_engine.h b/paddle_inference/paddle/include/paddle_engine.h
index c76147b6842b9f01b3b4f65785102766d3940aef..952056102d9ba4d14c41445a9a90fedfff984eba 100644
--- a/paddle_inference/paddle/include/paddle_engine.h
+++ b/paddle_inference/paddle/include/paddle_engine.h
@@ -41,6 +41,9 @@ using paddle_infer::CreatePredictor;
 DECLARE_int32(gpuid);
 DECLARE_string(precision);
 DECLARE_bool(use_calib);
+DECLARE_string(nnadapter_device_names);
+DECLARE_string(nnadapter_context_properties);
+DECLARE_string(nnadapter_model_cache_dir);
 
 static const int max_batch = 32;
 static const int min_subgraph_size = 3;
@@ -237,6 +240,7 @@
     if (engine_conf.has_use_lite() && engine_conf.use_lite()) {
       config.EnableLiteEngine(precision_type, true);
+      config.SwitchIrOptim(true);
     }
 
     if ((!engine_conf.has_use_lite() && !engine_conf.has_use_gpu()) ||
@@ -269,6 +273,33 @@
       config.SetXpuDeviceId(gpu_id);
     }
 
+    if (engine_conf.has_use_ascend_cl() &&
+        engine_conf.use_ascend_cl()) {
+      if (engine_conf.has_use_lite() && engine_conf.use_lite()) {
+        // for ascend 310
+        FLAGS_nnadapter_device_names = "huawei_ascend_npu";
+        FLAGS_nnadapter_context_properties =
+            "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS=" +
+            std::to_string(gpu_id);
+        FLAGS_nnadapter_model_cache_dir = "";
+        config.NNAdapter()
+            .Enable()
+            .SetDeviceNames({FLAGS_nnadapter_device_names})
+            .SetContextProperties(FLAGS_nnadapter_context_properties)
+            .SetModelCacheDir(FLAGS_nnadapter_model_cache_dir);
+        LOG(INFO) << "Enable Lite NNAdapter for Ascend,"
+                  << "nnadapter_device_names="
+                  << FLAGS_nnadapter_device_names
+                  << ",nnadapter_context_properties="
+                  << FLAGS_nnadapter_context_properties
+                  << ",nnadapter_model_cache_dir="
+                  << FLAGS_nnadapter_model_cache_dir;
+      } else {
+        // for ascend 910
+        config.EnableNpu(gpu_id);
+      }
+    }
+
     if (engine_conf.has_enable_memory_optimization() &&
         engine_conf.enable_memory_optimization()) {
       config.EnableMemoryOptim();
diff --git a/paddle_inference/paddle/src/paddle_engine.cpp b/paddle_inference/paddle/src/paddle_engine.cpp
index b6da2a5a0eeb31473e2eba5b1a5b58855dbb03c6..dc6ffd8181c3b0a402ca9a9e9c42a2bc1133f3de 100644
--- a/paddle_inference/paddle/src/paddle_engine.cpp
+++ b/paddle_inference/paddle/src/paddle_engine.cpp
@@ -22,6 +22,11 @@ namespace inference {
 
 DEFINE_int32(gpuid, 0, "GPU device id to use");
 DEFINE_string(precision, "fp32", "precision to deploy, default is fp32");
 DEFINE_bool(use_calib, false, "calibration mode, default is false");
+DEFINE_string(nnadapter_device_names, "", "Names of nnadapter device");
+DEFINE_string(nnadapter_context_properties,
+              "",
+              "Properties of nnadapter context");
+DEFINE_string(nnadapter_model_cache_dir, "", "Cache dir of nnadapter model");
 
 REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
     ::baidu::paddle_serving::predictor::FluidInferEngine,
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 986fd8ee289677234407d7ee2fc4d492933731ba..3fd4a6f296e41cc54aaff7fa690cae0a3e66812f 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -12,6 +12,10 @@ if (SERVER)
         set(SERVER_PACKAGE_NAME "paddle-serving-server-gpu")
     elseif(WITH_XPU)
         set(SERVER_PACKAGE_NAME "paddle-serving-server-xpu")
+    elseif(WITH_ROCM)
+        set(SERVER_PACKAGE_NAME "paddle-serving-server-rocm")
+    elseif(WITH_ASCEND_CL)
+        set(SERVER_PACKAGE_NAME "paddle-serving-server-npu")
     endif()
     file(INSTALL pipeline DESTINATION paddle_serving_server)
     file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
diff --git a/python/gen_version.py b/python/gen_version.py
index 8c46e39a0009abd115dd0a7c7815425e2ba80815..c18f030cf7980db5a86a9bcb7201beec6bda8595 100644
--- a/python/gen_version.py
+++ b/python/gen_version.py
@@ -43,6 +43,10 @@
 if package_name.endswith('gpu'):
     update_info("paddle_serving_server/version.py", "device_type", "1")
 elif package_name.endswith('xpu'):
     update_info("paddle_serving_server/version.py", "device_type", "2")
+elif package_name.endswith('rocm'):
+    update_info("paddle_serving_server/version.py", "device_type", "3")
+elif package_name.endswith('npu'):
+    update_info("paddle_serving_server/version.py", "device_type", "4")
 
 path = "paddle_serving_" + sys.argv[1]
 commit_id = subprocess.check_output(['git', 'rev-parse', 'HEAD'])
diff --git a/python/paddle_serving_app/local_predict.py b/python/paddle_serving_app/local_predict.py
index 7de419530462b59f733f6ecc81e8b2fd9ce61b80..a637b408e8616ed8a218cc6c51c13ae28ec904bb 100644
--- a/python/paddle_serving_app/local_predict.py
+++ b/python/paddle_serving_app/local_predict.py
@@ -86,7 +86,8 @@ class LocalPredictor(object):
                  mkldnn_cache_capacity=0,
                  mkldnn_op_list=None,
                  mkldnn_bf16_op_list=None,
-                 use_feed_fetch_ops=False):
+                 use_feed_fetch_ops=False,
+                 use_ascend_cl=False):
         """
         Load model configs and create the paddle predictor by Paddle Inference API.
 
@@ -108,6 +109,7 @@ class LocalPredictor(object):
             mkldnn_op_list: op list accelerated using MKLDNN, None default.
             mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default.
             use_feed_fetch_ops: use feed/fetch ops, False default.
+            use_ascend_cl: run predict on Huawei Ascend, False default
         """
         gpu_id = int(gpu_id)
         client_config = "{}/serving_server_conf.prototxt".format(model_path)
@@ -146,11 +148,12 @@ class LocalPredictor(object):
             "gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{}, "
             "use_trt:{}, use_lite:{}, use_xpu:{}, precision:{}, use_calib:{}, "
             "use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
-            "mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, ".format(
+            "mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, "
+            "use_ascend_cl:{} ".format(
                 model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
                 ir_optim, use_trt, use_lite, use_xpu, precision, use_calib,
                 use_mkldnn, mkldnn_cache_capacity, mkldnn_op_list,
-                mkldnn_bf16_op_list, use_feed_fetch_ops))
+                mkldnn_bf16_op_list, use_feed_fetch_ops, use_ascend_cl))
 
         self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
         self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -215,11 +218,28 @@ class LocalPredictor(object):
                 zero_copy=True,
                 passes_filter=[],
                 ops_filter=[])
+            config.switch_ir_optim(True)
         # set xpu
         if use_xpu:
             # 2MB l3 cache
             config.enable_xpu(8 * 1024 * 1024)
             config.set_xpu_device_id(gpu_id)
+        # set ascend cl
+        if use_ascend_cl:
+            if use_lite:
+                # for ascend 310
+                nnadapter_device_names = "huawei_ascend_npu"
+                nnadapter_context_properties = \
+                    "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={}".format(gpu_id)
+                nnadapter_model_cache_dir = ""
+                config.nnadapter() \
+                    .enable() \
+                    .set_device_names([nnadapter_device_names]) \
+                    .set_context_properties(nnadapter_context_properties) \
+                    .set_model_cache_dir(nnadapter_model_cache_dir)
+            else:
+                # for ascend 910
+                config.enable_npu(gpu_id)
         # set cpu low precision
         if not use_gpu and not use_lite:
             if precision_type == paddle_infer.PrecisionType.Int8:
diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py
index 0447f5ecb5dd6ede7b53758a7601a82b21bbb1e9..c0bc5b0d47bbd7b96ed83413b76ba0223106ba96 100755
--- a/python/paddle_serving_server/serve.py
+++ b/python/paddle_serving_server/serve.py
@@ -181,6 +181,8 @@ def serve_args():
         "--use_lite", default=False, action="store_true", help="Use PaddleLite")
     parser.add_argument(
         "--use_xpu", default=False, action="store_true", help="Use XPU")
+    parser.add_argument(
+        "--use_ascend_cl", default=False, action="store_true", help="Use Ascend CL")
     parser.add_argument(
         "--product_name",
         type=str,
@@ -272,13 +274,15 @@ def start_gpu_card_model(gpu_mode, port, args):  # pylint: disable=doc-string-missing
         server.set_device(device)
     if args.use_xpu:
         server.set_xpu()
+    if args.use_ascend_cl:
+        server.set_ascend_cl()
 
     if args.product_name != None:
         server.set_product_name(args.product_name)
     if args.container_id != None:
         server.set_container_id(args.container_id)
 
-    if gpu_mode == True:
+    if gpu_mode == True or args.use_xpu or args.use_ascend_cl:
         server.set_gpuid(args.gpu_ids)
     server.load_model_config(model)
     server.prepare_server(
diff --git a/python/paddle_serving_server/server.py b/python/paddle_serving_server/server.py
index c6446ac2dc61fa7a2123e6713b84624776435863..5cc6dc4a9f5edfb9b57d1e6c582dfdf7f54ad14b 100755
--- a/python/paddle_serving_server/server.py
+++ b/python/paddle_serving_server/server.py
@@ -88,6 +88,7 @@ class Server(object):
         self.gpu_multi_stream = False
         self.use_lite = False
         self.use_xpu = False
+        self.use_ascend_cl = False
         self.model_config_paths = collections.OrderedDict()
         self.product_name = None
         self.container_id = None
@@ -189,6 +190,9 @@
     def set_xpu(self):
        self.use_xpu = True
 
+    def set_ascend_cl(self):
+        self.use_ascend_cl = True
+
     def _prepare_engine(self, model_config_paths, device, use_encryption_model):
         self.device = device
         if self.model_toolkit_conf == None:
@@ -202,6 +206,8 @@
         if self.device == "gpu" or self.use_trt or self.gpu_multi_stream:
             self.gpuid = ["0"]
             self.device = "gpu"
+        elif self.use_xpu or self.use_ascend_cl:
+            self.gpuid = ["0"]
         else:
             self.gpuid = ["-1"]
 
@@ -238,6 +244,7 @@
             engine.gpu_multi_stream = self.gpu_multi_stream
             engine.use_lite = self.use_lite
             engine.use_xpu = self.use_xpu
+            engine.use_ascend_cl = self.use_ascend_cl
             engine.use_gpu = False
 
             if len(self.gpuid) == 0:
@@ -435,6 +442,13 @@
             device_version = "gpu-cuda" + version_suffix
         elif device_type == "2":
             device_version = "xpu-" + platform.machine()
+        elif device_type == "3":
+            device_version = "rocm-" + platform.machine()
+        elif device_type == "4":
+            if self.use_lite:
+                device_version = "ascendcl-lite-" + platform.machine()
+            else:
+                device_version = "ascendcl-" + platform.machine()
         return device_version
 
     def download_bin(self):
diff --git a/python/pipeline/local_service_handler.py b/python/pipeline/local_service_handler.py
index d9df5e3091053a62c98fd108a5985a1e518a7767..ea0ea64adc342bd94ceca3693f8d9db08220adea 100644
--- a/python/pipeline/local_service_handler.py
+++ b/python/pipeline/local_service_handler.py
@@ -86,6 +86,7 @@ class LocalServiceHandler(object):
         self._use_trt = False
         self._use_lite = False
         self._use_xpu = False
+        self._use_ascend_cl = False
         self._use_mkldnn = False
         self._mkldnn_cache_capacity = 0
         self._mkldnn_op_list = None
@@ -129,6 +130,17 @@
             devices = [int(x) for x in devices.split(",")]
             self._use_lite = True
             self._use_xpu = True
+        elif device_type == 5:
+            # Ascend 310 ARM CPU
+            self._device_name = "arm"
+            devices = [int(x) for x in devices.split(",")]
+            self._use_lite = True
+            self._use_ascend_cl = True
+        elif device_type == 6:
+            # Ascend 910 ARM CPU
+            self._device_name = "arm"
+            devices = [int(x) for x in devices.split(",")]
+            self._use_ascend_cl = True
         else:
             _LOGGER.error(
                 "LocalServiceHandler initialization fail. device_type={}"
@@ -163,13 +175,14 @@
             "mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
             "client_type:{}, fetch_names:{}, precision:{}, use_mkldnn:{}, "
             "mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
-            "mkldnn_bf16_op_list:{}".format(
+            "mkldnn_bf16_op_list:{}, use_ascend_cl:{}".format(
                 model_config, self._device_name, self._use_gpu, self._use_trt,
                 self._use_lite, self._use_xpu, device_type, self._devices,
                 self._mem_optim, self._ir_optim, self._use_profile,
                 self._thread_num, self._client_type, self._fetch_names,
                 self._precision, self._use_mkldnn, self._mkldnn_cache_capacity,
-                self._mkldnn_op_list, self._mkldnn_bf16_op_list))
+                self._mkldnn_op_list, self._mkldnn_bf16_op_list,
+                self._use_ascend_cl))
 
     def get_fetch_list(self):
         return self._fetch_names
@@ -225,7 +238,8 @@
             use_mkldnn=self._use_mkldnn,
             mkldnn_cache_capacity=self._mkldnn_cache_capacity,
             mkldnn_op_list=self._mkldnn_op_list,
-            mkldnn_bf16_op_list=self._mkldnn_bf16_op_list)
+            mkldnn_bf16_op_list=self._mkldnn_bf16_op_list,
+            use_ascend_cl=self._use_ascend_cl)
         return self._local_predictor_client
 
     def get_client_config(self):
@@ -284,6 +298,8 @@
             server.set_xpu()
         if self._use_lite:
             server.set_lite()
+        if self._use_ascend_cl:
+            server.set_ascend_cl()
         server.set_op_sequence(op_seq_maker.get_op_sequence())
         server.set_num_threads(thread_num)