提交 5cab7cdd 编写于 作者: Y yanghongtian

collect all

上级 f6cf1f9f
......@@ -24,33 +24,6 @@ if(NOT DEFINED ASCEND_HOME)
endif()
message(STATUS "LITE_WITH_HW_ASCEND_NPU: ${LITE_WITH_HW_ASCEND_NPU}")
find_path(ACL_INC NAMES acl/acl.h
PATHS ${ASCEND_HOME}/acllib/include NO_DEFAULT_PATH)
if(NOT ACL_INC)
message(FATAL_ERROR "Can not find acl/acl.h in ${ASCEND_HOME}/include")
endif()
include_directories("${ACL_INC}")
set(ACL_LIB_FILES
acl_dvpp
ascendcl
register
runtime
)
foreach (libname ${ACL_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/acllib/lib64)
if (lib_name_path_${libname})
add_library(acl_${libname} SHARED IMPORTED GLOBAL)
set_property(TARGET acl_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}})
list(APPEND acl_libs acl_${libname})
else()
message(FATAL_ERROR "can not find library: ${libname}")
endif()
endforeach()
set(hw_ascend_npu_runtime_libs ${acl_libs} CACHE INTERNAL "ascend runtime libs")
# find atc include folder and library
find_path(ATC_INC NAMES ge/ge_ir_build.h
......@@ -61,6 +34,8 @@ endif()
include_directories("${ATC_INC}")
set(ATC_LIB_FILES
ge_compiler
graph
_caffe_parser
auto_tiling
c_sec
......@@ -76,9 +51,7 @@ set(ATC_LIB_FILES
fmk_tensorflow_parser
ge_client
ge_common
ge_compiler
ge_executor
graph
mmpa
msprof
parser_common
......@@ -92,6 +65,16 @@ set(ATC_LIB_FILES
tvm_runtime
tvm_topi
)
# ATC graph-engine plugin libraries, located under ${ASCEND_HOME}/atc/lib64/plugin/.
# NOTE(review): "NNENGIN" (sic) matches the spelling used by the foreach that
# consumes this list, so it resolves correctly despite the apparent typo.
set(ATC_PLUGIN_NNENGIN_LIB_FILES
engine
)
# Ops-kernel plugin libraries (AI CPU engine, fusion engine, GE local engine,
# runtime engine).
set(ATC_PLUGIN_OPSKERNEL_LIB_FILES
aicpu_engine
fe
ge_local_engine
rts_engine
)
foreach (libname ${ATC_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64)
......@@ -104,6 +87,28 @@ foreach (libname ${ATC_LIB_FILES})
endif()
endforeach()
# Locate each ATC nnengine plugin library and wrap it as a GLOBAL imported
# target named atc_<lib>; every found target is appended to ${atc_libs}.
# Configuration fails hard if any library is missing.
foreach (libname ${ATC_PLUGIN_NNENGIN_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64/plugin/nnengine)
if (lib_name_path_${libname})
add_library(atc_${libname} SHARED IMPORTED GLOBAL)
set_property(TARGET atc_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}})
list(APPEND atc_libs atc_${libname})
else()
message(FATAL_ERROR "can not find library: ${libname}")
endif()
endforeach()

# Same treatment for the opskernel plugin libraries; results also accumulate
# into ${atc_libs}.
foreach (libname ${ATC_PLUGIN_OPSKERNEL_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64/plugin/opskernel)
if (lib_name_path_${libname})
add_library(atc_${libname} SHARED IMPORTED GLOBAL)
set_property(TARGET atc_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}})
list(APPEND atc_libs atc_${libname})
else()
message(FATAL_ERROR "can not find library: ${libname}")
endif()
endforeach()
# find opp include folder and library
find_path(OPP_INC NAMES all_ops.h
PATHS ${ASCEND_HOME}/opp/op_proto/built-in/inc)
......@@ -139,10 +144,59 @@ else()
set_property(TARGET opp_fusion_pass_vectorcore_lib PROPERTY IMPORTED_LOCATION ${OPP_FUSION_VECTORCORE})
endif()
set(hw_ascend_npu_builder_libs
add_library(hw_ascend_npu_builder_libs INTERFACE)
target_link_libraries(hw_ascend_npu_builder_libs INTERFACE
${atc_libs}
opp_opsproto_lib
opp_fusion_pass_aicore_lib
opp_fusion_pass_vectorcore_lib)
#set(hw_ascend_npu_builder_libs
# ${atc_libs}
# opp_opsproto_lib
# opp_fusion_pass_aicore_lib
# opp_fusion_pass_vectorcore_lib
# CACHE INTERNAL "ascend builder libs")
# find ascend cl runtime library
find_path(ACL_INC NAMES acl/acl.h
  PATHS ${ASCEND_HOME}/acllib/include NO_DEFAULT_PATH)
if(NOT ACL_INC)
  # BUGFIX: report the directory that was actually searched
  # (acllib/include), not ${ASCEND_HOME}/include.
  message(FATAL_ERROR "Can not find acl/acl.h in ${ASCEND_HOME}/acllib/include")
endif()
include_directories("${ACL_INC}")

# ACL runtime libraries; each is wrapped below as a GLOBAL imported target
# named acl_<lib> and collected into ${acl_libs}.
set(ACL_LIB_FILES
  acl_dvpp
  ascendcl
  register
  runtime
)
foreach(libname ${ACL_LIB_FILES})
  find_library(lib_name_path_${libname} NAMES ${libname}
    PATHS ${ASCEND_HOME}/acllib/lib64)
  if(lib_name_path_${libname})
    add_library(acl_${libname} SHARED IMPORTED GLOBAL)
    set_property(TARGET acl_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}})
    list(APPEND acl_libs acl_${libname})
  else()
    message(FATAL_ERROR "can not find library: ${libname}")
  endif()
endforeach()
add_library(hw_ascend_npu_runtime_libs INTERFACE)
target_link_libraries(hw_ascend_npu_runtime_libs INTERFACE ${acl_libs})
add_library(hw_ascend_npu_libs INTERFACE)
target_link_libraries(hw_ascend_npu_libs INTERFACE
${atc_libs}
opp_opsproto_lib
opp_fusion_pass_aicore_lib
opp_fusion_pass_vectorcore_lib
CACHE INTERNAL "ascend builder libs")
${acl_libs})
# set(hw_ascend_npu_runtime_libs ${acl_libs} CACHE INTERNAL "ascend runtime libs")
......@@ -23,7 +23,7 @@ function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS
CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS ARGS)
CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -138,7 +138,7 @@ function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps "")
......@@ -156,7 +156,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
)
......@@ -185,7 +185,7 @@ function(lite_cc_binary TARGET)
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS PROFILE_DEPS HW_ASCEND_NPU_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps "")
......@@ -204,7 +204,7 @@ function(lite_cc_binary TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
......@@ -235,7 +235,7 @@ function(lite_cc_test TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS HW_ASCEND_NPU_DEPS
XPU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS COMPILE_LEVEL # (basic|extra)
)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -261,7 +261,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -309,8 +309,8 @@ endif()
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS HW_ASCEND_NPU_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -444,7 +444,7 @@ function(add_kernel TARGET device level)
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -463,8 +463,8 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -499,7 +499,7 @@ function(add_operator TARGET level)
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -69,7 +69,7 @@ if (WITH_TESTING)
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
# MLU_DEPS ${mlu_kernels}
MLU_DEPS ${mlu_kernels}
)
endif()
if(LITE_WITH_FPGA)
......
......@@ -2,12 +2,24 @@ if(NOT LITE_WITH_HW_ASCEND_NPU)
return()
endif()
lite_cc_library(build_hw_ascend_npu SRCS build.cc DEPS
hw_ascend_npu_libs)
lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS
hw_ascend_npu_libs
build_hw_ascend_npu
)
lite_cc_library(target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS
${hw_ascend_npu_runtime_libs})
hw_ascend_npu_libs
build_hw_ascend_npu
)
lite_cc_library(runtime_hw_ascend_npu SRCS runtime.cc DEPS
${hw_ascend_npu_runtime_libs}
target_wrapper_hw_ascend_npu)
lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS
${hw_ascend_npu_runtime_libs}
hw_ascend_npu_libs
target_wrapper_hw_ascend_npu
runtime_hw_ascend_npu)
device_hw_ascend_npu
build_hw_ascend_npu
)
add_executable(test_build test_build.cc)
target_link_libraries(test_build build_hw_ascend_npu)
......@@ -16,7 +16,7 @@
#include <map>
#include <string>
#include "ge/ge_api_types.h"
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/backends/hw_ascend_npu/build.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
......@@ -26,33 +26,92 @@ std::shared_ptr<HWAscendNPURuntime> Device::Build(
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
) {
VLOG(3) << "[HWAscendNPU] Build model";
// Build the IR graph to the om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
ge::ModelBufferData model;
std::map<std::string, std::string> build_options;
build_options.insert({ge::ir_option::EXEC_DISABLE_REUSED_MEMORY, "1"});
ge::graphStatus ret = aclgrphBuildModel(ir_graph, build_options, model);
if (ret != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HWAscendNPU] Build model failed, error code: " << ret;
std::shared_ptr<ge::ModelBufferData> model_data =
paddle::lite::hw_ascend_npu::Build(input_nodes, output_nodes);
if (model_data == nullptr) {
LOG(ERROR) << "[HWAscendNPU] Build model failed";
return nullptr;
}
LOG(INFO) << "[HWAscendNPU] Build model success";
if (!inited_) {
if (0 == InitDevice()) {
LOG(INFO) << "Init success.";
inited_ = true;
}
}
std::shared_ptr<HWAscendNPURuntime> model_runtime(
new HWAscendNPURuntime(model.data, model.length));
new HWAscendNPURuntime(model_data->data, model_data->length));
CHECK(model_runtime != nullptr);
if (!model_runtime->model_loaded()) {
LOG(ERROR) << "[HWAscendNPU]: Can not create model runtime instance";
return nullptr;
}
VLOG(3) << "[HWAscendNPU]: Build done";
LOG(INFO) << "[HWAscendNPU]: Build done";
return model_runtime;
}
int Device::InitDevice() {
const char* acl_conf = "/usr/local/acl.json";
aclError ret = aclInit(acl_conf);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl init failed";
return -1;
}
// open device
ret = aclrtSetDevice(device_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl open device " << device_id_ << " failed";
return -1;
}
ret = aclrtCreateContext(&context_ptr_, device_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "acl create context failed";
return -1;
}
// create stream
ret = aclrtCreateStream(&stream_ptr_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl create stream failed";
return -1;
}
// get run mode
aclrtGetRunMode runMode;
ret = aclrtGetMode(&runMode);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl get run mode failed";
return -1;
}
is_devcie_ = (runMode == ACL_DEVICE);
LOG(INFO) << "[HWAscendNPU] Hardware initialization done";
return 0;
}
// Tears down the ACL resources in reverse order of creation: the stream first,
// then the context. Both handles are nulled so a second call is a no-op.
// NOTE(review): aclrtResetDevice()/aclFinalize() are never invoked here —
// confirm whether the process-level ACL state should also be released.
void Device::ReleaseDevice() {
  if (stream_ptr_ != nullptr) {
    if (aclrtDestroyStream(stream_ptr_) != ACL_ERROR_NONE) {
      LOG(ERROR) << "[HWAscendNPU] destroy stream failed";
    }
    stream_ptr_ = nullptr;
  }
  LOG(INFO) << "[HWAscendNPU] end to destroy stream";

  if (context_ptr_ != nullptr) {
    if (aclrtDestroyContext(context_ptr_) != ACL_ERROR_NONE) {
      LOG(ERROR) << "[HWAscendNPU] destroy context failed";
    }
    context_ptr_ = nullptr;
  }
  LOG(INFO) << "[HWAscendNPU] Release device successfully";
}
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
......@@ -18,8 +18,9 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "ge/ge_ir_build.h" // NOLINT
#include "ge/ge_ir_build.h"
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
......@@ -30,12 +31,11 @@ class Device {
static Device x;
return x;
}
Device() {}
Device() : inited_(false) {}
int freq_level() { return freq_level_; }
int framework_type() { return framework_type_; }
int model_type() { return model_type_; }
int device_type() { return device_type_; }
~Device() { ReleaseDevice(); }
bool is_device() const { return is_devcie_; }
// Build the IR graph to om model, return a HWAscendNPURuntime instance to
// load om model and run inference.
......@@ -45,10 +45,15 @@ class Device {
); // NOLINT
private:
int freq_level_{3};
int framework_type_{0};
int model_type_{0};
int device_type_{0};
int InitDevice();
void ReleaseDevice();
private:
bool inited_{false};
int device_id_{0};
bool is_devcie_{false};
aclrtContext context_ptr_{nullptr};
aclrtStream stream_ptr_{nullptr};
};
} // namespace hw_ascend_npu
......
......@@ -42,12 +42,15 @@ int HWAscendNPURuntime::LoadModelFromMem(
&model_size_,
&model_weights_size_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can query size from a built model buffer, "
LOG(ERROR) << "[HWAscendNPU]: Can't query size from a built model buffer, "
"error code: "
<< ret;
<< ret << ", model buffer size: " << model_buff_size;
return ret;
}
LOG(INFO) << "[HWAscendNPU]: Query model info success, model_size: "
<< model_size_ << ", model weights_size_: " << model_weights_size_;
ret = aclrtMalloc(&model_ptr_, model_size_, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model, "
......
add_subdirectory(bridges)
add_kernel(subgraph_compute_hw_ascend_npu HW_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS
${lite_kernel_deps}
build_hw_ascend_npu
device_hw_ascend_npu
subgraph_bridge_engine
runtime_hw_ascend_npu
${hw_ascend_npu_subgraph_bridges}
subgraph_bridge_registry
${lite_kernel_deps}
)
......@@ -4,18 +4,24 @@ endif()
lite_cc_library(subgraph_bridge_utility_hw_ascend_npu
SRCS utility.cc
DEPS ${hw_ascend_npu_builder_libs} tensor)
DEPS hw_ascend_npu_libs tensor)
lite_cc_library(subgraph_bridge_graph_hw_ascend_npu
SRCS graph.cc
DEPS ${hw_ascend_npu_builder_libs} subgraph_bridge_utility_hw_ascend_npu
)
DEPS hw_ascend_npu_libs subgraph_bridge_utility_hw_ascend_npu)
set(hw_ascend_npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_hw_ascend_npu subgraph_bridge_graph_hw_ascend_npu)
set(hw_ascend_npu_subgraph_bridge_deps
subgraph_bridge_registry
subgraph_bridge_utility_hw_ascend_npu
subgraph_bridge_graph_hw_ascend_npu)
lite_cc_library(subgraph_bridge_act_op_hw_ascend_npu SRCS act_op.cc DEPS
${hw_ascend_npu_subgraph_bridge_deps}
${hw_ascend_npu_builder_libs})
hw_ascend_npu_libs
${hw_ascend_npu_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_concat_op_hw_ascend_npu SRCS concat_op.cc DEPS
# ${hw_ascend_npu_subgraph_bridge_deps}
# hw_ascend_npu_builder_libs)
set(hw_ascend_npu_subgraph_bridges
subgraph_bridge_graph_hw_ascend_npu
......
......@@ -30,7 +30,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "...";
LOG(INFO) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
......@@ -64,28 +64,35 @@ int ActConverter<ge::op::Activation>(void* ctx,
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "...";
LOG(INFO) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
LOG(INFO) << "[HWAscendNPU] xname: " << x_name << ", dims: " << x_dims;
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
LOG(INFO) << "[HWAscendNPU] graph has node: " << x_name;
x_node = graph->Get(x_name);
} else {
LOG(INFO) << "[HWAscendNPU] graph does no have node: " << x_name;
x_node = graph->Add(x_name, *x);
}
LOG(INFO) << "[HWAscendNPU] out name: " << out_name;
#if 0
// Act node
auto act_node = graph->template Add<ge::op::Activation>(out_name);
auto act_op = act_node->template data<ge::op::Activation>();
act_op->set_input_x(*x_node->data());
// TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
// clipped_relu etc.
LOG(INFO) << "[HWAscendNPU] activation mode: " << op_type
<< ", type: " << CvtActMode(op_type);
act_op->set_attr_mode(CvtActMode(op_type));
if (op_type == "relu_clipped") {
auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
......@@ -94,6 +101,12 @@ int ActConverter<ge::op::Activation>(void* ctx,
float Relu_clipped_coef = 6.f;
act_op->set_attr_coef(Relu_clipped_coef);
}
#else
// Act node
auto act_node = graph->template Add<ge::op::Relu>(out_name);
auto act_op = act_node->template data<ge::op::Relu>();
act_op->set_input_x(*x_node->data());
#endif
return SUCCESS;
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <all_ops.h>
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
// Maps a Paddle `concat` op onto a ge::op::Concat node with dynamic inputs:
// every input tensor X[i] becomes (or reuses) a graph node that is wired into
// the Concat's dynamic input list.
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[HWAscendNPU] Converting " << op_type << " ... ";

  // Op inputs/outputs and attributes.
  auto x_names = op_info->Input("X");
  auto out_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");
  auto input_count = x_names.size();

  // Create the concat node first, then attach each input to it.
  auto concat_node = graph->Add<ge::op::Concat>(out_name);
  auto concat_op = concat_node->data<ge::op::Concat>();
  concat_op->set_input_concat_dim(axis);
  concat_op->set_attr_N(input_count);
  concat_op->create_dynamic_input_input_values(input_count);

  // NOTE(review): dynamic input indices start at 1 here — confirm against the
  // GE dynamic-input indexing convention.
  int input_index = 1;
  for (const auto& x_name : x_names) {
    auto x = scope->FindMutableTensor(x_name);
    auto x_dims = x->dims();
    // Reuse an existing node for this tensor if one was already added.
    std::shared_ptr<Node> x_node =
        graph->Has(x_name) ? graph->Get(x_name) : graph->Add(x_name, *x);
    concat_op->set_dynamic_input_input_values(input_index, *x_node->data());
    ++input_index;
  }
  return SUCCESS;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
concat,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ConcatConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
// Converts a Paddle conv2d / depthwise_conv2d op into a GE Convolution or
// ConvolutionDepthwise node, appending Add nodes for unsupported bias shapes
// and an Activation node when an activation is fused into the conv.
int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[HwAscendNPU] Converting " << op_type << "... ";

  // Get input and output vars and op attributes
  auto input_name = op_info->Input("Input").front();
  auto input = scope->FindMutableTensor(input_name);
  auto input_dims = input->dims();
  auto filter_name = op_info->Input("Filter").front();
  auto filter = scope->FindMutableTensor(filter_name);
  auto filter_dims = filter->dims();
  auto output_name = op_info->Output("Output").front();
  auto output = scope->FindMutableTensor(output_name);
  auto output_dims = output->dims();
  auto bs = input_dims[0];
  auto ic = input_dims[1];
  auto oc = filter_dims[0];
  // All tensors must be 4-D NCHW and agree on batch / output channels.
  CHECK_EQ(input_dims.size(), 4L);
  CHECK_EQ(output_dims.size(), 4L);
  CHECK_EQ(filter_dims.size(), 4L);
  CHECK_EQ(output_dims[0], bs);
  CHECK_EQ(output_dims[1], oc);
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
  auto groups = op_info->GetAttr<int>("groups");
  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
  bool with_act =
      op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
  std::string act_type =
      with_act ? op_info->GetAttr<std::string>("act_type") : "";
  float leaky_relu_alpha = act_type == "leaky_relu"
                               ? op_info->GetAttr<float>("leaky_relu_alpha")
                               : 0.f;
  CHECK_EQ(strides.size(), 2L);
  CHECK_EQ(dilations.size(), 2L);

  // Input node: reuse if this tensor was already added to the graph.
  std::shared_ptr<Node> input_node = nullptr;
  if (graph->Has(input_name)) {
    input_node = graph->Get(input_name);
  } else {
    input_node = graph->Add(input_name, *input);
  }

  // Expand 2-element paddings {ph, pw} to 4-element {ph, ph, pw, pw}.
  if (paddings.size() == 2L) {
    for (size_t i = 0; i < strides.size(); ++i) {
      int copy_pad = *(paddings.begin() + 2 * i);
      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
    }
  }
  CHECK_EQ(paddings.size(), 4L) << "[HwAscendNPU] Paddings size should be the "
                                   "same or twice as the input size.";
  std::string padding_algorithm("");
  if (op_info->HasAttr("padding_algorithm")) {
    padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
  }
  operators::UpdatePaddingAndDilation(&paddings,
                                      &dilations,
                                      strides,
                                      padding_algorithm,
                                      input_dims,
                                      filter_dims);

  // Check depthwise mode, and decide whether use ConvolutionDepthwise Op
  bool use_depthwise_conv =
      false;  // Whether use ge::op::ConvolutionDepthwise ?
  bool is_depthwise_mode = ic == groups && oc == groups;
  if (is_depthwise_mode &&
      !((groups == 1 || groups >= 5) && dilations[0] == 1 &&
        dilations[1] == 1)) {
    use_depthwise_conv = true;
    LOG(WARNING)
        << "[HwAscendNPU] For depthwise mode, dilation = 1 and groups >= 5 "
           "(or groups = 1) is only supported in Convolution Op, so "
           "force to use ConvolutionDepthwise Op, but may lead poor "
           "performance.";
  }

  // Filter node
  auto filter_node = graph->Add(filter_name, *filter);

  // Add bias node if exists bias
  // Supports the bias nodes with the following dimensions
  // 0: {oc}
  // 1: {1, oc, oh, ow}
  // 2: {n, oc, oh, ow}
  std::shared_ptr<Node> bias_node = nullptr;
  bool is_channel_bias = false;
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_name = op_info->Input("Bias").front();
    if (graph->Has(bias_name)) {
      bias_node = graph->Get(bias_name);
    } else {
      auto bias = scope->FindMutableTensor(bias_name);
      auto bias_dims = bias->dims();
      auto bias_data_size = bias_dims.production();
      auto output_data_size = output_dims.production();
      std::vector<int64_t> bias_shape;
      if (bias_data_size == oc) {
        // 0: {oc}
        bias_shape = {1, oc, 1, 1};
        is_channel_bias = true;
      } else if (bias_data_size == output_data_size / bs) {
        // 1: {1, oc, oh, ow}
        bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]};
      } else if (bias_data_size == output_data_size) {
        // 2: {n, oc, oh, ow}
        bias_shape = output_dims.Vectorize();
      } else {
        LOG(WARNING)
            << "[HwAscendNPU] Bias dimension " << bias_dims
            << " isn't supported in conv2d Op when output dimension is "
            << output_dims;
        return FAILED;
      }
      bias_node = graph->Add(bias_name, *bias, bias_shape);
    }
  }

  // Conv node
  std::shared_ptr<Node> conv_node = nullptr;
  if (use_depthwise_conv && is_depthwise_mode) {
    // BUGFIX: the node must be created with the same op type it is accessed
    // with; the original created ge::op::DepthwiseConv2D but then cast the
    // node data to ge::op::ConvolutionDepthwise, whose attribute setters are
    // used below.
    conv_node = graph->Add<ge::op::ConvolutionDepthwise>(output_name);
    auto conv_op = conv_node->data<ge::op::ConvolutionDepthwise>();
    conv_op->set_input_x(*input_node->data());
    conv_op->set_input_filter(*filter_node->data());
    conv_op->set_attr_mode(1);
    conv_op->set_attr_algo(0);
    conv_op->set_attr_format(0);   // NCHW
    conv_op->set_attr_pad_mode(5);  // VALID
    conv_op->set_attr_group(groups);
    conv_op->set_attr_pad(ge::AttrValue::LIST_INT(
        {paddings[0], paddings[1], paddings[2], paddings[3]}));
    conv_op->set_attr_dilation(
        ge::AttrValue::LIST_INT({dilations[0], dilations[1]}));
    conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    conv_op->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    // ConvolutionDepthwise Op doesn't support bias, so append Add node to
    // support bias
    if (bias_node != nullptr) {
      auto add_node = graph->Add<ge::op::Add>(output_name);
      auto add_op = add_node->data<ge::op::Add>();
      add_op->set_input_x1(*conv_node->data());
      add_op->set_input_x2(*bias_node->data());
      conv_node = add_node;
    }
  } else {
    conv_node = graph->Add<ge::op::Convolution>(output_name);
    auto conv_op = conv_node->data<ge::op::Convolution>();
    conv_op->set_input_x(*input_node->data());
    conv_op->set_input_w(*filter_node->data());
    conv_op->set_attr_mode(1);
    // when padding_algorithm=="SAME", NPU is different from lite
    if (padding_algorithm == "VALID") {
      conv_op->set_attr_pad_mode(5);
    } else {
      conv_op->set_attr_pad_mode(0);
    }
    conv_op->set_attr_group(groups);
    conv_op->set_attr_pad(ge::AttrValue::LIST_INT(
        {paddings[0], paddings[1], paddings[2], paddings[3]}));
    conv_op->set_attr_dilation(
        ge::AttrValue::LIST_INT({dilations[0], dilations[1]}));
    conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    conv_op->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    // Convolution Op only support bias with dimension {1, oc, 1, 1},
    // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow)
    if (bias_node != nullptr) {
      if (is_channel_bias) {
        conv_op->set_input_b(*bias_node->data());
      } else {
        auto add_node = graph->Add<ge::op::Add>(output_name);
        auto add_op = add_node->data<ge::op::Add>();
        add_op->set_input_x1(*conv_node->data());
        add_op->set_input_x2(*bias_node->data());
        conv_node = add_node;
      }
    }
  }
  CHECK(conv_node);

  // Fused activation, if any, consumes the conv (or bias-add) output.
  if (!act_type.empty()) {
    auto act_node = graph->Add<ge::op::Activation>(output_name);
    auto act_op = act_node->data<ge::op::Activation>();
    act_op->set_input_x(*conv_node->data());
    act_op->set_attr_mode(CvtActMode(act_type));
    if (act_type == "leaky_relu") {
      act_op->set_attr_negative_slope(leaky_relu_alpha);
    } else if (act_type == "relu6") {
      act_op->set_attr_coef(6.f);
    }
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// BUGFIX: ConvConverter is defined in namespace
// paddle::lite::subgraph::hw_ascend_npu in this file; the original
// registration referenced the unrelated npu (HiAI) namespace.
REGISTER_SUBGRAPH_BRIDGE(conv2d,
                         kHWAscendNPU,
                         paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
                         kHWAscendNPU,
                         paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
......@@ -27,7 +27,8 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
if (it != nodes_.end()) {
// Only variable node can be shared with the same name
if (!node->is_var() || !it->second.back()->is_var()) {
LOG(FATAL) << "[NPU] Const or data node " << name << " is redefined.";
LOG(FATAL) << "[HWAscendNPU] Const or data node " << name
<< " is redefined.";
return -1;
}
} else {
......@@ -65,6 +66,13 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
PrecisionType precision,
DataLayoutType layout) {
auto node = Add<ge::op::Data>(name, precision, layout);
std::stringstream iss;
iss << "[HWAscendNPU] Add data node, shape: ";
for (auto& s : shape) {
iss << s << ",";
}
iss << " name: " << name;
LOG(INFO) << iss.str();
ge::TensorDesc desc(
ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision));
node->data<ge::op::Data>()->update_input_desc_data(desc);
......
......@@ -181,13 +181,14 @@ class Graph {
}
std::shared_ptr<Node> Get(std::string name) {
CHECK(Has(name)) << "[NPU] Node " << name << " not found.";
CHECK(Has(name)) << "[HWAscendNPU] Node " << name << " not found.";
return nodes_.at(name).back();
}
bool Has(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
size_t size() const { return nodes_.size(); }
private:
std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
......
......@@ -47,7 +47,7 @@ ge::Tensor CvtTensor(const Tensor& in_tensor,
std::vector<int64_t> out_shape = {},
DataLayoutType in_layout = DATALAYOUT(kNCHW));
int CvtActMode(std::string act_type);
int CvtActMode(const std::string& act_type);
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
......
......@@ -32,22 +32,31 @@ int SubgraphEngine::BuildDeviceProgram() {
// the HWAscendNPU IR graph
subgraph::hw_ascend_npu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
LOG(INFO) << "[HWAscendNPU] Build device program";
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
LOG(INFO) << "[HWAscendNPU] trying to convert OP: " << op_type;
if (!bridges.Exists(op_type, TARGET(kHWAscendNPU))) {
LOG(ERROR) << "[HWAscendNPU] OP: " << op_type
<< " does not exist for target HWAscendNPU";
return subgraph::FAILED;
}
LOG(INFO) << "[HWAscendNPU] OP: " << op_type << " exists for HWAscendNPU";
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kHWAscendNPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
LOG(ERROR) << "[HWAscendNPU] OP: " << op_type << " select kernel failed";
return subgraph::FAILED;
}
LOG(INFO) << "[HWAscendNPU] OP: " << op_type
<< " select kernel for HWAscendNPU";
}
LOG(INFO) << "[HWAscendNPU] Graph size: " << graph.size();
// Collect the valid input and output nodes in the HiAI IR graph and update
// the input and output names
device_inames_.clear();
......@@ -55,8 +64,12 @@ int SubgraphEngine::BuildDeviceProgram() {
std::vector<ge::Operator> device_inodes;
std::vector<ge::Operator> device_onodes;
for (auto& input_name : input_names_) {
LOG(INFO) << "[HWAscendNPU] input name: " << input_name;
if (graph.Has(input_name)) {
LOG(INFO) << "[HWAscendNPU] Graph has input name: " << input_name;
if (graph.Get(input_name)->is_data()) {
LOG(INFO) << "[HWAscendNPU] the current input name: " << input_name
<< " is data";
device_inodes.push_back(*graph.Get(input_name)->data());
device_inames_.push_back(input_name);
} else {
......@@ -82,10 +95,15 @@ int SubgraphEngine::BuildDeviceProgram() {
CHECK(!device_onames_.empty())
<< "[HWAscendNPU] No output nodes found for building NPU model";
LOG(INFO) << "[HWAscendNPU] Graph size to build: " << graph.size();
// Build the IR graph to om model as the device program
if (device_program_map_.count(inputs_shape_) > 0) {
return status;
}
LOG(INFO) << "[HWAscendNPU] Start to build, device_inodes = "
<< device_inodes.size()
<< ", device_onodes = " << device_onodes.size();
auto device_client =
lite::hw_ascend_npu::Device::Global().Build(device_inodes, device_onodes);
if (device_client == nullptr) {
......@@ -188,12 +206,14 @@ int SubgraphEngine::LaunchDeviceProgram() {
// tensors
auto device_program = device_program_map_[inputs_shape_];
int ret = 0;
LOG(INFO) << "[HWAscendNPU] start to set input...";
ret = device_program->client->SetInput(origin_itensors_,
device_program->origin_idims);
if (ret != 0) {
return ret;
}
LOG(INFO) << "[HWAscendNPU] start to create output...";
device_program->client->CreateOutput(device_program->origin_odims);
......@@ -205,10 +225,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
};
auto start_time = GetCurrentUS();
CHECK_EQ(device_program->client->Process(), 0);
VLOG(3) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time
LOG(INFO) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time
<< " us";
device_program->client->GetOutput(&origin_otensors_);
LOG(INFO) << "[HWAscendNPU] Get ouput done";
return 0;
}
......@@ -238,7 +259,9 @@ void SubgraphCompute::PrepareForRun() {
}
// Kernel entry point: validates the subgraph engine and delegates to
// Engine::Launch(), which picks the device or the original program.
void SubgraphCompute::Run() {
  LOG(INFO) << "[HWAscendNPU] Start to run";
  // engine_ is presumably created in PrepareForRun() — verify against that
  // method; a null engine here aborts the process.
  CHECK(engine_);
  LOG(INFO) << "[HWAscendNPU] Start to call Launch";
  engine_->Launch();
}
......
......@@ -105,15 +105,19 @@ bool Engine::InputShapeChanged() {
}
int Engine::Launch() {
LOG(INFO) << "[HWAscendNPU] in Launch, start to build if needed";
// Rebuild device program when the shapes of input tensors have been changed.
if (CHECK_SUCCESS(build_device_program_status_) &&
CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) &&
InputShapeChanged()) {
Build();
}
LOG(INFO) << "[HWAscendNPU] launch program";
if (CHECK_FAILED(build_device_program_status_)) {
LOG(INFO) << "[HWAscendNPU] launch original program";
LaunchOriginProgram();
} else {
LOG(INFO) << "[HWAscendNPU] launch device program";
LaunchDeviceProgram();
}
return 0;
......
......@@ -9,7 +9,8 @@ set (kernels
${host_kernels}
${hw_ascend_npu_kernels})
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_HW_ASCEND_NPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
message(STATUS "======---------------------------------=================${hw_ascend_npu_kernels}")
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels})
lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels})
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels})
......
......@@ -293,6 +293,7 @@ TEST(Activation_relu, precision) {
place = TARGET(kXPU);
#elif defined(LITE_WITH_HW_ASCEND_NPU)
place = TARGET(kHWAscendNPU);
std::cout << "-----------test relu with hw_ascend_npu" << std::endl;
#else
return;
#endif
......
......@@ -3,7 +3,7 @@ set -ex
# Global configuration variables with default values.
# NOTE: the original assigned TARGET_NAME twice (a stale diff line was
# retained); the first assignment was a dead store — only the effective
# default is kept here.
ASCEND_HOME="/usr/local/Ascend"               # Ascend SDK root directory
TARGET_NAME="test_kernel_activation_compute"  # default build target
BUILD_EXTRA=ON                                # ON (with sequence ops) / OFF
WITH_TESTING=ON                               # ON / OFF
......@@ -80,7 +80,7 @@ function build_hw_ascend_npu {
-DWITH_TESTING=${WITH_TESTING} \
-DASCEND_HOME=${HW_ASCEND_NPU_SDK_ROOT}
make -j$NUM_CORES_FOR_COMPILE
make $TARGET_NAME -j2
cd -
echo "Done"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册