Unverified commit d5434aa2, authored by hong19860320, committed by GitHub


[LITE][NPU][XPU] Refine subgraph pass, and support NPU/XPU model generation at execution time (#2576)
Parent d8750966
@@ -118,7 +118,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
   set(options SHARED shared STATIC static MODULE module)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS XPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS
     HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -128,10 +128,10 @@ function(lite_cc_library TARGET)
       X86_DEPS ${args_X86_DEPS}
       CUDA_DEPS ${args_CUDA_DEPS}
       CL_DEPS ${args_CL_DEPS}
-      NPU_DEPS ${args_NPU_DEPS}
-      XPU_DEPS ${args_XPU_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
@@ -161,7 +161,7 @@ function(lite_cc_binary TARGET)
     set(options " -g ")
   endif()
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -173,6 +173,8 @@
       CL_DEPS ${args_CL_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
@@ -205,7 +207,7 @@ function(lite_cc_test TARGET)
   endif()
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS
     COMPILE_LEVEL # (basic|extra)
@@ -225,6 +227,8 @@
       CL_DEPS ${args_CL_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
@@ -267,7 +271,7 @@ endif()
 function(add_kernel TARGET device level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -360,11 +364,12 @@ function(add_kernel TARGET device level)
   lite_cc_library(${TARGET} SRCS ${args_SRCS}
       DEPS ${args_DEPS}
       X86_DEPS ${args_X86_DEPS}
-      XPU_DEPS ${args_XPU_DEPS}
       CUDA_DEPS ${args_CUDA_DEPS}
       CL_DEPS ${args_CL_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
      PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
@@ -383,7 +388,7 @@ endif()
 function(add_operator TARGET level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -409,11 +414,12 @@ function(add_operator TARGET level)
   lite_cc_library(${TARGET} SRCS ${args_SRCS}
       DEPS ${args_DEPS}
       X86_DEPS ${args_X86_DEPS}
-      XPU_DEPS ${args_XPU_DEPS}
       CUDA_DEPS ${args_CUDA_DEPS}
       CL_DEPS ${args_CL_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
+      NPU_DEPS ${args_NPU_DEPS}
+      XPU_DEPS ${args_XPU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
...
@@ -89,7 +89,7 @@ else()
 endif()
 find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
+  PATHS ${XPU_SDK_ROOT}/XTDK/shlib/gcc482)
 if(NOT XPU_SDK_LLVM_FILE)
   message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
...
@@ -42,7 +42,7 @@ else()
     add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
     if (LITE_WITH_NPU)
       # Need to add HIAI runtime libs (libhiai.so) dependency
-      target_link_libraries(paddle_light_api_shared ${npu_runtime_libs})
+      target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
     endif()
   endif()
 endif()
@@ -78,8 +78,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
       DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
       X86_DEPS ${x86_kernels}
       ARM_DEPS ${arm_kernels}
-      NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
-      XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
+      NPU_DEPS ${npu_kernels}
+      XPU_DEPS ${xpu_kernels}
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels})
 endif()
...
@@ -108,7 +108,7 @@ USE_LITE_OP(while)
 USE_LITE_OP(lod_reset)
 USE_LITE_OP(lookup_table)
 USE_LITE_OP(multiclass_nms)
-USE_LITE_OP(graph_op)
+USE_LITE_OP(subgraph)
 USE_LITE_OP(sequence_expand)
 USE_LITE_OP(sequence_pool)
 USE_LITE_OP(reduce_max)
...
@@ -30,7 +30,7 @@ else()
     add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
     if (LITE_WITH_NPU)
       # Need to add HIAI runtime libs (libhiai.so) dependency
-      target_link_libraries(paddle_lite_jni ${npu_runtime_libs})
+      target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs})
     endif()
   endif()
...
@@ -139,22 +139,15 @@ std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
-  std::vector<const cpp::OpDesc *> feeds;
-  std::vector<const cpp::OpDesc *> fetchs;
-#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU)
-  // The shape of input tensors must be determined before generating NPU and XPU
-  // program.
-  auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
-  for (size_t i = 0; i < current_block->OpsSize(); i++) {
-    auto op = current_block->GetOp<cpp::OpDesc>(i);
-#else
   if (!program_) {
     GenRuntimeProgram();
   }
+  std::vector<const cpp::OpDesc *> feeds;
+  std::vector<const cpp::OpDesc *> fetchs;
   const auto &insts = program_->instructions();
   for (size_t i = 0; i < program_->num_instructions(); i++) {
     const auto &op = insts[i].op()->op_info();
-#endif
     if (op->Type() == "feed") {
       feeds.push_back(op);
     } else if (op->Type() == "fetch") {
...
@@ -90,6 +90,10 @@ std::vector<Place> ParserValidPlaces() {
         TARGET(kARM));  // enable kARM CPU kernel when no opencl kernel
   } else if (target_repr == "x86") {
     valid_places.emplace_back(TARGET(kX86));
+  } else if (target_repr == "npu") {
+    valid_places.emplace_back(TARGET(kNPU));
+  } else if (target_repr == "xpu") {
+    valid_places.emplace_back(TARGET(kXPU));
   } else {
     LOG(FATAL) << lite::string_format(
         "Wrong target '%s' found, please check the command flag "
...
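For reference, selecting the new "npu"/"xpu" targets amounts to building the same Place list an API user constructs by hand; a sketch using the lite_api calls exercised by the subgraph pass test later in this commit:

    // Sketch: prefer the NPU place and fall back to the CPU place.
    std::vector<lite_api::Place> valid_places{
        lite_api::Place{TARGET(kNPU), PRECISION(kFloat)},
        lite_api::Place{TARGET(kARM), PRECISION(kFloat)}};
    cxx_config.set_valid_places(valid_places);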
@@ -20,12 +20,6 @@ USE_MIR_PASS(static_kernel_pick_pass);
 USE_MIR_PASS(variable_place_inference_pass);
 USE_MIR_PASS(type_target_cast_pass);
 USE_MIR_PASS(generate_program_pass);
-#ifdef LITE_WITH_NPU
-USE_MIR_PASS(generate_npu_program_pass);
-#endif
-#ifdef LITE_WITH_XPU
-USE_MIR_PASS(generate_xpu_program_pass);
-#endif
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
@@ -45,3 +39,5 @@ USE_MIR_PASS(lite_quant_dequant_fuse_pass);
 USE_MIR_PASS(type_precision_cast_pass);
 USE_MIR_PASS(type_layout_cast_pass);
 USE_MIR_PASS(memory_optimize_pass);
+USE_MIR_PASS(npu_subgraph_pass);
+USE_MIR_PASS(xpu_subgraph_pass);
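Note: the registrations behind these two pass names are not part of this excerpt; presumably they follow the same REGISTER_MIR_PASS pattern as the generate_*_program_pass registrations removed below, roughly (the pass class name here is hypothetical):

    // Hypothetical sketch; the real class lives in subgraph_pass.cc, which
    // this excerpt omits.
    REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
        .BindTargets({TARGET(kNPU)});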
@@ -2,5 +2,4 @@ if(NOT LITE_WITH_NPU)
   return()
 endif()
-lite_cc_library(npu_runtime SRCS runtime.cc DEPS ${npu_runtime_libs})
-lite_cc_library(npu_builder SRCS builder.cc DEPS ${npu_builder_libs} npu_runtime tensor op scope)
+lite_cc_library(device_npu SRCS device.cc DEPS ${npu_builder_libs} ${npu_runtime_libs})
@@ -12,47 +12,56 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "lite/backends/npu/runtime.h"
-#include <string>
-#include <vector>
+#include "lite/backends/npu/device.h"
 #include "lite/utils/cp_logging.h"
 namespace paddle {
 namespace lite {
 namespace npu {
-// Create hiai model manager to load om model from lite tensor, and return the
-// manager and an unique model name
-bool LoadModel(const lite::Tensor &model_data,
-               std::shared_ptr<hiai::AiModelMngerClient> *model_client,
-               std::string *model_name) {
-  LOG(INFO) << "[NPU] Load model.";
-  auto model_data_ptr = model_data.data<int8_t>();
-  auto model_data_size = model_data.numel() * sizeof(int8_t);
-  if (model_data_ptr == nullptr || model_data_size == 0) {
-    return false;
+std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
+    std::string& model_name,                 // NOLINT
+    std::vector<ge::Operator>& input_nodes,  // NOLINT
+    std::vector<ge::Operator>& output_nodes  // NOLINT
+    ) {
+  VLOG(3) << "[NPU] Build model";
+  // Build the HiAI IR graph to the HiAI om model
+  ge::Graph ir_graph("graph");
+  ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
+  ge::Model om_model("model", "model");
+  om_model.SetGraph(ir_graph);
+  domi::HiaiIrBuild ir_build;
+  domi::ModelBufferData om_model_buf;
+  if (!ir_build.CreateModelBuff(om_model, om_model_buf)) {
+    LOG(WARNING) << "[NPU] CreateModelBuff failed!";
+    return nullptr;
   }
-  *model_client = std::make_shared<hiai::AiModelMngerClient>();
-  int ret = (*model_client)->Init(nullptr);
-  if (ret != hiai::AI_SUCCESS) {
-    LOG(WARNING) << "[NPU] AiModelMngerClient init failed(" << ret << ")!";
-    return false;
+  if (!ir_build.BuildIRModel(om_model, om_model_buf)) {
+    LOG(WARNING) << "[NPU] BuildIRModel failed!";
+    ir_build.ReleaseModelBuff(om_model_buf);
+    return nullptr;
   }
-  *model_name = "model.om";
+  // Create a HiAI model manager client to load the HiAI om model
+  std::unique_ptr<hiai::AiModelMngerClient> model_client(
+      new hiai::AiModelMngerClient());
+  if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
+    LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
+    ir_build.ReleaseModelBuff(om_model_buf);
+    return nullptr;
+  }
+  model_name = "model_" + std::to_string(model_count_++) + ".om";
   auto model_desc = std::make_shared<hiai::AiModelDescription>(
-      *model_name,
-      DeviceInfo::Global().freq_level(),
-      DeviceInfo::Global().framework_type(),
-      DeviceInfo::Global().model_type(),
-      DeviceInfo::Global().device_type());
-  model_desc->SetModelBuffer(model_data_ptr, model_data_size);
+      model_name, freq_level(), framework_type(), model_type(), device_type());
+  model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
   std::vector<std::shared_ptr<hiai::AiModelDescription>> model_descs;
   model_descs.push_back(model_desc);
-  if ((*model_client)->Load(model_descs) != hiai::AI_SUCCESS) {
+  if (model_client->Load(model_descs) != hiai::AI_SUCCESS) {
     LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!";
-    return false;
+    ir_build.ReleaseModelBuff(om_model_buf);
+    return nullptr;
   }
-  return true;
+  ir_build.ReleaseModelBuff(om_model_buf);
+  return model_client;
 }
 }  // namespace npu
...
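With this change the om model is built and loaded in a single call at execution time, instead of being serialized into a weight tensor during optimization. A hypothetical caller sketch (the subgraph kernel itself is not part of this excerpt):

    std::string model_name;
    std::vector<ge::Operator> device_inodes;  // filled by the NPU op bridges
    std::vector<ge::Operator> device_onodes;  // filled by the NPU op bridges
    auto model_client = paddle::lite::npu::Device::Global().Build(
        model_name, device_inodes, device_onodes);
    if (model_client == nullptr) {
      LOG(WARNING) << "[NPU] building the device program failed";
    }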
@@ -13,38 +13,47 @@
 // limitations under the License.
 #pragma once
 #include <memory>
 #include <string>
+#include <unordered_map>
+#include <vector>
 #include "ai_ddk_lib/include/HiAiModelManagerService.h"
-#include "lite/core/tensor.h"
+#include "ai_ddk_lib/include/hiai_ir_build.h"
 namespace paddle {
 namespace lite {
 namespace npu {
-class DeviceInfo {
+class Device {
  public:
-  static DeviceInfo &Global() {
-    static DeviceInfo x;
+  static Device& Global() {
+    static Device x;
     return x;
   }
-  DeviceInfo() {}
+  Device() {}
   int freq_level() { return freq_level_; }
   int framework_type() { return framework_type_; }
   int model_type() { return model_type_; }
   int device_type() { return device_type_; }
+  // Build the HiAI IR graph to om model, return HiAI model manager client to
+  // load om model and run inference.
+  std::unique_ptr<hiai::AiModelMngerClient> Build(
+      std::string& model_name,                 // NOLINT
+      std::vector<ge::Operator>& input_nodes,  // NOLINT
+      std::vector<ge::Operator>& output_nodes  // NOLINT
+      );  // NOLINT
  private:
   int freq_level_{3};
   int framework_type_{0};
   int model_type_{0};
   int device_type_{0};
+  int model_count_{0};
 };
-bool LoadModel(const lite::Tensor &model_data,
-               std::shared_ptr<hiai::AiModelMngerClient> *model_client,
-               std::string *model_name);
 }  // namespace npu
 }  // namespace lite
 }  // namespace paddle
@@ -2,5 +2,4 @@ if(NOT LITE_WITH_XPU)
   return()
 endif()
-lite_cc_library(xpu_runtime SRCS runtime.cc DEPS ${xpu_runtime_libs})
-lite_cc_library(xpu_builder SRCS builder.cc DEPS ${xpu_builder_libs} xpu_runtime tensor op scope)
+lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
@@ -12,33 +12,31 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "lite/backends/xpu/runtime.h"
-#include <vector>
+#include "lite/backends/xpu/device.h"
 #include "lite/utils/cp_logging.h"
 namespace paddle {
 namespace lite {
 namespace xpu {
-// Extract the model data and recover the XPU model for inference, the function
-// is called by the graph computing kernel when the graph op is executed.
-// Due to the lack of XPU APIs for loading and recovering the XPU model from
-// memory, the key name is obtained from the weight tensor of graph op, to get
-// the runtime object for inference from the global variable 'DeviceInfo'.
-// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op.
-bool LoadModel(const lite::Tensor &model,
-               std::shared_ptr<xtcl::network::xRuntimeInstance> *runtime) {
-  LOG(INFO) << "[XPU] Load Model.";
-  CHECK_GT(model.dims().production(), 0);
-  std::string name(reinterpret_cast<const char *>(model.data<int8_t>()));
-  LOG(INFO) << "[XPU] Model Name: " << name;
-  CHECK(runtime != nullptr);
-  *runtime = DeviceInfo::Global().Find(name);
-  if (*runtime == nullptr) {
-    LOG(WARNING) << "[XPU] Load Model failed!";
-    return false;
-  }
-  return true;
+std::unique_ptr<xtcl::network::xRuntimeInstance> Device::Build(
+    xtcl::network::xNetworkBuilder* builder,
+    xtcl::network::xTensorCompiler::ParamNDArrayMap* params,
+    std::vector<xtcl::xExpr*>* outputs) {
+  VLOG(3) << "[XPU] Build model";
+  CHECK(builder != nullptr);
+  CHECK(outputs != nullptr);
+  CHECK_GT(outputs->size(), 0);
+  // The XPU compiler build the graph and fill all of the constant params, only
+  // one output is supported now.
+  xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0]));
+  auto target = xtcl::Target::Create(device_name_);
+  auto compiler = xtcl::network::xTensorCompiler(network, target);
+  compiler.SetParams(*params);  // Set the data of constant tensors
+  compiler.Build();
+  return std::unique_ptr<xtcl::network::xRuntimeInstance>(
+      new xtcl::network::xRuntimeInstance(compiler.CreateRuntimeInstance()));
 }
 }  // namespace xpu
...
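As in the NPU case above, the XPU runtime is now compiled directly from the live XTCL builder rather than recovered from a serialized weight tensor. A hypothetical caller sketch (types follow the Build() signature in the header below):

    xtcl::network::xNetworkBuilder builder;
    xtcl::network::xTensorCompiler::ParamNDArrayMap params;
    std::vector<xtcl::xExpr*> output_nodes;  // filled by the XPU op bridges
    auto runtime = paddle::lite::xpu::Device::Global().Build(
        &builder, &params, &output_nodes);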
@@ -17,31 +17,34 @@
 #include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
-#include "lite/core/kernel.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/types.h"
 namespace paddle {
 namespace lite {
-namespace kernels {
 namespace xpu {
-class GraphCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+class Device {
  public:
-  using param_t = operators::GraphParam;
-  void PrepareForRun() override;
-  void Run() override;
-  virtual ~GraphCompute() = default;
+  static Device& Global() {
+    static Device x;
+    return x;
+  }
+  Device() {}
+  // Build the XPU graph to the XPU runtime, return the XPU runtime which can be
+  // used to run inference.
+  std::unique_ptr<xtcl::network::xRuntimeInstance> Build(
+      xtcl::network::xNetworkBuilder* builder,
+      xtcl::network::xTensorCompiler::ParamNDArrayMap* params,
+      std::vector<xtcl::xExpr*>* outputs);
  private:
-  std::shared_ptr<xtcl::network::xRuntimeInstance> runtime_{nullptr};
+  // Keep reserved fields
+  int device_id_{0};
+  std::string device_name_{"llvm"};
 };
 }  // namespace xpu
-}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
@@ -33,9 +33,9 @@ lite_cc_library(scope SRCS scope.cc DEPS tensor)
 lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
 if (LITE_WITH_ARM)
-  lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags NPU_DEPS npu_runtime)
+  lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags)
 else()
-  lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags XPU_DEPS xpu_runtime)
+  lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags)
 endif()
 #-------------------------------------------- GET CODE META INFO ------------------------------------------
...
@@ -5,6 +5,6 @@ endif()
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
@@ -14,13 +14,38 @@
 #include "lite/core/arena/framework.h"
 #include "lite/core/context.h"
+#include "lite/operators/subgraph_op.h"
 namespace paddle {
 namespace lite {
 namespace arena {
 void TestCase::CreateInstruction() {
-  auto op = LiteOpRegistry::Global().Create(op_desc().Type());
+  std::shared_ptr<lite::OpLite> op = nullptr;
+  if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) {
+    // Create a new block desc to wrap the original op desc
+    int sub_block_idx = 0;
+    auto sub_block_desc = new cpp::BlockDesc();
+    sub_block_desc->ClearOps();
+    sub_block_desc->ClearVars();
+    auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
+    *sub_block_op_desc = *op_desc_;
+    // Add the block desc into the subgraph op which used to replace the
+    // original op
+    op_desc_.reset(new cpp::OpDesc());
+    op_desc_->SetType("subgraph");
+    op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
+    op_desc_->SetInput("Inputs", op_desc_->input_vars());
+    op_desc_->SetOutput("Outputs", op_desc_->output_vars());
+    op_desc_->SetAttr<std::vector<std::string>>(
+        "input_data_names", sub_block_op_desc->input_vars());
+    op_desc_->SetAttr<std::vector<std::string>>(
+        "output_data_names", sub_block_op_desc->output_vars());
+    op = LiteOpRegistry::Global().Create(op_desc().Type());
+    static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
+  } else {
+    op = LiteOpRegistry::Global().Create(op_desc().Type());
+  }
   CHECK(op) << "no op for " << op_desc().Type();
   op->Attach(*op_desc_, inst_scope_);
   auto kernels = op->CreateKernels({place_});
@@ -68,6 +93,19 @@ void TestCase::PrepareInputsForInstruction() {
   }
 }
+TestCase::~TestCase() {
+  if (op_desc_->Type() == "subgraph") {
+    // Release the subblock desc of Subgraph op
+    auto subgraph_op = const_cast<operators::SubgraphOp*>(
+        static_cast<const operators::SubgraphOp*>(instruction_->op()));
+    CHECK(subgraph_op);
+    auto sub_block_desc = subgraph_op->GetSubBlock();
+    if (sub_block_desc) {
+      delete sub_block_desc;
+    }
+  }
+}
 }  // namespace arena
 }  // namespace lite
 }  // namespace paddle
@@ -42,7 +42,7 @@ class TestCase {
       : place_(place), scope_(new Scope), alias_(alias) {
     ctx_ = ContextScheduler::Global().NewContext(place_.target);
   }
-  virtual ~TestCase() {}
+  virtual ~TestCase();
   void Prepare() {
     PrepareScopes();
...
@@ -25,12 +25,6 @@
 #include "lite/backends/opencl/cl_context.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
-#ifdef LITE_WITH_NPU
-#include "lite/backends/npu/runtime.h"
-#endif
-#ifdef LITE_WITH_XPU
-#include "lite/backends/xpu/runtime.h"
-#endif
 #include <map>
 #include <memory>
@@ -93,7 +87,7 @@ template <>
 class Context<TargetType::kXPU> {
  public:
   Context() {}
-  explicit Context(const NPUContext& ctx);
+  explicit Context(const XPUContext& ctx);
   // NOTE: InitOnce should only be used by ContextScheduler
   void InitOnce() {}
   void CopySharedTo(XPUContext* ctx) {}
...
@@ -32,7 +32,7 @@ lite_cc_library(mir_passes
     demo_pass.cc
     runtime_context_assign_pass.cc
     memory_optimize_pass.cc
-    DEPS mir_pass types context ${mir_fusers} ${subgraph_passes})
+    DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
 # lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
 #mir_ssa_graph scope op
...
@@ -36,15 +36,6 @@ std::string Visualize(mir::SSAGraph* graph) {
   int id = 0;
   std::set<std::string> exists_args;
-  std::map<int, std::string> graph_col;  // Different colors of subgraphs
-  graph_col.insert({{1, "red"},
-                    {2, "green"},
-                    {3, "cyan"},
-                    {4, "bisque3"},
-                    {5, "coral"},
-                    {6, "darkseagreen1"},
-                    {7, "goldenrod1"},
-                    {8, "darkorchid"}});
   for (auto& node : graph->mutable_nodes()) {
     std::string key;
     if (node.IsArg()) {
@@ -52,24 +43,12 @@ std::string Visualize(mir::SSAGraph* graph) {
     } else {
       key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++);
     }
     if (node.IsStmt()) {
-      auto& stmt = node.AsStmt();
-      auto sub_id = stmt.subgraph_id();
-      auto it = graph_col.find(sub_id);
-      if (sub_id > 0 && it != graph_col.end()) {
-        dot.AddNode(key,
-                    {Dot::Attr("shape", "box"),
-                     Dot::Attr("style", "filled"),
-                     Dot::Attr("color", "black"),
-                     Dot::Attr("fillcolor", it->second)});
-      } else {
-        dot.AddNode(key,
-                    {Dot::Attr("shape", "box"),
-                     Dot::Attr("style", "filled"),
-                     Dot::Attr("color", "black"),
-                     Dot::Attr("fillcolor", "yellow")});
-      }
+      dot.AddNode(key,
+                  {Dot::Attr("shape", "box"),
+                   Dot::Attr("style", "filled"),
+                   Dot::Attr("color", "black"),
+                   Dot::Attr("fillcolor", "yellow")});
       for (auto& x : node.inlinks) {
         auto name = x->AsArg().name;
         if (!exists_args.count(name)) {
...
@@ -50,7 +50,7 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
       "lod_reset",
       "concat",
       "yolo_box",
-      "graph_op",
+      "subgraph",
       "feed",
       "fetch"};
   for (auto* tmp : node->inlinks) {
...
@@ -64,9 +64,6 @@ class Node {
       return valid_kernels_;
     }
-    void ClearSubgraphID() { subgraph_id_ = -1 /* note: not 0 */; }
-    void SetSubgraphID(int id) { subgraph_id_ = id; }
-    int subgraph_id() const { return subgraph_id_; }
     void SetOp(const std::shared_ptr<OpLite>& op) { op_ = op; }
     const std::shared_ptr<OpLite> op() const { return op_; }
@@ -82,11 +79,6 @@ class Node {
     // Description.
     std::string desc;
-   protected:
-    // -1 means not in subgraph, 0 means supported but not one id, id started
-    // from 1
-    int subgraph_id_{-1};
   };
   struct Arg {
...
+lite_cc_library(subgraph_detector
+    SRCS subgraph_detector.cc
+    DEPS mir_pass types subgraph_op)
 lite_cc_library(subgraph_pass
-    SRCS subgraph_program_pass.cc
-    DEPS mir_pass types ${mir_fusers})
-lite_cc_test(test_subgraph_pass SRCS subgraph_program_pass_test.cc
-  DEPS subgraph_pass mir_passes gflags model_parser cxx_api
-  ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL)
+    SRCS subgraph_pass.cc
+    DEPS mir_pass types context ${mir_fusers} subgraph_detector)
 if (WITH_TESTING)
-  add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v1_tar_gz)
-  add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz)
-  set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
-  set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-endif()
-set(subgraph_passes subgraph_pass)
-if(LITE_WITH_NPU)
-  lite_cc_library(npu_pass SRCS generate_npu_program_pass.cc
-    DEPS mir_pass types context ${mir_fusers} ${npu_bridges} graph_op subgraph_pass)
-  list(APPEND subgraph_passes npu_pass)
-  lite_cc_test(test_npu_pass SRCS generate_npu_program_pass_test.cc
-    DEPS npu_pass mir_passes paddle_api_full paddle_api_light gflags
-    ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1
-         --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL)
-  if (WITH_TESTING)
-    add_dependencies(test_npu_pass extern_lite_download_mobilenet_v1_tar_gz)
-    add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz)
-    set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
-    set_target_properties(test_npu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  endif()
-endif()
-if(LITE_WITH_XPU)
-  lite_cc_library(xpu_pass SRCS generate_xpu_program_pass.cc
-    DEPS mir_pass types context ${mir_fusers} ${xpu_bridges} ${xpu_builder_libs} graph_op subgraph_pass)
-  list(APPEND subgraph_passes xpu_pass)
-  lite_cc_test(test_xpu_pass SRCS generate_xpu_program_pass_test.cc
-    DEPS xpu_pass mir_passes paddle_api_full gflags
-    ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1
-         --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL)
-  if (WITH_TESTING)
-    add_dependencies(test_xpu_pass extern_lite_download_mobilenet_v1_tar_gz)
-    add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz)
-    set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
-    set_target_properties(test_xpu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  endif()
+  lite_cc_test(test_subgraph_detector
+    SRCS subgraph_detector_test.cc
+    DEPS subgraph_detector mir_passes gflags model_parser cxx_api
+    ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL)
+  add_dependencies(test_subgraph_detector
+    extern_lite_download_mobilenet_v1_tar_gz
+    extern_lite_download_mobilenet_v2_relu_tar_gz)
+  set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
+  set_target_properties(test_subgraph_detector PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  lite_cc_test(test_subgraph_pass
+    SRCS subgraph_pass_test.cc
+    DEPS mir_passes paddle_api_full paddle_api_light gflags
+    ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1
+         --optimized_model_dir=${LITE_MODEL_DIR}/lite_model_opt SERIAL)
+  add_dependencies(test_subgraph_pass
+    extern_lite_download_mobilenet_v1_tar_gz
+    extern_lite_download_mobilenet_v2_relu_tar_gz)
+  set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
+  set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
 endif()
-set(subgraph_passes ${subgraph_passes} CACHE INTERNAL "subgraph_passes")
-message(STATUS "----> subgraph_passes: ${subgraph_passes}")
+set(mir_subgraphs subgraph_pass CACHE INTERNAL "mir_subgraphs")
+message(STATUS "----> mir_subgraphs: ${mir_subgraphs}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/subgraph/generate_npu_program_pass.h"
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
#include "lite/backends/npu/builder.h"
#include "lite/kernels/npu/bridges/paddle_use_npu_bridges.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode(
lite::mir::Node* var_node, const Scope* scope) {
CHECK(var_node->IsArg());
const auto& arg = var_node->AsArg();
VLOG(4) << "[NPU] Convert var node " << arg.name;
auto* var = scope->FindVar(arg.name);
CHECK(var);
auto* tensor = var->GetMutable<lite::Tensor>();
CHECK(tensor);
auto dims = tensor->dims();
if (arg.is_weight) {
auto wgt = std::make_shared<ge::op::Const>(arg.name);
LOG(INFO) << "[NPU] Convert const var node " << arg.name;
VLOG(4) << dims;
wgt->set_attr_value(lite::npu::CvtTensor(tensor));
return wgt;
} else {
CHECK_EQ(dims.size(), 4);
LOG(INFO) << "[NPU] Convert data var node " << arg.name;
LOG(INFO) << dims;
// TODO(xxx): support more types and dims size
ge::TensorDesc desc(ge::Shape(dims.Vectorize()),
ge::Format::FORMAT_NCHW,
ge::DataType::DT_FLOAT);
// auto size = desc.GetShape().GetShapeSize();
// ge::TensorUtils::SetSize(desc, size*sizeof(float));
// ge::TensorUtils::SetRealDimCnt(desc, 4);
auto data = std::make_shared<ge::op::Data>(arg.name);
data->update_input_desc_x(desc);
return data;
}
return nullptr;
}
void GenerateNPUProgramPass::CvtAllOpNodes(
const std::vector<Node*>& nodes2cvt,
lite::kernels::npu::bridges::node_map_type* converted_vars) {
const auto& bridges = lite::kernels::npu::bridges::Factory::Instance();
const auto& cvtfunc_map = bridges.AllFunctions();
// return record all converted vars
// op node's inputs must be found in converted_vars
for (auto& node : nodes2cvt) {
lite::kernels::npu::bridges::node_map_type node_inputs;
auto& stmt = node->AsStmt();
for (auto& var_node : node->inlinks) {
auto& arg = var_node->AsArg();
// weight should be handled in the converter, so skip here
if (arg.is_weight) {
continue;
}
auto var_name = arg.name;
if (!converted_vars->count(var_name)) {
converted_vars->insert(
std::make_pair(var_name, CvtVarNode(var_node, stmt.op()->scope())));
}
node_inputs.insert(*converted_vars->find(var_name));
}
auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs);
converted_vars->insert(node_outputs.begin(), node_outputs.end());
}
}
std::string GenerateNPUProgramPass::BuildNPUGraph(
const std::unordered_set<Node*>& op_nodes,
const std::unordered_set<Node*>& in_data_vars,
const std::unordered_set<Node*>& out_data_vars,
int sub_id) {
auto ordered_nodes = GetTopologicalOrder(op_nodes);
lite::kernels::npu::bridges::node_map_type converted_vars;
CvtAllOpNodes(ordered_nodes, &converted_vars);
std::vector<std::string> in_var_names;
std::vector<std::string> out_var_names;
std::vector<ge::Operator> inputs;
std::vector<ge::Operator> outputs;
for (auto i : in_data_vars) {
auto argname = i->AsArg().name;
in_var_names.push_back(argname);
inputs.push_back(*converted_vars.at(argname));
}
for (auto i : out_data_vars) {
auto argname = i->AsArg().name;
out_var_names.push_back(argname);
outputs.push_back(*converted_vars.at(argname));
}
std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights";
auto any_op = (*op_nodes.begin())->AsStmt().op();
auto weight = any_op->scope()->Var(weight_var_name)->GetMutable<Tensor>();
weight->set_persistable(true);
weight->set_precision(PRECISION(kInt8));
  // Compile the IR graph to the NPU model and store the model data into the
  // weight tensor with persistable=true, so that the model parser can
  // recognize it and save it to the param files
if (!lite::npu::BuildModel(inputs, outputs, weight)) {
LOG(FATAL) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")";
} else {
LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")";
}
return weight_var_name;
}
void GenerateNPUProgramPass::GenNPUSubgraph(
const std::unique_ptr<SSAGraph>& graph,
const std::unordered_set<Node*>& op_nodes,
int sub_id) {
std::unordered_set<Node*> in_data_vars;
std::unordered_set<Node*> in_wgt_vars;
std::unordered_set<Node*> out_data_vars;
std::unordered_set<Node*> out_unused_vars;
FindInputOutputVars(
op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars);
auto weight_var_name =
BuildNPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id);
auto any_op = (*op_nodes.begin())->AsStmt().op();
InsertNewNode(graph,
weight_var_name,
any_op->scope(),
any_op->valid_places(),
in_data_vars,
in_wgt_vars,
out_data_vars,
out_unused_vars);
auto nodes2rm = GetNode2rm(
op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars});
GraphSafeRemoveNodes(graph.get(), nodes2rm);
}
void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
LOG(INFO) << "[NPU] Before NPU Pass \n" << Visualize(graph.get());
const auto& bridges = lite::kernels::npu::bridges::Factory::Instance();
const auto& op_map = bridges.AllFunctions();
std::vector<std::string> supported_op_types;
for (auto& i : op_map) {
LOG(INFO) << "[NPU] Supported type: " << i.first;
supported_op_types.push_back(i.first);
}
int num_subgraph = FuseSubgraph(graph, supported_op_types);
InferOnce(graph);
auto op_nodes_all = ClassifySubgraph(graph);
CHECK_EQ(op_nodes_all.size(), num_subgraph);
int id = 1;
for (auto& op_nodes : op_nodes_all) {
LOG(INFO) << "[NPU] Converting Subgraph " << id;
GenNPUSubgraph(graph, op_nodes.second, id);
LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n"
<< Visualize(graph.get());
id++;
}
}
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(generate_npu_program_pass,
paddle::lite::mir::subgraph::GenerateNPUProgramPass)
.BindTargets({TARGET(kNPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/subgraph/generate_xpu_program_pass.h"
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
#include "lite/backends/xpu/builder.h"
#include "lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
std::shared_ptr<xtcl::xExpr> GenerateXPUProgramPass::CvtVarNode(
lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx,
lite::mir::Node* var_node,
const Scope* scope) {
CHECK(var_node->IsArg());
const auto& arg = var_node->AsArg();
auto var_name = arg.name;
VLOG(4) << "[XPU] Convert var node " << var_name;
auto* var = scope->FindVar(var_name);
CHECK(var);
auto* tensor = var->GetMutable<lite::Tensor>();
CHECK(tensor);
auto dims = tensor->dims();
auto cvted_var_node =
std::make_shared<xtcl::xExpr>(graph_ctx->builder->CreateTensor(
var_name, lite::xpu::CvtShape(dims), ::xtcl::Float(32)));
if (arg.is_weight) {
auto cvted_var_tensor = lite::xpu::CvtTensor(tensor);
graph_ctx->params->emplace(std::make_pair(var_name, *cvted_var_tensor));
}
return cvted_var_node;
}
void GenerateXPUProgramPass::CvtAllOpNodes(
const std::vector<Node*>& op_nodes,
lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx,
lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes) {
const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance();
const auto& supported_lists = bridges.AllFunctions();
// return record all converted vars
// op node's inputs must be found in converted_vars
for (auto& node : op_nodes) {
lite::kernels::xpu::bridges::node_map_type input_nodes;
auto& stmt = node->AsStmt();
for (auto& var_node : node->inlinks) {
auto& arg = var_node->AsArg();
// weight should be handled in the converter, so skip here
if (arg.is_weight) {
continue;
}
auto var_name = arg.name;
if (!cvted_var_nodes->count(var_name)) {
cvted_var_nodes->insert(std::make_pair(
var_name, CvtVarNode(graph_ctx, var_node, stmt.op()->scope())));
}
input_nodes.insert(*cvted_var_nodes->find(var_name));
}
auto output_nodes =
supported_lists.at(stmt.op_type())(stmt.op(), graph_ctx, input_nodes);
cvted_var_nodes->insert(output_nodes.begin(), output_nodes.end());
}
}
std::string GenerateXPUProgramPass::BuildXPUGraph(
const std::unordered_set<Node*>& op_nodes,
const std::unordered_set<Node*>& in_data_vars,
const std::unordered_set<Node*>& out_data_vars,
int sub_id) {
auto ordered_op_nodes = GetTopologicalOrder(op_nodes);
lite::kernels::xpu::bridges::graph_ctx_type graph_ctx;
graph_ctx.builder = std::make_shared<xtcl::network::xNetworkBuilder>();
graph_ctx.params =
std::make_shared<xtcl::network::xTensorCompiler::ParamNDArrayMap>();
lite::kernels::xpu::bridges::node_map_type cvted_var_nodes;
CvtAllOpNodes(ordered_op_nodes, &graph_ctx, &cvted_var_nodes);
std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights";
auto any_op = (*op_nodes.begin())->AsStmt().op();
auto weight = any_op->scope()->Var(weight_var_name)->GetMutable<Tensor>();
weight->set_persistable(true);
weight->set_precision(PRECISION(kInt8));
  // Compile the graph to the XPU model and store the model data into the
  // weight tensor with persistable=true, so that the model parser can
  // recognize it and save it to the param files
std::vector<std::shared_ptr<xtcl::xExpr>> ordered_cvted_var_nodes;
for (auto out_data_var : out_data_vars) {
auto var_name = out_data_var->AsArg().name;
ordered_cvted_var_nodes.push_back(cvted_var_nodes[var_name]);
}
if (!lite::xpu::BuildModel(graph_ctx.builder,
graph_ctx.params,
&ordered_cvted_var_nodes,
weight)) {
LOG(FATAL) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")";
} else {
LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")";
}
return weight_var_name;
}
void GenerateXPUProgramPass::GenXPUSubgraph(
const std::unique_ptr<SSAGraph>& graph,
const std::unordered_set<Node*>& op_nodes,
int sub_id) {
std::unordered_set<Node*> in_data_vars;
std::unordered_set<Node*> in_wgt_vars;
std::unordered_set<Node*> out_data_vars;
std::unordered_set<Node*> out_unused_vars;
FindInputOutputVars(
op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars);
auto weight_var_name =
BuildXPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id);
auto any_op = (*op_nodes.begin())->AsStmt().op();
InsertNewNode(graph,
weight_var_name,
any_op->scope(),
any_op->valid_places(),
in_data_vars,
in_wgt_vars,
out_data_vars,
out_unused_vars);
auto nodes2rm = GetNode2rm(
op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars});
GraphSafeRemoveNodes(graph.get(), nodes2rm);
}
void GenerateXPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
LOG(INFO) << "[XPU] Before XPU Pass \n" << Visualize(graph.get());
const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance();
const auto& op_map = bridges.AllFunctions();
std::vector<std::string> supported_op_types;
for (auto& i : op_map) {
LOG(INFO) << "[XPU] Supported type: " << i.first;
supported_op_types.push_back(i.first);
}
int num_subgraph = FuseSubgraph(graph, supported_op_types);
InferOnce(graph);
auto op_nodes_all = ClassifySubgraph(graph);
CHECK_EQ(op_nodes_all.size(), num_subgraph);
int id = 1;
for (auto& op_nodes : op_nodes_all) {
LOG(INFO) << "[XPU] Converting Subgraph " << id;
GenXPUSubgraph(graph, op_nodes.second, id);
LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n"
<< Visualize(graph.get());
id++;
}
}
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(generate_xpu_program_pass,
paddle::lite::mir::subgraph::GenerateXPUProgramPass)
.BindTargets({TARGET(kXPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(model_file, "", "model file path of combined protobuf model");
DEFINE_string(params_file, "", "params file path of combined protobuf model");
DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model");
DEFINE_string(input_tensor_shape, "1,3,224,224", "shapes of input tensors");
DEFINE_int32(output_tensor_num, 1, "number of output tensors");
namespace paddle {
namespace lite {
std::vector<std::vector<int64_t>> ParseShape(std::string txt) {
std::vector<std::vector<int64_t>> shape;
while (!txt.empty()) {
size_t idx = txt.find_first_of(":");
std::string dims = txt.substr(0, idx);
std::vector<int64_t> s;
while (!dims.empty()) {
size_t idx = dims.find_first_of(",");
int d = atoi(dims.substr(0, idx).c_str());
VLOG(3) << d;
s.push_back(d);
if (idx == std::string::npos) {
break;
} else {
dims = dims.substr(idx + 1);
}
}
shape.push_back(s);
if (idx == std::string::npos) {
break;
} else {
txt = txt.substr(idx + 1);
}
}
return shape;
}
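// For example, given the accepted formats ("D,D,..." groups joined by ":"):
//   ParseShape("1,3,224,224")      returns {{1, 3, 224, 224}}
//   ParseShape("1,3,224,224:1,80") returns {{1, 3, 224, 224}, {1, 80}}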
int64_t ShapeProduction(std::vector<int64_t> shape) {
int64_t s = 1;
for (int64_t dim : shape) {
s *= dim;
}
return s;
}
void FillInputTensor(
const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
const std::vector<std::vector<int64_t>>& input_tensor_shape,
const float value) {
for (int i = 0; i < input_tensor_shape.size(); i++) {
auto input_tensor = predictor->GetInput(i);
input_tensor->Resize(input_tensor_shape[i]);
auto input_tensor_data = input_tensor->mutable_data<float>();
auto input_tensor_size = ShapeProduction(input_tensor->shape());
for (int j = 0; j < input_tensor_size; j++) {
input_tensor_data[j] = value;
}
}
}
void CompareOutputTensor(
const std::shared_ptr<lite_api::PaddlePredictor>& tar_predictor,
const std::shared_ptr<lite_api::PaddlePredictor>& ref_predictor,
const int output_tensor_num) {
for (int i = 0; i < output_tensor_num; i++) {
auto tar_output_tensor = tar_predictor->GetOutput(i);
auto ref_output_tensor = ref_predictor->GetOutput(i);
auto tar_output_tensor_data = tar_output_tensor->data<float>();
auto ref_output_tensor_data = ref_output_tensor->data<float>();
auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape());
auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape());
EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size);
for (size_t j = 0; j < ref_output_tensor_size; j++) {
auto diff =
std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) /
(std::fabs(ref_output_tensor_data[j]) + 1e-6);
VLOG(3) << diff;
EXPECT_LT(diff, 0.1);
}
}
}
std::shared_ptr<lite_api::PaddlePredictor> TestModel(
const std::string& model_dir,
const std::string& model_file,
const std::string& params_file,
const std::vector<lite_api::Place>& valid_places,
const std::vector<std::vector<int64_t>>& input_tensor_shape,
const std::string& optimized_model_dir) {
// generate optimized model
lite_api::CxxConfig cxx_config;
cxx_config.set_model_dir(model_dir);
cxx_config.set_model_file(model_file);
cxx_config.set_param_file(params_file);
cxx_config.set_valid_places(valid_places);
auto predictor = lite_api::CreatePaddlePredictor(cxx_config);
FillInputTensor(predictor, input_tensor_shape, -1);
predictor->SaveOptimizedModel(optimized_model_dir,
lite_api::LiteModelType::kNaiveBuffer);
#if 0 // TODO(hong19860320) supports light api for XPU
// load optimized model
lite_api::MobileConfig mobile_config;
mobile_config.set_model_dir(optimized_model_dir);
mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
mobile_config.set_threads(1);
predictor = lite_api::CreatePaddlePredictor(mobile_config);
FillInputTensor(predictor, input_tensor_shape, 1);
#endif
// run optimized model
for (int i = 0; i < FLAGS_warmup; i++) {
predictor->Run();
}
for (int i = 0; i < FLAGS_repeats; i++) {
auto start = GetCurrentUS();
predictor->Run();
LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
}
return predictor;
}
TEST(XPUSubgraph, compare) {
// parsing input tensor shape, supported formats: "1,3,224,224"
// "1,3,224,224:1,80"
std::vector<std::vector<int64_t>> input_tensor_shape =
ParseShape(FLAGS_input_tensor_shape);
// generate and run optimized CPU model
LOG(INFO) << " ================ CPU ================== ";
auto cpu_predictor =
TestModel(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
{lite_api::Place{TARGET(kX86), PRECISION(kFloat)}},
input_tensor_shape,
FLAGS_optimized_model_dir + "/CPU");
// generate and run optimized XPU model
LOG(INFO) << " ================ XPU ================== ";
auto xpu_predictor =
TestModel(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
{lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)}},
input_tensor_shape,
FLAGS_optimized_model_dir + "/XPU");
// verify results
CompareOutputTensor(xpu_predictor, cpu_predictor, FLAGS_output_tensor_num);
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/subgraph/subgraph_detector.h"
#include <memory>
#include <set>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/dot.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
#include "lite/operators/subgraph_op.h"
namespace paddle {
namespace lite {
namespace mir {
using inference::analysis::Dot;
std::string SubgraphVisualizer::operator()() {
inference::analysis::Dot dot;
const std::vector<std::string> subgraph_colors{
"red", "green", "cyan", "bisque3",
"coral", "darkseagreen1", "goldenrod1", "darkorchid",
"antiquewhite", "aquamarine", "azure", "bisque4",
"blue2", "brown1", "burlywood1", "cadetblue1",
"chartreuse1", "chocolate1", "coral1", "cornsilk",
"crimson", "cyan4", "darkgoldenrod4", "darkolivegreen2",
"darkorange2", "darkorchid2", "darkseagreen3", "darkslategray",
"deeppink2", "deepskyblue2", "dodgerblue", "firebrick",
"floralwhite", "gold1", "skyblue3", "indianred",
"indigo", "lavenderblush2", "lightblue1", "lightsalmon3",
"khaki1", "ivory4", "sandybrown", "olivedrab2",
"turquoise4", "snow3", "sienna4", "salmon2",
};
std::unordered_map<Node *, int> subgraph_indices;
for (int i = 0; i < subgraphs_.size(); i++) {
for (int j = 0; j < subgraphs_[i].size(); j++) {
subgraph_indices[subgraphs_[i][j]] = i;
}
}
std::unordered_map<std::string, int> exists_ops;
std::set<std::string> exists_args;
for (auto &node : graph_->StmtTopologicalOrder()) {
if (!node->IsStmt()) {
continue;
}
auto op_type = node->AsStmt().op_type();
if (!exists_ops.count(op_type)) {
exists_ops[op_type] = 0;
} else {
exists_ops[op_type]++;
}
auto op_name = op_type + std::to_string(exists_ops[op_type]);
std::string op_color = "white";
if (subgraph_indices.count(node)) {
auto subgraph_idx = subgraph_indices[node];
op_name += "_subgraph_" + std::to_string(subgraph_idx);
op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()];
}
dot.AddNode(op_name,
{Dot::Attr("shape", "box"),
Dot::Attr("style", "filled"),
Dot::Attr("color", "black"),
Dot::Attr("fillcolor", op_color)});
for (auto &in_node : node->inlinks) {
auto arg_name = in_node->AsArg().name;
if (!exists_args.count(arg_name)) {
dot.AddNode(arg_name, {});
exists_args.insert(arg_name);
}
dot.AddEdge(arg_name, op_name, {});
}
for (auto &out_node : node->outlinks) {
auto arg_name = out_node->AsArg().name;
if (!exists_args.count(arg_name)) {
dot.AddNode(arg_name, {});
exists_args.insert(arg_name);
}
dot.AddEdge(op_name, arg_name, {});
}
}
auto res = dot.Build();
VLOG(3) << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl;
return res;
}
// Find the root ancestor of this node's union-find set
SubgraphDetector::node_dat_t *
SubgraphDetector::node_dat_t::UnionFindAncestor() {
node_dat_t *ancestor = this;
while (ancestor->union_find_parent != ancestor) {
ancestor = ancestor->union_find_parent;
}
return ancestor;
}
// Merge two adjacent nodes into one node.
// Suppose we have two adjacent nodes src and dst.
// We will perform the following operations:
// 1. add all inputs (except src) of dst to src's inlinks.
// 2. add all outputs of dst to src's outlinks.
// 3. redirect the corresponding inlinks and outlinks of dst's inputs and
//    outputs to the src node.
// 4. delete all of dst's inlinks and outlinks.
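// For example, merging src -> dst in the chain a -> src -> dst -> b leaves
// the combined node (rooted at src) with inlinks {a} and outlinks {b}, and
// b's inlink is redirected from dst to src.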
void SubgraphDetector::node_dat_t::UnionFindCombine(node_dat_t *candidate) {
  // Make these two nodes share the same ancestor.
union_find_parent = UnionFindAncestor();
node_dat_t *candidate_ancestor = candidate->UnionFindAncestor();
candidate_ancestor->union_find_parent = union_find_parent;
candidate->union_find_parent = union_find_parent;
// Obtain the input and output nodes for the combined one
std::unordered_set<node_dat_t *> inputs(inlinks.begin(), inlinks.end());
std::unordered_set<node_dat_t *> outputs(candidate->outlinks.begin(),
candidate->outlinks.end());
for (auto *out_node : outlinks) {
if (out_node != candidate) {
outputs.insert(out_node);
}
}
for (auto *in_node : candidate->inlinks) {
if (in_node != this) {
inputs.insert(in_node);
}
}
// Update the dst and src node's inlinks and outlinks.
#ifdef __clang__
inlinks = node_set_t(inputs.begin(), inputs.end());
outlinks = node_set_t(outputs.begin(), outputs.end());
candidate->inlinks.clear();
candidate->outlinks.clear();
#else
inlinks = std::move(node_set_t(inputs.begin(), inputs.end()));
outlinks = std::move(node_set_t(outputs.begin(), outputs.end()));
candidate->inlinks.clear();
candidate->outlinks.clear();
#endif
  // Redirect the inlinks and outlinks of dst's inputs and outputs to the
  // src node.
for (auto *in_node : inlinks) {
for (auto *&out_node : in_node->outlinks) {
if (out_node == candidate) {
out_node = this;
}
}
}
for (auto *out_node : outlinks) {
for (auto *&in_node : out_node->inlinks) {
if (in_node == candidate) {
in_node = this;
}
}
}
}
// FlexibleDFS
// If reverse is true, do a reverse DFS (walk inlinks instead of outlinks).
// If the enter func is not nullptr, calls enter(node) before visiting any
// children of node.
// If the leave func is not nullptr, calls leave(node) after visiting all
// children of node.
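// For example, ExtractSubgraphs() uses it to test whether `node` is
// reachable from `source_nodes` against the edge direction:
//   bool found = false;
//   FlexibleDFS(source_nodes, /*reverse=*/true, /*enter=*/nullptr,
//               /*leave=*/[&found, node](const node_dat_t *n) {
//                 if (n == node) {
//                   found = true;
//                   return false;  // abort the traversal early
//                 }
//                 return true;
//               });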
void SubgraphDetector::FlexibleDFS(
const node_set_t &source,
bool reverse,
const std::function<bool(const node_dat_t *)> &enter,
const std::function<bool(const node_dat_t *)> &leave) {
std::vector<std::pair<const node_dat_t *, bool>> stack; // node, leave
for (auto &node : source) {
stack.push_back(std::pair<const node_dat_t *, bool>(node, false));
}
std::unordered_set<const node_dat_t *> visited;
while (!stack.empty()) {
auto top = stack.back();
stack.pop_back();
if (top.second) {
if (leave && !leave(top.first)) return;
}
if (visited.count(top.first)) continue;
visited.insert(top.first);
if (enter && !enter(top.first)) return;
if (leave)
stack.push_back(std::pair<const node_dat_t *, bool>(top.first, true));
const node_set_t iter_nodes =
        reverse ? top.first->inlinks : top.first->outlinks;
for (auto *node : iter_nodes) {
if (!visited.count(node)) {
stack.push_back(std::pair<const node_dat_t *, bool>(node, false));
}
}
}
}
void SubgraphDetector::InitNodes(node_map_t *nodes) {
// Initialize and mark the subgraph detector nodes based on teller.
for (auto &it : *nodes) {
for (auto &in_node : it.first->inlinks) {
it.second->inlinks.push_back((*nodes)[in_node]);
}
for (auto &out_node : it.first->outlinks) {
it.second->outlinks.push_back((*nodes)[out_node]);
}
if (teller_(it.first)) {
it.second->marked = true;
if (it.first->IsStmt()) {
      // If a function is inside the subgraph, mark all of its output
      // variables as inside too, so that two marked functions end up in the
      // same subgraph. For example, given A_function->var->B_function, if
      // A_function is marked, var should also be marked, so that B_function
      // falls into the same subgraph as A_function when B_function is also
      // marked.
for (auto &out_node : it.first->outlinks) {
(*nodes)[out_node]->marked = true;
}
}
}
}
}
std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
node_map_t *nodes) {
for (auto &it : *nodes) {
node_dat_t *node = it.second;
if (!node->marked) {
continue;
}
    // Our algorithm must guarantee that:
    // 1. The graph is always a directed acyclic graph (DAG).
    // 2. If there is a path in the subgraph from X to Y (X and Y are both
    //    nodes in the subgraph), then all paths from X to Y are in the
    //    subgraph.
    //
    // To achieve the above guarantees, for adjacent nodes src -> dst:
    // 1. Get all of dst's input nodes except src.
    // 2. Reverse DFS from those input nodes.
    // 3. If there is a path from the input nodes to src, then src and dst
    //    can not be fused into one node; otherwise they can.
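    // For example, with edges src -> dst, src -> M and M -> dst where M is
    // not marked, the reverse DFS from M (dst's other input) reaches src,
    // so fusing src and dst is rejected: the path src -> M -> dst would run
    // through a node outside the merged subgraph and create a cycle.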
while (true) {
std::unordered_set<node_dat_t *> contract_nodes;
for (auto *out_node : node->outlinks) {
        // must be a candidate
if (!out_node->marked) continue;
        // Get all of dst's input nodes except the src node.
node_set_t source_nodes;
for (auto *in_node : out_node->inlinks) {
if (in_node != node) {
source_nodes.push_back(in_node);
}
}
// Reverse DFS from the source_nodes.
bool have_excess_path = false;
FlexibleDFS(source_nodes,
true,
nullptr,
[&have_excess_path, node](const node_dat_t *n) {
if (n == node) {
have_excess_path = true;
return false;
}
return true;
});
if (have_excess_path) continue;
contract_nodes.insert(out_node);
}
if (contract_nodes.empty()) break;
for (auto &contract_node : contract_nodes) {
node->UnionFindCombine(contract_node);
}
}
}
std::unordered_map<node_dat_t * /*ancestor*/, std::vector<Node *>> clusters;
for (auto &node : graph_->StmtTopologicalOrder()) {
if (!node->IsStmt()) continue;
if ((*nodes)[node]->marked) {
clusters[(*nodes)[node]->UnionFindAncestor()].push_back(node);
}
}
std::vector<std::vector<Node *>> subgraphs;
std::for_each(clusters.begin(),
clusters.end(),
[&](const decltype(clusters)::value_type &it) {
subgraphs.push_back(it.second);
});
return subgraphs;
}
std::vector<std::vector<Node *>> SubgraphDetector::operator()() {
node_map_t nodes;
for (auto &node : graph_->mutable_nodes()) {
nodes[&node] = new node_dat_t(&node);
CHECK(nodes[&node]);
}
// Initialize and mark the subgraph detector nodes based on teller.
InitNodes(&nodes);
// Run the Extract algorithm to find all subgraphs.
std::vector<std::vector<Node *>> subgraphs = ExtractSubgraphs(&nodes);
for (auto &it : nodes) {
CHECK(it.second);
delete it.second;
}
return subgraphs;
}
void SubgraphFuser::InsertNewNode(SSAGraph *graph,
int subgraph_idx,
const std::vector<Node *> &subgraph_nodes) {
// Create and attach a new subgraph op
cpp::OpDesc subgraph_op_desc;
subgraph_op_desc.SetType("subgraph");
  // Create a new sub block desc for storing all of the Ops and Vars of the
  // target subgraph; sub_block_idx is set as an attribute of the subgraph
  // op, and sub_block_idx < 0 means it's a new subgraph op.
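  // e.g. the first subgraph (subgraph_idx == 0) gets sub_block_idx == -1;
  // the negative index is replaced with a real block index when the
  // optimized model is saved (see RuntimeProgram::SaveOpInfosToProgram).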
int sub_block_idx = -(subgraph_idx + 1);
auto sub_block_desc = new cpp::BlockDesc();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
for (auto &op_node : subgraph_nodes) {
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_node->AsStmt().op_info();
sub_block_op_desc->SetAttr(
kKernelTypeAttr,
op_node->AsStmt().picked_kernel().SerializedKernelType());
}
subgraph_op_desc.SetAttr<int32_t>("sub_block", sub_block_idx);
// Extract input and output nodes from the target subgraph
std::unordered_set<Node *> input_var_nodes;
std::unordered_set<Node *> weight_var_nodes;
std::unordered_set<Node *> output_var_nodes;
std::unordered_set<Node *> local_var_nodes;
std::unordered_set<Node *> unused_var_nodes;
ExtractInputsOutputs(subgraph_nodes,
&input_var_nodes,
&weight_var_nodes,
&output_var_nodes,
&local_var_nodes,
&unused_var_nodes);
  // Set the input and output name mappings, which store the real inputs and
  // outputs
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (auto &var_node : input_var_nodes) {
input_var_names.push_back(var_node->AsArg().name);
}
for (auto &var_node : output_var_nodes) {
output_var_names.push_back(var_node->AsArg().name);
}
subgraph_op_desc.SetAttr<std::vector<std::string>>("input_data_names",
input_var_names);
subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names",
output_var_names);
  // Add all of the inputs and outputs to the target subgraph op, to prevent
  // the vars from being removed in RuntimeProgram::UpdateVarsOfProgram()
for (auto &var_node : weight_var_nodes) {
input_var_names.push_back(var_node->AsArg().name);
}
for (auto &var_node : local_var_nodes) {
output_var_names.push_back(var_node->AsArg().name);
}
for (auto &var_node : unused_var_nodes) {
output_var_names.push_back(var_node->AsArg().name);
}
subgraph_op_desc.SetInput("Inputs", input_var_names);
subgraph_op_desc.SetOutput("Outputs", output_var_names);
auto subgraph_op = LiteOpRegistry::Global().Create("subgraph");
static_cast<operators::SubgraphOp *>(subgraph_op.get())
->SetSubBlock(sub_block_desc);
auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
subgraph_op->Attach(subgraph_op_desc, any_op->scope());
// Create and add a new subgraph node into the graph
auto subgraph_op_node =
graph->GraphCreateInstructNode(subgraph_op, any_op->valid_places());
for (auto &var_node : input_var_nodes) {
IR_NODE_LINK_TO(var_node, subgraph_op_node);
}
for (auto &var_node : weight_var_nodes) {
IR_NODE_LINK_TO(var_node, subgraph_op_node);
}
for (auto &var_node : output_var_nodes) {
IR_OP_VAR_LINK(subgraph_op_node, var_node);
}
for (auto &var_node : local_var_nodes) {
IR_OP_VAR_LINK(subgraph_op_node, var_node);
}
for (auto &var_node : unused_var_nodes) {
IR_OP_VAR_LINK(subgraph_op_node, var_node);
}
// Create and assign the context to the picked kernel of the new subgraph
// node
auto &inst = subgraph_op_node->AsStmt();
inst.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(inst.picked_kernel().target()));
// Remove subgraph nodes and unused var nodes
auto nodes2rm = GetNodes2RM(subgraph_nodes,
{input_var_nodes,
weight_var_nodes,
output_var_nodes,
local_var_nodes,
unused_var_nodes});
GraphSafeRemoveNodes(graph, nodes2rm);
}
void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph,
const SubgraphTeller &teller,
int min_subgraph_size) {
std::vector<std::vector<Node *>> subgraphs =
SubgraphDetector(graph, teller)();
SubgraphVisualizer(graph, subgraphs)();
for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) {
if (subgraphs[subgraph_idx].size() >= min_subgraph_size) {
InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]);
}
}
}
void SubgraphFuser::operator()() {
ReplaceNodesWithSubgraphs(graph_, teller_, min_subgraph_size_);
}
void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
std::unordered_set<Node *> *input_var_nodes,
std::unordered_set<Node *> *weight_var_nodes,
std::unordered_set<Node *> *output_var_nodes,
std::unordered_set<Node *> *local_var_nodes,
std::unordered_set<Node *> *unused_var_nodes) {
for (auto &op_node : op_nodes) {
for (auto &var_node : op_node->inlinks) {
if (var_node->AsArg().is_weight) {
weight_var_nodes->insert(var_node);
continue;
}
if (!var_node->inlinks.empty()) {
        // A var can only be produced by one op node, so use front()
auto *prev_op_node = var_node->inlinks.front();
if (std::find(op_nodes.begin(), op_nodes.end(), prev_op_node) !=
op_nodes.end()) {
continue;
}
}
input_var_nodes->insert(var_node);
}
for (auto &var_node : op_node->outlinks) {
if (var_node->outlinks.empty()) {
        // This var has no consumer op, so it is actually unused
unused_var_nodes->insert(var_node);
continue;
}
      // A var can have more than one consumer op node, so if any consumer
      // is inside op_nodes, treat the var as a local var and continue
bool next_op_in_nodes = false;
for (auto &next_op_node : var_node->outlinks) {
if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) !=
op_nodes.end()) {
next_op_in_nodes = true;
}
}
if (next_op_in_nodes) {
local_var_nodes->insert(var_node);
continue;
}
output_var_nodes->insert(var_node);
}
}
}
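// Collect the op nodes of the subgraph together with all of their adjacent
// var nodes as removal candidates, then exclude the var nodes that are kept
// alive as the inputs and outputs of the new subgraph op.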
std::unordered_set<const Node *> GetNodes2RM(
const std::vector<Node *> &op_nodes,
const std::vector<std::unordered_set<Node *>> &excluded_var_nodes) {
std::unordered_set<const Node *> nodes2rm(op_nodes.begin(), op_nodes.end());
for (auto &op_node : op_nodes) {
for (auto &var_node : op_node->inlinks) {
if (!nodes2rm.count(var_node)) {
nodes2rm.insert(var_node);
}
}
for (auto &var_node : op_node->outlinks) {
if (!nodes2rm.count(var_node)) {
nodes2rm.insert(var_node);
}
}
}
// Excluded nodes should not be removed
for (auto &excluded_var_node : excluded_var_nodes) {
for (auto &var_node : excluded_var_node) {
if (nodes2rm.count(var_node)) {
nodes2rm.erase(var_node);
}
}
}
return nodes2rm;
}
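// Post-order DFS helper: the producer op of each of the node's input vars is
// visited first, so a node is appended to ordered_nodes only after all of
// its predecessors within unordered_nodes.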
static void SortHelper(Node *node,
const std::unordered_set<Node *> &unordered_nodes,
std::unordered_set<const Node *> *visited_nodes,
std::vector<Node *> *ordered_nodes) {
for (auto &var_node : node->inlinks) {
if (var_node->inlinks.empty()) continue;
auto *op_node = var_node->inlinks.front();
if (unordered_nodes.count(op_node) && !visited_nodes->count(op_node)) {
SortHelper(op_node, unordered_nodes, visited_nodes, ordered_nodes);
}
}
ordered_nodes->push_back(node);
visited_nodes->insert(node);
}
std::vector<Node *> GetTopologicalOrder(
const std::unordered_set<Node *> &unordered_nodes) {
std::unordered_set<const Node *> visited_nodes;
std::vector<Node *> ordered_nodes;
for (auto &node : unordered_nodes) {
if (!node->IsStmt()) continue;
if (visited_nodes.count(node)) continue;
SortHelper(node, unordered_nodes, &visited_nodes, &ordered_nodes);
}
return ordered_nodes;
}
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
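// A teller returns true if the given op node is supported and may be placed
// inside a subgraph.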
using SubgraphTeller = std::function<bool(Node*)>;
class SubgraphVisualizer {
public:
SubgraphVisualizer(SSAGraph* graph,
const std::vector<std::vector<Node*>>& subgraphs)
: graph_(graph), subgraphs_(subgraphs) {}
std::string operator()();
protected:
SSAGraph* graph_{nullptr};
std::vector<std::vector<Node*>> subgraphs_;
};
/*
* Divide the graph into subgraphs according to the specified conditions.
 * Return the divided clusters; a cluster consists of the op nodes of a
 * subgraph.
*/
class SubgraphDetector {
public:
  // This is a simple representation of a graph. Each node_dat_t holds a
  // pointer to a Node, so the original graph is not changed during graph
  // analysis.
struct node_dat_t;
using node_map_t = std::unordered_map<Node*, node_dat_t*>;
using node_set_t = std::vector<node_dat_t*>;
struct node_dat_t {
explicit node_dat_t(Node* _node) : node(_node) {}
Node* node;
bool marked{false};
node_dat_t* union_find_parent{this};
node_set_t inlinks{};
node_set_t outlinks{};
node_dat_t* UnionFindAncestor();
void UnionFindCombine(node_dat_t* candidate);
};
SubgraphDetector(SSAGraph* graph, const SubgraphTeller& teller)
: graph_(graph), teller_(teller) {}
std::vector<std::vector<Node*>> operator()();
void FlexibleDFS(const node_set_t& source,
bool reverse,
const std::function<bool(const node_dat_t*)>& enter,
const std::function<bool(const node_dat_t*)>& leave);
void InitNodes(node_map_t* nodes);
std::vector<std::vector<Node*>> ExtractSubgraphs(node_map_t* nodes);
protected:
SSAGraph* graph_{nullptr};
SubgraphTeller teller_;
};
/*
 * Replace all subgraphs with subgraph ops. A block desc is added into each
 * subgraph op to wrap the original op nodes, and all var nodes of the
 * original op nodes are kept as the inputs and outputs of the subgraph op.
*/
class SubgraphFuser {
public:
SubgraphFuser(SSAGraph* graph,
const SubgraphTeller& teller,
int min_subgraph_size)
: graph_(graph), teller_(teller), min_subgraph_size_{min_subgraph_size} {}
void operator()();
  // Remove the op nodes of the subgraphs and replace them with the subgraph
  // ops.
void ReplaceNodesWithSubgraphs(SSAGraph* graph,
const SubgraphTeller& teller,
int min_subgraph_size);
// Create a subgraph node with a block desc to wrap the original op nodes of
// the subgraph
void InsertNewNode(SSAGraph* graph,
int subgraph_idx,
const std::vector<Node*>& subgraph_nodes);
protected:
SSAGraph* graph_{nullptr};
SubgraphTeller teller_;
int min_subgraph_size_;
};
void ExtractInputsOutputs(const std::vector<Node*>& op_nodes,
std::unordered_set<Node*>* input_var_nodes,
std::unordered_set<Node*>* weight_var_nodes,
std::unordered_set<Node*>* output_var_nodes,
std::unordered_set<Node*>* local_var_nodes,
std::unordered_set<Node*>* unused_var_nodes);
std::unordered_set<const Node*> GetNodes2RM(
const std::vector<Node*>& op_nodes,
const std::vector<std::unordered_set<Node*>>& excluded_var_nodes);
std::vector<Node*> GetTopologicalOrder(
const std::unordered_set<Node*>& unordered_nodes);
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -12,68 +12,25 @@ ...@@ -12,68 +12,25 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/core/mir/subgraph/subgraph_program_pass.h" #include "lite/core/mir/subgraph/subgraph_detector.h"
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <vector> #include <vector>
#include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h" #include "lite/api/paddle_use_passes.h"
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/ssa_graph.h" #include "lite/core/mir/ssa_graph.h"
#include "lite/core/program.h" #include "lite/core/program.h"
#include "lite/model_parser/cpp/program_desc.h" #include "lite/model_parser/cpp/program_desc.h"
#include "lite/model_parser/model_parser.h" #include "lite/model_parser/model_parser.h"
DEFINE_string(model_dir, "", "model_dir"); DEFINE_string(model_dir, "", "model_dir");
DEFINE_string(model_file, "", "model file path of combined protobuf model");
DEFINE_string(params_file, "", "params file path of combined protobuf model");
namespace paddle { namespace paddle {
namespace lite { namespace lite {
TEST(SubgraphTest, models) { // The helper functions for building model manually
cpp::ProgramDesc program_desc;
auto scope = std::make_shared<Scope>();
// LoadModelPb(FLAGS_model_dir,
// FLAGS_model_dir + "/model",
// FLAGS_model_dir + "/params",
// scope.get(),
// &program_desc,
// true);
LoadModelPb(FLAGS_model_dir, "", "", scope.get(), &program_desc);
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat)},
#ifdef LITE_WITH_ARM
Place{TARGET(kARM), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_XPU
Place{TARGET(kXPU), PRECISION(kFloat)},
#endif
});
lite::Program program(program_desc, scope, valid_places);
auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
graph->Build(program, valid_places);
std::vector<std::string> supported_op_types{"concat",
"conv2d",
"depthwise_conv2d",
"batch_norm",
"scale",
"pool2d",
"mul",
"elementwise_add",
"softmax",
"split",
"relu",
"reshape2",
"transpose2"};
auto* pass = new mir::subgraph::SubgraphProgramPass;
ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1);
LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get());
}
// return output_var_names
std::vector<std::string> AddFCDesc( std::vector<std::string> AddFCDesc(
cpp::BlockDesc* block_desc, cpp::BlockDesc* block_desc,
const std::shared_ptr<Scope>& scope, const std::shared_ptr<Scope>& scope,
...@@ -87,20 +44,20 @@ std::vector<std::string> AddFCDesc( ...@@ -87,20 +44,20 @@ std::vector<std::string> AddFCDesc(
auto* wgt = block_desc->AddVar<cpp::VarDesc>(); auto* wgt = block_desc->AddVar<cpp::VarDesc>();
wgt->SetName(prefix + "_W"); wgt->SetName(prefix + "_W");
auto* wtensor = scope->Var(prefix + "_W")->GetMutable<lite::Tensor>(); auto* wtensor = scope->Var(prefix + "_W")->GetMutable<Tensor>();
wtensor->Resize(wshape); wtensor->Resize(wshape);
wtensor->mutable_data<float>(); wtensor->mutable_data<float>();
auto* bias = block_desc->AddVar<cpp::VarDesc>(); auto* bias = block_desc->AddVar<cpp::VarDesc>();
bias->SetName(prefix + "_Bias"); bias->SetName(prefix + "_Bias");
auto* btensor = scope->Var(prefix + "_Bias")->GetMutable<lite::Tensor>(); auto* btensor = scope->Var(prefix + "_Bias")->GetMutable<Tensor>();
btensor->Resize({wshape[1]}); btensor->Resize({wshape[1]});
btensor->mutable_data<float>(); btensor->mutable_data<float>();
auto* out = block_desc->AddVar<cpp::VarDesc>(); auto* out = block_desc->AddVar<cpp::VarDesc>();
out->SetName(prefix + "_Out"); out->SetName(prefix + "_Out");
std::vector<std::string> out_var_names{prefix + "_Out"}; std::vector<std::string> out_var_names{prefix + "_Out"};
scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>(); scope->Var(prefix + "_Out")->GetMutable<Tensor>();
op_desc->SetType("fc"); op_desc->SetType("fc");
op_desc->SetInput("Input", input_var_names); op_desc->SetInput("Input", input_var_names);
...@@ -126,7 +83,7 @@ std::vector<std::string> AddElementwiseAddDesc( ...@@ -126,7 +83,7 @@ std::vector<std::string> AddElementwiseAddDesc(
out->SetName(prefix + "_Out"); out->SetName(prefix + "_Out");
std::vector<std::string> out_var_names{prefix + "_Out"}; std::vector<std::string> out_var_names{prefix + "_Out"};
scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>(); scope->Var(prefix + "_Out")->GetMutable<Tensor>();
op_desc->SetType("elementwise_add"); op_desc->SetType("elementwise_add");
op_desc->SetInput("X", input_X_names); op_desc->SetInput("X", input_X_names);
...@@ -150,7 +107,7 @@ std::vector<std::string> AddFeedDesc( ...@@ -150,7 +107,7 @@ std::vector<std::string> AddFeedDesc(
out->SetName(prefix + "_Out"); out->SetName(prefix + "_Out");
std::vector<std::string> out_var_names{prefix + "_Out"}; std::vector<std::string> out_var_names{prefix + "_Out"};
scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>(); scope->Var(prefix + "_Out")->GetMutable<Tensor>();
op_desc->SetType("feed"); op_desc->SetType("feed");
op_desc->SetInput("X", input_X_names); op_desc->SetInput("X", input_X_names);
...@@ -173,7 +130,7 @@ std::vector<std::string> AddFetchDesc( ...@@ -173,7 +130,7 @@ std::vector<std::string> AddFetchDesc(
out->SetName(prefix + "_Out"); out->SetName(prefix + "_Out");
std::vector<std::string> out_var_names{prefix + "_Out"}; std::vector<std::string> out_var_names{prefix + "_Out"};
scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>(); scope->Var(prefix + "_Out")->GetMutable<Tensor>();
op_desc->SetType("fetch"); op_desc->SetType("fetch");
op_desc->SetInput("X", input_X_names); op_desc->SetInput("X", input_X_names);
...@@ -183,40 +140,88 @@ std::vector<std::string> AddFetchDesc( ...@@ -183,40 +140,88 @@ std::vector<std::string> AddFetchDesc(
return out_var_names; return out_var_names;
} }
std::unique_ptr<mir::SSAGraph> BuildSimpleNet( TEST(Subgraph, detect_simple_model) {
cpp::ProgramDesc* program_desc, cpp::ProgramDesc program_desc;
const std::shared_ptr<Scope>& scope, std::vector<Place> valid_places{{TARGET(kHost), PRECISION(kFloat)}};
const std::vector<Place>& valid_places) { auto scope = std::make_shared<Scope>();
program_desc->ClearBlocks(); // Build a simple network
auto* block_desc = program_desc->AddBlock<cpp::BlockDesc>(); program_desc.ClearBlocks();
auto* block_desc = program_desc.AddBlock<cpp::BlockDesc>();
block_desc->ClearOps(); block_desc->ClearOps();
block_desc->ClearVars(); block_desc->ClearVars();
auto* var_desc = block_desc->AddVar<cpp::VarDesc>(); auto* var_desc = block_desc->AddVar<cpp::VarDesc>();
var_desc->SetName("feed_var"); var_desc->SetName("feed_var");
auto* feed_var = scope->Var("feed_var")->GetMutable<lite::Tensor>(); auto* feed_var = scope->Var("feed_var")->GetMutable<Tensor>();
feed_var->Resize({1, 4}); feed_var->Resize({1, 4});
auto fc1_out = AddFCDesc(block_desc, scope, {"feed_var"}, {4, 5}); auto fc1_out = AddFCDesc(block_desc, scope, {"feed_var"}, {4, 5});
auto fc2_out = AddFCDesc(block_desc, scope, fc1_out, {5, 2}); auto fc2_out = AddFCDesc(block_desc, scope, fc1_out, {5, 2});
Program program(program_desc, scope, valid_places);
lite::Program program(*program_desc, scope, valid_places);
auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph()); auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
graph->Build(program, valid_places); graph->Build(program, valid_places);
// Apply subgraph detector and check results
return graph; auto teller = [](mir::Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
auto op_type = stmt.op_type();
const std::vector<std::string> supported_types = {"fc"};
return std::find(supported_types.begin(), supported_types.end(), op_type) !=
supported_types.end();
};
std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)();
ASSERT_EQ(subgraphs.size(), 1);
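  // 9 nodes are expected: 2 fc op nodes plus 7 var nodes (feed_var and the
  // W/Bias/Out vars of each of the two fc ops).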
ASSERT_EQ(graph->nodes().size(), 9);
mir::SubgraphVisualizer(graph.get(), subgraphs)();
} }
TEST(SubGraphTest, SimpleNet) { TEST(Subgraph, detect_custom_model) {
if (FLAGS_model_dir.empty() && FLAGS_model_file.empty() &&
FLAGS_params_file.empty()) {
LOG(INFO) << "Using --model_dir, or --model_file and --params_file to set "
"the path of model files.";
return;
}
cpp::ProgramDesc program_desc; cpp::ProgramDesc program_desc;
std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>(); auto scope = std::make_shared<Scope>();
auto graph = BuildSimpleNet(&program_desc, scope, places); LoadModelPb(FLAGS_model_dir,
FLAGS_model_file,
std::vector<std::string> supported_op_types{"fc"}; FLAGS_params_file,
auto* pass = new mir::subgraph::SubgraphProgramPass; scope.get(),
ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1); &program_desc,
!FLAGS_model_file.empty() && !FLAGS_params_file.empty(),
ASSERT_EQ(graph->nodes().size(), 9); false);
// LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get()); std::vector<Place> valid_places({
#ifdef LITE_WITH_ARM
Place{TARGET(kARM), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_X86
Place{TARGET(kX86), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_XPU
Place{TARGET(kXPU), PRECISION(kFloat)},
#endif
});
Program program(program_desc, scope, valid_places);
auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
graph->Build(program, valid_places);
// Apply subgraph detector and check results
auto teller = [](mir::Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
auto op_type = stmt.op_type();
const std::vector<std::string> unsupported_types = {
"feed", "fetch", "subgraph"};
return std::find(unsupported_types.begin(),
unsupported_types.end(),
op_type) == unsupported_types.end();
};
std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)();
ASSERT_EQ(subgraphs.size(), 1);
mir::SubgraphVisualizer(graph.get(), subgraphs)();
} }
} // namespace lite } // namespace lite
......
...@@ -12,58 +12,52 @@ ...@@ -12,58 +12,52 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #include "lite/core/mir/subgraph/subgraph_pass.h"
#include <xtcl/xtcl.h>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_set>
#include <utility> #include <utility>
#include "lite/core/tensor.h" #include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/subgraph/subgraph_detector.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace xpu { namespace mir {
class DeviceInfo { void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
public: std::unordered_set<std::string> supported_lists;
static DeviceInfo& Global() { #define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
static DeviceInfo x; #include "lite/kernels/npu/bridges/paddle_use_bridges.h"
return x; #undef USE_SUBGRAPH_BRIDGE
} auto teller = [&](Node* node) {
DeviceInfo() {} if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
void Insert(const std::string& name, return supported_lists.count(stmt.op_type()) != 0;
std::shared_ptr<xtcl::network::xRuntimeInstance> runtime) { };
if (runtimes_.find(name) != runtimes_.end()) { SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
LOG(WARNING) << "[XPU] Model " << name << " already exists."; fuser();
return; }
}
runtimes_.emplace(std::make_pair(name, runtime)); void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
void Clear() { runtimes_.clear(); } #include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
std::shared_ptr<xtcl::network::xRuntimeInstance> Find( auto teller = [&](Node* node) {
const std::string& name) const { if (!node->IsStmt()) return false;
if (runtimes_.find(name) != runtimes_.end()) { auto& stmt = node->AsStmt();
return runtimes_.at(name); return supported_lists.count(stmt.op_type()) != 0;
} else { };
return nullptr; SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
} fuser();
} }
private: } // namespace mir
int device_id_{0};
std::string device_name_{"default"};
std::unordered_map<std::string,
std::shared_ptr<xtcl::network::xRuntimeInstance>>
runtimes_;
};
bool LoadModel(const lite::Tensor& model,
std::shared_ptr<xtcl::network::xRuntimeInstance>* runtime);
} // namespace xpu
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
...@@ -12,30 +12,26 @@ ...@@ -12,30 +12,26 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/xpu/bridges/registry.h" #pragma once
#include <utility>
#include <memory>
#include <vector>
#include "lite/core/mir/pass.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace mir {
namespace xpu {
namespace bridges {
Factory& Factory::Instance() {
static Factory g_xpu_bridge;
return g_xpu_bridge;
}
bool Factory::HasType(const std::string& op_type) const { class NPUSubgraphPass : public ProgramPass {
return map_.count(op_type); public:
} void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
void Factory::Insert(const std::string& op_type, const func_type& func_name) { class XPUSubgraphPass : public ProgramPass {
map_.insert(std::make_pair(op_type, func_name)); public:
} void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace bridges } // namespace mir
} // namespace xpu
} // namespace kernels
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -30,7 +30,9 @@ DEFINE_int32(output_tensor_num, 1, "number of output tensors"); ...@@ -30,7 +30,9 @@ DEFINE_int32(output_tensor_num, 1, "number of output tensors");
namespace paddle { namespace paddle {
namespace lite { namespace lite {
std::vector<std::vector<int64_t>> ParseShape(std::string txt) { // The helper functions for loading and running model from command line and
// verifying output data
std::vector<std::vector<int64_t>> ShapeParsing(std::string txt) {
std::vector<std::vector<int64_t>> shape; std::vector<std::vector<int64_t>> shape;
while (!txt.empty()) { while (!txt.empty()) {
size_t idx = txt.find_first_of(":"); size_t idx = txt.find_first_of(":");
...@@ -65,7 +67,7 @@ int64_t ShapeProduction(std::vector<int64_t> shape) { ...@@ -65,7 +67,7 @@ int64_t ShapeProduction(std::vector<int64_t> shape) {
return s; return s;
} }
void FillInputTensor( void FillInputTensors(
const std::shared_ptr<lite_api::PaddlePredictor>& predictor, const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
const std::vector<std::vector<int64_t>>& input_tensor_shape, const std::vector<std::vector<int64_t>>& input_tensor_shape,
const float value) { const float value) {
...@@ -80,7 +82,7 @@ void FillInputTensor( ...@@ -80,7 +82,7 @@ void FillInputTensor(
} }
} }
void CompareOutputTensor( void CheckOutputTensors(
const std::shared_ptr<lite_api::PaddlePredictor>& tar_predictor, const std::shared_ptr<lite_api::PaddlePredictor>& tar_predictor,
const std::shared_ptr<lite_api::PaddlePredictor>& ref_predictor, const std::shared_ptr<lite_api::PaddlePredictor>& ref_predictor,
const int output_tensor_num) { const int output_tensor_num) {
...@@ -96,7 +98,7 @@ void CompareOutputTensor( ...@@ -96,7 +98,7 @@ void CompareOutputTensor(
auto abs_diff = auto abs_diff =
std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]); std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]);
auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6); auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6);
VLOG(3) << "val: " << tar_output_tensor_data[j] VLOG(5) << "val: " << tar_output_tensor_data[j]
<< " ref: " << ref_output_tensor_data[j] << " ref: " << ref_output_tensor_data[j]
<< " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff;
EXPECT_LT(rel_diff, 0.1); EXPECT_LT(rel_diff, 0.1);
...@@ -111,24 +113,23 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel( ...@@ -111,24 +113,23 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
const std::vector<lite_api::Place>& valid_places, const std::vector<lite_api::Place>& valid_places,
const std::vector<std::vector<int64_t>>& input_tensor_shape, const std::vector<std::vector<int64_t>>& input_tensor_shape,
const std::string& optimized_model_dir) { const std::string& optimized_model_dir) {
// generate optimized model // Generate optimized model
lite_api::CxxConfig cxx_config; lite_api::CxxConfig cxx_config;
cxx_config.set_model_dir(model_dir); cxx_config.set_model_dir(model_dir);
cxx_config.set_model_file(model_file); cxx_config.set_model_file(model_file);
cxx_config.set_param_file(params_file); cxx_config.set_param_file(params_file);
cxx_config.set_valid_places(valid_places); cxx_config.set_valid_places(valid_places);
auto predictor = lite_api::CreatePaddlePredictor(cxx_config); auto predictor = lite_api::CreatePaddlePredictor(cxx_config);
FillInputTensor(predictor, input_tensor_shape, 1);
predictor->SaveOptimizedModel(optimized_model_dir, predictor->SaveOptimizedModel(optimized_model_dir,
lite_api::LiteModelType::kNaiveBuffer); lite_api::LiteModelType::kNaiveBuffer);
// load optimized model // Load optimized model
lite_api::MobileConfig mobile_config; lite_api::MobileConfig mobile_config;
mobile_config.set_model_dir(optimized_model_dir); mobile_config.set_model_dir(optimized_model_dir);
mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH); mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
mobile_config.set_threads(1); mobile_config.set_threads(1);
predictor = lite_api::CreatePaddlePredictor(mobile_config); predictor = lite_api::CreatePaddlePredictor(mobile_config);
FillInputTensor(predictor, input_tensor_shape, 1); FillInputTensors(predictor, input_tensor_shape, 1);
// run optimized model // Run optimized model
for (int i = 0; i < FLAGS_warmup; i++) { for (int i = 0; i < FLAGS_warmup; i++) {
predictor->Run(); predictor->Run();
} }
...@@ -140,32 +141,48 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel( ...@@ -140,32 +141,48 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
return predictor; return predictor;
} }
TEST(NPUSubgraph, compare) { TEST(Subgraph, generate_model_and_check_precision) {
// parsing input tensor shape, supported formats: "1,3,224,224" if (FLAGS_model_dir.empty() && FLAGS_model_file.empty() &&
// "1,3,224,224:1,80" FLAGS_params_file.empty()) {
LOG(INFO) << "Using --model_dir, or --model_file and --params_file to set "
"the path of model files.";
return;
}
// Parsing the shapes of input tensors from strings, supported formats:
// "1,3,224,224" and "1,3,224,224:1,80"
std::vector<std::vector<int64_t>> input_tensor_shape = std::vector<std::vector<int64_t>> input_tensor_shape =
ParseShape(FLAGS_input_tensor_shape); ShapeParsing(FLAGS_input_tensor_shape);
// generate and run optimized CPU model std::vector<lite_api::Place> valid_places({
LOG(INFO) << " ================ CPU ================== "; #ifdef LITE_WITH_ARM
auto cpu_predictor = lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
TestModel(FLAGS_model_dir, #endif
FLAGS_model_file, #ifdef LITE_WITH_X86
FLAGS_params_file, lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
{lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, #endif
input_tensor_shape, });
FLAGS_optimized_model_dir + "/CPU"); // Generate and run optimized model on CPU as the reference predictor
// generate and run optimized NPU model auto ref_predictor = TestModel(FLAGS_model_dir,
LOG(INFO) << " ================ NPU ================== "; FLAGS_model_file,
auto npu_predictor = FLAGS_params_file,
TestModel(FLAGS_model_dir, valid_places,
FLAGS_model_file, input_tensor_shape,
FLAGS_params_file, FLAGS_optimized_model_dir + "/ref_opt_model");
{lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}, // Generate and run optimized model on NPU/XPU as the target predictor
lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, #ifdef LITE_WITH_NPU
input_tensor_shape, valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
FLAGS_optimized_model_dir + "/NPU"); #endif
// verify results #ifdef LITE_WITH_XPU
CompareOutputTensor(npu_predictor, cpu_predictor, FLAGS_output_tensor_num); valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
#endif
auto tar_predictor = TestModel(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
valid_places,
input_tensor_shape,
FLAGS_optimized_model_dir + "/tar_opt_model");
// Check the difference of the output tensors between reference predictor and
// target predictor
CheckOutputTensors(tar_predictor, ref_predictor, FLAGS_output_tensor_num);
} }
} // namespace lite } // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/subgraph/subgraph_program_pass.h"
#include <memory>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
std::unordered_map<int, std::unordered_set<Node*>>
SubgraphProgramPass::ClassifySubgraph(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_map<int, std::unordered_set<Node*>> op_nodes;
for (auto& item : graph->StmtTopologicalOrder()) {
if (!item->IsStmt()) continue;
auto& stmt = item->AsStmt();
int sub_id = stmt.subgraph_id();
if (sub_id < 1) continue;
if (!op_nodes.count(sub_id)) {
op_nodes[sub_id] = std::unordered_set<Node*>();
}
op_nodes.at(sub_id).insert(item);
}
return op_nodes;
}
cpp::OpDesc SubgraphProgramPass::GenGraphOpDesc(
const std::string& weight_var_name,
const std::vector<std::string>& in_var_names,
const std::vector<std::string>& out_var_names) {
cpp::OpDesc op_desc;
op_desc.SetType("graph_op");
op_desc.SetInput("Inputs", in_var_names);
op_desc.SetInput("Weight", {weight_var_name});
op_desc.SetOutput("Outputs", out_var_names);
return op_desc;
}
void SubgraphProgramPass::InsertNewNode(
const std::unique_ptr<SSAGraph>& graph,
const std::string& weight_var_name,
Scope* scope,
const std::vector<Place>& valid_places,
std::unordered_set<Node*> in_data_vars,
std::unordered_set<Node*> in_wgt_vars,
std::unordered_set<Node*> out_data_vars,
std::unordered_set<Node*> out_unused_vars) {
std::vector<std::string> in_var_names;
std::vector<std::string> out_var_names;
for (auto i : in_data_vars) {
in_var_names.push_back(i->AsArg().name);
}
for (auto i : out_data_vars) {
out_var_names.push_back(i->AsArg().name);
}
auto op_desc = GenGraphOpDesc(weight_var_name, in_var_names, out_var_names);
auto graph_op = LiteOpRegistry::Global().Create("graph_op");
graph_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places);
for (auto& in_var : in_data_vars) {
IR_NODE_LINK_TO(in_var, new_op_node);
}
for (auto& in_var : in_wgt_vars) {
IR_NODE_LINK_TO(in_var, new_op_node);
}
for (auto& out_var : out_data_vars) {
IR_OP_VAR_LINK(new_op_node, out_var);
}
for (auto& out_var : out_unused_vars) {
IR_OP_VAR_LINK(new_op_node, out_var);
}
  // Add a weight node to store the pre-compiled NPU model
auto new_weight_node = graph->NewArgumentNode(weight_var_name);
new_weight_node->AsArg().is_weight = true;
new_weight_node->AsArg().is_persist = true;
DirectedLink(new_weight_node, new_op_node);
// assign context
auto& inst = new_op_node->AsStmt();
inst.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(inst.picked_kernel().target()));
}
void SubgraphProgramPass::SortHelper(
Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
std::vector<Node*>* ret) {
for (auto& var_node : node->inlinks) {
if (var_node->inlinks.empty()) continue;
auto* op_node = var_node->inlinks.front();
if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) {
SortHelper(op_node, nodes_all, visited_nodes, ret);
}
}
ret->push_back(node);
visited_nodes->insert(node);
}
std::vector<Node*> SubgraphProgramPass::GetTopologicalOrder(
const std::unordered_set<Node*>& nodes) {
std::unordered_set<const Node*> visited;
std::vector<Node*> ret;
for (auto& node : nodes) {
if (!node->IsStmt()) continue;
if (visited.count(node)) continue;
SortHelper(node, nodes, &visited, &ret);
}
return ret;
}
void SubgraphProgramPass::FindInputOutputVars(
const std::unordered_set<Node*>& op_nodes,
std::unordered_set<Node*>* in_data_vars,
std::unordered_set<Node*>* in_wgt_vars,
std::unordered_set<Node*>* out_data_vars,
std::unordered_set<Node*>* out_unused_vars) {
for (auto& op_node : op_nodes) {
for (auto& in_var : op_node->inlinks) {
if (in_var->AsArg().is_weight) {
in_wgt_vars->insert(in_var);
continue;
}
if (!in_var->inlinks.empty()) {
        // A var can only be produced by one op node, so use front()
auto* pre_op_node = in_var->inlinks.front();
if (op_nodes.count(pre_op_node)) {
continue;
}
}
in_data_vars->insert(in_var);
}
for (auto& out_var : op_node->outlinks) {
if (out_var->outlinks.empty()) {
        // This var has no consumer op, so it is actually unused
out_unused_vars->insert(out_var);
continue;
}
      // A var can have more than one consumer op node, so if any consumer
      // is inside op_nodes, the var stays internal and we continue
bool next_op_in_nodes = false;
for (auto& next_op_node : out_var->outlinks) {
if (op_nodes.count(next_op_node)) {
next_op_in_nodes = true;
}
}
if (next_op_in_nodes) {
continue;
}
out_data_vars->insert(out_var);
}
}
}
std::unordered_set<const Node*> SubgraphProgramPass::GetNode2rm(
const std::unordered_set<Node*>& op_nodes,
const std::vector<std::unordered_set<Node*>>& excluded_nodes) {
std::unordered_set<const Node*> nodes2rm(op_nodes.begin(), op_nodes.end());
for (auto& op_node : op_nodes) {
for (auto& in_var : op_node->inlinks) {
if (!nodes2rm.count(in_var)) {
nodes2rm.insert(in_var);
}
}
for (auto& out_var : op_node->outlinks) {
if (!nodes2rm.count(out_var)) {
nodes2rm.insert(out_var);
}
}
}
// some nodes should not be removed
for (auto& e : excluded_nodes) {
for (auto& i : e) {
if (nodes2rm.count(i)) {
nodes2rm.erase(i);
}
}
}
return nodes2rm;
}
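// Run CheckShape() and InferShape() for every op in topological order so
// that the tensor dims in the scope are determined before the subgraph is
// built; unless built with XPU support, each kernel is also launched once.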
void SubgraphProgramPass::InferOnce(const std::unique_ptr<SSAGraph>& graph) {
for (auto& item : graph->StmtTopologicalOrder()) {
if (!item->IsStmt()) continue;
auto& stmt = item->AsStmt();
auto& op = stmt.op();
auto scope = op->scope();
std::string op_type = op->op_info()->Type();
    // Check the dimensions of the input variables in the scope; they must
    // not be empty!
if (op_type == "feed") {
auto input_var_names = op->op_info()->output_names();
CHECK_GE(input_var_names.size(), 1);
for (auto input_var_name : input_var_names) {
auto input_var = scope->FindVar(input_var_name);
CHECK(input_var) << "No input variable '" << input_var_name
<< "' found in scope " << scope;
auto input = input_var->GetMutable<lite::Tensor>();
CHECK(!input->dims().empty()) << "The dimension of input variable '"
<< input_var_name
<< "' can not be empty.";
}
continue;
}
if (op_type == "fetch") {
continue;
}
op->CheckShape();
op->InferShape();
#ifndef LITE_WITH_XPU
    // TODO(xxx): remove Launch() eventually
    auto& kernels = stmt.kernels();
    if (!kernels.empty()) {
      auto& kernel = kernels.front();
      if (kernel) {
        kernel->Launch();
      }
    }
#endif
}
}
void SubgraphProgramPass::InitSubgraphID(
const std::unique_ptr<SSAGraph>& graph,
const std::vector<std::string>& supported_op_types) {
for (auto& item : graph->StmtTopologicalOrder()) {
if (!item->IsStmt()) continue;
auto& stmt = item->AsStmt();
stmt.ClearSubgraphID();
if (std::find(supported_op_types.begin(),
supported_op_types.end(),
stmt.op_type()) != supported_op_types.end()) {
stmt.SetSubgraphID(0);
LOG(INFO) << "supported " << stmt.op_type();
} else {
LOG(INFO) << "======= not supported " << stmt.op_type();
}
}
}
// Mark the current node and propagate the new subgraph id to all supported
// nodes reachable through its outputs
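// For an arg node, the id is propagated only if every consumer op of the arg
// is supported; otherwise the propagation stops there, so an unsupported op
// never ends up inside the subgraph.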
void SubgraphProgramPass::ChangeAllOutConnectedID(Node* node,
int to_id,
int from_id) {
if (!node) return;
if (node->IsStmt()) {
auto& stmt = node->AsStmt();
if (stmt.subgraph_id() == from_id) {
stmt.SetSubgraphID(to_id);
for (auto& i : node->outlinks) {
ChangeAllOutConnectedID(i, to_id, from_id);
}
} else {
LOG(INFO) << "failed op type:" << stmt.op_type();
return;
}
} else {
    // this is an arg node
bool all_out_op_supported = true;
for (auto& i : node->outlinks) {
if (!i->IsStmt()) return;
auto& stmt = i->AsStmt();
if (stmt.subgraph_id() < from_id) {
all_out_op_supported = false;
}
}
if (!all_out_op_supported) {
return;
}
for (auto& i : node->outlinks) {
CHECK(i->IsStmt());
auto& stmt = i->AsStmt();
if (stmt.subgraph_id() == from_id) {
stmt.SetSubgraphID(to_id);
for (auto& o : i->outlinks) {
ChangeAllOutConnectedID(o, to_id, from_id);
}
}
}
}
}
int SubgraphProgramPass::FuseSubgraphID(
const std::unique_ptr<SSAGraph>& graph) {
  int sub_id = 1;  // ids start from 1, not 0
for (auto& item : graph->StmtTopologicalOrder()) {
// bool inputvar = false;
if (!item->IsStmt()) continue;
auto& stmt = item->AsStmt();
/*
if (stmt.subgraph_id() == -1) {
for (auto& i : item->outlinks) {
for (auto& j : i->outlinks) {
if (j->IsStmt()) {
auto& jstmt = j->AsStmt();
if (jstmt.subgraph_id() == 0) inputvar = true;
}
}
}
}
*/
if (stmt.subgraph_id() != 0) continue;
ChangeAllOutConnectedID(item, sub_id);
sub_id++;
}
return sub_id - 1;
}
int SubgraphProgramPass::FuseSubgraph(
const std::unique_ptr<SSAGraph>& graph,
const std::vector<std::string>& supported_op_types) {
InitSubgraphID(graph, supported_op_types);
return FuseSubgraphID(graph);
}
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(subgraph_program_pass,
paddle::lite::mir::subgraph::SubgraphProgramPass)
.BindTargets({TARGET(kAny)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
class SubgraphProgramPass : public ProgramPass {
public:
using key2nodes_t = std::map<std::string, Node*>;
  // Mark all of the linked ops in a subgraph with the same subgraph_id and
  // return the number of fused subgraphs
int FuseSubgraph(const std::unique_ptr<SSAGraph>& graph,
const std::vector<std::string>& supported_op_types);
void Apply(const std::unique_ptr<SSAGraph>& graph) override{};
protected:
void InferOnce(const std::unique_ptr<SSAGraph>& graph);
  // Clear all subgraph ids and mark all ops that could be fused with id zero
void InitSubgraphID(const std::unique_ptr<SSAGraph>& graph,
const std::vector<std::string>& supported_op_types);
  // Mark all of the linked ops in a subgraph with the same subgraph_id and
  // return the number of fused subgraphs
int FuseSubgraphID(const std::unique_ptr<SSAGraph>& graph);
// // GenerateFusedGraph:
// std::unique_ptr<SSAGraph> GenerateFusedGraph(const
// std::unique_ptr<SSAGraph>& graph, int sub_num);
void ChangeAllOutConnectedID(Node* node, int to_id, int from_id = 0);
  // The functions below could be useful in child classes //
// classify node by subgraph id
std::unordered_map<int, std::unordered_set<Node*>> ClassifySubgraph(
const std::unique_ptr<SSAGraph>& graph);
// generate the graph op desc
cpp::OpDesc GenGraphOpDesc(const std::string& weight_var_name,
const std::vector<std::string>& in_var_names,
const std::vector<std::string>& out_var_names);
// insert a new graph op node
void InsertNewNode(const std::unique_ptr<SSAGraph>& graph,
const std::string& weight_var_name,
Scope* scope,
const std::vector<Place>& valid_places,
std::unordered_set<Node*> in_data_vars,
std::unordered_set<Node*> in_wgt_vars,
std::unordered_set<Node*> out_data_vars,
std::unordered_set<Node*> out_unused_vars);
  // Sort the nodes of the set and return them in topological order
std::vector<Node*> GetTopologicalOrder(
const std::unordered_set<Node*>& nodes);
  // Find all input data vars, input weight vars, output data vars and
  // unused output vars among the nodes
void FindInputOutputVars(const std::unordered_set<Node*>& op_nodes,
std::unordered_set<Node*>* in_data_vars,
std::unordered_set<Node*>* in_wgt_vars,
std::unordered_set<Node*>* out_data_vars,
std::unordered_set<Node*>* out_unused_vars);
  // Return the nodes to remove from the subgraph
std::unordered_set<const Node*> GetNode2rm(
const std::unordered_set<Node*>& op_nodes,
const std::vector<std::unordered_set<Node*>>& excluded_nodes);
private:
  // Recursively sort the nodes into execution order
void SortHelper(Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
std::vector<Node*>* ret);
};
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -27,12 +27,6 @@ ...@@ -27,12 +27,6 @@
#include "lite/core/program.h" #include "lite/core/program.h"
#include "lite/core/types.h" #include "lite/core/types.h"
#include "lite/model_parser/model_parser.h" #include "lite/model_parser/model_parser.h"
#ifdef LITE_WITH_NPU
#include "lite/core/mir/subgraph/generate_npu_program_pass.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/core/mir/subgraph/generate_xpu_program_pass.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -109,7 +103,9 @@ class Optimizer { ...@@ -109,7 +103,9 @@ class Optimizer {
"runtime_context_assign_pass", "runtime_context_assign_pass",
"argument_type_display_pass", "argument_type_display_pass",
"memory_optimize_pass"}}; "memory_optimize_pass",
"npu_subgraph_pass",
"xpu_subgraph_pass"}};
RunPasses(passes_local); RunPasses(passes_local);
} else { } else {
RunPasses(passes); RunPasses(passes);
...@@ -121,13 +117,6 @@ class Optimizer { ...@@ -121,13 +117,6 @@ class Optimizer {
// Generate a new program based on the mir graph. // Generate a new program based on the mir graph.
std::unique_ptr<RuntimeProgram> GenRuntimeProgram() { std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
// Extra passes are applied for NPU and XPU, they depends on the shapes
// of input tensors. so GenRuntimeProgram() must be called after the shapes
// of input tensors are determined.
std::vector<std::string> subgraph_passes{"generate_npu_program_pass",
"generate_xpu_program_pass"};
RunPasses(subgraph_passes);
auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>( auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
"generate_program_pass"); "generate_program_pass");
pass->Apply(graph_); pass->Apply(graph_);
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "lite/model_parser/cpp/op_desc.h" #include "lite/model_parser/cpp/op_desc.h"
#include "lite/model_parser/cpp/var_desc.h" #include "lite/model_parser/cpp/var_desc.h"
#include "lite/operators/conditional_block_op.h" #include "lite/operators/conditional_block_op.h"
#include "lite/operators/subgraph_op.h"
#include "lite/operators/while_op.h" #include "lite/operators/while_op.h"
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
#include "lite/core/profile/precision_profiler.h" #include "lite/core/profile/precision_profiler.h"
...@@ -31,10 +32,32 @@ void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) { ...@@ -31,10 +32,32 @@ void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) {
  // NOTE: RuntimeProgram does not have all meta info, so saving the model  // NOTE: RuntimeProgram does not have all meta info, so saving the model
  // just updates the original model  // just updates the original model
CHECK(desc->BlocksSize()); CHECK(desc->BlocksSize());
auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0); auto main_block = desc->GetBlock<cpp::BlockDesc>(0);
main_block.ClearOps(); main_block->ClearOps();
for (auto& node : instructions_) { for (auto& node : instructions_) {
auto* op = main_block.AddOp<cpp::OpDesc>(); auto op_type = node.op()->op_info()->Type();
if (op_type == "subgraph") {
auto subgraph_op = const_cast<operators::SubgraphOp*>(
static_cast<const operators::SubgraphOp*>(node.op()));
int sub_block_idx = subgraph_op->op_info()->GetAttr<int32_t>("sub_block");
if (sub_block_idx < 0) {
        // It's a new subgraph op when its sub_block_idx < 0. Now we add its
        // sub block desc to the program desc, then update its sub_block_idx
        // to the index of the block desc in the program desc.
sub_block_idx = desc->BlocksSize();
auto sub_block_desc = subgraph_op->GetSubBlock();
CHECK(sub_block_desc);
auto new_block_desc = desc->AddBlock<cpp::BlockDesc>();
*new_block_desc = *sub_block_desc;
delete sub_block_desc;
subgraph_op->mutable_op_info()->SetAttr<int32_t>("sub_block",
sub_block_idx);
subgraph_op->SetSubBlock(new_block_desc);
        // Update the main block desc after a new sub block desc is added
main_block = desc->GetBlock<cpp::BlockDesc>(0);
}
}
auto op = main_block->AddOp<cpp::OpDesc>();
*op = *node.op()->op_info(); *op = *node.op()->op_info();
op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType()); op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType());
} }
...@@ -142,16 +165,25 @@ void Program::Build(const cpp::ProgramDesc& prog) {
    VLOG(4) << "create Op [" << op_type << "]";
    auto op = LiteOpRegistry::Global().Create(op_type);
    CHECK(op) << "no Op found for " << op_type;
    if (op_type == "while" || op_type == "conditional_block" ||
        op_type == "subgraph") {
      auto sub_block_idx = op_desc.GetAttr<int32_t>("sub_block");
      CHECK(sub_block_idx >= 0 && sub_block_idx < prog.BlocksSize())
          << "Invalid attribute sub_block(" << sub_block_idx << ") for "
          << op_type;
      auto sub_block_desc =
          const_cast<cpp::ProgramDesc&>(prog).GetBlock<cpp::BlockDesc>(
              sub_block_idx);
      CHECK(sub_block_desc);
      if (op_type == "while") {
        static_cast<operators::WhileOpLite*>(op.get())->SetSubBlock(
            sub_block_desc);
      } else if (op_type == "conditional_block") {
        static_cast<operators::ConditionalBlockOpLite*>(op.get())->SetSubBlock(
            sub_block_desc);
      } else if (op_type == "subgraph") {
        static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(
            sub_block_desc);
      }
    }
    ops_.emplace_back(std::move(op));
...
add_kernel(subgraph_compute_npu NPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_npu subgraph_bridge_engine ${npu_subgraph_bridges})

if(NOT LITE_ON_TINY_PUBLISH)
    add_subdirectory(bridges)
endif()
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU)
    return()
endif()

lite_cc_library(subgraph_bridge_registry
    SRCS registry.cc
    DEPS op)

lite_cc_library(subgraph_bridge_engine
    SRCS engine.cc
    DEPS tensor op scope program)

if(NOT LITE_WITH_NPU)
    return()
endif()

lite_cc_library(subgraph_bridge_utility_npu SRCS utility.cc DEPS ${npu_builder_libs} tensor)
lite_cc_library(subgraph_bridge_graph_npu SRCS graph.cc DEPS subgraph_bridge_utility_npu)

set(npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_npu subgraph_bridge_graph_npu)

lite_cc_library(subgraph_bridge_fc_op_npu SRCS fc_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_op_npu SRCS conv_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_mul_op_npu SRCS mul_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_act_op_npu SRCS act_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_scale_op_npu SRCS scale_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_npu SRCS softmax_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_npu SRCS pool_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_npu SRCS batch_norm_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_elementwise_ops_npu SRCS elementwise_ops.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reshape_op_npu SRCS reshape_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_transpose_op_npu SRCS conv_transpose_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_interpolate_op_npu SRCS interpolate_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_transpose_op_npu SRCS transpose_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_split_op_npu SRCS split_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_npu SRCS concat_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_shuffle_channel_op_npu SRCS shuffle_channel_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pad2d_op_npu SRCS pad2d_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_square_op_npu SRCS square_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_sqrt_op_npu SRCS sqrt_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reduce_mean_op_npu SRCS reduce_mean_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_unsqueeze_op_npu SRCS unsqueeze_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_argmax_op_npu SRCS argmax_op.cc DEPS ${npu_subgraph_bridge_deps})

set(npu_subgraph_bridges
    subgraph_bridge_registry
    subgraph_bridge_utility_npu
    subgraph_bridge_graph_npu
    subgraph_bridge_fc_op_npu
    subgraph_bridge_conv_op_npu
    subgraph_bridge_mul_op_npu
    subgraph_bridge_act_op_npu
    subgraph_bridge_scale_op_npu
    subgraph_bridge_softmax_op_npu
    subgraph_bridge_pool_op_npu
    subgraph_bridge_batch_norm_op_npu
    subgraph_bridge_elementwise_ops_npu
    subgraph_bridge_reshape_op_npu
    subgraph_bridge_conv_transpose_op_npu
    subgraph_bridge_interpolate_op_npu
    subgraph_bridge_transpose_op_npu
    subgraph_bridge_split_op_npu
    subgraph_bridge_concat_op_npu
    subgraph_bridge_shuffle_channel_op_npu
    subgraph_bridge_pad2d_op_npu
    subgraph_bridge_square_op_npu
    subgraph_bridge_sqrt_op_npu
    subgraph_bridge_reduce_mean_op_npu
    subgraph_bridge_unsqueeze_op_npu
    subgraph_bridge_argmax_op_npu
    CACHE INTERNAL "npu_subgraph_bridges")

message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
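For orientation, a sketch of how these registered bridges are typically consumed: the cached npu_subgraph_bridges list links every converter into subgraph_compute_npu, and the subgraph detection side asks the bridge registry whether an op type has a converter for a device before fusing it. The registry type and its Exists() lookup below are assumed stand-ins; the concrete API lives in lite/kernels/npu/bridges/registry.h and is not reproduced in this diff:

#include <string>

// Sketch only: Registry and Exists() are assumed stand-ins for the lookup
// defined in lite/kernels/npu/bridges/registry.h.
bool CanOffload(const std::string& device, const std::string& op_type) {
  // e.g. CanOffload("NPU", "conv2d") holds once subgraph_bridge_conv_op_npu
  // has been linked in through ${npu_subgraph_bridges}.
  return paddle::lite::subgraph::Registry::Instance().Exists(device, op_type);
}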
...@@ -12,34 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ActConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Create act node and set input node which is obtained from the node map
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto act_node = graph->AddNode<ge::op::Activation>(out_var_name);
  act_node->set_input_x(*graph->GetNode(x_var_name));

  // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
  // clipped_relu etc.
  act_node->set_attr_mode(CvtActMode(op_type));
  if (op_type == "relu_clipped") {
    auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
...@@ -56,31 +54,33 @@
    act_node->set_attr_negative_slope(slope);
    act_node->set_attr_coef(offset);
  }
  return SUCCESS;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, sigmoid, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, relu, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, tanh, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, relu_clipped, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, relu6, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, leaky_relu, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, abs, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, softsign, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, softplus, paddle::lite::subgraph::npu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, hard_sigmoid, paddle::lite::subgraph::npu::ActConverter);
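The act converter above is the simplest instance of the bridge contract this patch introduces. Distilled from the converters in this diff (a restatement, not a new API), the shared skeleton is:

// Skeleton shared by the bridge converters in this diff.
int AnyOpConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);  // device IR graph being built
  auto op_info = op->op_info();
  // 1. Read input/output var names and attributes from op_info, and weight
  //    tensors from op->scope().
  // 2. Add device nodes with graph->AddNode<...>(...), keyed by output var
  //    names so later ops can look them up with graph->GetNode(...).
  // 3. Wire nodes together with set_input_*() / set_attr_*() calls.
  return SUCCESS;  // or FAILED, or REBUILD_WHEN_SHAPE_CHANGED when the device
                   // model must be regenerated after an input shape change
}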
...@@ -12,59 +12,41 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ArgmaxConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  int axis = op_info->GetAttr<int64_t>("axis");

  auto argmax_node = graph->AddNode<ge::op::ArgMax>(out_var_name);
  argmax_node->set_input_x1(*graph->GetNode(x_var_name));

  auto x2 = graph->AddNode(out_var_name + "/axis", axis);
  argmax_node->set_input_x2(*x2);
  return SUCCESS;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, arg_max, paddle::lite::subgraph::npu::ArgmaxConverter);
...@@ -12,81 +12,66 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int BatchNormConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Output("Y").front();
  auto batch_norm_node = graph->AddNode<ge::op::BatchNormExt2>(y_var_name);
  batch_norm_node->set_input_x(*graph->GetNode(x_var_name));

  auto scale_var_name = op_info->Input("Scale").front();
  auto scale = scope->FindVar(scale_var_name)->GetMutable<Tensor>();
  auto scale_const_node = graph->AddNode(scale_var_name, *scale);

  auto bias_var_name = op_info->Input("Bias").front();
  auto bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
  auto bias_const_node = graph->AddNode(bias_var_name, *bias);

  auto mean_var_name = op_info->Input("Mean").front();
  auto mean = scope->FindVar(mean_var_name)->GetMutable<Tensor>();
  auto mean_const_node = graph->AddNode(mean_var_name, *mean);

  auto variance_var_name = op_info->Input("Variance").front();
  auto variance = scope->FindVar(variance_var_name)->GetMutable<Tensor>();
  auto variance_const_node = graph->AddNode(variance_var_name, *variance);

  float momentum = op_info->GetAttr<float>("momentum");
  float epsilon = op_info->GetAttr<float>("epsilon");
  int mode = 1;  // bnScale, bnBias tensor dims are 1xCx1x1
  bool use_global_stats = op_info->GetAttr<bool>("use_global_stats");

  batch_norm_node->set_input_scale(*scale_const_node);
  batch_norm_node->set_input_offset(*bias_const_node);
  batch_norm_node->set_input_mean(*mean_const_node);
  batch_norm_node->set_input_variance(*variance_const_node);
  batch_norm_node->set_attr_momentum(momentum);
  batch_norm_node->set_attr_epsilon(epsilon);
  batch_norm_node->set_attr_mode(mode);
  batch_norm_node->set_attr_use_global_stats(use_global_stats);
  return SUCCESS;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, batch_norm, paddle::lite::subgraph::npu::BatchNormConverter);
...@@ -12,58 +12,51 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ConcatConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " << op_type << " ... ";

  auto x_var_names = op_info->Input("X");
  auto out_var_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");
  auto num = x_var_names.size();

  auto concat_node = graph->AddNode<ge::op::Concat>(out_var_name);
  concat_node->set_attr_axis(axis);
  concat_node->set_attr_N(num);
  concat_node->create_dynamic_input_x(num);
  int idx = 1;
  for (auto& x_var_name : x_var_names) {
    if (graph->HasNode(x_var_name)) {
      concat_node->set_dynamic_input_x(idx, *graph->GetNode(x_var_name));
    } else {
      auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
      auto x_const_node = graph->AddNode(x_var_name, *x);
      concat_node->set_dynamic_input_x(idx, *x_const_node);
    }
    idx++;
  }
  return SUCCESS;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, concat, paddle::lite::subgraph::npu::ConcatConverter);
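One detail worth calling out in the converter above: ge::op::Concat dynamic inputs are declared with a count and then bound with 1-based indices, which is why idx starts at 1 (convention as it appears in this file):

// Dynamic-input convention of ge::op::Concat as used above (1-based indices):
//   concat_node->create_dynamic_input_x(num);        // declare num inputs
//   for (int idx = 1; idx <= num; ++idx) {
//     concat_node->set_dynamic_input_x(idx, *node);  // bind from index 1
//   }
//   concat_node->set_attr_N(num);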
...@@ -13,32 +13,33 @@
// limitations under the License.

#include "lite/operators/conv_op.h"
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ConvConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " << op_type << "... ";

  // Get input, filter and op attributes
  auto input_var_name = op_info->Input("Input").front();
  auto input = scope->FindVar(input_var_name)->GetMutable<Tensor>();
  auto input_dims = input->dims();
  auto output_var_name = op_info->Output("Output").front();
  auto output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
  auto output_dims = output->dims();
  auto filter_var_name = op_info->Input("Filter").front();
  auto filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
  auto filter_dims = filter->dims();
  auto bs = input_dims[0];
  auto ic = input_dims[1];
...@@ -63,7 +64,7 @@
    }
  }
  CHECK_EQ(paddings.size(), 4L)
      << "[NPU] Paddings size should be the same or twice as the input size.";

  std::string padding_algorithm("");
  if (op_info->HasAttr("padding_algorithm")) {
...@@ -76,9 +77,9 @@
                 input_dims,
                 filter_dims);

  // Check depthwise mode, and decide whether use ConvolutionDepthwise Op
  bool use_depthwise_conv =
      false;  // Whether use ge::op::ConvolutionDepthwise ?
  bool is_depthwise_mode = ic == groups && oc == groups;
  if (is_depthwise_mode &&
      !((groups == 1 || groups >= 5) && dilations[0] == 1 &&
...@@ -90,26 +91,19 @@
           "performance.";
  }

  // Create filter node
  auto filter_const_node = graph->AddNode(filter_var_name, *filter);

  // Create bias node if exists bias
  // Supports the bias nodes with the following dimensions
  // 0: {oc}
  // 1: {1, oc, oh, ow}
  // 2: {n, oc, oh, ow}
  std::shared_ptr<ge::Operator> bias_node = nullptr;
  bool is_channel_bias = false;
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_var_name = op_info->Input("Bias").front();
    auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
    auto bias_dims = bias->dims();
    auto bias_data_size = bias_dims.production();
    auto output_data_size = output_dims.production();
...@@ -125,28 +119,26 @@
      // 2: {n, oc, oh, ow}
      bias_shape = output_dims.Vectorize();
    } else {
      LOG(WARNING) << "[NPU] Bias dimension " << bias_dims
                   << " isn't supported in conv2d Op when output dimension is "
                   << output_dims;
      return FAILED;
    }
    if (graph->HasNode(bias_var_name)) {
      // Bias node from input map
      bias_node = graph->GetNode(bias_var_name);
    } else {
      // Bias node with const data
      bias_node = graph->AddNode(bias_var_name, *bias, bias_shape);
    }
  }

  // Create conv node and set input, filter, bias nodes and attributes
  std::shared_ptr<ge::Operator> conv_node = nullptr;
  if (use_depthwise_conv && is_depthwise_mode) {
    auto depthwise_conv_node =
        graph->AddNode<ge::op::ConvolutionDepthwise>(output_var_name);
    depthwise_conv_node->set_input_x(*graph->GetNode(input_var_name));
    depthwise_conv_node->set_input_filter(*filter_const_node);
    depthwise_conv_node->set_attr_mode(1);
    depthwise_conv_node->set_attr_algo(0);
...@@ -161,21 +153,19 @@
        ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    depthwise_conv_node->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    conv_node = depthwise_conv_node;
    // ConvolutionDepthwise Op doesn't support bias, so append Add node to
    // support bias
    if (bias_node != nullptr) {
      auto add_node = graph->AddNode<ge::op::Add>(output_var_name);
      add_node->set_input_x1(*depthwise_conv_node);
      add_node->set_input_x2(*bias_node);
      conv_node = add_node;
    }
  } else {
    auto common_conv_node =
        graph->AddNode<ge::op::Convolution>(output_var_name);
    common_conv_node->set_input_x(*graph->GetNode(input_var_name));
    common_conv_node->set_input_w(*filter_const_node);
    common_conv_node->set_attr_mode(1);
    common_conv_node->set_attr_pad_mode(0);  // NOTSET
...@@ -188,7 +178,6 @@
        ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    common_conv_node->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    conv_node = common_conv_node;
    // Convolution Op only support bias with dimension {1, oc, 1, 1},
    // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow)
...@@ -196,37 +185,32 @@
    if (is_channel_bias) {
      common_conv_node->set_input_b(*bias_node);
    } else {
      auto add_node = graph->AddNode<ge::op::Add>(output_var_name);
      add_node->set_input_x1(*common_conv_node);
      add_node->set_input_x2(*bias_node);
      conv_node = add_node;
    }
  }
  CHECK(conv_node);

  if (fuse_relu) {
    // Append relu node if fuse_relu is true
    auto relu_node = graph->AddNode<ge::op::Activation>(output_var_name);
    relu_node->set_input_x(*conv_node);
    relu_node->set_attr_mode(CvtActMode("relu"));
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, conv2d, paddle::lite::subgraph::npu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, depthwise_conv2d, paddle::lite::subgraph::npu::ConvConverter);
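The bias handling above accepts three layouts and routes channel biases through set_input_b while feeding full-size biases through an extra Add node. A standalone restatement of that classification rule (hypothetical helper, example shapes assumed):

#include <cstdint>
#include <vector>

// Hypothetical restatement of the rule above. output_dims is {n, oc, oh, ow};
// returns false for shapes the converter rejects with FAILED.
bool ClassifyConvBias(const std::vector<int64_t>& bias_dims,
                      const std::vector<int64_t>& output_dims,
                      bool* is_channel_bias) {
  int64_t bias_size = 1, output_size = 1;
  for (auto d : bias_dims) bias_size *= d;
  for (auto d : output_dims) output_size *= d;
  if (bias_size == output_dims[1]) {  // {oc}: per-channel, set_input_b path
    *is_channel_bias = true;
    return true;
  }
  if (bias_size == output_size / output_dims[0] ||  // {1, oc, oh, ow}
      bias_size == output_size) {                   // {n, oc, oh, ow}
    *is_channel_bias = false;  // handled through an appended Add node
    return true;
  }
  return false;
}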
...@@ -12,30 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ConvTransposeConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " << op_type << "... ";

  // Get input, output and op attributes
  auto input_var_name = op_info->Input("Input").front();
  auto input = scope->FindVar(input_var_name)->GetMutable<Tensor>();
  auto input_shape = input->dims().Vectorize();
  auto output_var_name = op_info->Output("Output").front();
  auto filter_var_name = op_info->Input("Filter").front();
  auto filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
  auto filter_shape = filter->dims().Vectorize();
  CHECK_EQ(input_shape.size(), 4);
  CHECK_EQ(filter_shape.size(), 4);
...@@ -54,42 +55,34 @@
    }
  }
  CHECK_EQ(paddings.size(), 4L)
      << "[NPU] Paddings size should be the same or twice as the input size.";

  // Create deconv node
  auto conv_transpose_node =
      graph->AddNode<ge::op::Deconvolution>(output_var_name);

  // Create input sizes node to describe the dimensions of input tensor
  std::vector<int32_t> input_sizes;
  input_sizes.push_back(input_shape[0]);
  input_sizes.push_back(filter_shape[1] * groups);
  for (int i = 0; i < strides.size(); i++) {
    int kernel_ext = dilations[i] * (filter_shape[i + 2] - 1) + 1;
    int output_size =
        (input_shape[i + 2] - 1) * strides[i] + kernel_ext - 2 * paddings[i];
    input_sizes.push_back(output_size);
  }
  auto input_sizes_const_node =
      graph->AddNode(output_var_name + "/input_sizes", input_sizes);
  conv_transpose_node->set_input_input_sizes(*input_sizes_const_node);

  // Create filter node
  auto filter_const_node = graph->AddNode(filter_var_name, *filter);
  conv_transpose_node->set_input_filter(*filter_const_node);

  // Set input node
  conv_transpose_node->set_input_x(*graph->GetNode(input_var_name));

  // Set attributes
  conv_transpose_node->set_attr_format(0);    // NCHW
  conv_transpose_node->set_attr_pad_mode(0);  // NOTSET
  conv_transpose_node->set_attr_group(groups);
...@@ -101,50 +94,39 @@
      ge::AttrValue::LIST_INT({strides[0], strides[1]}));
  conv_transpose_node->set_attr_kernel(
      ge::AttrValue::LIST_INT({filter_shape[2], filter_shape[3]}));

  // Append add node to add bias if exists bias
  std::shared_ptr<ge::Operator> output_node = conv_transpose_node;
  if (HasInputArg(op_info, scope, "Bias")) {
    // Create bias node
    auto bias_var_name = op_info->Input("Bias").front();
    CHECK(!graph->HasNode(bias_var_name));
    auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
    auto channel_size = bias->dims().production();
    CHECK_EQ(channel_size, filter_shape[1] * groups);
    auto bias_const_node =
        graph->AddNode(bias_var_name, *bias, {1, channel_size, 1, 1});
    // Append add node to add bias node
    auto add_node = graph->AddNode<ge::op::Add>(output_var_name);
    add_node->set_input_x1(*conv_transpose_node);
    add_node->set_input_x2(*bias_const_node);
    output_node = add_node;
  }

  if (fuse_relu) {
    // Append relu node if fuse_relu is true
    auto relu_node = graph->AddNode<ge::op::Activation>(output_var_name);
    relu_node->set_input_x(*output_node);
    relu_node->set_attr_mode(CvtActMode("relu"));
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, conv2d_transpose, paddle::lite::subgraph::npu::ConvTransposeConverter);
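The input_sizes loop above applies the standard transposed-convolution size formula, output = (input - 1) * stride + dilation * (kernel - 1) + 1 - 2 * padding. A worked instance with assumed numbers:

#include <cstdio>

// Example: 16x16 spatial input, stride 2, dilation 1, 3x3 kernel, padding 1.
int main() {
  int input = 16, stride = 2, dilation = 1, kernel = 3, pad = 1;
  int kernel_ext = dilation * (kernel - 1) + 1;              // = 3
  int output = (input - 1) * stride + kernel_ext - 2 * pad;  // = 31
  std::printf("deconv output size: %d\n", output);
  return 0;
}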
...@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
  auto x_dims = x.dims();
  CHECK_EQ(x_dims.size(), 4UL) << "[NPU] Only support 4-dimension x";
  auto y_dims = y->dims();
  CHECK_GE(x_dims.size(), y_dims.size());
...@@ -45,93 +45,86 @@
  return y_new_shape;
}

int ElementwiseConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Input("Y").front();
  auto out_var_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");

  std::shared_ptr<ge::Operator> elementwise_node = nullptr;
  std::shared_ptr<ge::Operator> x_node = graph->GetNode(x_var_name);
  std::shared_ptr<ge::Operator> y_node = nullptr;
  if (graph->HasNode(y_var_name)) {
    y_node = graph->GetNode(y_var_name);
  } else {
    auto x = scope->FindTensor(x_var_name);
    auto y = scope->FindMutableTensor(y_var_name);
    auto y_new_shape = CvtYShape(*x, y, axis);
    y_node = graph->AddNode(y_var_name, y, y_new_shape);
  }

  if (op_type == "elementwise_add" ||
      op_type == "fusion_elementwise_add_activation") {
    auto elt_node = graph->AddNode<ge::op::Add>(out_var_name);
    elt_node->set_input_x1(*x_node);
    elt_node->set_input_x2(*y_node);
    elementwise_node = elt_node;
  } else if (op_type == "elementwise_sub") {
    auto elt_node = graph->AddNode<ge::op::Sub>(out_var_name);
    elt_node->set_input_x1(*x_node);
    elt_node->set_input_x2(*y_node);
    elementwise_node = elt_node;
  } else if (op_type == "elementwise_mul") {
    auto elt_node = graph->AddNode<ge::op::Mul>(out_var_name);
    elt_node->set_input_x(*x_node);
    elt_node->set_input_y(*y_node);
    elementwise_node = elt_node;
  } else if (op_type == "elementwise_div") {
    auto elt_node = graph->AddNode<ge::op::RealDiv>(out_var_name);
    elt_node->set_input_x1(*x_node);
    elt_node->set_input_x2(*y_node);
    elementwise_node = elt_node;
  } else {
    LOG(WARNING) << "[NPU] Unsupported op type: " << op_type;
    return FAILED;
  }

  if (op_type == "fusion_elementwise_add_activation") {
    auto act_type = op_info->GetAttr<std::string>("act_type");
    auto act_node = graph->AddNode<ge::op::Activation>(out_var_name);
    act_node->set_input_x(*elementwise_node);
    // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
    // clipped_relu etc.
    act_node->set_attr_mode(CvtActMode(act_type));
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, elementwise_add, paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, fusion_elementwise_add_activation, paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, elementwise_sub, paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, elementwise_mul, paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU, elementwise_div, paddle::lite::subgraph::npu::ElementwiseConverter);
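CvtYShape aligns y into x's 4-D layout starting at axis so the NPU sees broadcast-compatible operands. A standalone restatement with an assumed example (intended to mirror CvtYShape, whose body is elided in this diff):

#include <cstdint>
#include <vector>

// Align y's dims into x's 4-D layout starting at `axis`, padding with 1s.
// E.g. x {2, 3, 4, 5}, y {3, 4}, axis 1  ->  {1, 3, 4, 1}.
std::vector<int64_t> AlignYShape(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& y_dims,
                                 int axis) {
  if (axis < 0) axis = static_cast<int>(x_dims.size() - y_dims.size());
  std::vector<int64_t> y_new_shape(x_dims.size(), 1);
  for (size_t i = 0; i < y_dims.size(); i++) {
    y_new_shape[axis + i] = y_dims[i];
  }
  return y_new_shape;
}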
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/engine.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
int Engine::BuildDeviceProgram() { return FAILED; }
int Engine::LaunchDeviceProgram() { return 0; }
int Engine::BuildOriginProgram() {
// TODO(hong19860320) The block_desc needs to be divided into subgraphs at
// execution time, but for now the whole block is treated as a single
// subgraph.
origin_program_.clear();
for (int op_idx = 0; op_idx < block_desc_->OpsSize(); op_idx++) {
auto op_desc = block_desc_->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
std::string op_type = op_desc->Type();
auto op = LiteOpRegistry::Global().Create(op_desc->Type());
op->Attach(*op_desc, scope_);
std::unique_ptr<KernelBase> picked_kernel;
if (op_desc->HasAttr(kKernelTypeAttr)) {
// Create op and pick up kernel according to the kKernelTypeAttr attribute
auto kernel_type = op_desc->GetAttr<std::string>(kKernelTypeAttr);
std::string alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
VLOG(3) << "Found the attr '" << kKernelTypeAttr << "': " << kernel_type
<< " for " << op_type;
auto kernels = op->CreateKernels({place});
CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type;
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
picked_kernel = std::move(*it);
} else {
VLOG(3) << "The attr '" << kKernelTypeAttr
<< "' not found, pick the first kernel for " << op_type;
#if defined(LITE_WITH_ARM)
auto kernels = op->CreateKernels({Place{TARGET(kARM)}});
#elif defined(LITE_WITH_X86)
auto kernels = op->CreateKernels({Place{TARGET(kX86)}});
#endif
CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type;
picked_kernel = std::move(kernels.front());
}
picked_kernel->SetContext(
ContextScheduler::Global().NewContext(picked_kernel->target()));
origin_program_.emplace_back(std::move(op), std::move(picked_kernel));
}
return 0;
}
int Engine::LaunchOriginProgram() {
for (auto& inst : origin_program_) {
auto op_type = inst.op()->op_info()->Type();
if (op_type == "feed" || op_type == "fetch") continue;
inst.Run();
}
return 0;
}
int Engine::Build() {
  // In order to attach all of the ops of the block desc, we need to build
  // the origin program first.
  BuildOriginProgram();
  // Run InferShape() on all of the ops, and convert Paddle ops to the NPU/XPU
  // IR graph
build_device_program_status_ = BuildDeviceProgram();
return build_device_program_status_;
}
bool Engine::InputShapeChanged() {
for (int i = 0; i < origin_itensors_.size(); i++) {
if (origin_itensors_[i]->dims() != origin_idims_[i]) {
return true;
}
}
return false;
}
int Engine::Launch() {
// Rebuild device program when the shapes of input tensors have been changed.
if (CHECK_SUCCESS(build_device_program_status_) &&
CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) &&
InputShapeChanged()) {
Build();
}
if (CHECK_FAILED(build_device_program_status_)) {
LaunchOriginProgram();
} else {
LaunchDeviceProgram();
}
return 0;
}
} // namespace subgraph
} // namespace lite
} // namespace paddle
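For orientation, a minimal sketch of how a device backend could specialize this base class (the subclass and its comments are illustrative, not part of this patch): returning REBUILD_WHEN_SHAPE_CHANGED from BuildDeviceProgram() is what makes Launch() rebuild the device program when InputShapeChanged() fires.

class FakeDeviceEngine : public Engine {  // hypothetical subclass, for illustration
 public:
  using Engine::Engine;

 protected:
  int BuildDeviceProgram() override {
    // Convert the ops of origin_program_ into a device IR graph here,
    // then compile it into an executable device model.
    return REBUILD_WHEN_SHAPE_CHANGED;
  }
  int LaunchDeviceProgram() override {
    // Feed origin_itensors_, run the device model, fetch origin_otensors_.
    return 0;
  }
};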
@@ -14,52 +14,63 @@

#pragma once

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/program.h"
#include "lite/core/tensor.h"

namespace paddle {
namespace lite {
namespace subgraph {

class Engine {
 public:
  Engine(int block_idx,
         cpp::BlockDesc *block_desc,
         const std::vector<std::string> &input_names,
         const std::vector<std::string> &output_names,
         lite::Scope *scope)
      : block_idx_(block_idx),
        block_desc_(block_desc),
        input_names_(input_names),
        output_names_(output_names),
        scope_(scope) {}
  virtual ~Engine() = default;

  virtual int Build();
  virtual int Launch();

 private:
  Engine(const Engine &) = delete;

 protected:
  virtual int BuildDeviceProgram();
  virtual int LaunchDeviceProgram();

  virtual int BuildOriginProgram();
  virtual int LaunchOriginProgram();

  virtual bool InputShapeChanged();

  int block_idx_;
  cpp::BlockDesc *block_desc_;
  std::vector<std::string> input_names_;
  std::vector<std::string> output_names_;
  Scope *scope_{nullptr};
  // SUCCESS: the device program was built successfully.
  // FAILED: the device program build failed.
  // REBUILD_WHEN_SHAPE_CHANGED: the device program was built successfully,
  // but needs to be rebuilt when the input shape changes.
  int build_device_program_status_{0};
  std::vector<DDim> origin_idims_;
  std::vector<DDim> origin_odims_;
  std::vector<Tensor *> origin_itensors_;
  std::vector<Tensor *> origin_otensors_;
  std::vector<Instruction> origin_program_;
};

}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
@@ -12,31 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int FCConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("Input").front();
  auto w_var_name = op_info->Input("W").front();
  auto out_var_name = op_info->Output("Out").front();
  int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
  auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
  auto x_dims = x->dims();
  auto w_dims = w->dims();

@@ -50,71 +50,54 @@
  VLOG(3) << "[NPU] x dims: " << x_dims << " w dims: " << w_dims << " m: " << m
          << " k: " << k << " n: " << n;

  auto fc_node = graph->AddNode<ge::op::FullConnection>(out_var_name + "/fc");
  CHECK(!graph->HasNode(w_var_name));

  // Reshape x to (m, k, 1, 1)
  auto reshaped_x_node =
      graph->AddNode<ge::op::Reshape>(x_var_name + "/reshape");
  reshaped_x_node->set_input_tensor(*graph->GetNode(x_var_name));
  reshaped_x_node->set_attr_shape({m, k, 1, 1});
  reshaped_x_node->set_attr_axis(0);
  fc_node->set_input_x(*reshaped_x_node);

  // Create w const node, set its shape to (n, k, 1, 1) and fill with
  // the transposed w tensor
  Tensor transpose_w;
  transpose_w.Resize({n, k, 1, 1});
  auto transpose_w_data = transpose_w.mutable_data<float>();
  auto w_data = w->mutable_data<float>();
  for (int i = 0; i < k; i++) {
    for (int j = 0; j < n; j++) {
      transpose_w_data[j * k + i] = w_data[i * n + j];
    }
  }
  auto w_const_node = graph->AddNode(w_var_name, transpose_w);
  fc_node->set_input_w(*w_const_node);

  // Add bias node if bias tensor exists
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_var_name = op_info->Input("Bias").front();
    auto bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
    auto bias_dims = bias->dims();
    CHECK(!graph->HasNode(bias_var_name));
    CHECK_EQ(bias_dims.production(), n);
    auto bias_const_node = graph->AddNode(bias_var_name, *bias, {1, n, 1, 1});
    fc_node->set_input_b(*bias_const_node);
  }

  // Reshape output of fc_node from (m, n, 1, 1) to (m, n)
  auto reshaped_fc_node = graph->AddNode<ge::op::Reshape>(out_var_name);
  reshaped_fc_node->set_input_tensor(*fc_node);
  reshaped_fc_node->set_attr_shape({m, n});
  reshaped_fc_node->set_attr_axis(0);
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, fc, paddle::lite::subgraph::npu::FCConverter);
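The transposed-weight indexing above can be checked in isolation; a self-contained sketch (values assumed):

// Standalone check of the transpose indexing used above: element (i, j) of
// the row-major (k, n) weight lands at (j, i) of the row-major (n, k) copy.
#include <cassert>
#include <vector>

int main() {
  const int k = 2, n = 3;
  std::vector<float> w = {0, 1, 2,   // row 0
                          3, 4, 5};  // row 1
  std::vector<float> t(n * k);
  for (int i = 0; i < k; i++) {
    for (int j = 0; j < n; j++) {
      t[j * k + i] = w[i * n + j];
    }
  }
  assert(t == (std::vector<float>{0, 3, 1, 4, 2, 5}));
  return 0;
}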
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/graph.h"
#include <utility>
#include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {
// Const node
std::shared_ptr<ge::op::Const> Graph::AddNode(const std::string& name,
const Tensor& tensor,
PrecisionType ptype,
DataLayoutType ltype) {
return AddNode(name, tensor, tensor.dims().Vectorize(), ptype, ltype);
}
std::shared_ptr<ge::op::Const> Graph::AddNode(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType ptype,
DataLayoutType ltype) {
CHECK(!HasNode(name)) << "Node " << name << " redefined.";
auto node = AddNode<ge::op::Const>(name);
node->set_attr_value(CvtTensor(tensor, shape, ptype, ltype));
return node;
}
// Data node
std::shared_ptr<ge::op::Data> Graph::AddNode(const std::string& name,
std::vector<int64_t> shape,
PrecisionType ptype,
DataLayoutType ltype) {
CHECK(!HasNode(name)) << "Node " << name << " redefined.";
auto node = AddNode<ge::op::Data>(name);
ge::TensorDesc desc(
ge::Shape(shape), CvtDataLayoutType(ltype), CvtPrecisionType(ptype));
node->update_input_desc_x(desc);
nodes_.insert(std::make_pair(name, node));
return node;
}
} // namespace npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {
// Graph of HiAI IR nodes: maintains the mapping from Paddle var names to the
// converted HiAI IR nodes
class Graph {
public:
template <typename T>
std::shared_ptr<T> AddNode(const std::string& name) {
auto unique_name = [&](const std::string& key) {
int idx = 1;
auto it = counts_.find(key);
if (it == counts_.end()) {
counts_.insert(std::make_pair(key, idx));
} else {
idx = ++(it->second);
}
return key + "_" + std::to_string(idx);
};
auto it = nodes_.find(name);
if (it != nodes_.end()) {
// Generate a new unique name as the key to bind the origin node:
// new_name->node
nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second));
nodes_.erase(it);
}
// Create a new node and bind with the name: name->new_node
auto node = std::make_shared<T>(unique_name(name + "_op"));
nodes_.insert(std::make_pair(name, node));
return node;
}
// Const node
std::shared_ptr<ge::op::Const> AddNode(
const std::string& name,
const Tensor& tensor,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
std::shared_ptr<ge::op::Const> AddNode(
const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
template <typename T>
std::shared_ptr<ge::op::Const> AddNode(
const std::string& name,
const std::vector<T>& data,
std::vector<int64_t> shape = {},
DataLayoutType ltype = DATALAYOUT(kNCHW)) {
const std::type_info& info = typeid(T);
PrecisionType ptype = PRECISION(kFloat);
    if (info == typeid(float)) {
      ptype = PRECISION(kFloat);
    } else if (info == typeid(int8_t)) {
      ptype = PRECISION(kInt8);
    } else if (info == typeid(int32_t)) {
      ptype = PRECISION(kInt32);
    } else {
      LOG(FATAL) << "[NPU] Unknown data type " << info.name();
    }
if (shape.empty()) {
shape = {static_cast<int64_t>(data.size())};
} else {
int size = 1;
for (auto i : shape) {
size *= i;
}
CHECK_EQ(data.size(), size);
}
Tensor tensor;
tensor.Resize(shape);
std::memcpy(reinterpret_cast<uint8_t*>(tensor.mutable_data<T>()),
reinterpret_cast<const uint8_t*>(data.data()),
data.size() * sizeof(T));
return AddNode(name, tensor, ptype, ltype);
}
template <typename T>
std::shared_ptr<ge::op::Const> AddNode(
const std::string& name,
T value,
std::vector<int64_t> shape = {1},
DataLayoutType ltype = DATALAYOUT(kNCHW)) {
int64_t size = 1;
for (auto i : shape) {
size *= i;
}
std::vector<T> data(size, value);
return AddNode(name, data, shape, ltype);
}
// Data node
std::shared_ptr<ge::op::Data> AddNode(
const std::string& name,
std::vector<int64_t> shape,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
std::shared_ptr<ge::Operator> GetNode(std::string name) {
CHECK(HasNode(name)) << "[NPU] Node " << name << " not found.";
return nodes_.at(name);
}
bool HasNode(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
private:
std::unordered_map<std::string, std::shared_ptr<ge::Operator>> nodes_;
std::unordered_map<std::string, int> counts_;
};
} // namespace npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
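A hedged usage sketch of the AddNode overloads above (all variable and node names below are made up for illustration):

// Illustrative only: how a bridge might use Graph.
void BuildExample(paddle::lite::subgraph::npu::Graph* graph,
                  const paddle::lite::Tensor& weight) {
  // Data node for a model input, float NCHW by default.
  auto x = graph->AddNode("x", std::vector<int64_t>{1, 3, 224, 224});
  // Const node filled from a lite Tensor.
  auto w = graph->AddNode("w", weight);
  // Const node from a scalar, broadcast to the given shape.
  auto one = graph->AddNode("one", 1.0f, std::vector<int64_t>{1});
  // An IR op node; adding "y" again later would rebind this node under a
  // uniqued key ("y_var_1"), so "y" always maps to the newest node.
  auto act = graph->AddNode<ge::op::Activation>("y");
  act->set_input_x(*graph->GetNode("x"));
  (void)w;
  (void)one;
}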
@@ -12,34 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int InterpolateConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Get input, output and attributes from lite op
  auto x_var_name = op_info->Input("X").front();
  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
  auto x_dims = x->dims();
  auto x_h = x_dims[2];
  auto x_w = x_dims[3];
  CHECK_EQ(x_dims.size(), 4);
  auto out_var_name = op_info->Output("Out").front();
  auto scale = op_info->GetAttr<float>("scale");
  auto out_w = op_info->GetAttr<int>("out_w");
  auto out_h = op_info->GetAttr<int>("out_h");

@@ -50,7 +48,7 @@
                "align_corners = false isn't "
                "supported in HiAI DDK";

  // Priority: OutSize > scale > out_h/out_w
  if (scale > 0) {
    out_h = static_cast<int>(x_h * scale);
    out_w = static_cast<int>(x_w * scale);

@@ -58,18 +56,17 @@
    out_w = out_w > 0 ? out_w : -1;
  }

  // Update out_h and out_w if has OutSize
  std::shared_ptr<ge::Operator> out_size_node = nullptr;
  if (HasInputArg(op_info, scope, "OutSize")) {
    auto out_size_var_name = op_info->Input("OutSize").front();
    if (graph->HasNode(out_size_var_name)) {
      out_size_node = graph->GetNode(out_size_var_name);
    } else {
      auto out_size = scope->FindVar(out_size_var_name)->GetMutable<Tensor>();
      CHECK_EQ(out_size->numel(), 2);
      auto out_size_data = out_size->mutable_data<int>();
      // Update out_h and out_w if has OutSize
      out_h = out_size_data[0];
      out_w = out_size_data[1];
    }

@@ -83,46 +80,37 @@
                   << " is too large, should not exceed " << largest_multiple
                   << " in HiAI DDK";
    }
    out_size_node = graph->AddNode(out_var_name + "/out_size",
                                   std::vector<int>({out_h, out_w}));
  }

  if (interp_method == "bilinear") {
    auto bilinear_interp_node =
        graph->AddNode<ge::op::ResizeBilinear>(out_var_name);
    bilinear_interp_node->set_input_x(*graph->GetNode(x_var_name));
    bilinear_interp_node->set_input_size(*out_size_node);
    bilinear_interp_node->set_attr_align_corners(align_corners);
  } else if (interp_method == "nearest") {
    auto nearest_interp_node =
        graph->AddNode<ge::op::ResizeNearestNeighbor>(out_var_name);
    nearest_interp_node->set_input_image(*graph->GetNode(x_var_name));
    nearest_interp_node->set_input_size(*out_size_node);
    nearest_interp_node->set_attr_align_corners(align_corners);
  } else {
    LOG(WARNING) << "[NPU] Unsupported interpolate method: " << interp_method;
    return FAILED;
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU,
                         bilinear_interp,
                         paddle::lite::subgraph::npu::InterpolateConverter);
REGISTER_SUBGRAPH_BRIDGE(NPU,
                         nearest_interp,
                         paddle::lite::subgraph::npu::InterpolateConverter);
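A worked example of the size-resolution priority above, with assumed values:

// Worked example (values assumed): x shape {1, 3, 32, 32}, scale = 2.0,
// OutSize tensor = {100, 80}:
//   scale > 0        -> out_h = 64,  out_w = 64
//   OutSize present  -> out_h = 100, out_w = 80   (highest priority)
// matching the stated priority OutSize > scale > out_h/out_w.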
@@ -12,24 +12,24 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

// Note: all of the input weight vars should be handled in this converter
int MulConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Input("Y").front();

@@ -37,6 +37,7 @@
  auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
  auto x_dims = x->dims();
  auto y_dims = y->dims();
  auto out_var_name = op_info->Output("Out").front();
  int x_num_col_dims = op_info->GetAttr<int>("x_num_col_dims");
  int y_num_col_dims = op_info->GetAttr<int>("y_num_col_dims");
  int m = x_dims.Slice(0, x_num_col_dims).production();

@@ -44,61 +45,47 @@
  CHECK_EQ(k, y_dims.Slice(0, y_num_col_dims).production())
      << "[NPU] columns of X must be equal with rows of Y";
  int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production();
  VLOG(3) << "m:" << m << ",n:" << n << ",k:" << k;
  VLOG(3) << "x_var_name:" << x_var_name
          << ", is data: " << graph->HasNode(x_var_name);
  VLOG(3) << "y_var_name:" << y_var_name
          << ", is data: " << graph->HasNode(y_var_name);
  CHECK(graph->HasNode(x_var_name))
      << "[NPU] MatMul in HiAI DDK only supports X as data and Y as const yet.";
  auto mul_node = graph->AddNode<ge::op::MatMul>(out_var_name);

  // Add input x node which supports persistable and non-persistable tensor,
  // and reshape to (m, k)
  if (graph->HasNode(x_var_name)) {
    auto reshaped_x_node =
        graph->AddNode<ge::op::Reshape>(x_var_name + "/reshape");
    reshaped_x_node->set_input_tensor(*graph->GetNode(x_var_name));
    reshaped_x_node->set_attr_shape({m, k});
    reshaped_x_node->set_attr_axis(0);
    mul_node->set_input_x1(*reshaped_x_node);
  } else {
    auto x_const_node = graph->AddNode(x_var_name, *x, {m, k});
    mul_node->set_input_x1(*x_const_node);
  }

  // Add input y node which only supports persistable tensor, and reshape to
  // (k, n)
  if (graph->HasNode(y_var_name)) {
    auto reshaped_y_node =
        graph->AddNode<ge::op::Reshape>(y_var_name + "/reshape");
    reshaped_y_node->set_input_tensor(*graph->GetNode(y_var_name));
    reshaped_y_node->set_attr_shape({k, n});
    reshaped_y_node->set_attr_axis(0);
    mul_node->set_input_x2(*reshaped_y_node);
  } else {
    auto y_const_node = graph->AddNode(y_var_name, *y, {k, n});
    mul_node->set_input_x2(*y_const_node);
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU, mul, paddle::lite::subgraph::npu::MulConverter);
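A worked example of the (m, k, n) flattening above, with assumed shapes:

// Assumed shapes: x_dims = {2, 3, 4, 5} with x_num_col_dims = 2,
// y_dims = {20, 7} with y_num_col_dims = 1:
//   m = 2 * 3  = 6    // x_dims.Slice(0, 2).production()
//   k = 4 * 5  = 20   // x_dims.Slice(2, 4).production()
//   k'= 20            // y_dims.Slice(0, 1).production(), must equal k
//   n = 7             // y_dims.Slice(1, 2).production()
// so x is reshaped to (6, 20), y to (20, 7), and MatMul yields (6, 7).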
@@ -12,38 +12,39 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int Pad2dConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto pad2d_node = graph->AddNode<ge::op::Pad>(out_var_name);
  pad2d_node->set_input_x(*graph->GetNode(x_var_name));

  auto mode = op_info->GetAttr<std::string>("mode");
  if (mode == "constant") {
    pad2d_node->set_attr_mode(0);
  } else if (mode == "reflect") {
    LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK";
    pad2d_node->set_attr_mode(1);
    return FAILED;
  } else {
    LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK";
    return FAILED;
  }

  auto x_dims = scope->FindTensor(x_var_name)->dims();

@@ -51,34 +52,25 @@
  CHECK_EQ(padding.size(), 4);
  int xds = x_dims.size();
  padding.insert(padding.begin(), xds * 2 - 4, 0);
  auto padding_const_node =
      graph->AddNode(out_var_name + "/padding", padding, {xds, 2});
  pad2d_node->set_input_padding(*padding_const_node);

  if (mode == "constant") {
    auto pad_value = op_info->GetAttr<float>("pad_value");
    auto pad_value_const_node =
        graph->AddNode(out_var_name + "/pad_value", pad_value);
    pad2d_node->set_input_constant_values(*pad_value_const_node);
    pad2d_node->set_attr_T(0);  // type of pad_value: 0:float 3:int32
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU,
                         pad2d,
                         paddle::lite::subgraph::npu::Pad2dConverter);
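A worked example of the padding layout handed to HiAI above, with assumed values:

// Assumed example: 4-D NCHW input, pad2d paddings = {1, 2, 3, 4}
// (top, bottom, left, right). After padding.insert(begin, 4*2-4 = 4 zeros):
//   {0, 0, 0, 0, 1, 2, 3, 4}
// viewed as the (xds, 2) = (4, 2) table of before/after pads per dimension:
//   N: {0, 0}   C: {0, 0}   H: {1, 2}   W: {3, 4}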
@@ -14,46 +14,40 @@

#pragma once

USE_SUBGRAPH_BRIDGE(NPU, sigmoid);
USE_SUBGRAPH_BRIDGE(NPU, relu);
USE_SUBGRAPH_BRIDGE(NPU, tanh);
USE_SUBGRAPH_BRIDGE(NPU, relu_clipped);
USE_SUBGRAPH_BRIDGE(NPU, leaky_relu);
USE_SUBGRAPH_BRIDGE(NPU, softsign);
USE_SUBGRAPH_BRIDGE(NPU, hard_sigmoid);

USE_SUBGRAPH_BRIDGE(NPU, batch_norm);
USE_SUBGRAPH_BRIDGE(NPU, concat);
USE_SUBGRAPH_BRIDGE(NPU, conv2d);
USE_SUBGRAPH_BRIDGE(NPU, depthwise_conv2d);
USE_SUBGRAPH_BRIDGE(NPU, conv2d_transpose);

USE_SUBGRAPH_BRIDGE(NPU, elementwise_add);
USE_SUBGRAPH_BRIDGE(NPU, fusion_elementwise_add_activation);
USE_SUBGRAPH_BRIDGE(NPU, elementwise_sub);
USE_SUBGRAPH_BRIDGE(NPU, elementwise_mul);
USE_SUBGRAPH_BRIDGE(NPU, elementwise_div);

USE_SUBGRAPH_BRIDGE(NPU, fc);
USE_SUBGRAPH_BRIDGE(NPU, bilinear_interp);
USE_SUBGRAPH_BRIDGE(NPU, nearest_interp);
USE_SUBGRAPH_BRIDGE(NPU, mul);
USE_SUBGRAPH_BRIDGE(NPU, pad2d);
USE_SUBGRAPH_BRIDGE(NPU, pool2d);
USE_SUBGRAPH_BRIDGE(NPU, reduce_mean);
USE_SUBGRAPH_BRIDGE(NPU, reshape);
USE_SUBGRAPH_BRIDGE(NPU, reshape2);
USE_SUBGRAPH_BRIDGE(NPU, scale);
USE_SUBGRAPH_BRIDGE(NPU, shuffle_channel);
USE_SUBGRAPH_BRIDGE(NPU, softmax);
USE_SUBGRAPH_BRIDGE(NPU, split);
USE_SUBGRAPH_BRIDGE(NPU, sqrt);
USE_SUBGRAPH_BRIDGE(NPU, square);
USE_SUBGRAPH_BRIDGE(NPU, transpose);
USE_SUBGRAPH_BRIDGE(NPU, transpose2);
@@ -13,30 +13,29 @@
// limitations under the License.

#include "lite/operators/pool_op.h"
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int PoolConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto x = scope->FindTensor(x_var_name);
  auto out_var_name = op_info->Output("Out").front();
  auto pool_node = graph->AddNode<ge::op::Pooling>(out_var_name);
  pool_node->set_input_x(*graph->GetNode(x_var_name));

  int mode = 0;
  auto pooling_type = op_info->GetAttr<std::string>("pooling_type");

@@ -47,7 +46,8 @@
    CHECK(op_info->GetAttr<bool>("exclusive"))
        << "[NPU] exclusive must be true in HiAI DDK";
  } else {
    LOG(WARNING) << "[NPU] Unsupported pooling type: " << pooling_type;
    return FAILED;
  }
  pool_node->set_attr_mode(mode);

@@ -67,8 +67,8 @@
  pool_node->set_attr_global_pooling(global_pooling);

  auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
  pool_node->set_attr_window(
      ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()));

  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
  if (paddings.size() == 2L) {

@@ -78,42 +78,38 @@
    }
  }
  CHECK_EQ(paddings.size(), 4L)
      << "[NPU] Paddings size should be the same or twice as the inputs size.";

  bool adaptive = false;
  if (op_info->HasAttr("adaptive")) {
    adaptive = op_info->GetAttr<bool>("adaptive");
  }
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  lite::operators::UpdatePadding(&paddings,
                                 global_pooling,
                                 adaptive,
                                 padding_algorithm,
                                 x->dims(),
                                 strides,
                                 ksize);

  pool_node->set_attr_pad(ge::AttrValue::LIST_INT{
      paddings[0], paddings[1], paddings[2], paddings[3]});
  pool_node->set_attr_stride(
      ge::AttrValue::LIST_INT(strides.begin(), strides.end()));

  int ceil_mode = 0;
  if (op_info->HasAttr("ceil_mode")) {
    ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
  }
  pool_node->set_attr_ceil_mode(ceil_mode);
  // pool_node->set_attr_data_mode(data_mode);
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU,
                         pool2d,
                         paddle::lite::subgraph::npu::PoolConverter);
@@ -12,30 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int ReduceMeanConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Get input and op attributes
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto x_dims = scope->FindTensor(x_var_name)->dims();
  auto keep_dim = op_info->GetAttr<bool>("keep_dim");
  auto dim = op_info->GetAttr<std::vector<int>>("dim");
  CHECK(!dim.empty()) << "[NPU] \"dim\" of reduce_mean should not be empty.";
  for (size_t i = 0; i < dim.size(); i++) {
    if (dim[i] < 0) {
      dim[i] += x_dims.size();

@@ -43,30 +44,16 @@
    }
  }
  std::sort(dim.begin(), dim.end());

  // Create reduce_mean (using reduce_sum + scale) node and set input node
  // from the node map
  auto reduce_sum_node =
      graph->AddNode<ge::op::ReduceSum>(out_var_name + "/reducesum");
  reduce_sum_node->set_input_x(*graph->GetNode(x_var_name));
  auto dim_const_node = graph->AddNode(out_var_name + "/dim", dim);
  reduce_sum_node->set_input_w(*dim_const_node);
  reduce_sum_node->set_attr_keep_dims(keep_dim);

  float scale = 1;
  for (size_t i = 0; i < dim.size(); i++) {
    scale /= x_dims[dim[i]];

@@ -88,24 +75,19 @@
  }

  auto filter_const_node =
      graph->AddNode(out_var_name + "/filter", scale, scale_bias_shape);
  auto scale_node = graph->AddNode<ge::op::Scale>(out_var_name);
  scale_node->set_input_x(*reduce_sum_node);
  scale_node->set_input_filter(*filter_const_node);
  scale_node->set_attr_axis(1);
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(NPU,
                         reduce_mean,
                         paddle::lite::subgraph::npu::ReduceMeanConverter);
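A worked example of the scale factor above, with assumed values:

// Assumed example: x_dims = {2, 3, 4, 5}, dim = {1, 2}:
//   scale = 1 / (x_dims[1] * x_dims[2]) = 1 / (3 * 4) = 1/12
// so ReduceSum over dims {1, 2} followed by Scale(1/12) equals reduce_mean.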
@@ -17,25 +17,38 @@

namespace paddle {
namespace lite {
namespace subgraph {

Registry& Registry::Instance() {
  static Registry x;
  return x;
}

void Registry::Insert(const std::string& dev_type,
                      const std::string& op_type,
                      const cvt_func_type& cvt_func_name) {
  auto it = map_.find(dev_type);
  if (it == map_.end()) {
    map_.insert(std::make_pair(
        dev_type, std::unordered_map<std::string, cvt_func_type>()));
  }
  map_.at(dev_type).insert(std::make_pair(op_type, cvt_func_name));
}

const cvt_func_type& Registry::Select(const std::string& dev_type,
                                      const std::string& op_type) const {
  return map_.at(dev_type).at(op_type);
}

bool Registry::Exists(const std::string& dev_type,
                      const std::string& op_type) const {
  bool found = map_.find(dev_type) != map_.end();
  if (found) {
    found = map_.at(dev_type).find(op_type) != map_.at(dev_type).end();
  }
  return found;
}

}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
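A hedged sketch of the lookup flow a subgraph kernel might drive with this registry (the function and its fallback policy are illustrative only):

// Illustrative only: check for a bridge before selecting and invoking it.
int ConvertOp(void* graph_ctx, paddle::lite::OpLite* op) {
  auto& registry = paddle::lite::subgraph::Registry::Instance();
  const std::string dev_type = "NPU";
  const std::string op_type = op->op_info()->Type();
  if (!registry.Exists(dev_type, op_type)) {
    // Caller would fall back to the origin program.
    return paddle::lite::subgraph::FAILED;
  }
  return registry.Select(dev_type, op_type)(graph_ctx, op);
}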
@@ -15,44 +15,46 @@

#pragma once

#include <functional>
#include <string>
#include <unordered_map>
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"

namespace paddle {
namespace lite {
namespace subgraph {

const int FAILED = 1;
const int SUCCESS = 0;
const int REBUILD_WHEN_SHAPE_CHANGED = 2;
inline bool CHECK_FAILED(int status) { return status & FAILED; }
inline bool CHECK_SUCCESS(int status) { return !CHECK_FAILED(status); }
inline bool CHECK_REBUILD_WHEN_SHAPE_CHANGED(int status) {
  return status & REBUILD_WHEN_SHAPE_CHANGED;
}

using cvt_func_type = std::function<int(void* ctx, OpLite* op)>;
using cvt_map_type =
    std::unordered_map<std::string,
                       std::unordered_map<std::string, cvt_func_type>>;

class Registry {
 public:
  static Registry& Instance();

  void Insert(const std::string& dev_type,
              const std::string& op_type,
              const cvt_func_type& cvt_func_name);
  const cvt_func_type& Select(const std::string& dev_type,
                              const std::string& op_type) const;
  bool Exists(const std::string& dev_type, const std::string& op_type) const;

  Registry() = default;

 private:
  cvt_map_type map_;
  DISALLOW_COPY_AND_ASSIGN(Registry);
};

}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

@@ -70,17 +72,18 @@
                       __test_global_namespace_##uniq_name##__>::value, \
                   msg)

#define REGISTER_SUBGRAPH_BRIDGE(dev_type, op_type, cvt_func_name)        \
  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                               \
      __reg_subgraph_bridge_##dev_type##_##op_type##__,                   \
      "REGISTER_SUBGRAPH_BRIDGE must be called in global namespace only " \
      "once!");                                                           \
  int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert() {           \
    paddle::lite::subgraph::Registry::Instance().Insert(                  \
        #dev_type, #op_type, cvt_func_name);                              \
    return 0;                                                             \
  }

#define USE_SUBGRAPH_BRIDGE(dev_type, op_type)                            \
  extern int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert();     \
  static int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert_return \
      UNUSED = __reg_subgraph_bridge_##dev_type##_##op_type##_Insert();
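The status codes compose as bit flags, which the CHECK_* helpers above rely on; a small self-contained check, assuming registry.h is on the include path:

#include <cassert>
#include "lite/kernels/npu/bridges/registry.h"

int main() {
  using namespace paddle::lite::subgraph;
  assert(CHECK_SUCCESS(SUCCESS));
  // REBUILD_WHEN_SHAPE_CHANGED is not a failure, only a rebuild marker.
  assert(CHECK_SUCCESS(REBUILD_WHEN_SHAPE_CHANGED));
  assert(CHECK_REBUILD_WHEN_SHAPE_CHANGED(REBUILD_WHEN_SHAPE_CHANGED));
  assert(CHECK_FAILED(FAILED));
  return 0;
}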
@@ -13,48 +13,49 @@
 // limitations under the License.

 #include "lite/operators/reshape_op.h"
-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
-                               const node_map_type& inputs_map) {
-  auto scope = reshape_op->scope();
-  auto op_info = reshape_op->op_info();
+int ReshapeConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  // get input, output and op attributes
+  // Get input, output and op attributes
   auto x_var_name = op_info->Input("X").front();
-  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto out_var_name = op_info->Output("Out").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
   auto x_dims = x->dims();

-  // create reshape node and set input node from inputs_map
-  auto reshape_node = std::make_shared<ge::op::Reshape>(unique_op_type);
-  CHECK(inputs_map.count(x_var_name));
-  reshape_node->set_input_tensor(*inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
+  // Create reshape node and set input node from inputs_map
+  auto reshape_node = graph->AddNode<ge::op::Reshape>(out_var_name);
+  reshape_node->set_input_tensor(*graph->GetNode(x_var_name));

-  // read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr)
-  if (lite::npu::HasInputArg(op_info, scope, "ShapeTensor")) {
-    LOG(FATAL) << "[NPU] not support \"Shape\" from more than one Tensor.";
-  } else if (lite::npu::HasInputArg(op_info, scope, "Shape")) {
+  // Read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr)
+  if (HasInputArg(op_info, scope, "ShapeTensor")) {
+    LOG(WARNING) << "[NPU] not support \"Shape\" from more than one Tensor.";
+    return FAILED;
+  } else if (HasInputArg(op_info, scope, "Shape")) {
     auto actual_shape_var_name = op_info->Input("Shape").front();
-    if (!inputs_map.count(actual_shape_var_name)) {
+    if (!graph->HasNode(actual_shape_var_name)) {
       auto actual_shape =
-          scope->FindVar(actual_shape_var_name)->GetMutable<lite::Tensor>();
+          scope->FindVar(actual_shape_var_name)->GetMutable<Tensor>();
       auto actual_shape_dims = actual_shape->dims();
       auto actual_shape_data = actual_shape->mutable_data<int>();
       auto shape =
           std::vector<int>(actual_shape_data,
                            actual_shape_data + actual_shape_dims.production());
-      auto out_dims = operators::ValidateShape(shape, x_dims);
+      auto out_dims = lite::operators::ValidateShape(shape, x_dims);
       auto out_shape = out_dims.Vectorize();
       if (out_shape.size() > 4) {
         LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, "
@@ -62,19 +63,15 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
                      << out_shape.size();
       }
       auto actual_shape_const_node =
-          std::make_shared<ge::op::Const>(actual_shape_var_name);
-      actual_shape_const_node->set_attr_value(
-          lite::npu::CreateTensorAndFillData(
-              std::vector<int>(out_shape.begin(), out_shape.end())));
+          graph->AddNode(actual_shape_var_name,
+                         std::vector<int>(out_shape.begin(), out_shape.end()));
       reshape_node->set_input_w(*actual_shape_const_node);
-      lite::npu::OpList::Global().add(actual_shape_const_node);
     } else {
-      reshape_node->set_input_w(*inputs_map.at(actual_shape_var_name));
-      lite::npu::OpList::Global().add(inputs_map.at(actual_shape_var_name));
+      reshape_node->set_input_w(*graph->GetNode(actual_shape_var_name));
     }
   } else {
     auto shape = op_info->GetAttr<std::vector<int>>("shape");
-    auto out_dims = operators::ValidateShape(shape, x_dims);
+    auto out_dims = lite::operators::ValidateShape(shape, x_dims);
     auto out_shape = out_dims.Vectorize();
     if (out_shape.size() > 4) {
       LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, "
@@ -84,12 +81,9 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
     reshape_node->set_attr_shape(
         ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
   }
-  lite::npu::OpList::Global().add(reshape_node);

-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = reshape_node;
   if (op_type == "reshape2") {
-    // append an extra reshape node to calc XShape
+    // Append an extra reshape node to calc XShape
     std::vector<int64_t> xshape_dims(x_dims.size() + 1, 1);
     for (size_t i = 0; i < x_dims.size(); i++) {
       xshape_dims[i + 1] = x_dims[i];
@@ -99,24 +93,23 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
                    "but XShape has "
                 << xshape_dims.size();
     }
-    auto xshape_node =
-        std::make_shared<ge::op::Reshape>(unique_op_type + "/xshape");
-    xshape_node->set_input_tensor(*inputs_map.at(x_var_name));
+    auto xshape_var_name = op_info->Output("XShape").front();
+    auto xshape_node = graph->AddNode<ge::op::Reshape>(xshape_var_name);
+    xshape_node->set_input_tensor(*graph->GetNode(x_var_name));
     xshape_node->set_attr_shape(
         ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end()));
-    lite::npu::OpList::Global().add(xshape_node);
-    outputs_map[op_info->Output("XShape").front()] = xshape_node;
   }
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(reshape,
-                    paddle::lite::kernels::npu::bridges::ReshapeConverter);
-REGISTER_NPU_BRIDGE(reshape2,
-                    paddle::lite::kernels::npu::bridges::ReshapeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         reshape,
+                         paddle::lite::subgraph::npu::ReshapeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         reshape2,
+                         paddle::lite::subgraph::npu::ReshapeConverter);
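As a reading aid, a rough standalone model of what lite::operators::ValidateShape computes above, assuming Paddle's reshape conventions (0 copies the corresponding input dimension, a single -1 is inferred from the remaining element count). The helper name InferReshapeDims is illustrative, not the library's:

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative only: mirrors Paddle's reshape rules, not the real
// lite::operators::ValidateShape implementation.
std::vector<int64_t> InferReshapeDims(const std::vector<int>& shape,
                                      const std::vector<int64_t>& x_dims) {
  int64_t x_numel = 1;
  for (auto d : x_dims) x_numel *= d;
  std::vector<int64_t> out(shape.size());
  int64_t known = 1;
  int infer_idx = -1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      assert(infer_idx < 0);  // at most one dimension may be -1
      infer_idx = static_cast<int>(i);
    } else if (shape[i] == 0) {
      out[i] = x_dims[i];  // 0 means "keep this input dimension"
      known *= out[i];
    } else {
      out[i] = shape[i];
      known *= out[i];
    }
  }
  if (infer_idx >= 0) out[infer_idx] = x_numel / known;  // infer the -1 slot
  return out;
}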
@@ -12,28 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type ScaleConverter(const std::shared_ptr<lite::OpLite> scale_op,
-                             const node_map_type& inputs_map) {
-  auto scope = scale_op->scope();
-  auto op_info = scale_op->op_info();
+int ScaleConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  // get input, output and op attributes
+  // Get input, output and op attributes
   auto x_var_name = op_info->Input("X").front();
   auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
   auto x_dims = x->dims().Vectorize();
   CHECK_GE(x_dims.size(), 2);
+  auto out_var_name = op_info->Output("Out").front();
   std::vector<int64_t> scale_bias_shape = {x_dims[1]};
   float scale = op_info->GetAttr<float>("scale");
   float bias = op_info->GetAttr<float>("bias");
@@ -42,43 +44,31 @@ node_map_type ScaleConverter(const std::shared_ptr<lite::OpLite> scale_op,
     bias *= scale;
   }

-  // create scale node and set input node from inputs_map
-  auto scale_node = std::make_shared<ge::op::Scale>(unique_op_type);
-  CHECK(inputs_map.count(x_var_name));
-  scale_node->set_input_x(*inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(scale_node);
+  // Create scale node and set input node from inputs_map
+  auto scale_node = graph->AddNode<ge::op::Scale>(out_var_name);
+  scale_node->set_input_x(*graph->GetNode(x_var_name));

-  // add filter node(fill with scale)
-  auto filter_const_node =
-      std::make_shared<ge::op::Const>(unique_op_type + "/filter");
-  filter_const_node->set_attr_value(
-      lite::npu::CreateTensorAndFillData(scale, scale_bias_shape));
+  // Add filter node(fill with scale)
+  auto filter_const_node =
+      graph->AddNode(out_var_name + "/filter", scale, scale_bias_shape);
   scale_node->set_input_filter(*filter_const_node);
-  lite::npu::OpList::Global().add(filter_const_node);

-  // add bias node(fill with bias)
+  // Add bias node(fill with bias)
   if (fabs(bias) > 1e-6f) {
     auto bias_const_node =
-        std::make_shared<ge::op::Const>(unique_op_type + "/bias");
-    bias_const_node->set_attr_value(
-        lite::npu::CreateTensorAndFillData(bias, scale_bias_shape));
+        graph->AddNode(out_var_name + "/bias", bias, scale_bias_shape);
     scale_node->set_input_bias(*bias_const_node);
     scale_node->set_attr_has_bias_value(true);
-    lite::npu::OpList::Global().add(bias_const_node);
   }
   scale_node->set_attr_axis(1);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = scale_node;
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(scale, paddle::lite::kernels::npu::bridges::ScaleConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         scale,
+                         paddle::lite::subgraph::npu::ScaleConverter);
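The bias *= scale fold in the converter above follows from the scale op's definition; a scalar reference sketch (illustrative, not library code) of the arithmetic the NPU Scale node has to reproduce:

// Paddle's scale op:
//   bias_after_scale == true  : y = scale * x + bias
//   bias_after_scale == false : y = scale * (x + bias) = scale * x + scale * bias
// so folding bias *= scale reduces both cases to y = scale * x + bias, which
// is exactly what a Scale node with constant filter/bias computes.
float ScaleRef(float x, float scale, float bias, bool bias_after_scale) {
  if (!bias_after_scale) bias *= scale;
  return scale * x + bias;
}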
@@ -12,45 +12,39 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type ShuffleChannelConverter(
-    const std::shared_ptr<lite::OpLite> shuffle_channel_op,
-    const node_map_type& inputs_map) {
-  auto scope = shuffle_channel_op->scope();
-  auto op_info = shuffle_channel_op->op_info();
+int ShuffleChannelConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::ShuffleChannel> shuffle_channel_node =
-      std::make_shared<ge::op::ShuffleChannel>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
+  auto out_var_name = op_info->Output("Out").front();

-  shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name));
+  auto shuffle_channel_node =
+      graph->AddNode<ge::op::ShuffleChannel>(out_var_name);
+  shuffle_channel_node->set_input_x(*graph->GetNode(x_var_name));
   shuffle_channel_node->set_attr_group(op_info->GetAttr<int>("group"));
-
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(shuffle_channel_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = shuffle_channel_node;
-  return outputs_map;
+  return SUCCESS;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(
-    shuffle_channel,
-    paddle::lite::kernels::npu::bridges::ShuffleChannelConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         shuffle_channel,
+                         paddle::lite::subgraph::npu::ShuffleChannelConverter);
@@ -12,27 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
-                               const node_map_type& inputs_map) {
-  auto scope = softmax_op->scope();
-  auto op_info = softmax_op->op_info();
+int SoftmaxConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::Softmax> softmax_node =
-      std::make_shared<ge::op::Softmax>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
+  auto out_var_name = op_info->Output("Out").front();
   auto x_dims = scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims();
   auto axis = op_info->GetAttr<int>("axis");
   if (x_dims.size() > 3) {
@@ -41,23 +40,17 @@ node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
                  << " :x_w = " << x_dims[3];
   }

-  CHECK(inputs_map.count(x_var_name));
-  softmax_node->set_input_x(*inputs_map.at(x_var_name));
+  auto softmax_node = graph->AddNode<ge::op::Softmax>(out_var_name);
+  softmax_node->set_input_x(*graph->GetNode(x_var_name));
   softmax_node->set_attr_axis(axis);
-
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(softmax_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = softmax_node;
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(softmax,
-                    paddle::lite::kernels::npu::bridges::SoftmaxConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         softmax,
+                         paddle::lite::subgraph::npu::SoftmaxConverter);
@@ -12,70 +12,60 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type SplitConverter(const std::shared_ptr<lite::OpLite> split_op,
-                             const node_map_type& inputs_map) {
-  lite::Scope* scope = split_op->scope();
-  const lite::OpInfo* op_info = split_op->op_info();
+int SplitConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " << op_type << " ... ";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " << op_type << " ... ";

   auto x_var_name = op_info->Input("X").front();
+  auto out_var_names = op_info->Output("Out");
   auto axis = op_info->GetAttr<int>("axis");
   auto num = op_info->GetAttr<int>("num");
   auto sections = op_info->GetAttr<std::vector<int>>("sections");
   int64_t sections_num = static_cast<int64_t>(sections.size());

-  std::shared_ptr<ge::op::Split> output_node =
-      std::make_shared<ge::op::Split>(unique_op_type);
-  CHECK(inputs_map.count(x_var_name));
-  output_node->set_input_x(*inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  output_node->set_attr_axis(static_cast<int64_t>(axis));
+  auto split_node = graph->AddNode<ge::op::Split>(op_type + "/" + x_var_name);
+  split_node->set_input_x(*graph->GetNode(x_var_name));
+  split_node->set_attr_axis(static_cast<int64_t>(axis));
   if (num > 0) {
-    output_node->set_attr_output_num(static_cast<int64_t>(num));
+    split_node->set_attr_output_num(static_cast<int64_t>(num));
   } else {
-    output_node->set_attr_output_num(sections_num);
+    split_node->set_attr_output_num(sections_num);
     auto size_split = ge::AttrValue::LIST_INT(sections.begin(), sections.end());
-    output_node->set_attr_size_split(size_split);
+    split_node->set_attr_size_split(size_split);
   }

-  node_map_type outputs_map;
-  auto out_var_names = op_info->Output("Out");
-  output_node->create_dynamic_output_y(out_var_names.size());
-  int index = 1;
-  for (auto out_var_name : out_var_names) {
-    auto const_node = std::make_shared<ge::op::Const>(
-        unique_op_type + "/const_zero" + std::to_string(index));
-    const_node->set_attr_value(lite::npu::CreateTensorAndFillData(0));
-    lite::npu::OpList::Global().add(const_node);
-    auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add" +
-                                                  std::to_string(index));
-    add_node->set_input_x1(*output_node, "y" + std::to_string(index));
-    add_node->set_input_x2(*const_node);
-    outputs_map[out_var_name] = add_node;
-    lite::npu::OpList::Global().add(add_node);
-    index++;
+  split_node->create_dynamic_output_y(out_var_names.size());
+  int idx = 1;
+  for (auto& out_var_name : out_var_names) {
+    auto zero_const_node =
+        graph->AddNode(out_var_name + "/zero" + std::to_string(idx), 0);
+    auto add_node = graph->AddNode<ge::op::Add>(out_var_name);
+    add_node->set_input_x1(*split_node, "y" + std::to_string(idx));
+    add_node->set_input_x2(*zero_const_node);
+    idx++;
   }
-
-  lite::npu::OpList::Global().add(output_node);
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(split, paddle::lite::kernels::npu::bridges::SplitConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         split,
+                         paddle::lite::subgraph::npu::SplitConverter);
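Two details above deserve a note: HiAI Split exposes its outputs as dynamic ports y1..yn of a single node, so each port is routed through an x + 0 Add node purely to give every Paddle output variable a graph node under its own name; and the num/sections attributes encode the per-output sizes along axis. A small illustrative helper (ours, not library code) for the latter:

#include <vector>

// Illustrative: the size of each split output along `axis`.
// num > 0 means `num` equal parts (axis_dim must be divisible by num);
// otherwise `sections` lists explicit per-output sizes.
std::vector<int> SplitSectionSizes(int axis_dim, int num,
                                   const std::vector<int>& sections) {
  if (num > 0) {
    return std::vector<int>(num, axis_dim / num);
  }
  return sections;
}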
@@ -12,43 +12,33 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type SqrtConverter(const std::shared_ptr<lite::OpLite> sqrt_op,
-                            const node_map_type& inputs_map) {
-  auto scope = sqrt_op->scope();
-  auto op_info = sqrt_op->op_info();
+int SqrtConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::Sqrt> sqrt_node =
-      std::make_shared<ge::op::Sqrt>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
+  auto out_var_name = op_info->Output("Out").front();

-  CHECK(inputs_map.count(x_var_name));
-  sqrt_node->set_input_x(*inputs_map.at(x_var_name));
-
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(sqrt_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = sqrt_node;
-  return outputs_map;
+  auto sqrt_node = graph->AddNode<ge::op::Sqrt>(out_var_name);
+  sqrt_node->set_input_x(*graph->GetNode(x_var_name));
+  return SUCCESS;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(sqrt, paddle::lite::kernels::npu::bridges::SqrtConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU, sqrt, paddle::lite::subgraph::npu::SqrtConverter);
@@ -12,44 +12,35 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type SquareConverter(const std::shared_ptr<lite::OpLite> square_op,
-                              const node_map_type& inputs_map) {
-  auto scope = square_op->scope();
-  auto op_info = square_op->op_info();
+int SquareConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::Square> square_node =
-      std::make_shared<ge::op::Square>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
+  auto out_var_name = op_info->Output("Out").front();

-  CHECK(inputs_map.count(x_var_name));
-  square_node->set_input_x(*inputs_map.at(x_var_name));
-
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(square_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = square_node;
-  return outputs_map;
+  auto square_node = graph->AddNode<ge::op::Square>(out_var_name);
+  square_node->set_input_x(*graph->GetNode(x_var_name));
+  return SUCCESS;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(square,
-                    paddle::lite::kernels::npu::bridges::SquareConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         square,
+                         paddle::lite::subgraph::npu::SquareConverter);
@@ -12,64 +12,45 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type TransposeConverter(
-    const std::shared_ptr<lite::OpLite> transpose_op,
-    const node_map_type& inputs_map) {
-  auto scope = transpose_op->scope();
-  auto op_info = transpose_op->op_info();
+int TransposeConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";

-  std::shared_ptr<ge::op::Permute> transpose_node =
-      std::make_shared<ge::op::Permute>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
-
-  // paddlelite doesn't have this input
-  // w must be set, but it does nothing
-  auto w_var_name = unique_op_type + "/w";
-  auto* w = scope->Var(w_var_name)->GetMutable<Tensor>();
-  w->Resize({1});
-  auto* w_data = w->mutable_data<float>();
-  for (int i = 0; i < w->numel(); i++) {
-    w_data[i] = 1.f;
-  }
-  auto npu_w = std::make_shared<ge::op::Const>(w_var_name);
-  npu_w->set_attr_value(lite::npu::CvtTensor(w));
-  lite::npu::OpList::Global().add(npu_w);
-
+  auto out_var_name = op_info->Output("Out").front();
   auto axis = op_info->GetAttr<std::vector<int>>("axis");
-  auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end());
-
-  CHECK(inputs_map.count(x_var_name));
-  transpose_node->set_input_x(*inputs_map.at(x_var_name));
-  transpose_node->set_input_w(*npu_w);
-  transpose_node->set_attr_order(npu_axis);
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(transpose_node);
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = transpose_node;
-  return outputs_map;
+
+  // The Permute op requires an input "w" that Paddle Lite doesn't have; feed
+  // it a dummy one-element constant, as the old converter did.
+  auto transpose_node = graph->AddNode<ge::op::Permute>(out_var_name);
+  transpose_node->set_input_x(*graph->GetNode(x_var_name));
+  auto w_const_node = graph->AddNode(out_var_name + "/w", 1.0f);
+  transpose_node->set_input_w(*w_const_node);
+  transpose_node->set_attr_order(
+      ge::AttrValue::LIST_INT(axis.begin(), axis.end()));
+  return SUCCESS;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(transpose,
-                    paddle::lite::kernels::npu::bridges::TransposeConverter);
-REGISTER_NPU_BRIDGE(transpose2,
-                    paddle::lite::kernels::npu::bridges::TransposeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         transpose,
+                         paddle::lite::subgraph::npu::TransposeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         transpose2,
+                         paddle::lite::subgraph::npu::TransposeConverter);
@@ -12,53 +12,45 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/graph.h"
 #include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"

 namespace paddle {
 namespace lite {
-namespace kernels {
+namespace subgraph {
 namespace npu {
-namespace bridges {

-node_map_type UnsqueezeConverter(
-    const std::shared_ptr<lite::OpLite> unsqueeze_op,
-    const node_map_type& inputs_map) {
-  auto scope = unsqueeze_op->scope();
-  auto op_info = unsqueeze_op->op_info();
+int UnsqueezeConverter(void* ctx, OpLite* op) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
   auto op_type = op_info->Type();
-  auto unique_op_type = lite::npu::UniqueName(op_type);
-  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " << op_type << "... ";

-  std::shared_ptr<ge::op::Reshape> unsqueeze_node =
-      std::make_shared<ge::op::Reshape>(unique_op_type);
   auto x_var_name = op_info->Input("X").front();
-  CHECK(inputs_map.count(x_var_name));
-  unsqueeze_node->set_input_tensor(*inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-  lite::npu::OpList::Global().add(unsqueeze_node);
-
-  CHECK(op_info->HasAttr("axes"))
-      << "[NPU] unsqueeze not support axes from tensor now";
   auto out_var_name = op_info->Output("Out").front();
   auto out_shape = scope->FindTensor(out_var_name)->dims().Vectorize();
+  CHECK(op_info->HasAttr("axes"))
+      << "[NPU] unsqueeze not support axes from tensor now";
+
+  auto unsqueeze_node = graph->AddNode<ge::op::Reshape>(out_var_name);
+  unsqueeze_node->set_input_tensor(*graph->GetNode(x_var_name));
   unsqueeze_node->set_attr_shape(
       ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
-
-  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = unsqueeze_node;
-  return outputs_map;
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

-}  // namespace bridges
 }  // namespace npu
-}  // namespace kernels
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle

-REGISTER_NPU_BRIDGE(unsqueeze,
-                    paddle::lite::kernels::npu::bridges::UnsqueezeConverter);
-REGISTER_NPU_BRIDGE(unsqueeze2,
-                    paddle::lite::kernels::npu::bridges::UnsqueezeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         unsqueeze,
+                         paddle::lite::subgraph::npu::UnsqueezeConverter);
+REGISTER_SUBGRAPH_BRIDGE(NPU,
+                         unsqueeze2,
+                         paddle::lite::subgraph::npu::UnsqueezeConverter);
@@ -12,59 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/npu/builder.h"
-#include <mutex>  // NOLINT
+#include "lite/kernels/npu/bridges/utility.h"
 #include <utility>
-#include "lite/backends/npu/runtime.h"

 namespace paddle {
 namespace lite {
+namespace subgraph {
 namespace npu {

-// Build HIAI IR graph to om model, and store om model data into lite tensor
-bool BuildModel(std::vector<ge::Operator>& inputs,   // NOLINT
-                std::vector<ge::Operator>& outputs,  // NOLINT
-                lite::Tensor* model_data) {
-  LOG(INFO) << "[NPU] Build model.";
-  CHECK_GT(inputs.size(), 0);
-  CHECK_GT(outputs.size(), 0);
-  CHECK_NE(model_data, 0);
-  // build IR graph to om model
-  ge::Graph ir_graph("graph");
-  ir_graph.SetInputs(inputs).SetOutputs(outputs);
-  ge::Model om_model("model", "model");
-  om_model.SetGraph(ir_graph);
-  domi::HiaiIrBuild ir_build;
-  domi::ModelBufferData om_model_buf;
-  if (!ir_build.CreateModelBuff(om_model, om_model_buf)) {
-    LOG(WARNING) << "[NPU] CreateModelBuff failed!";
-    return false;
-  }
-  if (!ir_build.BuildIRModel(om_model, om_model_buf)) {
-    LOG(WARNING) << "[NPU] BuildIRModel failed!";
-    return false;
-  }
-  // store om model into tensor
-  model_data->Resize({om_model_buf.length});
-  memcpy(model_data->mutable_data<int8_t>(),
-         om_model_buf.data,
-         om_model_buf.length);
-  ir_build.ReleaseModelBuff(om_model_buf);
-  return true;
-}
-
-std::string UniqueName(const std::string& prefix) {
-  static std::mutex counter_mtx;
-  static std::unordered_map<std::string, int> counter_map;
-  std::unique_lock<std::mutex> counter_lck(counter_mtx);
-  int counter = 1;
-  auto it = counter_map.find(prefix);
-  if (it == counter_map.end()) {
-    counter_map[prefix] = counter;
-  } else {
-    counter = ++(it->second);
-  }
-  return prefix + "_" + std::to_string(counter);
-}
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname) {
+  auto iarg_names = op_info->input_argnames();
+  if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
+      iarg_names.end()) {
+    auto inputs = op_info->Input(argname);
+    if (inputs.empty()) {
+      return false;
+    }
+    auto var_name = inputs.front();
+    auto var = scope->FindVar(var_name);
+    return var != nullptr;
+  } else {
+    return false;
+  }
+}

 ge::DataType CvtPrecisionType(PrecisionType itype) {
@@ -102,25 +73,25 @@ ge::Format CvtDataLayoutType(DataLayoutType itype) {
   return otype;
 }

-ge::TensorPtr CvtTensor(lite::Tensor* in_tensor,
+ge::TensorPtr CvtTensor(const Tensor& in_tensor,
                         std::vector<int64_t> out_shape,
                         PrecisionType in_ptype,
                         DataLayoutType in_ltype) {
-  uint8_t* in_data = nullptr;
-  auto in_size = in_tensor->dims().production();
-  auto in_shape = in_tensor->dims().Vectorize();
+  const uint8_t* in_data = nullptr;
+  auto in_size = in_tensor.dims().production();
+  auto in_shape = in_tensor.dims().Vectorize();
   if (out_shape.empty()) {
     out_shape = in_shape;
   }
   int in_bytes;
   if (in_ptype == PRECISION(kFloat)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<float>());
+    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<float>());
     in_bytes = in_size * sizeof(float);
   } else if (in_ptype == PRECISION(kInt32)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int32_t>());
+    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<int32_t>());
    in_bytes = in_size * sizeof(int32_t);
   } else if (in_ptype == PRECISION(kInt8)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int8_t>());
+    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<int8_t>());
     in_bytes = in_size * sizeof(int8_t);
   } else {
     LOG(FATAL) << "[NPU] Unknown precision type " << PrecisionToStr(in_ptype);
@@ -169,24 +140,7 @@ int CvtActMode(std::string act_type) {
   return act_mode;
 }

-bool HasInputArg(const OpInfo* op_info,
-                 const Scope* scope,
-                 const std::string& argname) {
-  auto iarg_names = op_info->input_argnames();
-  if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
-      iarg_names.end()) {
-    auto inputs = op_info->Input(argname);
-    if (inputs.empty()) {
-      return false;
-    }
-    auto var_name = inputs.front();
-    auto var = scope->FindVar(var_name);
-    return var != nullptr;
-  } else {
-    return false;
-  }
-}
-
 }  // namespace npu
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle
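A usage sketch for the relocated HasInputArg, mirroring the pattern in the reshape converter above (the helper name GetTargetShape is illustrative, not part of the library):

// Sketch: prefer the optional "Shape" input when it is both declared and
// bound to a variable in the scope; otherwise fall back to the "shape" attr.
std::vector<int> GetTargetShape(const OpInfo* op_info, Scope* scope) {
  if (HasInputArg(op_info, scope, "Shape")) {
    auto* t = scope->FindVar(op_info->Input("Shape").front())
                  ->GetMutable<Tensor>();
    auto* data = t->mutable_data<int>();
    return std::vector<int>(data, data + t->dims().production());
  }
  return op_info->GetAttr<std::vector<int>>("shape");
}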
@@ -14,6 +14,7 @@
 #pragma once

+#include <functional>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -24,12 +25,10 @@
 #include "ai_ddk_lib/include/graph/op/all_ops.h"
 #include "ai_ddk_lib/include/graph/operator.h"
 #include "ai_ddk_lib/include/graph/operator_reg.h"
-#include "ai_ddk_lib/include/hiai_ir_build.h"
 #include "lite/core/op_lite.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/core/tensor.h"
+#include "lite/utils/macros.h"

-// Extended Ops of HIAI DDK
+// Extended ops based on HIAI DDK
 namespace ge {
 /**
  * Pads a tensor.
@@ -59,39 +58,25 @@ REG_OP(Pad)

 namespace paddle {
 namespace lite {
+namespace subgraph {
 namespace npu {

-class OpList {
- public:
-  static OpList& Global() {
-    static thread_local OpList x;
-    return x;
-  }
-  void clear() { lists_.clear(); }
-  void add(std::shared_ptr<ge::Operator> p) { lists_.push_back(p); }
-
- private:
-  std::vector<std::shared_ptr<ge::Operator>> lists_;
-};
-
-// Build HIAI IR graph to om model, and store om model data into lite tensor
-bool BuildModel(std::vector<ge::Operator>& inputs,   // NOLINT
-                std::vector<ge::Operator>& outputs,  // NOLINT
-                lite::Tensor* model_data);
-
-std::string UniqueName(const std::string& prefix);
+// Type/tensor converters for converting Paddle type/tensor to HiAI type/tensor
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname);

 ge::DataType CvtPrecisionType(PrecisionType itype);

 ge::Format CvtDataLayoutType(DataLayoutType itype);

-ge::TensorPtr CvtTensor(Tensor* in_tensor,
+ge::TensorPtr CvtTensor(const Tensor& in_tensor,
                         std::vector<int64_t> out_shape = {},
                         PrecisionType in_ptype = PRECISION(kFloat),
                         DataLayoutType in_ltype = DATALAYOUT(kNCHW));

 template <typename T>
-ge::TensorPtr CreateTensorAndFillData(std::vector<T> data,
+ge::TensorPtr CreateTensorAndFillData(const std::vector<T>& data,
                                       std::vector<int64_t> shape = {},
                                       ge::Format format = ge::FORMAT_NCHW) {
   const std::type_info& info = typeid(T);
@@ -136,10 +121,7 @@ ge::TensorPtr CreateTensorAndFillData(T value,

 int CvtActMode(std::string act_type);

-bool HasInputArg(const OpInfo* op_info,
-                 const Scope* scope,
-                 const std::string& argname);
-
 }  // namespace npu
+}  // namespace subgraph
 }  // namespace lite
 }  // namespace paddle
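A usage sketch for the const-tensor helpers declared above (names and values illustrative; the exact overload resolution is an assumption based on the signatures shown):

void ConstTensorSketch() {
  // Scalar fill: a 16-element float tensor holding 1.0f (assumed shape {16}).
  auto ones = CreateTensorAndFillData(1.0f, {16});
  // Vector fill: a 4-element int tensor holding {1, 3, 224, 224}.
  auto dims = CreateTensorAndFillData(std::vector<int>({1, 3, 224, 224}));
  // Back a const node with the tensor, as the NPU converters do.
  ge::op::Const w("w_const");
  w.set_attr_value(ones);
  (void)dims;
}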
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/graph_compute.h"
#include <sys/time.h>
#include <time.h>
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
void GraphCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<NPUContext>();
auto& param = this->Param<param_t>();
// Load HiAI model from the weight tensor and release its buffer
// to save memory
CHECK(param.weight);
CHECK(lite::npu::LoadModel(*param.weight, &model_client_, &model_name_));
// TODO(hong19860320): find a good way to free the model data.
// No interface exists to free a tensor's data, so resize the dim to 1 and
// change the target to force reallocation of a small buffer.
param.weight->Resize({1});
param.weight->mutable_data<int8_t>(TargetType::kARM);
CHECK(model_client_);
// Query the dimensions of NPU input and output tensors from HiAI model
std::vector<hiai::TensorDimension> npu_idims;
std::vector<hiai::TensorDimension> npu_odims;
int ret =
model_client_->GetModelIOTensorDim(model_name_, npu_idims, npu_odims);
CHECK_EQ(ret, hiai::AI_SUCCESS)
<< "[NPU] Get the dimensions of input and output tensors failed.";
// Check whether the data sizes of NPU input and output tensors are the
// same as CPU's, then create and initialize NPU input and output tensors.
npu_itensors_.resize(npu_idims.size());
npu_otensors_.resize(npu_odims.size());
npu_idatasizes_.resize(npu_idims.size());
npu_odatasizes_.resize(npu_odims.size());
for (size_t i = 0; i < npu_idims.size(); ++i) {
auto cpu_itensor = param.inputs[i].second;
CHECK(cpu_itensor);
VLOG(3) << "[NPU] CPU input dims[" << i << "]: " << cpu_itensor->dims();
VLOG(3) << "[NPU] NPU input dims[" << i << "]: {"
<< npu_idims[i].GetNumber() << "," << npu_idims[i].GetChannel()
<< "," << npu_idims[i].GetHeight() << "," << npu_idims[i].GetWidth()
<< "}";
npu_idatasizes_[i] = npu_idims[i].GetNumber() * npu_idims[i].GetChannel() *
npu_idims[i].GetHeight() * npu_idims[i].GetWidth();
CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]);
npu_itensors_[i].reset(new hiai::AiTensor);
npu_itensors_[i]->Init(&(npu_idims[i]));
}
for (size_t i = 0; i < npu_odims.size(); ++i) {
auto cpu_otensor = param.outputs[i].second;
CHECK(cpu_otensor);
VLOG(3) << "[NPU] CPU output dims[" << i << "]: " << cpu_otensor->dims();
VLOG(3) << "[NPU] NPU output dims[" << i << "]: {"
<< npu_odims[i].GetNumber() << "," << npu_odims[i].GetChannel()
<< "," << npu_odims[i].GetHeight() << "," << npu_odims[i].GetWidth()
<< "}";
npu_odatasizes_[i] = npu_odims[i].GetNumber() * npu_odims[i].GetChannel() *
npu_odims[i].GetHeight() * npu_odims[i].GetWidth();
if (cpu_otensor->dims().production() != npu_odatasizes_[i]) {
cpu_otensor->Resize({npu_odims[i].GetNumber(),
npu_odims[i].GetChannel(),
npu_odims[i].GetHeight(),
npu_odims[i].GetWidth()});
}
npu_otensors_[i].reset(new hiai::AiTensor);
npu_otensors_[i]->Init(&(npu_odims[i]));
}
}
void GraphCompute::Run() {
auto& param = this->Param<param_t>();
// Check whether the data sizes of NPU input tensors are the same as
// CPU's, and copy the data of CPU input tensors to NPU's.
CHECK_EQ(param.inputs.size(), npu_itensors_.size());
CHECK_EQ(param.outputs.size(), npu_otensors_.size());
for (size_t i = 0; i < param.inputs.size(); ++i) {
auto cpu_itensor = param.inputs[i].second;
CHECK(cpu_itensor);
CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]);
std::memcpy(static_cast<float*>(npu_itensors_[i]->GetBuffer()),
cpu_itensor->data<float>(),
sizeof(float) * static_cast<size_t>(npu_idatasizes_[i]));
}
// Run HiAI model with model name
std::string key = "model_name";  // Note: the key must be exactly "model_name"
model_context_.AddPara(key, model_name_);
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
int istamp;
auto start_time = GetCurrentUS();
CHECK_EQ(hiai::AI_SUCCESS,
model_client_->Process(
model_context_, npu_itensors_, npu_otensors_, 1000, istamp));
VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
// Check whether the data sizes of NPU output tensors are the same as
// CPU's, and copy the data of NPU output tensors to CPU's.
for (size_t i = 0; i < param.outputs.size(); ++i) {
auto cpu_otensor = param.outputs[i].second;
CHECK(cpu_otensor);
CHECK_EQ(cpu_otensor->dims().production(), npu_odatasizes_[i]);
std::memcpy(cpu_otensor->mutable_data<float>(),
static_cast<float*>(npu_otensors_[i]->GetBuffer()),
sizeof(float) * static_cast<size_t>(npu_odatasizes_[i]));
}
}
} // namespace npu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(graph_op,
kNPU,
kFloat,
kNCHW,
paddle::lite::kernels::npu::GraphCompute,
def)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "ai_ddk_lib/include/hiai_ir_build.h"
#include "lite/backends/npu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
int SubgraphEngine::BuildDeviceProgram() {
int status = 0;
// Convert all input data vars and add them into the HiAI IR graph
subgraph::npu::Graph graph;
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
CHECK(input_tensor);
auto input_node =
graph.AddNode(input_name, input_tensor->dims().Vectorize());
CHECK(input_node);
// HiAI DDK doesn't support dynamic dimensions/shapes, so the program needs
// to be rebuilt when the shape of any input tensor changes.
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
}
// Convert all ops and their weights and add them into the HiAI IR graph
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = inst.op();
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists("NPU", op_type)) {
return subgraph::FAILED;
}
status |= bridges.Select("NPU", op_type)(reinterpret_cast<void*>(&graph),
const_cast<OpLite*>(op));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Set the input and output nodes of the HiAI IR graph
std::vector<ge::Operator> input_nodes, output_nodes;
for (auto& input_name : input_names_) {
input_nodes.push_back(*graph.GetNode(input_name));
}
for (auto& output_name : output_names_) {
output_nodes.push_back(*graph.GetNode(output_name));
}
// Build the HiAI IR graph to HiAI om model
device_program_ =
lite::npu::Device::Global().Build(model_name_, input_nodes, output_nodes);
if (device_program_ == nullptr) {
LOG(WARNING) << "[NPU] Build model failed!";
return subgraph::FAILED;
}
// Query and check the dimensions of input and output tensors
std::vector<hiai::TensorDimension> device_idims, device_odims;
if (device_program_->GetModelIOTensorDim(
model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
LOG(WARNING)
<< "[NPU] Get the dimensions of input and output tensors failed!";
return subgraph::FAILED;
}
CHECK_EQ(device_idims.size(), input_names_.size());
CHECK_EQ(device_odims.size(), output_names_.size());
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
device_idatasizes_.resize(input_names_.size());
device_itensors_.resize(input_names_.size());
origin_odims_.resize(output_names_.size());
origin_otensors_.resize(output_names_.size());
device_odatasizes_.resize(output_names_.size());
device_otensors_.resize(output_names_.size());
for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[NPU] Input dims[" << i << "]: {" << device_idims[i].GetNumber()
<< "," << device_idims[i].GetChannel() << ","
<< device_idims[i].GetHeight() << "," << device_idims[i].GetWidth()
<< "}";
device_idatasizes_[i] =
device_idims[i].GetNumber() * device_idims[i].GetChannel() *
device_idims[i].GetHeight() * device_idims[i].GetWidth();
CHECK_EQ(device_idatasizes_[i], origin_idims_[i].production());
device_itensors_[i].reset(new hiai::AiTensor);
device_itensors_[i]->Init(&(device_idims[i]));
}
for (int i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[NPU] Output dims[" << i << "]: {"
<< device_odims[i].GetNumber() << ","
<< device_odims[i].GetChannel() << ","
<< device_odims[i].GetHeight() << "," << device_odims[i].GetWidth()
<< "}";
device_odatasizes_[i] =
device_odims[i].GetNumber() * device_odims[i].GetChannel() *
device_odims[i].GetHeight() * device_odims[i].GetWidth();
CHECK_EQ(device_odatasizes_[i], origin_odims_[i].production());
device_otensors_[i].reset(new hiai::AiTensor);
device_otensors_[i]->Init(&(device_odims[i]));
}
return status;
}
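The status |= ... / subgraph::CHECK_FAILED(status) pattern above treats converter return values as combinable bit flags. A sketch of the convention as inferred from this file (the real constants live in lite/kernels/npu/bridges/registry.h and their exact values may differ):

// Assumed encoding, inferred from usage in BuildDeviceProgram():
enum SubgraphStatusSketch {
  SUCCESS = 0,                     // converted, nothing special to report
  FAILED = 1,                      // unsupported op or conversion error
  REBUILD_WHEN_SHAPE_CHANGED = 2,  // device program depends on input shapes
};
inline bool CheckFailedSketch(int status) { return status & FAILED; }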
int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of origin input tensors to the buffer of input HiAI tensors
for (size_t i = 0; i < input_names_.size(); i++) {
std::memcpy(static_cast<float*>(device_itensors_[i]->GetBuffer()),
origin_itensors_[i]->mutable_data<float>(),
sizeof(float) * static_cast<size_t>(device_idatasizes_[i]));
}
// Run the HiAI model by name
std::string key = "model_name";  // Note: the key must be exactly "model_name"
model_context_.AddPara(key, model_name_);
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
int istamp;
auto start_time = GetCurrentUS();
CHECK_EQ(
device_program_->Process(
model_context_, device_itensors_, device_otensors_, 1000, istamp),
hiai::AI_SUCCESS);
VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of output HiAI tensor to the buffer of origin output tensors
for (size_t i = 0; i < output_names_.size(); i++) {
std::memcpy(origin_otensors_[i]->mutable_data<float>(),
static_cast<float*>(device_otensors_[i]->GetBuffer()),
sizeof(float) * static_cast<size_t>(device_odatasizes_[i]));
}
return 0;
}
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
}
} // namespace npu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kNPU,
kFloat,
kNCHW,
paddle::lite::kernels::npu::SubgraphCompute,
def)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
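To summarize the control flow the registration above wires up, a sketch of the observable call sequence (not the real subgraph::Engine code): PrepareForRun() constructs the engine and calls Build(), which runs BuildDeviceProgram() to generate the om model at execution time; each Run() calls Launch(), which is expected to rebuild first if REBUILD_WHEN_SHAPE_CHANGED was reported and an input shape changed, then runs LaunchDeviceProgram().

// Pseudo-lifecycle of subgraph::Engine as used by SubgraphCompute (sketch):
class EngineLifecycleSketch {
 public:
  void Build() { BuildDeviceProgram(); }  // once, from PrepareForRun()
  void Launch() {
    // Assumed: if the build reported REBUILD_WHEN_SHAPE_CHANGED and any
    // input dims differ from the cached ones, BuildDeviceProgram() reruns.
    LaunchDeviceProgram();  // copy inputs in, Process(), copy outputs out
  }

 protected:
  virtual int BuildDeviceProgram() = 0;   // HiAI IR graph -> om model
  virtual int LaunchDeviceProgram() = 0;  // run om model via the HiAI client
};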
@@ -14,48 +14,57 @@
 #pragma once

-#include <map>
 #include <memory>
 #include <string>
-#include <unordered_map>
-#include <unordered_set>
 #include <vector>
-#include "lite/backends/npu/builder.h"
-#include "lite/core/mir/pass.h"
-#include "lite/core/mir/subgraph/subgraph_program_pass.h"
+#include "ai_ddk_lib/include/HiAiModelManagerService.h"
+#include "lite/core/kernel.h"
+#include "lite/kernels/npu/bridges/engine.h"
 #include "lite/kernels/npu/bridges/registry.h"

 namespace paddle {
 namespace lite {
-namespace mir {
-namespace subgraph {
+namespace kernels {
+namespace npu {

-class GenerateNPUProgramPass : public SubgraphProgramPass {
+class SubgraphEngine : public subgraph::Engine {
  public:
-  using key2nodes_t = std::map<std::string, Node*>;
-
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+  SubgraphEngine(int block_idx,
+                 cpp::BlockDesc *block_desc,
+                 const std::vector<std::string> &input_names,
+                 const std::vector<std::string> &output_names,
+                 Scope *scope)
+      : subgraph::Engine(
+            block_idx, block_desc, input_names, output_names, scope) {}

  protected:
-  // nodes2cvt: op nodes to convert
-  // return cvted_vars: converted var nodes
-  void CvtAllOpNodes(const std::vector<Node*>& nodes2cvt,
-                     lite::kernels::npu::bridges::node_map_type* cvted_vars);
-
-  std::shared_ptr<ge::Operator> CvtVarNode(lite::mir::Node* var_node,
-                                           const Scope* scope);
-
-  std::string BuildNPUGraph(const std::unordered_set<Node*>& op_nodes,
-                            const std::unordered_set<Node*>& in_data_vars,
-                            const std::unordered_set<Node*>& out_data_vars,
-                            int sub_id);
-
-  void GenNPUSubgraph(const std::unique_ptr<SSAGraph>& graph,
-                      const std::unordered_set<Node*>& op_nodes,
-                      int sub_id);
+  int BuildDeviceProgram() override;
+  int LaunchDeviceProgram() override;
+
+  std::string model_name_;
+  hiai::AiContext model_context_;
+  std::vector<int64_t> device_idatasizes_;
+  std::vector<int64_t> device_odatasizes_;
+  std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_;
+  std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_;
+  std::unique_ptr<hiai::AiModelMngerClient> device_program_{nullptr};
+};
+
+class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SubgraphParam;
+
+  void PrepareForRun() override;
+  void Run() override;
+  virtual ~SubgraphCompute() = default;
+
+ private:
+  std::unique_ptr<SubgraphEngine> engine_;
 };

-}  // namespace subgraph
-}  // namespace mir
+}  // namespace npu
+}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
-if(NOT LITE_WITH_XPU)
-  return ()
-endif()
-
-add_kernel(graph_compute_xpu XPU basic SRCS graph_compute.cc DEPS ${lite_kernel_deps} xpu_runtime)
-# lite_cc_test(test_graph_compute_xpu SRCS graph_compute_test.cc DEPS graph_compute_xpu)
-
 add_subdirectory(bridges)
+
+add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges})

-lite_cc_library(xpu_bridge_registry SRCS registry.cc)
+if(NOT LITE_WITH_XPU)
+  return()
+endif()

-set(xpu_bridge_deps xpu_bridge_registry xpu_builder op)
+lite_cc_library(subgraph_bridge_utility_xpu SRCS utility.cc DEPS ${xpu_builder_libs} tensor)
+lite_cc_library(subgraph_bridge_graph_xpu SRCS graph.cc DEPS subgraph_bridge_utility_xpu)

-lite_cc_library(xpu_bridge_act_op SRCS act_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_conv_op SRCS conv_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_elementwise_ops SRCS elementwise_ops.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_pool_op SRCS pool_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_softmax_op SRCS softmax_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_mul_op SRCS mul_op.cc DEPS ${xpu_bridge_deps})
-lite_cc_library(xpu_bridge_batch_norm_op SRCS batch_norm_op.cc DEPS ${xpu_bridge_deps})
+set(xpu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_xpu subgraph_bridge_graph_xpu)

-set(xpu_bridges
-    xpu_bridge_registry
-    xpu_bridge_act_op
-    xpu_bridge_conv_op
-    xpu_bridge_elementwise_ops
-    xpu_bridge_pool_op
-    xpu_bridge_softmax_op
-    xpu_bridge_mul_op
-    xpu_bridge_batch_norm_op
-    CACHE INTERNAL "xpu_bridges")
+lite_cc_library(subgraph_bridge_act_op_xpu SRCS act_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_conv_op_xpu SRCS conv_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_elementwise_ops_xpu SRCS elementwise_ops.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_pool_op_xpu SRCS pool_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_softmax_op_xpu SRCS softmax_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_mul_op_xpu SRCS mul_op.cc DEPS ${xpu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_batch_norm_op_xpu SRCS batch_norm_op.cc DEPS ${xpu_subgraph_bridge_deps})

-set(xpu_bridge_test_deps ${xpu_bridges} ${xpu_kernels} ${ops})
-
-lite_cc_test(test_xpu_bridge_act_op SRCS act_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_conv_op SRCS conv_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_elementwise_ops SRCS elementwise_ops_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_pool_op SRCS pool_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_softmax_op SRCS softmax_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_mul_op SRCS mul_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
-lite_cc_test(test_xpu_bridge_batch_norm_op SRCS batch_norm_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
+set(xpu_subgraph_bridges
+    subgraph_bridge_registry
+    subgraph_bridge_utility_xpu
+    subgraph_bridge_graph_xpu
+    subgraph_bridge_act_op_xpu
+    subgraph_bridge_conv_op_xpu
+    subgraph_bridge_elementwise_ops_xpu
+    subgraph_bridge_pool_op_xpu
+    subgraph_bridge_softmax_op_xpu
+    subgraph_bridge_mul_op_xpu
+    subgraph_bridge_batch_norm_op_xpu
+    CACHE INTERNAL "xpu_subgraph_bridges")
+
+message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}")
@@ -12,51 +12,41 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int ActConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Create act node and set params from op
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  CHECK(graph->HasNode(x_var_name));
  if (op_type == "relu") {
    graph->AddNode(out_var_name,
                   graph->builder_.CreateRelu(*graph->GetNode(x_var_name)));
  } else {
    // TODO(hong19860320) support more activation ops
    LOG(WARNING) << "[XPU] Unsupported activation type " << op_type;
    return FAILED;
  }
  return SUCCESS;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU, relu, paddle::lite::subgraph::xpu::ActConverter);
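For orientation, below is a minimal sketch (not part of this patch) of how a converter registered this way is looked up and invoked; it mirrors the dispatch loop in subgraph_compute.cc later in this diff. ConvertOp is a hypothetical wrapper name.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"

int ConvertOp(paddle::lite::subgraph::xpu::Graph* graph,
              paddle::lite::OpLite* op) {
  const auto& bridges = paddle::lite::subgraph::Registry::Instance();
  std::string op_type = op->op_info()->Type();
  if (!bridges.Exists("XPU", op_type)) {
    return paddle::lite::subgraph::FAILED;  // no converter registered
  }
  // Select() returns the converter function registered by
  // REGISTER_SUBGRAPH_BRIDGE(XPU, <op_type>, <converter>); the Graph is
  // passed through the opaque void* context.
  return bridges.Select("XPU", op_type)(reinterpret_cast<void*>(graph), op);
}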
@@ -12,30 +12,25 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int BatchNormConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get input vars and op attributes
  auto x_var_name = op_info->Input("X").front();
  auto scale_var_name = op_info->Input("Scale").front();
  auto* scale = scope->FindMutableTensor(scale_var_name);
@@ -45,69 +40,33 @@
  auto* mean = scope->FindMutableTensor(mean_var_name);
  auto variance_var_name = op_info->Input("Variance").front();
  auto* variance = scope->FindMutableTensor(variance_var_name);
  auto y_var_name = op_info->Output("Y").front();
  auto epsilon = op_info->GetAttr<float>("epsilon");

  // Create scale, bias, mean, variance nodes
  auto scale_const_node = graph->AddNode(scale_var_name, *scale);
  auto bias_const_node = graph->AddNode(bias_var_name, *bias);
  auto mean_const_node = graph->AddNode(mean_var_name, *mean);
  auto variance_const_node = graph->AddNode(variance_var_name, *variance);

  // Create batch_norm node and set params from op
  auto batch_norm_node =
      graph->builder_.CreateBatchNorm(*graph->GetNode(x_var_name),
                                      *scale_const_node,
                                      *bias_const_node,
                                      *mean_const_node,
                                      *variance_const_node,
                                      1,
                                      epsilon);
  graph->AddNode(y_var_name, graph->builder_.GetField(batch_norm_node, 0));
  return SUCCESS;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         batch_norm,
                         paddle::lite::subgraph::xpu::BatchNormConverter);
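For reference, here is a small standalone sketch of the inference-time semantics the batch_norm node above encodes over NCHW data (axis 1 is the channel dimension): y = scale * (x - mean) / sqrt(variance + epsilon) + bias. BatchNormNCHW is a hypothetical helper, independent of XTCL.

#include <cmath>
#include <cstddef>
#include <vector>

void BatchNormNCHW(const std::vector<float>& x, std::vector<float>* y,
                   const std::vector<float>& scale,
                   const std::vector<float>& bias,
                   const std::vector<float>& mean,
                   const std::vector<float>& variance, float epsilon,
                   size_t n, size_t c, size_t hw) {
  y->resize(x.size());
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < c; j++) {
      // Fold the four per-channel params into a single affine transform.
      float alpha = scale[j] / std::sqrt(variance[j] + epsilon);
      float beta = bias[j] - alpha * mean[j];
      for (size_t k = 0; k < hw; k++) {
        size_t idx = (i * c + j) * hw + k;
        (*y)[idx] = alpha * x[idx] + beta;
      }
    }
  }
}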
@@ -13,31 +13,32 @@
// limitations under the License.

#include "lite/operators/conv_op.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int ConvConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[XPU] Converting " << op_type << "... ";

  // Get input, filter and op attributes
  auto input_var_name = op_info->Input("Input").front();
  auto input = scope->FindVar(input_var_name)->GetMutable<Tensor>();
  auto input_dims = input->dims();
  auto filter_var_name = op_info->Input("Filter").front();
  auto filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
  auto filter_dims = filter->dims();
  auto output_var_name = op_info->Output("Output").front();
  auto bs = input_dims[0];
  auto oc = filter_dims[0];
  CHECK_EQ(input_dims.size(), 4);
@@ -80,26 +81,14 @@
  }
  DDim output_dims(output_shape);

  // Create filter node
  auto filter_const_node = graph->AddNode(filter_var_name, *filter);

  // Create conv node and set input, filter, bias nodes and attributes
  auto conv_attrs = xtcl::make_node<xtcl::network::Conv2DAttrs>();
  conv_attrs->strides = std::move(CvtShape(strides));
  conv_attrs->padding = std::move(CvtShape(paddings));
  conv_attrs->dilation = std::move(CvtShape(dilations));
  conv_attrs->groups = groups;
  // conv_attrs->channels = nullptr;
  conv_attrs->kernel_size = std::move(xtcl::Array<xtcl::xIndexExpr>(nullptr));
@@ -107,20 +96,19 @@
  conv_attrs->kernel_layout = "OIHW";
  conv_attrs->out_layout = "";
  // conv_attrs->out_dtype = "";
  auto conv_node = graph->AddNode(
      output_var_name,
      graph->builder_.CreateConv2D(
          *graph->GetNode(input_var_name), *filter_const_node, conv_attrs));

  // Create a bias node if the op has a bias input; the following bias
  // dimensions are supported:
  // 0: {oc}
  // 1: {1, oc, oh, ow}
  // 2: {n, oc, oh, ow}
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_var_name = op_info->Input("Bias").front();
    auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
    auto bias_dims = bias->dims();
    auto bias_data_size = bias_dims.production();
    auto output_data_size = output_dims.production();
@@ -137,57 +125,46 @@
      // 2: {n, oc, oh, ow}
      bias_shape = output_dims.Vectorize();
    } else {
      LOG(ERROR) << "[XPU] Bias dimension " << bias_dims
                 << " isn't supported in conv2d Op when output dimension is "
                 << output_dims;
    }
    std::shared_ptr<xtcl::xExpr> bias_node = nullptr;
    if (graph->HasNode(bias_var_name)) {
      // Bias node from input node
      bias_node = graph->GetNode(bias_var_name);
    } else {
      // Bias node with const tensor
      bias_node = graph->AddNode(bias_var_name, *bias, bias_shape);
    }
    std::shared_ptr<xtcl::xExpr> add_node = nullptr;
    if (is_channel_bias) {
      add_node = graph->AddNode(
          output_var_name,
          graph->builder_.CreateBiasAdd(*conv_node, 1, *bias_node));
    } else {
      add_node = graph->AddNode(
          output_var_name,
          graph->builder_.CreateBinaryOp("add", *conv_node, *bias_node));
    }
    conv_node = add_node;
  }
  if (fuse_relu) {
    // Append a relu node if fuse_relu is true
    graph->AddNode(output_var_name, graph->builder_.CreateRelu(*conv_node));
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         conv2d,
                         paddle::lite::subgraph::xpu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(XPU,
                         depthwise_conv2d,
                         paddle::lite::subgraph::xpu::ConvConverter);
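A sketch of the bias-shape classification the converter performs: a 1-D bias of size {oc} maps to CreateBiasAdd along the channel axis, while {1, oc, oh, ow} and {n, oc, oh, ow} biases fall back to an elementwise "add". ClassifyConvBias is a hypothetical helper that restates the size comparisons above.

#include <cstdint>

enum class BiasKind { kChannel, kPerImage, kPerBatch, kUnsupported };

BiasKind ClassifyConvBias(int64_t bias_size, int64_t oc,
                          int64_t output_size, int64_t bs) {
  if (bias_size == oc) return BiasKind::kChannel;                 // {oc}
  if (bias_size == output_size / bs) return BiasKind::kPerImage;  // {1, oc, oh, ow}
  if (bias_size == output_size) return BiasKind::kPerBatch;       // {n, oc, oh, ow}
  return BiasKind::kUnsupported;
}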
@@ -12,85 +12,72 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int ElementwiseConverter(void* ctx, OpLite* op) {
  CHECK(op != nullptr);
  CHECK(ctx != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get input, and attributes
  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Input("Y").front();
  auto out_var_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");
  auto x = scope->FindMutableTensor(x_var_name);
  auto y = scope->FindMutableTensor(y_var_name);
  auto x_dims = x->dims();
  auto y_dims = y->dims();

  // Create x and y node
  std::shared_ptr<xtcl::xExpr> x_node = nullptr;
  if (graph->HasNode(x_var_name)) {
    x_node = graph->GetNode(x_var_name);
  } else {
    x_node = graph->AddNode(x_var_name, *x);
  }
  std::shared_ptr<xtcl::xExpr> y_node = nullptr;
  if (graph->HasNode(y_var_name)) {
    y_node = graph->GetNode(y_var_name);
  } else {
    y_node = graph->AddNode(y_var_name, *y);
  }

  // Create elementwise node and set input, attributes
  std::shared_ptr<xtcl::xExpr> elementwise_node = nullptr;
  if (y_dims.size() == 1) {
    elementwise_node = graph->AddNode(
        out_var_name, graph->builder_.CreateBiasAdd(*x_node, axis, *y_node));
  } else if (x_dims.size() == y_dims.size()) {
    elementwise_node = graph->AddNode(
        out_var_name, graph->builder_.CreateBinaryOp("add", *x_node, *y_node));
  } else {
    LOG(WARNING)
        << "[XPU] elementwise_add only supports y of one dimension, or x "
           "and y of the same dimension, but received x's dimension: "
        << x_dims << ", y's dimension: " << y_dims << ", axis: " << axis;
    return FAILED;
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         elementwise_add,
                         paddle::lite::subgraph::xpu::ElementwiseConverter);
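For reference, a standalone sketch of the broadcasting used in the first branch above: a 1-D y of length x_dims[axis] is added along `axis` of x. BiasAdd here is a hypothetical reference implementation, not XTCL's.

#include <cstdint>
#include <vector>

void BiasAdd(const std::vector<float>& x, const std::vector<float>& y,
             std::vector<float>* out, const std::vector<int64_t>& x_dims,
             int axis) {
  // Split x's shape into outer / channel / inner extents around `axis`.
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < axis; i++) outer *= x_dims[i];
  for (size_t i = axis + 1; i < x_dims.size(); i++) inner *= x_dims[i];
  int64_t channels = x_dims[axis];
  out->resize(x.size());
  for (int64_t o = 0; o < outer; o++)
    for (int64_t c = 0; c < channels; c++)
      for (int64_t i = 0; i < inner; i++) {
        int64_t idx = (o * channels + c) * inner + i;
        (*out)[idx] = x[idx] + y[c];  // y broadcast along `axis`
      }
}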
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/bridges/graph.h"
#include <utility>
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
std::shared_ptr<xtcl::xExpr> Graph::AddNode(const std::string& name,
const xtcl::xExpr& layer) {
auto unique_name = [&](const std::string& key) {
int idx = 1;
auto it = counts_.find(key);
if (it == counts_.end()) {
counts_.insert(std::make_pair(key, idx));
} else {
idx = ++(it->second);
}
return key + "_" + std::to_string(idx);
};
auto it = nodes_.find(name);
if (it != nodes_.end()) {
CHECK(params_.find(name) == params_.end()) << "[XPU] Node " << name
<< " redefined.";
// If the origin node isn't a const node, rebind it to a generated unique
// key (new_name -> origin node) so that it remains reachable:
nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second));
nodes_.erase(it);
}
// Create a new node and bind with the name: name->new_node
auto node = std::make_shared<xtcl::xExpr>(layer);
nodes_.insert(std::make_pair(name, node));
builder_.SetLayer(unique_name(name + "_op"));
return node;
}
// Const node
std::shared_ptr<xtcl::xExpr> Graph::AddNode(const std::string& name,
const Tensor& tensor,
PrecisionType ptype,
DataLayoutType ltype) {
return AddNode(name, tensor, tensor.dims().Vectorize(), ptype, ltype);
}
std::shared_ptr<xtcl::xExpr> Graph::AddNode(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType ptype,
DataLayoutType ltype) {
auto node = AddNode(name, shape, ptype, ltype);
params_.emplace(
std::make_pair(name, *CvtTensor(tensor, shape, ptype, ltype)));
return node;
}
// Data node
std::shared_ptr<xtcl::xExpr> Graph::AddNode(const std::string& name,
std::vector<int64_t> shape,
PrecisionType ptype,
DataLayoutType ltype) {
CHECK(!HasNode(name));
auto node = std::make_shared<xtcl::xExpr>(
builder_.CreateTensor(name, CvtShape(shape), CvtPrecisionType(ptype)));
nodes_.insert(std::make_pair(name, node));
return node;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <xtcl/xtcl.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
// The Context of the converters which used for converting the ops of subgraph
// to the XPU IR graph
class Graph {
public:
// Layer node
std::shared_ptr<xtcl::xExpr> AddNode(const std::string& name,
const xtcl::xExpr& layer);
// Const node
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
const Tensor& tensor,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
template <typename T>
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
const std::vector<T>& data,
std::vector<int64_t> shape = {},
DataLayoutType ltype = DATALAYOUT(kNCHW)) {
const std::type_info& info = typeid(T);
PrecisionType ptype = PRECISION(kFloat);
if (info == typeid(float)) {
ptype = PRECISION(kFloat);
} else if (info == typeid(int8_t)) {
ptype = PRECISION(kInt8);
} else if (info == typeid(int32_t)) {
ptype = PRECISION(kInt32);
} else {
LOG(FATAL) << "[XPU] Unknow data type " << info.name();
}
if (shape.empty()) {
shape = {static_cast<int64_t>(data.size())};
} else {
int size = 1;
for (auto i : shape) {
size *= i;
}
CHECK_EQ(data.size(), size);
}
Tensor tensor;
tensor.Resize(shape);
std::memcpy(reinterpret_cast<uint8_t*>(tensor.mutable_data<T>()),
reinterpret_cast<const uint8_t*>(data.data()),
data.size() * sizeof(T));
return AddNode(name, tensor, ptype, ltype);
}
template <typename T>
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
T value,
std::vector<int64_t> shape = {1},
DataLayoutType ltype = DATALAYOUT(kNCHW)) {
int64_t size = 1;
for (auto i : shape) {
size *= i;
}
std::vector<T> data(size, value);
return AddNode(name, data, shape, ltype);
}
// Data node
std::shared_ptr<xtcl::xExpr> AddNode(
const std::string& name,
std::vector<int64_t> shape,
PrecisionType ptype = PRECISION(kFloat),
DataLayoutType ltype = DATALAYOUT(kNCHW));
std::shared_ptr<xtcl::xExpr> GetNode(const std::string& name) {
CHECK(HasNode(name)) << "[XPU] Node " << name << " not found.";
return nodes_.at(name);
}
bool HasNode(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
public:
// XPU network builder and constant tensors
xtcl::network::xNetworkBuilder builder_;
xtcl::network::xTensorCompiler::ParamNDArrayMap params_;
private:
std::unordered_map<std::string, std::shared_ptr<xtcl::xExpr>> nodes_;
std::unordered_map<std::string, int> counts_;
};
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
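A usage sketch (assumed, based on the Graph class above) of the node-binding rules: each var name resolves to its most recent node, and rebinding a non-const name keeps the old node reachable under a generated "<name>_var_<idx>" key, so in-place ops can safely reuse their input var name as output. BuildToyGraph is a hypothetical function.

#include <vector>
#include "lite/kernels/xpu/bridges/graph.h"

void BuildToyGraph(paddle::lite::subgraph::xpu::Graph* graph) {
  // Data (input) node: binds "x" to a float32 NCHW placeholder.
  auto x = graph->AddNode("x", std::vector<int64_t>{1, 3, 224, 224});
  // Rebinding "x" to a layer node: the old node is kept under a generated
  // "x_var_<idx>" key, and "x" now resolves to the relu output.
  graph->AddNode("x", graph->builder_.CreateRelu(*x));
  auto relu_out = graph->GetNode("x");
  (void)relu_out;  // would feed the next layer
}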
@@ -12,34 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int MulConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get input, and attributes
  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Input("Y").front();
  auto out_var_name = op_info->Output("Out").front();
  auto y = scope->FindMutableTensor(y_var_name);
  auto y_dims = y->dims();
  CHECK_EQ(y_dims.size(), 2) << "XPU now only supports y_dims.size() == 2";
  auto x_num_col_dims = op_info->GetAttr<int>("x_num_col_dims");
@@ -47,54 +43,38 @@
  auto y_num_col_dims = op_info->GetAttr<int>("y_num_col_dims");
  CHECK_EQ(y_num_col_dims, 1) << "XPU now only supports y_num_col_dims == 1";

  // Flatten x node
  auto x_node = graph->AddNode(
      x_var_name + "/flatten",
      graph->builder_.CreateBatchFlatten(*graph->GetNode(x_var_name)));

  // Transpose y data and create y node
  Tensor transpose_y;
  DDim transpose_y_dims(std::vector<int64_t>{y_dims[1], y_dims[0]});
  transpose_y.Resize(transpose_y_dims);
  auto transpose_y_data = transpose_y.mutable_data<float>();
  auto y_data = y->mutable_data<float>();
  for (int i = 0; i < transpose_y_dims[0]; i++) {
    for (int j = 0; j < transpose_y_dims[1]; j++) {
      transpose_y_data[i * transpose_y_dims[1] + j] =
          y_data[j * transpose_y_dims[0] + i];
    }
  }
  auto y_const_node = graph->AddNode(y_var_name + "/transpose", transpose_y);

  // Create mul node and set params from op
  graph->AddNode(
      out_var_name,
      graph->builder_.CreateDense(*x_node,
                                  static_cast<int>(y_dims[1]),
                                  ::xtcl::NullValue<::xtcl::DataType>(),
                                  *y_const_node));
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU, mul, paddle::lite::subgraph::xpu::MulConverter);
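A standalone sketch of the row-major 2-D transpose above. The assumption, as the code suggests, is that CreateDense takes its weight in {units, in_dim} layout with units = y_dims[1], so y ({K, N}) must be stored as y^T ({N, K}). Transpose2D is a hypothetical helper.

#include <cstdint>
#include <vector>

std::vector<float> Transpose2D(const std::vector<float>& src, int64_t rows,
                               int64_t cols) {
  // dst(c, r) = src(r, c) for a rows x cols row-major matrix.
  std::vector<float> dst(src.size());
  for (int64_t r = 0; r < rows; r++)
    for (int64_t c = 0; c < cols; c++)
      dst[c * rows + r] = src[r * cols + c];
  return dst;
}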
@@ -14,13 +14,11 @@
#pragma once

USE_SUBGRAPH_BRIDGE(XPU, relu);
USE_SUBGRAPH_BRIDGE(XPU, conv2d);
USE_SUBGRAPH_BRIDGE(XPU, depthwise_conv2d);
USE_SUBGRAPH_BRIDGE(XPU, elementwise_add);
USE_SUBGRAPH_BRIDGE(XPU, pool2d);
USE_SUBGRAPH_BRIDGE(XPU, softmax);
USE_SUBGRAPH_BRIDGE(XPU, mul);
USE_SUBGRAPH_BRIDGE(XPU, batch_norm);
@@ -12,30 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int PoolConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get input, and attributes
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
  auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
@@ -44,54 +40,51 @@
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  auto exclusive = op_info->GetAttr<bool>("exclusive");

  // Create pool node and set params from op
  if (pooling_type == "max") {
    if (global_pooling) {
      graph->AddNode(
          out_var_name,
          graph->builder_.CreateGlobalMaxPool2D(*graph->GetNode(x_var_name)));
    } else {
      graph->AddNode(
          out_var_name,
          graph->builder_.CreateMaxPool2D(*graph->GetNode(x_var_name),
                                          CvtShape(ksize),
                                          CvtShape(strides),
                                          CvtShape(paddings),
                                          "NCHW",
                                          ceil_mode));
    }
  } else if (pooling_type == "avg") {
    if (global_pooling) {
      graph->AddNode(
          out_var_name,
          graph->builder_.CreateGlobalAvgPool2D(*graph->GetNode(x_var_name)));
    } else {
      // !exclusive ---> count_include_pad
      graph->AddNode(
          out_var_name,
          graph->builder_.CreateAvgPool2D(*graph->GetNode(x_var_name),
                                          CvtShape(ksize),
                                          CvtShape(strides),
                                          CvtShape(paddings),
                                          "NCHW",
                                          ceil_mode,
                                          !exclusive));
    }
  } else {
    LOG(WARNING) << "[XPU] Unsupported pooling type: " << pooling_type;
    return FAILED;
  }
  return SUCCESS;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         pool2d,
                         paddle::lite::subgraph::xpu::PoolConverter);
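A one-liner restating the flag flip in the "!exclusive ---> count_include_pad" comment above: Paddle's `exclusive` excludes padded cells from the averaging divisor, whereas XTCL's count_include_pad includes them, so the converter passes the negation. AvgPoolDivisor is a hypothetical helper.

// valid_cells: window cells that overlap the (unpadded) input.
inline int AvgPoolDivisor(int window_cells, int valid_cells,
                          bool count_include_pad) {
  return count_include_pad ? window_cells : valid_cells;
}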
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <xtcl/xtcl.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
// xpu network builder and constant tensors
class graph_ctx_type {
public:
std::shared_ptr<xtcl::network::xNetworkBuilder> builder;
std::shared_ptr<xtcl::network::xTensorCompiler::ParamNDArrayMap> params;
};
// var_name, xpu node pointer
using node_map_type =
std::unordered_map<std::string, std::shared_ptr<xtcl::xExpr>>;
using func_type = std::function<node_map_type(
const std::shared_ptr<OpLite>, graph_ctx_type*, const node_map_type&)>;
using cvt_map_type = std::unordered_map<std::string, func_type>;
class Factory {
public:
static Factory& Instance();
const cvt_map_type& AllFunctions() const { return map_; }
bool HasType(const std::string& op_type) const;
void Insert(const std::string& op_type, const func_type& func_name);
Factory() = default;
private:
cvt_map_type map_;
DISALLOW_COPY_AND_ASSIGN(Factory);
};
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Some platform-independent definitions
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \
struct __test_global_namespace_##uniq_name##__ {}; \
static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \
__test_global_namespace_##uniq_name##__>::value, \
msg)
#define REGISTER_XPU_BRIDGE(op_type, cvt_func_name) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_xpu_bridge_##op_type##__, \
"REGISTER_XPU_BRIDGE must be called in global namespace only once!"); \
int __reg_xpu_bridge_##op_type##_Insert() { \
paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert( \
#op_type, cvt_func_name); \
return 0; \
}
#define USE_XPU_BRIDGE(op_type) \
extern int __reg_xpu_bridge_##op_type##_Insert(); \
static int __reg_xpu_bridge_##op_type##_Insert_return UNUSED = \
__reg_xpu_bridge_##op_type##_Insert();
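For reference, here is roughly what one registration expands to under these (now superseded) macros; the static_assert guard is omitted for brevity, and ActConverter stands in for any converter. REGISTER_XPU_BRIDGE emits a global insertion function, and USE_XPU_BRIDGE forces it to run during static initialization so the linker keeps the bridge's object file.

// Approximate expansion of REGISTER_XPU_BRIDGE(relu, ActConverter):
int __reg_xpu_bridge_relu_Insert() {
  paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert(
      "relu", ActConverter);
  return 0;
}

// Approximate expansion of USE_XPU_BRIDGE(relu) in a consumer file:
extern int __reg_xpu_bridge_relu_Insert();
static int __reg_xpu_bridge_relu_Insert_return UNUSED =
    __reg_xpu_bridge_relu_Insert();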
@@ -12,50 +12,40 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

int SoftmaxConverter(void* ctx, OpLite* op) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  VLOG(3) << "[XPU] Converting " + op_type + "...";

  // Get op's attributes
  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");

  // Create softmax node and set params from op
  graph->AddNode(
      out_var_name,
      graph->builder_.CreateSoftmax(*graph->GetNode(x_var_name), axis));
  return SUCCESS;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(XPU,
                         softmax,
                         paddle::lite::subgraph::xpu::SoftmaxConverter);
@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/bridges/utility.h"
#include <utility>

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

bool HasInputArg(const OpInfo* op_info,
@@ -39,20 +38,6 @@
  }
}

xtcl::DataType CvtPrecisionType(PrecisionType in_type) {
  xtcl::DataType out_type = ::xtcl::Float(32);
  switch (in_type) {
@@ -66,8 +51,8 @@
      out_type = ::xtcl::Int(32);
      break;
    default:
      LOG(FATAL) << "[XPU] Can not convert precision type("
                 << PrecisionToStr(in_type) << ") from Lite to XPU";
      break;
  }
  return out_type;
@@ -86,8 +71,8 @@
      out_type = {kDLInt, 32, 1};
      break;
    default:
      LOG(FATAL) << "[XPU] Can not convert data type("
                 << PrecisionToStr(in_type) << ") from Lite to XPU";
      break;
  }
  return out_type;
@@ -109,28 +94,28 @@
xtcl::Array<xtcl::xIndexExpr> CvtShape(const DDim& in_dims) {
  return CvtShape(in_dims.Vectorize());
}

std::shared_ptr<xtcl::xNDArray> CvtTensor(const Tensor& in_tensor,
                                          std::vector<int64_t> out_shape,
                                          PrecisionType in_ptype,
                                          DataLayoutType in_ltype) {
  const uint8_t* in_data = nullptr;
  auto in_size = in_tensor.dims().production();
  auto in_shape = in_tensor.dims().Vectorize();
  if (out_shape.empty()) {
    out_shape = in_shape;
  }
  int in_bytes;
  if (in_ptype == PRECISION(kFloat)) {
    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<float>());
    in_bytes = in_size * sizeof(float);
  } else if (in_ptype == PRECISION(kInt32)) {
    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<int32_t>());
    in_bytes = in_size * sizeof(int32_t);
  } else if (in_ptype == PRECISION(kInt8)) {
    in_data = reinterpret_cast<const uint8_t*>(in_tensor.data<int8_t>());
    in_bytes = in_size * sizeof(int8_t);
  } else {
    LOG(FATAL) << "[XPU] Unknown precision type " << PrecisionToStr(in_ptype);
  }
  auto out_tensor = std::make_shared<xtcl::xNDArray>(
      xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0}));
@@ -140,50 +125,7 @@
  return out_tensor;
}

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
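A sketch (assumed) of what the const-tensor overload of Graph::AddNode does with CvtTensor: the converted NDArray is stored in the graph's param map under the var name, alongside the placeholder node. RegisterWeight is a hypothetical helper.

#include <string>
#include <utility>
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"

void RegisterWeight(paddle::lite::subgraph::xpu::Graph* graph,
                    const std::string& name,
                    const paddle::lite::Tensor& weight) {
  // Defaults assume float32 data in NCHW layout; the tensor keeps its shape.
  auto ndarray = paddle::lite::subgraph::xpu::CvtTensor(weight);
  graph->params_.emplace(std::make_pair(name, *ndarray));
}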
@@ -17,22 +17,20 @@
#include <xtcl/xtcl.h>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {

bool HasInputArg(const OpInfo* op_info,
                 const Scope* scope,
                 const std::string& argname);

xtcl::DataType CvtPrecisionType(PrecisionType in_type);

DLDataType CvtDataType(PrecisionType in_type);
@@ -44,17 +42,12 @@
xtcl::Array<xtcl::xIndexExpr> CvtShape(const DDim& in_dims);

std::shared_ptr<xtcl::xNDArray> CvtTensor(
    const Tensor& in_tensor,
    std::vector<int64_t> out_shape = {},
    PrecisionType in_ptype = PRECISION(kFloat),
    DataLayoutType in_ltype = DATALAYOUT(kNCHW));

}  // namespace xpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/graph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <string>
#include <vector>
#include "lite/backends/xpu/runtime.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void GraphCompute::PrepareForRun() {
// auto& ctx = this->ctx_->template As<XPUContext>();
auto& param = this->Param<param_t>();
CHECK(param.weight);
CHECK(lite::xpu::LoadModel(*param.weight, &runtime_));
CHECK(runtime_ != nullptr);
}
void GraphCompute::Run() {
auto& param = this->Param<param_t>();
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
for (int i = 0; i < param.inputs.size(); i++) {
auto input_var_name = param.inputs[i].first;
auto input_tensor = param.inputs[i].second;
LOG(INFO) << "input dims[" << i << ":" << input_var_name
<< "]: " << input_tensor->dims();
auto input_tensor_data = input_tensor->data<float>();
for (int j = 0; j < input_tensor->dims().production(); j++) {
VLOG(3) << input_tensor_data[j];
}
auto input_ndarray = xtcl::xNDArray::Empty(
input_tensor->dims().Vectorize(), {kDLFloat, 32, 1}, {kDLCPU, 0});
auto input_ndarray_data =
static_cast<float*>(input_ndarray.ToDLPack()->dl_tensor.data);
std::memcpy(input_ndarray_data,
input_tensor_data,
sizeof(float) * input_tensor->dims().production());
runtime_->SetInputZeroCopy(input_var_name,
&input_ndarray.ToDLPack()->dl_tensor);
}
runtime_->Run();
for (int i = 0; i < param.outputs.size(); i++) {
auto output_ndarray = runtime_->GetOutput(i);
auto output_var_name = param.outputs[i].first;
auto output_tensor = param.outputs[i].second;
output_tensor->Resize(output_ndarray.Shape());
LOG(INFO) << "output dims[" << i << ":" << output_var_name
<< "]: " << output_tensor->dims();
auto output_ndarray_data =
static_cast<float*>(output_ndarray.ToDLPack()->dl_tensor.data);
auto output_tensor_data = output_tensor->mutable_data<float>();
std::memcpy(output_tensor_data,
output_ndarray_data,
sizeof(float) * output_tensor->dims().production());
for (int j = 0; j < output_tensor->dims().production(); j++) {
VLOG(3) << output_tensor_data[j];
}
}
LOG(INFO) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us";
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(graph_op,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::GraphCompute,
def)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/backends/xpu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
int SubgraphEngine::BuildDeviceProgram() {
int status = 0;
// Convert all input data variables and add them to the XPU IR graph
subgraph::xpu::Graph graph;
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
CHECK(input_tensor);
auto input_node =
graph.AddNode(input_name, input_tensor->dims().Vectorize());
CHECK(input_node);
// XTCL doesn't support dynamic dimensions/shapes, so the program needs to
// be rebuilt when the shape of any input tensor changes.
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
}
// Convert all ops and their weights and add them to the XPU IR graph
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = inst.op();
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists("XPU", op_type)) {
return subgraph::FAILED;
}
status |= bridges.Select("XPU", op_type)(reinterpret_cast<void*>(&graph),
const_cast<OpLite*>(op));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Obtain the output nodes of the XPU IR graph and build the graph into the
// XPU runtime
std::vector<xtcl::xExpr*> output_nodes;
for (auto& output_name : output_names_) {
output_nodes.push_back(graph.GetNode(output_name).get());
}
device_program_ = lite::xpu::Device::Global().Build(
&graph.builder_, &graph.params_, &output_nodes);
if (device_program_ == nullptr) {
LOG(WARNING) << "[XPU] Build model failed!";
return subgraph::FAILED;
}
// Query and check the dimensions of input and output tensors
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
origin_odims_.resize(output_names_.size());
origin_otensors_.resize(output_names_.size());
for (size_t i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[XPU] Input dims[" << i << "]: " << origin_idims_[i];
}
for (size_t i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[XPU] Output dims[" << i << "]: " << origin_odims_[i];
}
return status;
}
int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of the origin input tensors to the buffers of the XPU input
// tensors
for (size_t i = 0; i < input_names_.size(); i++) {
auto input_ndarray =
xtcl::xNDArray::Empty(origin_itensors_[i]->dims().Vectorize(),
{kDLFloat, 32, 1},
{kDLCPU, 0});
std::memcpy(static_cast<float*>(input_ndarray.ToDLPack()->dl_tensor.data),
origin_itensors_[i]->mutable_data<float>(),
sizeof(float) * origin_itensors_[i]->dims().production());
device_program_->SetInputZeroCopy(input_names_[i],
&input_ndarray.ToDLPack()->dl_tensor);
}
// Run the XPU model
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
device_program_->Run();
VLOG(3) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of the XPU output tensors to the buffers of the origin output
// tensors
for (size_t i = 0; i < output_names_.size(); i++) {
auto output_ndarray = device_program_->GetOutput(i);
std::memcpy(origin_otensors_[i]->mutable_data<float>(),
static_cast<float*>(output_ndarray.ToDLPack()->dl_tensor.data),
sizeof(float) * origin_otensors_[i]->dims().production());
}
return 0;
}
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SubgraphCompute,
def)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
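With the subgraph op and the kXPU kernel registered above, a partitioned model runs through the regular full API. A minimal usage sketch, assuming the standard Paddle-Lite CxxConfig/CreatePaddlePredictor API and a hypothetical model directory:
#include "lite/api/paddle_api.h"
int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model path
  // Listing kXPU first lets the subgraph pass offload supported ops to the
  // XPU; unsupported ops fall back to the host/X86 kernels.
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
  });
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
          config);
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; i++) data[i] = 1.f;
  predictor->Run();
  auto output = predictor->GetOutput(0);
  return 0;
}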
@@ -14,41 +14,51 @@
 #pragma once
 
+#include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
 #include <vector>
-#include "ai_ddk_lib/include/HiAiModelManagerService.h"
 #include "lite/core/kernel.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/types.h"
+#include "lite/kernels/npu/bridges/engine.h"
+#include "lite/kernels/npu/bridges/registry.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace npu {
+namespace xpu {
 
-class GraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kFloat)> {
+class SubgraphEngine : public subgraph::Engine {
  public:
-  using param_t = operators::GraphParam;
+  SubgraphEngine(int block_idx,
+                 cpp::BlockDesc *block_desc,
+                 const std::vector<std::string> &input_names,
+                 const std::vector<std::string> &output_names,
+                 Scope *scope)
+      : subgraph::Engine(
+            block_idx, block_desc, input_names, output_names, scope) {}
+
+ protected:
+  int BuildDeviceProgram() override;
+  int LaunchDeviceProgram() override;
+
+  std::unique_ptr<xtcl::network::xRuntimeInstance> device_program_{nullptr};
+};
+
+class SubgraphCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SubgraphParam;
 
   void PrepareForRun() override;
 
   void Run() override;
 
-  virtual ~GraphCompute() = default;
+  virtual ~SubgraphCompute() = default;
 
  private:
-  std::shared_ptr<hiai::AiModelMngerClient> model_client_;
-  std::string model_name_;
-  hiai::AiContext model_context_;
-  std::vector<int64_t> npu_idatasizes_;
-  std::vector<int64_t> npu_odatasizes_;
-  std::vector<std::shared_ptr<hiai::AiTensor>> npu_itensors_;
-  std::vector<std::shared_ptr<hiai::AiTensor>> npu_otensors_;
+  std::unique_ptr<SubgraphEngine> engine_;
 };
 
-}  // namespace npu
+}  // namespace xpu
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
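SubgraphEngine above only supplies the two device hooks; the rebuild promised by REBUILD_WHEN_SHAPE_CHANGED is driven from the base subgraph::Engine. A minimal sketch of that contract, assuming (not quoting) the base-class logic:
// Assumed shape of subgraph::Engine::Launch(): remember the input dims seen
// at build time and rebuild the device program when any of them changes.
int Engine::Launch() {
  bool shape_changed = false;
  for (size_t i = 0; i < origin_itensors_.size(); i++) {
    if (!(origin_itensors_[i]->dims() == origin_idims_[i])) {
      shape_changed = true;
    }
  }
  if (shape_changed) {
    BuildDeviceProgram();  // regenerate the XTCL graph for the new shapes
  }
  return LaunchDeviceProgram();
}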
@@ -48,7 +48,7 @@ add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_
 add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS})
 add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS})
 add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS})
-add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS})
+add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS})
 
 # 2.basic ops not used in basic models
 add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS})
@@ -106,7 +106,6 @@ add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})
 add_operator(lookup_table_op extra SRCS lookup_table_op.cc DEPS ${op_DEPS})
 add_operator(lookup_table_v2_op extra SRCS lookup_table_v2_op.cc DEPS ${op_DEPS})
 add_operator(beam_search_decode_op extra SRCS beam_search_decode_op.cc DEPS ${op_DEPS})
-add_operator(graph_op_lite extra SRCS graph_op.cc DEPS ${op_DEPS})
 add_operator(logical_xor extra SRCS logical_op.cc DEPS ${op_DEPS})
 add_operator(logical_and extra SRCS logical_op.cc DEPS ${op_DEPS})
 add_operator(logical_or extra SRCS logical_op.cc DEPS ${op_DEPS})
...
@@ -70,10 +70,14 @@ struct CalibParam {
   float scale;
 };
 
-struct GraphParam {
-  std::vector<std::pair<std::string, const lite::Tensor*>> inputs{};
-  lite::Tensor* weight{};
-  std::vector<std::pair<std::string, lite::Tensor*>> outputs{};
+struct SubgraphParam {
+  std::vector<std::string> input_names{};
+  std::vector<std::string> output_names{};
+  std::vector<std::string> input_data_names{};
+  std::vector<std::string> output_data_names{};
+  int sub_block_idx{-1};
+  cpp::BlockDesc* sub_block_desc{nullptr};
+  Scope* scope{nullptr};
};
 
 /// -------------------------- NN operators ------------------------------------
...
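The renamed SubgraphParam no longer carries tensor pairs; it is filled from attributes of the op desc that the subgraph pass writes. A hedged sketch of the desc such a pass would emit (assumed cpp::OpDesc setters; the variable names are illustrative), which AttachImpl in the next file reads back key by key:
// Illustrative desc for one fused subgraph; "x"/"y" are placeholder names.
cpp::OpDesc op_desc;
op_desc.SetType("subgraph");
op_desc.SetInput("Inputs", {"x"});
op_desc.SetOutput("Outputs", {"y"});
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {"x"});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {"y"});
op_desc.SetAttr<int32_t>("sub_block", 1);  // index of the sub-block desc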
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/operators/graph_op.h"
+#include "lite/operators/subgraph_op.h"
 #include <utility>
 #include "lite/core/op_registry.h"
 
@@ -20,34 +20,29 @@ namespace paddle {
 namespace lite {
 namespace operators {
 
-bool GraphOpLite::CheckShape() const {
-  CHECK_GE_OR_FALSE(param_.inputs.size(), 1UL);
-  CHECK_GE_OR_FALSE(param_.outputs.size(), 1UL);
-  return true;
-}
+bool SubgraphOp::CheckShape() const { return true; }
 
-bool GraphOpLite::InferShape() const { return CheckShape(); /* enrich me */ }
+bool SubgraphOp::InferShape() const { return CheckShape(); /* enrich me */ }
 
-bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
-  auto inputs = op_desc.Input("Inputs");
-  auto weight = op_desc.Input("Weight");
-  auto outputs = op_desc.Output("Outputs");
-
-  for (auto var : inputs) {
-    CHECK(scope->FindVar(var));
-    param_.inputs.push_back(
-        std::make_pair(var, scope->FindVar(var)->GetMutable<lite::Tensor>()));
-  }
-
-  param_.weight = scope->FindVar(weight.front())->GetMutable<lite::Tensor>();
-  CHECK(param_.weight);
-
-  for (auto var : outputs) {
-    CHECK(scope->FindVar(var));
-    param_.outputs.push_back(
-        std::make_pair(var, scope->FindVar(var)->GetMutable<lite::Tensor>()));
-  }
+bool SubgraphOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
+  param_.input_names = op_desc.Input("Inputs");
+  param_.output_names = op_desc.Output("Outputs");
+  for (auto& input_name : param_.input_names) {
+    CHECK(scope->FindVar(input_name));
+    scope->FindVar(input_name)->GetMutable<lite::Tensor>();
+  }
+  for (auto& output_name : param_.output_names) {
+    CHECK(scope->FindVar(output_name));
+    scope->FindVar(output_name)->GetMutable<lite::Tensor>();
+  }
+  param_.input_data_names =
+      op_desc.GetAttr<std::vector<std::string>>("input_data_names");
+  param_.output_data_names =
+      op_desc.GetAttr<std::vector<std::string>>("output_data_names");
+  CHECK(param_.sub_block_desc);
+  param_.sub_block_idx = op_desc.GetAttr<int32_t>("sub_block");
+  param_.scope = scope;
+  CHECK(param_.scope);
 
   return true;
 }
@@ -55,4 +50,4 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_OP(graph_op, paddle::lite::operators::GraphOpLite);
+REGISTER_LITE_OP(subgraph, paddle::lite::operators::SubgraphOp);
@@ -27,11 +27,11 @@ namespace paddle {
 namespace lite {
 namespace operators {
 
-class GraphOpLite : public OpLite {
+class SubgraphOp : public OpLite {
  public:
-  GraphOpLite() {}
+  SubgraphOp() {}
 
-  explicit GraphOpLite(const std::string &type) : OpLite(type) {}
+  explicit SubgraphOp(const std::string &type) : OpLite(type) {}
 
   bool CheckShape() const override;
@@ -41,10 +41,13 @@ class GraphOpLite : public OpLite {
   void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
 
-  std::string DebugString() const override { return "graph_op"; }
+  std::string DebugString() const override { return "subgraph"; }
+
+  void SetSubBlock(cpp::BlockDesc *desc) { param_.sub_block_desc = desc; }
+  cpp::BlockDesc *GetSubBlock() { return param_.sub_block_desc; }
 
  private:
-  mutable GraphParam param_;
+  mutable SubgraphParam param_;
 };
 
 }  // namespace operators
...
@@ -8,7 +8,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE
     lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
@@ -44,16 +44,16 @@ if(LITE_BUILD_EXTRA)
     lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   endif()
-  lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
@@ -243,38 +243,53 @@ class ActivationComputeTester : public arena::TestCase {
 TEST(Activation_relu, precision) {
   LOG(INFO) << "test relu op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#elif defined(LITE_WITH_XPU)
+  place = TARGET(kXPU);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
       for (auto h : {9, 18}) {
         for (auto w : {9, 18}) {
-          for (auto slope : {0.01, 0.1}) {
-            std::unique_ptr<arena::TestCase> tester(new ActivationComputeTester(
-                place,
-                "def",
-                0.01,
-                6.,
-                "all",
-                0.,
-                DDim(std::vector<int64_t>({n, c, h, w})),
-                "relu",
-                RELU));
-            arena::Arena arena(std::move(tester), place, 2e-5);
-            arena.TestPrecision();
-          }
+          std::unique_ptr<arena::TestCase> tester(new ActivationComputeTester(
+              place,
+              "def",
+              0.01,
+              6.,
+              "all",
+              0.,
+              DDim(std::vector<int64_t>({n, c, h, w})),
+              "relu",
+              RELU));
+          arena::Arena arena(std::move(tester), place, abs_error);
+          arena.TestPrecision();
         }
       }
     }
   }
-#endif
 }
 
 TEST(Activation_leaky_relu, precision) {
   LOG(INFO) << "test leaky_relu op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
@@ -291,20 +306,27 @@ TEST(Activation_leaky_relu, precision) {
               DDim(std::vector<int64_t>({n, c, h, w})),
               "leaky_relu",
               LEAKY_RELU));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
           arena.TestPrecision();
         }
       }
     }
   }
 }
-#endif
 }
 
 TEST(Activation_relu_clipped, precision) {
   LOG(INFO) << "test relu clipped op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
@@ -321,14 +343,13 @@ TEST(Activation_relu_clipped, precision) {
               DDim(std::vector<int64_t>({n, c, h, w})),
               "relu_clipped",
               RELU_CLIPPED));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
           arena.TestPrecision();
         }
       }
    }
  }
 }
-#endif
 }
 
 TEST(Activation_prelu, precision) {
@@ -363,8 +384,16 @@ TEST(Activation_prelu, precision) {
 TEST(Activation_sigmoid, precision) {
   LOG(INFO) << "test sigmoid op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
@@ -380,19 +409,26 @@ TEST(Activation_sigmoid, precision) {
               DDim(std::vector<int64_t>({n, c, h, w})),
               "sigmoid",
               SIGMOID));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
           arena.TestPrecision();
         }
      }
    }
  }
-#endif
 }
 
 TEST(Activation_tanh, precision) {
   LOG(INFO) << "test tanh op";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
 
   for (auto n : {1, 3}) {
     for (auto c : {3, 6}) {
@@ -408,13 +444,12 @@ TEST(Activation_tanh, precision) {
              DDim(std::vector<int64_t>({n, c, h, w})),
              "tanh",
              TANH));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
          arena.TestPrecision();
        }
      }
    }
  }
-#endif
 }
 
 TEST(Activation_swish, precision) {
...
@@ -7,7 +7,7 @@ ARM_ABI="armv8" # armv8, armv7
 ARM_LANG="gcc" # gcc only yet
 ANDROID_STL="c++_shared" # c++_shared/c++_static, c++_shared is used by HiAI DDK 310
 DDK_ROOT="$(pwd)/ai_ddk_lib/" # HiAI DDK 310 from https://developer.huawei.com/consumer/cn/hiai/
-TARGET_NAME="test_npu_pass" # default target
+TARGET_NAME="test_subgraph_pass" # default target
 BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
 WITH_JAVA=ON # ON(build jar and jni so)/OFF
 WITH_TESTING=ON # ON/OFF
...
@@ -3,7 +3,7 @@ set -ex
 
 # global variables with default value
 XPU_SDK_ROOT="$(pwd)/../XPU_SDK" # XPU SDK
-TARGET_NAME="lite_compile_deps" # default target
+TARGET_NAME="test_subgraph_pass" # default target
 BUILD_EXTRA=ON # ON(with sequence ops)/OFF
 WITH_TESTING=ON # ON/OFF
@@ -73,8 +73,8 @@ function build_xpu {
         -DWITH_MKLDNN=OFF \
         -DLITE_WITH_X86=ON \
         -DWITH_MKL=ON \
-        -DLITE_BUILD_EXTRA=ON \
         -DLITE_WITH_XPU=ON \
+        -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
         -DWITH_TESTING=${WITH_TESTING} \
         -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
...