提交 809f7fc3 编写于 作者: J jackzhang235 提交者: jackzhang235

fix some error when compiling (#6)

* fix some error when compiling with mlu-sdk1.2.5
上级 ce58801f
......@@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
......@@ -177,6 +178,10 @@ if(LITE_WITH_XPU)
include(device/xpu)
endif()
# Load the `mlu` CMake module only when the MLU backend is enabled.
if(LITE_WITH_MLU)
include(mlu)
endif()
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
......
......@@ -150,6 +150,10 @@ if (LITE_WITH_BM)
add_definitions("-DLITE_WITH_BM")
endif()
# Make LITE_WITH_MLU visible to the C/C++ sources as a preprocessor macro.
if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
if (LITE_WITH_PRECISION_PROFILE)
......
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -100,6 +100,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
# Add the MLU-only dependencies to the result when the MLU backend is enabled.
if (LITE_WITH_MLU)
  list(APPEND deps ${lite_deps_MLU_DEPS})
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -144,6 +150,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
if (args_SHARED OR ARGS_shared)
......@@ -170,7 +177,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -189,6 +196,7 @@ function(lite_cc_binary TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
......@@ -218,7 +226,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -245,6 +253,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
......@@ -280,12 +290,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -357,6 +367,12 @@ function(add_kernel TARGET device level)
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
# MLU kernels: leave add_kernel early when the MLU backend is disabled,
# otherwise append this target to the cached `mlu_kernels` list.
if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU)
return()
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
return()
......@@ -391,6 +407,7 @@ function(add_kernel TARGET device level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -409,7 +426,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -442,6 +459,7 @@ function(add_operator TARGET level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......
......@@ -65,7 +65,8 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
......@@ -87,6 +88,7 @@ message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -124,7 +126,8 @@ lite_cc_library(light_api SRCS light_api.cc
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -143,6 +146,7 @@ if(WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -288,6 +292,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -320,6 +325,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -333,6 +339,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -345,6 +352,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -357,6 +365,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -369,6 +378,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
......@@ -380,6 +390,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......
......@@ -34,6 +34,11 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
#ifdef LITE_WITH_CUDA
Env<TARGET(kCUDA)>::Init();
#endif
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
mlu_core_version_ = config.mlu_core_version();
mlu_core_number_ = config.mlu_core_number();
#endif // LITE_WITH_MLU
auto places = config.valid_places();
std::vector<std::string> passes{};
auto use_layout_preprocess_pass =
......@@ -82,6 +87,9 @@ std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
void CxxPaddleApiImpl::Run() {
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Global().SetRunMode(mode_, threads_);
#endif
#ifdef LITE_WITH_MLU
lite::DeviceInfo::Global().SetMLURunMode(mlu_core_version_, mlu_core_number_);
#endif
raw_predictor_.Run();
}
......
......@@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
......
......@@ -106,6 +106,8 @@ class LITE_API PaddlePredictor {
protected:
int threads_{1};
lite_api::PowerMode mode_{lite_api::LITE_POWER_NO_BIND};
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLU_270};
int mlu_core_number_{1};
};
/// Base class for all the configs.
......@@ -136,6 +138,11 @@ class LITE_API CxxConfig : public ConfigBase {
#ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1;
#endif
bool use_firstconv_{false};
std::vector<float> mean_ = {0.0f};
std::vector<float> std_ = {1.0f};
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
......@@ -163,6 +170,20 @@ class LITE_API CxxConfig : public ConfigBase {
return x86_math_library_math_threads_;
}
#endif
// --- Setters -------------------------------------------------------------
// Stores the "first conv" flag on this config.
void set_use_firstconv(const bool firstconv) { use_firstconv_ = firstconv; }
// The mean / std vectors are taken by const reference so the caller's vector
// is copied once (into the member) instead of once per call plus once per
// assignment.
void set_mean(const std::vector<float>& mean) { mean_ = mean; }
void set_std(const std::vector<float>& std) { std_ = std; }
// Selects the MLU core version stored on this config.
void set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
  mlu_core_version_ = core_version;
}
// Selects the MLU core count stored on this config.
void set_mlu_core_number(int core_number) { mlu_core_number_ = core_number; }

// --- Getters -------------------------------------------------------------
// NOTE: the getter is spelled `use_first_conv` while the setter is
// `set_use_firstconv`; both names are part of the public interface.
bool use_first_conv() const { return use_firstconv_; }
std::vector<float> mean() const { return mean_; }
std::vector<float> std() const { return std_; }
lite_api::MLUCoreVersion mlu_core_version() const {
  return mlu_core_version_;
}
int mlu_core_number() const { return mlu_core_number_; }
};
/// MobileConfig is the config for the light weight predictor, it will skip
......
......@@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) {
"fpga",
"npu",
"xpu",
"bm"};
"bm",
"mlu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -111,6 +112,7 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA",
"kNPU",
"kXPU",
// Entries are indexed by the TargetType value: kBM = 10 comes before kMLU = 11.
"kBM",
"kMLU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
......@@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kMLU),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
......
......@@ -53,8 +53,9 @@ enum class TargetType : int {
kNPU = 8,
kXPU = 9,
kBM = 10,
kMLU = 11,
kAny = 6, // any target
NUM = 11, // number of fields.
NUM = 12, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......@@ -88,6 +89,8 @@ typedef enum {
LITE_POWER_RAND_LOW = 5
} PowerMode;
typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
enum class ActivationType : int {
kIndentity = 0,
kRelu = 1,
......
......@@ -45,5 +45,8 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(subgraph_cast_display_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
......@@ -109,6 +109,11 @@ void BindLiteCxxConfig(py::module *m) {
.def("set_power_mode", &CxxConfig::set_power_mode)
.def("power_mode", &CxxConfig::power_mode);
#endif
#ifdef LITE_WITH_MLU
  // MLU-only setters exposed on the Python CxxConfig class. This chain is a
  // statement of its own, so it has to be terminated with a semicolon.
  cxx_config.def("set_use_firstconv", &CxxConfig::set_use_firstconv)
      .def("set_mean", &CxxConfig::set_mean)
      .def("set_std", &CxxConfig::set_std);
#endif
}
// TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
......@@ -150,6 +155,9 @@ void BindLitePlace(py::module *m) {
.value("OpenCL", TargetType::kOpenCL)
.value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU)
#ifdef LITE_WITH_MLU
.value("MLU", TargetType::kMLU)
#endif
.value("Any", TargetType::kAny);
// PrecisionType
......@@ -230,6 +238,20 @@ void BindLiteTensor(py::module *m) {
DO_GETTER_ONCE(data_type__, name__##_data)
DATA_GETTER_SETTER_ONCE(int8_t, int8);
#ifdef LITE_WITH_MLU
// Python setter for uint8 tensor data. Only TargetType::kHost is handled:
// for any other `type` the call returns without copying anything.
tensor.def("set_uint8_data",
[](Tensor &self,
const std::vector<uint8_t> &data,
TargetType type = TargetType::kHost) {
if (type == TargetType::kHost) {
self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data());
}
},
py::arg("data"),
py::arg("type") = TargetType::kHost);
// NOTE(review): the other call sites pass a bare identifier as the second
// macro argument; here a string literal is passed. Confirm against the
// DO_GETTER_ONCE definition that this yields the intended Python name.
DO_GETTER_ONCE(uint8_t, "uint8_data");
#endif
DATA_GETTER_SETTER_ONCE(int32_t, int32);
DATA_GETTER_SETTER_ONCE(float, float);
#undef DO_GETTER_ONCE
......
......@@ -6,4 +6,5 @@ add_subdirectory(fpga)
add_subdirectory(host)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
......@@ -7,7 +7,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
CUDA_DEPS target_wrapper_cuda
CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm)
BM_DEPS target_wrapper_bm
MLU_DEPS target_wrapper_mlu)
lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -24,6 +24,11 @@
#include "lite/backends/opencl/cl_context.h"
#include "lite/backends/opencl/cl_runtime.h"
#endif
#ifdef LITE_WITH_MLU
#include <cnml.h>
#include <cnrt.h>
#include "lite/backends/mlu/mlu_utils.h"
#endif
#include <map>
#include <memory>
......@@ -52,6 +57,7 @@ using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>;
template <>
class Context<TargetType::kHost> {
......@@ -171,6 +177,85 @@ class Context<TargetType::kFPGA> {
};
#endif
#ifdef LITE_WITH_MLU
// Kernel context for the MLU target. It records which device and which
// execution / IO queue (looked up in Env<kMLU>) the owning kernel uses.
template <>
class Context<TargetType::kMLU> {
 public:
  // Device list owned by Env<kMLU>; shared by every MLU context.
  typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global();

  void InitOnce() {}

  // Re-runs Init() with the device / queue ids stored in `ctx`. The queue
  // handles are looked up again from `devs` rather than copied.
  MLUContext& operator=(const MLUContext& ctx) {
    this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
    return *this;
  }

  // Binds this context to device `dev_id` and to the given queue slots.
  // Any index that is negative or past the end falls back to 0 with a
  // warning. All later lookups use the validated `device_id_`, never the raw
  // `dev_id`, so an out-of-range request can no longer index past `devs`.
  void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
    CHECK_GT(devs.size(), 0UL)
        << "Env is not initialized or current target is not exit!";
    if (dev_id < 0 || dev_id >= static_cast<int>(devs.size())) {
      LOG(WARNING) << "device index exceeds the number of devices, set to "
                      "default device(0)!";
      device_id_ = 0;
    } else {
      device_id_ = dev_id;
    }
    SetMluDevice(device_id_);

    auto& device = devs[device_id_];
    const int queue_limit = device.max_queue();
    if (io_queue_id < 0 || io_queue_id >= queue_limit) {
      LOG(WARNING) << "data queue index exceeds the maximum queue number, "
                      "set to default qeueu(0)!";
      io_queue_id = 0;
    }
    if (exec_queue_id < 0 || exec_queue_id >= queue_limit) {
      LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
                      "set to default qeueu(0)!";
      exec_queue_id = 0;
    }
    io_queue_ = device.io_queues()[io_queue_id];
    exec_queue_ = device.exec_queues()[exec_queue_id];

    exec_queue_id_ = exec_queue_id;
    io_queue_id_ = io_queue_id;
  }

  // Only the forward parameter is shared with another context.
  void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }

  const cnrtQueue_t& exec_queue() const { return exec_queue_; }
  void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; }

  const cnrtQueue_t& io_queue() const { return io_queue_; }
  void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }

  // Core version / core count are read from the global DeviceInfo.
  cnmlCoreVersion_t MLUCoreVersion() {
    return DeviceInfo::Global().MLUCoreVersion();
  }
  int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }

  u32_t affinity() { return affinity_; }
  cnrtInvokeFuncParam_t forward_param() { return forward_param_; }
  int device_id() { return device_id_; }

  std::string name() const { return "MLUContext"; }

 private:
  // All members are value-initialized so that copying from a context that
  // was never Init()-ed reads well-defined values.
  int device_id_{0};
  // overall information
  int exec_queue_id_{0};
  int io_queue_id_{0};
  cnrtQueue_t io_queue_{};
  cnrtQueue_t exec_queue_{};

  std::vector<cnrtNotifier_t> input_notifiers_;
  std::vector<cnrtNotifier_t> output_notifiers_;

  cnrtInvokeFuncParam_t forward_param_{};
  u32_t affinity_ = 0x01;
};
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA
// Only works with CUDA kernels.
template <>
......@@ -393,6 +478,16 @@ class ContextScheduler {
kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
&ctx->As<BMContext>());
break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
auto& context = ctx->As<MLUContext>();
context.Init(dev_id);
kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
&context);
LOG(INFO) << "New Context for MLU";
} break;
#endif
default:
#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
......@@ -434,6 +529,9 @@ class ContextScheduler {
#endif
#ifdef LITE_WITH_BM
InitContext<TargetType::kBM, BMContext>();
#endif
#ifdef LITE_WITH_MLU
InitContext<TargetType::kMLU, MLUContext>();
#endif
}
......
......@@ -58,7 +58,7 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
// Must match the guard around the DeviceInfo declaration in the header:
// the macro is LITE_WITH_ARM (no trailing underscore).
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
thread_local lite_api::PowerMode DeviceInfo::mode_;
thread_local ARMArch DeviceInfo::arch_;
thread_local int DeviceInfo::mem_size_;
......@@ -66,6 +66,11 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0;
#ifdef LITE_WITH_MLU
thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
thread_local int DeviceInfo::mlu_core_number_{1};
#endif
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
......@@ -1080,6 +1085,28 @@ int DeviceInfo::Setup() {
return 0;
}
#ifdef LITE_WITH_MLU
// Stores the requested MLU core version and core count in the thread-local
// DeviceInfo members.
void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
                               int core_number) {
  // Only MLU_220 selects CNML_MLU220; MLU_270 and every other value select
  // CNML_MLU270.
  const bool wants_mlu220 =
      (core_version == lite_api::MLUCoreVersion::MLU_220);
  mlu_core_version_ = wants_mlu220 ? CNML_MLU220 : CNML_MLU270;
  mlu_core_number_ = core_number;
}
// Accessors for the values last stored by SetMLURunMode() (thread-local).
cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
#endif // LITE_WITH_MLU
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_);
......@@ -1159,6 +1186,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) {
#endif // LITE_WITH_ARM
#ifdef LITE_WITH_MLU
// Makes MLU device `device_id` the current CNRT device: resolves its device
// handle and sets it as current. Both CNRT calls go through CNRT_CALL.
void SetMluDevice(int device_id) {
LOG(INFO) << "Set mlu device " << device_id;
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
}
// Initializes this device: selects it as the current MLU device first, then
// runs GetInfo() and creates its queues.
void Device<TARGET(kMLU)>::Init() {
SetMluDevice(idx_);
GetInfo();
CreateQueue();
}
// Intentionally empty for now: no device properties are queried.
void Device<TARGET(kMLU)>::GetInfo() {}
// (Re)creates the execution and IO queues of this device.
//
// Every cnrtCreateQueue() call is wrapped in CNRT_CALL, as the other CNRT
// calls in this file are, so a failed creation is reported instead of an
// indeterminate handle being stored. The loop index is a signed int like
// `max_queue_`, so a negative `max_queue_` yields zero iterations instead of
// wrapping around to a huge unsigned bound.
//
// NOTE(review): clear() only drops the stored handles; queues created by an
// earlier call are not destroyed here.
void Device<TARGET(kMLU)>::CreateQueue() {
  exec_queue_.clear();
  io_queue_.clear();
  for (int i = 0; i < max_queue_; ++i) {
    cnrtQueue_t exec_queue;
    cnrtQueue_t io_queue;
    CNRT_CALL(cnrtCreateQueue(&exec_queue));
    CNRT_CALL(cnrtCreateQueue(&io_queue));
    exec_queue_.push_back(exec_queue);
    io_queue_.push_back(io_queue);

    // NOTE(review): a second execution queue is created per iteration, so
    // exec_queue_ ends up with 2 * max_queue_ entries while io_queue_ has
    // max_queue_. Kept as-is; confirm whether this is intended.
    CNRT_CALL(cnrtCreateQueue(&exec_queue));
    exec_queue_.push_back(exec_queue);
  }
}
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() {
......
......@@ -19,11 +19,14 @@
#include <vector>
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/mlu_utils.h"
#endif
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum {
kAPPLE = 0,
......@@ -52,6 +55,11 @@ class DeviceInfo {
int Setup();
void SetRunMode(lite_api::PowerMode mode, int thread_num);
#ifdef LITE_WITH_MLU
void SetMLURunMode(lite_api::MLUCoreVersion core_version, int core_number);
cnmlCoreVersion_t MLUCoreVersion();
int MLUCoreNumber();
#endif
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
......@@ -103,6 +111,11 @@ class DeviceInfo {
static thread_local TensorLite workspace_;
static thread_local int64_t count_;
#ifdef LITE_WITH_MLU
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
#endif
void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...);
void SetFP32Info(int argc, ...);
......@@ -134,6 +147,9 @@ class Env {
return *devs;
}
static void Init(int max_stream = 4) {
#ifdef LITE_WITH_MLU
CNRT_CALL(cnrtInit(0));
#endif
Devs& devs = Global();
if (devs.size() > 0) {
return;
......@@ -156,6 +172,41 @@ class Env {
}
};
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id);
// Device specialization for the MLU target: holds the device index and the
// CNRT queues created for it by Init().
template <>
class Device<TARGET(kMLU)> {
public:
// `max_queue` is the number of loop iterations CreateQueue() performs.
Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {}
// Selects the device, then creates its queues (defined in device_info.cc).
void Init();
int id() { return idx_; }
int max_queue() { return max_queue_; }
void SetId(int idx) { idx_ = idx; }
std::string name() { return "MLU"; }
// NOTE(review): core_num() and max_memory() return hard-coded constants,
// not values queried from the device; the unit of max_memory() is not
// stated here — confirm.
int core_num() { return 16; }
float max_memory() { return 16 * 1024; }
// Both accessors return a copy of the queue vector.
std::vector<cnrtQueue_t> io_queues() { return io_queue_; }
std::vector<cnrtQueue_t> exec_queues() { return exec_queue_; }
private:
void CreateQueue();
void GetInfo();
private:
int idx_{0};
int max_queue_;
// NOTE(review): device_name_ and max_memory_ are not written or read by any
// member shown in this class.
std::string device_name_;
float max_memory_;
std::vector<cnrtQueue_t> io_queue_;
std::vector<cnrtQueue_t> exec_queue_;
};
template class Env<TARGET(kMLU)>;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA
template <>
class Device<TARGET(kCUDA)> {
......
......@@ -83,6 +83,9 @@ class KernelBase {
#if defined(LITE_WITH_CUDA)
WorkSpace::Global_CUDA().AllocReset();
#endif
#if defined(LITE_WITH_MLU)
WorkSpace::Global_MLU().AllocReset();
#endif
#ifdef LITE_WITH_PROFILE
profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
......
......@@ -45,6 +45,11 @@ void* TargetMalloc(TargetType target, size_t size) {
data = TargetWrapper<TARGET(kBM)>::Malloc(size);
break;
#endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
data = TargetWrapper<TARGET(kMLU)>::Malloc(size);
break;
#endif // LITE_WITH_MLU
default:
LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
}
......@@ -79,6 +84,11 @@ void TargetFree(TargetType target, void* data) {
TargetWrapper<TARGET(kBM)>::Free(data);
break;
#endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::Free(data);
break;
#endif // LITE_WITH_MLU
default:
LOG(FATAL) << "Unknown type";
}
......@@ -110,6 +120,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
break;
#endif
#ifdef LITE_WITH_MLU
// NOTE(review): the MLU case copies with IoDirection::HtoD, whereas the
// neighbouring BM and OpenCL cases use IoDirection::DtoD — confirm that
// host-to-device is the intended direction for TargetCopy on MLU.
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::MemcpySync(
dst, src, size, IoDirection::HtoD);
break;
#endif
#ifdef LITE_WITH_OPENCL
case TargetType::kOpenCL:
TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
......
......@@ -30,6 +30,10 @@
#include "lite/backends/bm/target_wrapper.h"
#endif // LITE_WITH_BM
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif // LITE_WITH_MLU
namespace paddle {
namespace lite {
......@@ -81,6 +85,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
case TARGET(kBM):
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU):
TargetWrapperMlu::MemcpySync(dst, src, size, dir);
break;
#endif
default:
LOG(FATAL)
......
......@@ -35,6 +35,8 @@ lite_cc_library(mir_passes
demo_pass.cc
runtime_context_assign_pass.cc
memory_optimize_pass.cc
mlu_postprocess_pass.cc
subgraph_cast_display_pass.cc
weight_quantization_preprocess_pass.cc
quantized_op_attributes_inference_pass.cc
DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
......
......@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() {
return adj_list;
}
// Builds the inlink table of the graph: every node is mapped to the set of
// nodes on its `inlinks`. A node without inlinks still gets an (empty) entry.
std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildNodeAdjList() {
  std::map<mir::Node *, std::set<mir::Node *>> inlink_table;
  for (auto &node : mutable_nodes()) {
    // operator[] creates the entry on first access, so nodes with no inlinks
    // are present in the table as well.
    auto &predecessors = inlink_table[&node];
    for (auto *in_node : node.inlinks) {
      // std::set keeps its own ordering, so insertion order is irrelevant.
      predecessors.insert(in_node);
    }
  }
  return inlink_table;
}
void SSAGraph::SortHelper(
const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node,
......@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() {
return res;
}
// Returns all nodes of the graph in the order produced by running SortHelper
// over the node inlink table, starting from every node not yet visited.
std::vector<mir::Node *> SSAGraph::NodeTopologicalOrder() {
  CheckBidirectionalConnection();

  std::set<mir::Node *> visited;
  std::vector<mir::Node *> res;

  const auto adj_list = BuildNodeAdjList();

  // Iterate by const reference: each map entry carries a std::set that would
  // otherwise be copied on every iteration.
  for (const auto &adj : adj_list) {
    if (visited.find(adj.first) == visited.end()) {
      SortHelper(adj_list, adj.first, &visited, &res);
    }
  }

  return res;
}
Node *SSAGraph::GraphCreateInstructNode(
const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) {
node_storage_.emplace_back();
......
......@@ -42,6 +42,8 @@ class SSAGraph : GraphBase {
std::vector<mir::Node *> StmtTopologicalOrder();
std::vector<mir::Node *> NodeTopologicalOrder();
// The inputs of the graph.
std::vector<mir::Node *> inputs();
......@@ -86,6 +88,9 @@ class SSAGraph : GraphBase {
// Build operator inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();
// Build node inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList();
void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node,
std::set<mir::Node *> *visited,
......
......@@ -313,8 +313,9 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) {
std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
node_map_t *nodes) {
for (auto &it : *nodes) {
node_dat_t *node = it.second;
for (auto &n_tpo : graph_->NodeTopologicalOrder()) {
CHECK(nodes->find(n_tpo) != nodes->end());
node_dat_t *node = (*nodes)[n_tpo];
if (!node->marked) {
continue;
}
......
......@@ -67,6 +67,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
// Fuses runs of MLU-supported statement nodes of `graph` into subgraphs.
void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // Collect the op types for which the included bridge list invokes
  // USE_SUBGRAPH_BRIDGE; each invocation inserts its op type as a string.
  std::unordered_set<std::string> bridged_ops;
#define USE_SUBGRAPH_BRIDGE(op_type, target) bridged_ops.insert(#op_type);
#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE

  // A node is accepted only when it is a statement whose op type was
  // collected above.
  auto teller = [&bridged_ops](Node* node) -> bool {
    if (node->IsStmt()) {
      const auto& op_type = node->AsStmt().op_type();
      return bridged_ops.find(op_type) != bridged_ops.end();
    }
    return false;
  };

  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
  fuser();
}
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -77,3 +91,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)});
......@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
// Program pass for the MLU target; Apply() is defined in the matching
// .cc file and registered there as `mlu_subgraph_pass`.
class MLUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -22,29 +22,15 @@ namespace mir {
class SubgraphCastDisplayPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
VLOG(3) << "== Argument types ==";
for (auto& node : graph->mutable_nodes()) {
if (!node.IsArg()) continue;
auto* type = node.AsArg().type;
if (type) {
VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
} else {
VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
}
}
VLOG(3) << "---------------------";
//
VLOG(0) << "== SubgraphOp Debug Info ==";
VLOG(4) << "== SubgraphOp Debug Info ==";
for (auto& node : graph->mutable_nodes()) {
if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
VLOG(0) << "FOUND SUBGRAPH OP";
VLOG(4) << "FOUND SUBGRAPH OP";
display_debug_info(node, "subgraph");
break;
}
}
VLOG(0) << "---------------------";
VLOG(4) << "---------------------";
}
void display_debug_info(const Node& node,
......@@ -52,17 +38,17 @@ class SubgraphCastDisplayPass : public DebugPass {
bool display_in_nodes = true,
bool display_out_nodes = true) {
CHECK(node.IsStmt());
VLOG(0) << node.AsStmt();
// VLOG(4) << node.AsStmt();
if (display_in_nodes) {
for (auto p_in_arg_node : node.inlinks) {
CHECK(p_in_arg_node->IsArg());
VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
VLOG(4) << "* ARG[IN] " << p_in_arg_node->AsArg().name
<< " type: " << *p_in_arg_node->AsArg().type
<< " is_weight: " << p_in_arg_node->AsArg().is_weight
<< " is_persist: " << p_in_arg_node->AsArg().is_persist
<< " input_count: " << p_in_arg_node->inlinks.size();
if (p_in_arg_node->inlinks.size() == 0) {
VLOG(0) << "** END with No Op";
VLOG(4) << "** END with No Op";
}
for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
CHECK(p_in_stmt_node->IsStmt());
......@@ -71,7 +57,7 @@ class SubgraphCastDisplayPass : public DebugPass {
stmt_op_type == "io_copy") {
display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
VLOG(4) << "** END with op type: " << stmt_op_type;
}
}
}
......@@ -79,13 +65,13 @@ class SubgraphCastDisplayPass : public DebugPass {
if (display_out_nodes) {
for (auto p_out_arg_node : node.outlinks) {
CHECK(p_out_arg_node->IsArg());
VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
VLOG(4) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
<< " type: " << *p_out_arg_node->AsArg().type
<< " is_weight: " << p_out_arg_node->AsArg().is_weight
<< " is_persist: " << p_out_arg_node->AsArg().is_persist
<< " output_count: " << p_out_arg_node->outlinks.size();
if (p_out_arg_node->outlinks.size() == 0) {
VLOG(0) << "** END with No Op";
VLOG(4) << "** END with No Op";
}
for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
CHECK(p_out_stmt_node->IsStmt());
......@@ -94,7 +80,7 @@ class SubgraphCastDisplayPass : public DebugPass {
stmt_op_type == "io_copy") {
display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
VLOG(4) << "** END with op type: " << stmt_op_type;
}
}
}
......@@ -108,4 +94,4 @@ class SubgraphCastDisplayPass : public DebugPass {
REGISTER_MIR_PASS(subgraph_cast_display_pass,
paddle::lite::mir::SubgraphCastDisplayPass)
.BindTargets({TARGET(kAny)});
.BindTargets({TARGET(kMLU)});
......@@ -107,6 +107,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kBM): {
CREATE_KERNEL(kBM);
} break;
case TARGET(kMLU): {
CREATE_KERNEL(kMLU);
} break;
default:
CHECK(false) << "not supported kernel target " << TargetToStr(target);
}
......@@ -139,6 +142,15 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kCUDA, kInt64, kNCHW);
INIT_FOR(kCUDA, kInt64, kNHWC);
INIT_FOR(kMLU, kFloat, kNHWC);
INIT_FOR(kMLU, kFloat, kNCHW);
INIT_FOR(kMLU, kFP16, kNHWC);
INIT_FOR(kMLU, kFP16, kNCHW);
INIT_FOR(kMLU, kInt8, kNHWC);
INIT_FOR(kMLU, kInt8, kNCHW);
INIT_FOR(kMLU, kInt16, kNHWC);
INIT_FOR(kMLU, kInt16, kNCHW);
INIT_FOR(kHost, kFloat, kNCHW);
INIT_FOR(kHost, kAny, kNCHW);
INIT_FOR(kHost, kFloat, kNHWC);
......
......@@ -268,7 +268,32 @@ class KernelRegistry final {
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny)> * //
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kFloat),
DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kFP16),
DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kFP16),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kInt8),
DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kInt16),
DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kInt16),
DATALAYOUT(kNCHW)> * //
>;
KernelRegistry();
......
......@@ -115,9 +115,15 @@ class Optimizer {
"variable_place_inference_pass", //
"argument_type_display_pass",
"mlu_subgraph_pass",
"mlu_postprocess_pass",
// subgraph_cast_display_pass
"runtime_context_assign_pass",
"argument_type_display_pass",
"memory_optimize_pass"}};
if (passes.size() == 1) {
passes_local.push_back(passes[0]);
}
......
......@@ -69,6 +69,13 @@ class WorkSpace {
}
#endif
#if defined(LITE_WITH_MLU)
// Returns the calling thread's WorkSpace for the MLU target. The instance is
// held in a thread_local unique_ptr, so each thread gets its own workspace,
// constructed the first time that thread calls this function.
static WorkSpace& Global_MLU() {
  thread_local std::unique_ptr<WorkSpace> mlu_workspace(
      new WorkSpace(TARGET(kMLU)));
  return *mlu_workspace;
}
#endif
private:
explicit WorkSpace(TargetType x) : target_(x) {}
......
......@@ -10,4 +10,5 @@ add_subdirectory(opencl)
add_subdirectory(fpga)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
......@@ -29,13 +29,13 @@ set(mlu_subgraph_bridges
CACHE INTERNAL "mlu_subgraph_bridges")
# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
......@@ -54,4 +54,8 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} // namespace lite
} // namespace paddle
// Register ActConverter as the kMLU subgraph bridge for the sigmoid, relu and
// tanh op types; all three share the same converter function.
REGISTER_SUBGRAPH_BRIDGE(sigmoid,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
......@@ -25,8 +25,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int ActConverter(void* ctx, OpLite* op);
template void FillTensor<float, int>(Tensor* x,
float lower = -2,
float upper = -2);
......@@ -149,8 +147,6 @@ TEST(MLUBridges, activation) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
sigmoid,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
USE_SUBGRAPH_BRIDGE(relu, kMLU)
USE_SUBGRAPH_BRIDGE(tanh, kMLU)
......@@ -23,8 +23,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int BatchNormConverter(void* ctx, OpLite* op);
template <typename dtype>
void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) {
Scope* scope = op->scope();
......@@ -181,6 +179,4 @@ TEST(MLUBridges, batch_norm) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
batch_norm,
paddle::lite::subgraph::mlu::BatchNormConverter);
USE_SUBGRAPH_BRIDGE(batch_norm, kMLU)
......@@ -25,8 +25,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int ConvConverter(void* ctx, OpLite* op);
void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
......@@ -342,9 +340,5 @@ TEST(MLUBridges, conv) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
conv2d,
paddle::lite::subgraph::mlu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
depthwise_conv2d,
paddle::lite::subgraph::mlu::ConvConverter);
USE_SUBGRAPH_BRIDGE(conv2d, kMLU)
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU)
......@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int ElementwiseConverter(void* ctx, OpLite* op);
template <typename dtype>
void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) {
Scope* scope = op->scope();
......@@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_add,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_sub,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_mul,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_div,
paddle::lite::subgraph::mlu::ElementwiseConverter);
USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU)
USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU)
USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU)
USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU)
......@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int FCConverter(void* ctx, OpLite* op);
void fc_ref(const std::shared_ptr<operators::FcOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
......@@ -170,4 +168,4 @@ TEST(MLUBridges, fc) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter);
USE_SUBGRAPH_BRIDGE(fc, kMLU);
......@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int PoolConverter(void* ctx, OpLite* op);
void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
......@@ -275,6 +273,4 @@ TEST(MLUBridges, pool) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
pool2d,
paddle::lite::subgraph::mlu::PoolConverter);
USE_SUBGRAPH_BRIDGE(pool2d, kMLU)
......@@ -23,8 +23,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int SoftmaxConverter(void* ctx, OpLite* op);
template <typename dtype>
void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
Scope* scope = op->scope();
......@@ -171,6 +169,4 @@ TEST(MLUBridges, softmax) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
softmax,
paddle::lite::subgraph::mlu::SoftmaxConverter);
USE_SUBGRAPH_BRIDGE(softmax, kMLU)
......@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names) {
CNRT_CALL(cnrtInit(0));
SetMluDevice(0);
::paddle::lite::SetMluDevice(0);
cnrtQueue_t queue_;
cnrtInvokeFuncParam_t forward_param;
u32_t affinity = 1;
......
......@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL(
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
// host_to_device)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
// .Finalize();
//
//
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
// device_to_host)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
......@@ -46,6 +46,32 @@ class SubgraphEngine : public subgraph::Engine {
graph_.SetFPType(type);
}
// Builds the origin program and then the device program. The device build
// status is cached in build_device_program_status_ and returned to the caller.
int Build() {
  // The origin program is built first so that all of the ops of the block
  // desc are attached before the device conversion runs.
  BuildOriginProgram();
  // Runs InferShape() of all of the ops and converts the Paddle ops to the
  // MLU IR graph; the resulting status is stored and handed back.
  return build_device_program_status_ = BuildDeviceProgram();
}
// Executes the subgraph once, rebuilding the device program first when
// needed. Always returns 0.
int Launch() {
  // A rebuild is only considered when the cached device build succeeded and
  // its status requests a rebuild on shape changes; InputShapeChanged() is
  // consulted only in that case.
  const bool rebuild_enabled =
      subgraph::CHECK_SUCCESS(build_device_program_status_) &&
      subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_);
  if (rebuild_enabled && InputShapeChanged()) {
    Build();
  }

  // The status is re-read here because Build() may have just replaced it.
  // A failed device build falls back to the origin program.
  if (!subgraph::CHECK_FAILED(build_device_program_status_)) {
    LaunchDeviceProgram();
  } else {
    LaunchOriginProgram();
  }
  return 0;
}
protected:
int BuildDeviceProgram() override {
int status = 0;
......@@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine {
graph_.AddInput(graph_.GetNode(input_name));
}
CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
// auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto core_version = mlu_context.MLUCoreVersion();
// auto core_number = mlu_context.MLUCoreNumber();
// graph_.Compile(core_version, core_number);
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto core_version = mlu_context.MLUCoreVersion();
auto core_number = mlu_context.MLUCoreNumber();
graph_.Compile(core_version, core_number);
return status;
}
int LaunchDeviceProgram() override {
// auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto exec_queue = mlu_context.exec_queue();
// u32_t affinity = mlu_context.affinity();
// cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
// int data_param = 1;
// forward_param.data_parallelism = &data_param;
// forward_param.affinity = &affinity;
// forward_param.end = CNRT_PARAM_END;
// graph_.Compute(forward_param, exec_queue);
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto exec_queue = mlu_context.exec_queue();
u32_t affinity = mlu_context.affinity();
cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
int data_param = 1;
forward_param.data_parallelism = &data_param;
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
graph_.Compute(forward_param, exec_queue);
return 0;
}
......
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM)
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
return()
endif()
......
......@@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Register a second x86 `cast` kernel under the alias fp16_to_any, using the
// CastCompute instantiation for fluid::float16. The kernel itself is
// registered as kX86 / kFloat / kNCHW, while input "X" is declared as a kX86
// tensor with kFP16 precision and output "Out" as a kX86 tensor with the
// default tensor type.
REGISTER_LITE_KERNEL(
cast,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::CastCompute<::paddle::lite::fluid::float16>,
fp16_to_any)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
endif()
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册