Commit 6ca756f3 authored by jackzhang235, committed by GitHub

[MLU] add MLU-related passes, kernels and gtests; modify the API in paddle_api.h (#3307)

[MLU] add basic support for MLU, including related passes, kernels, gtests and new API entries in paddle_api.h
Passes: mlu_subgraph_pass, mlu_postprocess_pass
Kernels: act, batch_norm, concat, conv, elementwise, fc, interpolate, pool, scale, softmax
Parent commit 5dced130
......@@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
......@@ -178,6 +179,10 @@ if(LITE_WITH_XPU)
include(device/xpu)
endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
......
......@@ -153,6 +153,10 @@ if (LITE_WITH_BM)
add_definitions("-DLITE_WITH_BM")
endif()
if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
......
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -100,6 +100,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_MLU)
foreach(var ${lite_deps_MLU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -144,6 +150,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
if (args_SHARED OR ARGS_shared)
......@@ -170,7 +177,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -189,6 +196,7 @@ function(lite_cc_binary TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
......@@ -218,7 +226,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -245,6 +253,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
......@@ -285,12 +295,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -369,6 +379,12 @@ function(add_kernel TARGET device level)
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU)
return()
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
......@@ -409,6 +425,7 @@ function(add_kernel TARGET device level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -427,7 +444,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -462,6 +479,7 @@ function(add_operator TARGET level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -10,6 +10,7 @@ message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......
......@@ -67,7 +67,8 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
......@@ -89,6 +90,7 @@ message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -126,7 +128,8 @@ lite_cc_library(light_api SRCS light_api.cc
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -145,6 +148,7 @@ if(WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -291,6 +295,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -328,6 +333,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -341,6 +347,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -353,6 +360,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -365,6 +373,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -377,6 +386,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
......@@ -388,6 +398,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......
......@@ -42,6 +42,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
}
}
#endif
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_use_first_conv(),
config.mlu_first_conv_mean(),
config.mlu_first_conv_std(),
config.mlu_input_layout());
#endif // LITE_WITH_MLU
std::vector<std::string> passes{};
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
......
......@@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
......
......@@ -204,6 +204,39 @@ void ConfigBase::set_threads(int threads) {
#endif
}
#ifdef LITE_WITH_MLU
void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
mlu_core_version_ = core_version;
}
void CxxConfig::set_mlu_core_number(int core_number) {
mlu_core_number_ = core_number;
}
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout;
}
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
mlu_use_first_conv_ = use_first_conv;
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std;
}
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
return mlu_core_version_;
}
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
}
#endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
......
......@@ -136,6 +136,14 @@ class LITE_API CxxConfig : public ConfigBase {
#ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1;
#endif
#ifdef LITE_WITH_MLU
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
#endif
public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
......@@ -163,6 +171,32 @@ class LITE_API CxxConfig : public ConfigBase {
return x86_math_library_math_threads_;
}
#endif
#ifdef LITE_WITH_MLU
// set MLU core version, which is used when compiling MLU kernels
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set MLU input layout. Users can specify the layout of input data to be
// NHWC; the default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether to use MLU's first conv kernel. First conv is a special kernel
// provided by MLU: its input is uint8, and it also needs two 3-dimensional
// vectors holding the mean and std values of all inputs
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimensional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimensional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
#endif
// XPU only, set the size of the workspace memory from L3 cache for the
// current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
......
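The new CxxConfig entries above are the user-facing way to configure MLU execution. A minimal usage sketch follows (not part of this commit; the include path, model directory and the chosen places are illustrative assumptions), combining the MLU setters with the existing set_valid_places / CreatePaddlePredictor API:

// Sketch only: assumes the library and this file are built with LITE_WITH_MLU.
#include <vector>
#include "paddle_api.h"  // assumed public location of the CxxConfig header

using namespace paddle::lite_api;  // NOLINT

int main() {
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
  // Run the MLU subgraph on an MLU270 with 4 cores and NHWC input layout.
  config.set_mlu_core_version(MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(4);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  // Optionally enable the uint8 first-conv kernel with per-channel mean/std.
  config.set_mlu_use_first_conv(true);
  config.set_mlu_first_conv_mean({124.0f, 117.0f, 104.0f});
  config.set_mlu_first_conv_std({59.0f, 57.0f, 57.0f});
  // MLU kernels are picked when kMLU appears among the valid places.
  config.set_valid_places(
      {Place{TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)},
       Place{TARGET(kX86), PRECISION(kFloat)}});
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  return predictor != nullptr ? 0 : 1;
}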
......@@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) {
"fpga",
"npu",
"xpu",
"bm"};
"bm",
"mlu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -111,6 +112,7 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA",
"kNPU",
"kXPU",
"kMLU",
"kBM"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
......@@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kMLU),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
......
......@@ -53,8 +53,8 @@ enum class TargetType : int {
kNPU = 8,
kXPU = 9,
kBM = 10,
kAny = 6, // any target
kMLU = 11,
kAny = 6, // any target
NUM = 12, // number of fields.
};
enum class PrecisionType : int {
......@@ -89,6 +89,8 @@ typedef enum {
LITE_POWER_RAND_LOW = 5
} PowerMode;
typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
enum class ActivationType : int {
kIndentity = 0,
kRelu = 1,
......
......@@ -45,6 +45,8 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
......
......@@ -47,6 +47,7 @@ using lite_api::TargetType;
using lite_api::PrecisionType;
using lite_api::DataLayoutType;
using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl;
using lite_api::OptBase;
......@@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m);
static void BindLitePowerMode(py::module *m);
static void BindLitePlace(py::module *m);
static void BindLiteTensor(py::module *m);
static void BindLiteMLUCoreVersion(py::module *m);
void BindLiteApi(py::module *m) {
BindLiteCxxConfig(m);
......@@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) {
BindLitePowerMode(m);
BindLitePlace(m);
BindLiteTensor(m);
BindLiteMLUCoreVersion(m);
#ifndef LITE_ON_TINY_PUBLISH
BindLiteCxxPredictor(m);
#endif
......@@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) {
.def("set_power_mode", &CxxConfig::set_power_mode)
.def("power_mode", &CxxConfig::power_mode);
#endif
#ifdef LITE_WITH_MLU
cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
.def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
.def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
.def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
.def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
.def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
#endif
}
// TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
......@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) {
.value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW);
}
void BindLiteMLUCoreVersion(py::module *m) {
py::enum_<MLUCoreVersion>(*m, "MLUCoreVersion")
.value("LITE_MLU_220", MLUCoreVersion::MLU_220)
.value("LITE_MLU_270", MLUCoreVersion::MLU_270);
}
void BindLitePlace(py::module *m) {
// TargetType
py::enum_<TargetType>(*m, "TargetType")
......@@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) {
.value("OpenCL", TargetType::kOpenCL)
.value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU)
.value("Any", TargetType::kAny);
// PrecisionType
......@@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) {
DO_GETTER_ONCE(data_type__, name__##_data)
DATA_GETTER_SETTER_ONCE(int8_t, int8);
#ifdef LITE_WITH_MLU
tensor.def("set_uint8_data",
[](Tensor &self,
const std::vector<uint8_t> &data,
TargetType type = TargetType::kHost) {
if (type == TargetType::kHost) {
self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data());
}
},
py::arg("data"),
py::arg("type") = TargetType::kHost);
DO_GETTER_ONCE(uint8_t, "uint8_data");
#endif
DATA_GETTER_SETTER_ONCE(int32_t, int32);
DATA_GETTER_SETTER_ONCE(float, float);
#undef DO_GETTER_ONCE
......
......@@ -6,4 +6,5 @@ add_subdirectory(fpga)
add_subdirectory(host)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
......@@ -8,7 +8,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
XPU_DEPS target_wrapper_xpu
CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm)
BM_DEPS target_wrapper_bm
MLU_DEPS target_wrapper_mlu)
lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -24,6 +24,11 @@
#include "lite/backends/opencl/cl_context.h"
#include "lite/backends/opencl/cl_runtime.h"
#endif
#ifdef LITE_WITH_MLU
#include <cnml.h>
#include <cnrt.h>
#include "lite/backends/mlu/mlu_utils.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/xpu_header_sitter.h"
#endif
......@@ -202,6 +207,85 @@ class Context<TargetType::kFPGA> {
};
#endif
#ifdef LITE_WITH_MLU
template <>
class Context<TargetType::kMLU> {
public:
typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global();
void InitOnce() {}
MLUContext& operator=(const MLUContext& ctx) {
this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
return *this;
}
void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
LOG(WARNING) << "device index exceeds the number of devices, set to "
"default device(0)!";
device_id_ = 0;
} else {
device_id_ = dev_id;
}
SetMluDevice(device_id_);
if (io_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "data queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
io_queue_id = 0;
}
if (exec_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
exec_queue_id = 0;
}
io_queue_ = devs[dev_id].io_queues()[io_queue_id];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
exec_queue_id_ = exec_queue_id;
io_queue_id_ = io_queue_id;
}
void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
const cnrtQueue_t& exec_queue() const { return exec_queue_; }
void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; }
const cnrtQueue_t& io_queue() const { return io_queue_; }
void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
cnmlCoreVersion_t MLUCoreVersion() {
return DeviceInfo::Global().MLUCoreVersion();
}
int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
u32_t affinity() { return affinity_; }
cnrtInvokeFuncParam_t forward_param() { return forward_param_; }
int device_id() { return device_id_; }
std::string name() const { return "MLUContext"; }
private:
int device_id_;
// overall information
int exec_queue_id_;
int io_queue_id_;
cnrtQueue_t io_queue_;
cnrtQueue_t exec_queue_;
std::vector<cnrtNotifier_t> input_notifiers_;
std::vector<cnrtNotifier_t> output_notifiers_;
cnrtInvokeFuncParam_t forward_param_;
u32_t affinity_ = 0x01;
};
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA
// Only works with CUDA kernels.
template <>
......@@ -428,6 +512,16 @@ class ContextScheduler {
kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
&ctx->As<BMContext>());
break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
auto& context = ctx->As<MLUContext>();
context.Init(dev_id);
kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
&context);
LOG(INFO) << "New Context for MLU";
} break;
#endif
default:
#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
......@@ -469,6 +563,9 @@ class ContextScheduler {
#endif
#ifdef LITE_WITH_BM
InitContext<TargetType::kBM, BMContext>();
#endif
#ifdef LITE_WITH_MLU
InitContext<TargetType::kMLU, MLUContext>();
#endif
}
......
......@@ -58,7 +58,7 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
thread_local lite_api::PowerMode DeviceInfo::mode_;
thread_local ARMArch DeviceInfo::arch_;
thread_local int DeviceInfo::mem_size_;
......@@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0;
#ifdef LITE_WITH_MLU
thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
thread_local int DeviceInfo::mlu_core_number_{1};
thread_local bool DeviceInfo::use_first_conv_{false};
thread_local std::vector<float> DeviceInfo::mean_vec_;
thread_local std::vector<float> DeviceInfo::std_vec_;
thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
#endif
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
......@@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() {
return 0;
}
#ifdef LITE_WITH_MLU
void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
use_first_conv_ = use_first_conv;
mean_vec_ = mean_vec;
std_vec_ = std_vec;
input_layout_ = input_layout;
}
cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
bool DeviceInfo::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
#endif // LITE_WITH_MLU
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_);
......@@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) {
#endif // LITE_WITH_ARM
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id) {
LOG(INFO) << "Set mlu device " << device_id;
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
}
void Device<TARGET(kMLU)>::Init() {
SetMluDevice(idx_);
GetInfo();
CreateQueue();
}
void Device<TARGET(kMLU)>::GetInfo() {}
void Device<TARGET(kMLU)>::CreateQueue() {
exec_queue_.clear();
io_queue_.clear();
for (size_t i = 0; i < max_queue_; ++i) {
cnrtQueue_t exec_queue;
cnrtQueue_t io_queue;
cnrtCreateQueue(&exec_queue);
cnrtCreateQueue(&io_queue);
exec_queue_.push_back(exec_queue);
io_queue_.push_back(io_queue);
cnrtCreateQueue(&exec_queue);
exec_queue_.push_back(exec_queue);
}
}
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() {
......
......@@ -19,11 +19,14 @@
#include <vector>
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/mlu_utils.h"
#endif
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum {
kAPPLE = 0,
......@@ -52,6 +55,20 @@ class DeviceInfo {
int Setup();
void SetRunMode(lite_api::PowerMode mode, int thread_num);
#ifdef LITE_WITH_MLU
void SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout);
cnmlCoreVersion_t MLUCoreVersion();
int MLUCoreNumber();
bool UseFirstConv();
const std::vector<float>& MeanVec() const;
const std::vector<float>& StdVec() const;
DataLayoutType InputLayout() const;
#endif
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
......@@ -103,6 +120,15 @@ class DeviceInfo {
static thread_local TensorLite workspace_;
static thread_local int64_t count_;
#ifdef LITE_WITH_MLU
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
#endif
void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...);
void SetFP32Info(int argc, ...);
......@@ -134,6 +160,9 @@ class Env {
return *devs;
}
static void Init(int max_stream = 4) {
#ifdef LITE_WITH_MLU
CNRT_CALL(cnrtInit(0));
#endif
Devs& devs = Global();
if (devs.size() > 0) {
return;
......@@ -156,6 +185,41 @@ class Env {
}
};
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id);
template <>
class Device<TARGET(kMLU)> {
public:
Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {}
void Init();
int id() { return idx_; }
int max_queue() { return max_queue_; }
void SetId(int idx) { idx_ = idx; }
std::string name() { return "MLU"; }
int core_num() { return 16; }
float max_memory() { return 16 * 1024; }
std::vector<cnrtQueue_t> io_queues() { return io_queue_; }
std::vector<cnrtQueue_t> exec_queues() { return exec_queue_; }
private:
void CreateQueue();
void GetInfo();
private:
int idx_{0};
int max_queue_;
std::string device_name_;
float max_memory_;
std::vector<cnrtQueue_t> io_queue_;
std::vector<cnrtQueue_t> exec_queue_;
};
template class Env<TARGET(kMLU)>;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA
template <>
class Device<TARGET(kCUDA)> {
......
......@@ -83,6 +83,9 @@ class KernelBase {
#if defined(LITE_WITH_CUDA)
WorkSpace::Global_CUDA().AllocReset();
#endif
#if defined(LITE_WITH_MLU)
WorkSpace::Global_MLU().AllocReset();
#endif
#ifdef LITE_WITH_PROFILE
profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
......
......@@ -45,6 +45,11 @@ void* TargetMalloc(TargetType target, size_t size) {
data = TargetWrapper<TARGET(kBM)>::Malloc(size);
break;
#endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
data = TargetWrapper<TARGET(kMLU)>::Malloc(size);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
data = TargetWrapperXPU::Malloc(size);
......@@ -88,6 +93,11 @@ void TargetFree(TargetType target, void* data, std::string free_flag) {
TargetWrapper<TARGET(kBM)>::Free(data);
break;
#endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::Free(data);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
TargetWrapperXPU::Free(data);
......@@ -124,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
break;
#endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::MemcpySync(
dst, src, size, IoDirection::HtoD);
break;
#endif
#ifdef LITE_WITH_OPENCL
case TargetType::kOpenCL:
TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
......
......@@ -31,6 +31,10 @@
#include "lite/backends/bm/target_wrapper.h"
#endif // LITE_WITH_BM
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/target_wrapper.h"
#endif // LITE_WITH_XPU
......@@ -79,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
TargetWrapperCL::MemcpySync(dst, src, size, dir);
break;
#endif // LITE_WITH_OPENCL
#ifdef LITE_WITH_MLU
case TARGET(kMLU):
TargetWrapperMlu::MemcpySync(dst, src, size, dir);
break;
#endif
#ifdef LITE_WITH_FPGA
case TARGET(kFPGA):
TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
......
......@@ -37,6 +37,7 @@ lite_cc_library(mir_passes
demo_pass.cc
runtime_context_assign_pass.cc
memory_optimize_pass.cc
mlu_postprocess_pass.cc
weight_quantization_preprocess_pass.cc
quantized_op_attributes_inference_pass.cc
DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
......
......@@ -15,7 +15,6 @@
#include "lite/core/mir/mlu_postprocess_pass.h"
#include <list>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
......@@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 4); // FP16
op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "transpose") {
} else if (op_type == "layout") {
// NCHW -> NHWC
op_desc.SetAttr<std::vector<int>>("axis", {0, 2, 3, 1});
op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetInput("Input", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cur_node->AsArg().name});
......@@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else if (op_type == "transpose") {
is_found = true;
} else if (op_type == "layout") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) &&
DataLayoutCompatible(*out_arg_ty, *cast_type) &&
// for first conv
PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
......@@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
// we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt();
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
if (op_type == "layout") {
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break;
}
}
......@@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
cast_arg->AsArg().type = cast_type;
auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
// for CastAfter, manually set the tensor's type
var->GetMutable<::paddle::lite::Tensor>();
var->GetMutable<paddle::lite::Tensor>();
// create the stmt node
auto* cast_inst = graph->NewInstructNode();
......@@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 5); // FP16
op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "transpose") {
} else if (op_type == "layout") {
// NHWC -> NCHW
op_desc.SetAttr<std::vector<int>>("axis", {0, 3, 1, 2});
op_desc.SetInput("X", {cast_arg_name});
op_desc.SetInput("Input", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cast_arg_name});
......@@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) {
is_found = true;
}
} else if (op_type == "transpose") {
is_found = true;
} else if (op_type == "layout") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cast_type) &&
DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
......@@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
// we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt();
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
if (op_type == "layout") {
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break;
}
}
......@@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
auto* cur_node = head_node;
const auto name_prefix =
head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
bool is_first_conv_head =
std::find(first_conv_nodes_.begin(),
first_conv_nodes_.end(),
head_node->AsArg().name) != first_conv_nodes_.end();
// layout cast node
if (head_type->layout() != inst_type->layout()) {
// precision cast node
if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
cur_node = InsertCastBefore(
"transpose",
name_prefix + "transpose",
"cast",
name_prefix + "cast",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
head_type->target(), head_type->precision(), inst_type->layout()));
head_type->target(), inst_type->precision(), head_type->layout()));
}
// precision cast node
if (head_type->precision() != inst_type->precision()) {
// layout cast node
if (head_type->layout() != inst_type->layout()) {
cur_node = InsertCastBefore(
"cast",
name_prefix + "cast",
"layout",
name_prefix + "layout",
graph,
cur_node,
inst_node,
......@@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
// get subgraph's valid precision
const auto& places = graph->valid_places();
std::set<::paddle::lite_api::PrecisionType> prec_set;
std::set<paddle::lite_api::PrecisionType> prec_set;
for (const auto& place : places) {
if (place.target == TARGET(kMLU)) {
prec_set.insert(place.precision);
......@@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
const auto name_prefix =
tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
// layout cast node
if (tail_type->layout() != inst_type->layout()) {
// precision cast node
if (tail_type->precision() != inst_type->precision()) {
cur_node = InsertCastAfter(
"transpose",
name_prefix + "transpose",
"cast",
name_prefix + "cast",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
tail_type->target(), tail_type->precision(), inst_type->layout()));
tail_type->target(), inst_type->precision(), tail_type->layout()));
}
// precision cast node
if (tail_type->precision() != inst_type->precision()) {
// layout cast node
if (tail_type->layout() != inst_type->layout()) {
cur_node = InsertCastAfter(
"cast",
name_prefix + "cast",
"layout",
name_prefix + "layout",
graph,
cur_node,
inst_node,
......@@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateOutputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
/* graph like this
* subgraph_op_0
* / \
* / \
* subgraph_op_1 host_op
*/
UpdateInputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
......@@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
}
}
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock();
for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
if (op_desc->Type() == "conv2d") {
for (auto& names : op_desc->inputs()) {
if (std::find(names.second.begin(),
names.second.end(),
arg_node->AsArg().name) != names.second.end()) {
return true;
}
}
}
}
return false;
}
bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) {
CHECK(arg_node->IsArg());
for (auto& inst : arg_node->outlinks) {
if (inst->AsStmt().op_type() == "subgraph") {
return IsFirstConvInSubgraph(arg_node, inst);
}
}
return false;
}
void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
if (node.AsStmt().op_type() == "feed") {
for (auto& out : node.outlinks) {
if (IsFirstConvNode(out)) {
first_conv_nodes_.insert(out->AsArg().name);
// modify first conv nodes' type
const auto* old_type = out->AsArg().type;
out->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
paddle::lite_api::PrecisionType::kInt8,
old_type->layout(),
old_type->device());
}
}
}
}
}
void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
......@@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
out->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC,
paddle::lite_api::DataLayoutType::kNHWC,
old_type->device());
}
}
......@@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
inp->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC,
paddle::lite_api::DataLayoutType::kNHWC,
old_type->device());
}
}
......@@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
}
void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// currently for non-persistent input and output args, mlu subgraph op
// only support float16/float32 data type
// in two situations as folllows:
// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
// arg_in and arg_out are assumed to be NHWC which user should be aware of.
// Thus here we change these args' layout to NHWC
ModifyLayout(graph.get());
// currently, for non-persistent input and output args, the MLU subgraph op
// only supports float16/float32 data types
// in the two situations as follows:
// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
// arg_in and arg_out are assumed to be NHWC, which the user should be aware of.
// Thus here we change these args' layout to NHWC
#ifdef LITE_WITH_MLU
if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) {
ModifyLayout(graph.get());
}
if (lite::DeviceInfo::Global().UseFirstConv()) {
GatherAndModifyFirstConvNodes(graph.get());
}
#endif
// insert io_copy, layout and precision cast of subgraph's inputs and outputs
for (auto& node : graph->mutable_nodes()) {
......
......@@ -15,6 +15,7 @@
#pragma once
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "lite/core/mir/pass.h"
......@@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass {
const Type* cast_type);
void RecreateOp(Node* inst_node, SSAGraph* graph);
void GatherAndModifyFirstConvNodes(SSAGraph* graph);
bool IsFirstConvNode(Node* arg_node);
bool IsFirstConvInSubgraph(Node* arg_node, Node* inst);
private:
std::set<std::string> first_conv_nodes_;
};
} // namespace mir
......
......@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() {
return adj_list;
}
std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildNodeAdjList() {
std::map<mir::Node *, std::set<mir::Node *>> adj_list;
for (auto &n : mutable_nodes()) {
if (adj_list.find(&n) == adj_list.end()) {
adj_list[&n] = std::set<mir::Node *>();
}
std::vector<mir::Node *> nodes;
for (auto &var : n.inlinks) {
nodes.push_back(var);
}
std::sort(nodes.begin(),
nodes.end(),
[](mir::Node *node1, mir::Node *node2) { return node1 > node2; });
adj_list[&n].insert(std::make_move_iterator(nodes.begin()),
std::make_move_iterator(nodes.end()));
}
return adj_list;
}
void SSAGraph::SortHelper(
const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node,
......@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() {
return res;
}
std::vector<mir::Node *> SSAGraph::NodeTopologicalOrder() {
CheckBidirectionalConnection();
std::stack<mir::Node *> stack;
std::set<mir::Node *> visited;
std::vector<mir::Node *> res;
auto adj_list = BuildNodeAdjList();
for (auto adj : adj_list) {
if (visited.find(adj.first) == visited.end()) {
SortHelper(adj_list, adj.first, &visited, &res);
}
}
return res;
}
Node *SSAGraph::GraphCreateInstructNode(
const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) {
node_storage_.emplace_back();
......
......@@ -42,6 +42,8 @@ class SSAGraph : GraphBase {
std::vector<mir::Node *> StmtTopologicalOrder();
std::vector<mir::Node *> NodeTopologicalOrder();
// The inputs of the graph.
std::vector<mir::Node *> inputs();
......@@ -86,6 +88,9 @@ class SSAGraph : GraphBase {
// Build operator inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();
// Build node inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList();
void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node,
std::set<mir::Node *> *visited,
......
......@@ -312,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) {
std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
node_map_t *nodes) {
for (auto &it : *nodes) {
node_dat_t *node = it.second;
for (auto &ordered_node : graph_->NodeTopologicalOrder()) {
// different orders when traversing nodes in the graph may lead to
// different subgraph divisions, which may generate different results
// on devices such as MLU. These different results are all "right"
// but a little confusing. Thus the topological order is used instead
// of the address of the node in the graph.
CHECK(nodes->find(ordered_node) != nodes->end());
node_dat_t *node = (*nodes)[ordered_node];
if (!node->marked) {
continue;
}
......@@ -571,13 +577,14 @@ void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
unused_var_nodes->insert(var_node);
continue;
}
// Var can have more than one next op node, So, if any one in the
// op_nodes then continue
bool next_op_in_nodes = false;
// Var can have more than one next op node, So, if all next nodes are in
// op_nodes then it should be put into local_var_nodes
bool next_op_in_nodes = true;
for (auto &next_op_node : var_node->outlinks) {
if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) !=
if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) ==
op_nodes.end()) {
next_op_in_nodes = true;
next_op_in_nodes = false;
break;
}
}
if (next_op_in_nodes) {
......
......@@ -69,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -79,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)});
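For context on how mlu_subgraph_pass discovers supported operators: USE_SUBGRAPH_BRIDGE is locally redefined in MLUSubgraphPass::Apply to insert each op type into supported_lists, and the op list itself comes from lite/kernels/mlu/bridges/paddle_use_bridges.h, which is not shown in this diff. Based on the bridge registrations visible in this commit, that header is expected to look roughly like the sketch below (an assumption, not the verbatim file):

// lite/kernels/mlu/bridges/paddle_use_bridges.h (assumed contents)
#pragma once

USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
USE_SUBGRAPH_BRIDGE(relu, kMLU)
USE_SUBGRAPH_BRIDGE(tanh, kMLU)
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
USE_SUBGRAPH_BRIDGE(batch_norm, kMLU)
// ...plus one entry per remaining bridge added in this commit
// (conv, elementwise, fc, pool, softmax, scale, interpolate, concat).

// Inside MLUSubgraphPass::Apply each entry expands to
//   supported_lists.insert("sigmoid");
// so the teller lambda marks those statements as fusable into an MLU subgraph.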
......@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class MLUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
class SubgraphCastDisplayPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
VLOG(3) << "== Argument types ==";
for (auto& node : graph->mutable_nodes()) {
if (!node.IsArg()) continue;
auto* type = node.AsArg().type;
if (type) {
VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
} else {
VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
}
}
VLOG(3) << "---------------------";
//
VLOG(0) << "== SubgraphOp Debug Info ==";
for (auto& node : graph->mutable_nodes()) {
if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
VLOG(0) << "FOUND SUBGRAPH OP";
display_debug_info(node, "subgraph");
break;
}
}
VLOG(0) << "---------------------";
}
void display_debug_info(const Node& node,
std::string op_type,
bool display_in_nodes = true,
bool display_out_nodes = true) {
CHECK(node.IsStmt());
VLOG(0) << node.AsStmt();
if (display_in_nodes) {
for (auto p_in_arg_node : node.inlinks) {
CHECK(p_in_arg_node->IsArg());
VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
<< " type: " << *p_in_arg_node->AsArg().type
<< " is_weight: " << p_in_arg_node->AsArg().is_weight
<< " is_persist: " << p_in_arg_node->AsArg().is_persist
<< " input_count: " << p_in_arg_node->inlinks.size();
if (p_in_arg_node->inlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
CHECK(p_in_stmt_node->IsStmt());
std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
if (display_out_nodes) {
for (auto p_out_arg_node : node.outlinks) {
CHECK(p_out_arg_node->IsArg());
VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
<< " type: " << *p_out_arg_node->AsArg().type
<< " is_weight: " << p_out_arg_node->AsArg().is_weight
<< " is_persist: " << p_out_arg_node->AsArg().is_persist
<< " output_count: " << p_out_arg_node->outlinks.size();
if (p_out_arg_node->outlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
CHECK(p_out_stmt_node->IsStmt());
std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(subgraph_cast_display_pass,
paddle::lite::mir::SubgraphCastDisplayPass)
.BindTargets({TARGET(kAny)});
......@@ -117,9 +117,15 @@ class Optimizer {
"variable_place_inference_pass", //
"argument_type_display_pass",
"mlu_subgraph_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"mlu_postprocess_pass",
"memory_optimize_pass"}};
if (passes.size() == 1) {
passes_local.push_back(passes[0]);
}
......
......@@ -69,6 +69,13 @@ class WorkSpace {
}
#endif
#if defined(LITE_WITH_MLU)
static WorkSpace& Global_MLU() {
thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kMLU)));
return *x;
}
#endif
private:
explicit WorkSpace(TargetType x) : target_(x) {}
......
......@@ -10,4 +10,5 @@ add_subdirectory(opencl)
add_subdirectory(fpga)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
......@@ -6,3 +6,4 @@ add_subdirectory(bridges)
add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
......@@ -15,6 +15,9 @@ lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS
lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu})
set(mlu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_mlu
......@@ -26,16 +29,20 @@ set(mlu_subgraph_bridges
subgraph_bridge_softmax_op_mlu
subgraph_bridge_fc_op_mlu
subgraph_bridge_batch_norm_op_mlu
subgraph_bridge_scale_op_mlu
subgraph_bridge_interp_op_mlu
subgraph_bridge_concat_op_mlu
CACHE INTERNAL "mlu_subgraph_bridges")
# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
......@@ -31,20 +31,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Create act node and set params from op
auto fp_type = graph->FPType();
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type);
CHECK(graph->HasNode(x_var_name));
auto input_tensor = graph->GetNode(x_var_name);
cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
cnmlBaseOp_t activation_op;
CNML_CALL(cnmlCreateActiveOp(&activation_op,
act_type,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
if (op_type == "leaky_relu") {
auto alpha = op_info->GetAttr<float>("alpha");
std::vector<int64_t> shape = {1, 1, 1, 1};
std::string alpha_var_name = string_format("leaky_relu_alpha_%p", op);
auto alpha_tensor =
graph->AddNode(alpha_var_name, shape, CNML_CONST, CNML_NHWC, fp_type);
graph->BindConstRawData(alpha_var_name, &alpha, 1, true);
CNML_CALL(cnmlCreatePreluOp(&activation_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
alpha_tensor->mlu_tensor()));
} else {
cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
CNML_CALL(cnmlCreateActiveOp(&activation_op,
act_type,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
}
graph->FuseOp(activation_op);
return SUCCESS;
}
......@@ -54,4 +68,11 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(sigmoid,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
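For reference, leaky_relu keeps positive inputs unchanged and scales negative inputs by alpha, which is why the converter above can lower it to a PReLU op with a single 1x1x1x1 alpha constant. A minimal host-side sketch of that semantics (illustrative helper, not part of the bridge):

// Sketch: reference semantics of leaky_relu, out = x > 0 ? x : alpha * x.
#include <cstddef>
inline void LeakyReluRef(const float* x, float* out, std::size_t n, float alpha) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = x[i] > 0.f ? x[i] : alpha * x[i];
  }
}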
......@@ -25,8 +25,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int ActConverter(void* ctx, OpLite* op);
template void FillTensor<float, int>(Tensor* x,
float lower = -2,
float upper = -2);
......@@ -136,7 +134,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
TEST(MLUBridges, activation) {
std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
std::vector<std::string> types{"sigmoid", "relu", "tanh"};
std::vector<std::string> types{"sigmoid", "relu", "tanh", "leaky_relu"};
for (auto x_shape : shapes) {
for (auto op_type : types) {
test_act(x_shape, op_type);
......@@ -149,8 +147,7 @@ TEST(MLUBridges, activation) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
sigmoid,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
USE_SUBGRAPH_BRIDGE(relu, kMLU)
USE_SUBGRAPH_BRIDGE(tanh, kMLU)
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
......@@ -42,7 +42,7 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode(
y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
y_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
CHECK(graph->HasNode(x_var_name));
......
......@@ -23,8 +23,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int BatchNormConverter(void* ctx, OpLite* op);
template <typename dtype>
void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) {
Scope* scope = op->scope();
......@@ -139,9 +137,7 @@ void test_batch_norm(
{bs, ic, ih, iw},
{0, 2, 3, 1});
out->Resize({bs, ih, iw, ic});
x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name});
......@@ -181,6 +177,4 @@ TEST(MLUBridges, batch_norm) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
batch_norm,
paddle::lite::subgraph::mlu::BatchNormConverter);
USE_SUBGRAPH_BRIDGE(batch_norm, kMLU)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("X");
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto param_axis = op_info->GetAttr<int>("axis");
std::vector<cnmlTensor_t> input_tensor;
for (auto x_name : x_var_name) {
CHECK(graph->HasNode(x_name));
input_tensor.push_back(graph->GetNode(x_name)->mlu_tensor());
}
auto dims = output_dims.size();
int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
CHECK_LT(axis, 4) << "Unsupported axis in mlu concat";
int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
int nhwc_axis = nchw_to_nhwc_axis_map[axis];
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlBaseOp_t concat_op;
cnmlTensor_t outputs = output_tensor->mlu_tensor();
CNML_CALL(cnmlCreateNdConcatOp(&concat_op,
nhwc_axis,
input_tensor.data(),
x_var_name.size(),
&outputs,
1));
graph->FuseOp(concat_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kMLU,
paddle::lite::subgraph::mlu::ConcatConverter);
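The nchw_to_nhwc_axis_map above encodes how dimension indices move between layouts: N stays at 0, C moves to the last position, and H/W each shift forward by one. A small sketch of the same lookup (hypothetical helper name):

// Sketch: remap an axis given in NCHW order to the equivalent NHWC axis.
// Assumes the axis has already been normalized into [0, 4).
inline int NchwAxisToNhwcAxis(int axis) {
  static const int kMap[4] = {0, 3, 1, 2};  // N->0, C->3, H->1, W->2
  return kMap[axis];
}
// Example: concatenating along channels (axis 1 in NCHW) maps to axis 3 on
// the NHWC-ordered MLU tensors.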
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/concat_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void concat_ref(const std::shared_ptr<operators::ConcatOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = op_info->Input("X");
std::vector<lite::Tensor*> inputs;
for (auto var : x) {
inputs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
}
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
int axis = op_info->GetAttr<int>("axis");
std::vector<lite::Tensor*> inputs_concat(inputs.size());
for (int j = 0; j < inputs.size(); ++j) {
inputs_concat[j] = inputs[j];
}
size_t num = inputs.size();
int rows = 1;
auto dim_0 = inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> inputs_cols(inputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = inputs[i]->numel() / rows;
out_cols += t_cols;
inputs_cols[i] = t_cols;
}
for (int k = 0; k < out_rows; ++k) {
float* dst_ptr = out->mutable_data<float>() + k * out_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = inputs_cols[j];
const float* src_ptr = inputs[j]->data<float>() + k * col_len;
std::memcpy(dst_ptr + col_idx, src_ptr, sizeof(float) * col_len);
col_idx += col_len;
}
}
}
void test_concat(std::vector<std::vector<int64_t>> input, int axis) {
std::string x_var_name = "x";
std::string y_var_name = "y";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
// prepare input&output variables
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
x->Resize(DDim(input[0]));
y->Resize(DDim(input[1]));
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
CHECK_EQ(out->dims(), out_ref->dims());
// initialize input&output data
FillTensor<float>(x);
FillTensor<float>(y);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("concat");
opdesc.SetInput("X", {x_var_name, y_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
auto op = CreateOp<operators::ConcatOpLite>(opdesc, &scope);
concat_ref(op);
out_ref->CopyDataFrom(*out);
Tensor input_x, input_y;
input_x.Resize(DDim(input[0]));
input_y.Resize(DDim(input[1]));
transpose(x->mutable_data<float>(),
input_x.mutable_data<float>(),
{static_cast<int>(input[0][0]),
static_cast<int>(input[0][1]),
static_cast<int>(input[0][2]),
static_cast<int>(input[0][3])},
{0, 2, 3, 1});
transpose(y->mutable_data<float>(),
input_y.mutable_data<float>(),
{static_cast<int>(input[1][0]),
static_cast<int>(input[1][1]),
static_cast<int>(input[1][2]),
static_cast<int>(input[1][3])},
{0, 2, 3, 1});
x->CopyDataFrom(input_x);
y->CopyDataFrom(input_y);
LaunchOp(op, {x_var_name, y_var_name}, {out_var_name});
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(out->dims());
auto os = out->dims();
transpose(out_data,
output_trans.mutable_data<float>(),
{static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])},
{0, 3, 1, 2});
out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4);
}
}
TEST(MLUBridges, concat) {
test_concat({{3, 3, 5, 2}, {2, 3, 5, 2}}, 0);
test_concat({{3, 5, 5, 2}, {3, 1, 5, 2}}, 1);
test_concat({{3, 3, 2, 2}, {3, 3, 4, 2}}, 2);
test_concat({{3, 3, 5, 2}, {3, 3, 5, 6}}, 3);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(concat, kMLU);
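The transpose(...) calls with axis orders {0, 2, 3, 1} and {0, 3, 1, 2} in these tests convert host data between NCHW and NHWC, since the MLU side consumes NHWC buffers while the reference results are computed in NCHW. A minimal sketch of the forward conversion, assuming contiguous float data:

// Sketch: copy an NCHW buffer into NHWC order (equivalent to transpose with
// axis order {0, 2, 3, 1}); the reverse direction uses {0, 3, 1, 2}.
#include <cstdint>
inline void NchwToNhwc(const float* src, float* dst,
                       int64_t n, int64_t c, int64_t h, int64_t w) {
  for (int64_t in = 0; in < n; ++in)
    for (int64_t ih = 0; ih < h; ++ih)
      for (int64_t iw = 0; iw < w; ++iw)
        for (int64_t ic = 0; ic < c; ++ic)
          dst[((in * h + ih) * w + iw) * c + ic] =
              src[((in * c + ic) * h + ih) * w + iw];
}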
......@@ -31,15 +31,16 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const auto* scope = op->scope();
VLOG(3) << "[MLU] Converting " << op_info->Type() << "... ";
// Get input, filter and op attributes
// get input, filter and op attributes
const auto input_var_name = op_info->Input("Input").front();
const auto& input_dims_nhwc =
const auto& input_dims =
scope->FindVar(input_var_name)->GetMutable<Tensor>()->dims();
const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
const auto filter_var_name = op_info->Input("Filter").front();
auto* filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
const auto& filter_dims = filter->dims();
const auto output_var_name = op_info->Output("Output").front();
auto* output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
const auto output_shape = output->dims().Vectorize();
const auto bs = input_dims[0];
const auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4);
......@@ -70,24 +71,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
input_dims,
filter_dims);
std::vector<int64_t> output_shape({bs, oc});
for (size_t i = 0; i < 2; i++) {
const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1;
output_shape.push_back(
(input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) /
strides[i] +
1);
}
const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
const auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
const auto output_tensor = graph->AddNode(
output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
// Create filter node
const auto filter_tensor = graph->AddNode(filter_var_name,
......@@ -119,14 +104,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
LOG(FATAL) << "Unsupported weight precision!";
}
cnmlConvOpParam_t conv_param;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[0] * 2,
paddings[2] * 2));
std::string bias_var_name;
std::shared_ptr<MLUTensor> bias_tensor;
if (HasInputArg(op_info, scope, "Bias")) {
......@@ -160,15 +137,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
graph->FPType());
graph->BindConstData(bias_var_name, bias);
}
cnmlBaseOp_t conv_op;
const auto input_scale = op_info->GetAttr<float>("input_scale");
CNML_CALL(cnmlCreateConvOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
bool use_first_conv = false;
if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) {
use_first_conv = true;
}
cnmlBaseOp_t conv_op;
if (use_first_conv) {
cnmlConvFirstOpParam_t conv_param;
CNML_CALL(cnmlCreateConvFirstOpParam_V2(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[2],
paddings[2],
paddings[0],
paddings[0]));
const auto mean_tensor = graph->AddNode("first_conv_mean_tensor",
std::vector<int64_t>{3},
CNML_CONST,
CNML_CNHW,
graph->FPType());
const auto std_tensor = graph->AddNode("first_conv_std_tensor",
std::vector<int64_t>{3},
CNML_CONST,
CNML_CNHW,
graph->FPType());
graph->BindConstRawData("first_conv_mean_tensor",
lite::DeviceInfo::Global().MeanVec().data(),
3,
false);
graph->BindConstRawData("first_conv_std_tensor",
lite::DeviceInfo::Global().StdVec().data(),
3,
false);
graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8);
CNML_CALL(cnmlCreateConvFirstOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
mean_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr,
std_tensor->mlu_tensor()));
CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param));
} else {
cnmlConvOpParam_t conv_param;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[0] * 2,
paddings[2] * 2));
CNML_CALL(cnmlCreateConvOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
}
graph->SetComputingDataType(
conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
......@@ -183,7 +220,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
graph->BindConstData(filter_var_name, filter);
graph->FuseOp(conv_op);
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
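The output-shape loop removed above computed the usual convolution size; the bridge now takes the dims already inferred for the output var. The underlying formula is still handy when sanity-checking a converted graph, so here it is as a small hedged sketch:

// Sketch: spatial output size of a convolution window,
// out = (in + pad_before + pad_after - dkernel) / stride + 1,
// where dkernel = dilation * (kernel - 1) + 1.
inline int64_t ConvOutSize(int64_t in, int pad_before, int pad_after,
                           int kernel, int stride, int dilation) {
  const int64_t dkernel = dilation * (kernel - 1) + 1;
  return (in + pad_before + pad_after - dkernel) / stride + 1;
}
// Example: in = 32, pads = 1/1, kernel = 3, stride = 2, dilation = 1 -> 16.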
......@@ -25,8 +25,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int ConvConverter(void* ctx, OpLite* op);
void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
......@@ -246,10 +244,6 @@ void test_conv(int bs,
}
}
input->Resize({bs, ih, iw, ic});
output->Resize(
{output_shape[0], output_shape[2], output_shape[3], output_shape[1]});
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ConvOpLite>(opdesc_mlu, &scope);
LaunchOp(op, {input_var_name}, {output_var_name});
......@@ -342,9 +336,5 @@ TEST(MLUBridges, conv) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
conv2d,
paddle::lite::subgraph::mlu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
depthwise_conv2d,
paddle::lite::subgraph::mlu::ConvConverter);
USE_SUBGRAPH_BRIDGE(conv2d, kMLU)
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU)
......@@ -77,7 +77,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output_tensor = graph->AddNode(out_var_name,
x->dims().Vectorize(),
CNML_TENSOR,
CNML_NHWC,
CNML_NCHW,
graph->FPType());
cnmlBaseOp_t elementwise_op;
......@@ -90,7 +90,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto mid_tensor = graph->AddNode(out_var_name + "_mid",
x->dims().Vectorize(),
CNML_TENSOR,
CNML_NHWC,
CNML_NCHW,
graph->FPType());
CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op,
x_tensor->mlu_tensor(),
......
......@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int ElementwiseConverter(void* ctx, OpLite* op);
template <typename dtype>
void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) {
Scope* scope = op->scope();
......@@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_add,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_sub,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_mul,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_div,
paddle::lite::subgraph::mlu::ElementwiseConverter);
USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU)
USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU)
USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU)
USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU)
......@@ -37,6 +37,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
auto output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
auto x_dims = x->dims();
auto w_dims = w->dims();
......@@ -50,15 +51,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto input_scale = op_info->GetAttr<float>("input_scale");
std::vector<int64_t> output_shape_nhwc({1, 1, 1, w_dims[1]});
auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc,
output->dims().Vectorize(),
CNML_TENSOR,
CNML_NHWC,
CNML_NCHW,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
std::string bias_var_name;
std::shared_ptr<MLUTensor> bias_tensor;
......
......@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int FCConverter(void* ctx, OpLite* op);
void fc_ref(const std::shared_ptr<operators::FcOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
......@@ -141,15 +139,34 @@ void test_fc(const std::vector<int64_t>& input_shape,
}
auto fc_op_mlu = CreateOp<operators::FcOpLite>(fc_op_desc_mlu, &scope);
input->Resize({static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]),
static_cast<int>(input_shape[1])});
out->Resize({static_cast<int>(input_shape[0]), static_cast<int>(w_shape[1])});
Tensor input_tmp, out_tmp;
input_tmp.Resize(input_shape);
transpose(input->mutable_data<float>(),
input_tmp.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[1]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3])},
{0, 2, 3, 1});
input->CopyDataFrom(input_tmp);
LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name});
// compare results
auto os = out->dims();
out_tmp.Resize(os);
auto* out_data = out->mutable_data<float>();
// transpose(out_data,
// out_tmp.mutable_data<float>(),
// {static_cast<int>(os[0]),
// static_cast<int>(os[2]),
// static_cast<int>(os[3]),
// static_cast<int>(os[1])},
// {0, 3, 1, 2});
//
// out_data = out_tmp.mutable_data<float>();
// compare results
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
......@@ -170,4 +187,4 @@ TEST(MLUBridges, fc) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter);
USE_SUBGRAPH_BRIDGE(fc, kMLU);
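As a reminder of what the fc bridge has to reproduce, the reference result is a plain matrix multiply plus a broadcast bias over the flattened input. A minimal sketch, assuming row-major buffers with x as [M, K] and w as [K, N] as in the test above:

// Sketch: out[m][n] = sum_k x[m][k] * w[k][n] + bias[n], matching fc_ref.
inline void FcRefSketch(const float* x, const float* w, const float* bias,
                        float* out, int M, int K, int N) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = bias ? bias[n] : 0.f;
      for (int k = 0; k < K; ++k) {
        acc += x[m * K + k] * w[k * N + n];
      }
      out[m * N + n] = acc;
    }
  }
}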
......@@ -25,12 +25,12 @@ namespace mlu {
std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name,
std::vector<int64_t> shape,
cnmlTensorType_t tensor_type,
cnmlDataOrder_t data_order,
cnmlDataOrder_t shape_order,
cnmlDataType_t mlu_dtype,
void* raw_ptr) {
CHECK(!HasNode(name));
auto node = std::shared_ptr<MLUTensor>(
new MLUTensor(shape, tensor_type, data_order, mlu_dtype));
new MLUTensor(shape, tensor_type, shape_order, mlu_dtype));
node->set_mlu_ptr(raw_ptr);
nodes_.insert(std::make_pair(name, node));
return node;
......
......@@ -23,6 +23,12 @@
#include "lite/core/tensor.h"
#include "lite/kernels/mlu/bridges/tensor.h"
#define PRINT_HW_TIME false
#if PRINT_HW_TIME
#include <mutex> //NOLINT
#endif
namespace paddle {
namespace lite {
namespace subgraph {
......@@ -32,13 +38,30 @@ namespace mlu {
// to the MLU IR graph
class Graph {
public:
Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); }
Graph() {
CNML_CALL(cnmlCreateFusionOp(&fusion_op_));
#if PRINT_HW_TIME
CNRT_CALL(cnrtCreateNotifier(&notifier_start_));
CNRT_CALL(cnrtCreateNotifier(&notifier_end_));
#endif
}
~Graph() {
FreeConstData();
CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
for (auto op : ops_) {
CNML_CALL(cnmlDestroyBaseOp(&op));
}
#if PRINT_HW_TIME
CNRT_CALL(cnrtDestroyNotifier(&notifier_start_));
CNRT_CALL(cnrtDestroyNotifier(&notifier_end_));
double total_time = 0;
for (auto& f : time_log_) {
total_time += f;
}
std::cout << "cnml average hardware time over " << time_log_.size()
<< " runs: " << total_time / time_log_.size() << " ms" << std::endl;
#endif
}
// Data node
......@@ -89,6 +112,10 @@ class Graph {
}
void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
#if PRINT_HW_TIME
thread_local float hw_time;
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
#endif
CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
input_addrs_.data(),
input_addrs_.size(),
......@@ -96,7 +123,61 @@ class Graph {
output_addrs_.size(),
&forward_param,
que));
#if PRINT_HW_TIME
CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
#endif
CNRT_CALL(cnrtSyncQueue(que));
#if PRINT_HW_TIME
CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
hw_time /= 1000.0f;
DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
std::lock_guard<std::mutex> lk(time_mut_);
time_log_.push_back(hw_time);
#endif
}
template <typename T>
void* RegisterConstData(size_t len) {
void* addr = malloc(len * sizeof(T));
const_data_storage_.push_back(addr);
return addr;
}
void FreeConstData() {
for (auto& addr : const_data_storage_) {
free(addr);
}
}
void BindConstRawData(std::string tensor_name,
const float* data,
size_t len,
bool alloc = true) {
void* alloc_data;
if (fp_type_ == CNML_DATA_FLOAT32) {
if (alloc) {
alloc_data = RegisterConstData<float>(len);
memcpy(alloc_data, data, len * sizeof(float));
} else {
alloc_data = const_cast<void*>(static_cast<const void*>(data));
}
CNML_CALL(cnmlBindConstData_V2(
nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
} else if (fp_type_ == CNML_DATA_FLOAT16) {
void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
CNRT_CALL(
cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
CNRT_FLOAT32,
data_fp16,
CNRT_FLOAT16,
len,
nullptr));
CNML_CALL(cnmlBindConstData_V2(
nodes_[tensor_name]->mlu_tensor(), data_fp16, false));
} else {
CHECK(0);
}
}
void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
......@@ -158,6 +239,12 @@ class Graph {
std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
std::vector<cnmlBaseOp_t> ops_;
cnmlFusionOp_t fusion_op_;
std::vector<void*> const_data_storage_;
#if PRINT_HW_TIME
cnrtNotifier_t notifier_start_{}, notifier_end_{};
std::mutex time_mut_;
std::vector<float> time_log_;
#endif
};
} // namespace mlu
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto out = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto x_dims = x->dims();
CHECK_EQ(x_dims.size(), 4);
auto scale = op_info->GetAttr<float>("scale");
auto out_w = op_info->GetAttr<int>("out_w");
auto out_h = op_info->GetAttr<int>("out_h");
auto align_corners = op_info->GetAttr<bool>("align_corners");
CHECK(graph->HasNode(x_var_name));
auto input_tensor = graph->GetNode(x_var_name);
auto in_h = x_dims[2];
auto in_w = x_dims[3];
// Priority: SizeTensor > OutSize > Scale > scale > out_h/out_w
if (HasInputArg(op_info, scope, "SizeTensor")) {
LOG(ERROR) << "SizeTensor input is not supported yet";
CHECK(0);
} else {
if (HasInputArg(op_info, scope, "Scale")) {
LOG(ERROR) << "Scale input is not supported yet";
CHECK(0);
}
if (scale > 0) {
out_h = static_cast<int>(in_h * scale);
out_w = static_cast<int>(in_w * scale);
out_h = out_h > 0 ? out_h : -1;
out_w = out_w > 0 ? out_w : -1;
}
if (HasInputArg(op_info, scope, "OutSize")) {
LOG(ERROR) << "OutSize input is not supported yet";
CHECK(0);
}
}
auto output_tensor = graph->AddNode(out_var_name,
out->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph->FPType());
cnmlBaseOp_t interp_op;
cnmlNearestNeighborOpParam_t nn_param;
CNML_CALL(cnmlCreateNearestNeighborOpParam(&nn_param, out_w, out_h));
CNML_CALL(cnmlSetNearestNeighborAlignCorner(&nn_param, align_corners));
CNML_CALL(cnmlCreateNearestNeighborOp(&interp_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
nn_param));
CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param));
graph->FuseOp(interp_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(nearest_interp,
kMLU,
paddle::lite::subgraph::mlu::InterpolateConverter);
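The converter resolves the target size in the priority order noted in the code, and since the SizeTensor/OutSize/Scale inputs are rejected for now, the effective rule is simply that a positive scale attribute overrides out_h/out_w. A small hedged sketch of that resolution (hypothetical helper):

// Sketch: resolve the nearest_interp output size the way the bridge does.
inline void ResolveInterpSize(int in_h, int in_w, float scale,
                              int* out_h, int* out_w) {
  if (scale > 0.f) {
    *out_h = static_cast<int>(in_h * scale);
    *out_w = static_cast<int>(in_w * scale);
    if (*out_h <= 0) *out_h = -1;
    if (*out_w <= 0) *out_w = -1;
  }
  // otherwise the out_h / out_w attributes are used as-is
}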
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/interpolate_op.h"
#include <gtest/gtest.h>
#include <string>
#include "lite/core/device_info.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
template <typename dtype>
void ResizeNearestAlign(const lite::Tensor* x,
lite::Tensor* out,
bool with_align) {
auto x_dims = x->dims();
int num = x_dims[0];
int channels = x_dims[1];
int hin = x_dims[2];
int win = x_dims[3];
int hout = out->dims()[2];
int wout = out->dims()[3];
dtype scale_w = (with_align) ? (static_cast<float>(win - 1) / (wout - 1))
: (static_cast<float>(win) / (wout));
dtype scale_h = (with_align) ? (static_cast<float>(hin - 1) / (hout - 1))
: (static_cast<float>(hin) / (hout));
const dtype* src = x->data<dtype>();
dtype* dst = out->mutable_data<dtype>();
int dst_stride_w = 1;
int dst_stride_h = wout;
int dst_stride_c = wout * hout;
int dst_stride_batch = wout * hout * channels;
int src_stride_w = 1;
int src_stride_h = win;
int src_stride_c = win * hin;
int src_stride_batch = win * hin * channels;
for (int n = 0; n < num; ++n) {
for (int c = 0; c < channels; ++c) {
int src_index = n * src_stride_batch + c * src_stride_c;
for (int h = 0; h < hout; ++h) {
for (int w = 0; w < wout; ++w) {
int fw = (with_align) ? static_cast<int>(scale_w * w + 0.5)
: static_cast<int>(scale_w * w);
fw = (fw < 0) ? 0 : fw;
int fh = (with_align) ? static_cast<int>(scale_h * h + 0.5)
: static_cast<int>(scale_h * h);
fh = (fh < 0) ? 0 : fh;
int w_start = static_cast<int>(fw);
int h_start = static_cast<int>(fh);
int dst_index = n * dst_stride_batch + c * dst_stride_c +
h * dst_stride_h + w * dst_stride_w;
dst[dst_index] =
src[src_index + w_start * src_stride_w + h_start * src_stride_h];
}
}
}
}
}
template <typename DType>
void BilinearInterpRef(const lite::Tensor* x,
lite::Tensor* out,
bool align_corners,
int align_mode) {
auto x_dims = x->dims();
int batch_size = x_dims[0];
int channel_size = x_dims[1];
auto x_h = x_dims[2];
auto x_w = x_dims[3];
CHECK_EQ(x_dims.size(), 4);
auto out_dims = out->dims();
int out_h = out_dims[2];
int out_w = out_dims[3];
// copy from x if no change
if (x_h == out_h && x_w == out_w) {
out->CopyDataFrom(*x);
return;
}
float ratio_h = 0.f;
float ratio_w = 0.f;
if (out_h > 1) {
ratio_h = (align_corners) ? static_cast<float>(x_h - 1) / (out_h - 1)
: static_cast<float>(x_h) / out_h;
}
if (out_w > 1) {
ratio_w = (align_corners) ? static_cast<float>(x_w - 1) / (out_w - 1)
: static_cast<float>(x_w) / out_w;
}
// naive bilinear interpolation
auto x_data = x->data<DType>();
auto out_data = out->mutable_data<DType>();
bool align_flag = (align_mode == 0 && !align_corners);
std::vector<int> vy_n, vy_s;
std::vector<float> vd_n, vd_s;
vy_n.reserve(out_h);
vy_s.reserve(out_h);
vd_n.reserve(out_h);
vd_s.reserve(out_h);
for (int k = 0; k < out_h; k++) {
int yn = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
: static_cast<int>(ratio_h * k);
yn = (yn > 0) ? yn : 0;
int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1);
float idx_src_y = ratio_h * (k + 0.5) - 0.5;
idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
float dn = align_flag ? idx_src_y - yn : ratio_h * k - yn;
float ds = 1.f - dn;
{
vy_n[k] = yn;
vy_s[k] = ys;
vd_n[k] = dn;
vd_s[k] = ds;
}
}
std::vector<int> vx_w, vx_e;
std::vector<float> vd_w, vd_e;
vx_w.reserve(out_w);
vx_e.reserve(out_w);
vd_w.reserve(out_w);
vd_e.reserve(out_w);
for (int l = 0; l < out_w; l++) {
int xw = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
: static_cast<int>(ratio_w * l);
xw = (xw > 0) ? xw : 0;
int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1);
float idx_src_x = ratio_w * (l + 0.5) - 0.5;
idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
float dw = align_flag ? idx_src_x - xw : ratio_w * l - xw;
float de = 1.f - dw;
{
vx_w[l] = xw;
vx_e[l] = xe;
vd_w[l] = dw;
vd_e[l] = de;
}
}
std::vector<int64_t> x_strides(x_dims.size(), 1);
for (int idx = x_strides.size() - 2; idx >= 0; idx--) {
x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1];
}
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < channel_size; j++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
DType x0 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]];
DType x1 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]];
DType x2 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]];
DType x3 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]];
*out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] +
x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l];
out_data++;
}
}
}
}
}
class InterpComputeTester {
protected:
// common attributes for this op.
std::string x_var_name = "X";
std::string outsize_var_name = "OutSize";
std::string out_var_name = "Out";
std::string out_ref_var_name = "out_ref";
DDim dims_{{1, 2, 3, 4}};
Scope scope;
std::string interp_method_ = "nearest";
float scale_ = -1.f;
int out_h_ = -1;
int out_w_ = -1;
bool align_corners_ = true;
int align_mode_ = 1;
bool use_outsize_ = false;
public:
InterpComputeTester(const std::string& alias,
DDim dims,
std::string interp_method = "nearest",
float scale = -1.f,
int out_h = -1,
int out_w = -1,
bool align_corners = true,
int align_mode = 1,
bool use_outsize = false)
: dims_(dims),
interp_method_(interp_method),
scale_(scale),
out_h_(out_h),
out_w_(out_w),
align_corners_(align_corners),
align_mode_(align_mode),
use_outsize_(use_outsize) {}
void Execute(float abs_error) {
cpp::OpDesc op_desc;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* outsize = scope.Var(outsize_var_name)->GetMutable<Tensor>();
auto* outref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
int out_h = out_h_;
int out_w = out_w_;
if (scale_ > 0) {
out_h = static_cast<int>(dims_[2] * scale_);
out_w = static_cast<int>(dims_[3] * scale_);
}
x->Resize(dims_);
/* printf("----output tensor dims: %ld, %d, %d, %ld\n", dims_[0], out_h,
* out_w, dims_[1]); */
std::vector<int64_t> out_shape_nchw = {dims_[0], dims_[1], out_h, out_w};
outref->Resize(out_shape_nchw);
outsize->Resize({2});
FillTensor<float, float>(x, -1.f, 1.f);
if (use_outsize_) {
outsize->mutable_data<int>()[0] = out_h;
outsize->mutable_data<int>()[1] = out_w;
outsize->set_persistable(true);
}
if (interp_method_ == "nearest") {
op_desc.SetType("nearest_interp");
} else if (interp_method_ == "bilinear") {
op_desc.SetType("bilinear_interp");
} else {
LOG(FATAL) << "unsupported interp method: " << interp_method_;
}
op_desc.SetInput("X", {x_var_name});
if (use_outsize_) {
op_desc.SetInput("OutSize", {outsize_var_name});
}
op_desc.SetOutput("Out", {out_var_name});
op_desc.SetAttr("scale", scale_);
op_desc.SetAttr("out_h", out_h_);
op_desc.SetAttr("out_w", out_w_);
op_desc.SetAttr("align_corners", align_corners_);
op_desc.SetAttr("align_mode", align_mode_);
op_desc.SetAttr("interp_method", interp_method_);
auto op = CreateOp<operators::InterpolateOp>(op_desc, &scope);
if (interp_method_ == "nearest") {
ResizeNearestAlign<float>(x, outref, align_corners_);
} else if (interp_method_ == "bilinear") {
BilinearInterpRef<float>(x, outref, align_corners_, align_mode_);
}
int in = dims_[0], ic = dims_[1], ih = dims_[2], iw = dims_[3];
Tensor input_trans;
input_trans.Resize(dims_);
transpose(x->mutable_data<float>(),
input_trans.mutable_data<float>(),
{in, ic, ih, iw},
{0, 2, 3, 1});
x->CopyDataFrom(input_trans);
if (use_outsize_) {
LaunchOp(op, {x_var_name, outsize_var_name}, {out_var_name});
} else {
LaunchOp(op, {x_var_name}, {out_var_name});
}
auto* out_ref_data = outref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(out_shape_nchw);
transpose(
out->mutable_data<float>(),
output_trans.mutable_data<float>(),
{static_cast<int>(dims_[0]), out_h, out_w, static_cast<int>(dims_[1])},
{0, 3, 1, 2});
auto* out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); ++i) {
EXPECT_NEAR(out_data[i], out_ref_data[i], abs_error);
}
}
};
void TestInterpOuthw(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
/* for (auto interp_method : std::vector<std::string>{"nearest",
* "bilinear"}) { */
for (auto interp_method : std::vector<std::string>{"nearest"}) {
for (int out_h : {6, 8, 12}) {
for (int out_w : {6, 9}) {
printf("testcase %s: out_w %d, out_h %d\n",
interp_method.c_str(),
out_w,
out_h);
InterpComputeTester tester(
"def", DDim(x_dims), interp_method, -1.f, out_h, out_w);
tester.Execute(abs_error);
}
}
}
}
}
void TestInterpScale(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
/* for (auto interp_method : std::vector<std::string>{"nearest",
* "bilinear"}) { */
for (auto interp_method : std::vector<std::string>{"nearest"}) {
for (float scale : {0.3f, 1.f, 1.7f}) {
printf("testcase %s: scale: %f\n", interp_method.c_str(), scale);
InterpComputeTester tester("def", DDim(x_dims), interp_method, scale);
tester.Execute(abs_error);
}
}
}
}
void TestInterpOutsize(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
/* for (auto interp_method : std::vector<std::string>{"nearest",
* "bilinear"}) { */
for (auto interp_method : std::vector<std::string>{"nearest"}) {
printf("testcase %s: outsize: %d %d\n", interp_method.c_str(), 4, 4);
InterpComputeTester tester(
"def", DDim(x_dims), interp_method, -1, 4, 4, true, 1, true);
tester.Execute(abs_error);
}
}
}
void TestInterpAlignCorners(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
for (bool align_corners : {true, false}) {
printf(
"testcase nearest: scale: 0.4, out_w -1 out_h -1, align_corners %d\n",
align_corners);
InterpComputeTester tester(
"def", DDim(x_dims), "nearest", 0.4, -1, -1, align_corners);
tester.Execute(abs_error);
}
}
}
void TestInterpAlignMode(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
for (bool align_corners : {true, false}) {
for (int align_mode : {0, 1}) {
printf(
"testcase bilinear: scale: 0.7, out_w -1 out_h -1, align_corners "
"%d, mode %d\n",
align_corners,
align_mode);
InterpComputeTester tester("def",
DDim(x_dims),
"bilinear",
0.7,
-1,
-1,
align_corners,
align_mode);
tester.Execute(abs_error);
}
}
}
}
TEST(MLUBridges, interpolate) {
float abs_error = 2e-5;
TestInterpOuthw(abs_error);
TestInterpScale(abs_error);
// disabled: the bridge does not support the OutSize input yet
// TestInterpOutsize(abs_error);
TestInterpAlignCorners(abs_error);
// only for bilinear interp
// TestInterpAlignMode(abs_error);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU);
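For reference, BilinearInterpRef above is the standard four-tap bilinear formula: the sampling ratio is (in - 1) / (out - 1) with align_corners and in / out without it, and with d_y, d_x the fractional offsets from the top-left source neighbour (y_0, x_0) and y_1, x_1 the clamped next row/column, the weighting it computes is

\[
\mathrm{out}(y,x) = (1-d_y)(1-d_x)\,I(y_0,x_0) + d_y(1-d_x)\,I(y_1,x_0)
                  + (1-d_y)\,d_x\,I(y_0,x_1) + d_y\,d_x\,I(y_1,x_1).
\]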
......@@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(pool2d, kMLU);
USE_SUBGRAPH_BRIDGE(softmax, kMLU);
USE_SUBGRAPH_BRIDGE(batch_norm, kMLU);
USE_SUBGRAPH_BRIDGE(fc, kMLU);
USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU);
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU);
USE_SUBGRAPH_BRIDGE(concat, kMLU);
USE_SUBGRAPH_BRIDGE(scale, kMLU);
......@@ -47,9 +47,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Get input, and attributes
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_var_name);
auto input_dims_nhwc = x->dims();
const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
auto output_var_name = op_info->Output("Out").front();
auto output_shape = scope->FindTensor(output_var_name)->dims().Vectorize();
auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
......@@ -81,23 +80,17 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
strides,
ksize);
std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
for (size_t i = 0; i < 2; i++) {
output_shape.push_back(
(input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[0]) /
strides[i] +
1);
}
// std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
// for (size_t i = 0; i < 2; i++) {
// output_shape.push_back(
// (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] -
// ksize[0]) /
// strides[i] +
// 1);
// }
auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
auto output_tensor = graph->AddNode(
output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlPoolOpParam_t pool_param;
CNML_CALL(
......
......@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int PoolConverter(void* ctx, OpLite* op);
void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
......@@ -182,12 +180,7 @@ void test_pool(int bs,
{0, 2, 3, 1});
auto os = out->dims();
out->Resize({static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])});
x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name});
......@@ -275,6 +268,4 @@ TEST(MLUBridges, pool) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
pool2d,
paddle::lite::subgraph::mlu::PoolConverter);
USE_SUBGRAPH_BRIDGE(pool2d, kMLU)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Create scale node and set params from op
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
auto bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
auto scale = op_info->GetAttr<float>("scale");
auto bias = op_info->GetAttr<float>("bias");
auto beta = bias_after_scale ? bias : bias * scale;
std::vector<int64_t> shape = {1, 1, 1, 1};
std::string prefix = string_format("_%p", op);
auto alpha_tensor = graph->AddNode(
"Alpha" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType());
auto beta_tensor = graph->AddNode(
"Beta" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType());
graph->BindConstRawData("Alpha" + prefix, &scale, 1);
graph->BindConstRawData("Beta" + prefix, &beta, 1);
auto input_tensor = graph->GetNode(x_var_name);
cnmlBaseOp_t scale_op;
CNML_CALL(cnmlCreateScaleOp(&scale_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
alpha_tensor->mlu_tensor(),
beta_tensor->mlu_tensor()));
graph->FuseOp(scale_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(scale,
kMLU,
paddle::lite::subgraph::mlu::ScaleConverter);
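The beta folding above is just the algebraic identity between the two bias modes of the scale op: with bias_after_scale, out = scale * x + bias; without it, out = scale * (x + bias) = scale * x + scale * bias, so a single fused (alpha, beta) pair covers both. A tiny sketch of the check:

// Sketch: both scale-op modes reduce to out = alpha * x + beta.
inline float ScaleRefSketch(float x, float scale, float bias,
                            bool bias_after_scale) {
  const float beta = bias_after_scale ? bias : bias * scale;
  return scale * x + beta;  // == scale * (x + bias) when !bias_after_scale
}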
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/scale_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void scale_ref(const std::shared_ptr<operators::ScaleOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
float scale = op_info->GetAttr<float>("scale");
float bias = op_info->GetAttr<float>("bias");
bool bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
if (!bias_after_scale) {
bias *= scale;
}
auto x_data = x->data<float>();
auto out_data = out->mutable_data<float>();
DDim x_dims = x->dims();
DDim out_dims = out->dims();
CHECK_EQ(x_dims.production(), out_dims.production());
for (int i = 0; i < out_dims.production(); i++) {
out_data[i] = x_data[i] * scale + bias;
}
}
void test_scale(int bs,
int ic,
int ih,
int iw,
bool bias_after_scale,
float scale,
float bias) {
// prepare input&output variables
Scope scope;
std::string x_var_name("x");
std::string out_var_name("out");
std::string out_ref_var_name("out_ref");
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize({bs, ic, ih, iw});
// initialize input&output data
FillTensor<float, int>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("scale");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("bias_after_scale", bias_after_scale);
opdesc.SetAttr("scale", scale);
opdesc.SetAttr("bias", bias);
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ScaleOp>(opdesc, &scope);
scale_ref(op);
out_ref->CopyDataFrom(*out);
Tensor input_trans;
input_trans.Resize({bs, ic, ih, iw});
transpose(x->mutable_data<float>(),
input_trans.mutable_data<float>(),
{bs, ic, ih, iw},
{0, 2, 3, 1});
auto os = out->dims();
out->Resize({static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])});
x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name});
// execute reference implementation and save to output tensor('out')
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(os);
transpose(out_data,
output_trans.mutable_data<float>(),
{static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])},
{0, 3, 1, 2});
out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
TEST(MLUBridges, scale) {
for (auto bs : {1, 3}) {
for (auto ic : {1, 3}) {
for (auto ih : {3, 4}) {
for (auto iw : {4, 3}) {
for (auto bias_after_scale : {false, true}) {
for (auto scale : {-1.0f, 5.0f}) {
for (auto bias : {-2.0f, 30.0f}) {
VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih
<< " iw: " << iw
// << " bias_after_scale: " << bias_after_scale
<< " scale: " << scale << " bias: " << bias;
test_scale(bs, ic, ih, iw, bias_after_scale, scale, bias);
}
}
}
}
}
}
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(scale, kMLU);
......@@ -45,11 +45,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis = output_dims.size() + axis;
}
}
int nhwc_axis = nchw_to_nhwc_aixs_map[axis];
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlBaseOp_t softmax_op;
CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op,
nhwc_axis,
......
......@@ -23,8 +23,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int SoftmaxConverter(void* ctx, OpLite* op);
template <typename dtype>
void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
Scope* scope = op->scope();
......@@ -112,9 +110,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
{bs, ic, ih, iw},
{0, 2, 3, 1});
out->Resize({bs, ih, iw, ic});
x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name});
......@@ -171,6 +167,4 @@ TEST(MLUBridges, softmax) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
softmax,
paddle::lite::subgraph::mlu::SoftmaxConverter);
USE_SUBGRAPH_BRIDGE(softmax, kMLU)
......@@ -47,6 +47,8 @@ class MLUTensor {
return mlu_ptr_;
}
void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
~MLUTensor();
private:
......
......@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names) {
CNRT_CALL(cnrtInit(0));
SetMluDevice(0);
::paddle::lite::SetMluDevice(0);
cnrtQueue_t queue_;
cnrtInvokeFuncParam_t forward_param;
u32_t affinity = 1;
......@@ -47,7 +47,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const auto& bridges = subgraph::Registry::Instance();
CHECK(bridges.Exists(op_type, TARGET(kMLU)));
// Convert all of input data vars and added into the MLU IR graph
// Convert input data var and add it into the MLU IR graph
for (auto& input_name : input_var_names) {
auto input_tensor = scope->FindMutableTensor(input_name);
CHECK(input_tensor);
......@@ -58,7 +58,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
graph.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NHWC,
CNML_NCHW,
graph.FPType(),
reinterpret_cast<void*>(
input_tensor->mutable_data<float>(TARGET(kMLU))));
......@@ -68,6 +68,8 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
sizeof(float) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV));
}
op->CheckShape();
op->InferShape();
bridges.Select(op_type, TARGET(kMLU))(
reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr);
......
......@@ -84,7 +84,7 @@ struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef ::paddle::lite::fluid::float16 T;
typedef paddle::lite::fluid::float16 T;
};
} // namespace mlu
......
......@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL(
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
// host_to_device)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
// .Finalize();
//
//
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
// device_to_host)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/layout_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFloat,
kNHWC,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFloat)>,
def_layout_nhwc2nchw_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFP16,
kNHWC,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFP16)>,
def_layout_nhwc2nchw_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFloat,
kNHWC,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFloat)>,
def_layout_nchw2nhwc_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFP16,
kNHWC,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFP16)>,
def_layout_nchw2nhwc_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kInt8,
kNHWC,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
def_layout_nchw2nhwc_fp32_int8)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include <string>
#include <vector>
#include "lite/backends/x86/math/math_function.h"
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/operators/layout_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
template <paddle::lite_api::PrecisionType>
struct FPTypeTraits {};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
typedef float T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef paddle::lite::fluid::float16 T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kInt8> {
typedef int8_t T;
};
template <lite::TargetType Target, typename T>
inline void LayoutTransCompute(const int dim,
const lite::Context<Target>& context,
const lite::Tensor& in,
lite::Tensor* out,
const std::vector<int>& axis) {
switch (dim) {
case 2:
paddle::lite::x86::math::Transpose<lite::TargetType::kX86, T, 2> trans2;
trans2(context, in, out, axis);
break;
case 3:
paddle::lite::x86::math::Transpose<lite::TargetType::kX86, T, 3> trans3;
trans3(context, in, out, axis);
break;
case 4:
paddle::lite::x86::math::Transpose<lite::TargetType::kX86, T, 4> trans4;
trans4(context, in, out, axis);
break;
default:
CHECK(0) << "Unsupported dim in mlu layout";
}
}
template <PrecisionType Precision>
class LayoutNchwToNhwcCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
public:
using param_t = operators::LayoutParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
std::vector<int> axis;
switch (x_dims) {
case 2:
axis = {0, 1};
break;
case 3:
axis = {0, 2, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
break;
case 4:
axis = {0, 2, 3, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
break;
default:
CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
if (x_dims > 2) {
out->Resize(origin_dims);
}
}
std::string doc() const override {
return "Mlu layout transform nchw to nhwc";
}
};
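// Editor's note (illustrative, not part of this commit): for the 4-D case the
// permutation {0, 2, 3, 1} means out(n, h, w, c) = in(n, c, h, w), assuming
// the x86 Transpose functor follows Eigen shuffle semantics, so an
// [N, C, H, W] input of shape [1, 3, 224, 224] is laid out as
// [1, 224, 224, 3] after the transpose; the final Resize(origin_dims) only
// restores the dims recorded before the transpose and leaves the transposed
// data untouched.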
template <PrecisionType Precision>
class LayoutNhwcToNchwCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
public:
using param_t = operators::LayoutParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
std::vector<int> axis;
switch (x_dims) {
case 2:
axis = {0, 1};
break;
case 3:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
axis = {0, 2, 1};
break;
case 4:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]});
axis = {0, 3, 1, 2};
break;
default:
CHECK(0) << "Unsupport dim in mlu layout nhwc to nchw";
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
if (x_dims > 2) {
out->Resize(origin_dims);
}
}
std::string doc() const override {
return "Mlu layout transform nhwc to nchw";
}
};
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -46,6 +46,32 @@ class SubgraphEngine : public subgraph::Engine {
graph_.SetFPType(type);
}
int Build() {
    // To attach all of the ops in the block desc, we need to build the
    // original program first.
BuildOriginProgram();
    // Run InferShape() on all of the ops, and convert Paddle ops to the MLU IR graph
build_device_program_status_ = BuildDeviceProgram();
return build_device_program_status_;
}
int Launch() {
// Rebuild device program when the shapes of input tensors have been
// changed.
if (subgraph::CHECK_SUCCESS(build_device_program_status_) &&
subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(
build_device_program_status_) &&
InputShapeChanged()) {
Build();
}
if (subgraph::CHECK_FAILED(build_device_program_status_)) {
LaunchOriginProgram();
} else {
LaunchDeviceProgram();
}
return 0;
}
protected:
int BuildDeviceProgram() override {
int status = 0;
......@@ -57,7 +83,7 @@ class SubgraphEngine : public subgraph::Engine {
graph_.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NHWC,
CNML_NCHW,
graph_.FPType(),
const_cast<void*>(input_tensor->raw_data()));
CHECK(input_node);
......@@ -71,9 +97,9 @@ class SubgraphEngine : public subgraph::Engine {
for (auto& inst : origin_program_) {
auto op = inst.op();
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
op->CheckShape();
const_cast<OpLite*>(op)->InferShape();
if (!bridges.Exists(op_type, TARGET(kMLU))) {
LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
return subgraph::FAILED;
......@@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine {
graph_.AddInput(graph_.GetNode(input_name));
}
CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
// auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto core_version = mlu_context.MLUCoreVersion();
// auto core_number = mlu_context.MLUCoreNumber();
// graph_.Compile(core_version, core_number);
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto core_version = mlu_context.MLUCoreVersion();
auto core_number = mlu_context.MLUCoreNumber();
graph_.Compile(core_version, core_number);
return status;
}
int LaunchDeviceProgram() override {
// auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto exec_queue = mlu_context.exec_queue();
// u32_t affinity = mlu_context.affinity();
// cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
// int data_param = 1;
// forward_param.data_parallelism = &data_param;
// forward_param.affinity = &affinity;
// forward_param.end = CNRT_PARAM_END;
// graph_.Compute(forward_param, exec_queue);
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto exec_queue = mlu_context.exec_queue();
u32_t affinity = mlu_context.affinity();
cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
int data_param = 1;
forward_param.data_parallelism = &data_param;
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
graph_.Compute(forward_param, exec_queue);
return 0;
}
......
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM)
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
return()
endif()
......
......@@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
REGISTER_LITE_KERNEL(
cast,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::CastCompute<::paddle::lite::fluid::float16>,
fp16_to_any)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
endif()
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
......
......@@ -2,10 +2,10 @@
set -ex
# global variables with default value
NEUWARE_HOME="${NEUWARE_HOME}" # XPU SDK
NEUWARE_HOME="${NEUWARE_HOME}"
TARGET_NAME="all" # default target
BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
WITH_TESTING=OFF # ON/OFF
WITH_TESTING=ON # ON/OFF
function print_usage {
echo -e "\nUSAGE:"
......@@ -20,10 +20,9 @@ function print_usage {
# readonly variables with default value
readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-DWITH_PYTHON=OFF \
-DLITE_WITH_ARM=OFF"
readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1}
readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-8}
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$(pwd)
......@@ -37,8 +36,7 @@ function prepare_thirdparty {
fi
tar xzf third-party-05b862.tar.gz
else
# git submodule update --init --recursive
echo "third-party is in ready"
git submodule update --init --recursive
fi
}
......@@ -62,12 +60,12 @@ function prepare_workspace {
}
function build_mlu {
prepare_workspace
build_dir=${workspace}/build.lite.mlu
mkdir -p $build_dir
cd $build_dir
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
prepare_workspace
cmake .. \
${CMAKE_COMMON_OPTIONS} \
-DWITH_GPU=OFF \
......@@ -75,9 +73,10 @@ function build_mlu {
-DLITE_WITH_X86=ON \
-DWITH_MKL=ON \
-DLITE_WITH_MLU=ON \
-DLITE_WITH_PYTHON=OFF \
-DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-DWITH_TESTING=${WITH_TESTING} \
-DMLU_SDK_ROOT=${XPU_SDK_ROOT}
-DNEUWARE_HOME=${NEUWARE_HOME}
make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
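    # Editor's note: a hypothetical invocation (the script path and variable
    # values are assumptions, not taken from this commit):
    #   NEUWARE_HOME=/usr/local/neuware LITE_BUILD_THREADS=8 \
    #     ./lite/tools/build_mlu.sh
    # would configure with -DLITE_WITH_MLU=ON and build with 8 parallel jobs.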
......