Unverified commit cc927184, authored by M MaxwellDing, committed by GitHub

[MLU] add cast on MLU as default, test=develop (#3776)

Parent 11cbd50e
......@@ -53,12 +53,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
#endif
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_use_first_conv(),
config.mlu_first_conv_mean(),
config.mlu_first_conv_std(),
config.mlu_input_layout());
lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_input_layout(),
config.mlu_firstconv_param());
#endif // LITE_WITH_MLU
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
......
......@@ -13,6 +13,9 @@
// limitations under the License.
#include "lite/api/paddle_api.h"
#include <utility>
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h"
......@@ -22,6 +25,10 @@
#include "lite/backends/cuda/target_wrapper.h"
#endif
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif
namespace paddle {
namespace lite_api {
......@@ -97,6 +104,13 @@ void Tensor::CopyFromCpu(const T *src_data) {
data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif
} else {
LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA";
......@@ -117,6 +131,13 @@ void Tensor::CopyToCpu(T *data) const {
data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif
} else {
LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA";
......@@ -138,6 +159,11 @@ template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kMLU>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kMLU>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kMLU>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kMLU>(const int8_t *);
template void Tensor::CopyToCpu(float *) const;
template void Tensor::CopyToCpu(int *) const;
template void Tensor::CopyToCpu(int8_t *) const;
......@@ -228,13 +254,9 @@ void CxxConfig::set_mlu_core_number(int core_number) {
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout;
}
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
mlu_use_first_conv_ = use_first_conv;
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
void CxxConfig::set_mlu_firstconv_param(const std::vector<float> &mean,
const std::vector<float> &std) {
mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std;
}
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
......@@ -242,12 +264,9 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
}
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
std::pair<std::vector<float>, std::vector<float>>
CxxConfig::mlu_firstconv_param() const {
return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_);
}
#endif
......
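With the MLU branches above in place, host-to-device and device-to-host copies go through lite::TargetWrapperMlu::MemcpySync, and the explicit template instantiations expose TargetType::kMLU through Tensor::CopyFromCpu. A minimal usage sketch, assuming a predictor whose input tensor is placed on kMLU (the predictor, shape and buffer names below are illustrative, not part of this patch):

// Hedged sketch: feed an MLU input tensor and read the result back to host.
#include <cstdint>
#include <memory>
#include <vector>
#include "lite/api/paddle_api.h"

void FeedAndFetch(
    const std::shared_ptr<paddle::lite_api::PaddlePredictor>& predictor,
    const std::vector<float>& host_in,
    std::vector<float>* host_out) {
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});  // example shape
  // HtoD copy is handled by TargetWrapperMlu::MemcpySync inside CopyFromCpu
  input->CopyFromCpu<float, paddle::lite_api::TargetType::kMLU>(host_in.data());

  predictor->Run();

  auto output = predictor->GetOutput(0);
  int64_t numel = 1;
  for (auto d : output->shape()) numel *= d;
  host_out->resize(numel);
  output->CopyToCpu(host_out->data());  // DtoH copy
}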
......@@ -21,6 +21,7 @@
#define PADDLE_LITE_API_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle_place.h" // NOLINT
......@@ -160,9 +161,8 @@ class LITE_API CxxConfig : public ConfigBase {
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
std::vector<float> mlu_first_conv_mean_{};
std::vector<float> mlu_first_conv_std_{};
#endif
public:
......@@ -210,24 +210,22 @@ class LITE_API CxxConfig : public ConfigBase {
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether use MLU's first conv kernel. First conv is a special kernel
// provided by MLU, its input is uint8, and also needs two 3-dimentional
// vectors which save all inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimentional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimentional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
// set the 3-dimensional mean vector and 3-dimensional std vector used by
// MLU's first conv
void set_mlu_firstconv_param(const std::vector<float>& mean,
const std::vector<float>& std);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
// std::pair<mean, std>
std::pair<std::vector<float>, std::vector<float>> mlu_firstconv_param() const;
#endif
// XPU only, set the size of the workspace memory from L3 cache for the
......
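The three first-conv setters above are folded into a single call that takes the mean and std vectors together, which keeps the two vectors in sync; as the backend change below shows, leaving both vectors empty disables first conv. A minimal configuration sketch, assuming a quantized model with a 3-channel input (the model path, place list and numeric values are placeholders, not part of this patch):

// Hedged sketch: configuring the MLU target through CxxConfig.
#include <memory>
#include "lite/api/paddle_api.h"

std::shared_ptr<paddle::lite_api::PaddlePredictor> BuildMluPredictor() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./model_dir");  // placeholder path
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)},
       paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
  config.set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(4);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  // non-empty mean/std enable the MLU first-conv kernel; pass empty vectors to skip it
  config.set_mlu_firstconv_param({0.485f, 0.456f, 0.406f},
                                 {0.229f, 0.224f, 0.225f});
  return paddle::lite_api::CreatePaddlePredictor(config);
}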
......@@ -15,6 +15,7 @@
#include "lite/backends/mlu/target_wrapper.h"
#include <memory>
#include <utility>
#include "lite/backends/mlu/mlu_utils.h"
......@@ -36,6 +37,13 @@ void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) {
} // namespace mlu
thread_local cnmlCoreVersion_t TargetWrapperMlu::mlu_core_version_{CNML_MLU270};
thread_local int TargetWrapperMlu::mlu_core_number_{1};
thread_local bool TargetWrapperMlu::use_first_conv_{false};
thread_local std::vector<float> TargetWrapperMlu::mean_vec_;
thread_local std::vector<float> TargetWrapperMlu::std_vec_;
thread_local DataLayoutType TargetWrapperMlu::input_layout_{DATALAYOUT(kNCHW)};
size_t TargetWrapperMlu::num_devices() {
uint32_t dev_count = 0;
CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed";
......@@ -77,15 +85,42 @@ void TargetWrapperMlu::MemcpySync(void* dst,
LOG(FATAL) << "Unsupported IoDirection" << static_cast<int>(dir);
}
}
void TargetWrapperMlu::SetMLURunMode(
lite_api::MLUCoreVersion core_version,
int core_number,
DataLayoutType input_layout,
std::pair<std::vector<float>, std::vector<float>> firstconv_param) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
mean_vec_ = firstconv_param.first;
std_vec_ = firstconv_param.second;
use_first_conv_ = !(mean_vec_.empty() || std_vec_.empty());
input_layout_ = input_layout;
}
cnmlCoreVersion_t TargetWrapperMlu::MLUCoreVersion() {
return mlu_core_version_;
}
int TargetWrapperMlu::MLUCoreNumber() { return mlu_core_number_; }
bool TargetWrapperMlu::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& TargetWrapperMlu::MeanVec() { return mean_vec_; }
const std::vector<float>& TargetWrapperMlu::StdVec() { return std_vec_; }
// void TargetWrapperMlu::MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const stream_t& stream) {
// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync.";
// MemcpySync(dst, src, size, dir);
// }
DataLayoutType TargetWrapperMlu::InputLayout() { return input_layout_; }
} // namespace lite
} // namespace paddle
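Note that the first-conv switch is no longer set directly: SetMLURunMode derives use_first_conv_ from the pair, so it turns on only when both the mean and the std vector are non-empty. A short sketch of that behaviour (DATALAYOUT comes from paddle_place.h; the numeric values are placeholders):

// Hedged sketch: an empty mean/std pair disables first conv.
void FirstConvToggleSketch() {
  using paddle::lite::TargetWrapperMlu;
  namespace api = paddle::lite_api;

  TargetWrapperMlu::SetMLURunMode(api::MLUCoreVersion::MLU_270,
                                  /*core_number=*/4,
                                  DATALAYOUT(kNHWC),
                                  {{0.485f, 0.456f, 0.406f},
                                   {0.229f, 0.224f, 0.225f}});
  CHECK(TargetWrapperMlu::UseFirstConv());  // both vectors non-empty

  TargetWrapperMlu::SetMLURunMode(api::MLUCoreVersion::MLU_270,
                                  /*core_number=*/4,
                                  DATALAYOUT(kNHWC),
                                  {{}, {}});  // empty pair: first conv stays off
  CHECK(!TargetWrapperMlu::UseFirstConv());
}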
......@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
#include <utility>
#include <vector>
#include "lite/backends/mlu/mlu_utils.h"
#include "lite/core/target_wrapper.h"
......@@ -43,11 +45,25 @@ class TargetWrapper<TARGET(kMLU)> {
const void* src,
size_t size,
IoDirection dir);
// static void MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const queue_t& queue);
static void SetMLURunMode(
lite_api::MLUCoreVersion core_version,
int core_number,
DataLayoutType input_layout,
std::pair<std::vector<float>, std::vector<float>> firstconv_param);
static cnmlCoreVersion_t MLUCoreVersion();
static int MLUCoreNumber();
static bool UseFirstConv();
static const std::vector<float>& MeanVec();
static const std::vector<float>& StdVec();
static DataLayoutType InputLayout();
private:
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
};
} // namespace lite
......
......@@ -27,5 +27,11 @@ thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
#endif
#ifdef LITE_WITH_MLU
int Context<TargetType::kMLU>::next_queue_id_{0};
std::map<int, int> Context<TargetType::kMLU>::queue_id_map_;
std::mutex Context<TargetType::kMLU>::map_mutex_;
#endif
} // namespace lite
} // namespace paddle
......@@ -25,6 +25,7 @@
#ifdef LITE_WITH_MLU
#include <cnml.h>
#include <cnrt.h>
#include <mutex> // NOLINT
#include "lite/backends/mlu/mlu_utils.h"
#endif
#ifdef LITE_WITH_XPU
......@@ -249,11 +250,11 @@ class Context<TargetType::kMLU> {
void InitOnce() {}
MLUContext& operator=(const MLUContext& ctx) {
this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
this->Init(ctx.device_id_, ctx.exec_queue_id_);
return *this;
}
void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
void Init(int dev_id, int exec_queue_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
......@@ -264,21 +265,19 @@ class Context<TargetType::kMLU> {
device_id_ = dev_id;
}
SetMluDevice(device_id_);
if (io_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "data queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
io_queue_id = 0;
}
if (exec_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
exec_queue_id = 0;
// get queue id from map
std::unique_lock<std::mutex> lk(map_mutex_);
if (queue_id_map_.find(exec_queue_id) == queue_id_map_.end()) {
queue_id_map_[exec_queue_id] =
next_queue_id_++ % devs[dev_id].max_queue();
}
io_queue_ = devs[dev_id].io_queues()[io_queue_id];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
exec_queue_id_ = queue_id_map_[exec_queue_id];
VLOG(4) << "pick mlu queue id: " << exec_queue_id_;
lk.unlock();
exec_queue_id_ = exec_queue_id;
io_queue_id_ = io_queue_id;
io_queue_ = devs[dev_id].io_queues()[exec_queue_id_];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id_];
}
void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
......@@ -290,10 +289,12 @@ class Context<TargetType::kMLU> {
void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
cnmlCoreVersion_t MLUCoreVersion() {
return DeviceInfo::Global().MLUCoreVersion();
return paddle::lite::TargetWrapperMlu::MLUCoreVersion();
}
int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
int MLUCoreNumber() {
return paddle::lite::TargetWrapperMlu::MLUCoreNumber();
}
u32_t affinity() { return affinity_; }
......@@ -304,10 +305,12 @@ class Context<TargetType::kMLU> {
std::string name() const { return "MLUContext"; }
private:
static int next_queue_id_;
static std::map<int, int> queue_id_map_;
static std::mutex map_mutex_;
int device_id_;
// overall information
int exec_queue_id_;
int io_queue_id_;
cnrtQueue_t io_queue_;
cnrtQueue_t exec_queue_;
......@@ -455,7 +458,7 @@ class ContextScheduler {
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
auto& context = ctx->As<MLUContext>();
context.Init(dev_id);
context.Init(dev_id, exec_stream_id);
kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
&context);
LOG(INFO) << "New Context for MLU";
......
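The per-context io/exec queue pair is gone: runtime_context_assign_pass now hands each MLU subgraph an arbitrary integer derived from the graph pointer, and MLUContext::Init folds it through a process-wide map into a valid queue slot, round-robin over devs[dev_id].max_queue(), so repeated lookups for the same subgraph stay stable while distinct subgraphs spread across queues. A standalone sketch of that mapping, with max_queue standing in for devs[dev_id].max_queue():

// Hedged sketch: stable round-robin assignment of arbitrary ids to queue slots.
#include <map>
#include <mutex>

class QueueIdAllocator {
 public:
  explicit QueueIdAllocator(int max_queue) : max_queue_(max_queue) {}

  // The same external id always maps to the same slot;
  // previously unseen ids rotate over [0, max_queue).
  int Pick(int external_id) {
    std::lock_guard<std::mutex> lk(mutex_);
    auto it = id_map_.find(external_id);
    if (it == id_map_.end()) {
      it = id_map_.emplace(external_id, next_id_++ % max_queue_).first;
    }
    return it->second;
  }

 private:
  int max_queue_;
  int next_id_{0};
  std::map<int, int> id_map_;
  std::mutex mutex_;
};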
......@@ -66,15 +66,6 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0;
#ifdef LITE_WITH_MLU
thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
thread_local int DeviceInfo::mlu_core_number_{1};
thread_local bool DeviceInfo::use_first_conv_{false};
thread_local std::vector<float> DeviceInfo::mean_vec_;
thread_local std::vector<float> DeviceInfo::std_vec_;
thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
#endif
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
......@@ -1089,45 +1080,6 @@ int DeviceInfo::Setup() {
return 0;
}
#ifdef LITE_WITH_MLU
void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
use_first_conv_ = use_first_conv;
mean_vec_ = mean_vec;
std_vec_ = std_vec;
input_layout_ = input_layout;
}
cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
bool DeviceInfo::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
#endif // LITE_WITH_MLU
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_);
......
......@@ -55,20 +55,6 @@ class DeviceInfo {
int Setup();
void SetRunMode(lite_api::PowerMode mode, int thread_num);
#ifdef LITE_WITH_MLU
void SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout);
cnmlCoreVersion_t MLUCoreVersion();
int MLUCoreNumber();
bool UseFirstConv();
const std::vector<float>& MeanVec() const;
const std::vector<float>& StdVec() const;
DataLayoutType InputLayout() const;
#endif
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
......@@ -120,15 +106,6 @@ class DeviceInfo {
static thread_local TensorLite workspace_;
static thread_local int64_t count_;
#ifdef LITE_WITH_MLU
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
#endif
void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...);
void SetFP32Info(int argc, ...);
......
......@@ -64,4 +64,5 @@ REGISTER_MIR_PASS(lite_conv_activation_fuse_pass,
paddle::lite::mir::ConvActivationFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU)})
.ExcludeTargets({TARGET(kMLU)})
.BindKernel("conv2d");
......@@ -24,8 +24,13 @@ namespace mir {
void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
#ifdef LITE_WITH_X86
#ifdef LITE_WITH_MLU
fusion::FcFuser fuser(false);
fuser(graph.get());
#else
fusion::FcFuser fuser(true);
fuser(graph.get());
#endif
#endif
fusion::FcFuser fuser2(false);
......@@ -38,6 +43,9 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU), TARGET(kX86)})
.ExcludeTargets({TARGET(kXPU)})
#ifndef LITE_WITH_MLU
.ExcludeTargets({TARGET(kX86)})
#endif
.ExcludeTargets({TARGET(kBM)})
.BindKernel("fc");
......@@ -314,4 +314,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
TARGET(kXPU),
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU)});
TARGET(kAPU),
TARGET(kMLU)});
......@@ -79,6 +79,8 @@ class MLUPostprocessPass : public ProgramPass {
const Type** arg_type,
SSAGraph* graph);
void ModifyInputOutputDataType(SSAGraph* graph);
void ModifyLayout(SSAGraph* graph);
bool NeedInsert(Node* node, const Type* inst_type);
......@@ -86,12 +88,14 @@ class MLUPostprocessPass : public ProgramPass {
void InsertBefore(SSAGraph* graph,
Node* head_node,
Node* inst_node,
const Type* type);
const Type* type,
bool use_mlu_cast);
void InsertAfter(SSAGraph* graph,
Node* tail_node,
Node* inst_node,
const Type* type);
const Type* type,
bool use_mlu_cast);
Node* InsertCastBefore(const std::string& op_type,
const std::string& cast_arg_name,
......@@ -115,6 +119,8 @@ class MLUPostprocessPass : public ProgramPass {
bool IsFirstConvInSubgraph(Node* arg_node, Node* inst);
void AdjustSubgraph(Node* subgraph_node, const Type* op_type);
private:
std::set<std::string> first_conv_nodes_;
};
......
......@@ -44,6 +44,10 @@ class RuntimeContextAssignPass : public StmtPass {
inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
inst.picked_kernel().target()));
}
#elif LITE_WITH_MLU
inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
inst.picked_kernel().target(),
static_cast<int>(reinterpret_cast<int64_t>(graph.get()))));
#else
int stream_id = inst.stream_id_;
......
......@@ -249,11 +249,13 @@ void OpenCLTypeLayoutTransformPass::Apply(
REGISTER_MIR_PASS(type_layout_cast_pass,
paddle::lite::mir::TypeLayoutTransformPass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kMLU)})
.BindKernel("layout_once")
.BindKernel("layout");
REGISTER_MIR_PASS(type_layout_cast_preprocess_pass,
paddle::lite::mir::OpenCLTypeLayoutTransformPass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kMLU)})
.BindKernel("layout_once")
.BindKernel("layout");
......@@ -108,9 +108,13 @@ class Optimizer {
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
......@@ -140,13 +144,9 @@ class Optimizer {
"variable_place_inference_pass", //
"argument_type_display_pass",
"mlu_subgraph_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"mlu_postprocess_pass",
"memory_optimize_pass"}};
if (passes.size() == 1) {
......
......@@ -4,6 +4,7 @@ endif()
add_subdirectory(bridges)
add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps})
# depend on transpose function in backend/x86/math/math_function
add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function})
......@@ -3,7 +3,7 @@ if(NOT LITE_WITH_MLU)
endif()
lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor)
lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs})
lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs} subgraph_bridge_utility_mlu)
lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu)
set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu)
......@@ -18,6 +18,8 @@ lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_d
lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_cast_op_mlu SRCS cast_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_layout_op_mlu SRCS layout_op.cc DEPS ${subgraph_bridge_deps_mlu})
set(mlu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_mlu
......@@ -32,6 +34,8 @@ set(mlu_subgraph_bridges
subgraph_bridge_scale_op_mlu
subgraph_bridge_interp_op_mlu
subgraph_bridge_concat_op_mlu
subgraph_bridge_cast_op_mlu
subgraph_bridge_layout_op_mlu
CACHE INTERNAL "mlu_subgraph_bridges")
lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
......@@ -45,4 +49,6 @@ lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer targe
lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_layout_converter_mlu SRCS layout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_cast_converter_mlu SRCS cast_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
......@@ -60,6 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
output_tensor->mlu_tensor()));
}
graph->FuseOp(activation_op);
CNML_CALL(cnmlDestroyBaseOp(&activation_op));
return SUCCESS;
}
......@@ -72,6 +73,9 @@ REGISTER_SUBGRAPH_BRIDGE(sigmoid,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(relu6,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
kMLU,
......
......@@ -13,7 +13,9 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
......@@ -116,7 +118,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
opdesc.SetAttr("offset", 0.5f);
}
// create and convert op to NPU model, then run it on NPU
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ActivationOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
act_ref(op);
......@@ -134,7 +136,8 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
TEST(MLUBridges, activation) {
std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
std::vector<std::string> types{"sigmoid", "relu", "tanh", "leaky_relu"};
std::vector<std::string> types{
"sigmoid", "relu", "relu6", "tanh", "leaky_relu"};
for (auto x_shape : shapes) {
for (auto op_type : types) {
test_act(x_shape, op_type);
......@@ -149,5 +152,6 @@ TEST(MLUBridges, activation) {
USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
USE_SUBGRAPH_BRIDGE(relu, kMLU)
USE_SUBGRAPH_BRIDGE(relu6, kMLU)
USE_SUBGRAPH_BRIDGE(tanh, kMLU)
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
......@@ -48,25 +48,32 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto mean = scope->FindVar(mean_var_name)->GetMutable<Tensor>();
auto mean_dims = mean->dims().Vectorize();
if (mean_dims.size() < 4) {
mean_dims.insert(mean_dims.begin(), 4 - mean_dims.size(), 1);
}
auto mean_tensor = graph->AddNode(
mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType());
mean_var_name, mean_dims, CNML_CONST, CNML_NHWC, graph->FPType());
auto variance = scope->FindVar(variance_var_name)->GetMutable<Tensor>();
auto variance_dims = variance->dims().Vectorize();
if (variance_dims.size() < 4) {
variance_dims.insert(variance_dims.begin(), 4 - variance_dims.size(), 1);
}
auto variance_tensor = graph->AddNode(
variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType());
variance_var_name, variance_dims, CNML_CONST, CNML_NHWC, graph->FPType());
auto scale = scope->FindVar(scale_var_name)->GetMutable<Tensor>();
auto bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
int co = static_cast<int>(mean_dims[0]);
int co = static_cast<int>(mean_dims[3]);
std::vector<float> variance_trans(co);
std::vector<float> mean_trans(co);
for (int i = 0; i < co; ++i) {
variance->mutable_data<float>()[i] =
variance_trans[i] =
scale->data<float>()[i] / sqrtf(variance->data<float>()[i] + epsilon);
mean->mutable_data<float>()[i] =
mean->data<float>()[i] -
bias->data<float>()[i] / variance->data<float>()[i];
mean_trans[i] =
mean->data<float>()[i] - bias->data<float>()[i] / variance_trans[i];
}
auto input_tensor = graph->GetNode(x_var_name);
......@@ -77,10 +84,14 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
mean_tensor->mlu_tensor(),
variance_tensor->mlu_tensor()));
graph->BindConstData(variance_var_name, variance);
graph->BindConstData(mean_var_name, mean);
graph->BindConstRawData(
variance_var_name, variance_trans.data(), variance_trans.size(), true);
graph->BindConstRawData(
mean_var_name, mean_trans.data(), mean_trans.size(), true);
graph->FuseOp(bn_op);
CNML_CALL(cnmlDestroyBaseOp(&bn_op));
return SUCCESS;
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto in_dtype = op_info->GetAttr<int>("in_dtype");
auto out_dtype = op_info->GetAttr<int>("out_dtype");
CHECK(graph->HasNode(x_var_name));
auto x_tensor = graph->GetNode(x_var_name);
cnmlDataType_t out_type;
cnmlCastType_t cast_type;
if (in_dtype == 4 && out_dtype == 5) {
cast_type = CNML_CAST_FLOAT16_TO_FLOAT32;
out_type = CNML_DATA_FLOAT32;
} else if (in_dtype == 5 && out_dtype == 4) {
cast_type = CNML_CAST_FLOAT32_TO_FLOAT16;
out_type = CNML_DATA_FLOAT16;
} else {
CHECK(0) << "Unsupported cast type";
}
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, out_type);
cnmlBaseOp_t cast_op;
CNML_CALL(cnmlCreateCastOp(&cast_op,
cast_type,
x_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
graph->FuseOp(cast_op);
CNML_CALL(cnmlDestroyBaseOp(&cast_op));
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(cast,
kMLU,
paddle::lite::subgraph::mlu::CastConverter);
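The cast bridge reads Paddle's VarType codes from the in_dtype/out_dtype attributes, where 4 is FP16 and 5 is FP32, and only maps the two float conversions onto CNML cast types; every other pair aborts. A compact restatement of that mapping (a sketch that assumes cnml.h for the enum types, not the converter itself):

// Hedged sketch: dtype pairs the MLU cast bridge accepts (4 == FP16, 5 == FP32).
struct CastChoice {
  cnmlCastType_t cast_type;
  cnmlDataType_t out_type;
};

inline CastChoice PickCast(int in_dtype, int out_dtype) {
  if (in_dtype == 4 && out_dtype == 5) {
    return {CNML_CAST_FLOAT16_TO_FLOAT32, CNML_DATA_FLOAT32};
  }
  if (in_dtype == 5 && out_dtype == 4) {
    return {CNML_CAST_FLOAT32_TO_FLOAT16, CNML_DATA_FLOAT16};
  }
  CHECK(0) << "Unsupported cast type: " << in_dtype << " -> " << out_dtype;
  return {};
}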
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/cast_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void test_cast_FP16_to_FP32(std::vector<int64_t> shape) {
// prepare input&output variables
std::string x_var_name = "x";
std::string out_var_name = "out";
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
x->Resize(DDim(shape));
auto* x_data = x->mutable_data<paddle::lite::fluid::float16>();
// initialize input&output data
for (int i = 0; i < x->dims().production(); i++) {
x_data[i] = static_cast<paddle::lite::fluid::float16>(i);
}
// initialize op desc
int in_dtype = 4, out_dtype = 5;
cpp::OpDesc opdesc;
opdesc.SetType("cast");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("in_dtype", in_dtype);
opdesc.SetAttr("out_dtype", out_dtype);
auto op = CreateOp<operators::CastOp>(opdesc, &scope);
Tensor data;
data.Resize(DDim(shape));
auto* copy_data = data.mutable_data<paddle::lite::fluid::float16>();
data.CopyDataFrom(*x);
x->set_precision(paddle::lite_api::PrecisionType::kFP16);
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], static_cast<double>(copy_data[i]), 5e-4);
}
}
void test_cast_FP32_to_FP16(std::vector<int64_t> shape) {
// prepare input&output variables
std::string x_var_name = "x";
std::string out_var_name = "out";
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
x->Resize(DDim(shape));
auto* x_data = x->mutable_data<float>();
// initialize input&output data
for (int i = 0; i < x->dims().production(); i++) {
x_data[i] = static_cast<float>(i);
}
// initialize op desc
int in_dtype = 5, out_dtype = 4;
cpp::OpDesc opdesc;
opdesc.SetType("cast");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("in_dtype", in_dtype);
opdesc.SetAttr("out_dtype", out_dtype);
auto op = CreateOp<operators::CastOp>(opdesc, &scope);
Tensor data;
data.Resize(DDim(shape));
auto* copy_data = data.mutable_data<float>();
data.CopyDataFrom(*x);
x->set_precision(paddle::lite_api::PrecisionType::kFloat);
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<paddle::lite::fluid::float16>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(static_cast<double>(out_data[i]), copy_data[i], 5e-4);
}
}
TEST(MLUBridges, cast) {
test_cast_FP16_to_FP32({2, 3, 4, 5});
test_cast_FP16_to_FP32({6, 3, 2, 5});
test_cast_FP32_to_FP16({2, 3, 4, 5});
test_cast_FP32_to_FP16({6, 3, 2, 5});
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(cast, kMLU);
......@@ -44,9 +44,10 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto dims = output_dims.size();
int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
CHECK_LE(axis, 4) << "Unsupport dims in mlu concat";
int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
int nhwc_axis = nchw_to_nhwc_axis_map[axis];
CHECK_LT(axis, dims) << "Unsupport dims in mlu concat";
// value of nhwc2nchw_axis is index of nhwc
// order of nhwc2nchw_axis is nchw
int nhwc_axis = GetAxisNHWC2NCHW<int>(dims)[axis];
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
......@@ -60,6 +61,7 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
&outputs,
1));
graph->FuseOp(concat_op);
CNML_CALL(cnmlDestroyBaseOp(&concat_op));
return SUCCESS;
}
......
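Since MLU kernels compute in NHWC while the model describes the concat axis in NCHW, the axis is remapped before the op is built; for 4-D tensors this is the {0, 3, 1, 2} table the removed lines spelled out, which GetAxisNHWC2NCHW generalizes to other ranks. A tiny worked example covering only the 4-D case:

// Hedged sketch: remap an NCHW concat axis to its NHWC position (4-D only).
inline int NchwAxisToNhwc4D(int nchw_axis) {
  // index: NCHW axis, value: where that axis lives in NHWC
  static const int kMap[4] = {0, 3, 1, 2};
  return kMap[nchw_axis];
}
// Concatenating along channels: axis 1 (C) in NCHW is axis 3 in NHWC;
// axis 2 (H) maps to 1 and axis 3 (W) maps to 2.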
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <algorithm>
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
......@@ -30,6 +32,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const auto* op_info = op->op_info();
const auto* scope = op->scope();
VLOG(3) << "[MLU] Converting " << op_info->Type() << "... ";
CHECK(!op_info->HasAttr("act_type"));
// get input, filter and op attributes
const auto input_var_name = op_info->Input("Input").front();
......@@ -43,8 +46,13 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const auto output_shape = output->dims().Vectorize();
const auto bs = input_dims[0];
const auto oc = filter_dims[0];
const auto groups = op_info->GetAttr<int>("groups");
CHECK_EQ(input_dims.size(), 4u);
CHECK_EQ(filter_dims.size(), 4u);
CHECK(!(op_info->HasAttr("fuse_relu") &&
(op_info->GetAttr<bool>("fuse_relu") == true)))
<< "UnSupported param fuse_relu is true!";
const auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
......@@ -70,13 +78,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
padding_algorithm,
input_dims,
filter_dims);
bool is_group_mode = groups > 1;
bool is_depthwise_mode = false;
if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 &&
dilations[1] == 1) { // depthwise filter shape = {1, ic ,kh ,kw}
is_depthwise_mode = true;
is_group_mode = false;
}
auto input_tensor = graph->GetNode(input_var_name);
const auto output_tensor = graph->AddNode(
output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
std::vector<int64_t> cnml_filter_shape = {
filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]};
if (is_depthwise_mode) {
/* Paddle's depthwise filter shape is {oc, ic / groups == 1, kh, kw}, while
   CNML's depthwise conv expects {oc / groups == 1, ic, kh, kw},
   so the filter shape is swapped accordingly.
*/
cnml_filter_shape = {
filter_dims[1], filter_dims[0], filter_dims[2], filter_dims[3]};
}
// Create filter node
const auto filter_tensor = graph->AddNode(filter_var_name,
filter_dims.Vectorize(),
cnml_filter_shape,
CNML_FILTER,
CNML_NCHW,
graph->FPType());
......@@ -89,15 +116,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
dequant(filter_dequant.data(),
filter->mutable_data<int8_t>(),
1,
filter_dims[0],
filter_dims[1] * filter_dims[2] * filter_dims[3],
cnml_filter_shape[0],
cnml_filter_shape[1] * cnml_filter_shape[2] * cnml_filter_shape[3],
weight_scale);
transpose(filter_dequant.data(),
filter->mutable_data<float>(),
{static_cast<int>(filter_dims[0]),
static_cast<int>(filter_dims[1]),
static_cast<int>(filter_dims[2]),
static_cast<int>(filter_dims[3])},
{static_cast<int>(cnml_filter_shape[0]),
static_cast<int>(cnml_filter_shape[1]),
static_cast<int>(cnml_filter_shape[2]),
static_cast<int>(cnml_filter_shape[3])},
{0, 2, 3, 1});
filter->set_precision(PrecisionType::kFloat);
} else if (filter->precision() != PrecisionType::kFloat) {
......@@ -116,7 +143,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::vector<int64_t> bias_shape;
if (bias_data_size == oc) {
// 0: {oc}
bias_shape = {oc};
bias_shape = {1, 1, 1, oc};
} else if (bias_data_size == output_data_size / bs) {
LOG(FATAL) << "Unsupported ... ...";
// 1: {1, oc, oh, ow}
......@@ -130,18 +157,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
<< " isn't supported in conv2d Op when output dimension is "
<< output_dims;
}
bias_tensor = graph->AddNode(bias_var_name,
bias_dims.Vectorize(),
CNML_CONST,
CNML_CNHW,
graph->FPType());
bias_tensor = graph->AddNode(
bias_var_name, bias_shape, CNML_CONST, CNML_NHWC, graph->FPType());
graph->BindConstData(bias_var_name, bias);
}
const auto input_scale = op_info->GetAttr<float>("input_scale");
bool use_first_conv = false;
if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) {
if (lite::TargetWrapperMlu::UseFirstConv() && input_dims[1] == 3) {
use_first_conv = true;
}
......@@ -158,38 +182,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
paddings[0],
paddings[0]));
const auto mean_tensor = graph->AddNode("first_conv_mean_tensor",
std::vector<int64_t>{3},
std::vector<int64_t>{1, 1, 1, 3},
CNML_CONST,
CNML_CNHW,
CNML_NHWC,
graph->FPType());
const auto std_tensor = graph->AddNode("first_conv_std_tensor",
std::vector<int64_t>{3},
std::vector<int64_t>{1, 1, 1, 3},
CNML_CONST,
CNML_CNHW,
CNML_NHWC,
graph->FPType());
graph->BindConstRawData("first_conv_mean_tensor",
lite::DeviceInfo::Global().MeanVec().data(),
lite::TargetWrapperMlu::MeanVec().data(),
3,
false);
graph->BindConstRawData("first_conv_std_tensor",
lite::DeviceInfo::Global().StdVec().data(),
lite::TargetWrapperMlu::StdVec().data(),
3,
false);
graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8);
input_tensor->set_mlu_dtype(CNML_DATA_UINT8);
CNML_CALL(cnmlCreateConvFirstOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
input_tensor->mlu_tensor(),
mean_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr,
std_tensor->mlu_tensor()));
CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param));
} else if (is_depthwise_mode) {
cnmlConvDepthwiseOpParam_t conv_depthwise_param;
cnmlCreateConvDepthwiseOpParam_V2(&conv_depthwise_param,
strides[0],
strides[1],
paddings[0] * 2,
paddings[2] * 2);
CNML_CALL(cnmlCreateConvDepthwiseOpForward(
&conv_op,
conv_depthwise_param,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
CNML_CALL(cnmlDestroyConvDepthwiseOpParam(&conv_depthwise_param));
} else if (is_group_mode) {
cnmlConvOpParam_t conv_param;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[0] * 2,
paddings[2] * 2));
CNML_CALL(cnmlCreateConvGroupOpForward(
&conv_op,
conv_param,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr,
groups));
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
} else {
cnmlConvOpParam_t conv_param;
VLOG(5) << "conv param (" << input_var_name << ")"
<< "stride: " << strides[0] << ',' << strides[1] << '\t'
<< "dilations: " << dilations[0] << ',' << dilations[1] << '\t'
<< "paddings: " << paddings[0] << ',' << paddings[2] << std::endl;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
......@@ -200,19 +261,21 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CNML_CALL(cnmlCreateConvOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
}
graph->SetComputingDataType(
conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
graph->SetComputingDataType(
conv_op,
filter_tensor->mlu_tensor(),
1 / *min_element(weight_scale.begin(), weight_scale.end()));
if (!is_depthwise_mode) {
graph->SetComputingDataType(
conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
graph->SetComputingDataType(
conv_op,
filter_tensor->mlu_tensor(),
1 / *max_element(weight_scale.begin(), weight_scale.end()));
}
CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC));
if (HasInputArg(op_info, scope, "Bias")) {
auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
......@@ -220,6 +283,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
graph->BindConstData(filter_var_name, filter);
graph->FuseOp(conv_op);
CNML_CALL(cnmlDestroyBaseOp(&conv_op));
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
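The conv bridge now dispatches between three CNML conv flavours: depthwise when the filter carries groups output channels with a single input channel each and unit dilation, group convolution for any other groups > 1, and the plain conv otherwise. A minimal sketch of that dispatch predicate, restating the checks above on plain vectors:

// Hedged sketch: how the conv bridge classifies the convolution it lowers.
#include <cstdint>
#include <vector>

enum class MluConvKind { kDepthwise, kGroup, kPlain };

inline MluConvKind ClassifyConv(
    const std::vector<int64_t>& filter_dims,  // {oc, ic / groups, kh, kw}
    const std::vector<int>& dilations,
    int groups) {
  const bool depthwise = filter_dims[0] == groups && filter_dims[1] == 1 &&
                         dilations[0] == 1 && dilations[1] == 1;
  if (depthwise) return MluConvKind::kDepthwise;  // cnmlCreateConvDepthwiseOpForward
  if (groups > 1) return MluConvKind::kGroup;     // cnmlCreateConvGroupOpForward
  return MluConvKind::kPlain;                     // cnmlCreateConvOpForward
}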
......@@ -13,8 +13,11 @@
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
......@@ -331,6 +334,10 @@ TEST(MLUBridges, conv) {
#endif
}
TEST(MLUBridges, depthwise_conv2d) {
test_conv(1, 8, 8, 14, 14, false, false, false, true, 1, 1, 2, 3);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
......
......@@ -23,7 +23,7 @@ namespace mlu {
std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
auto x_dims = x.dims();
CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
// CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
auto y_dims = y->dims();
CHECK_GE(x_dims.size(), y_dims.size());
......@@ -117,6 +117,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
graph->FuseOp(elementwise_op);
CNML_CALL(cnmlDestroyBaseOp(&elementwise_op));
cnmlBaseOp_t act_op;
if (op_type == "fusion_elementwise_add_activation") {
auto mid_tensor = graph->GetNode(out_var_name + "_mid");
......@@ -127,6 +128,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
mid_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
graph->FuseOp(act_op);
CNML_CALL(cnmlDestroyBaseOp(&act_op));
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
......@@ -153,7 +153,7 @@ void test_elementwise_add(const std::vector<int64_t>& x_shape,
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
// create and convert op to NPU model, then run it on NPU
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ElementwiseOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
......
......@@ -34,7 +34,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto w_var_name = op_info->Input("W").front();
auto output_var_name = op_info->Output("Out").front();
// int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
CHECK(!op_info->HasAttr("activation_type"));
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
auto output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
......@@ -45,9 +45,28 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK_EQ(w_dims.size(), 2UL);
// Create w node
std::vector<int64_t> w_shape{w_dims[1], w_dims[0]};
std::vector<int64_t> cnml_w_shape;
if (x_dims.size() == 4) {
if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) {
cnml_w_shape = {
static_cast<int>(w_dims[1]),
static_cast<int>(x_dims[1]), // input_c
static_cast<int>(x_dims[2]), // input_h
static_cast<int>(x_dims[3]), // input_w
};
} else {
LOG(FATAL)
<< "in fc op, we expect input_h * input_w * input_c == filter_c"
<< " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2]
<< " input_w = " << x_dims[3] << " filter_c = " << w_dims[0]
<< std::endl;
}
} else {
cnml_w_shape = {w_dims[1], w_dims[0]};
}
auto w_tensor = graph->AddNode(
w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
auto input_scale = op_info->GetAttr<float>("input_scale");
......@@ -63,15 +82,15 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (HasInputArg(op_info, scope, "Bias")) {
bias_var_name = op_info->Input("Bias").front();
auto bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
auto bias_dims = bias->dims();
auto bias_dims = bias->dims().Vectorize();
CHECK(!graph->HasNode(bias_var_name));
if (bias_dims.size() < 4u) {
bias_dims.insert(bias_dims.begin(), 4 - bias_dims.size(), 1);
}
// CHECK_EQ(bias_dims.production(), n);
bias_tensor = graph->AddNode(bias_var_name,
bias_dims.Vectorize(),
CNML_CONST,
CNML_CNHW,
graph->FPType());
bias_tensor = graph->AddNode(
bias_var_name, bias_dims, CNML_CONST, CNML_NHWC, graph->FPType());
graph->BindConstData(bias_var_name, bias);
}
cnmlBaseOp_t fc_op;
......@@ -88,18 +107,46 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (w->precision() == PrecisionType::kUnk ||
w->precision() == PrecisionType::kInt8) {
std::vector<float> w_dequant(w->data_size());
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
w_dims[1],
w_dims[0],
weight_scale);
for (int i = 0; i < w_dims[1]; i++) {
for (int j = 0; j < w_dims[0]; j++) {
w->mutable_data<float>()[i * w_dims[0] + j] =
w_dequant[i + j * w_dims[1]];
}
if (cnml_w_shape.size() == 2) {
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
cnml_w_shape[0],
cnml_w_shape[1],
weight_scale);
transpose2d(w_dequant.data(),
w->mutable_data<float>(),
{static_cast<int>(cnml_w_shape[0]),
static_cast<int>(cnml_w_shape[1])});
} else if (cnml_w_shape.size() == 4) {
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
cnml_w_shape[0],
cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3],
weight_scale);
int c_o_num = cnml_w_shape[0];
int c_i_num = cnml_w_shape[1];
int h_i_num = cnml_w_shape[2];
int w_i_num = cnml_w_shape[3];
// chw == ci * hi * wi == w_dim[0]
// first trans [chw, co] -> [co,chw]
std::vector<float> first_trans_output(w_dequant.size());
int chw = c_i_num * h_i_num * w_i_num;
transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num});
// second trans [co,ci,hi,wi] -> [co,hi,wi,ci]
transpose(first_trans_output.data(),
w->mutable_data<float>(),
{c_o_num, c_i_num, h_i_num, w_i_num},
{0, 2, 3, 1});
} else {
LOG(FATAL) << "expect w_shape.size == 2 or 4, but got "
<< cnml_w_shape.size() << std::endl;
}
w->set_precision(PrecisionType::kFloat);
} else if (w->precision() != PrecisionType::kFloat) {
LOG(FATAL) << "UnSupported weight precision!";
......@@ -110,9 +157,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
graph->SetComputingDataType(
fc_op,
w_tensor->mlu_tensor(),
1 / *min_element(weight_scale.begin(), weight_scale.end()));
1 / *max_element(weight_scale.begin(), weight_scale.end()));
graph->FuseOp(fc_op);
CNML_CALL(cnmlDestroyBaseOp(&fc_op));
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
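For a 4-D input, the fc weight that Paddle stores as a {ci*hi*wi, co} matrix has to reach CNML as a {co, hi, wi, ci} filter; the code above does this in two steps, a 2-D transpose to {co, ci*hi*wi} followed by an NCHW-to-NHWC permutation of the implied {co, ci, hi, wi} view. A standalone sketch of the same permutation on a plain float buffer (the helper name is illustrative, not the transpose in utility.h):

// Hedged sketch: permute fc weights {ci*hi*wi, co} -> {co, hi, wi, ci}.
#include <vector>

void FcWeightToCnmlLayout(const float* src, float* dst,
                          int co, int ci, int hi, int wi) {
  const int chw = ci * hi * wi;
  // step 1: [chw, co] -> [co, chw]
  std::vector<float> co_chw(static_cast<size_t>(co) * chw);
  for (int r = 0; r < chw; ++r)
    for (int c = 0; c < co; ++c)
      co_chw[c * chw + r] = src[r * co + c];
  // step 2: view [co, chw] as [co, ci, hi, wi] and permute to [co, hi, wi, ci]
  for (int o = 0; o < co; ++o)
    for (int c = 0; c < ci; ++c)
      for (int h = 0; h < hi; ++h)
        for (int w = 0; w < wi; ++w)
          dst[((o * hi + h) * wi + w) * ci + c] =
              co_chw[((o * ci + c) * hi + h) * wi + w];
}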
......@@ -175,9 +175,9 @@ void test_fc(const std::vector<int64_t>& input_shape,
TEST(MLUBridges, fc) {
for (bool use_bias : {true, false}) {
// test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias);
// test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias);
// test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias);
test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias);
test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias);
test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias);
test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias);
}
}
......
......@@ -27,10 +27,14 @@ std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name,
cnmlTensorType_t tensor_type,
cnmlDataOrder_t shape_order,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t data_order,
void* raw_ptr) {
CHECK(!HasNode(name));
VLOG(5) << "add mlu node: " << name << "\t data type "
<< static_cast<int>(mlu_dtype) << "\t data order "
<< static_cast<int>(data_order);
auto node = std::shared_ptr<MLUTensor>(
new MLUTensor(shape, tensor_type, shape_order, mlu_dtype));
new MLUTensor(shape, tensor_type, shape_order, mlu_dtype, data_order));
node->set_mlu_ptr(raw_ptr);
nodes_.insert(std::make_pair(name, node));
return node;
......
......@@ -15,13 +15,15 @@
#pragma once
#include <cmath>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "lite/kernels/mlu/bridges/tensor.h"
#include "lite/utils/env.h"
#define PRINT_HW_TIME false
......@@ -45,32 +47,30 @@ class Graph {
CNRT_CALL(cnrtCreateNotifier(&notifier_end_));
#endif
}
~Graph() {
FreeConstData();
CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
for (auto op : ops_) {
CNML_CALL(cnmlDestroyBaseOp(&op));
}
#if PRINT_HW_TIME
CNRT_CALL(cnrtDestroyNotifier(&notifier_start_));
CNRT_CALL(cnrtDestroyNotifier(&notifier_end_));
double total_time = 0;
for (auto& f : time_log_) {
total_time += f;
if (!time_log_.empty()) {
for (auto& f : time_log_) {
total_time += f;
}
std::cout << "cnml hardware time for " << time_log_.size()
<< " process:" << total_time / time_log_.size() << std::endl;
}
std::cout << "cnml hardware time for " << time_log_.size()
<< " process:" << total_time / time_log_.size() << std::endl;
#endif
}
// Data node
std::shared_ptr<MLUTensor> AddNode(
const std::string& name,
std::vector<int64_t> shape,
cnmlTensorType_t tensor_type = CNML_TENSOR,
cnmlDataOrder_t data_order = CNML_NCHW,
cnmlDataOrder_t shape_order = CNML_NCHW,
cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32,
cnmlDataOrder_t data_order = CNML_NHWC,
void* raw_ptr = nullptr);
std::shared_ptr<MLUTensor> GetNode(const std::string& name) {
......@@ -82,9 +82,16 @@ class Graph {
return nodes_.find(name) != nodes_.end();
}
void AddInput(std::shared_ptr<MLUTensor> tensor) {
void AddInput(std::shared_ptr<MLUTensor> tensor,
bool disable_batch_size_changeable = true) {
inputs_.push_back(tensor->mlu_tensor());
input_tensors_.push_back(tensor);
if (!disable_batch_size_changeable) {
constexpr int input_dimNb = 4;
bool input_dim_mutable[4] = {true, false, false, false};
CNML_CALL(cnmlSetTensorDimMutable(
tensor->mlu_tensor(), input_dim_mutable, input_dimNb));
}
}
void AddOutput(std::shared_ptr<MLUTensor> tensor) {
......@@ -92,6 +99,22 @@ class Graph {
output_tensors_.push_back(tensor);
}
std::vector<std::shared_ptr<MLUTensor>>* MutableInputs() {
return &input_tensors_;
}
std::vector<std::shared_ptr<MLUTensor>>* MutableOutputs() {
return &output_tensors_;
}
void GenOfflineModel(const std::string& name) {
cnmlModel_t model;
const std::string& symbol = "subnet0";
const auto& filename = name + ".offline.cambricon";
CNML_CALL(cnmlCreateModel(&model, filename.c_str()));
CNML_CALL(cnmlAddFusionOpToModel(model, fusion_op_, symbol.c_str()));
CNML_CALL(cnmlSaveModel(model, filename.c_str()));
CNML_CALL(cnmlDestroyModel(model));
}
void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); }
void Compile(cnmlCoreVersion_t core_version, int core_number) {
......@@ -103,18 +126,37 @@ class Graph {
CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number));
CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version));
CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
for (auto in : input_tensors_) {
input_addrs_.push_back(in->mlu_data());
}
for (auto out : output_tensors_) {
output_addrs_.push_back(out->mlu_data());
}
}
#define MEASURE_HWTIME_START(que) \
do { \
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); \
} while (0)
#define MEASURE_HWTIME_END(que) \
do { \
thread_local float hw_time; \
CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); \
CNRT_CALL(cnrtSyncQueue(que)); \
CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); \
hw_time /= 1000.0f; \
DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; \
std::lock_guard<std::mutex> lk(time_mut_); \
time_log_.push_back(hw_time); \
} while (0)
void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
input_addrs_.resize(input_tensors_.size());
output_addrs_.resize(output_tensors_.size());
for (size_t i = 0; i < input_addrs_.size(); ++i) {
input_addrs_[i] = input_tensors_[i]->mlu_data();
}
for (size_t i = 0; i < output_addrs_.size(); ++i) {
output_addrs_[i] = output_tensors_[i]->mlu_data();
}
#if PRINT_HW_TIME
thread_local float hw_time;
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
MEASURE_HWTIME_START(que);
#endif
CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
input_addrs_.data(),
......@@ -124,18 +166,46 @@ class Graph {
&forward_param,
que));
#if PRINT_HW_TIME
CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
MEASURE_HWTIME_END(que);
#endif
}
CNRT_CALL(cnrtSyncQueue(que));
void Compute(cnrtQueue_t que,
const std::vector<std::shared_ptr<MLUTensor>>& in,
const std::vector<std::shared_ptr<MLUTensor>>& out) {
std::vector<cnmlTensor_t> in_tensor;
std::vector<cnmlTensor_t> out_tensor;
input_addrs_.resize(in.size());
output_addrs_.resize(out.size());
for (size_t i = 0; i < input_addrs_.size(); ++i) {
input_addrs_[i] = in[i]->mlu_data();
in_tensor.push_back(in[i]->mlu_tensor());
}
for (size_t i = 0; i < output_addrs_.size(); ++i) {
output_addrs_[i] = out[i]->mlu_data();
out_tensor.push_back(out[i]->mlu_tensor());
}
#if PRINT_HW_TIME
MEASURE_HWTIME_START(que);
#endif
/* Because cnmlSetTensorDimMutable is used, call cnmlComputeFusionOpForward_V4
 * instead of cnmlComputeFusionOpForward_V3 */
CNML_CALL(cnmlComputeFusionOpForward_V4(fusion_op_,
&in_tensor[0],
input_addrs_.data(),
input_addrs_.size(),
&out_tensor[0],
output_addrs_.data(),
output_addrs_.size(),
que,
NULL));
#if PRINT_HW_TIME
CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
hw_time /= 1000.0f;
DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
std::lock_guard<std::mutex> lk(time_mut_);
time_log_.push_back(hw_time);
MEASURE_HWTIME_END(que);
#endif
}
#undef MEASURE_HWTIME_START
#undef MEASURE_HWTIME_END
template <typename T>
void* RegisterConstData(size_t len) {
......@@ -165,7 +235,7 @@ class Graph {
CNML_CALL(cnmlBindConstData_V2(
nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
} else if (fp_type_ == CNML_DATA_FLOAT16) {
void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
CNRT_CALL(
cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
CNRT_FLOAT32,
......@@ -180,7 +250,7 @@ class Graph {
}
}
void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) {
const float* data = tensor->data<float>();
size_t len = tensor->data_size();
if (fp_type_ == CNML_DATA_FLOAT32) {
......@@ -189,10 +259,14 @@ class Graph {
const_cast<void*>(static_cast<const void*>(data)),
false));
} else if (fp_type_ == CNML_DATA_FLOAT16) {
auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>();
for (size_t i = 0; i < len; ++i) {
data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]);
}
void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
CNRT_CALL(
cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
CNRT_FLOAT32,
data_fp16,
CNRT_FLOAT16,
len,
nullptr));
CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(),
static_cast<void*>(data_fp16),
false));
......@@ -206,19 +280,23 @@ class Graph {
float scale,
cnmlDataType_t data_type = CNML_DATA_INT8) {
cnmlQuantizedParam_t quant_param;
CNML_CALL(
cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0));
int pos = scale2position(scale);
auto cnml_scale = pow(2, pos) * scale;
VLOG(5) << "[cnml quantized param] pos: " << pos
<< "\tscale: " << cnml_scale << std::endl;
CNML_CALL(cnmlCreateQuantizedParam(&quant_param, pos, cnml_scale, 0.0));
CNML_CALL(
cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param));
CNML_CALL(cnmlDestroyQuantizedParam(&quant_param));
}
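As a side note on the position/scale split above: scale2position returns floor(-log2(scale)), and the multiplier passed to cnmlCreateQuantizedParam is 2^pos * scale, so the original scale is recovered as cnml_scale * 2^-pos. A minimal standalone sketch of that arithmetic (plain C++, no CNML calls; 0.013f is just an illustrative scale):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  float scale = 0.013f;  // hypothetical quantization scale
  // mirrors scale2position() in bridges/utility.h
  int pos = static_cast<int>(std::floor(-std::log2(scale)));
  // residual multiplier handed to cnmlCreateQuantizedParam, always in (0.5, 1]
  float cnml_scale = std::pow(2.0f, pos) * scale;
  std::printf("pos=%d cnml_scale=%f recovered_scale=%f\n",
              pos, cnml_scale, cnml_scale / std::pow(2.0f, pos));
  return 0;
}
```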
void SetFPType(::paddle::lite_api::PrecisionType type) {
void SetFPType(paddle::lite_api::PrecisionType type) {
origin_fp_type_ = type;
switch (type) {
case ::paddle::lite_api::PrecisionType::kFP16:
case paddle::lite_api::PrecisionType::kFP16:
fp_type_ = CNML_DATA_FLOAT16;
break;
case ::paddle::lite_api::PrecisionType::kFloat:
case paddle::lite_api::PrecisionType::kFloat:
fp_type_ = CNML_DATA_FLOAT32;
break;
default:
......@@ -230,14 +308,14 @@ class Graph {
private:
cnmlDataType_t fp_type_{CNML_DATA_FLOAT32};
std::map<std::string, std::shared_ptr<MLUTensor>> nodes_;
paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)};
std::unordered_map<std::string, std::shared_ptr<MLUTensor>> nodes_;
std::vector<cnmlTensor_t> inputs_;
std::vector<cnmlTensor_t> outputs_;
std::vector<void*> input_addrs_;
std::vector<void*> output_addrs_;
std::vector<std::shared_ptr<MLUTensor>> input_tensors_;
std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
std::vector<cnmlBaseOp_t> ops_;
cnmlFusionOp_t fusion_op_;
std::vector<void*> const_data_storage_;
#if PRINT_HW_TIME
......
......@@ -85,6 +85,7 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
nn_param));
CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param));
graph->FuseOp(interp_op);
CNML_CALL(cnmlDestroyBaseOp(&interp_op));
return SUCCESS;
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int LayoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("Input").front();
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
std::shared_ptr<MLUTensor> output_tensor;
CHECK(graph->HasNode(x_var_name));
std::vector<int> axis;
auto x_tensor = graph->GetNode(x_var_name);
auto x_data_order = x_tensor->dorder();
auto x_dims = x->dims().Vectorize();
if (x_data_order == CNML_NCHW) {
switch (x_dims.size()) {
case 2:
axis = {0, 1};
break;
case 3:
axis = {0, 2, 1};
break;
case 4:
axis = {0, 2, 3, 1};
break;
case 5:
axis = {0, 2, 3, 4, 1};
break;
default:
CHECK(0) << "Unsupport shape";
}
output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, x_tensor->dtype());
VLOG(3) << "layout transpose nchw to nhwc" << std::endl;
} else {
switch (x_dims.size()) {
case 2:
axis = {0, 1};
break;
case 3:
axis = {0, 2, 1};
break;
case 4:
axis = {0, 3, 1, 2};
break;
case 5:
axis = {0, 4, 1, 2, 3};
break;
default:
CHECK(0) << "Unsupport shpae";
}
VLOG(3) << "layout transpose nhwc to nchw" << std::endl;
output_tensor = graph->AddNode(out_var_name,
output_dims,
CNML_TENSOR,
CNML_NCHW,
x_tensor->dtype(),
CNML_NCHW);
}
cnmlBaseOp_t layout_op;
cnmlNdTransposeOpParam_t transpose_param;
CNML_CALL(
cnmlCreateNdTransposeOpParam(&transpose_param, axis.data(), axis.size()));
CNML_CALL(cnmlCreateNdTransposeProOp(&layout_op,
x_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
transpose_param));
graph->FuseOp(layout_op);
CNML_CALL(cnmlDestroyBaseOp(&layout_op));
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(layout,
kMLU,
paddle::lite::subgraph::mlu::LayoutConverter);
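The permutation tables hard-coded in LayoutConverter above follow a simple pattern: NCHW to NHWC moves the channel axis to the end, and NHWC to NCHW moves it back behind the batch axis. A self-contained sketch (the helper names are illustrative, not part of the bridge) that reproduces the switch cases for ranks 2 through 5:

```cpp
#include <cstdio>
#include <vector>

// NCHW -> NHWC: keep batch, keep spatial order, move channel last.
std::vector<int> NchwToNhwcAxis(int rank) {
  std::vector<int> axis{0};
  for (int i = 2; i < rank; ++i) axis.push_back(i);
  if (rank > 1) axis.push_back(1);
  return axis;
}

// NHWC -> NCHW: keep batch, bring channel forward, then the spatial dims.
std::vector<int> NhwcToNchwAxis(int rank) {
  std::vector<int> axis{0};
  if (rank > 1) axis.push_back(rank - 1);
  for (int i = 1; i < rank - 1; ++i) axis.push_back(i);
  return axis;
}

int main() {
  for (int rank = 2; rank <= 5; ++rank) {
    std::printf("rank %d: nchw->nhwc {", rank);
    for (int a : NchwToNhwcAxis(rank)) std::printf(" %d", a);
    std::printf(" }  nhwc->nchw {");
    for (int a : NhwcToNchwAxis(rank)) std::printf(" %d", a);
    std::printf(" }\n");
  }
  return 0;
}
```

For rank 4 this prints {0, 2, 3, 1} and {0, 3, 1, 2}, matching the two branches above.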
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/layout_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void test_layout_NHWC2NCHW(std::vector<int64_t> input_shape) {
// prepare input&output variables
std::string x_var_name = "input";
std::string out_var_name = "out";
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
x->Resize(DDim(input_shape));
// initialize input&output data
FillTensor<float>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("layout");
opdesc.SetInput("Input", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
auto op = CreateOp<operators::LayoutOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
Tensor input;
input.Resize(DDim(input_shape));
switch (input_shape.size()) {
case 2:
transpose<float>(
x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]), static_cast<int>(input_shape[1])},
{0, 1});
break;
case 3:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[1])},
{0, 2, 1});
break;
case 4:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]),
static_cast<int>(input_shape[1])},
{0, 3, 1, 2});
break;
case 5:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]),
static_cast<int>(input_shape[4]),
static_cast<int>(input_shape[1])},
{0, 4, 1, 2, 3});
break;
default:
CHECK(0) << "Unsupport";
}
auto* x_data = input.mutable_data<float>();
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], x_data[i], 5e-4);
}
}
void test_layout_NCHW2NHWC(std::vector<int64_t> input_shape) {
// prepare input&output variables
std::string x_var_name = "input";
std::string out_var_name = "out";
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
x->Resize(DDim(input_shape));
// initialize input&output data
FillTensor<float>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("layout");
opdesc.SetInput("Input", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
auto op = CreateOp<operators::LayoutOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
Tensor input;
input.Resize(DDim(input_shape));
switch (input_shape.size()) {
case 2:
transpose<float>(
x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]), static_cast<int>(input_shape[1])},
{0, 1});
break;
case 3:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[1]),
static_cast<int>(input_shape[2])},
{0, 2, 1});
break;
case 4:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[1]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3])},
{0, 2, 3, 1});
break;
case 5:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[1]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]),
static_cast<int>(input_shape[4])},
{0, 2, 3, 4, 1});
break;
default:
CHECK(0) << "Unsupport";
}
auto* x_data = input.mutable_data<float>();
LaunchOp(op, {x_var_name}, {out_var_name}, CNML_NCHW);
// compare results
auto* out_data = out->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], x_data[i], 5e-4);
}
}
TEST(MLUBridges, layout) {
test_layout_NHWC2NCHW({12, 32, 4});
test_layout_NHWC2NCHW({12, 32, 44, 3});
test_layout_NHWC2NCHW({12, 32, 44, 3, 6});
test_layout_NCHW2NHWC({12, 32, 55});
test_layout_NCHW2NHWC({12, 32, 44, 3});
test_layout_NCHW2NHWC({12, 32, 44, 3, 8});
test_layout_NHWC2NCHW({12, 32});
test_layout_NCHW2NHWC({12, 32});
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(layout, kMLU);
......@@ -15,6 +15,7 @@
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kMLU);
USE_SUBGRAPH_BRIDGE(relu6, kMLU)
USE_SUBGRAPH_BRIDGE(conv2d, kMLU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU);
......@@ -26,3 +27,7 @@ USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU);
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU);
USE_SUBGRAPH_BRIDGE(concat, kMLU);
USE_SUBGRAPH_BRIDGE(scale, kMLU);
USE_SUBGRAPH_BRIDGE(sigmoid, kMLU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU);
USE_SUBGRAPH_BRIDGE(cast, kMLU);
USE_SUBGRAPH_BRIDGE(layout, kMLU);
......@@ -55,6 +55,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto global_pooling = op_info->GetAttr<bool>("global_pooling");
auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
auto strides = op_info->GetAttr<std::vector<int>>("strides");
CHECK(!(op_info->HasAttr("exclusive") &&
op_info->GetAttr<bool>("exclusive") == false))
<< "Unsupport param exclusive is false!";
if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) {
......@@ -62,8 +65,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
int pad_height = paddings[0];
int pad_width = paddings[2];
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
......@@ -72,6 +73,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (op_info->HasAttr("adaptive")) {
adaptive = op_info->GetAttr<bool>("adaptive");
}
auto input_dims = x->dims();
lite::operators::UpdatePadding(&paddings,
global_pooling,
adaptive,
......@@ -80,31 +83,31 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
strides,
ksize);
// std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
// for (size_t i = 0; i < 2; i++) {
// output_shape.push_back(
// (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] -
// ksize[0]) /
// strides[i] +
// 1);
// }
if (global_pooling) {
ksize.resize(static_cast<size_t>(input_dims.size()) - 2);
for (size_t i = 0; i < ksize.size(); ++i) {
ksize[i] = static_cast<int>(input_dims[i + 2]);
}
}
auto output_tensor = graph->AddNode(
output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlPoolOpParam_t pool_param;
CNML_CALL(
cnmlCreatePoolOpParam_V2(&pool_param,
cnmlCreatePoolOpParam_V3(&pool_param,
ksize[0],
ksize[1],
strides[0],
strides[1],
pad_height,
pad_width,
1, // dilation
1,
paddings[0],
paddings[1],
paddings[2],
paddings[3],
1, // dilation h
1, // dilation w
ToCnmlPoolMode(pooling_type),
ceil_mode ? CNML_POOL_KVALID : CNML_POOL_KFULL,
ceil_mode ? CNML_POOL_KFULL : CNML_POOL_KVALID,
true, /* real */
1 /* blend factor */));
cnmlBaseOp_t pool_op;
......@@ -114,6 +117,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
output_tensor->mlu_tensor()));
CNML_CALL(cnmlDestroyPoolOpParam(&pool_param));
graph->FuseOp(pool_op);
CNML_CALL(cnmlDestroyBaseOp(&pool_op));
return SUCCESS;
}
......
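For reference, the attribute normalization done by PoolConverter above can be exercised in isolation. This sketch (plain C++, no CNML, made-up dims) shows how a 2-element [pad_h, pad_w] attribute is duplicated into the 4-element [top, bottom, left, right] list passed to cnmlCreatePoolOpParam_V3 (before UpdatePadding applies the padding algorithm), and how global pooling overrides ksize with the spatial input sizes:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> paddings{1, 2};  // [pad_h, pad_w] from the op attribute
  if (paddings.size() == 2) {
    for (size_t i = 0; i < 2; ++i) {
      int copy_pad = paddings[2 * i];
      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
    }
  }
  std::vector<int64_t> input_dims{1, 16, 32, 32};  // hypothetical NCHW input
  std::vector<int> ksize{3, 3};
  bool global_pooling = true;
  if (global_pooling) {
    ksize.resize(input_dims.size() - 2);
    for (size_t i = 0; i < ksize.size(); ++i) {
      ksize[i] = static_cast<int>(input_dims[i + 2]);
    }
  }
  std::printf("paddings: %d %d %d %d  ksize: %d %d\n",
              paddings[0], paddings[1], paddings[2], paddings[3],
              ksize[0], ksize[1]);  // prints: 1 1 2 2 / 32 32
  return 0;
}
```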
......@@ -43,6 +43,12 @@ void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
std::string pooling_type = op_info->GetAttr<std::string>("pooling_type");
bool global_pooling = op_info->GetAttr<bool>("global_pooling");
if (pooling_type == "max") {
for (int i = 0; i < out_dims.production(); ++i) {
dst_ptr[i] = -65504.f;
}
}
int in_n = in_dims[0];
int in_c = in_dims[1];
int in_h = in_dims[2];
......@@ -203,62 +209,46 @@ void test_pool(int bs,
}
TEST(MLUBridges, pool) {
// for (auto pooling_type : {"max", "avg"}) {
// for (auto ceil_mode : {true, false}) {
// for (auto global_pooling : {/*true, */ false}) {
// for (auto exclusive : {true /*, false*/}) {
// for (auto ksize : {2, 3}) {
// for (auto stride : {1, 2}) {
// for (auto padding : {0, 1}) {
// for (auto bs : {1, 3}) {
// for (auto ic : {1, 3}) {
// for (auto ih : {3, 7}) {
// for (auto iw : {3, 7}) {
// test_pool(bs,
// ic,
// ih,
// iw,
// pooling_type,
// ceil_mode,
// global_pooling,
// exclusive,
// ksize,
// stride,
// padding);
// }
// }
// }
// }
// }
// }
// }
// }
// }
// }
// }
for (auto pooling_type : {"max", "avg"}) {
for (auto ceil_mode : {true, false}) {
bool global_pooling = false;
bool exclusive = true;
int ksize = 2;
int stride = 1;
int padding = 0;
int bs = 6;
int ic = 6;
int ih = 6;
int iw = 6;
test_pool(bs,
ic,
ih,
iw,
pooling_type,
ceil_mode,
global_pooling,
exclusive,
ksize,
stride,
padding);
for (auto global_pooling : {true, false}) {
for (auto exclusive : {true /*, false*/}) {
for (auto ksize : {2, 3}) {
for (auto stride : {1, 2}) {
for (auto padding : {0, 1}) {
for (auto bs : {1, 3}) {
for (auto ic : {1, 3}) {
for (auto ih : {3, 7}) {
for (auto iw : {3, 7}) {
LOG(INFO)
<< "shape: " << bs << ',' << ic << ',' << ih << ','
<< iw << '\t' << "pooling type: " << pooling_type
<< '\t' << "ceil model: " << ceil_mode << '\t'
<< "global_pooling: " << global_pooling << '\t'
<< "exclusive: " << exclusive << '\t'
<< "ksize: " << ksize << '\t'
<< "stride: " << stride << '\t'
<< "padding: " << padding;
test_pool(bs,
ic,
ih,
iw,
pooling_type,
ceil_mode,
global_pooling,
exclusive,
ksize,
stride,
padding);
}
}
}
}
}
}
}
}
}
}
}
}
......
......@@ -61,6 +61,7 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
alpha_tensor->mlu_tensor(),
beta_tensor->mlu_tensor()));
graph->FuseOp(scale_op);
CNML_CALL(cnmlDestroyBaseOp(&scale_op));
return SUCCESS;
}
......
......@@ -35,9 +35,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto x_shape =
scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims().Vectorize();
// nchw axis to nhwc aixs
int nchw_to_nhwc_aixs_map[4] = {0, 3, 1, 2};
// nchw axis to nhwc axis
int axis = 1;
if (op_info->HasAttr("axis")) {
axis = op_info->GetAttr<int>("axis");
......@@ -45,7 +46,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis = output_dims.size() + axis;
}
}
int nhwc_axis = nchw_to_nhwc_aixs_map[axis];
// the values of nhwc2nchw_axis are indices in nhwc order,
// while its positions are ordered as nchw
int nhwc_axis = GetAxisNHWC2NCHW<int>(x_shape.size())[axis];
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
......@@ -55,6 +58,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
graph->GetNode(x_var_name)->mlu_tensor(),
output_tensor->mlu_tensor()));
graph->FuseOp(softmax_op);
CNML_CALL(cnmlDestroyBaseOp(&softmax_op));
return SUCCESS;
}
......
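The axis remapping used by SoftmaxConverter above can be illustrated without CNML: a possibly negative NCHW axis is first normalized, then looked up in the NHWC-to-NCHW index table (the same rule as GetAxisNHWC2NCHW in bridges/utility.h). A minimal sketch, with a made-up rank and axis:

```cpp
#include <cstdio>
#include <vector>

// Same rule as GetAxisNHWC2NCHW: indexed by the NCHW axis, the value is the
// corresponding axis in an NHWC-ordered tensor.
std::vector<int> AxisNHWC2NCHW(int rank) {
  std::vector<int> m(rank);
  m[0] = 0;
  if (rank > 1) m[1] = rank - 1;
  for (int i = 2; i < rank; ++i) m[i] = i - 1;
  return m;
}

int main() {
  int rank = 4;
  int axis = -1;                      // hypothetical "axis" attribute
  if (axis < 0) axis = rank + axis;   // normalize negative axis -> 3
  int nhwc_axis = AxisNHWC2NCHW(rank)[axis];
  std::printf("nchw axis %d maps to nhwc axis %d\n", axis, nhwc_axis);
  return 0;
}
```

For rank 4 the table is {0, 3, 1, 2}, so softmax over the last NCHW axis (W) runs over NHWC axis 2.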
......@@ -93,7 +93,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
// create and convert op to NPU model, then run it on NPU
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::SoftmaxOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
softmax_ref<float>(op);
......
......@@ -16,6 +16,9 @@
#include <glog/logging.h>
#include <algorithm>
#include <climits>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
namespace paddle {
......@@ -25,8 +28,9 @@ namespace mlu {
MLUTensor::MLUTensor(const std::vector<int64_t>& shape,
cnmlTensorType_t tensor_type,
cnmlDataOrder_t data_order,
cnmlDataType_t mlu_dtype)
cnmlDataOrder_t shape_order,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t data_order)
: mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) {
std::vector<int> int_shape;
for (auto i : shape) {
......@@ -36,15 +40,18 @@ MLUTensor::MLUTensor(const std::vector<int64_t>& shape,
LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!";
}
}
remember(int_shape, tensor_type, mlu_dtype, data_order);
remember(int_shape, tensor_type, mlu_dtype, shape_order, data_order);
}
void MLUTensor::remember(const std::vector<int>& shape,
cnmlTensorType_t tensor_type,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t shape_order) {
cnmlDataOrder_t shape_order,
cnmlDataOrder_t data_order) {
tensor_type_ = tensor_type;
mlu_dtype_ = mlu_dtype;
data_order_ = data_order;
origin_shape_.assign(shape.begin(), shape.end());
int size = 4;
if (shape.size() > 4 || shape_order == CNML_ARRAY) {
......@@ -239,13 +246,22 @@ void MLUTensor::remember(const std::vector<int>& shape,
break;
}
}
dim_ = shape_.size();
auto shape_NCHW = DimNHWC2NCHW(shape_);
shape_NCHW.erase(shape_NCHW.begin() + shape.size(), shape_NCHW.end());
dim_ = shape_NCHW.size();
shape_ = DimNCHW2NHWC(shape_NCHW);
}
void MLUTensor::Create() {
if (mlu_tensor_ == nullptr) {
CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_));
std::vector<int> dim_shape(shape_);
if (data_order_ == CNML_NCHW) {
std::transform(origin_shape_.cbegin(),
origin_shape_.cend(),
dim_shape.begin(),
[](DDim::value_type in) { return static_cast<int>(in); });
}
int* dim_strides = nullptr;
CNML_CALL(cnmlSetTensorShape_V2(
mlu_tensor_, dim_, dim_shape.data(), dim_strides));
......@@ -258,6 +274,84 @@ cnmlTensor_t MLUTensor::mlu_tensor() {
return mlu_tensor_;
}
void MLUTensor::ToFile(std::string file_name) {
if (mlu_ptr_) {
VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name;
int count = 1;
for (size_t i = 0; i < shape_.size(); i++) {
count *= shape_[i];
}
VLOG(6) << " dump count: " << count;
VLOG(6) << " dump shape: ";
for (size_t i = 0; i < shape_.size(); i++) {
VLOG(6) << shape_[i] << " ";
}
std::vector<float> cpu_data_fp32(count);
// fp16 to fp32
if (mlu_dtype_ == CNML_DATA_FLOAT16) {
VLOG(6) << " convert fp16 to fp32 ";
std::vector<uint16_t> cpu_data_fp16(count);
cnrtMemcpy(cpu_data_fp16.data(),
mlu_ptr_,
count * sizeof(uint16_t),
CNRT_MEM_TRANS_DIR_DEV2HOST);
for (int i = 0; i < count; i++) {
cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]);
}
} else {
cnrtMemcpy(cpu_data_fp32.data(),
mlu_ptr_,
count * sizeof(float),
CNRT_MEM_TRANS_DIR_DEV2HOST);
}
// trans to nchw
std::vector<float> cpu_data_trans(count);
if (data_order_ != CNML_NCHW) {
switch (shape_.size()) {
case 4:
transpose(cpu_data_fp32.data(),
cpu_data_trans.data(),
shape_,
{0, 3, 1, 2});
break;
case 3:
transpose(
cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 2, 1});
break;
case 2:
transpose(
cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 1});
break;
case 1:
transpose(cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0});
break;
default:
CHECK(0) << "ToFile only support dim <=4";
break;
}
}
// to file
std::ostringstream outs;
for (int i = 0; i < count; i++) {
if (data_order_ == CNML_NCHW) {
outs << cpu_data_fp32[i] << std::endl;
} else {
outs << cpu_data_trans[i] << std::endl;
}
}
std::ofstream of;
of.open(file_name, std::ios::out);
of << outs.str();
of.close();
} else {
LOG(FATAL) << "mlu ptr is null ,can not dump mlu content to : "
<< file_name;
}
}
MLUTensor::~MLUTensor() {
if (mlu_tensor_ != nullptr) {
CNML_CALL(cnmlDestroyTensor(&mlu_tensor_));
......
......@@ -14,6 +14,8 @@
#pragma once
#include <fstream>
#include <string>
#include <vector>
#include "lite/kernels/mlu/bridges/utility.h"
......@@ -33,13 +35,15 @@ class MLUTensor {
MLUTensor(const std::vector<int64_t>& shape,
cnmlTensorType_t tensor_type = CNML_TENSOR,
cnmlDataOrder_t data_order = CNML_NCHW,
cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32);
cnmlDataOrder_t shape_order = CNML_NCHW,
cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32,
cnmlDataOrder_t data_order = CNML_NHWC);
void remember(const std::vector<int>& shape,
cnmlTensorType_t tensor_type,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t shape_order);
cnmlDataOrder_t shape_order,
cnmlDataOrder_t data_order);
void Create();
cnmlTensor_t mlu_tensor();
void* mlu_data() {
......@@ -47,14 +51,21 @@ class MLUTensor {
return mlu_ptr_;
}
cnmlDataType_t dtype() { return mlu_dtype_; }
void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
const std::vector<int64_t>& get_origin_shape() const { return origin_shape_; }
~MLUTensor();
void ToFile(std::string file_name);
cnmlDataOrder_t dorder() { return data_order_; }
private:
cnmlTensor_t mlu_tensor_;
std::vector<int> shape_;
std::vector<int64_t> origin_shape_;
cnmlTensorType_t tensor_type_;
cnmlDataType_t mlu_dtype_;
int dim_{0};
......
......@@ -24,18 +24,38 @@ namespace lite {
namespace subgraph {
namespace mlu {
template <lite_api::PrecisionType Dtype>
void PrepareInput(Graph* graph,
const std::string& input_name,
Tensor* input_tensor,
cnmlDataOrder_t order) {
thread_local Tensor temp_input;
temp_input.Resize(input_tensor->dims().Vectorize());
temp_input.CopyDataFrom(*input_tensor);
using data_type = typename MLUTypeTraits<Dtype>::type;
auto input_node = graph->AddNode(
input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
MLUTypeTraits<Dtype>::cnml_type,
order,
reinterpret_cast<void*>(
input_tensor->template mutable_data<data_type>(TARGET(kMLU))));
CHECK(input_node);
CNRT_CHECK(cnrtMemcpy(input_tensor->template mutable_data<data_type>(),
temp_input.mutable_data<data_type>(),
sizeof(data_type) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV));
}
void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names) {
const std::vector<std::string>& output_var_names,
cnmlDataOrder_t order) {
CNRT_CALL(cnrtInit(0));
::paddle::lite::SetMluDevice(0);
lite::SetMluDevice(0);
cnrtQueue_t queue_;
cnrtInvokeFuncParam_t forward_param;
u32_t affinity = 1;
int data_param = 1;
forward_param.data_parallelism = &data_param;
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
CNRT_CALL(cnrtCreateQueue(&queue_));
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
......@@ -50,23 +70,21 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
// Convert input data var and add it into the MLU IR graph
for (auto& input_name : input_var_names) {
auto input_tensor = scope->FindMutableTensor(input_name);
CHECK(input_tensor);
Tensor temp_input;
temp_input.Resize(input_tensor->dims().Vectorize());
temp_input.CopyDataFrom(*input_tensor);
auto input_node =
graph.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph.FPType(),
reinterpret_cast<void*>(
input_tensor->mutable_data<float>(TARGET(kMLU))));
CHECK(input_node);
CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data<float>(),
temp_input.mutable_data<float>(),
sizeof(float) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV));
auto data_type = input_tensor->precision();
switch (data_type) {
#define PREPARE_INPUT(type__) \
case PRECISION(type__): \
PrepareInput<PRECISION(type__)>(&graph, input_name, input_tensor, order); \
break;
PREPARE_INPUT(kFP16)
PREPARE_INPUT(kFloat)
PREPARE_INPUT(kInt8)
PREPARE_INPUT(kInt32)
#undef PREPARE_INPUT
default:
CHECK(0);
}
}
op->CheckShape();
op->InferShape();
......@@ -89,8 +107,9 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
}
graph.Compile(CNML_MLU270, 1);
graph.Compute(queue_, *(graph.MutableInputs()), *(graph.MutableOutputs()));
CNRT_CALL(cnrtSyncQueue(queue_));
graph.Compute(forward_param, queue_);
for (auto& output_name : output_var_names) {
auto output_tensor = scope->FindMutableTensor(output_name);
Tensor temp_out;
......
......@@ -58,7 +58,8 @@ void FillTensor(Tensor* x,
void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names);
const std::vector<std::string>& output_var_names,
cnmlDataOrder_t order = CNML_NHWC);
} // namespace mlu
} // namespace subgraph
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/kernels/mlu/bridges/utility.h"
#include <utility>
namespace paddle {
......@@ -20,33 +21,21 @@ namespace lite {
namespace subgraph {
namespace mlu {
void transpose(float* input_data,
float* output_data,
std::vector<int> input_shape,
std::vector<int> axis) {
void transpose2d(float* input_data,
float* output_data,
std::vector<int> input_shape) {
CHECK_EQ(input_shape.size(), 2);
int old_index = -1;
int new_index = -1;
int dim[4] = {0};
std::vector<int> shape = input_shape;
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) {
old_index = dim[0] * shape[1] * shape[2] * shape[3] +
dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3];
new_index =
dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
output_data[new_index] = input_data[old_index];
}
}
for (int i = 0; i < input_shape[0]; i++) {
for (int j = 0; j < input_shape[1]; j++) {
old_index = i * input_shape[1] + j;
new_index = j * input_shape[0] + i;
output_data[new_index] = input_data[old_index];
}
}
}
int scale2position(float scale) { return static_cast<int>(-std::log2(scale)); }
void dequant(float* dst, int8_t* src, size_t size, float scale) {
for (size_t i = 0; i < size; ++i) {
dst[i] = static_cast<float>(src[i]) * scale;
......
......@@ -16,24 +16,76 @@
#include <cnml.h>
#include <cnrt.h>
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/mlu/mlu_utils.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "lite/fluid/data_type.h"
#include "lite/fluid/float16.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void transpose(float* input_data,
float* output_data,
void transpose2d(float* input_data,
float* output_data,
std::vector<int> input_shape);
template <typename dtype>
void transpose(dtype* input_data,
dtype* output_data,
std::vector<int> input_shape,
std::vector<int> axis);
int scale2position(float scale);
std::vector<int> axis) {
int old_index = -1;
int new_index = -1;
std::vector<int> shape;
std::vector<int> expand_axis;
if (input_shape.size() < 5u) {
for (size_t i = 0; i < 5 - input_shape.size(); i++) {
shape.push_back(1);
expand_axis.push_back(i);
}
for (size_t i = 0; i < input_shape.size(); i++) {
shape.push_back(input_shape[i]);
expand_axis.push_back(axis[i] + 5 - input_shape.size());
}
} else {
shape = input_shape;
expand_axis = axis;
}
int dim[5] = {0};
for (dim[0] = 0; dim[0] < shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < shape[2]; dim[2]++) {
for (dim[3] = 0; dim[3] < shape[3]; dim[3]++) {
for (dim[4] = 0; dim[4] < shape[4]; dim[4]++) {
old_index = dim[0] * shape[1] * shape[2] * shape[3] * shape[4] +
dim[1] * shape[2] * shape[3] * shape[4] +
dim[2] * shape[3] * shape[4] + dim[3] * shape[4] +
dim[4];
new_index = dim[expand_axis[0]] * shape[expand_axis[1]] *
shape[expand_axis[2]] * shape[expand_axis[3]] *
shape[expand_axis[4]] +
dim[expand_axis[1]] * shape[expand_axis[2]] *
shape[expand_axis[3]] * shape[expand_axis[4]] +
dim[expand_axis[2]] * shape[expand_axis[3]] *
shape[expand_axis[4]] +
dim[expand_axis[3]] * shape[expand_axis[4]] +
dim[expand_axis[4]];
output_data[new_index] = input_data[old_index];
}
}
}
}
}
}
inline int scale2position(float scale) { return std::floor(-std::log2(scale)); }
void dequant(float* dst, int8_t* src, size_t size, float scale);
void dequant(float* dst,
......@@ -64,27 +116,94 @@ inline const ::paddle::lite::DDimLite DimNCHW2NHWC(
std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]}));
}
inline const std::vector<int64_t> DimNHWC2NCHW(
const std::vector<int64_t>& dim) {
return std::vector<int64_t>({dim[0], dim[3], dim[1], dim[2]});
template <typename data_type>
inline const std::vector<data_type> DimNHWC2NCHW(
const std::vector<data_type>& dim) {
switch (dim.size()) {
case 1:
return dim;
case 2:
return dim;
case 3:
return std::vector<data_type>({dim[0], dim[2], dim[1]});
case 4:
return std::vector<data_type>({dim[0], dim[3], dim[1], dim[2]});
case 5:
return std::vector<data_type>({dim[0], dim[4], dim[1], dim[2], dim[3]});
default:
CHECK(0) << "unsupport dimension";
}
}
template <typename data_type>
inline const std::vector<data_type> DimNCHW2NHWC(
const std::vector<data_type>& dim) {
switch (dim.size()) {
case 1:
return dim;
case 2:
return dim;
case 3:
return std::vector<data_type>({dim[0], dim[2], dim[1]});
case 4:
return std::vector<data_type>({dim[0], dim[2], dim[3], dim[1]});
case 5:
return std::vector<data_type>({dim[0], dim[2], dim[3], dim[4], dim[1]});
default:
CHECK(0) << "unsupport dimension";
}
}
inline const std::vector<int64_t> DimNCHW2NHWC(
const std::vector<int64_t>& dim) {
return std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]});
template <typename data_type>
inline std::vector<data_type> GetAxisNHWC2NCHW(size_t n_dims) {
std::vector<data_type> nhwc2nchw_axis(n_dims);
nhwc2nchw_axis[0] = 0;
if (n_dims > 1) nhwc2nchw_axis[1] = n_dims - 1;
for (size_t i = 2; i < n_dims; ++i) {
nhwc2nchw_axis[i] = i - 1;
}
return nhwc2nchw_axis;
}
template <typename data_type>
inline std::vector<data_type> GetAxisNCHW2NHWC(size_t n_dims) {
std::vector<data_type> nchw2nhwc_axis(n_dims);
nchw2nhwc_axis[0] = 0;
for (size_t i = 1; i < n_dims - 1; ++i) {
nchw2nhwc_axis[i] = i + 1;
}
if (n_dims > 1) nchw2nhwc_axis[n_dims - 1] = 1;
return nchw2nhwc_axis;
}
template <paddle::lite_api::PrecisionType>
struct FPTypeTraits {};
struct MLUTypeTraits {
/* using type = void; */
/* static constexpr cnmlDataType_t cnml_type = CNML_DATA_INVALID; */
};
template <>
struct MLUTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
using type = float;
static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT32;
};
template <>
struct MLUTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
using type = paddle::lite::fluid::float16;
static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT16;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
typedef float T;
struct MLUTypeTraits<paddle::lite_api::PrecisionType::kInt8> {
using type = int8_t;
static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT8;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef paddle::lite::fluid::float16 T;
struct MLUTypeTraits<paddle::lite_api::PrecisionType::kInt32> {
using type = int32_t;
static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT32;
};
} // namespace mlu
......
......@@ -41,6 +41,9 @@ class IoCopyHostToMluCompute
auto mem_size = param.x->memory_size();
// LOG(INFO) << "copy size " << mem_size;
auto* data = param.y->mutable_data(TARGET(kMLU), mem_size);
VLOG(6) << "io_copy host to mlu] memory size: " << mem_size
<< " precision type: " << PrecisionToStr(Precision);
param.y->set_precision(param.x->precision());
CopyFromHostSync(data, param.x->raw_data(), mem_size);
}
......@@ -79,6 +82,13 @@ class IoCopyMluToHostCompute
CHECK(param.x->target() == TARGET(kMLU));
auto mem_size = param.x->memory_size();
auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
VLOG(6) << "io_copy mlu to host] memory size: " << mem_size
<< " precision type: " << PrecisionToStr(Precision);
// sync queue to ensure process done
auto& mlu_context = this->ctx_->template As<MLUContext>();
CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue()));
CopyToHostSync(data, param.x->raw_data(), mem_size);
}
......@@ -97,8 +107,14 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFloat)>,
host_to_device_kFloat)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
......@@ -108,8 +124,31 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFP16)>,
host_to_device_kFP16)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kFP16),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
io_copy,
kMLU,
kInt32,
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kInt32)>,
host_to_device_kInt32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt32),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kInt32),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
......@@ -119,8 +158,14 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFloat)>,
device_to_host_kFloat)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
......@@ -130,6 +175,29 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFP16)>,
device_to_host_kFP16)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kFP16),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
io_copy,
kMLU,
kInt8,
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kInt8)>,
host_to_device_to_kInt8)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kInt8),
DATALAYOUT(kAny))})
.Finalize();
......@@ -24,9 +24,9 @@ namespace mlu {} // namespace mlu
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kFloat,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFloat)>,
def_layout_nhwc2nchw_fp32)
.BindInput("Input",
......@@ -41,9 +41,9 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kFP16,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFP16)>,
def_layout_nhwc2nchw_fp16)
.BindInput("Input",
......@@ -58,9 +58,9 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kFloat,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFloat)>,
def_layout_nchw2nhwc_fp32)
.BindInput("Input",
......@@ -75,9 +75,9 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kFP16,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFP16)>,
def_layout_nchw2nhwc_fp16)
.BindInput("Input",
......@@ -92,11 +92,11 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kInt8,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
def_layout_nchw2nhwc_fp32_int8)
def_layout_nchw2nhwc_int8)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
......
......@@ -22,6 +22,7 @@
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/operators/layout_op.h"
namespace paddle {
......@@ -29,24 +30,6 @@ namespace lite {
namespace kernels {
namespace mlu {
template <paddle::lite_api::PrecisionType>
struct FPTypeTraits {};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
typedef float T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef paddle::lite::fluid::float16 T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kInt8> {
typedef int8_t T;
};
template <lite::TargetType Target, typename T>
inline void LayoutTransCompute(const int dim,
const lite::Context<Target>& context,
......@@ -73,7 +56,7 @@ inline void LayoutTransCompute(const int dim,
template <PrecisionType Precision>
class LayoutNchwToNhwcCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
: public KernelLite<TARGET(kX86), Precision, DATALAYOUT(kNCHW)> {
public:
using param_t = operators::LayoutParam;
......@@ -81,36 +64,37 @@ class LayoutNchwToNhwcCompute
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
out->template mutable_data<
typename subgraph::mlu::MLUTypeTraits<Precision>::type>();
auto x_ndims = param.x->dims().size();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
std::vector<int> axis;
switch (x_dims) {
switch (x_ndims) {
case 2:
axis = {0, 1};
break;
case 3:
axis = {0, 2, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
origin_dims[0], origin_dims[2], origin_dims[1]});
break;
case 4:
axis = {0, 2, 3, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
origin_dims[0], origin_dims[2], origin_dims[3], origin_dims[1]});
break;
default:
CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
typename subgraph::mlu::MLUTypeTraits<Precision>::type>(
x_ndims, context, *x, out, axis);
if (x_dims > 2) {
if (x_ndims > 2) {
out->Resize(origin_dims);
}
}
......@@ -122,7 +106,7 @@ class LayoutNchwToNhwcCompute
template <PrecisionType Precision>
class LayoutNhwcToNchwCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
: public KernelLite<TARGET(kX86), Precision, DATALAYOUT(kNCHW)> {
public:
using param_t = operators::LayoutParam;
......@@ -130,25 +114,27 @@ class LayoutNhwcToNchwCompute
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
out->template mutable_data<
typename subgraph::mlu::MLUTypeTraits<Precision>::type>();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
TensorLite tmp_t;
tmp_t.ShareDataWith(*x);
const auto x_dims = x->dims().Vectorize();
auto x_ndims = param.x->dims().size();
std::vector<int> axis;
switch (x_dims) {
switch (x_ndims) {
case 2:
axis = {0, 1};
break;
case 3:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
tmp_t.Resize(std::vector<int64_t>{x_dims[0], x_dims[2], x_dims[1]});
axis = {0, 2, 1};
break;
case 4:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]});
tmp_t.Resize(
std::vector<int64_t>{x_dims[0], x_dims[2], x_dims[3], x_dims[1]});
axis = {0, 3, 1, 2};
break;
default:
......@@ -156,12 +142,8 @@ class LayoutNhwcToNchwCompute
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
if (x_dims > 2) {
out->Resize(origin_dims);
}
typename subgraph::mlu::MLUTypeTraits<Precision>::type>(
x_ndims, context, tmp_t, out, axis);
}
std::string doc() const override {
......
......@@ -36,8 +36,14 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFloat)>,
def_kFloat)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
......@@ -47,6 +53,12 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFP16)>,
def_FP16)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
......@@ -14,17 +14,24 @@
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
#include "lite/core/types.h"
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/tensor.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/utils/env.h"
namespace paddle {
namespace lite {
......@@ -40,10 +47,19 @@ class SubgraphEngine : public subgraph::Engine {
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
Scope* scope,
::paddle::lite_api::PrecisionType type)
paddle::lite_api::PrecisionType type)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {
graph_.SetFPType(type);
ctx, block_idx, block_desc, input_names, output_names, scope),
fp_type_(type) {
VLOG(4) << "[MLU] PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL is "
<< GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL");
VLOG(4) << "[MLU] PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE is "
<< GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE");
VLOG(4) << "[MLU] LITE_DISABLE_MLU_CAST is "
<< GetBoolFromEnv("LITE_DISABLE_MLU_CAST");
if (GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE")) {
disable_batch_size_changeable_ = true;
}
}
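The constructor above reads its feature switches through GetBoolFromEnv (from lite/utils/env.h). As a rough stand-in, since the real helper's exact parsing rules are not shown here, an environment-flag check might look like the following; treat it only as an illustration of the intended behavior:

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

// Hypothetical replacement for GetBoolFromEnv: unset or "0"/"false" disables
// the feature, anything else enables it.
bool BoolFromEnv(const char* name, bool default_value = false) {
  const char* raw = std::getenv(name);
  if (raw == nullptr) return default_value;
  std::string v(raw);
  return !(v.empty() || v == "0" || v == "false" || v == "FALSE");
}

int main() {
  std::printf("batch-size changeable disabled: %d\n",
              BoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE"));
  return 0;
}
```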
int Build() {
......@@ -72,24 +88,97 @@ class SubgraphEngine : public subgraph::Engine {
return 0;
}
bool InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
// used in the batch-size changeable situation
std::vector<std::vector<int64_t>> all_shape;
for (auto origin_itensor : origin_itensors_) {
if (!disable_batch_size_changeable_) {
auto iv = origin_itensor->dims().Vectorize();
all_shape.push_back(iv);
iv.erase(iv.begin());
new_shape.push_back(iv);
} else {
new_shape.push_back(origin_itensor->dims().Vectorize());
}
}
inputs_shape_ = new_shape;
all_inputs_shape_ = all_shape;
if (shape_graph_map_.count(inputs_shape_) > 0) {
return false;
}
VLOG(3) << "MLU graph input shape changed" << std::endl;
return true;
}
inline cnmlDataType_t PrecisionToDatatype(PrecisionType data_type) {
switch (data_type) {
case paddle::lite_api::PrecisionType::kFP16:
return CNML_DATA_FLOAT16;
case paddle::lite_api::PrecisionType::kFloat:
return CNML_DATA_FLOAT32;
case paddle::lite_api::PrecisionType::kInt32:
return CNML_DATA_INT32;
case paddle::lite_api::PrecisionType::kInt8:
return CNML_DATA_UINT8;
default:
return PrecisionToDatatype(fp_type_);
}
}
protected:
int BuildDeviceProgram() override {
if (!error_compile_batch_size_changeable_ &&
!disable_batch_size_changeable_) {
int status = BuildDeviceProgramImpl();
if (subgraph::CHECK_SUCCESS(status)) {
return status;
}
LOG(INFO) << "[MLU] build batch_size changeable subgraph op failed, "
"changed to input_shape changeable";
}
error_compile_batch_size_changeable_ = true;
disable_batch_size_changeable_ = true;
return BuildDeviceProgramImpl();
}
int BuildDeviceProgramImpl() {
int status = 0;
auto graph = std::make_shared<paddle::lite::subgraph::mlu::Graph>();
graph->SetFPType(fp_type_);
std::vector<std::vector<int64_t>> new_shape;
origin_itensors_.clear();
origin_otensors_.clear();
auto data_order = block_desc_->GetOp<cpp::OpDesc>(0)->Type() == "layout"
? CNML_NCHW
: CNML_NHWC;
// Convert all input data vars and add them into the MLU IR graph
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
auto data_type = input_tensor->precision();
cnmlDataType_t fp_type = PrecisionToDatatype(data_type);
origin_itensors_.push_back(input_tensor);
if (!disable_batch_size_changeable_) {
auto iv = input_tensor->dims().Vectorize();
iv.erase(iv.begin());
new_shape.push_back(iv);
} else {
new_shape.push_back(input_tensor->dims().Vectorize());
}
CHECK(input_tensor);
auto input_node =
graph_.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph_.FPType(),
const_cast<void*>(input_tensor->raw_data()));
VLOG(4) << "subgraph input tensor " << input_name << std::endl;
auto input_node = graph->AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
fp_type,
data_order);
CHECK(input_node);
// MLU doesn't support dynamic dimensions/shapes, so need to rebuild
// the program when the shape of any input tensor is changed.
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
}
LOG(INFO) << "START TO CONVERT ";
// Convert all ops and their weights and add them into the MLU IR graph
......@@ -98,6 +187,18 @@ class SubgraphEngine : public subgraph::Engine {
auto op = inst.op();
CHECK(op);
std::string op_type = op->op_info()->Type();
// since cnml's compile api does not report errors for now, we simply check
// the op's type
if (!disable_batch_size_changeable_ &&
std::find(unsupport_batch_size_changeable_op_type_.begin(),
unsupport_batch_size_changeable_op_type_.end(),
op_type) !=
unsupport_batch_size_changeable_op_type_.end()) {
status |= subgraph::FAILED;
VLOG(4) << "[MLU] found unsupported batch_size changeable op type: "
<< op_type;
return status;
}
op->CheckShape();
const_cast<OpLite*>(op)->InferShape();
if (!bridges.Exists(op_type, TARGET(kMLU))) {
......@@ -106,7 +207,7 @@ class SubgraphEngine : public subgraph::Engine {
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kMLU))(
reinterpret_cast<void*>(&graph_),
reinterpret_cast<void*>(graph.get()),
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
......@@ -115,46 +216,272 @@ class SubgraphEngine : public subgraph::Engine {
}
// Obtain the output nodes of the MLU IR graph and build the graph to MLU
// runtime
std::vector<std::string> valid_output_names;
for (auto& output_name : output_names_) {
if (graph_.HasNode(output_name)) {
graph_.AddOutput(graph_.GetNode(output_name));
if (graph->HasNode(output_name)) {
graph->AddOutput(graph->GetNode(output_name));
auto output_tensor = scope_->FindMutableTensor(output_name);
void* p_data = static_cast<void*>(
output_tensor->mutable_data<typename ::paddle::lite::subgraph::mlu::
FPTypeTraits<Precision>::T>(
TARGET(kMLU)));
auto node = graph_.GetNode(output_name);
CHECK(p_data);
node->set_mlu_ptr(p_data);
valid_output_names.push_back(output_name);
origin_otensors_.push_back(output_tensor);
VLOG(4) << "subgraph output tensor " << output_name << std::endl;
// auto node = graph->GetNode(output_name);
// CHECK(p_data);
// node->set_mlu_ptr(p_data);
}
}
for (auto& input_name : input_names_) {
graph_.AddInput(graph_.GetNode(input_name));
graph->AddInput(graph->GetNode(input_name),
disable_batch_size_changeable_);
}
CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names";
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto core_version = mlu_context.MLUCoreVersion();
auto core_number = mlu_context.MLUCoreNumber();
graph_.Compile(core_version, core_number);
graph->Compile(core_version, core_number);
shape_graph_map_[new_shape] = graph;
if (GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL")) {
graph->GenOfflineModel(GetOfflineModName());
}
return status;
}
std::string TrimStrings(const std::string& origin_str) {
std::string str = origin_str;
std::size_t found = str.find("0x");
std::size_t found_end = 0;
const std::vector<std::string> del_strs = {
"/trans_io_copy", "/trans_cast", "/trans_layout"};
for (const auto& iterm : del_strs) {
found_end = str.find(iterm);
// trim the pointer address and one of the del_strs
if (found != std::string::npos && found_end != std::string::npos) {
str.replace(found, found_end - found, "");
found_end = str.find(iterm);
str.replace(found_end, iterm.size(), "");
break;
}
}
return str;
}
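To see what TrimStrings removes, here is the same logic as a standalone program run on a hypothetical variable name (the name, the hexadecimal suffix, and the "/trans_cast" infix are all made up for illustration): the pointer-address substring starting at "0x" is stripped first, then the matching del_str itself.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Standalone copy of the TrimStrings logic above, for demonstration only.
std::string TrimStrings(const std::string& origin_str) {
  std::string str = origin_str;
  std::size_t found = str.find("0x");
  std::size_t found_end = 0;
  const std::vector<std::string> del_strs = {
      "/trans_io_copy", "/trans_cast", "/trans_layout"};
  for (const auto& iterm : del_strs) {
    found_end = str.find(iterm);
    if (found != std::string::npos && found_end != std::string::npos) {
      str.replace(found, found_end - found, "");  // drop the pointer address
      found_end = str.find(iterm);
      str.replace(found_end, iterm.size(), "");   // drop the del_str itself
      break;
    }
  }
  return str;
}

int main() {
  std::printf("%s\n",
              TrimStrings("conv2d_0.tmp_0x7f2a1c3d/trans_cast").c_str());
  // prints: conv2d_0.tmp_
  return 0;
}
```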
std::string GetOfflineModName() {
sort(input_names_.begin(), input_names_.end());
sort(output_names_.begin(), output_names_.end());
const auto& delimiter = "__";
const auto& delimiter_num = "_";
const auto& input_shape_str = "input_shape_";
const auto& output_shape_str = "output_shape_";
std::string name = "";
std::string tmp = "";
for (const auto& input_name : input_names_) {
tmp = input_name;
name += TrimStrings(tmp) + delimiter + input_shape_str;
auto input_tensor = scope_->FindMutableTensor(input_name);
for (const auto& iterm : input_tensor->dims().Vectorize()) {
name += std::to_string(iterm) + delimiter_num;
}
name += delimiter;
}
for (const auto& output_name : output_names_) {
tmp = output_name;
name += TrimStrings(tmp) + delimiter + output_shape_str;
auto output_tensor = scope_->FindMutableTensor(output_name);
for (const auto& iterm : output_tensor->dims().Vectorize()) {
name += std::to_string(iterm) + delimiter_num;
}
name += delimiter;
}
std::replace(name.begin(), name.end(), '/', '-');
return name;
}
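With the delimiters defined above ("__", "_", "input_shape_", "output_shape_"), the offline model name is simply the trimmed tensor names concatenated with their shapes, with '/' replaced by '-'. A sketch that builds the same kind of string for a hypothetical single-input, single-output model:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Hypothetical (already trimmed) tensor names and shapes.
  std::vector<std::pair<std::string, std::vector<int64_t>>> inputs{
      {"image", {1, 3, 224, 224}}};
  std::vector<std::pair<std::string, std::vector<int64_t>>> outputs{
      {"fc_out", {1, 1000}}};
  const std::string delimiter = "__";
  const std::string delimiter_num = "_";
  std::string name;
  for (const auto& in : inputs) {
    name += in.first + delimiter + "input_shape_";
    for (auto d : in.second) name += std::to_string(d) + delimiter_num;
    name += delimiter;
  }
  for (const auto& out : outputs) {
    name += out.first + delimiter + "output_shape_";
    for (auto d : out.second) name += std::to_string(d) + delimiter_num;
    name += delimiter;
  }
  std::replace(name.begin(), name.end(), '/', '-');
  // e.g. image__input_shape_1_3_224_224___fc_out__output_shape_1_1000___
  std::printf("%s\n", name.c_str());
  return 0;
}
```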
void InferOutputsShapeOnly() {
// infer output shapes when BATCH_SIZE_CHANGEABLE is enabled
const auto iter = in_out_shape_map_.find(all_inputs_shape_);
if (iter != in_out_shape_map_.end()) {
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
origin_otensors_[i]->Resize(iter->second[i]);
}
} else {
for (auto& inst : origin_program_) {
auto op = inst.op();
CHECK(op);
op->CheckShape();
const_cast<OpLite*>(op)->InferShape();
}
std::vector<std::vector<int64_t>> outs_shape;
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
outs_shape.push_back(origin_otensors_[i]->dims().Vectorize());
}
in_out_shape_map_[all_inputs_shape_] = outs_shape;
}
}
inline void* GetOutputDataPtr(Tensor* tensor, bool use_mlu_cast) {
if (use_mlu_cast) {
// output is float, since cast fused in subgraph
return static_cast<void*>(tensor->mutable_data<float>(TARGET(kMLU)));
} else {
return static_cast<void*>(
tensor->template mutable_data<
typename subgraph::mlu::MLUTypeTraits<Precision>::type>(
TARGET(kMLU)));
}
}
int LaunchDeviceProgram() override {
// prepare input and output memory
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto exec_queue = mlu_context.exec_queue();
u32_t affinity = mlu_context.affinity();
cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
int data_param = 1;
forward_param.data_parallelism = &data_param;
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
graph_.Compute(forward_param, exec_queue);
auto graph = shape_graph_map_[inputs_shape_];
auto* graph_input = graph->MutableInputs();
auto* graph_output = graph->MutableOutputs();
CHECK_EQ(graph_input->size(), origin_itensors_.size());
CHECK_EQ(graph_output->size(), origin_otensors_.size());
bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST");
if (!disable_batch_size_changeable_) {
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
graph_in;
if (shape_tensor_map_in_.find(all_inputs_shape_) !=
shape_tensor_map_in_.end()) {
graph_in = shape_tensor_map_in_[all_inputs_shape_];
for (size_t i = 0; i < origin_itensors_.size(); ++i) {
graph_in[i]->set_mlu_ptr(
const_cast<void*>(origin_itensors_[i]->raw_data()));
}
} else {
graph_in.reserve(origin_itensors_.size());
for (size_t i = 0; i < origin_itensors_.size(); ++i) {
paddle::lite::subgraph::mlu::MLUTensor tmp(
origin_itensors_[i]->dims().Vectorize());
tmp.set_mlu_dtype(graph_input->at(i)->dtype());
tmp.set_mlu_ptr(const_cast<void*>(origin_itensors_[i]->raw_data()));
graph_in.push_back(
std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
}
shape_tensor_map_in_[all_inputs_shape_] = graph_in;
}
// TODO(zhangmingwei): we just call every op's infer_shape to get outputs'
// shape; maybe it's better to use cnml's api to get the output shape. This
// can be done once cnml's tensor dimension is totally equal to lite's
// tensor shape.
InferOutputsShapeOnly();
// const std::vector<std::vector<int64_t>> new_output_size =
// graph->InferOutputsShape(graph_in);
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
graph_out;
if (shape_tensor_map_out_.find(all_inputs_shape_) !=
shape_tensor_map_out_.end()) {
graph_out = shape_tensor_map_out_[all_inputs_shape_];
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
// origin_otensors_[i]->Resize(new_output_size.at(i));
graph_out[i]->set_mlu_ptr(
GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast));
}
} else {
graph_out.reserve(origin_otensors_.size());
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
// origin_otensors_[i]->Resize(new_output_size.at(i));
paddle::lite::subgraph::mlu::MLUTensor tmp(
origin_otensors_[i]->dims().Vectorize());
tmp.set_mlu_dtype(graph_output->at(i)->dtype());
tmp.set_mlu_ptr(
GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast));
graph_out.push_back(
std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
}
shape_tensor_map_out_[all_inputs_shape_] = graph_out;
}
graph->Compute(exec_queue, graph_in, graph_out);
} else {
for (size_t i = 0; i < origin_itensors_.size(); ++i) {
graph_input->at(i)->set_mlu_ptr(
const_cast<void*>(origin_itensors_[i]->raw_data()));
}
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
graph_output->at(i)->set_mlu_ptr(
GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast));
}
// only cnmlComputeFusionOpForward_V3 needs cnrtInvokeFuncParam_t
cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
int data_param = 1;
forward_param.data_parallelism = &data_param;
u32_t affinity = mlu_context.affinity();
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
graph->Compute(forward_param, exec_queue);
#ifdef MLU_DUMP_SUBGRAPH_IO
// Graph nodes store compile-time tensors while batch-size mutable is set.
// Only when batch-size mutable is disabled does runtime data exist in the
// graph nodes.
// =========== DUMP ===================
for (auto input_name : input_names_) {
auto input_tensor =
shape_graph_map_[inputs_shape_]->GetNode(input_name);
auto dump_name = input_name;
while (dump_name.find("/") != std::string::npos) {
dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
}
VLOG(6) << "dump_name: " << dump_name;
input_tensor->ToFile(dump_name);
}
for (auto output_name : output_names_) {
if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) {
auto output_tensor =
shape_graph_map_[inputs_shape_]->GetNode(output_name);
auto dump_name = output_name;
while (dump_name.find("/") != std::string::npos) {
dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
}
VLOG(6) << "dump_name: " << dump_name;
output_tensor->ToFile(dump_name);
} else {
VLOG(6) << "graph does not have " << output_name << " as output"
<< std::endl;
}
}
#endif
// =========== DUMP END ================
}
return 0;
}
paddle::lite::subgraph::mlu::Graph graph_;
paddle::lite_api::PrecisionType fp_type_;
std::vector<std::vector<int64_t>> inputs_shape_{};
std::vector<std::vector<int64_t>> all_inputs_shape_{};
std::map<std::vector<std::vector<int64_t>>,
std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
shape_graph_map_{};
// batch size changeable is enabled by default; this could be changed by the
// environment variable PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE and by
// whether the op can be compiled in a batch-size changeable way
bool disable_batch_size_changeable_{false};
bool error_compile_batch_size_changeable_{false};
std::vector<std::string> unsupport_batch_size_changeable_op_type_{"concat"};
// look up the output runtime MLUTensor for a certain output shape when
// BATCH_SIZE_CHANGEABLE is enabled
std::map<std::vector<std::vector<int64_t>>,
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
shape_tensor_map_out_{};
// look up the input runtime MLUTensor for a certain input shape when
// BATCH_SIZE_CHANGEABLE is enabled
std::map<std::vector<std::vector<int64_t>>,
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
shape_tensor_map_in_{};
// look up the output shape for a certain input shape when
// BATCH_SIZE_CHANGEABLE is enabled
std::map<std::vector<std::vector<int64_t>>, std::vector<std::vector<int64_t>>>
in_out_shape_map_{};
};
template <PrecisionType Precision>
......
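The shape-keyed caches declared in the engine above (shape_graph_map_, shape_tensor_map_in_/out_, in_out_shape_map_) rely on std::vector providing lexicographic operator<, so nested shape vectors can be used directly as std::map keys. A minimal sketch of that pattern (the cached "graph" is just an int here):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

int main() {
  using ShapeKey = std::vector<std::vector<int64_t>>;
  std::map<ShapeKey, int> shape_graph_map;
  ShapeKey shapes{{1, 3, 224, 224}, {1, 1}};  // hypothetical input shapes
  shape_graph_map[shapes] = 42;               // cache a compiled-graph handle
  std::printf("cached: %d, hit: %zu\n",
              shape_graph_map[shapes], shape_graph_map.count(shapes));
  return 0;
}
```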
......@@ -78,3 +78,13 @@ REGISTER_LITE_KERNEL(softsign,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
REGISTER_LITE_KERNEL(sigmoid,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SoftsignCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
......@@ -4,7 +4,7 @@ set -ex
# global variables with default value
NEUWARE_HOME="${NEUWARE_HOME}"
TARGET_NAME="all" # default target
BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
BUILD_EXTRA=ON # ON(with sequence ops)/OFF
WITH_TESTING=ON # ON/OFF
function print_usage {
......@@ -28,16 +28,13 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/t
readonly workspace=$(pwd)
function prepare_thirdparty {
if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
if [ ! -d $workspace/third-party ]; then
rm -rf $workspace/third-party
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xzf third-party-05b862.tar.gz
else
git submodule update --init --recursive
fi
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xvf third-party-05b862.tar.gz
}
# for code gen, a source file is generated after a test, but is depended on by some targets in cmake.
......