Commit 6a75ac5d authored by Jiansong Wang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into jiansowa/img_nna

test=develop
...@@ -16,6 +16,11 @@ if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
  return()
endif()
# require -D_GLIBCXX_USE_CXX11_ABI=0 if GCC 7.3.0
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
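The flag above matters because GCC 5 and later default to libstdc++'s new C++11 ABI, while the prebuilt Huawei Ascend NPU DDK libraries appear to expect the old one; forcing _GLIBCXX_USE_CXX11_ABI=0 keeps std::string and std::list symbol names compatible at link time. A minimal way to confirm which ABI a translation unit was built with, assuming GCC/libstdc++ (a sketch, not part of this patch):

// abi_check.cc - compile with and without -D_GLIBCXX_USE_CXX11_ABI=0
#include <cstdio>
#include <string>  // pulls in <bits/c++config.h>, which defines the macro

int main() {
#if defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI
  std::printf("new (C++11) libstdc++ ABI\n");
#else
  std::printf("old (pre-C++11) libstdc++ ABI\n");
#endif
  return 0;
}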
# 1. path to Huawei Ascend Install Path
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
    set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
......
...@@ -217,6 +217,10 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
        SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
    ENDIF()
IF(LITE_WITH_HUAWEI_ASCEND_NPU)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
ENDIF()
    if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
        ExternalProject_Add(
            ${TARGET_NAME}
......
...@@ -64,6 +64,7 @@ if (WITH_TESTING)
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "VGG19.tar.gz")
    # data
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ILSVRC2012_small.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert_data.tar.gz")
  endif()
endif()
......
...@@ -356,5 +356,13 @@ void MobileConfig::set_model_buffer(const char *model_buffer,
  model_from_memory_ = true;
}
// This is the method for allocating workspace_size according to L3Cache size
void MobileConfig::SetArmL3CacheSize(L3CacheSetMethod method,
int absolute_val) {
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Global().SetArmL3CacheSize(method, absolute_val);
#endif
}
}  // namespace lite_api
}  // namespace paddle
...@@ -32,6 +32,14 @@ using shape_t = std::vector<int64_t>;
using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
// Methods for allocating L3Cache on Arm platform
enum class L3CacheSetMethod {
kDeviceL3Cache = 0, // Use the system L3 Cache size, best performance.
kDeviceL2Cache = 1, // Use the system L2 Cache size, trade off performance
// with less memory consumption.
kAbsolute = 2, // Use the external setting.
// kAutoGrow = 3, // Not supported yet, least memory consumption.
};
// return true if current device supports OpenCL model
LITE_API bool IsOpenCLBackendValid();
...@@ -294,6 +302,11 @@ class LITE_API MobileConfig : public ConfigBase {
  // NOTE: This is a deprecated API and will be removed in latter release.
  const std::string& param_buffer() const { return param_buffer_; }
// This is the method for allocating workspace_size according to L3Cache size
void SetArmL3CacheSize(
L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
int absolute_val = -1);
};
template <typename ConfigT>
......
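Taken together, the additions above give MobileConfig three ways to size the Arm workspace. A minimal usage sketch on an Arm build (names other than the new API are placeholders):

#include "lite/api/paddle_api.h"

void ConfigureL3(paddle::lite_api::MobileConfig* config) {
  using paddle::lite_api::L3CacheSetMethod;
  // Default behaviour: size the workspace from the device's L3 cache,
  // falling back to L2 when no L3 size is reported.
  config->SetArmL3CacheSize(L3CacheSetMethod::kDeviceL3Cache);
  // Trade speed for memory: use the (smaller) L2 cache size instead.
  // config->SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
  // Fully manual: reserve exactly 1 MB, as the unit test below does.
  // config->SetArmL3CacheSize(L3CacheSetMethod::kAbsolute, 1024 * 1024);
}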
...@@ -107,7 +107,8 @@ TEST(CxxApi, share_external_data) {
TEST(LightApi, run) {
  lite_api::MobileConfig config;
  config.set_model_from_file(FLAGS_model_dir + ".opt2.naive.nb");
// disable L3 cache on workspace_ allocating
config.SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
  auto predictor = lite_api::CreatePaddlePredictor(config);
  auto inputs = predictor->GetInputNames();
...@@ -148,6 +149,8 @@ TEST(MobileConfig, LoadfromMemory) {
  // set model buffer and run model
  lite_api::MobileConfig config;
  config.set_model_from_buffer(model_buffer);
// allocate 1M initial space for workspace_
config.SetArmL3CacheSize(L3CacheSetMethod::kAbsolute, 1024 * 1024);
  auto predictor = lite_api::CreatePaddlePredictor(config);
  auto input_tensor = predictor->GetInput(0);
......
...@@ -82,16 +82,20 @@ void NeuronAdapter::InitFunctions() {
  PADDLE_DLSYM(NeuronModel_setOperandValue);
  PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams);
  PADDLE_DLSYM(NeuronModel_addOperation);
PADDLE_DLSYM(NeuronModel_addOperationExtension);
  PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
  PADDLE_DLSYM(NeuronCompilation_create);
  PADDLE_DLSYM(NeuronCompilation_free);
  PADDLE_DLSYM(NeuronCompilation_finish);
PADDLE_DLSYM(NeuronCompilation_createForDevices);
  PADDLE_DLSYM(NeuronExecution_create);
  PADDLE_DLSYM(NeuronExecution_free);
  PADDLE_DLSYM(NeuronExecution_setInput);
  PADDLE_DLSYM(NeuronExecution_setOutput);
  PADDLE_DLSYM(NeuronExecution_compute);
PADDLE_DLSYM(Neuron_getDeviceCount);
PADDLE_DLSYM(Neuron_getDevice);
PADDLE_DLSYM(NeuronDevice_getName);
#undef PADDLE_DLSYM
}
...@@ -146,6 +150,25 @@ int NeuronModel_addOperation(NeuronModel* model,
      model, type, inputCount, inputs, outputCount, outputs);
}
int NeuronModel_addOperationExtension(NeuronModel* model,
const char* name,
const char* vendor,
const NeuronDevice* device,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_addOperationExtension()(model,
name,
vendor,
device,
inputCount,
inputs,
outputCount,
outputs);
}
int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
                                         uint32_t inputCount,
                                         const uint32_t* inputs,
...@@ -172,6 +195,15 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) {
      compilation);
}
int NeuronCompilation_createForDevices(NeuronModel* model,
const NeuronDevice* const* devices,
uint32_t numDevices,
NeuronCompilation** compilation) {
return paddle::lite::NeuronAdapter::Global()
->NeuronCompilation_createForDevices()(
model, devices, numDevices, compilation);
}
int NeuronExecution_create(NeuronCompilation* compilation,
                           NeuronExecution** execution) {
  return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()(
...@@ -205,3 +237,18 @@ int NeuronExecution_compute(NeuronExecution* execution) {
  return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()(
      execution);
}
int Neuron_getDeviceCount(uint32_t* numDevices) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDeviceCount()(
numDevices);
}
int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDevice()(devIndex,
device);
}
int NeuronDevice_getName(const NeuronDevice* device, const char** name) {
return paddle::lite::NeuronAdapter::Global()->NeuronDevice_getName()(device,
name);
}
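The new wrapper entry points are what a subgraph runtime needs in order to target a specific accelerator instead of the default device. A hypothetical call sequence, sketched under the assumption that the symbols resolved above are available at runtime (the device-name filter is illustrative only):

#include <cstring>

NeuronDevice* FindDeviceByName(const char* wanted) {
  uint32_t count = 0;
  if (Neuron_getDeviceCount(&count) != NEURON_NO_ERROR) return nullptr;
  for (uint32_t i = 0; i < count; ++i) {
    NeuronDevice* device = nullptr;
    const char* name = nullptr;
    if (Neuron_getDevice(i, &device) == NEURON_NO_ERROR &&
        NeuronDevice_getName(device, &name) == NEURON_NO_ERROR &&
        name != nullptr && std::strcmp(name, wanted) == 0) {
      return device;
    }
  }
  return nullptr;
}

int CompileForSingleDevice(NeuronModel* model,
                           const NeuronDevice* device,
                           NeuronCompilation** compilation) {
  const NeuronDevice* devices[1] = {device};
  return NeuronCompilation_createForDevices(model, devices, 1, compilation);
}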
...@@ -42,12 +42,25 @@ class NeuronAdapter final {
                                                  const uint32_t *,
                                                  uint32_t,
                                                  const uint32_t *);
using NeuronModel_addOperationExtension_Type = int (*)(NeuronModel *,
const char *,
const char *,
const NeuronDevice *,
uint32_t,
const uint32_t *,
uint32_t,
const uint32_t *);
  using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
      NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
  using NeuronCompilation_create_Type = int (*)(NeuronModel *,
                                                NeuronCompilation **);
  using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
  using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
using NeuronCompilation_createForDevices_Type =
int (*)(NeuronModel *,
const NeuronDevice *const *,
uint32_t,
NeuronCompilation **);
  using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
                                              NeuronExecution **);
  using NeuronExecution_free_Type = void (*)(NeuronExecution *);
...@@ -59,6 +72,10 @@ class NeuronAdapter final {
  using NeuronExecution_setOutput_Type = int (*)(
      NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t);
  using NeuronExecution_compute_Type = int (*)(NeuronExecution *);
using Neuron_getDeviceCount_Type = int (*)(uint32_t *);
using Neuron_getDevice_Type = int (*)(uint32_t, NeuronDevice **);
using NeuronDevice_getName_Type = int (*)(const NeuronDevice *,
const char **);
  Neuron_getVersion_Type Neuron_getVersion() {
    CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
...@@ -105,6 +122,12 @@ class NeuronAdapter final {
    return NeuronModel_addOperation_;
  }
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() {
CHECK(NeuronModel_addOperationExtension_ != nullptr)
<< "Cannot load NeuronModel_addOperationExtension!";
return NeuronModel_addOperationExtension_;
}
  NeuronModel_identifyInputsAndOutputs_Type
  NeuronModel_identifyInputsAndOutputs() {
    CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
...@@ -130,6 +153,12 @@ class NeuronAdapter final {
    return NeuronCompilation_finish_;
  }
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() {
CHECK(NeuronCompilation_createForDevices_ != nullptr)
<< "Cannot load NeuronCompilation_createForDevices!";
return NeuronCompilation_createForDevices_;
}
  NeuronExecution_create_Type NeuronExecution_create() {
    CHECK(NeuronExecution_create_ != nullptr)
        << "Cannot load NeuronExecution_create!";
...@@ -160,6 +189,23 @@ class NeuronAdapter final {
    return NeuronExecution_compute_;
  }
Neuron_getDeviceCount_Type Neuron_getDeviceCount() {
CHECK(Neuron_getDeviceCount_ != nullptr)
<< "Cannot load Neuron_getDeviceCount!";
return Neuron_getDeviceCount_;
}
Neuron_getDevice_Type Neuron_getDevice() {
CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!";
return Neuron_getDevice_;
}
NeuronDevice_getName_Type NeuronDevice_getName() {
CHECK(NeuronDevice_getName_ != nullptr)
<< "Cannot load NeuronDevice_getName!";
return NeuronDevice_getName_;
}
 private:
  NeuronAdapter();
  NeuronAdapter(const NeuronAdapter &) = delete;
...@@ -176,16 +222,23 @@ class NeuronAdapter final {
  NeuronModel_setOperandSymmPerChannelQuantParams_Type
      NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr};
  NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr};
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension_{
nullptr};
  NeuronModel_identifyInputsAndOutputs_Type
      NeuronModel_identifyInputsAndOutputs_{nullptr};
  NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
  NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
  NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{
nullptr};
  NeuronExecution_create_Type NeuronExecution_create_{nullptr};
  NeuronExecution_free_Type NeuronExecution_free_{nullptr};
  NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
  NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr};
  NeuronExecution_compute_Type NeuronExecution_compute_{nullptr};
Neuron_getDeviceCount_Type Neuron_getDeviceCount_{nullptr};
Neuron_getDevice_Type Neuron_getDevice_{nullptr};
NeuronDevice_getName_Type NeuronDevice_getName_{nullptr};
};
}  // namespace lite
}  // namespace paddle
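For context, every function added to this header follows the same dlopen/dlsym pattern: a *_Type alias, a *_ pointer member defaulted to nullptr, a CHECKed accessor, and one PADDLE_DLSYM line in InitFunctions(). A rough sketch of what that macro presumably expands to (the real definition lives in the adapter's .cc file; handle_ and the exact body here are assumptions):

#include <dlfcn.h>

// Illustrative only: resolve a symbol from the already-dlopen()ed Neuron
// runtime library and store it in the correspondingly named member.
#define PADDLE_DLSYM(func) \
  func##_ = reinterpret_cast<func##_Type>(dlsym(handle_, #func))

// e.g. PADDLE_DLSYM(NeuronCompilation_createForDevices); fills
// NeuronCompilation_createForDevices_, which the accessor above CHECKs
// before handing it to callers.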
...@@ -161,7 +161,7 @@ class ContextProjectFunctor {
                          sequence_width});
        if (up_pad > 0) {  // add up pad
          int padding_rows = (std::min)(
              up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
          for (int k = 0; k < padding_rows; ++k) {
...@@ -180,10 +180,10 @@ class ContextProjectFunctor {
        }
        if (down_pad > 0) {  // add down pad
          int down_pad_begin_row =
              (std::max)(
                  0, (sequence_height - context_start - context_length) + 1) +
              1;
          int padding_begin = (std::max)(0, context_start - sequence_height);
          int padding_size =
              sequence_height - context_start >= context_length
                  ? 1
......
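The only change in the functors above (and in the pooling, sequence, parallel and pass code that follows) is wrapping std::min/std::max in parentheses. The point is portability: when a header such as windows.h defines function-like min/max macros and NOMINMAX is not set, std::min(a, b) gets macro-expanded and breaks, while (std::min)(a, b) cannot be treated as a macro invocation. A self-contained illustration (the local macro stands in for the Windows one):

#include <algorithm>

#define min(a, b) (((a) < (b)) ? (a) : (b))  // stand-in for the Windows macro

int Clamp(int v, int hi) {
  // return std::min(v, hi);  // would macro-expand into "std::(((v) < ..." and fail
  return (std::min)(v, hi);   // the parentheses suppress macro expansion
}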
...@@ -67,8 +67,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = (std::min)(hstart + ksize_height, input_height);
            hstart = (std::max)(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            if (adaptive) {
...@@ -76,8 +76,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = (std::min)(wstart + ksize_width, input_width);
              wstart = (std::max)(wstart, 0);
            }
            T ele = pool_process.initial();
...@@ -150,8 +150,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = (std::min)(hstart + ksize_height, input_height);
            hstart = (std::max)(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            if (adaptive) {
...@@ -159,8 +159,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = (std::min)(wstart + ksize_width, input_width);
              wstart = (std::max)(wstart, 0);
            }
            int pool_size = (exclusive || adaptive)
                                ? (hend - hstart) * (wend - wstart)
...@@ -228,12 +228,12 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = (std::min)(hstart + ksize_height, input_height);
          hstart = (std::max)(hstart, 0);
          for (int pw = 0; pw < output_width; ++pw) {
            int wstart = pw * stride_width - padding_width;
            int wend = (std::min)(wstart + ksize_width, input_width);
            wstart = (std::max)(wstart, 0);
            bool stop = false;
            for (int h = hstart; h < hend && !stop; ++h) {
...@@ -337,8 +337,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = (std::min)(dstart + ksize_depth, input_depth);
            dstart = (std::max)(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            if (adaptive) {
...@@ -346,8 +346,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = (std::min)(hstart + ksize_height, input_height);
              hstart = (std::max)(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              if (adaptive) {
...@@ -355,8 +355,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = (std::min)(wstart + ksize_width, input_width);
                wstart = (std::max)(wstart, 0);
              }
              int output_idx = (pd * output_height + ph) * output_width + pw;
              T ele = pool_process.initial();
...@@ -441,8 +441,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = (std::min)(dstart + ksize_depth, input_depth);
            dstart = (std::max)(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            if (adaptive) {
...@@ -450,8 +450,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = (std::min)(hstart + ksize_height, input_height);
              hstart = (std::max)(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              if (adaptive) {
...@@ -459,8 +459,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = (std::min)(wstart + ksize_width, input_width);
                wstart = (std::max)(wstart, 0);
              }
              int pool_size =
...@@ -540,16 +540,16 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
      for (int c = 0; c < output_channels; ++c) {
        for (int pd = 0; pd < output_depth; ++pd) {
          int dstart = pd * stride_depth - padding_depth;
          int dend = (std::min)(dstart + ksize_depth, input_depth);
          dstart = (std::max)(dstart, 0);
          for (int ph = 0; ph < output_height; ++ph) {
            int hstart = ph * stride_height - padding_height;
            int hend = (std::min)(hstart + ksize_height, input_height);
            hstart = (std::max)(hstart, 0);
            for (int pw = 0; pw < output_width; ++pw) {
              int wstart = pw * stride_width - padding_width;
              int wend = (std::min)(wstart + ksize_width, input_width);
              wstart = (std::max)(wstart, 0);
              bool stop = false;
              for (int d = dstart; d < dend && !stop; ++d) {
                for (int h = hstart; h < hend && !stop; ++h) {
...@@ -651,8 +651,8 @@ class MaxPool2dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = (std::min)(hstart + ksize_height, input_height);
            hstart = (std::max)(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            if (adaptive) {
...@@ -660,8 +660,8 @@ class MaxPool2dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = (std::min)(wstart + ksize_width, input_width);
              wstart = (std::max)(wstart, 0);
            }
            T1 ele = static_cast<T1>(-FLT_MAX);
...@@ -794,8 +794,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = (std::min)(dstart + ksize_depth, input_depth);
            dstart = (std::max)(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            if (adaptive) {
...@@ -803,8 +803,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = (std::min)(hstart + ksize_height, input_height);
              hstart = (std::max)(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              if (adaptive) {
...@@ -812,8 +812,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = (std::min)(wstart + ksize_width, input_width);
                wstart = (std::max)(wstart, 0);
              }
              int output_idx = (pd * output_height + ph) * output_width + pw;
......
...@@ -35,7 +35,7 @@ inline static uint64_t MaximumSequenceLength(
  uint64_t seq_num = seq_offset.size() - 1;
  uint64_t max_seq_len = 0;
  for (size_t i = 0; i < seq_num; ++i) {
    max_seq_len = (std::max)(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
  }
  return max_seq_len;
}
......
...@@ -26,7 +26,7 @@ namespace x86 {
static void SetNumThreads(int num_threads) {
#ifdef PADDLE_WITH_MKLML
  int real_num_threads = (std::max)(num_threads, 1);
  x86::MKL_Set_Num_Threads(real_num_threads);
  omp_set_num_threads(real_num_threads);
#endif
...@@ -52,14 +52,14 @@ static inline void RunParallelFor(const int64_t begin,
  }
#ifdef PADDLE_WITH_MKLML
  int64_t num_threads = (std::min)(GetMaxThreads(), end - begin);
  if (num_threads > 1) {
#pragma omp parallel num_threads(num_threads)
    {
      int64_t tid = omp_get_thread_num();
      int64_t chunk_size = (end - begin + num_threads - 1) / num_threads;
      int64_t begin_tid = begin + tid * chunk_size;
      f(begin_tid, (std::min)(end, chunk_size + begin_tid));
    }
    return;
  }
......
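As a quick sanity check of the chunking in RunParallelFor above: begin = 0, end = 10, num_threads = 4 gives chunk_size = 3 and per-thread ranges [0,3), [3,6), [6,9), [9,10); the (std::min) cap is what trims the last thread's range to end. The same arithmetic, factored into a standalone helper (a sketch, not part of the patch):

#include <algorithm>
#include <cstdint>

void ChunkBounds(int64_t begin, int64_t end, int64_t num_threads,
                 int64_t tid, int64_t* lo, int64_t* hi) {
  int64_t chunk_size = (end - begin + num_threads - 1) / num_threads;
  *lo = begin + tid * chunk_size;           // this thread's start
  *hi = (std::min)(end, chunk_size + *lo);  // never run past end
}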
...@@ -17,6 +17,7 @@
#include <cstdarg>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
...@@ -27,6 +28,7 @@
namespace paddle {
namespace lite {
using L3CacheSetMethod = lite_api::L3CacheSetMethod;
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum {
...@@ -65,11 +67,41 @@ class DeviceInfo {
  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
// Methods for allocating L3Cache on Arm platform
// Enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`
void SetArmL3CacheSize(
L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
int absolute_val = -1) {
l3_cache_method_ = method;
absolute_l3cache_size_ = absolute_val;
// Realloc memory for sgemm in this context.
workspace_.clear();
workspace_.Resize({llc_size()});
workspace_.mutable_data<int8_t>();
}
  int llc_size() const {
    auto size = absolute_l3cache_size_;
    switch (l3_cache_method_) {
// kDeviceL3Cache = 0, use the system L3 Cache size, best performance.
case L3CacheSetMethod::kDeviceL3Cache:
size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
: L2_cache_[active_ids_[0]];
break;
// kDeviceL2Cache = 1, use the system L2 Cache size, trade off performance
// with less memory consumption.
case L3CacheSetMethod::kDeviceL2Cache:
size = L2_cache_[active_ids_[0]];
break;
// kAbsolute = 2, use the external setting.
case L3CacheSetMethod::kAbsolute:
break;
default:
LOG(FATAL) << "Error: unknown l3_cache_method_ !";
}
    return size > 0 ? size : 512 * 1024;
  }
  bool has_dot() const { return dot_[active_ids_[0]]; }
  bool has_fp16() const { return fp16_[active_ids_[0]]; }
...@@ -121,6 +153,10 @@ class DeviceInfo {
  void RequestPowerRandHighMode(int shift_num, int thread_num);
  void RequestPowerRandLowMode(int shift_num, int thread_num);
// Methods for allocating L3Cache on Arm platform
// Enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`
L3CacheSetMethod l3_cache_method_{L3CacheSetMethod::kDeviceL3Cache};
int absolute_l3cache_size_{-1};
  DeviceInfo() = default;
};
#endif  // LITE_WITH_ARM
......
...@@ -148,7 +148,7 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
        int cur_life =
            (*lifecycles)[TargetToStr(target_type)][var_name].second;
        (*lifecycles)[TargetToStr(target_type)][var_name].second =
            (std::max)(max_lifecycle_, cur_life);
      }
    }
    ++max_lifecycle_;
......
...@@ -61,7 +61,7 @@ class StaticKernelPickPass : public mir::StmtPass {
    float final_score{-1.};
    Place winner_place{places[0]};
    const int kMax =
        (std::numeric_limits<core::KernelPickFactor::value_type>::max)();
    size_t place_size = places.size();
    // NOTE: We compare kernel's place with place in valid_places to select the
......
...@@ -14,6 +14,8 @@ lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_br
lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_apu SRCS concat_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_transpose_op_apu SRCS conv_transpose_op.cc DEPS ${apu_subgraph_bridge_deps})
set(apu_subgraph_bridges
...@@ -25,6 +27,8 @@ set(apu_subgraph_bridges
    subgraph_bridge_softmax_op_apu
    subgraph_bridge_fc_op_apu
    subgraph_bridge_pool_op_apu
subgraph_bridge_conv_transpose_op_apu
subgraph_bridge_concat_op_apu
    CACHE INTERNAL "apu_subgraph_bridges")
message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
int neuron_errCode;
VLOG(3) << "[APU] Converting [" << op_type << "]";
// Get input and output vars and op attributes
auto x_names = op_info->Input("X");
auto out_name = op_info->Output("Out").front();
auto axis = op_info->GetAttr<int>("axis");
auto num = x_names.size();
// Process data layout axis change
if (axis == 1)
axis = 3;
else if (axis == 2)
axis = 1;
else if (axis == 3)
axis = 2;
// Limitation:
// All input tensors of NEURON_TENSOR_QUANT8_ASYMM must
// have the same scale and zeroPoint as the output tensor
CHECK(op_info->HasOutputScale(out_name));
auto output_scale = op_info->GetOutputScale(out_name)[0];
// Traverse all of input nodes
std::vector<std::shared_ptr<Node>> input_nodes;
NeuronOperandType xType;
for (auto& x_name : x_names) {
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
std::shared_ptr<Node> x_node = nullptr;
CHECK(op_info->HasInputScale(x_name));
auto input_scale = op_info->GetInputScale(x_name)[0];
// Add x tensor type
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
xType.scale = input_scale;
xType.zeroPoint = 128;
xType.dimensionCount = x_dims.size();
std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3],
(uint32_t)x_dims[1]};
xType.dimensions = &dims_x[0];
if (graph->Has(x_name)) {
VLOG(3) << "Graph has " << x_name;
if (graph->IsInput(x_name)) {
VLOG(3) << x_name << " is input and already exists";
x_name = "transpose_" + x_name;
}
if (graph->IsOutput(x_name)) {
VLOG(3) << x_name << " is input and output node";
x_name = "transpose_" + x_name;
}
x_node = graph->Get(x_name);
} else {
// Add input operand
if (graph->IsInput(x_name)) {
// Insert transpose for NCHW -> NHWC
insert_transpose_node(ctx,
x_name,
"transpose_" + x_name,
{(uint32_t)x_dims[0],
(uint32_t)x_dims[1],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3]},
dims_x,
{0, 2, 3, 1},
xType.scale,
xType.zeroPoint);
// Change x_name because we add transpose op
x_name = "transpose_" + x_name;
x_node = graph->Get(x_name);
} else {
NeuronModel_addOperand(model, &xType);
x_node = graph->Add(x_name, dims_x);
}
} // End of else
if (x_node == nullptr) return subgraph::FAILED;
input_nodes.push_back(x_node);
VLOG(3) << "input node x: " << x_node->index()
<< ": input_scale: " << input_scale << " x_dims:" << x_dims[0]
<< ":" << x_dims[1] << ":" << x_dims
<< ", inType: " << xType.dimensions[0] << ":" << xType.dimensions[1]
<< ":" << xType.dimensions[2] << ":" << xType.dimensions[3];
} // End of for
if (input_nodes.size() != num) {
LOG(WARNING) << "Create input operand failed!";
return subgraph::FAILED;
}
// Add axis operand type
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {1};
// Add axis operand
std::shared_ptr<Node> axis_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // axis
axis_node = graph->Add(out_name + "_axis", dims_int32);
VLOG(3) << "axis:" << axis;
// Add out operand type
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = output_scale;
outType.zeroPoint = 128;
outType.dimensionCount = out_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3],
(uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0];
// Add out operand
std::shared_ptr<Node> out_node = nullptr;
if (graph->Has(out_name)) {
out_node = graph->Get(out_name);
} else {
if (graph->IsOutput(out_name)) {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add("transpose_" + out_name, dims_out);
} else {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add(out_name, dims_out);
}
}
VLOG(3) << "out node idx: " << out_node->index()
<< ": output_scale: " << outType.scale
<< ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Set axis value
int32_t axis_val[1] = {(int32_t)axis};
NeuronModel_setOperandValue(
model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex;
for (auto& node : input_nodes) {
addInIndex.push_back(node->index());
}
addInIndex.push_back(axis_node->index());
std::vector<uint32_t> addOutIndex = {out_node->index()};
neuron_errCode = NeuronModel_addOperation(model,
NEURON_CONCATENATION,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Add op fail:" << op_type;
return subgraph::FAILED;
}
if (graph->IsOutput(out_name)) {
// Insert transpose for NHWC -> NCHW
insert_transpose_node(ctx,
"transpose_" + out_name,
out_name,
dims_out,
{(uint32_t)out_dims[0],
(uint32_t)out_dims[1],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3]},
{0, 3, 1, 2},
outType.scale,
outType.zeroPoint);
out_node = graph->Get(out_name);
if (out_node == nullptr) return subgraph::FAILED;
}
return SUCCESS;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kAPU,
paddle::lite::subgraph::apu::ConcatConverter);
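One detail of the converter worth calling out: because every tensor handed to the Neuron runtime has already been transposed to NHWC, the Paddle (NCHW) concat axis has to be remapped before it is written into the axis operand. The if/else chain near the top of ConcatConverter implements exactly this mapping; restated as a small helper for clarity (a sketch, not part of the patch):

int RemapConcatAxisToNHWC(int nchw_axis) {
  switch (nchw_axis) {
    case 1: return 3;   // C moves to the innermost position in NHWC
    case 2: return 1;   // H
    case 3: return 2;   // W
    default: return nchw_axis;  // the batch axis (0) is unchanged
  }
}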
...@@ -73,7 +73,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK_EQ(strides.size(), 2L);
  CHECK_EQ(dilations.size(), 2L);
  bool is_depthwise_mode = ic == groups && oc == groups;
  VLOG(3) << "is_depthwise_mode: " << is_depthwise_mode;
  if (paddings.size() == 2L) {
    for (size_t i = 0; i < strides.size(); ++i) {
...@@ -103,6 +103,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  auto filter_scale = op_info->GetInputScale(filter_name);
  CHECK(op_info->HasOutputScale(output_name));
  auto output_scale = op_info->GetOutputScale(output_name)[0];
auto orig_output_scale = op_info->GetOutputScale(output_name)[0];
  VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups
          << " ,dilations: " << dilations[0] << ":" << dilations[1];
...@@ -128,23 +129,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  std::shared_ptr<Node> input_node = nullptr;
  if (graph->Has(input_name)) {
    VLOG(3) << "Graph has " << input_name;
// input operand already exist
if (graph->IsInput(input_name)) {
VLOG(3) << input_name << "is input and already exist";
input_name = "transpose_" + input_name;
}
if (graph->IsOutput(input_name)) {
VLOG(3) << input_name << "is input and output node";
input_name = "transpose_" + input_name;
}
    input_node = graph->Get(input_name);
  } else {
    // add input operand
    if (graph->IsInput(input_name)) {
      // Insert transpose for NCHW -> NHWC
      insert_transpose_node(ctx,
                            input_name,
                            "transpose_" + input_name,
                            {(uint32_t)input_dims[0],
                             (uint32_t)input_dims[1],
                             (uint32_t)input_dims[2],
                             (uint32_t)input_dims[3]},
                            dims_in,
                            {0, 2, 3, 1},
                            inType.scale,
                            inType.zeroPoint);
      input_name = "transpose_" + input_name;
      input_node = graph->Get(input_name);
      if (input_node == nullptr) return subgraph::FAILED;
...@@ -153,7 +163,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
      input_node = graph->Add(input_name, dims_in);
    }
  }
  VLOG(3) << "input node idx: " << input_node->index()
          << ": input_scale: " << input_scale
          << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1]
          << ":" << inType.dimensions[2] << ":" << inType.dimensions[3];
...@@ -161,8 +171,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  // Add bias type
  NeuronOperandType biasType;
  // Add filter type, filter data re-layout NCHW -> NHWC
  Tensor transpose_filter;
  std::vector<uint32_t> dims_filter;
...@@ -233,10 +242,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    biasType.scale = 0;
  }
auto precision = filter->precision();
  std::shared_ptr<Node> filter_node = nullptr;
  if (1 == filter_scale.size()) {
    NeuronModel_addOperand(model, &filterType);
    filter_node = graph->Add(filter_name, dims_filter);  // Operand 1: filter
    VLOG(3) << "filter node idx: " << filter_node->index() << "filter_scale[0]"
            << filter_scale[0] << ": filterType: " << filterType.dimensions[0]
            << ":" << filterType.dimensions[1] << ":"
...@@ -251,7 +261,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
      return subgraph::FAILED;
    }
  } else {
    NeuronModel_addOperand(model, &channelFilterType);  // Operand 1: filter
    filter_node = graph->Add(filter_name, dims_filter);
    VLOG(3) << "channel filter node idx: " << filter_node->index()
            << " ,scale_count:" << filter_scale.size()
...@@ -280,7 +290,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  // Add biasType node value
  // A 1-D tensor, of shape [depth_out], specifying the bias.
  // For filter tensor of NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL, the bias
  // should be of NEURON_TENSOR_INT32, with zeroPoint of 0
  // and bias_scale of 0. The actual scale of each value 'i' is equal
  // to bias_scale[i] = input_scale * filter_scale[i].
  biasType.type = NEURON_TENSOR_INT32;
...@@ -296,16 +306,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    for (int i = 0; i < bias_dims.size(); i++)
      dims_bias.push_back(bias_dims[i]);
    biasType.dimensions = &dims_bias[0];
    NeuronModel_addOperand(model, &biasType);  // Operand 2: bias
    bias_node = graph->Add(bias_name, dims_bias);
    VLOG(3) << "node idx: " << bias_node->index()
            << ": Bias name: " << bias_name
            << " ,bias scale: " << biasType.scale
            << " ,dimensions: " << bias_dims;
  } else {
    biasType.dimensionCount = 1;
    dims_bias = {(uint32_t)output_dims[1]};
    biasType.dimensions = &dims_bias[0];
    NeuronModel_addOperand(model, &biasType);  // Operand 2: bias
    bias_node = graph->Add(filter_name + "_default_bias", dims_bias);
    VLOG(3) << "node idx" << bias_node->index() << ": Bias name: default_bias "
            << " ,bias scale: " << biasType.scale
...@@ -318,39 +329,51 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  std::vector<uint32_t> dims_int32 = {1};
  std::shared_ptr<Node> paddingL_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 3: padding left
  paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32);
  std::shared_ptr<Node> paddingR_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 4: padding right
  paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32);
  std::shared_ptr<Node> paddingT_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 5: padding top
  paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32);
  std::shared_ptr<Node> paddingB_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 6: padding bottom
  paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32);
  std::shared_ptr<Node> strideW_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 7: stride width
  strideW_node = graph->Add(filter_name + "_stride_width", dims_int32);
  std::shared_ptr<Node> strideH_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 8: stride height
  strideH_node = graph->Add(filter_name + "_stride_height", dims_int32);
  std::shared_ptr<Node> dm_node = nullptr;
  if (is_depthwise_mode) {
    NeuronModel_addOperand(model,
                           &int32Type);  // Operand 9: depthwise multiplier
    dm_node = graph->Add(filter_name + "_dm", dims_int32);
  }
  std::shared_ptr<Node> fuse_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 9/10: fuse
  fuse_node = graph->Add(filter_name + "_fuse", dims_int32);
/* Check output scale */
if (is_depthwise_mode) {
for (auto s : filter_scale) {
if (output_scale < s * input_scale)
output_scale = s * input_scale + 0.000001;
}
#ifdef LITE_MEDIATEK_APU_ENABLE_REQUANT
output_scale = orig_output_scale;
#endif
}
  // Add output tensor type
  NeuronOperandType outType;
  outType.type = NEURON_TENSOR_QUANT8_ASYMM;
...@@ -366,12 +389,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  if (graph->Has(output_name)) {
    output_node = graph->Get(output_name);
  } else {
    // Add output operand
    NeuronModel_addOperand(model, &outType);
    if (orig_output_scale != output_scale) {
      // Need to insert requant op, the result is requant_ -> transpose_ ->
      // output
      output_node = graph->Add("requant_" + output_name, dims_out);
    } else if (graph->IsOutput(output_name)) {
      // Need to insert transpose op, transpose_ -> output
      output_node = graph->Add("transpose_" + output_name, dims_out);
    } else {
      output_node = graph->Add(output_name, dims_out);
    }
  }
...@@ -433,10 +461,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -433,10 +461,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add Stride // Add Stride
int32_t stride_val[1]; int32_t stride_val[1];
stride_val[0] = strides[1]; // width stride_val[0] = strides[1]; // Entry 1: width stride
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
stride_val[0] = strides[0]; // height stride_val[0] = strides[0]; // Entry 0: height stride
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
...@@ -460,7 +488,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -460,7 +488,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
model, dm_node->index(), &dm, sizeof(int32_t) * 1); model, dm_node->index(), &dm, sizeof(int32_t) * 1);
VLOG(3) << "depthwise multiplier:" << dm; VLOG(3) << "depthwise multiplier:" << dm;
// Depthwise conv // Depthwise conv case
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = { std::vector<uint32_t> addInIndex = {
...@@ -512,19 +540,46 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -512,19 +540,46 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
return FAILED; return FAILED;
} }
// Check if Requant OP is needed
std::shared_ptr<Node> requant_node = nullptr;
if (orig_output_scale != output_scale) {
std::string requant_out_name = output_name;
VLOG(3) << "Insert requant output scale, orig:" << orig_output_scale
<< " ,output_scale:" << output_scale;
if (graph->IsOutput(output_name)) {
requant_out_name = "transpose_" + output_name;
}
insert_requant_node(ctx,
"requant_" + output_name,
requant_out_name,
dims_out,
dims_out,
output_scale,
orig_output_scale,
outType.zeroPoint);
requant_node = graph->Get(requant_out_name);
if (requant_node == nullptr) return subgraph::FAILED;
}
std::shared_ptr<Node> transpose_node = nullptr;
if (graph->IsOutput(output_name)) { if (graph->IsOutput(output_name)) {
VLOG(3) << "Add output transpose:" << output_name;
// Insert transpose for NHWC -> NCHW // Insert transpose for NHWC -> NCHW
insert_transpose_node( insert_transpose_node(ctx,
ctx, "transpose_" + output_name,
"transpose_" + output_name, output_name,
output_name, dims_out,
dims_out, {(uint32_t)output_dims[0],
{output_dims[0], output_dims[1], output_dims[2], output_dims[3]}, (uint32_t)output_dims[1],
{0, 3, 1, 2}, (uint32_t)output_dims[2],
outType.scale, (uint32_t)output_dims[3]},
outType.zeroPoint); {0, 3, 1, 2},
output_node = graph->Get(output_name); outType.scale,
if (output_node == nullptr) return subgraph::FAILED; outType.zeroPoint);
transpose_node = graph->Get(output_name);
if (transpose_node == nullptr) return subgraph::FAILED;
} }
return REBUILD_WHEN_SHAPE_CHANGED; return REBUILD_WHEN_SHAPE_CHANGED;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/operators/conv_op.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ConvTransposeConverter(void *ctx, OpLite *op, KernelBase *kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph *>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
int neuron_errCode;
VLOG(3) << "[APU] Converting [" << op_type << "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input, output and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
CHECK_EQ(input_dims.size(), 4);
auto filter_name = op_info->Input("Filter").front();
auto filter = scope->FindMutableTensor(filter_name);
auto filter_dims = filter->dims();
CHECK_EQ(filter_dims.size(), 4);
auto output_name = op_info->Output("Output").front();
auto strides = op_info->GetAttr<std::vector<int>>("strides");
CHECK_EQ(strides.size(), 2L);
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
if (groups > 1) {
LOG(WARNING) << "[NPU] only support groups == 1";
return FAILED;
}
bool with_act =
op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
std::string act_type =
with_act ? op_info->GetAttr<std::string>("act_type") : "";
float leaky_relu_alpha = act_type == "leaky_relu"
? op_info->GetAttr<float>("leaky_relu_alpha")
: 0.f;
auto fuse_relu =
op_info->HasAttr("fuse_relu") && op_info->GetAttr<bool>("fuse_relu");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
CHECK_EQ(dilations.size(), 2L);
std::string padding_algorithm =
op_info->HasAttr("padding_algorithm")
? op_info->GetAttr<std::string>("padding_algorithm")
: "";
if (paddings.size() == 2L) {
for (size_t i = 0; i < strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
  CHECK_EQ(paddings.size(), 4L)
      << "[APU] Paddings size should be 2 (same padding on both sides of each "
         "spatial dim) or 4, but got "
      << paddings.size();
operators::UpdatePaddingAndDilation(&paddings,
&dilations,
strides,
padding_algorithm,
input_dims,
filter_dims);
std::vector<int> output_dims;
// Set output_dims: batches
output_dims.push_back(input_dims[0]);
std::vector<int> output_size;
if (op_info->HasAttr("output_size")) {
output_size = op_info->GetAttr<std::vector<int>>("output_size");
}
  if (output_size.size() >= 2) {
// Set output_dims: height, width
output_dims.push_back(output_size[0]);
output_dims.push_back(output_size[1]);
} else {
// Compute output size
    for (size_t i = 0; i < strides.size(); i++) {
      int kernel_ext = filter_dims[i + 2];
      // Use a local name that does not shadow the output_size vector above
      int out_hw = (input_dims[i + 2] - 1) * strides[i] + kernel_ext -
                   paddings[i * 2] - paddings[i * 2 + 1];
      output_dims.push_back(out_hw);
}
}
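  // Worked example of the formula above (hypothetical numbers): an input
  // height of 14 with stride 2, kernel 4 and paddings 1/1 gives a
  // transposed-conv output height of (14 - 1) * 2 + 4 - 1 - 1 = 28.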
output_dims.push_back(filter_dims[1]);
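  // Note: output_dims is therefore laid out as N, H, W, C (NHWC), which is
  // the layout the Neuron output operand and the default bias size below
  // rely on.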
CHECK(op_info->HasInputScale(input_name));
auto input_scale = op_info->GetInputScale(input_name)[0];
CHECK(op_info->HasInputScale(filter_name));
auto filter_scale = op_info->GetInputScale(filter_name);
CHECK(op_info->HasOutputScale(output_name));
auto output_scale = op_info->GetOutputScale(output_name)[0];
VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups
<< " ,dilations: " << dilations[0] << ":" << dilations[1];
VLOG(3) << "with_act: " << with_act << " ,act_type: " << act_type;
VLOG(3) << "input_dims: " << input_dims
<< " ,filter_scale size: " << filter_scale.size();
VLOG(3) << "filter_dims(Cin, Cout, H, W): " << filter_dims
<< " ,memory_size: " << filter->memory_size()
<< " ,data_size: " << filter->data_size();
// Add input tensor type
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = input_scale;
inType.zeroPoint = 128;
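  // Paddle's symmetric int8 tensors are mapped onto Neuron's asymmetric uint8
  // operands with a fixed zeroPoint of 128 (see transposeAsym, which adds 128
  // per element).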
inType.dimensionCount = input_dims.size();
std::vector<uint32_t> dims_in = {(uint32_t)input_dims[0],
(uint32_t)input_dims[2],
(uint32_t)input_dims[3],
(uint32_t)input_dims[1]};
inType.dimensions = &dims_in[0];
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
VLOG(3) << "Graph has " << input_name;
// Input operand already created by previous OP
input_node = graph->Get(input_name);
} else {
// Add input operand
if (graph->IsInput(input_name)) {
// Insert transpose for NCHW -> NHWC
insert_transpose_node(ctx,
input_name,
"transpose_" + input_name,
{(uint32_t)input_dims[0],
(uint32_t)input_dims[1],
(uint32_t)input_dims[2],
(uint32_t)input_dims[3]},
dims_in,
{0, 2, 3, 1},
inType.scale,
inType.zeroPoint);
// Change input_name because we add transpose op
input_name = "transpose_" + input_name;
input_node = graph->Get(input_name);
if (input_node == nullptr) return subgraph::FAILED;
} else {
NeuronModel_addOperand(model, &inType);
input_node = graph->Add(input_name, dims_in);
}
}
VLOG(3) << "input node idx: " << input_node->index()
<< ": input_scale: " << input_scale
<< ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1]
<< ":" << inType.dimensions[2] << ":" << inType.dimensions[3];
// Add bias type
NeuronOperandType biasType;
// Add filter type
// Relay out filter (Cin,Cout,H,W) -> (depth_out, h, w, depth_in)
Tensor transpose_filter;
std::vector<uint32_t> dims_filter;
transpose_filter.Resize({(uint32_t)filter_dims[1],
(uint32_t)filter_dims[2],
(uint32_t)filter_dims[3],
(uint32_t)filter_dims[0]});
transposeAsym(filter->data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
{(uint32_t)filter_dims[0],
(uint32_t)filter_dims[1],
(uint32_t)filter_dims[2],
(uint32_t)filter_dims[3]},
{1, 2, 3, 0});
dims_filter = {(uint32_t)filter_dims[1],
(uint32_t)filter_dims[2],
(uint32_t)filter_dims[3],
(uint32_t)filter_dims[0]};
NeuronOperandType filterType;
filterType.type = NEURON_TENSOR_QUANT8_ASYMM;
filterType.scale = filter_scale[0];
filterType.zeroPoint = 128;
filterType.dimensionCount = filter_dims.size();
filterType.dimensions = &dims_filter[0];
biasType.scale = inType.scale * filterType.scale;
std::shared_ptr<Node> filter_node = nullptr;
NeuronModel_addOperand(model, &filterType);
filter_node = graph->Add(filter_name, dims_filter);
auto precision = filter->precision();
VLOG(3) << " filter node idx: " << filter_node->index()
<< " filter_scale[0]=" << filter_scale[0]
<< " filter memory_size=" << filter->memory_size()
<< " filter precision=" << PrecisionToStr(precision)
<< " :filterType: " << filterType.dimensions[0] << ":"
          << filterType.dimensions[1] << ":" << filterType.dimensions[2] << ":"
<< filterType.dimensions[3];
memcpy(filter->mutable_data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
filter->memory_size());
// Set filter value
neuron_errCode = NeuronModel_setOperandValue(
model, filter_node->index(), filter->raw_data(), filter->memory_size());
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode;
return subgraph::FAILED;
}
// Add biasType node value
// A 1-D tensor, of shape [depth_out], specifying the bias.
// For filter tensor of NEURON_TENSOR_QUANT8_ASYMM, the bias should be of
// NEURON_TENSOR_INT32 with zeroPoint of 0 and bias_scale ==
// input_scale * filter_scale
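  // For example (hypothetical values): with input_scale = 0.05 and
  // filter_scale[0] = 0.02, the bias operand is quantized with
  // bias_scale = 0.05 * 0.02 = 0.001 and zeroPoint = 0.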
biasType.type = NEURON_TENSOR_INT32;
biasType.zeroPoint = 0;
std::vector<uint32_t> dims_bias;
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto channel_size = bias->dims().production();
CHECK_EQ(channel_size, filter_dims[1] * groups);
CHECK_EQ(bias_dims.size(), 1);
biasType.dimensionCount = bias_dims.size();
for (int i = 0; i < bias_dims.size(); i++)
dims_bias.push_back(bias_dims[i]);
biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(bias_name, dims_bias);
VLOG(3) << "node idx: " << bias_node->index()
<< ": Bias name: " << bias_name
<< " ,bias scale: " << biasType.scale
<< " ,dimensions: " << bias_dims
<< " ,channel_size:" << channel_size;
} else {
// Create default bias with value 0
biasType.dimensionCount = 1;
    // output_dims is NHWC here, so index 3 holds the output channel count
    // (matching the int32_bias shape {1, output_dims[3]} set below)
    dims_bias = {(uint32_t)output_dims[3]};
biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(filter_name + "_default_bias", dims_bias);
VLOG(3) << "node idx: " << bias_node->index()
<< ": Bias name: default_bias "
<< " ,bias scale: " << biasType.scale
<< " ,dimensions: " << dims_bias.size();
}
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {1};
std::shared_ptr<Node> paddingL_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 3: padding left
paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32);
std::shared_ptr<Node> paddingR_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 4: padding right
paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32);
std::shared_ptr<Node> paddingT_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 5: padding top
paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32);
std::shared_ptr<Node> paddingB_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 6: padding bottom
paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32);
std::shared_ptr<Node> strideW_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 7: stride width
strideW_node = graph->Add(filter_name + "_stride_width", dims_int32);
std::shared_ptr<Node> strideH_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 8: stride height
strideH_node = graph->Add(filter_name + "_stride_height", dims_int32);
std::shared_ptr<Node> fuse_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 9: fuse
fuse_node = graph->Add(filter_name + "_fuse", dims_int32);
NeuronOperandType boolType;
boolType.type = NEURON_BOOL;
boolType.dimensionCount = 0; // Must be 0 for scalars.
std::shared_ptr<Node> layout_node = nullptr;
  NeuronModel_addOperand(model, &boolType);  // Operand 10: NCHW layout flag
layout_node = graph->Add(filter_name + "_layout", dims_int32);
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = output_scale;
outType.zeroPoint = 128;
outType.dimensionCount = output_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)output_dims[0],
(uint32_t)output_dims[1],
(uint32_t)output_dims[2],
(uint32_t)output_dims[3]};
outType.dimensions = &dims_out[0];
std::shared_ptr<Node> output_node = nullptr;
if (graph->Has(output_name)) {
output_node = graph->Get(output_name);
} else {
if (graph->IsOutput(output_name)) {
NeuronModel_addOperand(model, &outType);
output_node = graph->Add("transpose_" + output_name, dims_out);
} else {
NeuronModel_addOperand(model, &outType);
output_node = graph->Add(output_name, dims_out);
}
}
VLOG(3) << "output node idx: " << output_node->index()
<< ": output_scale: " << outType.scale
<< " ,outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Add bias value
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
int32_t *int32_bias_data =
reinterpret_cast<int32_t *>(bias->mutable_data<float>());
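    // float2int32 presumably quantizes each float bias value in place to
    // int32 with per-channel scale input_scale * filter_scale[c], reusing the
    // float buffer as int32 storage.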
float2int32(
bias->data<float>(), input_scale, filter_scale, int32_bias_data);
VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << ":"
<< int32_bias_data[1] << ":" << int32_bias_data[2] << ":"
<< int32_bias_data[3];
neuron_errCode = NeuronModel_setOperandValue(
model, bias_node->index(), bias->raw_data(), bias->memory_size());
} else {
auto int32_bias = std::make_shared<Tensor>();
int32_bias->Resize({1, output_dims[3]});
int32_bias->mutable_data<int32_t>();
VLOG(3) << "bais_default: " << int32_bias->memory_size();
memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size());
neuron_errCode = NeuronModel_setOperandValue(model,
bias_node->index(),
int32_bias->raw_data(),
int32_bias->memory_size());
bias_node->set_data(int32_bias);
}
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Set bias operand value fail:" << neuron_errCode;
return subgraph::FAILED;
}
VLOG(3) << "paddings: " << paddings[0] << ":" << paddings[1] << ":"
<< paddings[2] << ":" << paddings[3];
// Add padding value
int32_t padding_val[1];
padding_val[0] = paddings[2];
NeuronModel_setOperandValue(
model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[3];
NeuronModel_setOperandValue(
model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[0];
NeuronModel_setOperandValue(
model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[1];
NeuronModel_setOperandValue(
model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1);
VLOG(3) << " stride width:" << strides[1] << " height:" << strides[0];
// Add Stride
int32_t stride_val[1];
stride_val[0] = strides[1]; // entry 1: width stride
NeuronModel_setOperandValue(
model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
stride_val[0] = strides[0]; // entry 0: height stride
NeuronModel_setOperandValue(
model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
int32_t fuse_val[1] = {NEURON_FUSED_NONE};
if (act_type == "relu") {
fuse_val[0] = NEURON_FUSED_RELU;
} else if (act_type == "relu1") {
fuse_val[0] = NEURON_FUSED_RELU1;
} else if (act_type == "relu6") {
fuse_val[0] = NEURON_FUSED_RELU6;
} else if (!act_type.empty()) {
fuse_val[0] = NEURON_FUSED_NONE;
LOG(WARNING) << "Support act_type: " << act_type;
return FAILED;
}
NeuronModel_setOperandValue(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
bool layout_val[] = {false};
NeuronModel_setOperandValue(
model, layout_node->index(), layout_val, sizeof(bool) * 1);
std::vector<uint32_t> addInIndex = {
input_node->index(), // 0: input
filter_node->index(), // 1: filter
bias_node->index(), // 2: bias
paddingL_node->index(), // 3: padding left
paddingR_node->index(), // 4: padding right
paddingT_node->index(), // 5: padding top
paddingB_node->index(), // 6: padding bottom
strideW_node->index(), // 7: stride width
strideH_node->index(), // 8: stride height
fuse_node->index(), // 9: fuse
layout_node->index()}; // 10: layout
std::vector<uint32_t> addOutIndex = {output_node->index()};
neuron_errCode = NeuronModel_addOperation(model,
NEURON_TRANSPOSE_CONV_2D,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Add op fail:" << op_type;
return FAILED;
}
if (graph->IsOutput(output_name)) {
// Insert transpose for NHWC -> NCHW
insert_transpose_node(ctx,
"transpose_" + output_name,
output_name,
dims_out,
                          // output_dims is NHWC, so reorder to N, C, H, W
                          // for the transposed (NCHW) output shape
                          {(uint32_t)output_dims[0],
                           (uint32_t)output_dims[3],
                           (uint32_t)output_dims[1],
                           (uint32_t)output_dims[2]},
{0, 3, 1, 2},
outType.scale,
outType.zeroPoint);
output_node = graph->Get(output_name);
if (output_node == nullptr) return subgraph::FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(conv2d_transpose,
kAPU,
paddle::lite::subgraph::apu::ConvTransposeConverter);
...@@ -29,28 +29,252 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -29,28 +29,252 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto op_info = op->op_info(); auto op_info = op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto scope = op->scope(); auto scope = op->scope();
VLOG(3) << "[APU] Converting " + op_type + "..."; int neuron_errCode;
VLOG(3) << "[APU] Converting [" + op_type + "]";
// Get input and output vars and op attributes // Get input and output vars and op attributes
auto x_name = op_info->Input("X").front(); auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name); auto x = scope->FindTensor(x_name);
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front(); auto y_name = op_info->Input("Y").front();
auto y = scope->FindMutableTensor(y_name); auto y = scope->FindTensor(y_name);
auto y_dims = y->dims(); auto y_dims = y->dims();
auto out_name = op_info->Output("Out").front(); auto out_name = op_info->Output("Out").front();
auto out = scope->FindMutableTensor(out_name); auto out = scope->FindTensor(out_name);
auto out_dims = out->dims(); auto out_dims = out->dims();
auto axis = op_info->GetAttr<int>("axis"); auto axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
auto x_shape = x_dims.Vectorize();
auto y_shape = y_dims.Vectorize();
  // Two dimensions are compatible when:
  // 1. they are equal, or
  // 2. the y dimension is 1 (y is broadcast into x)
  if (axis < 0 ||
      static_cast<size_t>(axis) + y_shape.size() > x_shape.size()) {
    LOG(WARNING) << "[APU] Invalid broadcast axis: " << axis;
    return FAILED;
  }
  for (size_t i = 0; i < y_shape.size(); i++) {
    if (x_shape[axis + i] != y_shape[i] && y_shape[i] != 1) {
      LOG(WARNING) << "[APU] Incompatible dims for broadcast at x dim "
                   << axis + i << ": x = " << x_shape[axis + i]
                   << ", y = " << y_shape[i];
      return FAILED;
    }
  }  // End of for
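  // For example, x of shape (1, 64, 32, 32) and y of shape (64) with axis = 1
  // are compatible: y is broadcast over the batch and spatial dimensions.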
int32_t fuse_val[1] = {NEURON_FUSED_NONE};
// Act node // Act node
if (op_type == "fusion_elementwise_add_activation" || if (op_type == "fusion_elementwise_add_activation" ||
op_type == "fusion_elementwise_sub_activation" || op_type == "fusion_elementwise_sub_activation" ||
op_type == "fusion_elementwise_mul_activation" || op_type == "fusion_elementwise_mul_activation" ||
op_type == "fusion_elementwise_div_activation") { op_type == "fusion_elementwise_div_activation") {
auto act_type = op_info->GetAttr<std::string>("act_type"); auto act_type = op_info->GetAttr<std::string>("act_type");
if (act_type == "relu") {
fuse_val[0] = NEURON_FUSED_RELU;
} else if (act_type == "relu1") {
fuse_val[0] = NEURON_FUSED_RELU1;
} else if (act_type == "relu6") {
fuse_val[0] = NEURON_FUSED_RELU6;
} else if (!act_type.empty()) {
fuse_val[0] = NEURON_FUSED_NONE;
LOG(WARNING) << "Support act_type: " << act_type;
return FAILED;
}
} // End of if
VLOG(3) << "x_name" << x_name;
CHECK(op_info->HasInputScale(x_name));
auto x_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasInputScale(y_name));
auto y_scale = op_info->GetInputScale(y_name)[0];
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
// Add x tensor type
NeuronOperandType xType;
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
xType.scale = x_scale;
xType.zeroPoint = 128;
xType.dimensionCount = x_dims.size();
std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3],
(uint32_t)x_dims[1]};
xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
VLOG(3) << "Graph has " << x_name;
if (graph->IsInput(x_name)) {
VLOG(3) << x_name << "is input and already exist";
x_name = "transpose_" + x_name;
}
if (graph->IsOutput(x_name)) {
VLOG(3) << x_name << "is input and output node";
x_name = "transpose_" + x_name;
}
x_node = graph->Get(x_name);
} else {
if (graph->IsInput(x_name)) {
insert_transpose_node(ctx,
x_name,
"transpose_" + x_name,
{(uint32_t)x_dims[0],
(uint32_t)x_dims[1],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3]},
dims_x,
{0, 2, 3, 1},
xType.scale,
xType.zeroPoint);
// Change x name after insert transpose op for x data relayout
x_name = "transpose_" + x_name;
x_node = graph->Get(x_name);
} else {
NeuronModel_addOperand(model, &xType);
x_node = graph->Add(x_name, dims_x);
}
} // End of else
VLOG(3) << "x node idx: " << x_node->index() << "x_dims: " << x_dims
<< ": x_scale: " << x_scale << ", xType: " << xType.dimensions[0]
<< ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":"
<< xType.dimensions[3];
// Add y tensor type
NeuronOperandType yType;
yType.type = NEURON_TENSOR_QUANT8_ASYMM;
yType.scale = y_scale;
yType.zeroPoint = 128;
yType.dimensionCount = y_dims.size();
std::vector<uint32_t> dims_y = {(uint32_t)y_dims[0],
(uint32_t)y_dims[2],
(uint32_t)y_dims[3],
(uint32_t)y_dims[1]};
yType.dimensions = &dims_y[0];
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
VLOG(3) << "Graph has " << y_name;
y_node = graph->Get(y_name);
} else {
if (graph->IsInput(y_name)) {
insert_transpose_node(ctx,
y_name,
"transpose_" + y_name,
{(uint32_t)y_dims[0],
(uint32_t)y_dims[1],
(uint32_t)y_dims[2],
(uint32_t)y_dims[3]},
dims_y,
{0, 2, 3, 1},
yType.scale,
yType.zeroPoint);
y_name = "transpose_" + y_name;
y_node = graph->Get(y_name);
} else {
NeuronModel_addOperand(model, &yType);
y_node = graph->Add(y_name, dims_y);
}
}
VLOG(3) << "y node idx: " << y_node->index() << "y_dims: " << y_dims
<< ": y_scale: " << y_scale << ", yType: " << yType.dimensions[0]
<< ":" << yType.dimensions[1] << ":" << yType.dimensions[2] << ":"
<< yType.dimensions[3];
// Add fuse operand type
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {1};
// Add fuse operand
std::shared_ptr<Node> fuse_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 2: fuse
fuse_node = graph->Add(out_name + "_fuse", dims_int32);
// Add out tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = out_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3],
(uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0];
std::shared_ptr<Node> out_node = nullptr;
if (graph->Has(out_name)) {
VLOG(3) << "Graph has " << out_name;
out_node = graph->Get(out_name);
} else {
if (graph->IsOutput(out_name)) {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add("transpose_" + out_name, dims_out);
} else {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add(out_name, dims_out);
}
}
VLOG(3) << "out node idx: " << out_node->index() << "out_dims: " << out_dims
<< ": out_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Set fuse value
NeuronModel_setOperandValue(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = {
x_node->index(), // 0: A tensor
y_node->index(), // 1: A tensor of the same OperandCode,
// and compatible dimensions as input 0
fuse_node->index()}; // 2: fuse
std::vector<uint32_t> addOutIndex = {out_node->index()};
if (op_type == "elementwise_add" ||
op_type == "fusion_elementwise_add_activation") {
neuron_errCode = NeuronModel_addOperation(model,
NEURON_ADD,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
} else {
LOG(WARNING) << "[APU] Unsupported op type: " << op_type;
return FAILED;
}
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "ADD op fail:" << op_type;
return FAILED;
}
if (graph->IsOutput(out_name)) {
// Insert transpose for NHWC -> NCHW
insert_transpose_node(ctx,
"transpose_" + out_name,
out_name,
dims_out,
{(uint32_t)out_dims[0],
(uint32_t)out_dims[1],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3]},
{0, 3, 1, 2},
outType.scale,
outType.zeroPoint);
out_node = graph->Get(out_name);
if (out_node == nullptr) return FAILED;
} }
return REBUILD_WHEN_SHAPE_CHANGED; return REBUILD_WHEN_SHAPE_CHANGED;
...@@ -67,3 +291,6 @@ REGISTER_SUBGRAPH_BRIDGE(elementwise_add, ...@@ -67,3 +291,6 @@ REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kAPU, kAPU,
paddle::lite::subgraph::apu::ElementwiseConverter); paddle::lite::subgraph::apu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation,
kAPU,
paddle::lite::subgraph::apu::ElementwiseConverter);
...@@ -77,12 +77,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -77,12 +77,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
inType.dimensions = &dims_in[0]; inType.dimensions = &dims_in[0];
std::shared_ptr<Node> in_node = nullptr; std::shared_ptr<Node> in_node = nullptr;
if (graph->Has(input_name)) { if (graph->Has(input_name)) {
// input operand already exist
in_node = graph->Get(input_name); in_node = graph->Get(input_name);
VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index(); VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index();
} else { } else {
// add input operand NeuronModel_addOperand(model, &inType); // Operand 0: input
NeuronModel_addOperand(model, &inType); // 0: input
in_node = graph->Add(input_name, dims_in); in_node = graph->Add(input_name, dims_in);
} }
VLOG(3) << "input_scale: " << input_scale VLOG(3) << "input_scale: " << input_scale
...@@ -97,7 +95,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -97,7 +95,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
wType.dimensionCount = w_dims.size(); wType.dimensionCount = w_dims.size();
std::vector<uint32_t> dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]}; std::vector<uint32_t> dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]};
wType.dimensions = &dims_w[0]; wType.dimensions = &dims_w[0];
NeuronModel_addOperand(model, &wType); // 1: weight NeuronModel_addOperand(model, &wType); // Operand 1: weight
std::shared_ptr<Node> w_node = nullptr; std::shared_ptr<Node> w_node = nullptr;
w_node = graph->Add(w_name, dims_w); w_node = graph->Add(w_name, dims_w);
VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0] VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0]
...@@ -119,7 +117,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -119,7 +117,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
biasType.dimensionCount = bias_dims.size(); biasType.dimensionCount = bias_dims.size();
std::vector<uint32_t> dims_bias = {(uint32_t)bias_dims[0]}; std::vector<uint32_t> dims_bias = {(uint32_t)bias_dims[0]};
biasType.dimensions = &dims_bias[0]; biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // 2: bias NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(bias_name, dims_bias); bias_node = graph->Add(bias_name, dims_bias);
VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims
<< ", bias scale: " << biasType.scale << ", bias scale: " << biasType.scale
...@@ -128,7 +126,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -128,7 +126,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
biasType.dimensionCount = 1; biasType.dimensionCount = 1;
std::vector<uint32_t> dims_bias = {(uint32_t)n}; std::vector<uint32_t> dims_bias = {(uint32_t)n};
biasType.dimensions = &dims_bias[0]; biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // 2: bias NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(w_name + "_default_bias", dims_bias); bias_node = graph->Add(w_name + "_default_bias", dims_bias);
} }
...@@ -137,7 +135,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -137,7 +135,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
fuseType.type = NEURON_INT32; fuseType.type = NEURON_INT32;
fuseType.dimensionCount = 0; fuseType.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {0}; std::vector<uint32_t> dims_int32 = {0};
NeuronModel_addOperand(model, &fuseType); // 3: fuse NeuronModel_addOperand(model, &fuseType); // Operand 3: fuse
std::shared_ptr<Node> fuse_node = nullptr; std::shared_ptr<Node> fuse_node = nullptr;
fuse_node = graph->Add(w_name + "_fuse", dims_int32); fuse_node = graph->Add(w_name + "_fuse", dims_int32);
...@@ -147,12 +145,13 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -147,12 +145,13 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
outType.scale = out_scale; outType.scale = out_scale;
outType.zeroPoint = 128; outType.zeroPoint = 128;
outType.dimensionCount = 2; outType.dimensionCount = 2;
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0], out_dims[1]}; std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
(uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0]; outType.dimensions = &dims_out[0];
VLOG(3) << "out_scale: " << out_scale VLOG(3) << "out_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << " : " << ", outType: " << outType.dimensions[0] << " : "
<< outType.dimensions[1]; << outType.dimensions[1];
NeuronModel_addOperand(model, &outType); // output NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> out_node = nullptr; std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_out); out_node = graph->Add(out_name, dims_out);
...@@ -190,29 +189,31 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -190,29 +189,31 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronModel_setOperandValue(model, NeuronModel_setOperandValue(model,
bias_node->index(), bias_node->index(),
bias->raw_data(), bias->raw_data(),
bias->memory_size()); // 2: bias bias->memory_size()); // Operand 2: bias
} else { } else {
auto int32_bias = std::make_shared<Tensor>(); auto int32_bias = std::make_shared<Tensor>();
int32_bias->Resize({1, out_dims[1]}); int32_bias->Resize({1, out_dims[1]});
int32_bias->mutable_data<int32_t>(); int32_bias->mutable_data<int32_t>();
memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size()); memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size());
VLOG(3) << "default: " << int32_bias->memory_size(); VLOG(3) << "default: " << int32_bias->memory_size();
neuron_errCode = neuron_errCode = NeuronModel_setOperandValue(
NeuronModel_setOperandValue(model, model,
bias_node->index(), bias_node->index(),
int32_bias->raw_data(), int32_bias->raw_data(),
int32_bias->memory_size()); // 2: bias int32_bias->memory_size()); // Operand 2: bias
bias_node->set_data(int32_bias); bias_node->set_data(int32_bias);
} }
// Add fuse value // Add fuse value
int32_t fuse_val[1] = {0}; int32_t fuse_val[1] = {0};
NeuronModel_setOperandValue( NeuronModel_setOperandValue(model,
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); // 3: fuse fuse_node->index(),
fuse_val,
std::vector<uint32_t> addInIndex = {in_node->index(), sizeof(int32_t) * 1); // Operand 3: fuse
w_node->index(),
bias_node->index(), std::vector<uint32_t> addInIndex = {in_node->index(), // 0: input
fuse_node->index()}; w_node->index(), // 1: weight
bias_node->index(), // 2: bias
fuse_node->index()}; // 3: fuse
std::vector<uint32_t> addOutIndex = {out_node->index()}; std::vector<uint32_t> addOutIndex = {out_node->index()};
neuron_errCode = NeuronModel_addOperation(model, neuron_errCode = NeuronModel_addOperation(model,
NEURON_FULLY_CONNECTED, NEURON_FULLY_CONNECTED,
......
...@@ -28,7 +28,7 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) { ...@@ -28,7 +28,7 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
LOG(FATAL) << "[APU] Node" << name << " is redefined."; LOG(FATAL) << "[APU] Node" << name << " is redefined.";
return -1; return -1;
} else { } else {
VLOG(3) << " Add: " << name << " : " << node->index(); VLOG(5) << " Add: " << name << " : " << node->index();
auto ret = nodes_.insert( auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>())); std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second); CHECK(ret.second);
......
...@@ -22,3 +22,6 @@ USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU); ...@@ -22,3 +22,6 @@ USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU);
USE_SUBGRAPH_BRIDGE(fc, kAPU); USE_SUBGRAPH_BRIDGE(fc, kAPU);
USE_SUBGRAPH_BRIDGE(pool2d, kAPU); USE_SUBGRAPH_BRIDGE(pool2d, kAPU);
USE_SUBGRAPH_BRIDGE(softmax, kAPU); USE_SUBGRAPH_BRIDGE(softmax, kAPU);
USE_SUBGRAPH_BRIDGE(concat, kAPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kAPU);
USE_SUBGRAPH_BRIDGE(conv2d_transpose, kAPU);
...@@ -47,14 +47,14 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -47,14 +47,14 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto ksize = op_info->GetAttr<std::vector<int>>("ksize"); auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings"); std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
// pool mode // Check pool mode
if ((pooling_type == "max") || (pooling_type == "avg")) { if ((pooling_type == "max") || (pooling_type == "avg")) {
} else { } else {
LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type; LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type;
return FAILED; return FAILED;
} }
// pad mode // Check padding mode
int pad_mode = 0; int pad_mode = 0;
std::string padding_algorithm(""); std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) { if (op_info->HasAttr("padding_algorithm")) {
...@@ -66,7 +66,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -66,7 +66,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
pad_mode = 5; pad_mode = 5;
} }
// paddings and strides // Check paddings and strides
if (paddings.size() == 2L) { if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) { for (size_t i = 0; i < 2L; ++i) {
int copy_pad = *(paddings.begin() + 2 * i); int copy_pad = *(paddings.begin() + 2 * i);
...@@ -107,60 +107,59 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -107,60 +107,59 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
xType.dimensions = &dims_x[0]; xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr; std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) { if (graph->Has(x_name)) {
LOG(INFO) << "Graph has " << x_name; VLOG(3) << "Graph has " << x_name;
// input operand already exist
x_node = graph->Get(x_name); x_node = graph->Get(x_name);
} else { } else {
// add input operand NeuronModel_addOperand(model, &xType); // Operand 0: x
NeuronModel_addOperand(model, &xType); // 0: x
x_node = graph->Add(x_name, dims_x); x_node = graph->Add(x_name, dims_x);
} }
VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0] << ":" VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0] << ":"
<< xType.dimensions[1] << ":" << xType.dimensions[2] << ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":"
<< xType.dimensions[3]; << xType.dimensions[3];
VLOG(3) << "ksize:" << ksize[0] << ":" << ksize[1];
NeuronOperandType int32Type; NeuronOperandType int32Type;
int32Type.type = NEURON_INT32; int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0; int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {0}; std::vector<uint32_t> dims_int32 = {0};
std::shared_ptr<Node> paddingL_node = nullptr; std::shared_ptr<Node> paddingL_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 1: padding left NeuronModel_addOperand(model, &int32Type); // Operand 1: padding left
paddingL_node = graph->Add(x_name + "_padding_left", dims_int32); paddingL_node = graph->Add(x_name + "_padding_left", dims_int32);
std::shared_ptr<Node> paddingR_node = nullptr; std::shared_ptr<Node> paddingR_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 2: padding right NeuronModel_addOperand(model, &int32Type); // Operand 2: padding right
paddingR_node = graph->Add(x_name + "_padding_right", dims_int32); paddingR_node = graph->Add(x_name + "_padding_right", dims_int32);
std::shared_ptr<Node> paddingT_node = nullptr; std::shared_ptr<Node> paddingT_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 3: padding top NeuronModel_addOperand(model, &int32Type); // Operand 3: padding top
paddingT_node = graph->Add(x_name + "_padding_top", dims_int32); paddingT_node = graph->Add(x_name + "_padding_top", dims_int32);
std::shared_ptr<Node> paddingB_node = nullptr; std::shared_ptr<Node> paddingB_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 4: padding bottom NeuronModel_addOperand(model, &int32Type); // Operand 4: padding bottom
paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32); paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32);
std::shared_ptr<Node> strideW_node = nullptr; std::shared_ptr<Node> strideW_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 5: stride width NeuronModel_addOperand(model, &int32Type); // Operand 5: stride width
strideW_node = graph->Add(x_name + "_stride_width", dims_int32); strideW_node = graph->Add(x_name + "_stride_width", dims_int32);
std::shared_ptr<Node> strideH_node = nullptr; std::shared_ptr<Node> strideH_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 6: stride height NeuronModel_addOperand(model, &int32Type); // Operand 6: stride height
strideH_node = graph->Add(x_name + "_stride_height", dims_int32); strideH_node = graph->Add(x_name + "_stride_height", dims_int32);
std::shared_ptr<Node> filterW_node = nullptr; std::shared_ptr<Node> filterW_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 7: filter width NeuronModel_addOperand(model, &int32Type); // Operand 7: filter width
filterW_node = graph->Add(x_name + "_filter_width", dims_int32); filterW_node = graph->Add(x_name + "_filter_width", dims_int32);
std::shared_ptr<Node> filterH_node = nullptr; std::shared_ptr<Node> filterH_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 8: filter height NeuronModel_addOperand(model, &int32Type); // Operand 8: filter height
filterH_node = graph->Add(x_name + "_filter_height", dims_int32); filterH_node = graph->Add(x_name + "_filter_height", dims_int32);
std::shared_ptr<Node> fuse_node = nullptr; std::shared_ptr<Node> fuse_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 9: fuse NeuronModel_addOperand(model, &int32Type); // Operand 9: fuse
fuse_node = graph->Add(x_name + "_fuse", dims_int32); fuse_node = graph->Add(x_name + "_pool_fuse", dims_int32);
// Add out type
// Add output tensor type // Add output tensor type
NeuronOperandType outType; NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM; outType.type = NEURON_TENSOR_QUANT8_ASYMM;
...@@ -176,10 +175,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -176,10 +175,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (graph->Has(out_name)) { if (graph->Has(out_name)) {
out_node = graph->Get(out_name); out_node = graph->Get(out_name);
} else { } else {
NeuronModel_addOperand(model, &outType); // out NeuronModel_addOperand(model, &outType);
out_node = graph->Add(out_name, dims_out); out_node = graph->Add(out_name, dims_out);
} }
VLOG(3) << "output_scale: " << x_scale VLOG(3) << "output_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << ":" << ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":" << outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3]; << outType.dimensions[3];
...@@ -201,19 +200,21 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -201,19 +200,21 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add Stride // Add Stride
int32_t stride_val[1]; int32_t stride_val[1];
stride_val[0] = strides[1]; // width stride_val[0] = strides[1]; // Entry 1: width stride
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
stride_val[0] = strides[0]; // height stride_val[0] = strides[0]; // Entry 0: height stride
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
// Add filter // Add filter
int32_t filter_val[1]; int32_t filter_val[1];
filter_val[0] = global_pooling ? x_dims[3] : ksize[1]; // width filter_val[0] =
global_pooling ? x_dims[3] : ksize[1]; // Entry 1: filter width
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, filterW_node->index(), filter_val, sizeof(int32_t) * 1); model, filterW_node->index(), filter_val, sizeof(int32_t) * 1);
filter_val[0] = global_pooling ? x_dims[2] : ksize[0]; // height filter_val[0] =
global_pooling ? x_dims[2] : ksize[0]; // Entry 0: filter height
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, filterH_node->index(), filter_val, sizeof(int32_t) * 1); model, filterH_node->index(), filter_val, sizeof(int32_t) * 1);
......
...@@ -64,12 +64,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -64,12 +64,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
xType.dimensions = &dims_x[0]; xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr; std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) { if (graph->Has(x_name)) {
// input operand already exist
x_node = graph->Get(x_name); x_node = graph->Get(x_name);
VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index(); VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index();
} else { } else {
// add input operand NeuronModel_addOperand(model, &xType); // Operand 0: input
NeuronModel_addOperand(model, &xType); // 0: input
x_node = graph->Add(x_name, dims_x); x_node = graph->Add(x_name, dims_x);
} }
VLOG(3) << "input_scale size: " << input_scale VLOG(3) << "input_scale size: " << input_scale
...@@ -80,7 +78,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -80,7 +78,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType betaType; NeuronOperandType betaType;
betaType.type = NEURON_FLOAT32; betaType.type = NEURON_FLOAT32;
betaType.dimensionCount = 0; betaType.dimensionCount = 0;
NeuronModel_addOperand(model, &betaType); // 1: beta NeuronModel_addOperand(model, &betaType); // Operand 1: beta
std::shared_ptr<Node> beta_node = nullptr; std::shared_ptr<Node> beta_node = nullptr;
beta_node = graph->Add(x_name + "_beta", dims_int32); beta_node = graph->Add(x_name + "_beta", dims_int32);
...@@ -88,7 +86,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -88,7 +86,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType axisType; NeuronOperandType axisType;
axisType.type = NEURON_INT32; axisType.type = NEURON_INT32;
axisType.dimensionCount = 0; axisType.dimensionCount = 0;
NeuronModel_addOperand(model, &axisType); // 2: axis NeuronModel_addOperand(model, &axisType); // Operand 2: axis
std::shared_ptr<Node> axis_node = nullptr; std::shared_ptr<Node> axis_node = nullptr;
axis_node = graph->Add(x_name + "_axis", dims_int32); axis_node = graph->Add(x_name + "_axis", dims_int32);
...@@ -99,7 +97,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -99,7 +97,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
outType.zeroPoint = 128; outType.zeroPoint = 128;
outType.dimensionCount = x_dims.size(); outType.dimensionCount = x_dims.size();
outType.dimensions = &dims_x[0]; outType.dimensions = &dims_x[0];
NeuronModel_addOperand(model, &outType); // 3: output NeuronModel_addOperand(model, &outType); // Operand 3: output
std::shared_ptr<Node> out_node = nullptr; std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_x); out_node = graph->Add(out_name, dims_x);
VLOG(3) << "out_scale: " << out_scale; VLOG(3) << "out_scale: " << out_scale;
...@@ -112,8 +110,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -112,8 +110,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis_val[0] = axis; axis_val[0] = axis;
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, axis_node->index(), axis_val, sizeof(int32_t) * 1); model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = { std::vector<uint32_t> addInIndex = {x_node->index(), // 0: input
x_node->index(), beta_node->index(), axis_node->index()}; beta_node->index(), // 1: beta
axis_node->index()}; // 2: axis
std::vector<uint32_t> addOutIndex = {out_node->index()}; std::vector<uint32_t> addOutIndex = {out_node->index()};
int neuron_errCode = NeuronModel_addOperation(model, int neuron_errCode = NeuronModel_addOperation(model,
NEURON_SOFTMAX, NEURON_SOFTMAX,
......
...@@ -39,22 +39,43 @@ bool HasInputArg(const OpInfo* op_info, ...@@ -39,22 +39,43 @@ bool HasInputArg(const OpInfo* op_info,
} }
} }
void insert_transpose_node(void* ctx, int insert_requant_node(void* ctx,
const std::string& input_name, const std::string& input_name,
const std::string& output_name, const std::string& output_name,
std::vector<uint32_t> input_shape, std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape, std::vector<uint32_t> output_shape,
std::vector<int32_t> axis, float scale_in,
float scale, float scale_out,
int32_t zeroPoint) { int32_t zeroPoint) {
int neuron_errCode; int neuron_errCode;
auto graph = static_cast<Graph*>(ctx); auto graph = static_cast<Graph*>(ctx);
auto model = graph->model(); auto model = graph->model();
uint32_t numDevices = 0;
CHECK_EQ(Neuron_getDeviceCount(&numDevices), NEURON_NO_ERROR);
CHECK_GT(numDevices, (uint32_t)0);
NeuronDevice* targetDevice = nullptr;
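  // Look up the "mtk-dsp" device: the MTK_REQUANTIZE vendor extension used
  // below is assumed to be available only on that device.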
for (uint32_t i = 0; i < numDevices; ++i) {
NeuronDevice* device = nullptr;
Neuron_getDevice(i, &device);
const char* name;
NeuronDevice_getName(device, &name);
if (0 == strcmp(name, "mtk-dsp")) {
targetDevice = device;
break;
}
}
if (targetDevice == nullptr) {
LOG(FATAL) << "Insert mtk_requant op fail!";
return -1;
}
// Add input // Add input
NeuronOperandType inType; NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM; inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = scale; inType.scale = scale_in;
inType.zeroPoint = zeroPoint; inType.zeroPoint = zeroPoint;
inType.dimensionCount = input_shape.size(); inType.dimensionCount = input_shape.size();
inType.dimensions = &input_shape[0]; inType.dimensions = &input_shape[0];
...@@ -64,15 +85,81 @@ void insert_transpose_node(void* ctx, ...@@ -64,15 +85,81 @@ void insert_transpose_node(void* ctx,
VLOG(3) << "Has " << input_name; VLOG(3) << "Has " << input_name;
input_node = graph->Get(input_name); input_node = graph->Get(input_name);
} else { } else {
neuron_errCode = NeuronModel_addOperand(model, &inType); // input neuron_errCode = NeuronModel_addOperand(model, &inType);
if (NEURON_NO_ERROR != neuron_errCode) { if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!"; LOG(FATAL) << "Insert mtk_requant op fail!";
return; return -1;
} }
VLOG(3) << "Add " << input_name; VLOG(3) << "Add " << input_name;
input_node = graph->Add(input_name, input_shape); input_node = graph->Add(input_name, input_shape);
} }
// Add output
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = scale_out;
outType.zeroPoint = zeroPoint;
outType.dimensionCount = output_shape.size();
outType.dimensions = &output_shape[0];
NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> output_node = nullptr;
output_node = graph->Add(output_name, output_shape);
std::vector<uint32_t> addInIndex = {input_node->index()};
std::vector<uint32_t> addOutIndex = {output_node->index()};
neuron_errCode = NeuronModel_addOperationExtension(model,
"MTK_REQUANTIZE",
"mediatek",
targetDevice,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert mtk_requant op fail!";
return -1;
}
return 0;
}
int insert_transpose_node(void* ctx,
const std::string& input_name,
const std::string& output_name,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape,
std::vector<int32_t> axis,
float scale,
int32_t zeroPoint) {
int neuron_errCode;
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
// Add input
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = scale;
inType.zeroPoint = zeroPoint;
inType.dimensionCount = input_shape.size();
inType.dimensions = &input_shape[0];
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
VLOG(5) << "Has " << input_name;
input_node = graph->Get(input_name);
} else {
neuron_errCode = NeuronModel_addOperand(model, &inType);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert transpose op fail!";
return -1;
}
VLOG(5) << "Add " << input_name;
input_node = graph->Add(input_name, input_shape);
}
// Add perm // Add perm
NeuronOperandType permsType; NeuronOperandType permsType;
permsType.type = NEURON_TENSOR_INT32; permsType.type = NEURON_TENSOR_INT32;
...@@ -80,22 +167,22 @@ void insert_transpose_node(void* ctx, ...@@ -80,22 +167,22 @@ void insert_transpose_node(void* ctx,
uint32_t dims_perms[1] = {4}; uint32_t dims_perms[1] = {4};
permsType.dimensions = dims_perms; permsType.dimensions = dims_perms;
neuron_errCode = NeuronModel_addOperand(model, &permsType); // perm neuron_errCode = NeuronModel_addOperand(model, &permsType);
if (NEURON_NO_ERROR != neuron_errCode) { if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!"; LOG(FATAL) << "Insert transpose op fail!";
return; return -1;
} }
std::shared_ptr<Node> perms_node = nullptr; std::shared_ptr<Node> perms_node = nullptr;
perms_node = graph->Add(input_name + "_perms", {4}); perms_node = graph->Add(input_name + "_perms", {4});
VLOG(3) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" VLOG(5) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":"
<< axis[3]; << axis[3];
// &axis[0], sizeof(int32_t) * axis.size());
neuron_errCode = NeuronModel_setOperandValue( neuron_errCode = NeuronModel_setOperandValue(
model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size()); model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size());
if (NEURON_NO_ERROR != neuron_errCode) { if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!"; LOG(FATAL) << "Insert transpose op fail!";
return; return -1;
} }
// Add output // Add output
...@@ -106,7 +193,7 @@ void insert_transpose_node(void* ctx, ...@@ -106,7 +193,7 @@ void insert_transpose_node(void* ctx,
outType.dimensionCount = output_shape.size(); outType.dimensionCount = output_shape.size();
outType.dimensions = &output_shape[0]; outType.dimensions = &output_shape[0];
NeuronModel_addOperand(model, &outType); // output NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> output_node = nullptr; std::shared_ptr<Node> output_node = nullptr;
output_node = graph->Add(output_name, output_shape); output_node = graph->Add(output_name, output_shape);
...@@ -123,8 +210,10 @@ void insert_transpose_node(void* ctx, ...@@ -123,8 +210,10 @@ void insert_transpose_node(void* ctx,
&addOutIndex[0]); &addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) { if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!"; LOG(FATAL) << "Insert transpose op fail!";
} }
return 0;
} }
void transpose(const int8_t* input_data, void transpose(const int8_t* input_data,
...@@ -135,9 +224,9 @@ void transpose(const int8_t* input_data, ...@@ -135,9 +224,9 @@ void transpose(const int8_t* input_data,
int new_index = -1; int new_index = -1;
int dim[4] = {0}; int dim[4] = {0};
std::vector<uint32_t> shape = input_shape; std::vector<uint32_t> shape = input_shape;
VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
<< ":" << input_shape[3]; << ":" << input_shape[3];
VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
...@@ -164,9 +253,9 @@ void transposeAsym(const int8_t* input_data, ...@@ -164,9 +253,9 @@ void transposeAsym(const int8_t* input_data,
int new_index = -1; int new_index = -1;
int dim[4] = {0}; int dim[4] = {0};
std::vector<uint32_t> shape = input_shape; std::vector<uint32_t> shape = input_shape;
VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
<< ":" << input_shape[3]; << ":" << input_shape[3];
VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
...@@ -177,8 +266,8 @@ void transposeAsym(const int8_t* input_data, ...@@ -177,8 +266,8 @@ void transposeAsym(const int8_t* input_data,
dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
// Per-layer ops are asymmetric, so 128 needs to be added (int8 -> uint8)
output_data[new_index] = input_data[old_index] + 128; // per layer output_data[new_index] = input_data[old_index] + 128;
} }
} }
} }
......
...@@ -33,14 +33,23 @@ bool HasInputArg(const OpInfo* op_info, ...@@ -33,14 +33,23 @@ bool HasInputArg(const OpInfo* op_info,
const Scope* scope, const Scope* scope,
const std::string& argname); const std::string& argname);
void insert_transpose_node(void* ctx, int insert_requant_node(void* ctx,
const std::string& input_name, const std::string& input_name,
const std::string& output_name, const std::string& output_name,
std::vector<uint32_t> input_shape, std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape, std::vector<uint32_t> output_shape,
std::vector<int32_t> axis, float scale_in,
float scale, float scale_out,
int32_t zeroPoint); int32_t zeroPoint);
int insert_transpose_node(void* ctx,
const std::string& input_name,
const std::string& output_name,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape,
std::vector<int32_t> axis,
float scale,
int32_t zeroPoint);
void transpose(const int8_t* input_data, void transpose(const int8_t* input_data,
uint8_t* output_data, uint8_t* output_data,
......
...@@ -33,6 +33,14 @@ bool SubgraphEngine::BuildDeviceProgram() { ...@@ -33,6 +33,14 @@ bool SubgraphEngine::BuildDeviceProgram() {
BuildOriginProgram(); BuildOriginProgram();
} }
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
unsigned int version; unsigned int version;
Neuron_getVersion(&version); Neuron_getVersion(&version);
VLOG(3) << "Neuron Adapter version: " << version; VLOG(3) << "Neuron Adapter version: " << version;
...@@ -108,18 +116,16 @@ bool SubgraphEngine::BuildDeviceProgram() { ...@@ -108,18 +116,16 @@ bool SubgraphEngine::BuildDeviceProgram() {
} }
VLOG(3) << "[APU] APU NIR model created!"; VLOG(3) << "[APU] APU NIR model created!";
auto GetCurrentUS = []() -> double { VLOG(1) << "[APU] APU NIR model created, Create cost "
struct timeval time; << GetCurrentUS() - start_time << " us";
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec; start_time = GetCurrentUS();
};
auto start_time = GetCurrentUS();
compilation_ = lite::apu::Device::Global().Build(model_); compilation_ = lite::apu::Device::Global().Build(model_);
if (compilation_ == nullptr) { if (compilation_ == nullptr) {
LOG(WARNING) << "[APU] Build APU DLA model failed!"; LOG(WARNING) << "[APU] Build APU DLA model failed!";
return false; return false;
} }
VLOG(3) << "[APU] APU DLA model created, Build cost " VLOG(1) << "[APU] APU DLA model created, Build cost "
<< GetCurrentUS() - start_time << " us"; << GetCurrentUS() - start_time << " us";
return true; return true;
} }
...@@ -176,7 +182,7 @@ bool SubgraphEngine::LaunchDeviceProgram() { ...@@ -176,7 +182,7 @@ bool SubgraphEngine::LaunchDeviceProgram() {
} }
} }
NeuronExecution_free(run); NeuronExecution_free(run);
VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; VLOG(1) << "[APU] Process cost " << GetCurrentUS() - start_time << " us";
return true; return true;
} }
......
...@@ -26,6 +26,88 @@ namespace lite { ...@@ -26,6 +26,88 @@ namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
template <typename Dtype>
void naive_transpose(const Dtype* din, Dtype* dout, int m, int n) {
int k = 0;
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
dout[k++] = din[j * n + i];
}
}
}
template <PrecisionType PType>
void fc_trans_weights(const Tensor& tin, Tensor* tout);
template <>
void fc_trans_weights<PRECISION(kFloat)>(const Tensor& tin, Tensor* tout) {
CHECK_EQ(tin.dims().size(), 2) << "fc weights size must = 2";
int m = tin.dims()[0];
int n = tin.dims()[1];
tout->Resize({n, m});
auto* ptr_in = tin.data<float>();
auto* ptr_out = tout->mutable_data<float>();
naive_transpose(ptr_in, ptr_out, m, n);
}
template <>
void fc_trans_weights<PRECISION(kInt8)>(const Tensor& tin, Tensor* tout) {
CHECK_EQ(tin.dims().size(), 2) << "fc weights size must = 2";
int m = tin.dims()[0];
int n = tin.dims()[1];
tout->Resize({n, m});
auto* ptr_in = tin.data<int8_t>();
auto* ptr_out = tout->mutable_data<int8_t>();
naive_transpose(ptr_in, ptr_out, m, n);
}
template <PrecisionType PType, PrecisionType OutType>
bool check_fc_use_gemm(int m, const std::vector<float>& scale, bool has_bias) {
return m > 1;
}
template <>
bool check_fc_use_gemm<PRECISION(kInt8), PRECISION(kFloat)>(
int m, const std::vector<float>& scale, bool has_bias) {
  CHECK_GT(scale.size(), 0) << "Int8 FC param must have weight_scale";
return m > 1 && scale.size() == 1;
}
template <>
bool check_fc_use_gemm<PRECISION(kInt8), PRECISION(kInt8)>(
int m, const std::vector<float>& scale, bool has_bias) {
  CHECK_GT(scale.size(), 0) << "Int8 FC param must have weight_scale";
return m > 1 && scale.size() == 1 && !has_bias;
}
template <PrecisionType PType, PrecisionType OutType>
void FcCompute<PType, OutType>::ReInitWhenNeeded() {
auto& param = this->template Param<operators::FcParam>();
auto x_dims = param.input->dims();
if (last_shape_ == x_dims) {
return;
}
last_shape_ = x_dims;
auto w_dims = param.w->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
CHECK_GE(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
CHECK_EQ(k_, w_dims[0]);
n_ = w_dims[1];
CHECK_EQ(k_, static_cast<int>(w_dims[0]));
flag_gemm_ = check_fc_use_gemm<PType, OutType>(
m_, param.weight_scale, param.bias != nullptr);
if (!flag_trans_weights_ && !flag_gemm_) {
flag_trans_weights_ = true;
fc_trans_weights<PType>(*param.w, &weights_);
}
}
/// for fp32 kernel /// for fp32 kernel
template <> template <>
void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
...@@ -71,8 +153,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -71,8 +153,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
/// update bias /// update bias
if (param.bias) { if (param.bias) {
bias_.Resize(param.bias->dims()); bias_.Resize(param.bias->dims());
auto ptr = bias_.mutable_data<float>(); auto* ptr = bias_.mutable_data<float>();
auto ptr_in = bias_.data<float>(); auto* ptr_in = bias_.data<float>();
float out_scale = param.output_scale; float out_scale = param.output_scale;
for (int i = 0; i < bias_.numel(); ++i) { for (int i = 0; i < bias_.numel(); ++i) {
ptr[i] = ptr_in[i] / out_scale; ptr[i] = ptr_in[i] / out_scale;
...@@ -86,9 +168,9 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -86,9 +168,9 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<operators::FcParam>(); auto& param = this->Param<operators::FcParam>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
auto i_data = param.input->data<float>(); auto* i_data = param.input->data<float>();
auto o_data = param.output->mutable_data<float>(); auto* o_data = param.output->mutable_data<float>();
auto w_data = param.w->data<float>(); auto* w_data = flag_gemm_ ? param.w->data<float>() : weights_.data<float>();
const float* b_data = param.bias ? param.bias->data<float>() : nullptr; const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
if (flag_trans_bias_) { if (flag_trans_bias_) {
b_data = bias_.data<float>(); b_data = bias_.data<float>();
...@@ -125,8 +207,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -125,8 +207,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
} }
} else { } else {
for (int i = 0; i < m_; ++i) { for (int i = 0; i < m_; ++i) {
auto i_data_batch = i_data + i * k_; auto* i_data_batch = i_data + i * k_;
auto o_data_batch = o_data + i * n_; auto* o_data_batch = o_data + i * n_;
lite::arm::math::sgemv(w_data, lite::arm::math::sgemv(w_data,
i_data_batch, i_data_batch,
o_data_batch, o_data_batch,
...@@ -147,9 +229,10 @@ void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -147,9 +229,10 @@ void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<operators::FcParam>(); auto& param = this->Param<operators::FcParam>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
auto i_data = param.input->data<int8_t>(); auto* i_data = param.input->data<int8_t>();
auto o_data = param.output->mutable_data<float>(); auto* o_data = param.output->mutable_data<float>();
auto w_data = param.w->data<int8_t>(); auto* w_data =
flag_trans_weights_ ? weights_.data<int8_t>() : param.w->data<int8_t>();
const float* b_data = param.bias ? param.bias->data<float>() : nullptr; const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
if (flag_trans_bias_) { if (flag_trans_bias_) {
b_data = bias_.data<float>(); b_data = bias_.data<float>();
...@@ -182,8 +265,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -182,8 +265,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
} }
} else { } else {
for (int i = 0; i < m_; ++i) { for (int i = 0; i < m_; ++i) {
auto i_data_batch = i_data + i * k_; auto* i_data_batch = i_data + i * k_;
auto o_data_batch = o_data + i * n_; auto* o_data_batch = o_data + i * n_;
lite::arm::math::gemv_int8(w_data, lite::arm::math::gemv_int8(w_data,
i_data_batch, i_data_batch,
o_data_batch, o_data_batch,
...@@ -205,9 +288,10 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::Run() { ...@@ -205,9 +288,10 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<operators::FcParam>(); auto& param = this->Param<operators::FcParam>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
auto i_data = param.input->data<int8_t>(); auto* i_data = param.input->data<int8_t>();
auto o_data = param.output->mutable_data<int8_t>(); auto* o_data = param.output->mutable_data<int8_t>();
auto w_data = param.w->data<int8_t>(); auto* w_data =
flag_trans_weights_ ? weights_.data<int8_t>() : param.w->data<int8_t>();
const float* b_data = param.bias ? param.bias->data<float>() : nullptr; const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
if (flag_trans_bias_) { if (flag_trans_bias_) {
b_data = bias_.data<float>(); b_data = bias_.data<float>();
...@@ -240,8 +324,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::Run() { ...@@ -240,8 +324,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
&ctx); &ctx);
} else { } else {
for (int i = 0; i < m_; ++i) { for (int i = 0; i < m_; ++i) {
auto i_data_batch = i_data + i * k_; auto* i_data_batch = i_data + i * k_;
auto o_data_batch = o_data + i * n_; auto* o_data_batch = o_data + i * n_;
lite::arm::math::gemv_int8(w_data, lite::arm::math::gemv_int8(w_data,
i_data_batch, i_data_batch,
o_data_batch, o_data_batch,
......
...@@ -24,92 +24,12 @@ namespace lite { ...@@ -24,92 +24,12 @@ namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
template <typename Dtype>
void naive_transpose(const Dtype* din, Dtype* dout, int m, int n) {
int k = 0;
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
dout[k++] = din[j * n + i];
}
}
}
template <PrecisionType PType>
void fc_trans_weights(const Tensor& tin, Tensor* tout);
template <>
void fc_trans_weights<PRECISION(kFloat)>(const Tensor& tin, Tensor* tout) {
CHECK_EQ(tin.dims().size(), 2) << "fc weights size must = 2";
int m = tin.dims()[0];
int n = tin.dims()[1];
tout->Resize({n, m});
auto ptr_in = tin.data<float>();
auto ptr_out = tout->mutable_data<float>();
naive_transpose(ptr_in, ptr_out, m, n);
}
template <>
void fc_trans_weights<PRECISION(kInt8)>(const Tensor& tin, Tensor* tout) {
CHECK_EQ(tin.dims().size(), 2) << "fc weights size must = 2";
int m = tin.dims()[0];
int n = tin.dims()[1];
tout->Resize({n, m});
auto ptr_in = tin.data<int8_t>();
auto ptr_out = tout->mutable_data<int8_t>();
naive_transpose(ptr_in, ptr_out, m, n);
}
template <PrecisionType PType, PrecisionType OutType>
bool check_fc_use_gemm(int m, const std::vector<float>& scale, bool has_bias) {
return m > 1;
}
template <>
bool check_fc_use_gemm<PRECISION(kInt8), PRECISION(kFloat)>(
int m, const std::vector<float>& scale, bool has_bias) {
CHECK(scale.size() > 0) << "Int8 FC param must has weight_scale";
return m > 1 && scale.size() == 1;
}
template <>
bool check_fc_use_gemm<PRECISION(kInt8), PRECISION(kInt8)>(
int m, const std::vector<float>& scale, bool has_bias) {
CHECK(scale.size() > 0) << "Int8 FC param must has weight_scale";
return m > 1 && scale.size() == 1 && !has_bias;
}
template <PrecisionType PType, PrecisionType OutType> template <PrecisionType PType, PrecisionType OutType>
class FcCompute : public KernelLite<TARGET(kARM), PType> { class FcCompute : public KernelLite<TARGET(kARM), PType> {
public: public:
using param_t = operators::FcParam; using param_t = operators::FcParam;
virtual void ReInitWhenNeeded() { virtual void ReInitWhenNeeded();
auto& param = this->template Param<operators::FcParam>();
auto x_dims = param.input->dims();
if (last_shape_ == x_dims) {
return;
}
last_shape_ = x_dims;
auto w_dims = param.w_dims;
auto& ctx = this->ctx_->template As<ARMContext>();
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
CHECK_GE(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
n_ = w_dims[1];
flag_gemm_ = check_fc_use_gemm<PType, OutType>(
m_, param.weight_scale, param.bias != nullptr);
if (flag_trans_weights_ == flag_gemm_) {
flag_trans_weights_ = !flag_trans_weights_;
Tensor tmp_tensor;
fc_trans_weights<PType>(*param.w, &tmp_tensor);
param.w->CopyDataFrom(tmp_tensor);
}
}
virtual void PrepareForRun(); virtual void PrepareForRun();
virtual void Run(); virtual void Run();
...@@ -117,6 +37,7 @@ class FcCompute : public KernelLite<TARGET(kARM), PType> { ...@@ -117,6 +37,7 @@ class FcCompute : public KernelLite<TARGET(kARM), PType> {
private: private:
DDim last_shape_; DDim last_shape_;
Tensor weights_;
Tensor bias_; Tensor bias_;
bool flag_trans_weights_{false}; bool flag_trans_weights_{false};
bool flag_trans_bias_{false}; bool flag_trans_bias_{false};
......
...@@ -52,7 +52,7 @@ void Decode(const Tensor& emission_weights, ...@@ -52,7 +52,7 @@ void Decode(const Tensor& emission_weights,
for (int k = 1; k < seq_len; ++k) { for (int k = 1; k < seq_len; ++k) {
for (int i = 0; i < tag_num; ++i) { for (int i = 0; i < tag_num; ++i) {
T max_score = -std::numeric_limits<T>::max(); T max_score = -(std::numeric_limits<T>::max)();
int max_j = 0; int max_j = 0;
for (size_t j = 0; j < tag_num; ++j) { for (size_t j = 0; j < tag_num; ++j) {
T score = alpha_value[(k - 1) * tag_num + j] + T score = alpha_value[(k - 1) * tag_num + j] +
...@@ -67,7 +67,7 @@ void Decode(const Tensor& emission_weights, ...@@ -67,7 +67,7 @@ void Decode(const Tensor& emission_weights,
} }
} }
T max_score = -std::numeric_limits<T>::max(); T max_score = -(std::numeric_limits<T>::max)();
int max_i = 0; int max_i = 0;
for (size_t i = 0; i < tag_num; ++i) { for (size_t i = 0; i < tag_num; ++i) {
T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i]; T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i];
......
...@@ -72,10 +72,10 @@ static T JaccardOverlap(const T* box1, const T* box2, const bool normalized) { ...@@ -72,10 +72,10 @@ static T JaccardOverlap(const T* box1, const T* box2, const bool normalized) {
box2[3] < box1[1]) { box2[3] < box1[1]) {
return static_cast<T>(0.); return static_cast<T>(0.);
} else { } else {
const T inter_xmin = std::max(box1[0], box2[0]); const T inter_xmin = (std::max)(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]); const T inter_ymin = (std::max)(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]); const T inter_xmax = (std::min)(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]); const T inter_ymax = (std::min)(box1[3], box2[3]);
T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.); T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
T inter_w = inter_xmax - inter_xmin + norm; T inter_w = inter_xmax - inter_xmin + norm;
T inter_h = inter_ymax - inter_ymin + norm; T inter_h = inter_ymax - inter_ymin + norm;
......
...@@ -128,7 +128,7 @@ class TensorFormatter { ...@@ -128,7 +128,7 @@ class TensorFormatter {
void FormatData(const Tensor& print_tensor, std::stringstream& log_stream) { void FormatData(const Tensor& print_tensor, std::stringstream& log_stream) {
int64_t print_size = summarize_ == -1 int64_t print_size = summarize_ == -1
? print_tensor.numel() ? print_tensor.numel()
: std::min(summarize_, print_tensor.numel()); : (std::min)(summarize_, print_tensor.numel());
const T* data = print_tensor.data<T>(); // Always kHost, so unnecessary to const T* data = print_tensor.data<T>(); // Always kHost, so unnecessary to
// copy the data from device // copy the data from device
log_stream << " - data: ["; log_stream << " - data: [";
......
...@@ -83,10 +83,10 @@ static inline T JaccardOverlap(const std::vector<T>& box1, ...@@ -83,10 +83,10 @@ static inline T JaccardOverlap(const std::vector<T>& box1,
box2[3] < box1[1]) { box2[3] < box1[1]) {
return static_cast<T>(0.); return static_cast<T>(0.);
} else { } else {
const T inter_xmin = std::max(box1[0], box2[0]); const T inter_xmin = (std::max)(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]); const T inter_ymin = (std::max)(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]); const T inter_xmax = (std::min)(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]); const T inter_ymax = (std::min)(box1[3], box2[3]);
T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.); T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
T inter_w = inter_xmax - inter_xmin + norm; T inter_w = inter_xmax - inter_xmin + norm;
T inter_h = inter_ymax - inter_ymin + norm; T inter_h = inter_ymax - inter_ymin + norm;
...@@ -183,10 +183,10 @@ void DeltaScoreToPrediction( ...@@ -183,10 +183,10 @@ void DeltaScoreToPrediction(
pred_box_xmax = pred_box_xmax / im_scale; pred_box_xmax = pred_box_xmax / im_scale;
pred_box_ymax = pred_box_ymax / im_scale; pred_box_ymax = pred_box_ymax / im_scale;
pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero); pred_box_xmin = (std::max)((std::min)(pred_box_xmin, im_width - 1), zero);
pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero); pred_box_ymin = (std::max)((std::min)(pred_box_ymin, im_height - 1), zero);
pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero); pred_box_xmax = (std::max)((std::min)(pred_box_xmax, im_width - 1), zero);
pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero); pred_box_ymax = (std::max)((std::min)(pred_box_ymax, im_height - 1), zero);
std::vector<T> one_pred; std::vector<T> one_pred;
one_pred.push_back(pred_box_xmin); one_pred.push_back(pred_box_xmin);
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <cmath>
#include "lite/core/subgraph_bridge_registry.h" #include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" #include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" #include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <cmath>
#include "lite/core/subgraph_bridge_registry.h" #include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" #include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" #include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
......
...@@ -71,7 +71,7 @@ inline void get_mid_dims(const lite::DDim &x_dims, ...@@ -71,7 +71,7 @@ inline void get_mid_dims(const lite::DDim &x_dims,
for (size_t j = 0; j < i; ++j) { for (size_t j = 0; j < i; ++j) {
(*pre) *= y_dims[j]; (*pre) *= y_dims[j];
} }
*n = std::max(x_dims[i + axis], y_dims[i]); *n = (std::max)(x_dims[i + axis], y_dims[i]);
*mid_flag = 1; *mid_flag = 1;
mid = i; mid = i;
break; break;
......
...@@ -55,7 +55,7 @@ class SequenceArithmeticCompute ...@@ -55,7 +55,7 @@ class SequenceArithmeticCompute
auto input_x = x_data + x_seq_offset[i] * inner_size; auto input_x = x_data + x_seq_offset[i] * inner_size;
auto input_y = y_data + y_seq_offset[i] * inner_size; auto input_y = y_data + y_seq_offset[i] * inner_size;
auto t_out = out_data + x_seq_offset[i] * inner_size; auto t_out = out_data + x_seq_offset[i] * inner_size;
int len = std::min(len_x, len_y); int len = (std::min)(len_x, len_y);
for (int j = 0; j < len; j++) { for (int j = 0; j < len; j++) {
t_out[j] = input_x[j] + input_y[j]; t_out[j] = input_x[j] + input_y[j];
} }
...@@ -73,7 +73,7 @@ class SequenceArithmeticCompute ...@@ -73,7 +73,7 @@ class SequenceArithmeticCompute
auto input_x = x_data + x_seq_offset[i] * inner_size; auto input_x = x_data + x_seq_offset[i] * inner_size;
auto input_y = y_data + y_seq_offset[i] * inner_size; auto input_y = y_data + y_seq_offset[i] * inner_size;
auto t_out = out_data + x_seq_offset[i] * inner_size; auto t_out = out_data + x_seq_offset[i] * inner_size;
int len = std::min(len_x, len_y); int len = (std::min)(len_x, len_y);
for (int j = 0; j < len; j++) { for (int j = 0; j < len; j++) {
t_out[j] = input_x[j] - input_y[j]; t_out[j] = input_x[j] - input_y[j];
} }
...@@ -91,7 +91,7 @@ class SequenceArithmeticCompute ...@@ -91,7 +91,7 @@ class SequenceArithmeticCompute
auto input_x = x_data + x_seq_offset[i] * inner_size; auto input_x = x_data + x_seq_offset[i] * inner_size;
auto input_y = y_data + y_seq_offset[i] * inner_size; auto input_y = y_data + y_seq_offset[i] * inner_size;
auto t_out = out_data + x_seq_offset[i] * inner_size; auto t_out = out_data + x_seq_offset[i] * inner_size;
int len = std::min(len_x, len_y); int len = (std::min)(len_x, len_y);
for (int j = 0; j < len; j++) { for (int j = 0; j < len; j++) {
t_out[j] = input_x[j] * input_y[j]; t_out[j] = input_x[j] * input_y[j];
} }
......
...@@ -49,8 +49,8 @@ class SequenceConvCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -49,8 +49,8 @@ class SequenceConvCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
bool padding_trainable = false; bool padding_trainable = false;
const Tensor* padding_data = nullptr; const Tensor* padding_data = nullptr;
int up_pad = std::max(0, -context_start); int up_pad = (std::max)(0, -context_start);
int down_pad = std::max(0, context_start + context_length - 1); int down_pad = (std::max)(0, context_start + context_length - 1);
auto sequence_width = static_cast<int64_t>(in->dims()[1]); auto sequence_width = static_cast<int64_t>(in->dims()[1]);
std::vector<int64_t> col_shape{in->dims()[0], std::vector<int64_t> col_shape{in->dims()[0],
......
...@@ -102,9 +102,9 @@ void slice_compute(const lite::Tensor* in, ...@@ -102,9 +102,9 @@ void slice_compute(const lite::Tensor* in,
start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
start = std::max(start, 0); start = (std::max)(start, 0);
end = std::max(end, 0); end = (std::max)(end, 0);
end = std::min(end, dim_value); end = (std::min)(end, dim_value);
CHECK_GT(end, start) << "end should greater than start"; CHECK_GT(end, start) << "end should greater than start";
out_dims[axes[i]] = end - start; out_dims[axes[i]] = end - start;
} }
...@@ -172,7 +172,7 @@ void slice_compute(const lite::Tensor* in, ...@@ -172,7 +172,7 @@ void slice_compute(const lite::Tensor* in,
if (start < 0) { if (start < 0) {
start = (start + in_dims[axes[i]]); start = (start + in_dims[axes[i]]);
} }
start = std::max(start, 0); start = (std::max)(start, 0);
offsets[axes[i]] = start; offsets[axes[i]] = start;
} }
auto in_t = auto in_t =
......
...@@ -391,7 +391,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) { ...@@ -391,7 +391,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) {
} }
{ // the 3rd field, tensor data { // the 3rd field, tensor data
uint64_t size = tensor.memory_size(); uint64_t size = tensor.memory_size();
CHECK_LT(size, std::numeric_limits<std::streamsize>::max()) CHECK_LT(size, (std::numeric_limits<std::streamsize>::max)())
<< "Index overflow when writing tensor"; << "Index overflow when writing tensor";
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
...@@ -461,7 +461,7 @@ void SetParamInfoNaive(naive_buffer::ParamDesc *param_desc, ...@@ -461,7 +461,7 @@ void SetParamInfoNaive(naive_buffer::ParamDesc *param_desc,
} }
desc.SetDim(tensor.dims().Vectorize()); desc.SetDim(tensor.dims().Vectorize());
uint64_t size = tensor.memory_size(); uint64_t size = tensor.memory_size();
CHECK_LT(size, std::numeric_limits<std::streamsize>::max()) CHECK_LT(size, (std::numeric_limits<std::streamsize>::max)())
<< "Index overflow when writing tensor"; << "Index overflow when writing tensor";
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
......
...@@ -62,7 +62,7 @@ void UpdatePaddingAndDilation(std::vector<int>* paddings, ...@@ -62,7 +62,7 @@ void UpdatePaddingAndDilation(std::vector<int>* paddings,
if (padding_algorithm == "SAME") { if (padding_algorithm == "SAME") {
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i];
int pad_sum = std::max( int pad_sum = (std::max)(
(out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2],
(int64_t)0); (int64_t)0);
int pad_0 = pad_sum / 2; int pad_0 = pad_sum / 2;
......
...@@ -75,7 +75,7 @@ bool ElementwiseOp::InferShapeImpl() const { ...@@ -75,7 +75,7 @@ bool ElementwiseOp::InferShapeImpl() const {
if (x_dims_array[i] == -1 || y_dims_array[i] == -1) { if (x_dims_array[i] == -1 || y_dims_array[i] == -1) {
out_dims_array[i] = -1; out_dims_array[i] = -1;
} else { } else {
out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); out_dims_array[i] = (std::max)(x_dims_array[i], y_dims_array[i]);
} }
} }
param_.Out->Resize(DDim(out_dims_array)); param_.Out->Resize(DDim(out_dims_array));
......
...@@ -128,8 +128,8 @@ inline void UpdatePadding(std::vector<int> *paddings, ...@@ -128,8 +128,8 @@ inline void UpdatePadding(std::vector<int> *paddings,
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i];
int pad_sum = int pad_sum =
std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], (std::max)((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2],
(int64_t)0); (int64_t)0);
int pad_0 = pad_sum / 2; int pad_0 = pad_sum / 2;
int pad_1 = pad_sum - pad_0; int pad_1 = pad_sum - pad_0;
*(paddings->begin() + i * 2) = pad_0; *(paddings->begin() + i * 2) = pad_0;
......
...@@ -51,9 +51,9 @@ bool SliceOp::InferShapeImpl() const { ...@@ -51,9 +51,9 @@ bool SliceOp::InferShapeImpl() const {
if (dim_value > 0) { if (dim_value > 0) {
start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
start = std::max(start, 0); start = (std::max)(start, 0);
end = std::max(end, 0); end = (std::max)(end, 0);
end = std::min(end, dim_value); end = (std::min)(end, dim_value);
out_dims[axes[i]] = end - start; out_dims[axes[i]] = end - start;
} }
} }
......
...@@ -3,3 +3,4 @@ add_subdirectory(math) ...@@ -3,3 +3,4 @@ add_subdirectory(math)
add_subdirectory(cv) add_subdirectory(cv)
add_subdirectory(cv/anakin) add_subdirectory(cv/anakin)
add_subdirectory(api) add_subdirectory(api)
add_subdirectory(benchmark)
...@@ -9,11 +9,18 @@ if(LITE_WITH_ARM) ...@@ -9,11 +9,18 @@ if(LITE_WITH_ARM)
endif() endif()
function(xpu_x86_without_xtcl_test TARGET MODEL DATA) function(xpu_x86_without_xtcl_test TARGET MODEL DATA)
lite_cc_test(${TARGET} SRCS ${TARGET}.cc if(${DATA} STREQUAL "")
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils lite_cc_test(${TARGET} SRCS ${TARGET}.cc
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
ARGS --model_dir=${LITE_MODEL_DIR}/${MODEL} ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
--data_dir=${LITE_MODEL_DIR}/${DATA}) ARGS --model_dir=${LITE_MODEL_DIR}/${MODEL})
else()
lite_cc_test(${TARGET} SRCS ${TARGET}.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/${MODEL} --data_dir=${LITE_MODEL_DIR}/${DATA})
endif()
if(WITH_TESTING) if(WITH_TESTING)
add_dependencies(${TARGET} extern_lite_download_${MODEL}_tar_gz) add_dependencies(${TARGET} extern_lite_download_${MODEL}_tar_gz)
if(NOT ${DATA} STREQUAL "") if(NOT ${DATA} STREQUAL "")
...@@ -26,8 +33,8 @@ if(LITE_WITH_XPU AND NOT LITE_WITH_XTCL) ...@@ -26,8 +33,8 @@ if(LITE_WITH_XPU AND NOT LITE_WITH_XTCL)
xpu_x86_without_xtcl_test(test_resnet50_fp32_xpu resnet50 ILSVRC2012_small) xpu_x86_without_xtcl_test(test_resnet50_fp32_xpu resnet50 ILSVRC2012_small)
xpu_x86_without_xtcl_test(test_googlenet_fp32_xpu GoogLeNet ILSVRC2012_small) xpu_x86_without_xtcl_test(test_googlenet_fp32_xpu GoogLeNet ILSVRC2012_small)
xpu_x86_without_xtcl_test(test_vgg19_fp32_xpu VGG19 ILSVRC2012_small) xpu_x86_without_xtcl_test(test_vgg19_fp32_xpu VGG19 ILSVRC2012_small)
xpu_x86_without_xtcl_test(test_ernie_fp32_xpu ernie "") xpu_x86_without_xtcl_test(test_ernie_fp32_xpu ernie bert_data)
xpu_x86_without_xtcl_test(test_bert_fp32_xpu bert "") xpu_x86_without_xtcl_test(test_bert_fp32_xpu bert bert_data)
endif() endif()
if(LITE_WITH_RKNPU) if(LITE_WITH_RKNPU)
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
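// Reads raw BERT/ERNIE inputs from a text file. Each line is expected to hold
// four "shape:data" fields separated by ';'; the shape and data values are
// space-separated, and the shape of the first field is stored in input_shapes.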
template <class T = int64_t>
void ReadRawData(const std::string& input_data_dir,
std::vector<std::vector<T>>* input0,
std::vector<std::vector<T>>* input1,
std::vector<std::vector<T>>* input2,
std::vector<std::vector<T>>* input3,
std::vector<std::vector<int64_t>>* input_shapes) {
auto lines = ReadLines(input_data_dir);
for (auto line : lines) {
std::vector<std::string> shape_and_data = Split(line, ";");
std::vector<int64_t> input_shape =
Split<int64_t>(Split(shape_and_data[0], ":")[0], " ");
input_shapes->emplace_back(input_shape);
std::vector<T> input0_data =
Split<T>(Split(shape_and_data[0], ":")[1], " ");
input0->emplace_back(input0_data);
std::vector<T> input1_data =
Split<T>(Split(shape_and_data[1], ":")[1], " ");
input1->emplace_back(input1_data);
std::vector<T> input2_data =
Split<T>(Split(shape_and_data[2], ":")[1], " ");
input2->emplace_back(input2_data);
std::vector<T> input3_data =
Split<T>(Split(shape_and_data[3], ":")[1], " ");
input3->emplace_back(input3_data);
}
}
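// Resizes the tensor_id-th input of the predictor to tensor_shape and copies
// tensor_value into it; the value count must match the shape's element count.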
template <class T = int64_t>
void FillTensor(const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
int tensor_id,
const std::vector<int64_t>& tensor_shape,
const std::vector<T>& tensor_value) {
predictor->GetInput(tensor_id)->Resize(tensor_shape);
int64_t tensor_size = 1;
for (size_t i = 0; i < tensor_shape.size(); i++) {
tensor_size *= tensor_shape[i];
}
CHECK_EQ(static_cast<size_t>(tensor_size), tensor_value.size());
memcpy(predictor->GetInput(tensor_id)->mutable_data<T>(),
tensor_value.data(),
sizeof(T) * tensor_size);
}
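// Returns the fraction of samples whose top-3 class ordering (indices 0/1/2
// sorted by score) matches the reference outputs read from out_file.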
float CalBertOutAccuracy(const std::vector<std::vector<float>>& out,
const std::string& out_file) {
auto lines = ReadLines(out_file);
std::vector<std::vector<float>> ref_out;
for (auto line : lines) {
ref_out.emplace_back(Split<float>(line, " "));
}
int right_num = 0;
for (size_t i = 0; i < out.size(); i++) {
std::vector<size_t> out_index{0, 1, 2};
std::vector<size_t> ref_out_index{0, 1, 2};
std::sort(out_index.begin(),
out_index.end(),
[&out, i](size_t a, size_t b) { return out[i][a] > out[i][b]; });
std::sort(ref_out_index.begin(),
ref_out_index.end(),
[&ref_out, i](size_t a, size_t b) {
return ref_out[i][a] > ref_out[i][b];
});
right_num += (out_index == ref_out_index);
}
return static_cast<float>(right_num) / static_cast<float>(out.size());
}
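// Returns the fraction of samples whose first output value lies within 0.01
// of the reference value read from out_file.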
float CalErnieOutAccuracy(const std::vector<std::vector<float>>& out,
const std::string& out_file) {
auto lines = ReadLines(out_file);
std::vector<std::vector<float>> ref_out;
for (auto line : lines) {
ref_out.emplace_back(Split<float>(line, " "));
}
int right_num = 0;
for (size_t i = 0; i < out.size(); i++) {
right_num += (std::fabs(out[i][0] - ref_out[i][0]) < 0.01f);
}
return static_cast<float>(right_num) / static_cast<float>(out.size());
}
} // namespace lite
} // namespace paddle
...@@ -21,23 +21,16 @@ ...@@ -21,23 +21,16 @@
#include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h" #include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h" #include "lite/api/test_helper.h"
#include "lite/tests/api/bert_utility.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 9, "iteration times to run");
namespace paddle { namespace paddle {
namespace lite { namespace lite {
template <typename T> TEST(Bert, test_bert_fp32_xpu) {
lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
lite::Tensor ret;
ret.Resize(shape);
T* ptr = ret.mutable_data<T>();
for (int i = 0; i < ret.numel(); ++i) {
ptr[i] = (T)1;
}
return ret;
}
TEST(Ernie, test_ernie_fp32_xpu) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir); config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
...@@ -46,56 +39,58 @@ TEST(Ernie, test_ernie_fp32_xpu) { ...@@ -46,56 +39,58 @@ TEST(Ernie, test_ernie_fp32_xpu) {
config.set_xpu_workspace_l3_size_per_thread(); config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1; std::string input_data_file = FLAGS_data_dir + std::string("/bert_in.txt");
int64_t seq_len = 64; std::vector<std::vector<int64_t>> input0;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1}); std::vector<std::vector<int64_t>> input1;
std::vector<int64_t> input_shape{batch_size, seq_len, 1}; std::vector<std::vector<int64_t>> input2;
predictor->GetInput(0)->Resize(input_shape); std::vector<std::vector<int64_t>> input3;
predictor->GetInput(1)->Resize(input_shape); std::vector<std::vector<int64_t>> input_shapes;
predictor->GetInput(2)->Resize(input_shape); ReadRawData(
predictor->GetInput(3)->Resize(input_shape); input_data_file, &input0, &input1, &input2, &input3, &input_shapes);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) { for (int i = 0; i < FLAGS_warmup; ++i) {
std::vector<int64_t> shape = {1, 64, 1};
std::vector<int64_t> fill_value(64, 0);
for (int j = 0; j < 4; j++) {
FillTensor(predictor, j, shape, fill_value);
}
predictor->Run(); predictor->Run();
} }
auto start = GetCurrentUS(); std::vector<std::vector<float>> out_rets;
for (int i = 0; i < FLAGS_repeats; ++i) { out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (int i = 0; i < FLAGS_iteration; ++i) {
FillTensor(predictor, 0, input_shapes[i], input0[i]);
FillTensor(predictor, 1, input_shapes[i], input1[i]);
FillTensor(predictor, 2, input_shapes[i], input2[i]);
FillTensor(predictor, 3, input_shapes[i], input3[i]);
double start = GetCurrentUS();
predictor->Run(); predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 3);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
} }
LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", warmup: " << FLAGS_warmup
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << ", iteration: " << FLAGS_iteration << ", spend "
<< " ms in average."; << cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::vector<std::vector<float>> results; std::string ref_out_file = FLAGS_data_dir + std::string("/bert_out.txt");
results.emplace_back(std::vector<float>({0.278893, 0.330888, 0.39022})); float out_accuracy = CalBertOutAccuracy(out_rets, ref_out_file);
auto out = predictor->GetOutput(0); ASSERT_GT(out_accuracy, 0.95f);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 3);
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 3e-5);
}
}
} }
} // namespace lite } // namespace lite
......
...@@ -21,8 +21,12 @@ ...@@ -21,8 +21,12 @@
#include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h" #include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h" #include "lite/api/test_helper.h"
#include "lite/tests/api/bert_utility.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 9, "iteration times to run");
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -46,56 +50,58 @@ TEST(Ernie, test_ernie_fp32_xpu) { ...@@ -46,56 +50,58 @@ TEST(Ernie, test_ernie_fp32_xpu) {
config.set_xpu_workspace_l3_size_per_thread(); config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1; std::string input_data_file = FLAGS_data_dir + std::string("/bert_in.txt");
int64_t seq_len = 64; std::vector<std::vector<int64_t>> input0;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1}); std::vector<std::vector<int64_t>> input1;
std::vector<int64_t> input_shape{batch_size, seq_len, 1}; std::vector<std::vector<int64_t>> input2;
predictor->GetInput(0)->Resize(input_shape); std::vector<std::vector<int64_t>> input3;
predictor->GetInput(1)->Resize(input_shape); std::vector<std::vector<int64_t>> input_shapes;
predictor->GetInput(2)->Resize(input_shape); ReadRawData(
predictor->GetInput(3)->Resize(input_shape); input_data_file, &input0, &input1, &input2, &input3, &input_shapes);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) { for (int i = 0; i < FLAGS_warmup; ++i) {
std::vector<int64_t> shape = {1, 64, 1};
std::vector<int64_t> fill_value(64, 0);
for (int j = 0; j < 4; j++) {
FillTensor(predictor, j, shape, fill_value);
}
predictor->Run(); predictor->Run();
} }
auto start = GetCurrentUS(); std::vector<std::vector<float>> out_rets;
for (int i = 0; i < FLAGS_repeats; ++i) { out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (int i = 0; i < FLAGS_iteration; ++i) {
FillTensor(predictor, 0, input_shapes[i], input0[i]);
FillTensor(predictor, 1, input_shapes[i], input1[i]);
FillTensor(predictor, 2, input_shapes[i], input2[i]);
FillTensor(predictor, 3, input_shapes[i], input3[i]);
double start = GetCurrentUS();
predictor->Run(); predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 1);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
} }
LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", warmup: " << FLAGS_warmup
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << ", iteration: " << FLAGS_iteration << ", spend "
<< " ms in average."; << cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>({0.108398}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1);
for (size_t i = 0; i < results.size(); ++i) { std::string ref_out_file = FLAGS_data_dir + std::string("/ernie_out.txt");
for (size_t j = 0; j < results[i].size(); ++j) { float out_accuracy = CalErnieOutAccuracy(out_rets, ref_out_file);
EXPECT_NEAR( ASSERT_GT(out_accuracy, 0.95f);
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 2e-5);
}
}
} }
} // namespace lite } // namespace lite
......
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU AND NOT LITE_WITH_XPU) AND (LITE_WITH_ARM))
lite_cc_test(get_conv_latency SRCS src/get_conv_latency.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(get_batchnorm_latency SRCS src/get_batchnorm_latency.cc DEPS ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(get_pooling_latency SRCS src/get_pooling_latency.cc DEPS ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(get_fc_latency SRCS src/get_fc_latency.cc DEPS ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(get_activation_latency SRCS src/get_activation_latency.cc DEPS ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
# How to run
```shell
-- cd Paddle-Lite/lite/tests/benchmark
-- ./build_benchmark_ops.sh  # push all unit-test executables under the build directory to the device
Inside build_benchmark_ops.sh, run: python get_latency_lookup_table.py --ops_path ops.txt --latency_lookup_table_path latency_lookup_table.txt
where ops.txt is the input network description file and latency_lookup_table.txt is the per-op latency report produced by running the Lite unit tests.
```
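For reference, the table-generation script can also be invoked directly with its full set of flags; the values below are illustrative, and the defaults are defined in get_args() of get_latency_lookup_table.py further down:
```shell
# Illustrative invocation; all flags come from get_args() in
# get_latency_lookup_table.py, the values shown are just examples.
python get_latency_lookup_table.py \
    --ops_path ops.txt \
    --latency_lookup_table_path latency_lookup_table.txt \
    --platform android \
    --threads 1 \
    --power_mode 0 \
    --warmup_times 5 \
    --repeats_times 100 \
    --arm_v7_v8 armv8
```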
# Input ops.txt format
-- op_name	[dim0 dim1 dim2 dim3]	(op_param0, op_param1, ..., dtype=xxx)
Each line of ops.txt has three fields: the first is op_name, the second is the input tensor's input_dims,
and the third, enclosed in (), describes the op's parameters.
# Note: the three fields are separated by tabs, the sub-fields inside the parameter list are separated by commas,
# and the values inside the [] describing tensor dimensions are separated by spaces; commas and tabs must not be used there.
op_name currently supports conv/activation/batchnorm/pooling/fc;
input_dims describes the input tensor layout, e.g. NCHW 4-D tensors;
op_param0, op_param1, etc. describe the op's parameters; for example, a conv op has ch_out/stride/group/kernel/pad/dilation/flag_bias/flag_act;
dtype is the data type used by the op; legal values are float/int8_float/int8_int8. Currently conv supports all three data types, while the other ops only support float. The per-op formats are listed below, followed by a complete sample file.
# conv op format
conv	[1 96 112 112]	(ch_out=48, stride=1, group=1, kernel=1x1, pad=0, dilation=1, flag_bias=0, flag_act=0, dtype=float)
ch_out is the number of output channels; kernel is the convolution kernel size, with legal values such as 1x1/3x3/5x5; pad is the boundary padding; flag_bias indicates whether a bias is present; flag_act indicates whether an activation is fused, with legal values 0/1/2/4.
# activation op format
activation	[1 8 64 64]	(act_type=relu)
act_type is the activation type; legal values are relu/relu6/leaky_relu/tanh/swish/exp/abs/hard_swish/reciprocal/threshold_relu.
# batchnorm op format
batchnorm	[1 8 64 64]	(epsilon=1e-4f, momentum=0.9f)
epsilon is the batchnorm epsilon parameter, default 1e-4f;
momentum is the batchnorm momentum parameter, default 0.9f.
# pooling op format
pooling	[1 8 64 64]	(stride=2, pad=0, kernel=2x2, ceil_mode=0, flag_global=0, exclusive=1, pooling_type=max)
stride is the stride of the pooling operation, default 2; pad is the boundary padding, default 0;
kernel is the pooling kernel size, typically 2x2 (the default);
ceil_mode indicates whether pooling uses ceil; 0 means false (the default), any other value means true;
flag_global indicates whether pooling is global over the WxH dimensions; 0 means false (the default), any other value means true;
exclusive is the exclusive flag of the pooling operation; 1 means true (the default), any other value means false;
pooling_type is the pooling type; legal values are max (the default)/avg.
# fc op format
fc	[1 64]	(flag_bias=1, param_dim=64x1000)
flag_bias indicates whether the fc op has a bias; 1 (the default) means true, otherwise false;
param_dim gives the `k x n` dimensions of the fc op, where k must match the k in input_dims=[m k].
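Putting the per-op formats above together, a minimal sample ops.txt could look like the following (fields are tab-separated as described above; spacing is widened here for readability, and the lines simply reuse the per-op examples):
```
conv	[1 96 112 112]	(ch_out=48, stride=1, group=1, kernel=1x1, pad=0, dilation=1, flag_bias=0, flag_act=0, dtype=float)
activation	[1 8 64 64]	(act_type=relu)
batchnorm	[1 8 64 64]	(epsilon=1e-4f, momentum=0.9f)
pooling	[1 8 64 64]	(stride=2, pad=0, kernel=2x2, ceil_mode=0, flag_global=0, exclusive=1, pooling_type=max)
fc	[1 64]	(flag_bias=1, param_dim=64x1000)
```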
# Output latency_lookup_table.txt format
dev_info core_num thread_num power_mode core0 arch core1 arch core2 arch core3 arch core4 arch core5 arch core6 arch core7 arch
Hisilicon Kirin980 8 1 0 ARM_A55 ARM_A55 ARM_A55 ARM_A55 ARM_A76 ARM_A76 ARM_A76 ARM_A76
op_name input_dims output_dims param_info min_latency(ms) max_latency(ms) avg_latency(ms)
conv [1 96 112 112] [1 48 114 114] (ch_out=48, stride=1, pad=0, kernel=1x1, group=1, dilation=1, flag_bias=0, flag_act=0, dtype=float) 3.469 4.111 3.52088
fc [1 64] [64 1000] (param_dim=64x1000, flag_bias=1, dtype=float) 0.135 0.176 0.13779
batchnorm [1 8 64 64] [1 8 64 64] (epsilon=1e-4f, momentum=0.9f, dtype=float) 0.014 0.178 0.01679
pooling [1 8 64 64] [1 8 32 32] (stride=2, pad=0, kernel=2x2, ceil_mode=0, flag_global=0, exclusive=0, pooling_type=avg, dtype=float) 0.009 0.011 0.00983
activation [1 8 64 64] [1 8 64 64] (act_type=relu, dtype=float) 0.01 0.036 0.01103
-- The first block is the header section, containing the fields `dev_info` `arm_v7/v8` `core_num` `thread_num` `power_mode` `core0 arch` ... `core7 arch`:
`dev_info` is the device hardware vendor/model, `arm_v7/v8` indicates whether the architecture is armv7 or armv8, `core_num` is the number of CPU cores, `thread_num` is the configured number of running threads,
`power_mode` is the CPU core-binding mode,
and `core0 arch` ... `core7 arch` give the architecture of each ARM CPU core.
The second block is the op section, containing the fields `op_name` `input_dims` `output_dims` `param_info` `min_latency` `max_latency` `avg_latency`:
`output_dims` is the output tensor shape computed for the op from `input_dims` and `param_info`;
`min_latency(ms)` `max_latency(ms)` `avg_latency(ms)` are the min/max/avg latencies measured for that op.
#!/usr/bin/env bash
exe_dir="/data/local/tmp/bin"
work_dir=$(pwd)
os=android
abi=armv8
lang=gcc
function print_usage {
echo "----------------------------------------"
echo -e " ./push2device.sh --arm_os=<os> --arm_abi=<abi> --arm_lang=<lang>"
echo -e "--arm_os:\t android, only support android now"
echo -e "--arm_abi:\t armv8|armv7"
echo -e "--arm_lang:\t gcc|clang"
echo -e "make sure directory: PaddleLite/build.lite.${arm_os}.${arm_abi}.${arm_lang} exsits!"
echo "----------------------------------------"
}
function main {
for i in "$@"; do
case $i in
--arm_os=*)
os="${i#*=}"
shift
;;
--arm_abi=*)
abi="${i#*=}"
shift
;;
--arm_lang=*)
lang="${i#*=}"
shift
;;
*)
print_usage
exit 1
;;
esac
done
build_dir=$work_dir/../../../build.lite.${os}.${abi}.${lang}
lib_path=$build_dir/lite/tests/benchmark
lib_files=$lib_path/get*latency
adb shell mkdir ${exe_dir}
for file in ${lib_files}
do
adb push ${file} ${exe_dir}
done
}
main $@
python get_latency_lookup_table.py --arm_v7_v8 ${abi}
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import re
import argparse
import subprocess
def get_args():
"""Get arguments.
Returns:
Namespace, arguments.
"""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--ops_path', default='ops.txt', help='Input ops path.')
parser.add_argument(
'--latency_lookup_table_path',
default='latency_lookup_table.txt',
help='Output ops latency path.')
parser.add_argument(
'--platform', default='android', help='Platform: android/ios/custom.')
parser.add_argument('--threads', type=int, default=1, help='Threads.')
parser.add_argument('--power_mode', type=int, default=0, help='PowerMode.')
parser.add_argument('--warmup_times', type=int, default=5,
help='Warm up times of op when estimating latency.')
parser.add_argument('--repeats_times', type=int, default=100,
help='Running times of op when estimating latency.')
parser.add_argument('--arm_v7_v8', type=str, default='armv8',
help='Indicate arm architecture v7 or v8.')
args = parser.parse_args()
return args
def check_dev_connect():
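    """Exit if no Android device is attached via adb."""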
cmd = 'adb devices | grep device'
dev_info = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
out = dev_info.communicate()[0]
res = out.decode().find("\tdevice")
if res == -1:
print("No android device is attached")
sys.exit()
def get_dev_info():
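    """Get device hardware name, core count and per-core CPU architecture.
    Returns:
        tuple, (dev_info, core_num, arch_type); arch_type maps each core's
        "CPU part" id to a Cortex architecture name.
    """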
cmd = 'adb shell "cat /proc/cpuinfo | grep Hardware"'
dev_info = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
out = dev_info.communicate()[0]
out = out.decode().strip('\n')
dev_info = out.strip('Hardware\t:').strip()
cmd = 'adb shell "cat /proc/cpuinfo | grep part"'
cpu_info = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
out = cpu_info.communicate()[0]
out = (out.decode().strip('\n').split('\n'))
core_num = len(out)
arch_type = ['UNKNOWN CPU ARCH']*core_num
for i, v in enumerate(out):
out = v.strip('CPU part').strip().strip(':').strip()
if out == '0xd03':
arch_type[i] = 'ARM_A53'
elif out == '0xd05':
arch_type[i] = 'ARM_A55'
elif out == '0xd07':
arch_type[i] = 'ARM_A57'
elif out == '0xd08':
arch_type[i] = 'ARM_A72'
elif out == '0xd09':
arch_type[i] = 'ARM_A73'
elif out == '0xd0a':
arch_type[i] = 'ARM_A75'
elif out == '0xd40':
arch_type[i] = 'ARM_A76'
elif out == '0x804':
# 855
arch_type[i] = 'ARM_A76'
elif out == '0x805':
# 855
arch_type[i] = 'ARM_A55'
elif out == '0x802':
# 845
arch_type[i] = 'ARM_A75'
elif out == '0x803':
# 845
arch_type[i] = 'ARM_A55'
elif out == '0x801':
# 835
arch_type[i] = 'ARM_A73'
elif out == '0x800':
# 835
arch_type[i] = 'ARM_A73'
elif out == '0x205':
# 820
arch_type[i] = 'ARM_A72'
else:
arch_type[i] = 'UNKNOWN CPU ARCH'
return dev_info, core_num, arch_type
def get_op_latency(op, platform):
"""Get model latency.
Args:
op: list, a list of str represents the op and its parameters.
platform: str, platform name.
Returns:
float, op latency.
"""
if platform == 'android':
commands = 'adb shell "cd /data/local/tmp/bin && ./get_{}_latency {}"'.format(
op[0], ' '.join(op[1:]))
proc = subprocess.Popen(
commands,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True)
out = proc.communicate()[0]
avg_out = [_ for _ in out.decode().split('\n') if 'Avg Latency' in _][-1]
avg_out = re.findall(r'\d+\.?\d*', avg_out)[0]
avg_out = float(avg_out)
min_out = [_ for _ in out.decode().split('\n') if 'Min Latency' in _][-1]
min_out = re.findall(r'\d+\.?\d*', min_out)[0]
min_out = float(min_out)
max_out = [_ for _ in out.decode().split('\n') if 'Max Latency' in _][-1]
max_out = re.findall(r'\d+\.?\d*', max_out)[0]
max_out = float(max_out)
elif platform == 'ios':
print('ios platform is not supported now')
sys.exit()
else:
print('Please define `get_op_latency` for {} platform'.format(platform))
sys.exit()
return avg_out, min_out, max_out
def main():
args = get_args()
check_dev_connect()
conv_param_dict = {'ch_out': '1', 'stride':'[1 1]', 'pad':'[0 0 0 0]', 'kernel':'3x3',
'group':'1', 'dilation':'[1 1]', 'flag_bias':'1',
'flag_act':'0', 'dtype':'float'}
batchnorm_param_dict = {'epsilon':'1e-4f', 'momentum':'0.9f',
'dtype':'float'}
pooling_param_dict = {'stride':'2', 'pad':'0', 'kernel':'2x2', 'ceil_mode':'0',
'flag_global':'0', 'exclusive':'1', 'pooling_type': 'max',
'dtype':'float'}
activation_param_dict = {'act_type':'relu', 'dtype':'float'}
fc_param_dict = {'param_dim':'1x1','flag_bias':'1', 'dtype':'float'}
op_info = {}
cur_op_name = ''
cur_param_dict = {}
input_dims = ''
output_dims = ''
runtime_cmd = []
fid = open(args.ops_path, 'r')
handle = open(args.latency_lookup_table_path, 'w')
handle.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format('dev_info'.ljust(30), 'armv7/v8'.ljust(10), 'core_num'.ljust(10), 'thread_num'.ljust(10), 'power_mode'.ljust(10), 'core0 arch'.ljust(10), 'core1 arch'.ljust(10),
'core2 arch'.ljust(10), 'core3 arch'.ljust(10), 'core4 arch'.ljust(10), 'core5 arch'.ljust(10),
'core6 arch'.ljust(10), 'core7 arch'.ljust(10)))
dev_info, core_num, arch_type = get_dev_info()
    handle.write('{}\t{}\t{}\t{}\t{}'.format(dev_info.ljust(30), str(args.arm_v7_v8).ljust(10), str(core_num).ljust(10), str(args.threads).ljust(10), str(args.power_mode).ljust(10)))
for i in arch_type:
handle.write('\t{}'.format(i).ljust(10))
handle.write('\n')
handle.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format('op_name'.ljust(10), 'input_dims'.ljust(10), 'output_dims'.ljust(10), 'param_info'.ljust(80), 'min_latency(ms)'.ljust(10), 'max_latency(ms)'.ljust(10), 'avg_latency(ms)'.ljust(10)))
for line in fid.readlines():
line = [line.strip('\n')]
for data_item in line:
data_item = data_item.strip().split('\t')
cur_op_name = data_item[0]
input_dims = data_item[1]
parameters = data_item[2].strip('( )').split(',')
for item_ in parameters:
item_ = item_.strip().split('=')
# conv op dict
if cur_op_name == 'conv':
cur_param_dict = conv_param_dict
if item_[0] == 'ch_out':
cur_param_dict['ch_out'] = item_[1]
elif item_[0] == 'stride':
cur_param_dict['stride'] = item_[1]
elif item_[0] == 'pad':
cur_param_dict['pad'] = item_[1]
elif item_[0] == 'kernel':
cur_param_dict['kernel'] = item_[1]
elif item_[0] == 'group':
cur_param_dict['group'] = item_[1]
elif item_[0] == 'dilation':
cur_param_dict['dilation'] = item_[1]
elif item_[0] == 'flag_bias':
cur_param_dict['flag_bias'] = item_[1]
elif item_[0] == 'flag_act':
cur_param_dict['flag_act'] = item_[1]
elif item_[0] == 'dtype':
cur_param_dict['dtype'] = item_[1]
#batchnorm op dict
elif cur_op_name == 'batchnorm':
cur_param_dict = batchnorm_param_dict
if item_[0] == 'epsilon':
cur_param_dict['epsilon'] = item_[1]
elif item_[0] == 'momentum':
cur_param_dict['momentum'] = item_[1]
#pooling op dict
elif cur_op_name == 'pooling':
cur_param_dict = pooling_param_dict
if item_[0] == 'stride':
cur_param_dict['stride'] = item_[1]
elif item_[0] == 'pad':
cur_param_dict['pad'] = item_[1]
elif item_[0] == 'kernel':
cur_param_dict['kernel'] = item_[1]
elif item_[0] == 'ceil_mode':
cur_param_dict['ceil_mode'] = item_[1]
elif item_[0] == 'flag_global':
cur_param_dict['flag_global'] = item_[1]
elif item_[0] == 'exclusive':
cur_param_dict['exclusive'] = item_[1]
elif item_[0] == 'pooling_type':
cur_param_dict['pooling_type'] = item_[1]
#activation op dict
elif cur_op_name == 'activation':
cur_param_dict = activation_param_dict
if item_[0] == 'act_type':
cur_param_dict['act_type'] = item_[1]
# fc op dict
elif cur_op_name == 'fc':
cur_param_dict = fc_param_dict
if item_[0] == 'param_dim':
cur_param_dict['param_dim'] = item_[1]
elif item_[0] == 'flag_bias':
cur_param_dict['flag_bias'] = item_[1]
elif item_[0] == 'dtype':
                        cur_param_dict['dtype'] = item_[1]
op_info[cur_op_name] = cur_param_dict
if cur_op_name == 'conv':
batch = input_dims.strip('[' ']').split()[0]
in_ch = input_dims.strip('[' ']').split()[1]
height = input_dims.strip('[' ']').split()[2]
width = input_dims.strip('[' ']').split()[3]
out_ch = cur_param_dict['ch_out']
pad_top = cur_param_dict['pad'].strip('[' ']').split()[0]
pad_bottom = cur_param_dict['pad'].strip('[' ']').split()[1]
pad_left = cur_param_dict['pad'].strip('[' ']').split()[2]
                pad_right = cur_param_dict['pad'].strip('[' ']').split()[3]
dila_h = cur_param_dict['dilation'].strip('[' ']').split()[0]
dila_w = cur_param_dict['dilation'].strip('[' ']').split()[1]
kernel_h = cur_param_dict['kernel'][0]
kernel_w = cur_param_dict['kernel'][2]
stride_h = cur_param_dict['stride'].strip('[' ']').split()[0]
stride_w = cur_param_dict['stride'].strip('[' ']').split()[1]
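                # Output spatial size mirrors the conv kernel's compute_out_dim:
                # out = (in + pad0 + pad1 - (dilation * (kernel - 1) + 1)) // stride + 1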
                hout = (int(height) + int(pad_top) + int(pad_bottom) - int(dila_h) *
                        (int(kernel_h) - 1) - 1) / int(stride_h) + 1
                wout = (int(width) + int(pad_left) + int(pad_right) - int(dila_w) *
                        (int(kernel_w) - 1) - 1) / int(stride_w) + 1
output_dims = '[' + str(batch) + ' ' + str(out_ch) + ' ' + str(int(hout)) + ' ' + str(int(wout)) + ']'
dtype = 0
if cur_param_dict['dtype'] == 'float':
dtype = 0
elif cur_param_dict['dtype'] == 'int8_float':
dtype = 1
elif cur_param_dict['dtype'] == 'int8_int8':
dtype = 2
runtime_cmd = [str(batch), str(in_ch), str(height), str(width), str(out_ch),
str(cur_param_dict['group']), str(cur_param_dict['kernel'])[0],
str(pad_top), str(pad_bottom),
str(pad_left), str(pad_right),
str(stride_h), str(stride_w),
str(dila_h), str(dila_w),
str(cur_param_dict['flag_bias']), str(cur_param_dict['flag_act']),
str(dtype)]
elif cur_op_name == 'batchnorm':
batch = input_dims.strip('[' ']').split()[0]
in_ch = input_dims.strip('[' ']').split()[1]
height = input_dims.strip('[' ']').split()[2]
width = input_dims.strip('[' ']').split()[3]
output_dims = input_dims
runtime_cmd = [str(batch), str(in_ch), str(height), str(width),
str(cur_param_dict['epsilon']), str(cur_param_dict['momentum'])]
elif cur_op_name == 'pooling':
batch = input_dims.strip('[' ']').split()[0]
in_ch = input_dims.strip('[' ']').split()[1]
height = input_dims.strip('[' ']').split()[2]
width = input_dims.strip('[' ']').split()[3]
hout = 1
wout = 1
pad_top = cur_param_dict['pad'].strip('[' ']').split()[0]
pad_bottom = cur_param_dict['pad'].strip('[' ']').split()[1]
pad_left = cur_param_dict['pad'].strip('[' ']').split()[2]
pad_right = cur_param_dict['pad'].strip('[' ']').split()[3]
kernel_h = cur_param_dict['kernel'][0]
kernel_w = cur_param_dict['kernel'][2]
stride_h = cur_param_dict['stride'].strip('[' ']').split()[0]
stride_w = cur_param_dict['stride'].strip('[' ']').split()[1]
if cur_param_dict['flag_global'] == '0':
if cur_param_dict['ceil_mode'] == '0':
hout = (int(height) - int(kernel_h) + int(pad_top) + int(pad_bottom)) / int(stride_h) + 1
wout = (int(width) - int(kernel_w) + int(pad_left) + int(pad_right)) / int(stride_w) + 1
else:
hout = (int(height) - int(kernel_h) + int(pad_top) + int(pad_bottom) + int(stride_h) - 1) / int(stride_h) + 1
wout = (int(width) - int(kernel_w) + int(pad_left) + int(pad_right) + int(stride_w) - 1) / int(stride_w) + 1
output_dims = '[' + batch + ' ' + str(in_ch) + ' ' + str(int(hout)) + ' ' + str(int(wout)) + ']'
pooling_type = 0
if cur_param_dict['pooling_type'] == 'max':
pooling_type = 0
else:
pooling_type = 1
runtime_cmd = [str(batch), str(in_ch), str(height), str(width),
str(stride_h), str(stride_w),
str(pad_top), str(pad_bottom),
str(pad_left), str(pad_right),
str(cur_param_dict['kernel'])[0], str(cur_param_dict['ceil_mode']),
str(cur_param_dict['flag_global']), str(cur_param_dict['exclusive']),
str(pooling_type)]
elif cur_op_name == 'activation':
batch = input_dims.strip('[' ']').split()[0]
in_ch = input_dims.strip('[' ']').split()[1]
height = input_dims.strip('[' ']').split()[2]
width = input_dims.strip('[' ']').split()[3]
act_type = 1
if cur_param_dict['act_type'] == 'relu':
act_type = 1
elif cur_param_dict['act_type'] == 'relu6':
act_type = 2
elif cur_param_dict['act_type'] == 'leaky_relu':
act_type = 4
elif cur_param_dict['act_type'] == 'sigmoid':
act_type = 5
elif cur_param_dict['act_type'] == 'tanh':
act_type = 6
elif cur_param_dict['act_type'] == 'swish':
act_type = 7
elif cur_param_dict['act_type'] == 'exp':
act_type = 8
elif cur_param_dict['act_type'] == 'abs':
act_type = 9
elif cur_param_dict['act_type'] == 'hard_swish':
act_type = 10
elif cur_param_dict['act_type'] == 'reciprocal':
act_type = 11
elif cur_param_dict['act_type'] == 'threshold_relu':
act_type = 12
output_dims = input_dims
runtime_cmd = [str(batch), str(in_ch), str(height), str(width),
str(act_type)]
elif cur_op_name == 'fc':
m = input_dims.strip('[' ']').split()[0]
k = input_dims.strip('[' ']').split()[1]
n = cur_param_dict['param_dim'].split('x')[1]
output_dims = '[' + m + ' ' + n + ']'
runtime_cmd = [str(m), str(n), str(k), str(cur_param_dict['flag_bias']),
str(cur_param_dict['dtype'])]
avg_latency, min_latency, max_latency = get_op_latency([cur_op_name] +
runtime_cmd + [str(args.threads), str(args.power_mode),
str(args.warmup_times), str(args.repeats_times)],
args.platform)
param_dict = ''
for k in cur_param_dict:
param_dict += str(k) + '=' + str(cur_param_dict[k]) + ','
param_dict = '(' + param_dict[:-1] + ')'
handle.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(cur_op_name.ljust(10), input_dims.ljust(10), output_dims.ljust(10), param_dict.ljust(80), str(min_latency).ljust(10), str(max_latency).ljust(10), str(avg_latency).ljust(10)))
fid.close()
handle.close()
    print('Congratulations! The latency lookup table has been generated.')
if __name__ == '__main__':
main()
dev_info armv7/v8 core_num thread_num power_mode core0 arch core1 arch core2 arch core3 arch core4 arch core5 arch core6 arch core7 arch
Hisilicon Kirin980 armv8 8 1 ARM_A55 ARM_A55 ARM_A55 ARM_A55 ARM_A76 ARM_A76 ARM_A76 ARM_A76
op_name input_dims output_dims param_info min_latency(ms) max_latency(ms) avg_latency(ms)
conv [1 96 112 112] [1 48 114 114] (ch_out=48,stride=[1 1],pad=[0 0 0 0],kernel=1x1,group=1,dilation=[1 1],flag_bias=0,flag_act=0,dtype=float) 3.472 5.384 3.97393
fc [4 8] [4 1000] (param_dim=8x1000,flag_bias=1,dtype=float) 0.009 0.023 0.00951
batchnorm [1 8 64 64] [1 8 64 64] (epsilon=1e-4f,momentum=0.9f,dtype=float) 0.01 0.012 0.0114
pooling [1 8 64 64] [1 8 32 32] (stride=[2 2],pad=[0 0 0 0],kernel=2x2,ceil_mode=0,flag_global=0,exclusive=0,pooling_type=avg,dtype=float) 0.009 0.01 0.00969
activation [1 8 64 64] [1 8 64 64] (act_type=relu,dtype=float) 0.01 0.028 0.01098
conv [1 96 112 112] (ch_out=48, stride=[1 1], group=1, kernel=1x1, pad=[0 0 0 0], dilation=[1 1], flag_bias=0, flag_act=0, dtype=float)
fc [4 8] (flag_bias=1, param_dim=8x1000)
batchnorm [1 8 64 64] (epsilon=1e-4f, momentum=0.9f)
pooling [1 8 64 64] (stride=[2 2], kernel=2x2, pad=[0 0 0 0], exclusive=0, pooling_type=avg)
activation [1 8 64 64] (act_type=relu)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include <memory>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/activation_compute.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/tensor_utils.h"
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::DDim DDim;
typedef paddle::lite::operators::ActivationParam ActivationParam;
using paddle::lite::profile::Timer;
int main(int argc, char** argv) {
if (argc != 10) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <batch_size>\n"
<< " <input_channel>\n"
<< " <input_height>\n"
<< " <input_width>\n"
<< " <act_type>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>" << std::endl;
return 0;
}
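  // Example invocation (illustrative values; the binary name
  // get_activation_latency follows the get_<op>_latency pattern used by the
  // Python driver script):
  //   ./get_activation_latency 1 8 64 64 1 1 0 10 30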
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int batch_size = atoi(argv[1]);
int input_channel = atoi(argv[2]);
int input_height = atoi(argv[3]);
int input_width = atoi(argv[4]);
int thread_num = atoi(argv[6]);
int power_mode = atoi(argv[7]);
int warmup = atoi(argv[8]);
int repeats = atoi(argv[9]);
int act_type = atoi(argv[5]);
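  // act_type follows paddle::lite_api::ActivationType numbering, matching the
  // dispatch below: 1-relu, 2-relu6, 4-leaky_relu, 5-sigmoid, 6-tanh, 7-swish,
  // 8-exp, 9-abs, 10-hard_swish, 11-reciprocal, 12-thresholded_relu.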
const float six = 6.f;
const float leakey_relu_scale = 8.88f;
#ifdef LITE_WITH_ARM
ActivationParam act_param;
Tensor x, y;
DDim dim_in = DDim({batch_size, input_channel, input_height, input_width});
x.set_precision(PRECISION(kFloat));
x.Resize(dim_in);
paddle::lite::fill_tensor_rand(x, -1.f, 1.f);
act_param.X = &x;
act_param.active_type = (paddle::lite_api::ActivationType)act_type;
act_param.has_active = true;
if (act_type == 2) {
act_param.Relu_clipped_coef = six;
} else if (act_type == 4) {
act_param.Leaky_relu_alpha = leakey_relu_scale;
}
act_param.Out = &y;
act_param.Out->set_precision(PRECISION(kFloat));
act_param.Out->Resize(dim_in);
Timer t0;
if (act_type == 1) {
paddle::lite::kernels::arm::ReluCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 2) {
paddle::lite::kernels::arm::Relu6Compute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 4) {
paddle::lite::kernels::arm::LeakyReluCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 5) {
paddle::lite::kernels::arm::SigmoidCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 6) {
paddle::lite::kernels::arm::TanhCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 7) {
paddle::lite::kernels::arm::SwishCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 8) {
paddle::lite::kernels::arm::ExpCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 9) {
paddle::lite::kernels::arm::AbsCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 10) {
paddle::lite::kernels::arm::HardSwishCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 11) {
paddle::lite::kernels::arm::ReciprocalCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 12) {
paddle::lite::kernels::arm::ThresholdedReluCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
#endif
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/batch_norm_compute.h"
#include "lite/operators/op_params.h"
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::kernels::arm::BatchNormCompute BatchNormCompute;
using paddle::lite::profile::Timer;
int main(int argc, char** argv) {
if (argc != 11) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <batch_size>\n"
<< " <input_channel>\n"
<< " <input_height>\n"
<< " <input_width>\n"
<< " <epsilon>\n"
<< " <momentum>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>\n"
<< std::endl;
return 0;
}
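  // Example invocation (illustrative values; the binary name
  // get_batchnorm_latency follows the get_<op>_latency pattern used by the
  // Python driver script):
  //   ./get_batchnorm_latency 1 8 64 64 1e-4 0.9 1 0 10 30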
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int batch_size = atoi(argv[1]);
int input_channel = atoi(argv[2]);
int input_height = atoi(argv[3]);
int input_width = atoi(argv[4]);
float epsilon = atof(argv[5]);
float momentum = atof(argv[6]);
int thread_num = atoi(argv[7]);
int power_mode = atoi(argv[8]);
int warmup = atoi(argv[9]);
int repeats = atoi(argv[10]);
#ifdef LITE_WITH_ARM
Tensor x;
Tensor scale;
Tensor bias;
Tensor mean;
Tensor variance;
Tensor y;
Tensor mean_out;
Tensor variance_out;
Tensor saved_mean;
Tensor saved_variance;
std::vector<int64_t> in_out_shape = {
batch_size, input_channel, input_height, input_width};
x.Resize(in_out_shape);
scale.Resize({input_channel});
bias.Resize({input_channel});
mean.Resize({input_channel});
variance.Resize({input_channel});
y.Resize(in_out_shape);
mean_out.Resize({input_channel});
variance_out.Resize({input_channel});
saved_mean.Resize({input_channel});
saved_variance.Resize({input_channel});
// initialize the data of input tensors
auto* x_data = x.mutable_data<float>();
auto* scale_data = scale.mutable_data<float>();
auto* bias_data = bias.mutable_data<float>();
auto* mean_data = mean.mutable_data<float>();
auto* variance_data = variance.mutable_data<float>();
for (int i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i % 64);
}
for (int i = 0; i < scale.dims().production(); i++) {
scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
}
for (int i = 0; i < bias.dims().production(); i++) {
bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
}
for (int i = 0; i < mean.dims().production(); i++) {
mean_data[i] = static_cast<float>(i) * 0.0565f;
}
for (int i = 0; i < variance.dims().production(); i++) {
variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
}
// prepare kernel params and run
BatchNormCompute batch_norm;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
batch_norm.SetContext(std::move(ctx1));
paddle::lite::operators::BatchNormParam param;
param.x = &x;
param.scale = &scale;
param.bias = &bias;
param.mean = &mean;
param.variance = &variance;
param.is_test = false;
param.use_global_stats = true;
param.epsilon = epsilon;
param.momentum = momentum;
param.data_layout = DATALAYOUT(kNCHW);
param.y = &y;
param.mean_out = &mean_out;
param.variance_out = &variance_out;
param.saved_mean = &saved_mean;
param.saved_variance = &saved_variance;
batch_norm.SetParam(param);
// warm up
for (int i = 0; i < warmup; ++i) {
batch_norm.Launch();
}
// compute
Timer t0;
for (int i = 0; i < repeats; ++i) {
t0.Start();
batch_norm.Launch();
t0.Stop();
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
#endif
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/conv_compute.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/tensor_utils.h"
typedef paddle::lite::operators::ConvParam ConvParam;
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::DDim DDim;
typedef paddle::lite::operators::ActivationParam ActivationParam;
using paddle::lite::profile::Timer;
using paddle::lite_api::PrecisionType;
DDim compute_out_dim(const DDim& dim_in,
const paddle::lite::operators::ConvParam& param) {
DDim dim_out = dim_in;
auto paddings = *param.paddings;
auto dilations = *param.dilations;
dim_out[1] = param.filter->dims()[0];
auto kernel_h = param.filter->dims()[2];
auto kernel_w = param.filter->dims()[3];
auto h = dim_in[2];
auto w = dim_in[3];
int dila_h = dilations[0];
int dila_w = dilations[1];
int pad_top = paddings[0];
int pad_bottom = paddings[1];
int pad_left = paddings[2];
int pad_right = paddings[3];
int stride_h = param.strides[0];
int stride_w = param.strides[1];
auto kernel_exten = dila_h * (kernel_h - 1) + 1;
auto hout = (h + pad_top + pad_bottom - kernel_exten) / stride_h + 1;
kernel_exten = dila_w * (kernel_w - 1) + 1;
auto wout = (w + pad_left + pad_right - kernel_exten) / stride_w + 1;
dim_out[2] = hout;
dim_out[3] = wout;
return dim_out;
}
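// For example, with h = 112, kernel_h = 3, pad_top = pad_bottom = 1,
// dilation_h = 1 and stride_h = 2: kernel_exten = 3 and
// hout = (112 + 2 - 3) / 2 + 1 = 56 (integer division).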
template <PrecisionType Ptype, PrecisionType OutType>
void test_conv(const DDim& input_dims,
const DDim& weight_dims,
const int group,
const std::vector<int>& strides,
const std::vector<int>& pads,
const std::vector<int>& dilas,
const bool flag_bias,
const int flag_act,
const int thread_num,
const int power_mode,
const int warmup,
const int repeats,
const float leakey_relu_scale = 8.88f) {
ConvParam param;
Tensor x, f, y;
Tensor bias;
param.x = &x;
param.x->set_precision(Ptype);
param.filter = &f;
param.filter->Resize(weight_dims);
param.filter->set_precision(Ptype);
if (flag_bias) {
param.bias = &bias;
param.bias->Resize({weight_dims[0]});
param.bias->set_precision(PRECISION(kFloat));
}
param.strides = strides;
param.paddings = std::make_shared<std::vector<int>>(pads);
param.dilations = std::make_shared<std::vector<int>>(dilas);
param.groups = group;
const float six = 6.f;
if (Ptype == PRECISION(kInt8)) {
std::vector<float> scale_in{1.f / 127};
std::vector<float> scale_out(1, weight_dims.count(1, 4) / 127.f);
if (flag_act == 2) {
scale_out[0] = six / 127.f;
} else if (flag_act == 4) {
if (std::abs(leakey_relu_scale) > 1) {
scale_out[0] *= std::abs(leakey_relu_scale);
}
}
std::vector<float> scale_w(weight_dims[0], 1.f / 127);
param.input_scale = scale_in[0];
param.output_scale = scale_out[0];
param.weight_scale = scale_w;
}
if (flag_act > 0) {
ActivationParam act_param;
act_param.has_active = true;
act_param.active_type = (paddle::lite_api::ActivationType)
flag_act; // 1-relu, 2-relu6, 4-leakyrelu
if (flag_act == 1) {
param.fuse_relu = true;
} else if (flag_act == 2) {
act_param.Relu_clipped_coef = six;
} else if (flag_act == 4) {
act_param.Leaky_relu_alpha = leakey_relu_scale;
}
param.activation_param = act_param;
}
param.output = &y;
param.output->set_precision(OutType);
paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f);
if (flag_bias) {
paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f);
}
paddle::lite::kernels::arm::ConvCompute<Ptype, OutType> conv;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
param.x->Resize(input_dims);
DDim dim_out = compute_out_dim(input_dims, param);
param.output->Resize(dim_out);
conv.SetParam(param);
conv.SetContext(std::move(ctx1));
conv.PrepareForRun();
paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f);
// warm up
for (int i = 0; i < warmup; ++i) {
conv.Launch();
}
// compute
Timer t0;
for (int i = 0; i < repeats; ++i) {
t0.Start();
conv.Launch();
t0.Stop();
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
}
int main(int argc, char** argv) {
if (argc != 23) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <batch_size>\n"
<< " <input_channel>\n"
<< " <input_height>\n"
<< " <input_width>\n"
<< " <output_channel>\n"
<< " <group_size>\n"
<< " <kernel_size>\n"
<< " <pad_top>\n"
<< " <pad_bottom>\n"
<< " <pad_left>\n"
<< " <pad_right>\n"
<< " <stride_h>\n"
<< " <stride_w>\n"
<< " <dilation_h>\n"
<< " <dilation_w>\n"
<< " <flag_bias>\n"
<< " <flag_act>\n"
<< " <dtype>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>\n"
<< std::endl;
return 0;
}
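  // Example invocation (illustrative values; the binary name get_conv_latency
  // follows the get_<op>_latency pattern used by the Python driver script):
  // a 1x1 fp32 convolution, 96 -> 48 channels on a 112x112 input, one thread:
  //   ./get_conv_latency 1 96 112 112 48 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 10 30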
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int batch_size = atoi(argv[1]);
int input_channel = atoi(argv[2]);
int input_height = atoi(argv[3]);
int input_width = atoi(argv[4]);
int output_channel = atoi(argv[5]);
int group_size = atoi(argv[6]);
int kernel_size = atoi(argv[7]);
int pad_top = atoi(argv[8]);
int pad_bottom = atoi(argv[9]);
int pad_left = atoi(argv[10]);
int pad_right = atoi(argv[11]);
int stride_h = atoi(argv[12]);
int stride_w = atoi(argv[13]);
int dilation_h = atoi(argv[14]);
int dilation_w = atoi(argv[15]);
int flag_bias = atoi(argv[16]);
int flag_act = atoi(argv[17]);
int dtype = atoi(argv[18]);
int thread_num = atoi(argv[19]);
int power_mode = atoi(argv[20]);
int warmup = atoi(argv[21]);
int repeats = atoi(argv[22]);
DDim weight_dims(
{output_channel, input_channel / group_size, kernel_size, kernel_size});
DDim input_dims({batch_size, input_channel, input_height, input_width});
switch (dtype) {
case 0:
test_conv<PRECISION(kFloat), PRECISION(kFloat)>(
input_dims,
weight_dims,
group_size,
{stride_h, stride_w},
{pad_top, pad_bottom, pad_left, pad_right},
{dilation_h, dilation_w},
flag_bias,
flag_act,
thread_num,
power_mode,
warmup,
repeats);
break;
case 1:
test_conv<PRECISION(kInt8), PRECISION(kFloat)>(
input_dims,
weight_dims,
group_size,
{stride_h, stride_w},
{pad_top, pad_bottom, pad_left, pad_right},
{dilation_h, dilation_w},
flag_bias,
flag_act,
thread_num,
power_mode,
warmup,
repeats);
break;
case 2:
test_conv<PRECISION(kInt8), PRECISION(kInt8)>(
input_dims,
weight_dims,
group_size,
{stride_h, stride_w},
{pad_top, pad_bottom, pad_left, pad_right},
{dilation_h, dilation_w},
flag_bias,
flag_act,
thread_num,
power_mode,
warmup,
repeats);
break;
default:
test_conv<PRECISION(kFloat), PRECISION(kFloat)>(
input_dims,
weight_dims,
group_size,
{stride_h, stride_w},
{pad_top, pad_bottom, pad_left, pad_right},
{dilation_h, dilation_w},
flag_bias,
flag_act,
thread_num,
power_mode,
warmup,
repeats);
}
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include <string>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/fc_compute.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/tensor_utils.h"
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::DDim DDim;
typedef paddle::lite::operators::FcParam FcParam;
using paddle::lite::profile::Timer;
using paddle::lite_api::PrecisionType;
template <PrecisionType Ptype, PrecisionType OutType>
void test_fc(const int m,
const int n,
const int k,
const bool has_bias,
const int thread_num,
const int power_mode,
const int warmup,
const int repeats) {
FcParam param;
Tensor x, y, bias, w;
param.input = &x;
param.input->set_precision(Ptype);
param.input->Resize({m, k});
param.w = &w;
param.w->set_precision(Ptype);
param.w->Resize({k, n});
if (has_bias) {
param.bias = &bias;
param.bias->set_precision(Ptype);
param.bias->Resize({1, n});
} else {
param.bias = nullptr;
}
param.output = &y;
param.output->set_precision(OutType);
param.output->Resize({m, n});
param.in_num_col_dims = 1;
param.in_mat_dims = param.input->dims();
paddle::lite::kernels::arm::FcCompute<Ptype, OutType> fc_compute;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
// set param and context
fc_compute.SetParam(param);
fc_compute.SetContext(std::move(ctx1));
// prepare for run
fc_compute.PrepareForRun();
paddle::lite::fill_tensor_rand(*param.input, -1.f, 1.f);
paddle::lite::fill_tensor_rand(*param.w, -1.f, 1.f);
if (has_bias) {
paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f);
}
// warm up
for (int i = 0; i < warmup; ++i) {
fc_compute.Launch();
}
// compute
Timer t0;
for (int i = 0; i < repeats; ++i) {
t0.Start();
fc_compute.Launch();
t0.Stop();
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
}
int main(int argc, char** argv) {
if (argc != 10) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <m>\n"
<< " <n>\n"
<< " <k>\n"
<< " <has_bias>\n"
<< " <dtype>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>\n"
<< std::endl;
return 0;
}
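  // Example invocation (illustrative values; the binary name get_fc_latency
  // follows the get_<op>_latency pattern used by the Python driver script):
  // m=4, n=1000, k=8, with bias, fp32, one thread:
  //   ./get_fc_latency 4 1000 8 1 float 1 0 10 30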
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int k = atoi(argv[3]);
bool has_bias = atoi(argv[4]) == 0 ? false : true;
  std::string dtype_str(argv[5]);
  int dtype = dtype_str == "int8_int8" ? 2 : dtype_str == "float_int8" ? 1 : 0;
int thread_num = atoi(argv[6]);
int power_mode = atoi(argv[7]);
int warmup = atoi(argv[8]);
int repeats = atoi(argv[9]);
switch (dtype) {
case 0:
test_fc<PRECISION(kFloat), PRECISION(kFloat)>(
m, n, k, has_bias, thread_num, power_mode, warmup, repeats);
break;
case 1:
test_fc<PRECISION(kInt8), PRECISION(kFloat)>(
m, n, k, has_bias, thread_num, power_mode, warmup, repeats);
break;
case 2:
test_fc<PRECISION(kInt8), PRECISION(kInt8)>(
m, n, k, has_bias, thread_num, power_mode, warmup, repeats);
break;
default:
test_fc<PRECISION(kFloat), PRECISION(kFloat)>(
m, n, k, has_bias, thread_num, power_mode, warmup, repeats);
break;
}
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/pool_compute.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/tensor_utils.h"
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::DDim DDim;
typedef paddle::lite::operators::PoolParam PoolParam;
using paddle::lite::profile::Timer;
DDim compute_out_dim(const DDim& dim_in,
const paddle::lite::operators::PoolParam& param) {
DDim dim_out = dim_in;
auto kernel_h = param.ksize[0];
auto kernel_w = param.ksize[1];
auto h = dim_in[2];
auto w = dim_in[3];
auto paddings = *param.paddings;
int stride_h = param.strides[0];
int stride_w = param.strides[1];
bool ceil_mode = param.ceil_mode;
bool flag_global = param.global_pooling;
int hout = 1;
int wout = 1;
if (!flag_global) {
if (!ceil_mode) {
hout = (h - kernel_h + paddings[0] + paddings[1]) / stride_h + 1;
wout = (w - kernel_w + paddings[2] + paddings[3]) / stride_w + 1;
} else {
hout =
(h - kernel_h + paddings[0] + paddings[1] + stride_h - 1) / stride_h +
1;
wout =
(w - kernel_w + paddings[2] + paddings[3] + stride_w - 1) / stride_w +
1;
}
}
dim_out[2] = hout;
dim_out[3] = wout;
return dim_out;
}
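// For example, 2x2 max pooling with stride 2, no padding and ceil_mode = 0 on
// a 64x64 input gives hout = wout = (64 - 2 + 0) / 2 + 1 = 32, matching the
// sample lookup-table entry above.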
int main(int argc, char** argv) {
if (argc != 20) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <batch_size>\n"
<< " <input_channel>\n"
<< " <input_height>\n"
<< " <input_width>\n"
<< " <kernel_size>\n"
<< " <stride_size>\n"
<< " <pad_size>\n"
<< " <exclusive>\n"
<< " <pooling_type>\n"
<< " <ceil_mode>\n"
<< " <flag_global>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>\n"
<< std::endl;
return 0;
}
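  // Example invocation (illustrative values; the binary name
  // get_pooling_latency follows the get_<op>_latency pattern used by the
  // Python driver script): 2x2 average pooling, stride 2, no padding:
  //   ./get_pooling_latency 1 8 64 64 2 2 0 0 0 0 2 0 0 0 1 1 0 10 30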
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int batch_size = atoi(argv[1]);
int input_channel = atoi(argv[2]);
int input_height = atoi(argv[3]);
int input_width = atoi(argv[4]);
int stride_h = atoi(argv[5]);
int stride_w = atoi(argv[6]);
int pad_top = atoi(argv[7]);
int pad_bottom = atoi(argv[8]);
int pad_left = atoi(argv[9]);
int pad_right = atoi(argv[10]);
int kernel_size = atoi(argv[11]);
  bool ceil_mode = atoi(argv[12]) == 0 ? false : true;
  bool flag_global = atoi(argv[13]) == 0 ? false : true;
bool exclusive = atoi(argv[14]) == 0 ? false : true;
std::string pooling_type = atoi(argv[15]) == 0 ? "max" : "avg";
int thread_num = atoi(argv[16]);
int power_mode = atoi(argv[17]);
int warmup = atoi(argv[18]);
int repeats = atoi(argv[19]);
#ifdef LITE_WITH_ARM
PoolParam param;
Tensor x, y;
param.x = &x;
param.x->set_precision(PRECISION(kFloat));
param.ksize = {kernel_size, kernel_size};
param.strides = {stride_h, stride_w};
param.paddings = std::make_shared<std::vector<int>>(
std::vector<int>{pad_top, pad_bottom, pad_left, pad_right});
param.ceil_mode = ceil_mode;
param.global_pooling = flag_global;
param.pooling_type = pooling_type;
param.exclusive = exclusive;
param.adaptive = false;
param.use_quantizer = false;
param.output = &y;
param.output->set_precision(PRECISION(kFloat));
paddle::lite::kernels::arm::PoolCompute pool;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
// set param and context
pool.SetParam(param);
pool.SetContext(std::move(ctx1));
// prepare for run
pool.PrepareForRun();
DDim dim_in = DDim({batch_size, input_channel, input_height, input_width});
DDim dim_out = compute_out_dim(dim_in, param);
param.x->Resize(dim_in);
param.output->Resize(dim_out);
paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f);
// warm up
for (int i = 0; i < warmup; ++i) {
pool.Launch();
}
// compute
Timer t0;
for (int i = 0; i < repeats; ++i) {
t0.Start();
pool.Launch();
t0.Stop();
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
#endif
return 0;
}
...@@ -130,7 +130,6 @@ void TestCast(Place place, float abs_error, int in_dtype, int out_dtype) { ...@@ -130,7 +130,6 @@ void TestCast(Place place, float abs_error, int in_dtype, int out_dtype) {
} }
TEST(Cast, precision) { TEST(Cast, precision) {
LOG(INFO) << "test cast op";
Place place; Place place;
float abs_error = 2e-5; float abs_error = 2e-5;
#if defined(LITE_WITH_ARM) #if defined(LITE_WITH_ARM)
...@@ -150,7 +149,7 @@ TEST(Cast, precision) { ...@@ -150,7 +149,7 @@ TEST(Cast, precision) {
TestCast(place, abs_error, 20, 5); TestCast(place, abs_error, 20, 5);
#endif #endif
TestCast(place, abs_error, 2, 5); TestCast(place, abs_error, 2, 5);
#if defined(LITE_WITH_XPU) || defined(LITE_WITH_HUAWEI_ASCEND_NPU) #if defined(LITE_WITH_HUAWEI_ASCEND_NPU)
TestCast(place, abs_error, 3, 5); TestCast(place, abs_error, 3, 5);
TestCast(place, abs_error, 5, 3); TestCast(place, abs_error, 5, 3);
#endif #endif
......
...@@ -121,9 +121,9 @@ class FcOPTest : public arena::TestCase { ...@@ -121,9 +121,9 @@ class FcOPTest : public arena::TestCase {
int k = wdims_[0]; int k = wdims_[0];
int n = wdims_[1]; int n = wdims_[1];
LOG(INFO) << "M=" << m << ", N=" << n << ", K=" << k VLOG(4) << "M=" << m << ", N=" << n << ", K=" << k << ", bias=" << flag_bias
<< ", bias=" << flag_bias << ", with_relu=" << with_relu_ << ", with_relu=" << with_relu_
<< ", padding_weights=" << padding_weights_; << ", padding_weights=" << padding_weights_;
if (m == 1) { if (m == 1) {
basic_gemv(n, basic_gemv(n,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
template <class T = float, class R = int64_t>
class GatherComputeTest : public arena::TestCase { class GatherComputeTest : public arena::TestCase {
protected: protected:
// common attributes for this op. // common attributes for this op.
...@@ -53,9 +54,9 @@ class GatherComputeTest : public arena::TestCase { ...@@ -53,9 +54,9 @@ class GatherComputeTest : public arena::TestCase {
out_dims[0] = batch_size; out_dims[0] = batch_size;
out->Resize(out_dims); out->Resize(out_dims);
auto x_data = x->data<int64_t>(); auto x_data = x->template data<T>();
auto index_data = index->data<int64_t>(); auto index_data = index->template data<R>();
auto out_data = out->mutable_data<int64_t>(); auto out_data = out->template mutable_data<T>();
auto slice_num = x_dims[0]; auto slice_num = x_dims[0];
auto slice_size = x_dims.Slice(1, x_dims.size()).production(); auto slice_size = x_dims.Slice(1, x_dims.size()).production();
...@@ -66,7 +67,7 @@ class GatherComputeTest : public arena::TestCase { ...@@ -66,7 +67,7 @@ class GatherComputeTest : public arena::TestCase {
CHECK_GE(index, 0) << "gather ids[i] expected >= 0 but got " << index; CHECK_GE(index, 0) << "gather ids[i] expected >= 0 but got " << index;
memcpy(out_data + i * slice_size, memcpy(out_data + i * slice_size,
x_data + index * slice_size, x_data + index * slice_size,
slice_size * sizeof(int64_t)); slice_size * sizeof(T));
} }
} }
...@@ -78,11 +79,12 @@ class GatherComputeTest : public arena::TestCase { ...@@ -78,11 +79,12 @@ class GatherComputeTest : public arena::TestCase {
} }
void PrepareData() override { void PrepareData() override {
std::vector<int64_t> x(x_dims_.production()); std::vector<T> x(x_dims_.production());
fill_data_rand(x.data(), int64_t(-1), int64_t(1), x_dims_.production()); fill_data_rand(
x.data(), static_cast<T>(-1), static_cast<T>(1), x_dims_.production());
std::vector<int64_t> index(index_dims_.production()); std::vector<R> index(index_dims_.production());
fill_data_rand<int64_t>( fill_data_rand<R>(
index.data(), 0, x_dims_[0] - 1, index_dims_.production()); index.data(), 0, x_dims_[0] - 1, index_dims_.production());
SetCommonTensor(x_, x_dims_, x.data()); SetCommonTensor(x_, x_dims_, x.data());
...@@ -90,8 +92,20 @@ class GatherComputeTest : public arena::TestCase { ...@@ -90,8 +92,20 @@ class GatherComputeTest : public arena::TestCase {
} }
}; };
template <class T = float, class R = int64_t>
void TestGather(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& index_dims,
Place place,
float abs_error = 1e-5,
const std::string& alias = "def") {
std::unique_ptr<arena::TestCase> tester(new GatherComputeTest<T, R>(
place, alias, DDim(x_dims), DDim(index_dims)));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
TEST(Gather, precision) { TEST(Gather, precision) {
float abs_error = 2e-5; float abs_error = 1e-5;
Place place; Place place;
#if defined(LITE_WITH_NPU) #if defined(LITE_WITH_NPU)
place = TARGET(kNPU); place = TARGET(kNPU);
...@@ -110,10 +124,14 @@ TEST(Gather, precision) { ...@@ -110,10 +124,14 @@ TEST(Gather, precision) {
for (auto x_dims : for (auto x_dims :
std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) { std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) {
for (auto index_dims : std::vector<std::vector<int64_t>>{{3}, {7}, {10}}) { for (auto index_dims : std::vector<std::vector<int64_t>>{{3}, {7}, {10}}) {
std::unique_ptr<arena::TestCase> tester(new GatherComputeTest( #if defined(LITE_WITH_XPU) || defined(LITE_WITH_NPU)
place, "int64", DDim(x_dims), DDim(index_dims))); TestGather<float, int>(x_dims, index_dims, place, abs_error, "def");
arena::Arena arena(std::move(tester), place, abs_error); #else
arena.TestPrecision(); TestGather<float, int64_t>(x_dims, index_dims, place, abs_error, "int64");
TestGather<int64_t, int64_t>(
x_dims, index_dims, place, abs_error, "int64");
TestGather<float, int>(x_dims, index_dims, place, abs_error, "int32");
#endif
} }
} }
} }
......
...@@ -738,7 +738,7 @@ TEST(PriorBox, precision) { ...@@ -738,7 +738,7 @@ TEST(PriorBox, precision) {
} }
TEST(DensityPriorBox, precision) { TEST(DensityPriorBox, precision) {
#ifdef LITE_WITH_X86 #if defined(LITE_WITH_X86) && !defined(LITE_WITH_XPU)
Place place(TARGET(kX86)); Place place(TARGET(kX86));
test_density_prior_box(place); test_density_prior_box(place);
#endif #endif
......
...@@ -372,7 +372,7 @@ function make_x86 { ...@@ -372,7 +372,7 @@ function make_x86 {
build_directory=$BUILD_DIR/build.lite.x86 build_directory=$BUILD_DIR/build.lite.x86
if [ ${WITH_HUAWEI_ASCEND_NPU} == "ON" ]; then if [ ${WITH_HUAWEI_ASCEND_NPU} == "ON" ]; then
export CXX=/usr/bin/g++ # Ascend need g++ in centos export CXX=g++ # Huawei Ascend NPU need g++
build_directory=$BUILD_DIR/build.lite.huawei_ascend_npu build_directory=$BUILD_DIR/build.lite.huawei_ascend_npu
fi fi
......