未验证 提交 b16ae4e2 作者: tensor-tang 提交者: GitHub

[lite] fix fc bias and enable armv7 fc (#17695)

上级 4b253569
...@@ -45,7 +45,8 @@ if (NOT ANDROID_ARCH_ABI IN_LIST ANDROID_ARCH_ABI_LIST) ...@@ -45,7 +45,8 @@ if (NOT ANDROID_ARCH_ABI IN_LIST ANDROID_ARCH_ABI_LIST)
endif() endif()
if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
message(STATUS "NEON is enabled on arm-v7a") set(CMAKE_ANDROID_ARM_NEON ON)
message(STATUS "NEON is enabled on arm-v7a with softfp")
endif() endif()
set(ANDROID_STL_TYPE_LITS "gnustl_static" "c++_static") set(ANDROID_STL_TYPE_LITS "gnustl_static" "c++_static")
......
...@@ -21,14 +21,14 @@ namespace arm { ...@@ -21,14 +21,14 @@ namespace arm {
namespace math { namespace math {
template <> template <>
void fill_bias_fc<float>(float *tensor, const float *bias, const int num, void fill_bias_fc<float>(float *out, const float *bias, const int num,
const int channel) { const int channel) {
int cnt = channel >> 4; int cnt = channel >> 4;
int remain = channel & 15; int remain = channel & 15;
for (int j = 0; j < num; ++j) { for (int j = 0; j < num; ++j) {
const float *ptr_bias = bias; const float *ptr_bias = bias;
float *ptr_out = tensor + j * channel; float *ptr_out = out + j * channel;
float32x4_t vout1; float32x4_t vout1;
float32x4_t vout2; float32x4_t vout2;
...@@ -61,7 +61,6 @@ void fill_bias_fc<float>(float *tensor, const float *bias, const int num, ...@@ -61,7 +61,6 @@ void fill_bias_fc<float>(float *tensor, const float *bias, const int num,
ptr_out += 16; ptr_out += 16;
ptr_bias += 16; ptr_bias += 16;
} }
#if 0 #if 0
if (cnt > 0) { if (cnt > 0) {
asm( asm(
...@@ -79,21 +78,21 @@ void fill_bias_fc<float>(float *tensor, const float *bias, const int num, ...@@ -79,21 +78,21 @@ void fill_bias_fc<float>(float *tensor, const float *bias, const int num,
); );
} }
#endif #endif
for (; remain > 0; remain--) { for (int i = 0; i < remain; ++i) {
*(ptr_out++) += *(ptr_bias++); *(ptr_out++) += *(ptr_bias++);
} }
} }
} }
template <> template <>
void fill_bias_fc<int>(int *tensor, const int *bias, const int num, void fill_bias_fc<int>(int *out, const int *bias, const int num,
const int channel) { const int channel) {
int cnt = channel >> 4; int cnt = channel >> 4;
int remain = channel & 15; int remain = channel & 15;
for (int j = 0; j < num; ++j) { for (int j = 0; j < num; ++j) {
const int *ptr_bias = bias; const int *ptr_bias = bias;
int *ptr_out = tensor + j * channel; int *ptr_out = out + j * channel;
int32x4_t vout1; int32x4_t vout1;
int32x4_t vout2; int32x4_t vout2;
...@@ -144,7 +143,7 @@ void fill_bias_fc<int>(int *tensor, const int *bias, const int num, ...@@ -144,7 +143,7 @@ void fill_bias_fc<int>(int *tensor, const int *bias, const int num,
); );
} }
#endif #endif
for (; remain > 0; remain--) { for (int i = 0; i < remain; ++i) {
*(ptr_out++) += *(ptr_bias++); *(ptr_out++) += *(ptr_bias++);
} }
} }
......
...@@ -2751,7 +2751,7 @@ void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias, ...@@ -2751,7 +2751,7 @@ void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias,
bool transB, ARMContext* ctx) { bool transB, ARMContext* ctx) {
size_t l2_cache = size_t l2_cache =
ctx->l2_cache_size() > 0 ? ctx->l2_cache_size() : 512 * 1024; ctx->l2_cache_size() > 0 ? ctx->l2_cache_size() : 512 * 1024;
void* workspace = ctx->get_work_space(); auto* workspace = ctx->workspace_data<float>();
int threads = ctx->threads(); int threads = ctx->threads();
//! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2
int x_block = int x_block =
...@@ -2785,7 +2785,7 @@ void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias, ...@@ -2785,7 +2785,7 @@ void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias,
flag_p_remain = true; flag_p_remain = true;
} }
//! load bpanel //! load bpanel
float* b_pannel = static_cast<float*>(workspace); float* b_pannel = workspace;
if (transB) { if (transB) {
loadb_trans(b_pannel, B, K, 0, K, x0, xmax); loadb_trans(b_pannel, B, K, 0, K, x0, xmax);
} else { } else {
......
...@@ -20,7 +20,7 @@ namespace lite { ...@@ -20,7 +20,7 @@ namespace lite {
#ifdef LITE_WITH_ARM #ifdef LITE_WITH_ARM
void DeviceInfo::get_info(DeviceInfo* dev) { void DeviceInfo::InitInternal(DeviceInfo* dev) {
set_default_cache(dev); set_default_cache(dev);
dev->compute_core_num_ = arm_get_cpucount(); dev->compute_core_num_ = arm_get_cpucount();
dev->max_memory_ = arm_get_meminfo(); dev->max_memory_ = arm_get_meminfo();
......
...@@ -85,14 +85,14 @@ class DeviceInfo { ...@@ -85,14 +85,14 @@ class DeviceInfo {
return *x; return *x;
} }
static void init_info() { static void Init() {
auto& info = Global(); auto& info = Global();
get_info(&info); InitInternal(&info);
} }
private: private:
DeviceInfo() = default; DeviceInfo() = default;
static void get_info(DeviceInfo* dev); static void InitInternal(DeviceInfo* dev);
}; };
size_t arm_get_meminfo(); size_t arm_get_meminfo();
......
...@@ -64,7 +64,7 @@ class RuntimeContextAssignPass : public StmtPass { ...@@ -64,7 +64,7 @@ class RuntimeContextAssignPass : public StmtPass {
#ifdef LITE_WITH_ARM #ifdef LITE_WITH_ARM
std::unique_ptr<KernelContext> NewARMContext() { std::unique_ptr<KernelContext> NewARMContext() {
DeviceInfo::init_info(); DeviceInfo::Init();
std::unique_ptr<KernelContext> ctx(new KernelContext); std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>(); ctx->As<ARMContext>();
return ctx; return ctx;
......
...@@ -9,7 +9,7 @@ cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps}) ...@@ -9,7 +9,7 @@ cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3) cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} eigen3) cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} eigen3)
lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm eigen3) lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
set(arm_kernels set(arm_kernels
fc_compute_arm fc_compute_arm
......
...@@ -61,8 +61,6 @@ TEST(fc_arm, compare_test) { ...@@ -61,8 +61,6 @@ TEST(fc_arm, compare_test) {
b_data[i] = static_cast<float>(i); b_data[i] = static_cast<float>(i);
} }
// TODO(TJ): enable bias soon
b_data = nullptr;
lite::arm::math::fc_compute_eigen(x_data, batch_size, 3, // lite::arm::math::fc_compute_eigen(x_data, batch_size, 3, //
w_data, 3, 4, // w_data, 3, 4, //
b_data, ref_data); b_data, ref_data);
...@@ -74,11 +72,11 @@ TEST(fc_arm, compare_test) { ...@@ -74,11 +72,11 @@ TEST(fc_arm, compare_test) {
param.in_num_col_dims = 1; param.in_num_col_dims = 1;
param.input = &x; param.input = &x;
param.w = &w; param.w = &w;
param.bias = nullptr; param.bias = &b;
param.output = &out; param.output = &out;
param.in_mat_dims = x.dims(); param.in_mat_dims = x.dims();
DeviceInfo::init_info(); DeviceInfo::Init();
std::unique_ptr<KernelContext> ctx(new KernelContext); std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>(); ctx->As<ARMContext>();
fc.SetParam(param); fc.SetParam(param);
...@@ -136,7 +134,7 @@ TEST(fc_arm, num_col_dims) { ...@@ -136,7 +134,7 @@ TEST(fc_arm, num_col_dims) {
std::unique_ptr<KernelContext> ctx(new KernelContext); std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>(); ctx->As<ARMContext>();
DeviceInfo::init_info(); DeviceInfo::Init();
fc.SetParam(param); fc.SetParam(param);
fc.SetContext(std::move(ctx)); fc.SetContext(std::move(ctx));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请注册