Unverified · Commit b16ae4e2 authored by tensor-tang, committed by GitHub

[lite] fix fc bias and enable armv7 fc (#17695)

Parent 4b253569
@@ -45,7 +45,8 @@ if (NOT ANDROID_ARCH_ABI IN_LIST ANDROID_ARCH_ABI_LIST)
 endif()
 if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
-    message(STATUS "NEON is enabled on arm-v7a")
+    set(CMAKE_ANDROID_ARM_NEON ON)
+    message(STATUS "NEON is enabled on arm-v7a with softfp")
 endif()
 set(ANDROID_STL_TYPE_LITS "gnustl_static" "c++_static")
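Note: `CMAKE_ANDROID_ARM_NEON ON` is what lets armeabi-v7a translation units use `<arm_neon.h>`, which the `fill_bias_fc` hunks below depend on. A minimal sketch of the guarded-intrinsics pattern this enables (the `add4` helper is hypothetical, not part of the patch):

#ifdef __ARM_NEON
#include <arm_neon.h>
// Add one 4-lane chunk of bias onto the output; the vectorized
// fill_bias_fc loop below is built from this primitive.
void add4(float* out, const float* bias) {
  float32x4_t v = vaddq_f32(vld1q_f32(out), vld1q_f32(bias));
  vst1q_f32(out, v);
}
#endif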
@@ -21,14 +21,14 @@ namespace arm {
 namespace math {
 template <>
-void fill_bias_fc<float>(float *tensor, const float *bias, const int num,
+void fill_bias_fc<float>(float *out, const float *bias, const int num,
                          const int channel) {
   int cnt = channel >> 4;
   int remain = channel & 15;
   for (int j = 0; j < num; ++j) {
     const float *ptr_bias = bias;
-    float *ptr_out = tensor + j * channel;
+    float *ptr_out = out + j * channel;
     float32x4_t vout1;
     float32x4_t vout2;
@@ -61,7 +61,6 @@ void fill_bias_fc<float>(float *tensor, const float *bias, const int num,
       ptr_out += 16;
       ptr_bias += 16;
     }
 #if 0
     if (cnt > 0) {
       asm(
@@ -79,21 +78,21 @@ void fill_bias_fc<float>(float *tensor, const float *bias, const int num,
       );
     }
 #endif
-    for (; remain > 0; remain--) {
+    for (int i = 0; i < remain; ++i) {
       *(ptr_out++) += *(ptr_bias++);
     }
   }
 }
 template <>
-void fill_bias_fc<int>(int *tensor, const int *bias, const int num,
+void fill_bias_fc<int>(int *out, const int *bias, const int num,
                        const int channel) {
   int cnt = channel >> 4;
   int remain = channel & 15;
   for (int j = 0; j < num; ++j) {
     const int *ptr_bias = bias;
-    int *ptr_out = tensor + j * channel;
+    int *ptr_out = out + j * channel;
     int32x4_t vout1;
     int32x4_t vout2;
@@ -144,7 +143,7 @@ void fill_bias_fc<int>(int *tensor, const int *bias, const int num,
       );
     }
 #endif
-    for (; remain > 0; remain--) {
+    for (int i = 0; i < remain; ++i) {
       *(ptr_out++) += *(ptr_bias++);
     }
   }
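This remainder loop is the actual bias fix: the old `for (; remain > 0; remain--)` consumed `remain`, so after the first output row it was already zero and rows `j >= 1` never got the last `channel % 16` bias values added. The new `for (int i = 0; i < remain; ++i)` leaves `remain` intact across the outer `j` loop. A minimal scalar sketch of the intended semantics (reference only; the NEON fast path is omitted):

#include <cstdio>

// Add the same bias vector to every row of a num x channel output.
static void fill_bias_fc_ref(float* out, const float* bias, int num,
                             int channel) {
  for (int j = 0; j < num; ++j) {
    float* ptr_out = out + j * channel;
    for (int i = 0; i < channel; ++i) {
      ptr_out[i] += bias[i];  // every row, including the tail elements
    }
  }
}

int main() {
  float out[2 * 3] = {0};
  const float bias[3] = {1, 2, 3};
  fill_bias_fc_ref(out, bias, /*num=*/2, /*channel=*/3);
  for (float v : out) printf("%g ", v);  // expect: 1 2 3 1 2 3
  printf("\n");
  return 0;
}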
@@ -2751,7 +2751,7 @@ void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias,
                     bool transB, ARMContext* ctx) {
   size_t l2_cache =
       ctx->l2_cache_size() > 0 ? ctx->l2_cache_size() : 512 * 1024;
-  void* workspace = ctx->get_work_space();
+  auto* workspace = ctx->workspace_data<float>();
   int threads = ctx->threads();
   //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2
   int x_block =
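The `//!` comment states the cache budget: an MBLOCK x x result tile, an MBLOCK x k packed A panel, and an x x k packed B panel must fit in L2 together. Solving MBLOCK*x + MBLOCK*k + x*k = l2 for x gives the block width; a hedged sketch of that arithmetic (hypothetical helper; the real code also rounds x_block to the register-tile width):

// x = (l2_in_floats - MBLOCK*k) / (MBLOCK + k)
int solve_x_block(int l2_bytes, int mblock, int k) {
  int l2_elems = l2_bytes / static_cast<int>(sizeof(float));
  int x = (l2_elems - mblock * k) / (mblock + k);
  return x > 0 ? x : 1;  // guard against tiny caches or huge k
}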
@@ -2785,7 +2785,7 @@ void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias,
       flag_p_remain = true;
     }
     //! load bpanel
-    float* b_pannel = static_cast<float*>(workspace);
+    float* b_pannel = workspace;
     if (transB) {
       loadb_trans(b_pannel, B, K, 0, K, x0, xmax);
     } else {
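The `static_cast` disappears because the context now hands out a typed pointer: `workspace_data<float>()` replaces the untyped `get_work_space()`. A sketch of that accessor shape (assumed for illustration, not the real ARMContext):

#include <cstddef>
#include <vector>

class ContextSketch {
 public:
  explicit ContextSketch(size_t bytes) : buf_(bytes) {}
  // Old style: untyped pointer, every caller casts.
  void* get_work_space() { return buf_.data(); }
  // New style: the cast lives in exactly one place.
  template <typename T>
  T* workspace_data() { return reinterpret_cast<T*>(buf_.data()); }

 private:
  std::vector<char> buf_;  // raw reusable scratch buffer
};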
@@ -20,7 +20,7 @@ namespace lite {
 #ifdef LITE_WITH_ARM
-void DeviceInfo::get_info(DeviceInfo* dev) {
+void DeviceInfo::InitInternal(DeviceInfo* dev) {
   set_default_cache(dev);
   dev->compute_core_num_ = arm_get_cpucount();
   dev->max_memory_ = arm_get_meminfo();
@@ -85,14 +85,14 @@ class DeviceInfo {
     return *x;
   }
-  static void init_info() {
+  static void Init() {
     auto& info = Global();
-    get_info(&info);
+    InitInternal(&info);
   }
  private:
   DeviceInfo() = default;
-  static void get_info(DeviceInfo* dev);
+  static void InitInternal(DeviceInfo* dev);
 };
 size_t arm_get_meminfo();
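The renames (`init_info` to `Init`, `get_info` to `InitInternal`) keep the same structure: a process-wide singleton plus one public entry point that fills it in. A hedged sketch of the pattern with simplified fields (not the real class):

#include <cstdio>

class DeviceInfoSketch {
 public:
  static DeviceInfoSketch& Global() {
    static DeviceInfoSketch instance;  // one instance per process
    return instance;
  }
  static void Init() { InitInternal(&Global()); }
  int compute_core_num() const { return compute_core_num_; }

 private:
  DeviceInfoSketch() = default;
  static void InitInternal(DeviceInfoSketch* dev) {
    dev->compute_core_num_ = 4;  // stand-in for arm_get_cpucount()
  }
  int compute_core_num_ = 0;
};

int main() {
  DeviceInfoSketch::Init();
  printf("cores: %d\n", DeviceInfoSketch::Global().compute_core_num());
  return 0;
}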
@@ -64,7 +64,7 @@ class RuntimeContextAssignPass : public StmtPass {
 #ifdef LITE_WITH_ARM
   std::unique_ptr<KernelContext> NewARMContext() {
-    DeviceInfo::init_info();
+    DeviceInfo::Init();
     std::unique_ptr<KernelContext> ctx(new KernelContext);
     ctx->As<ARMContext>();
     return ctx;
@@ -9,7 +9,7 @@ cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
 cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
 cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} eigen3)
-lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm eigen3)
+lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
 set(arm_kernels
     fc_compute_arm
@@ -61,8 +61,6 @@ TEST(fc_arm, compare_test) {
     b_data[i] = static_cast<float>(i);
   }
-  // TODO(TJ): enable bias soon
-  b_data = nullptr;
   lite::arm::math::fc_compute_eigen(x_data, batch_size, 3,  //
                                     w_data, 3, 4,           //
                                     b_data, ref_data);
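With `b_data = nullptr;` removed, the reference path finally exercises the bias. For orientation, a scalar sketch of what such a reference FC computes, with layout assumed from the test's batch_size x 3 input and 3 x 4 weights (an illustration only, not the actual fc_compute_eigen):

// out[i][j] = sum_p x[i][p] * w[p][j] + b[j], all row-major.
void fc_ref(const float* x, int m, int k, const float* w, int n,
            const float* b, float* out) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = b ? b[j] : 0.f;  // bias may still be optional
      for (int p = 0; p < k; ++p) acc += x[i * k + p] * w[p * n + j];
      out[i * n + j] = acc;
    }
  }
}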
@@ -74,11 +72,11 @@ TEST(fc_arm, compare_test) {
   param.in_num_col_dims = 1;
   param.input = &x;
   param.w = &w;
-  param.bias = nullptr;
+  param.bias = &b;
   param.output = &out;
   param.in_mat_dims = x.dims();
-  DeviceInfo::init_info();
+  DeviceInfo::Init();
   std::unique_ptr<KernelContext> ctx(new KernelContext);
   ctx->As<ARMContext>();
   fc.SetParam(param);
@@ -136,7 +134,7 @@ TEST(fc_arm, num_col_dims) {
   std::unique_ptr<KernelContext> ctx(new KernelContext);
   ctx->As<ARMContext>();
-  DeviceInfo::init_info();
+  DeviceInfo::Init();
   fc.SetParam(param);
   fc.SetContext(std::move(ctx));