提交 3ea368b1 编写于 作者: X Xiaoyang LI 提交者: cyj1986

Fix gemv int8 error (#2249)

* remove log in reshape, fix conv error when padding size=4, test=develop

* fix style, test=develop

* remove useless code, test=develop

* remove redundant model test file, test=develop

* change cluster to power_mode, test=develop

* fix build error, test=develop

* change cluster to power_mode, test=develop

* change opt_nb to use_optimize_nb, test=develop

* null, test=develop

* add gemv-int8 test, fix clang build error, test=develop

* fix gemv-int8 error when build with clang, test=develop
上级 e975c68a
......@@ -33,7 +33,7 @@ if(ARM_TARGET_LANG STREQUAL "gcc")
endif()
if(NOT DEFINED ANDROID_API_LEVEL)
set(ANDROID_API_LEVEL "22")
set(ANDROID_API_LEVEL "23")
endif()
# then check input arm abi
......
......@@ -628,7 +628,7 @@ bool gemv_int8_sdot(const int8_t* A,
"str s2, [%[out]] \n" /* save result */
: [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop)
: [out] "r"(ptr_out)
: "cc", "memory", "v0", "v8", "v9", "v18");
: "cc", "memory", "v0", "v1", "v2", "v8", "v9", "v18");
for (int i = 0; i < tail; ++i) {
ptr_out[0] += ptr_in[i] * ptr_w0[i];
}
......@@ -652,12 +652,15 @@ bool gemv_int8<float>(const int8_t* A,
const ARMContext* ctx) {
#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD)
if (ctx->has_dot()) {
gemv_int8_sdot<float>(A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
return gemv_int8_sdot<float>(
A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
} else {
gemv_int8_oth<float>(A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
return gemv_int8_oth<float>(
A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
}
#else
gemv_int8_oth<float>(A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
return gemv_int8_oth<float>(
A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
#endif
}
......@@ -675,13 +678,15 @@ bool gemv_int8<int8_t>(const int8_t* A,
const ARMContext* ctx) {
#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD)
if (ctx->has_dot()) {
gemv_int8_sdot<int8_t>(
return gemv_int8_sdot<int8_t>(
A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
} else {
gemv_int8_oth<int8_t>(A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
return gemv_int8_oth<int8_t>(
A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
}
#else
gemv_int8_oth<int8_t>(A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
return gemv_int8_oth<int8_t>(
A, x, y, transA, M, N, scale, is_bias, bias, is_relu);
#endif
}
......
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(gemm_int8_compute_test SRCS gemm_int8_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_compute_test SRCS conv_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_transpose_compute_test SRCS conv_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_int8_compute_test SRCS conv_int8_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(gemm_int8_compute_test SRCS gemm_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(gemv_int8_compute_test SRCS gemv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_compute_test SRCS conv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_transpose_compute_test SRCS conv_transpose_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_int8_compute_test SRCS conv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -225,7 +225,7 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
if (max_diff > 5e-4f) {
LOG(WARNING) << "basic result";
print_tensor(tout_basic);
LOG(WARNING) << "saber result";
LOG(WARNING) << "lite result";
print_tensor(*param.output);
Tensor tdiff;
tdiff.Resize(tout_basic.dims());
......
......@@ -346,7 +346,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
if (max_diff > 5e-5f) {
LOG(WARNING) << "basic result";
print_tensor(tout_basic_fp32);
LOG(WARNING) << "saber result";
LOG(WARNING) << "lite result";
print_tensor(*param_fp32_out.output);
Tensor tdiff;
tdiff.Resize(tout_basic_fp32.dims());
......@@ -407,7 +407,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
if (!check) {
LOG(WARNING) << "int8 basic result";
print_tensor(tout_basic_int8);
LOG(WARNING) << "int8 saber result";
LOG(WARNING) << "int8 lite result";
print_tensor(*param_int8_out.output);
LOG(WARNING) << "int8 diff tensor";
print_tensor(tdiff);
......
......@@ -218,7 +218,7 @@ void test_conv_transpose_fp32(const std::vector<DDim>& input_dims,
if (max_diff > 5e-4f) {
LOG(WARNING) << "basic result";
print_tensor(tout_basic);
LOG(WARNING) << "saber result";
LOG(WARNING) << "lite result";
print_tensor(*param.output);
Tensor tdiff;
tdiff.Resize(tout_basic.dims());
......
......@@ -258,7 +258,7 @@ bool test_gemm_int8(bool tra,
tensor_diff(tc_basic_fp32, tc_fp32, tdiff);
LOG(INFO) << "basic result: ";
print_tensor(tc_basic_fp32);
LOG(INFO) << "saber result: ";
LOG(INFO) << "lite result: ";
print_tensor(tc_fp32);
LOG(INFO) << "diff result: ";
print_tensor(tdiff);
......@@ -297,7 +297,7 @@ bool test_gemm_int8(bool tra,
if (!check) {
LOG(WARNING) << "int8 basic result";
print_tensor(tc_basic_int8);
LOG(WARNING) << "int8 saber result";
LOG(WARNING) << "int8 lite result";
print_tensor(tc_int8);
LOG(WARNING) << "int8 diff tensor";
print_tensor(tdiff);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/tests/utils/fill_data.h"
#include "lite/tests/utils/naive_math_impl.h"
#ifdef LITE_WITH_ARM
#include "lite/backends/arm/math/funcs.h"
#endif // LITE_WITH_ARM
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/tests/utils/tensor_utils.h"
#include "lite/tests/utils/timer.h"
// Short aliases for the project tensor and timer types used by the test below.
typedef paddle::lite::Tensor Tensor;
using paddle::lite::Timer;
// ---- Command-line flags -----------------------------------------------------
// Run-mode configuration: power_mode and threads are forwarded to
// test_gemv_int8, which hands them to ARMContext::SetRunMode.
DEFINE_int32(power_mode,
3,
"power mode: "
"0 for POWER_HIGH;"
"1 for POWER_LOW;"
"2 for POWER_FULL;"
"3 for NO_BIND");
DEFINE_int32(threads, 1, "threads num");
// Number of untimed warm-up calls and of timed calls per kernel variant.
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
// basic_test enables the exhaustive size/bias/relu/thread sweep;
// check_result enables the comparison against the naive reference.
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(check_result, true, "check the result");
// Problem description for the single "custom" test case.
DEFINE_int32(M, 512, "gemv: M");
DEFINE_int32(N, 512, "gemv: N");
DEFINE_bool(traA, false, "gemv: A transpose");
DEFINE_bool(flag_relu, false, "do relu");
DEFINE_bool(flag_bias, false, "with bias");
/**
 * \brief Run the ARM int8 GEMV kernel once per output type (fp32 and int8),
 *        log its timing and, if FLAGS_check_result is set, compare the
 *        outputs against a naive fp32 reference.
 *
 * \param tra       transpose flag for A; currently only logged (see the
 *                  NOTE(review) below).
 * \param m         number of rows of A / length of the output vector.
 * \param n         number of columns of A / length of the input vector.
 * \param has_bias  forwarded to both the reference and the kernel.
 * \param has_relu  forwarded to both the reference and the kernel.
 * \param cls       power mode, cast to paddle::lite_api::PowerMode.
 * \param ths       thread count passed to ARMContext::SetRunMode.
 * \return false if a kernel call reports failure or a comparison exceeds its
 *         tolerance; true otherwise (always true without LITE_WITH_ARM).
 */
bool test_gemv_int8(
    bool tra, int m, int n, bool has_bias, bool has_relu, int cls, int ths) {
  Tensor ta;
  Tensor tb;
  Tensor tc_int8;
  Tensor tc_fp32;
  Tensor tc_basic_int8;
  Tensor tc_basic_fp32;
  Tensor tbias;

  ta.Resize({m, n});
  tb.Resize({n});
  tc_int8.Resize({m});
  tc_fp32.Resize({m});
  tc_basic_int8.Resize({m});
  tc_basic_fp32.Resize({m});
  tbias.Resize({m});

  ta.set_precision(PRECISION(kInt8));
  tb.set_precision(PRECISION(kInt8));
  tc_int8.set_precision(PRECISION(kInt8));
  tc_fp32.set_precision(PRECISION(kFloat));
  tc_basic_int8.set_precision(PRECISION(kInt8));
  tc_basic_fp32.set_precision(PRECISION(kFloat));
  tbias.set_precision(PRECISION(kFloat));

  fill_tensor_rand(ta, -127, 127);
  fill_tensor_rand(tb, -127, 127);
  fill_tensor_rand(tbias, -1.f, 1.f);

  // Quantization scales: one per row of A, one for x, one for the output.
  std::vector<float> scale_a(static_cast<size_t>(m), 1.f / 127);
  std::vector<float> scale_b = {1.f / 127};
  std::vector<float> scale_c = {n / 127.f};
  // Merged per-row scales handed to the kernel for each output type.
  std::vector<float> scale_merge_fp32(static_cast<size_t>(m));
  std::vector<float> scale_merge_int8(static_cast<size_t>(m));
  for (int j = 0; j < m; ++j) {
    scale_merge_fp32[j] = scale_a[j] * scale_b[0];
    scale_merge_int8[j] = scale_merge_fp32[j] / scale_c[0];
  }

  LOG(INFO) << "gemv_int8 M: " << m << ", N: " << n
            << ", transA: " << (tra ? "true" : "false")
            << ", relu: " << (has_relu ? "true" : "false")
            << ", bias: " << (has_bias ? "true" : "false");

#ifdef LITE_WITH_ARM
  // NOTE(review): `tra` is only logged; both the reference and the kernel are
  // invoked with transA == false below. Confirm whether transposed A is meant
  // to be covered by this test.
  auto da = ta.mutable_data<int8_t>();
  auto db = tb.mutable_data<int8_t>();
  auto dc_int8 = tc_int8.mutable_data<int8_t>();
  auto dc_fp32 = tc_fp32.mutable_data<float>();
  auto dc_basic_int8 = tc_basic_int8.mutable_data<int8_t>();
  auto dc_basic_fp32 = tc_basic_fp32.mutable_data<float>();
  auto dbias = tbias.mutable_data<float>();

  if (FLAGS_check_result) {
    // Reference: dequantize the inputs, run the naive fp32 gemv, then
    // quantize the fp32 result to obtain the int8 reference.
    Tensor ta_fp32;
    Tensor tb_fp32;
    ta_fp32.Resize({m, n});
    ta_fp32.set_precision(PRECISION(kFloat));
    tb_fp32.Resize({n});
    tb_fp32.set_precision(PRECISION(kFloat));
    auto da_fp32 = ta_fp32.mutable_data<float>();
    auto db_fp32 = tb_fp32.mutable_data<float>();
    paddle::lite::arm::math::int8_to_fp32(
        da, da_fp32, scale_a.data(), 1, 1, ta.numel());
    paddle::lite::arm::math::int8_to_fp32(
        db, db_fp32, scale_b.data(), 1, 1, tb.numel());
    basic_gemv(m,
               n,
               da_fp32,
               db_fp32,
               dbias,
               dc_basic_fp32,
               1.f,
               0.f,
               false,
               has_bias,
               has_relu);
    paddle::lite::arm::math::fp32_to_int8(dc_basic_fp32,
                                          dc_basic_int8,
                                          scale_c.data(),
                                          1,
                                          1,
                                          tc_basic_fp32.numel());
  }

  Timer t0;
  //! compute
  double ops = 2.0 * m * n;
  std::unique_ptr<paddle::lite::KernelContext> ctx1(
      new paddle::lite::KernelContext);
  auto& ctx = ctx1->As<paddle::lite::ARMContext>();
  ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), ths);

  /// warmup
  for (int j = 0; j < FLAGS_warmup; ++j) {
    paddle::lite::arm::math::gemv_int8(da,
                                       db,
                                       dc_fp32,
                                       false,
                                       m,
                                       n,
                                       scale_merge_fp32.data(),
                                       has_bias,
                                       dbias,
                                       has_relu,
                                       &ctx);
  }

  // Tracks the bool returned by every timed kernel call. It is evaluated
  // after t0.end() so the check does not fall inside the timed region.
  bool kernel_ok = true;

  /// fp32 output compute
  for (int i = 0; i < FLAGS_repeats; ++i) {
    t0.start();
    const bool ret = paddle::lite::arm::math::gemv_int8(da,
                                                        db,
                                                        dc_fp32,
                                                        false,
                                                        m,
                                                        n,
                                                        scale_merge_fp32.data(),
                                                        has_bias,
                                                        dbias,
                                                        has_relu,
                                                        &ctx);
    t0.end();
    kernel_ok = kernel_ok && ret;
  }
  LOG(INFO) << "gemv_int8_fp32 output: M: " << m << ", N: " << n
            << ", power_mode: " << cls << ", threads: " << ths
            << ", GOPS: " << ops * 1e-9f
            << " GOPS, avg time: " << t0.get_average_ms()
            << " ms, min time: " << t0.get_min_time()
            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
            << " GOPs";

  /// int8 output compute
  // The bias is rescaled by the output scale before being handed to the
  // int8-output kernel.
  Tensor tbias_int8;
  tbias_int8.Resize(tbias.dims());
  tbias_int8.set_precision(PRECISION(kFloat));
  auto dbias_int8 = tbias_int8.mutable_data<float>();
  for (int l = 0; l < tbias_int8.numel(); ++l) {
    dbias_int8[l] = dbias[l] / scale_c[0];
  }
  t0.clear();
  for (int i = 0; i < FLAGS_repeats; ++i) {
    t0.start();
    const bool ret = paddle::lite::arm::math::gemv_int8(da,
                                                        db,
                                                        dc_int8,
                                                        false,
                                                        m,
                                                        n,
                                                        scale_merge_int8.data(),
                                                        has_bias,
                                                        dbias_int8,
                                                        has_relu,
                                                        &ctx);
    t0.end();
    kernel_ok = kernel_ok && ret;
  }
  LOG(INFO) << "gemv_int8_int8 output: M: " << m << ", N: " << n
            << ", power_mode: " << cls << ", threads: " << ths
            << ", GOPS: " << ops * 1e-9f
            << " GOPS, avg time: " << t0.get_average_ms()
            << " ms, min time: " << t0.get_min_time()
            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
            << " GOPs";

  if (!kernel_ok) {
    LOG(ERROR) << "gemv_int8 kernel returned false";
    return false;
  }

  if (FLAGS_check_result) {
    double max_ratio = 0;
    double max_diff = 0;
    /// fp32 result
    tensor_cmp_host(tc_basic_fp32, tc_fp32, max_ratio, max_diff);
    LOG(INFO) << "fp32 compare result, max diff: " << max_diff
              << ", max ratio: " << max_ratio;
    // Fails only when both the relative and the absolute error are too large.
    if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) {
      Tensor tdiff;
      tdiff.set_precision(PRECISION(kFloat));
      tdiff.Resize(tc_fp32.dims());
      tensor_diff(tc_basic_fp32, tc_fp32, tdiff);
      LOG(INFO) << "basic result: ";
      print_tensor(tc_basic_fp32);
      LOG(INFO) << "lite result: ";
      print_tensor(tc_fp32);
      LOG(INFO) << "diff result: ";
      print_tensor(tdiff);
      return false;
    }

    /// int8 result
    max_ratio = 0;
    max_diff = 0;
    tensor_cmp_host(tc_basic_int8, tc_int8, max_ratio, max_diff);
    LOG(INFO) << "int8 compare result, max diff: " << max_diff
              << ", max ratio: " << max_ratio;
    if (std::abs(max_ratio) > 1e-4f) {
      Tensor tdiff;
      tdiff.Resize(tc_int8.dims());
      tdiff.set_precision(PRECISION(kInt8));
      tensor_diff(tc_basic_int8, tc_int8, tdiff);
      auto ptr = tdiff.data<int8_t>();
      auto ptr_basic_fp32 = tc_basic_fp32.data<float>();
      // Tolerance: any element off by more than 1 fails immediately; elements
      // off by exactly 1 are counted and must stay below
      // max(10, 1% of the output size).
      int count = 0;
      bool check = true;
      for (int i = 0; i < tdiff.numel(); ++i) {
        if (abs(ptr[i]) > 1) {
          check = false;
          LOG(ERROR) << "basic float data: " << ptr_basic_fp32[i]
                     << ", after scale: " << ptr_basic_fp32[i] / scale_c[0];
          break;
        }
        if (ptr[i] != 0) {
          LOG(ERROR) << "basic float data: " << ptr_basic_fp32[i]
                     << ", after scale: " << ptr_basic_fp32[i] / scale_c[0];
          count += 1;
        }
      }
      check =
          check && count < std::max(10, static_cast<int>(0.01 * tdiff.numel()));
      if (!check) {
        LOG(WARNING) << "int8 basic result";
        print_tensor(tc_basic_int8);
        LOG(WARNING) << "int8 lite result";
        print_tensor(tc_int8);
        LOG(WARNING) << "int8 diff tensor";
        print_tensor(tdiff);
        return false;
      }
    }
  }
#endif
  return true;
}
// Exhaustive sweep over output size, input size, bias, relu and thread count.
// Only runs when --basic_test is set; the power mode comes from --power_mode.
TEST(TestLiteGemvInt8, gemv_prepacked_int8) {
  if (FLAGS_basic_test) {
#ifdef LITE_WITH_ARM
    paddle::lite::DeviceInfo::Init();
#endif
    LOG(INFO) << "run basic gemv_int8 test";
    for (auto m : {1, 3, 8, 32, 397}) {
      for (auto n : {1, 3, 13, 141, 512, 789}) {
        // Only the non-transposed case is swept.
        for (auto tra : {false}) {
          for (auto has_bias : {false, true}) {
            for (auto has_relu : {false, true}) {
              for (auto th : {1, 2, 4}) {
                const bool passed = test_gemv_int8(
                    tra, m, n, has_bias, has_relu, FLAGS_power_mode, th);
                if (passed) {
                  LOG(INFO) << "test m = " << m << ", n=" << n
                            << ", bias: " << (has_bias ? "true" : "false")
                            << ", relu: " << (has_relu ? "true" : "false")
                            << ", trans A: " << (tra ? "true" : "false")
                            << " passed\n";
                } else {
                  LOG(FATAL) << "test m = " << m << ", n=" << n
                             << ", bias: " << (has_bias ? "true" : "false")
                             << ", relu: " << (has_relu ? "true" : "false")
                             << ", trans A: " << (tra ? "true" : "false")
                             << " failed\n";
                }
              }
            }
          }
        }
      }
    }
  }
}
// Single gemv_int8 case whose shape and options are taken from the
// command-line flags (M, N, traA, flag_bias, flag_relu, power_mode, threads).
TEST(TestGemvInt8Custom, gemv_prepacked_int8_custom) {
#ifdef LITE_WITH_ARM
  paddle::lite::DeviceInfo::Init();
#endif
  const bool passed = test_gemv_int8(FLAGS_traA,
                                     FLAGS_M,
                                     FLAGS_N,
                                     FLAGS_flag_bias,
                                     FLAGS_flag_relu,
                                     FLAGS_power_mode,
                                     FLAGS_threads);
  // Report failure at FATAL severity first; the success line follows.
  if (!passed) {
    LOG(FATAL) << "test m = " << FLAGS_M << ", n=" << FLAGS_N
               << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias
               << ", relu: " << FLAGS_flag_relu << " failed!!";
  }
  LOG(INFO) << "test m = " << FLAGS_M << ", n=" << FLAGS_N
            << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias
            << ", relu: " << FLAGS_flag_relu << " passed!!";
}
......@@ -216,7 +216,7 @@ bool test_sgemm(bool tra,
print_tensor(tc_backup);
LOG(INFO) << "basic result: ";
print_tensor(tc_basic);
LOG(INFO) << "saber result: ";
LOG(INFO) << "lite result: ";
print_tensor(tc);
LOG(INFO) << "diff result: ";
print_tensor(tdiff);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册