Commit c0ed3a66 authored by xiebaiyuan, committed by GitHub

Merge pull request #1208 from hjchen2/dev-latest

Fix load quant and dequant ops
@@ -224,5 +224,9 @@ LOAD_FUSION_MATCHER(fusion_conv_bn);
 #ifdef ELEMENTWISESUB_OP
 LOAD_OP1(elementwise_sub, CPU)
 #endif
+#ifdef QUANT_OP
+LOAD_OP1(quantize, CPU);
+#endif
+#ifdef DEQUANT_OP
+LOAD_OP1(dequantize, CPU);
+#endif
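For context: the bug this hunk fixes is that the quantize and dequantize operators were compiled in (under QUANT_OP/DEQUANT_OP) but never registered with the loader, so a model containing them could not be deserialized. The sketch below illustrates the general static-registration pattern that LOAD_OP-style macros rely on; OpRegistry, DEFINE_OP, and TouchOp_ are illustrative names, not paddle-mobile's actual internals.

    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>

    // Hypothetical registry mapping op type names to factory callbacks.
    using OpFactory = std::function<void()>;
    static std::map<std::string, OpFactory> &OpRegistry() {
      static std::map<std::string, OpFactory> registry;
      return registry;
    }

    // DEFINE_OP emits a "touch" function whose first call registers the op.
    #define DEFINE_OP(type)                                             \
      int TouchOp_##type() {                                            \
        static bool registered = [] {                                   \
          OpRegistry()[#type] = [] { std::cout << #type << " ran\n"; }; \
          return true;                                                  \
        }();                                                            \
        return registered ? 1 : 0;                                      \
      }

    // A LOAD_OP1-style macro forces the touch call at program startup, so
    // the linker cannot discard the op's registration object file.
    #define LOAD_OP1(type, device) \
      static int loaded_##type##_##device = TouchOp_##type();

    DEFINE_OP(quantize)
    DEFINE_OP(dequantize)
    LOAD_OP1(quantize, CPU);
    LOAD_OP1(dequantize, CPU);

    int main() {
      for (auto &entry : OpRegistry()) entry.second();  // both ops visible
      return 0;
    }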
@@ -135,11 +135,15 @@ static void quantize_round_to_even(const Tensor *input, const float scale,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+#pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -156,12 +160,12 @@ static void quantize_round_to_even(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     float value = x[i] * scale;
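The scalar tail of quantize_round_to_even (partially elided in this diff) rounds halfway cases to the nearest even integer. For reference, a standalone sketch of that rounding rule; QuantizeRoundToEven is an illustrative helper, not code from the patch, and saturation to the int8 range is assumed to be handled by the scale choice, as in the vectorized path above.

    #include <cmath>
    #include <cstdint>

    // Round-half-to-even quantization of one value. Equivalent to
    // std::nearbyint under the default FE_TONEAREST rounding mode.
    static int8_t QuantizeRoundToEven(float v, float scale) {
      float value = v * scale;
      float rounded = std::floor(value);
      float diff = value - rounded;
      if (diff > 0.5f) {
        rounded += 1.0f;  // strictly closer to the upper neighbor
      } else if (diff == 0.5f && std::fmod(rounded, 2.0f) != 0.0f) {
        rounded += 1.0f;  // exact tie: pick the even neighbor
      }
      return static_cast<int8_t>(rounded);
    }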
@@ -187,11 +191,15 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+#pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -208,12 +216,12 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = trunc(x[i] * scale);
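The same restructuring appears in all three quantize kernels, and it is what makes the added #pragma omp parallel for legal: the old body advanced x and y inside the loop, which assumes sequential execution, while the new body derives local_x/local_y from the loop index so iterations are independent, and the base pointers are advanced once, after the loop, for the scalar tail. A condensed sketch of the pattern, with a scalar inner loop standing in for the NEON intrinsics (ScaleBlocks is a made-up stand-in, not a function from the patch):

    #include <cstddef>
    #include <cstdint>

    void ScaleBlocks(const float *x, int8_t *y, size_t size, float scale) {
      size_t loop = size >> 4;     // number of full 16-element blocks
      size_t remain = size & 0xF;  // leftover elements
    #pragma omp parallel for
      for (size_t i = 0; i < loop; ++i) {
        // Each iteration computes its own offsets; no shared mutable state.
        const float *local_x = x + (i << 4);
        int8_t *local_y = y + (i << 4);
        for (int k = 0; k < 16; ++k) {
          local_y[k] = static_cast<int8_t>(local_x[k] * scale);
        }
      }
      // Advance past the parallel region once, then finish serially.
      x += (loop << 4);
      y += (loop << 4);
      for (size_t i = 0; i < remain; ++i) {
        y[i] = static_cast<int8_t>(x[i] * scale);
      }
    }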
@@ -228,11 +236,15 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+#pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -249,12 +261,12 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = round(x[i] * scale);
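The three kernels differ only in the rounding rule applied after scaling: round-to-even picks the even neighbor on ties, trunc discards the fraction (rounds toward zero), and round rounds halves away from zero. So with scale = 1.0, an input of 2.5 maps to 2, 2, and 3 respectively, and -2.5 maps to -2, -2, and -3. A small self-contained check of those claims:

    #include <cmath>
    #include <cstdio>

    int main() {
      const float inputs[] = {2.5f, -2.5f, 1.5f, -0.7f};
      for (float v : inputs) {
        // nearbyint: round half to even (default FE_TONEAREST mode);
        // trunc: round toward zero; round: round half away from zero.
        std::printf("%5.1f -> even: %3.0f  zero: %3.0f  nearest: %3.0f\n",
                    v, std::nearbyint(v), std::trunc(v), std::round(v));
      }
      return 0;
    }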
@@ -58,6 +58,7 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
   const float *input_data = input_x->data<float>();
   float *output_data = Out->mutable_data<float>();
   for (int i = 0; i < batch; ++i) {
+#pragma omp parallel for
     for (int j = 0; j < channels; ++j) {
       size_t offset = (i * channels + j) * elementwise_num;
       const float *input = input_data + offset;
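Parallelizing the channel loop here is safe because each (batch, channel) pair addresses a disjoint slice of the output. A sketch of the surrounding loop structure, under the shape assumption implied by the indexing in the hunk (input x: [batch, channels, elementwise_num], operand y broadcast with one value per channel); ElementwiseAddBroadcast is a hypothetical stand-in for the elided body:

    #include <cstddef>

    void ElementwiseAddBroadcast(const float *x, const float *y, float *out,
                                 int batch, int channels,
                                 int elementwise_num) {
      for (int i = 0; i < batch; ++i) {
        // Each (i, j) slice touches a disjoint output range.
    #pragma omp parallel for
        for (int j = 0; j < channels; ++j) {
          size_t offset =
              static_cast<size_t>(i * channels + j) * elementwise_num;
          const float *input = x + offset;
          float *output = out + offset;
          float bias = y[j];  // one value broadcast across the slice
          for (int k = 0; k < elementwise_num; ++k) {
            output[k] = input[k] + bias;
          }
        }
      }
    }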
@@ -25,8 +25,8 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
 #endif
-  paddle_mobile.SetThreadNum(1);
-  bool optimize = false;
+  paddle_mobile.SetThreadNum(4);
+  bool optimize = true;
   auto time1 = time();
   if (paddle_mobile.Load(g_googlenet, optimize)) {
     auto time2 = time();
@@ -35,10 +35,10 @@ int main() {
     std::vector<float> output;
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224, &input, dims);
-    //  // warm up ten times
-    //  for (int i = 0; i < 10; ++i) {
-    //    output = paddle_mobile.Predict(input, dims);
-    //  }
+    // warm up ten times
+    for (int i = 0; i < 10; ++i) {
+      output = paddle_mobile.Predict(input, dims);
+    }
     auto time3 = time();
     for (int i = 0; i < 10; ++i) {
       output = paddle_mobile.Predict(input, dims);
@@ -47,9 +47,6 @@ int main() {
     std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
               << std::endl;
-    for (int i = 0; i < output.size(); ++i) {
-      DLOG << "result[" << i << "] = " << output[i];
-    }
   }
   return 0;
 }
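The benchmark now warms up with ten predictions before timing ten more, so one-time costs (weight paging, allocator warm-up, thread-pool spin-up) do not distort the reported per-inference latency. time() and time_diff() are helpers from the test harness; a plausible std::chrono equivalent for reproducing the measurement outside the repo (hypothetical, not the harness's actual code):

    #include <chrono>

    using Clock = std::chrono::high_resolution_clock;

    inline Clock::time_point time() { return Clock::now(); }

    // Milliseconds between two time points; dividing by the iteration
    // count, as the test does, yields average per-inference latency.
    inline double time_diff(Clock::time_point start, Clock::time_point end) {
      return std::chrono::duration<double, std::milli>(end - start).count();
    }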