diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h
index 2b76b0158fe06e8678208f6f98fcdb71f8d91e51..4c6842572e49daa283efa2d92bd43e4687d92e26 100644
--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -221,5 +221,9 @@ LOAD_FUSION_MATCHER(fusion_conv_bn);
 #ifdef ELEMENTWISESUB_OP
 LOAD_OP1(elementwise_sub, CPU)
 #endif
+#ifdef QUANT_OP
 LOAD_OP1(quantize, CPU);
+#endif
+#ifdef DEQUANT_OP
 LOAD_OP1(dequantize, CPU);
+#endif
diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp
index e7552d2602b31f9a5c10e3d81122babae8fcf1a8..11a1f0a53d4886e1a07d258b76b3827671471dca 100644
--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
@@ -135,11 +135,15 @@ static void quantize_round_to_even(const Tensor *input, const float scale,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+
+  #pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -156,12 +160,12 @@ static void quantize_round_to_even(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     float value = x[i] * scale;
@@ -187,11 +191,15 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
 #ifdef defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+
+  #pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -208,12 +216,12 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = trunc(x[i] * scale);
@@ -228,11 +236,15 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+
+  #pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -249,12 +261,12 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = round(x[i] * scale);
diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
index 0c01ef0072444479d2d2e2f7676b842d89e432ec..b6288380a04c71b3d6467f7f6648db046ae9acc9 100644
--- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
@@ -58,6 +58,7 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
   const float *input_data = input_x->data<float>();
   float *output_data = Out->mutable_data<float>();
   for (int i = 0; i < batch; ++i) {
+    #pragma omp parallel for
     for (int j = 0; j < channels; ++j) {
       size_t offset = (i * channels + j) * elementwise_num;
       const float *input = input_data + offset;
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index b489f33bab364bebdc81c87a7b6c200082f3e079..f7d29942224b51734cf62988ba8f271f1fa05bc3 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -25,7 +25,7 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
 #endif
 
-  paddle_mobile.SetThreadNum(1);
+  paddle_mobile.SetThreadNum(4);
   bool optimize = true;
   auto time1 = time();
   if (paddle_mobile.Load(g_googlenet, optimize)) {
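Note on the quantize_kernel.cpp hunks above: the original NEON loop advanced the shared x and y pointers at the end of every iteration, which becomes a data race once the loop carries #pragma omp parallel for. The patch therefore derives per-iteration offsets (local_x, local_y) from the induction variable and moves the pointer advance for the scalar tail out of the loop. Below is a minimal, self-contained sketch of that pattern in plain C++ (compile with -fopenmp; it runs serially without it). The function name is hypothetical and the scalar inner loop merely stands in for the NEON-intrinsic block of the real kernel.

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Quantize 'size' floats to int8 with round-to-nearest, 16 elements per block.
static void quantize_nearest_parallel(const float *x, float scale, size_t size,
                                      int8_t *y) {
  size_t loop = size >> 4;     // number of full 16-element blocks
  size_t remain = size & 0xF;  // leftover elements handled by the scalar tail
#pragma omp parallel for
  for (size_t i = 0; i < loop; ++i) {
    // Each iteration derives its own offsets from i; no shared pointer is
    // mutated, so the blocks can be processed on different threads safely.
    const float *local_x = x + (i << 4);
    int8_t *local_y = y + (i << 4);
    for (int k = 0; k < 16; ++k) {
      local_y[k] = static_cast<int8_t>(std::round(local_x[k] * scale));
    }
  }
  // Scalar tail, mirroring the code after the #endif in the kernel.
  x += (loop << 4);
  y += (loop << 4);
  for (size_t i = 0; i < remain; ++i) {
    y[i] = static_cast<int8_t>(std::round(x[i] * scale));
  }
}

int main() {
  std::vector<float> in(37, 1.7f);  // 2 full blocks plus a 5-element tail
  std::vector<int8_t> out(in.size());
  quantize_nearest_parallel(in.data(), 10.0f, in.size(), out.data());
  return 0;
}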