From 2d62cf4e223be4dd438ffc46a7144840b0e5a961 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Mon, 29 Oct 2018 16:34:02 +0800
Subject: [PATCH] Optimize float im2col for general kernel and pad

---
 .../kernel/arm/dequantize_kernel.cpp |  24 +++--
 src/operators/math/im2col.cpp        | 101 ++++++++++++++----
 test/CMakeLists.txt                  |   4 +-
 test/framework/test_load_memory.cpp  |   3 +-
 test/net/test_benchmark.cpp          |  64 +++++++++++
 test/net/test_googlenet.cpp          |  10 +-
 6 files changed, 168 insertions(+), 38 deletions(-)
 create mode 100644 test/net/test_benchmark.cpp

diff --git a/src/operators/kernel/arm/dequantize_kernel.cpp b/src/operators/kernel/arm/dequantize_kernel.cpp
index cd6c8d17f1..64da460da1 100644
--- a/src/operators/kernel/arm/dequantize_kernel.cpp
+++ b/src/operators/kernel/arm/dequantize_kernel.cpp
@@ -44,11 +44,15 @@ void DequantizeKernel<CPU, float>::Compute(
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
   float32x4_t s = vdupq_n_f32(scale);
+
+  #pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    int32x4_t r0 = vld1q_s32(x);
-    int32x4_t r1 = vld1q_s32(x + 4);
-    int32x4_t r2 = vld1q_s32(x + 8);
-    int32x4_t r3 = vld1q_s32(x + 12);
+    const int32_t *local_x = x + (i << 4);
+    float *local_y = y + (i << 4);
+    int32x4_t r0 = vld1q_s32(local_x);
+    int32x4_t r1 = vld1q_s32(local_x + 4);
+    int32x4_t r2 = vld1q_s32(local_x + 8);
+    int32x4_t r3 = vld1q_s32(local_x + 12);
     float32x4_t f0 = vcvtq_f32_s32(r0);
     float32x4_t f1 = vcvtq_f32_s32(r1);
     float32x4_t f2 = vcvtq_f32_s32(r2);
@@ -57,14 +61,14 @@
     f1 = vmulq_f32(f1, s);
     f2 = vmulq_f32(f2, s);
     f3 = vmulq_f32(f3, s);
-    vst1q_f32(y, f0);
-    vst1q_f32(y + 4, f1);
-    vst1q_f32(y + 8, f2);
-    vst1q_f32(y + 12, f3);
-    x += 16;
-    y += 16;
+    vst1q_f32(local_y, f0);
+    vst1q_f32(local_y + 4, f1);
+    vst1q_f32(local_y + 8, f2);
+    vst1q_f32(local_y + 12, f3);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = x[i] * scale;
diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp
index 4c81e7fa3b..ecbf9de206 100644
--- a/src/operators/math/im2col.cpp
+++ b/src/operators/math/im2col.cpp
@@ -22,6 +22,70 @@
 namespace paddle_mobile {
 namespace operators {
 namespace math {
+void ExtractToImg(const float *im_data, float *col_data, const int im_height,
+                  const int im_width, const int col_height, const int col_width,
+                  const int padding_h, const int padding_w, const int stride_h,
+                  const int stride_w, const int kh, const int kw) {
+  int h = padding_h - kh;
+  int w = padding_w - kw;
+  int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0;
+  int col_start_width = w > 0 ? (w + stride_w - 1) / stride_w : 0;
+  int start_height = kh + col_start_height * stride_h - padding_h;
+  int start_width = kw + col_start_width * stride_w - padding_w;
+
+  int end_height = (col_height - col_start_height) * stride_h + start_height;
+  end_height = end_height > im_height ? im_height : end_height;
+  int end_width = (col_width - col_start_width) * stride_w + start_width;
+  end_width = end_width > im_width ? im_width : end_width;
+  int extract = (end_width - start_width + stride_w - 1) / stride_w;
+
+  im_data += start_height * im_width + start_width;
+  col_data += col_start_height * col_width + col_start_width;
+  #pragma omp parallel for
+  for (int i = start_height; i < end_height; i += stride_h) {
+    const float *local_im_data = im_data + (i - start_height) * im_width;
+    float *local_col_data = col_data + (i - start_height) / stride_h * col_width;
+    if (stride_w == 1) {
+      memcpy(local_col_data, local_im_data, extract * sizeof(float));
+    } else if (stride_w == 2) {
+      int s = 0;
+#if __ARM_NEON
+      for (; s < extract - 3; s += 4) {
+        float32x4x2_t img = vld2q_f32(local_im_data + s * 2);
+        vst1q_f32(local_col_data + s, img.val[0]);
+      }
+#endif
+      for (; s < extract; ++s) {
+        local_col_data[s] = local_im_data[s * 2];
+      }
+    } else if (stride_w == 3) {
+      int s = 0;
+#if __ARM_NEON
+      for (; s < extract - 3; s += 4) {
+        float32x4x3_t img = vld3q_f32(local_im_data + s * 3);
+        vst1q_f32(local_col_data + s, img.val[0]);
+      }
+#endif
+      for (; s < extract; ++s) {
+        local_col_data[s] = local_im_data[s * 3];
+      }
+    } else if (stride_w == 4) {
+      int s = 0;
+#if __ARM_NEON
+      for (; s < extract - 3; s += 4) {
+        float32x4x4_t img = vld4q_f32(local_im_data + s * 4);
+        vst1q_f32(local_col_data + s, img.val[0]);
+      }
+#endif
+      for (; s < extract; ++s) {
+        local_col_data[s] = local_im_data[s * 4];
+      }
+    } else {
+      PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1, 2, 3 and 4.");
+    }
+  }
+}
+
 /*
  * im = [input_channels, input_height, input_width]
  * col =
@@ -363,7 +427,22 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, float>::operator()(
       col_data += 9 * oosize;
       im_data += isize * isize;
     }
+  } else if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) {
+    // pad 0
+    memset(col_data, 0, col->numel() * sizeof(float));
+    for (int ic = 0; ic < im_channels; ++ic) {
+      for (int kh = 0; kh < filter_height; ++kh) {
+        for (int kw = 0; kw < filter_width; ++kw) {
+          ExtractToImg(im_data, col_data, im_height, im_width, col_height,
+                       col_width, padding[0], padding[1], stride[0], stride[1],
+                       kh, kw);
+          col_data += col_height * col_width;
+        }
+      }
+      im_data += im_height * im_width;
+    }
   } else {
+#endif
     for (int c = 0; c < channels_col; ++c) {
       int w_offset = c % filter_width;
       int h_offset = (c / filter_width) % filter_height;
       int c_im = c / (filter_width * filter_height);
       for (int h = 0; h < col_height; ++h) {
         int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
         for (int w = 0; w < col_width; ++w) {
           int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
           int col_idx = (c * col_height + h) * col_width + w;
           int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
 
           col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
                                im_col_idx < 0 || im_col_idx >= im_width)
                                   ? static_cast<float>(0)
                                   : im_data[im_idx];
         }
       }
     }
-  }
-#else
-  for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % filter_width;
-    int h_offset = (c / filter_width) % filter_height;
-    int c_im = c / (filter_width * filter_height);
-    for (int h = 0; h < col_height; ++h) {
-      int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-      for (int w = 0; w < col_width; ++w) {
-        int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-        int col_idx = (c * col_height + h) * col_width + w;
-        int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-
-        col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                             im_col_idx < 0 || im_col_idx >= im_width)
-                                ? static_cast<float>(0)
-                                : im_data[im_idx];
-      }
-    }
+#if __ARM_NEON
   }
 #endif
 }
@@ -489,7 +550,7 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, int8_t>::operator()(
   int channels_col = im_channels * filter_height * filter_width;
 
   const int8_t *im_data = im.data<int8_t>();
-  int8_t *col_data = col->data<int8_t>();
+  int8_t *col_data = col->mutable_data<int8_t>();
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) {
     // pad 0
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 2050b34d21..32ca8523ef 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -334,11 +334,13 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h)
     target_link_libraries(test-fssd paddle-mobile)
 
-    # gen test
     ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
     target_link_libraries(test-multi-process paddle-mobile)
 
+    # gen test benchmark
+    ADD_EXECUTABLE(test-benchmark net/test_benchmark.cpp)
+    target_link_libraries(test-benchmark paddle-mobile)
+
     #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 endif ()
diff --git a/test/framework/test_load_memory.cpp b/test/framework/test_load_memory.cpp
index 162dba3727..afab17d5e7 100644
--- a/test/framework/test_load_memory.cpp
+++ b/test/framework/test_load_memory.cpp
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include
 #include
-
 #include "../test_helper.h"
 #include "../test_include.h"
+
 static size_t ReadBuffer(const char *file_name, uint8_t **out) {
   FILE *fp;
   fp = fopen(file_name, "rb");
diff --git a/test/net/test_benchmark.cpp b/test/net/test_benchmark.cpp
new file mode 100644
index 0000000000..3378229d0f
--- /dev/null
+++ b/test/net/test_benchmark.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+int main(int argc, char* argv[]) {
+  if (argc < 4) {
+    std::cout << "Usage: " << std::endl
+              << "./test_benchmark fluid_model feed_shape thread_num [use_fuse]"
+              << std::endl;
+    std::cout << "use_fuse: optional, bool, default is 1\n";
+    return 1;
+  }
+  bool optimize = true;
+  char* fluid_model = argv[1];
+  char* feed_shape = argv[2];
+  int thread_num = atoi(argv[3]);
+  if (argc == 5) {
+    optimize = atoi(argv[4]);
+  }
+
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> paddle_mobile;
+  paddle_mobile.SetThreadNum(thread_num);
+  auto time1 = time();
+  if (paddle_mobile.Load(fluid_model, optimize)) {
+    auto time2 = time();
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms\n";
+    paddle_mobile::framework::Tensor input;
+    std::shared_ptr<paddle_mobile::framework::Tensor> output;
+    std::vector<int> dims{1, 3, 224, 224};
+    if (feed_shape) {
+      sscanf(feed_shape, "%d,%d,%d,%d", &dims[0], &dims[1], &dims[2], &dims[3]);
+    }
+    std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", "
+              << dims[2] << ", " << dims[3] << "]\n";
+    paddle_mobile::framework::DDim in_shape =
+        paddle_mobile::framework::make_ddim(dims);
+    SetupTensor<float>(&input, in_shape, 0.f, 255.f);
+    // warmup
+    for (int i = 0; i < 10; ++i) {
+      output = paddle_mobile.Predict(input);
+    }
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      output = paddle_mobile.Predict(input);
+    }
+    auto time4 = time();
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms\n";
+  }
+  return 0;
+}
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index f7d2994222..a61df31e39 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -20,22 +20,21 @@ int main() {
 #ifdef PADDLE_MOBILE_FPGA
   paddle_mobile::PaddleMobile<paddle_mobile::FPGA, paddle_mobile::Precision::FP32> paddle_mobile;
 #endif
-
 #ifdef PADDLE_MOBILE_CPU
   paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> paddle_mobile;
 #endif
-  paddle_mobile.SetThreadNum(4);
+  paddle_mobile.SetThreadNum(1);
   bool optimize = true;
   auto time1 = time();
   if (paddle_mobile.Load(g_googlenet, optimize)) {
     auto time2 = time();
-    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+    std::cout << "load cost: " << time_diff(time1, time2) << "ms\n";
     std::vector<float> input;
     std::vector<float> output;
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224, &input, dims);
-    // 预热十次
+    // warmup
     for (int i = 0; i < 10; ++i) {
       output = paddle_mobile.Predict(input, dims);
     }
@@ -45,8 +44,7 @@ int main() {
     }
     auto time4 = time();
 
-    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
-              << std::endl;
+    std::cout << "predict cost: " << time_diff(time3, time4) / 10 << "ms\n";
   }
   return 0;
 }
-- 
GitLab
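
Note on the NEON extraction path above (illustrative, not part of the patch): vld2q_f32, vld3q_f32 and vld4q_f32 de-interleave 8, 12 and 16 consecutive floats into lanes of four, so img.val[0] holds four input elements spaced stride_w apart and each vector iteration emits exactly four contiguous col values. A minimal scalar sketch of the stride_w == 2 case is given below for reference; the function name extract_row_stride2 and the standalone main are hypothetical helpers written for this note, not paddle-mobile API.

// Scalar equivalent of the stride_w == 2 branch of ExtractToImg (illustration
// only): out[s] = in[s * 2] is what
//   float32x4x2_t img = vld2q_f32(in + s * 2); vst1q_f32(out + s, img.val[0]);
// produces for four consecutive values of s.
#include <cstdio>

static void extract_row_stride2(const float *in, float *out, int extract) {
  for (int s = 0; s < extract; ++s) {
    out[s] = in[s * 2];
  }
}

int main() {
  float row[16];
  for (int i = 0; i < 16; ++i) {
    row[i] = static_cast<float>(i);  // 0, 1, 2, ..., 15
  }
  float col[8] = {0.f};
  extract_row_stride2(row, col, 8);
  for (int s = 0; s < 8; ++s) {
    printf("%.0f ", col[s]);  // prints: 0 2 4 6 8 10 12 14
  }
  printf("\n");
  return 0;
}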