From 2d62cf4e223be4dd438ffc46a7144840b0e5a961 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Mon, 29 Oct 2018 16:34:02 +0800
Subject: [PATCH] Optimize float im2col for general kernel and pad

---
 .../kernel/arm/dequantize_kernel.cpp |  24 +++--
 src/operators/math/im2col.cpp        | 101 ++++++++++++++----
 test/CMakeLists.txt                  |   4 +-
 test/framework/test_load_memory.cpp  |   3 +-
 test/net/test_benchmark.cpp          |  64 +++++++++++
 test/net/test_googlenet.cpp          |  10 +-
 6 files changed, 168 insertions(+), 38 deletions(-)
 create mode 100644 test/net/test_benchmark.cpp

diff --git a/src/operators/kernel/arm/dequantize_kernel.cpp b/src/operators/kernel/arm/dequantize_kernel.cpp
index cd6c8d17f1..64da460da1 100644
--- a/src/operators/kernel/arm/dequantize_kernel.cpp
+++ b/src/operators/kernel/arm/dequantize_kernel.cpp
@@ -44,11 +44,15 @@ void DequantizeKernel<CPU, float>::Compute(
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
   float32x4_t s = vdupq_n_f32(scale);
+
+  #pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    int32x4_t r0 = vld1q_s32(x);
-    int32x4_t r1 = vld1q_s32(x + 4);
-    int32x4_t r2 = vld1q_s32(x + 8);
-    int32x4_t r3 = vld1q_s32(x + 12);
+    const int32_t *local_x = x + (i << 4);
+    float *local_y = y + (i << 4);
+    int32x4_t r0 = vld1q_s32(local_x);
+    int32x4_t r1 = vld1q_s32(local_x + 4);
+    int32x4_t r2 = vld1q_s32(local_x + 8);
+    int32x4_t r3 = vld1q_s32(local_x + 12);
     float32x4_t f0 = vcvtq_f32_s32(r0);
     float32x4_t f1 = vcvtq_f32_s32(r1);
     float32x4_t f2 = vcvtq_f32_s32(r2);
@@ -57,14 +61,14 @@
     f1 = vmulq_f32(f1, s);
     f2 = vmulq_f32(f2, s);
     f3 = vmulq_f32(f3, s);
-    vst1q_f32(y, f0);
-    vst1q_f32(y + 4, f1);
-    vst1q_f32(y + 8, f2);
-    vst1q_f32(y + 12, f3);
-    x += 16;
-    y += 16;
+    vst1q_f32(local_y, f0);
+    vst1q_f32(local_y + 4, f1);
+    vst1q_f32(local_y + 8, f2);
+    vst1q_f32(local_y + 12, f3);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = x[i] * scale;
diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp
index 4c81e7fa3b..ecbf9de206 100644
--- a/src/operators/math/im2col.cpp
+++ b/src/operators/math/im2col.cpp
@@ -22,6 +22,70 @@
 namespace paddle_mobile {
 namespace operators {
 namespace math {
+void ExtractToImg(const float *im_data, float *col_data, const int im_height,
+                  const int im_width, const int col_height, const int col_width,
+                  const int padding_h, const int padding_w, const int stride_h,
+                  const int stride_w, const int kh, const int kw) {
+  int h = padding_h - kh;
+  int w = padding_w - kw;
+  int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0;
+  int col_start_width = w > 0 ? (w + stride_w - 1) / stride_w : 0;
+  int start_height = kh + col_start_height * stride_h - padding_h;
+  int start_width = kw + col_start_width * stride_w - padding_w;
+
+  int end_height = (col_height - col_start_height) * stride_h + start_height;
+  end_height = end_height > im_height ? im_height : end_height;
+  int end_width = (col_width - col_start_width) * stride_w + start_width;
+  end_width = end_width > im_width ? im_width : end_width;
+  int extract = (end_width - start_width + stride_w - 1) / stride_w;
+
+  im_data += start_height * im_width + start_width;
+  col_data += col_start_height * col_width + col_start_width;
+  #pragma omp parallel for
+  for (int i = start_height; i < end_height; i += stride_h) {
+    const float *local_im_data = im_data + (i - start_height) * im_width;
+    float *local_col_data = col_data + (i - start_height) / stride_h * col_width;
+    if (stride_w == 1) {
+      memcpy(local_col_data, local_im_data, extract * sizeof(float));
+    } else if (stride_w == 2) {
+      int s = 0;
+#if __ARM_NEON
+      for (; s < extract - 3; s += 4) {
+        float32x4x2_t img = vld2q_f32(local_im_data + s * 2);
+        vst1q_f32(local_col_data + s, img.val[0]);
+      }
+#endif
+      for (; s < extract; ++s) {
+        local_col_data[s] = local_im_data[s * 2];
+      }
+    } else if (stride_w == 3) {
+      int s = 0;
+#if __ARM_NEON
+      for (; s < extract - 3; s += 4) {
+        float32x4x3_t img = vld3q_f32(local_im_data + s * 3);
+        vst1q_f32(local_col_data + s, img.val[0]);
+      }
+#endif
+      for (; s < extract; ++s) {
+        local_col_data[s] = local_im_data[s * 3];
+      }
+    } else if (stride_w == 4) {
+      int s = 0;
+#if __ARM_NEON
+      for (; s < extract - 3; s += 4) {
+        float32x4x4_t img = vld4q_f32(local_im_data + s * 4);
+        vst1q_f32(local_col_data + s, img.val[0]);
+      }
+#endif
+      for (; s < extract; ++s) {
+        local_col_data[s] = local_im_data[s * 4];
+      }
+    } else {
+      PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1, 2, 3 and 4.");
+    }
+  }
+}
+
 /*
  * im = [input_channels, input_height, input_width]
  * col =
@@ -363,7 +427,22 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, float>::operator()(
       col_data += 9 * oosize;
       im_data += isize * isize;
     }
+  } else if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) {
+    // pad 0
+    memset(col_data, 0, col->numel() * sizeof(float));
+    for (int ic = 0; ic < im_channels; ++ic) {
+      for (int kh = 0; kh < filter_height; ++kh) {
+        for (int kw = 0; kw < filter_width; ++kw) {
+          ExtractToImg(im_data, col_data, im_height, im_width, col_height,
+                       col_width, padding[0], padding[1], stride[0], stride[1],
+                       kh, kw);
+          col_data += col_height * col_width;
+        }
+      }
+      im_data += im_height * im_width;
+    }
   } else {
+#endif
     for (int c = 0; c < channels_col; ++c) {
       int w_offset = c % filter_width;
       int h_offset = (c / filter_width) % filter_height;
       int c_im = c / (filter_width * filter_height);
       for (int h = 0; h < col_height; ++h) {
         int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
         for (int w = 0; w < col_width; ++w) {
           int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
           int col_idx = (c * col_height + h) * col_width + w;
           int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
 
           col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
                                im_col_idx < 0 || im_col_idx >= im_width)
                                   ? static_cast<float>(0)
                                   : im_data[im_idx];
         }
       }
     }
-  }
-#else
-  for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % filter_width;
-    int h_offset = (c / filter_width) % filter_height;
-    int c_im = c / (filter_width * filter_height);
-    for (int h = 0; h < col_height; ++h) {
-      int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-      for (int w = 0; w < col_width; ++w) {
-        int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-        int col_idx = (c * col_height + h) * col_width + w;
-        int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-
-        col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                             im_col_idx < 0 || im_col_idx >= im_width)
-                                ? static_cast<float>(0)
-                                : im_data[im_idx];
-      }
-    }
+#if __ARM_NEON
   }
 #endif
 }
@@ -489,7 +550,7 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, int8_t>::operator()(
   int channels_col = im_channels * filter_height * filter_width;
 
   const int8_t *im_data = im.data<int8_t>();
-  int8_t *col_data = col->data<int8_t>();
+  int8_t *col_data = col->mutable_data<int8_t>();
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) {
     // pad 0
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 2050b34d21..32ca8523ef 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -334,11 +334,13 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h)
     target_link_libraries(test-fssd paddle-mobile)
 
-    # gen test
     ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
     target_link_libraries(test-multi-process paddle-mobile)
 
+    # gen test benchmark
+    ADD_EXECUTABLE(test-benchmark net/test_benchmark.cpp)
+    target_link_libraries(test-benchmark paddle-mobile)
+
     #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 endif ()
diff --git a/test/framework/test_load_memory.cpp b/test/framework/test_load_memory.cpp
index 162dba3727..afab17d5e7 100644
--- a/test/framework/test_load_memory.cpp
+++ b/test/framework/test_load_memory.cpp
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include
 #include
-
 #include "../test_helper.h"
 #include "../test_include.h"
+
 static size_t ReadBuffer(const char *file_name, uint8_t **out) {
   FILE *fp;
   fp = fopen(file_name, "rb");
diff --git a/test/net/test_benchmark.cpp b/test/net/test_benchmark.cpp
new file mode 100644
index 0000000000..3378229d0f
--- /dev/null
+++ b/test/net/test_benchmark.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+int main(int argc, char* argv[]) {
+  if (argc < 4) {
+    std::cout << "Usage: " << std::endl
+              << "./test_benchmark fluid_model feed_shape thread_num [use_fuse]"
+              << std::endl;
+    std::cout << "use_fuse: optional, bool, default is 1\n";
+    return 1;
+  }
+  bool optimize = true;
+  char* fluid_model = argv[1];
+  char* feed_shape = argv[2];
+  int thread_num = atoi(argv[3]);
+  if (argc == 5) {
+    optimize = atoi(argv[4]);
+  }
+
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> paddle_mobile;
+  paddle_mobile.SetThreadNum(thread_num);
+  auto time1 = time();
+  if (paddle_mobile.Load(fluid_model, optimize)) {
+    auto time2 = time();
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms\n";
+    paddle_mobile::framework::Tensor input;
+    std::shared_ptr<paddle_mobile::framework::Tensor> output;
+    std::vector<int> dims{1, 3, 224, 224};
+    if (feed_shape) {
+      sscanf(feed_shape, "%d,%d,%d,%d", &dims[0], &dims[1], &dims[2], &dims[3]);
+    }
+    std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", "
+              << dims[2] << ", " << dims[3] << "]\n";
+    paddle_mobile::framework::DDim in_shape =
+        paddle_mobile::framework::make_ddim(dims);
+    SetupTensor<float>(&input, in_shape, 0.f, 255.f);
+    // warmup
+    for (int i = 0; i < 10; ++i) {
+      output = paddle_mobile.Predict(input);
+    }
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      output = paddle_mobile.Predict(input);
+    }
+    auto time4 = time();
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms\n";
+  }
+  return 0;
+}
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index f7d2994222..a61df31e39 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -20,22 +20,21 @@ int main() {
 #ifdef PADDLE_MOBILE_FPGA
   paddle_mobile::PaddleMobile<paddle_mobile::FPGA, paddle_mobile::Precision::FP32> paddle_mobile;
 #endif
-
 #ifdef PADDLE_MOBILE_CPU
   paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> paddle_mobile;
 #endif
-  paddle_mobile.SetThreadNum(4);
+  paddle_mobile.SetThreadNum(1);
   bool optimize = true;
   auto time1 = time();
   if (paddle_mobile.Load(g_googlenet, optimize)) {
     auto time2 = time();
-    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+    std::cout << "load cost: " << time_diff(time1, time2) << "ms\n";
     std::vector<float> input;
     std::vector<float> output;
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224, &input, dims);
-    // 预热十次
+    // warmup
     for (int i = 0; i < 10; ++i) {
       output = paddle_mobile.Predict(input, dims);
     }
@@ -45,8 +44,7 @@ int main() {
     }
     auto time4 = time();
 
-    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
-              << std::endl;
+    std::cout << "predict cost: " << time_diff(time3, time4) / 10 << "ms\n";
   }
   return 0;
 }
-- 
GitLab
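
Note on the NEON extraction path above (illustrative, not part of the patch): vld2q_f32, vld3q_f32 and vld4q_f32 de-interleave 8, 12 and 16 consecutive floats into lanes of four, so img.val[0] holds four input elements spaced stride_w apart and each vector iteration emits exactly four contiguous col values. A minimal scalar sketch of the stride_w == 2 case is given below for reference; the function name extract_row_stride2 and the standalone main are hypothetical helpers written for this note, not paddle-mobile API.

// Scalar equivalent of the stride_w == 2 branch of ExtractToImg (illustration
// only): out[s] = in[s * 2] is what
//   float32x4x2_t img = vld2q_f32(in + s * 2); vst1q_f32(out + s, img.val[0]);
// produces for four consecutive values of s.
#include <cstdio>

static void extract_row_stride2(const float *in, float *out, int extract) {
  for (int s = 0; s < extract; ++s) {
    out[s] = in[s * 2];
  }
}

int main() {
  float row[16];
  for (int i = 0; i < 16; ++i) {
    row[i] = static_cast<float>(i);  // 0, 1, 2, ..., 15
  }
  float col[8] = {0.f};
  extract_row_stride2(row, col, 8);
  for (int s = 0; s < 8; ++s) {
    printf("%.0f ", col[s]);  // prints: 0 2 4 6 8 10 12 14
  }
  printf("\n");
  return 0;
}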