Replace the OpenMP pragma to optimize elementwise add when batch size > 1

3047c4b4 · hjchen2 · 33e1e2dd · 3047c4b4 · 3047c4b4 · 3047c4b4
5 changed files
--- a/src/common/enforce.h
+++ b/src/common/enforce.h
@@ -63,7 +63,12 @@ struct PaddleMobileException : public std::exception {
 #else
 #define PADDLE_MOBILE_THROW_EXCEPTION(...)

-#define PADDLE_MOBILE_ENFORCE(stat, ...)
+#define PADDLE_MOBILE_ENFORCE(stat, ...) \
+  {                                      \
+    if (stat) {                          \
+    } else {                             \
+    }                                    \
+  }

 #endif


--- a/src/operators/gru_op.cpp
+++ b/src/operators/gru_op.cpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef GRU_OP

 #include "operators/gru_op.h"
-#include <iostream>
 #include <vector>
 #include "common/enforce.h"


--- a/src/operators/kernel/arm/cast_kernel.cpp
+++ b/src/operators/kernel/arm/cast_kernel.cpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef CAST_OP

 #include <algorithm>
-#include <iostream>
 #include <vector>
 #include "framework/data_type.h"
 #include "operators/kernel/kernels.h"

--- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
@@ -26,18 +26,12 @@ namespace paddle_mobile {
 namespace operators {

 template <typename T>
-struct AddFunctor {
-  inline T operator()(T a, T b) const { return a + b; }
-};
-
-template <typename P>
-void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
-  const Tensor *input_x = param.InputX();
-  const Tensor *input_y = param.InputY();
-  Tensor *Out = param.Out();
-  Out->mutable_data<float>();
+inline void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
+  const framework::Tensor *input_x = param.InputX();
+  const framework::Tensor *input_y = param.InputY();
+  framework::Tensor *Out = param.Out();
  int axis = param.Axis();
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
  const auto &x_dims = input_x->dims();
  const auto &y_dims = input_y->dims();
  /// axis = -1 represent the last dimensions.
@@ -57,18 +51,20 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
  const float *bias_data = input_y->data<float>();
  const float *input_data = input_x->data<float>();
  float *output_data = Out->mutable_data<float>();
+
+  #pragma omp parallel for collapse(2)
  for (int i = 0; i < batch; ++i) {
-    #pragma omp parallel for
    for (int j = 0; j < channels; ++j) {
      size_t offset = (i * channels + j) * elementwise_num;
      const float *input = input_data + offset;
-      const float *bias = bias_data + j;
+      const float bias = bias_data[j];
      float *output = output_data + offset;
-
+      int remain = elementwise_num;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      int loop = elementwise_num >> 0x4;
-      int remain = elementwise_num & 0xF;
+      remain = elementwise_num & 0xF;
      for (int k = 0; k < loop; ++k) {
-        float32x4_t rb = vdupq_n_f32(*bias);
+        float32x4_t rb = vdupq_n_f32(bias);
        float32x4_t r0 = vld1q_f32(input);
        float32x4_t r1 = vld1q_f32(input + 4);
        float32x4_t r2 = vld1q_f32(input + 8);
@@ -84,15 +80,12 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
        input += 16;
        output += 16;
      }
+#endif
      for (int k = 0; k < remain; ++k) {
-        output[k] = input[k] + *bias;
+        output[k] = input[k] + bias;
      }
    }
  }
-#else
-  ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
-                                                 AddFunctor<float>(), Out);
-#endif
 }

 template class ElementwiseAddKernel<CPU, float>;

--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 int main(int argc, char* argv[]) {
  if (argc < 2) {
    std::cout << "Usage: ./test_benchmark feed_shape [thread_num] [use_fuse]\n"
-              << "feed_shape: input tensor shape, such as 1,3,224,224.\n"
+              << "feed_shape: input tensor shape, such as 3,224,224.\n"
              << "thread_num: optional int, threads count, default is 1.\n"
              << "use_fuse: optional bool, default is 0.\n";
    return 1;
@@ -52,7 +52,7 @@ int main(int argc, char* argv[]) {
      sscanf(feed_shape, "%d,%d,%d", &dims[1], &dims[2], &dims[3]);
    }
    std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", "
-              << dims[2] << ", " << dims[3] << "]\n";
+              << dims[2] << ", " << dims[3] << "]" << std::endl;
    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
    // warmup
    for (int i = 0; i < 10; ++i) {