Merge pull request #299 from Eclipsess/develop

fix #298 optimize some ops

Merge pull request #299 from Eclipsess/develop
fix #298 optimize some ops
7cc78945 · eclipsycn · GitHub · 7c9dee59 · 215dbff9 · 7cc78945
5 changed files
--- a/src/operators/kernel/arm/batchnorm_kernel.cpp
+++ b/src/operators/kernel/arm/batchnorm_kernel.cpp
@@ -71,8 +71,9 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
    {
      for (int n = 0; n < N; n++) {
        for (int h = 0; h < H; h++) {
+          int tmp_index = n * stride0 + i * stride1 + h * stride2;
          for (int w = 0; w < W; w++) {
-            int index = n * stride0 + i * stride1 + h * stride2 + w;
+            int index = tmp_index + w;
            out_ptr[index] =
                input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
          }

--- a/src/operators/kernel/arm/concat_kernel.cpp
+++ b/src/operators/kernel/arm/concat_kernel.cpp
@@ -51,38 +51,6 @@ class ConcatFunctor {
    }
  }
 };
-template <typename T>
-void StridedNumelCopyWithAxis(int64_t axis, T *dst,
-                              const framework::DDim &dst_stride_numel,
-                              const T *src,
-                              const framework::DDim &src_stride_numel,
-                              int64_t size) {
-  int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
-  int64_t src_after = src_stride_numel[axis];
-  int64_t dst_after = dst_stride_numel[axis];
-
-  /// "src and dst tensor should have the same dims size."
-  assert(src_stride_numel.size() == dst_stride_numel.size());
-
-  for (int64_t i = 0; i < axis; ++i) {
-    if (i < axis) {
-      /// src and dst should have the same elements
-      /// except the specified axis.
-      assert(src_stride_numel[i] / src_stride_numel[axis] ==
-             dst_stride_numel[i] / dst_stride_numel[axis]);
-
-    } else if (i == axis) {
-      continue;
-    } else {
-      /// "src and dst should have the same elements "
-      ///         "except the specified axis."
-      assert(src_stride_numel[i] == dst_stride_numel[i]);
-    }
-  }
-  for (int64_t i = 0; i < before; ++i) {
-    memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
-  }
-}

 template <>
 void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
@@ -97,10 +65,13 @@ void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
    for (auto *in : inputs) {
      auto in_stride = framework::stride_numel(in->dims());
      auto out_stride = framework::stride_numel(out->dims());
-      StridedNumelCopyWithAxis<float>(axis, out->data<float>() + output_offset,
-                                      out_stride, in->data<float>(), in_stride,
-                                      in_stride[axis]);
-      output_offset += in_stride[axis];
+      auto dst = out->data<float>() + output_offset;
+      auto src = in->data<float>();
+      PADDLE_MOBILE_ENFORCE(
+          in_stride.size() == out_stride.size(),
+          "src and dst tensor should have the same dims size.");
+      memory::Copy(dst, src, sizeof(float) * in_stride[0]);
+      output_offset += in_stride[0];
    }
  } else {
    std::vector<framework::Tensor> inputs_concat(inputs.size());

--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
@@ -15,19 +15,30 @@ limitations under the License. */
 #pragma once

 #include "operators/kernel/relu_kernel.h"
+#include <operators/math/transform.h>

 namespace paddle_mobile {
 namespace operators {

+template <typename T>
+struct ReluFunctor {  // unary functor; ReluKernel<CPU, float>::Compute hands it to math::Transform
+  inline T operator()(T in) const { return in > 0 ? in : 0; }  // yields in when in > 0, otherwise 0
+};
+
 template <>
 void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
  const auto *input_x = param.InputX();
  auto *input_x_ptr = input_x->data<float>();
  auto *out = param.Out();
  auto *out_ptr = out->mutable_data<float>();
-  for (int i = 0; i < input_x->numel(); i++) {
-    out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0;
-  }
+
+  ReluFunctor<float> func_;  // per-element rule: x > 0 ? x : 0
+  math::Transform trans;  // NOTE(review): assumed to behave like std::transform — confirm in operators/math/transform.h
+  trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);  // input range [begin, begin + numel) -> out_ptr; stands in for the loop kept commented out below
+
+  //  for (int i = 0; i < input_x->numel(); i++) {
+  //    out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0;
+  //  }
 }
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -42,12 +42,13 @@ struct LRNFunctor {
        for (int index = start; index < end; index++) {
          int channel = b + index;
          if (channel >= 0 && channel < C) {
+            int tmp_u = a * stride0 + b * stride1;
+            int tmp_i = a * stride0 + channel * stride1;
            for (int c = 0; c < H; c++) {
              for (int d = 0; d < W; d++) {
-                int u = a * stride0 + b * stride1 + c * stride2 + d;
-
-                int i = a * stride0 + channel * stride1 + c * stride2 + d;
-
+                int tmp = c * stride2 + d;
+                int u = tmp_u + tmp;
+                int i = tmp_i + tmp;
                sqr_buffer_ptr[u] += alpha * input_ptr[i] * input_ptr[i];
              }
            }

--- a/src/operators/math/elementwise_op_function.h
+++ b/src/operators/math/elementwise_op_function.h
@@ -67,35 +67,6 @@ inline void trim_trailing_singular_dims(framework::DDim *dims) {
  }
 }

-template <typename T>
-class RowwiseTransformIterator {
- public:
-  RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
-
-  RowwiseTransformIterator<T> &operator++() {
-    ++i_;
-    if (UNLIKELY(i_ == n_)) {
-      i_ = 0;
-    }
-    return *this;
-  }
-
-  bool operator==(const RowwiseTransformIterator<T> &rhs) const {
-    return (ptr_ + i_) == &(*rhs);
-  }
-
-  bool operator!=(const RowwiseTransformIterator<T> &rhs) const {
-    return (ptr_ + i_) != &(*rhs);
-  }
-
-  const T &operator*() { return ptr_[i_]; }
-
- private:
-  const T *ptr_;
-  int i_;
-  int64_t n_;
-};
-
 /// (4,20,2)+(20,): (20,) just as (20,1), when move 2 strides in last
 /// dimension
 /// in (4,20,2) is 2 ,
@@ -107,15 +78,23 @@ class MidWiseTransformIterator {
      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}

  MidWiseTransformIterator<T> &operator++() {
-    ++j_;
-    if (UNLIKELY(j_ == post_)) {
+    if (post_ != 1) {  // general case: i_ advances once per post_ increments of j_
+      ++j_;
+      if (UNLIKELY(j_ == post_)) {
+        ++i_;
+        j_ = 0;
+        if (UNLIKELY(i_ == n_)) {  // wrap i_ back to 0 once it reaches n_
+          i_ = 0;
+        }
+      }
+      return *this;
+    } else {  // post_ == 1: i_ advances on every increment and j_ is left untouched
      ++i_;
-      j_ = 0;
      if (UNLIKELY(i_ == n_)) {
        i_ = 0;
      }
+      return *this;
    }
-    return *this;
  }

  bool operator==(const MidWiseTransformIterator<T> &rhs) const {
@@ -153,11 +132,6 @@ class TransformFunctor {
    trans(x_, x_ + nx_, y_, z_, func_);
  }

-  inline void RunRowWise(int n, int pre) const {
-    math::Transform trans;
-    trans(x_, x_ + nx_, RowwiseTransformIterator<T>(y_, n), z_, func_);
-  }
-
  inline void RunMidWise(int n, int pre, int post) const {
    math::Transform trans;
    trans(x_, x_ + nx_, MidWiseTransformIterator<T>(y_, n, post), z_, func_);
@@ -179,31 +153,25 @@ void ElementwiseComputeEx(const framework::Tensor *x,

  auto x_dims = x->dims();
  auto y_dims = y->dims();
-  // PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-  //                  "Rank of first input must >= rank of second
-  //                  input.");
+  PADDLE_MOBILE_ENFORCE(x_dims.size() >= y_dims.size(),
+                        "Rank of first input must >= rank of second input.");

  if (x_dims == y_dims) {
    functor.Run();
    return;
  }

-  /// axis = -1 represent the last dimension.
+  /// axis = -1 aligns y with the trailing dimensions of x.
  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  // PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-  //               "Axis should be in range [0, x_dims)");
+  PADDLE_MOBILE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                        "Axis should be in range [0, x_dims)");
  trim_trailing_singular_dims(&y_dims);
  axis = (y_dims.size() == 0) ? x_dims.size() : axis;

  int pre, n, post;
  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
-  if (post == 1) {
-    functor.RunRowWise(n, pre);
-    return;
-  } else {
-    functor.RunMidWise(n, pre, post);
-    return;
-  }
+
+  functor.RunMidWise(n, pre, post);
 }

 }  // namespace operators