[ARM] add elementwise mod operator, test=develop (#3726)

4e9852e7 · yiicy · GitHub · 07ae2599 · 4e9852e7 · 4e9852e7
6 changed files
--- a/lite/backends/arm/math/elementwise.cc
+++ b/lite/backends/arm/math/elementwise.cc
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "lite/backends/arm/math/elementwise.h"
+#include <math.h>
 #include <algorithm>
 #include "lite/backends/arm/math/funcs.h"

@@ -1541,6 +1541,87 @@ void elementwise_div_relu_broadcast<float>(const float* dinx,
  }
 }

+// Broadcast modulo: treats dinx/dout as a [batch, channels, num] array and
+// writes dout[i][j][k] = dinx[i][j][k] % diny[j] (one divisor per channel).
+// T must support the built-in %, whose behavior is undefined when the
+// divisor is zero, so callers must guarantee that diny has no zero entry.
+template <typename T>
+void elementwise_mod_broadcast(
+    const T* dinx, const T* diny, T* dout, int batch, int channels, int num) {
+#pragma omp parallel for collapse(2)
+  for (int i = 0; i < batch; ++i) {
+    for (int j = 0; j < channels; ++j) {
+      // 64-bit offset: batch * channels * num can exceed the range of int.
+      const int64_t offset = (static_cast<int64_t>(i) * channels + j) * num;
+      const T* din_ptr = dinx + offset;
+      const T diny_data = diny[j];
+      T* dout_ptr = dout + offset;
+      // Four elements per iteration, then a tail of num % 4 elements. The
+      // `register` specifier was dropped: it is ill-formed since C++17.
+      const int cnt = num >> 2;
+      const int remain = num % 4;
+      for (int k = 0; k < cnt; ++k) {
+        dout_ptr[0] = din_ptr[0] % diny_data;
+        dout_ptr[1] = din_ptr[1] % diny_data;
+        dout_ptr[2] = din_ptr[2] % diny_data;
+        dout_ptr[3] = din_ptr[3] % diny_data;
+        din_ptr += 4;
+        dout_ptr += 4;
+      }
+      for (int p = 0; p < remain; ++p) {
+        *dout_ptr++ = *din_ptr++ % diny_data;
+      }
+    }
+  }
+}
+
+// Elementwise modulo: dout[i] = dinx[i] % diny[i] for every i in [0, num).
+// T must support the built-in %, whose behavior is undefined when the
+// divisor is zero, so callers must guarantee that diny has no zero entry.
+template <typename T>
+void elementwise_mod(const T* dinx, const T* diny, T* dout, int num) {
+  const int cnt = num >> 2;
+  const int remain = num % 4;
+  // Each iteration handles one independent group of four elements.
+#pragma omp parallel for
+  for (int i = 0; i < cnt; i++) {
+    const T* dinx_ptr = dinx + (i << 2);
+    const T* diny_ptr = diny + (i << 2);
+    T* dout_ptr = dout + (i << 2);
+    // Plain locals replace the `register` ones: the specifier is deprecated
+    // since C++11 and no longer valid C++ since C++17.
+    const T dinx0 = dinx_ptr[0];
+    const T dinx1 = dinx_ptr[1];
+    const T dinx2 = dinx_ptr[2];
+    const T dinx3 = dinx_ptr[3];
+    const T diny0 = diny_ptr[0];
+    const T diny1 = diny_ptr[1];
+    const T diny2 = diny_ptr[2];
+    const T diny3 = diny_ptr[3];
+    dout_ptr[0] = dinx0 % diny0;
+    dout_ptr[1] = dinx1 % diny1;
+    dout_ptr[2] = dinx2 % diny2;
+    dout_ptr[3] = dinx3 % diny3;
+  }
+  // Tail: the remaining num % 4 elements after the last full group.
+  const int done = cnt * 4;
+  for (int i = 0; i < remain; i++) {
+    dout[done + i] = dinx[done + i] % diny[done + i];
+  }
+}
+
+template void elementwise_mod<int64_t>(const int64_t* dinx,
+                                       const int64_t* diny,
+                                       int64_t* dout,
+                                       int num);  // explicit instantiation
+
+template void elementwise_mod_broadcast<int64_t>(const int64_t* dinx,
+                                                 const int64_t* diny,
+                                                 int64_t* dout,
+                                                 int batch,
+                                                 int channels,
+                                                 int num);  // explicit inst.
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite

--- a/lite/backends/arm/math/elementwise.h
+++ b/lite/backends/arm/math/elementwise.h
@@ -253,6 +253,13 @@ template <typename T>
 void elementwise_div_relu_broadcast(
    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);

+template <typename T>  // dout[i] = dinx[i] % diny[i] for i in [0, num)
+void elementwise_mod(const T* dinx, const T* diny, T* dout, int num);
+
+template <typename T>  // dout[i][j][k] = dinx[i][j][k] % diny[j]
+void elementwise_mod_broadcast(
+    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite

--- a/lite/kernels/arm/elementwise_compute.cc
+++ b/lite/kernels/arm/elementwise_compute.cc
@@ -351,6 +351,29 @@ void ElementwiseDivActivationCompute::Run() {
  }
 }

+template <typename T, PrecisionType PType>
+void ElementwiseModCompute<T, PType>::Run() {  // Out = X % Y, elementwise
+  namespace math = lite::arm::math;
+  auto& param = this->template Param<operators::ElementwiseParam>();
+  auto* x_data = param.X->template data<T>();
+  auto* y_data = param.Y->template data<T>();
+  auto* out_data = param.Out->template mutable_data<T>();
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  int pre, n, post;
+  if (x_dims.size() < y_dims.size() &&
+      is_broadcast(y_dims, x_dims, param.axis, &pre, &n, &post)) {
+    // `%` is not commutative, so X (broadcast over Y) stays the dividend.
+    for (int64_t i = 0; i < static_cast<int64_t>(pre) * n * post; ++i) {
+      out_data[i] = x_data[i / post % n] % y_data[i];  // x[j] % y[i][j][k]
+    }
+  } else if (is_broadcast(x_dims, y_dims, param.axis, &pre, &n, &post)) {
+    math::elementwise_mod_broadcast<T>(x_data, y_data, out_data, pre, n, post);
+  } else {
+    math::elementwise_mod<T>(x_data, y_data, out_data, x_dims.production());
+  }
+}
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
@@ -487,3 +510,13 @@ REGISTER_LITE_KERNEL(
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
+
+using elementwise_mod_int64 =  // kernel class registered just below
+    paddle::lite::kernels::arm::ElementwiseModCompute<int64_t,
+                                                      PRECISION(kInt64)>;
+REGISTER_LITE_KERNEL(
+    elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .Finalize();  // X, Y and Out are all bound as kARM int64 tensors
--- a/lite/kernels/arm/elementwise_compute.h
+++ b/lite/kernels/arm/elementwise_compute.h
@@ -102,6 +102,22 @@ class ElementwiseDivActivationCompute
  virtual ~ElementwiseDivActivationCompute() = default;
 };

+template <typename T, PrecisionType PType>  // T: element type of X/Y/Out
+class ElementwiseModCompute : public KernelLite<TARGET(kARM), PType> {
+ public:
+  void Run() override;  // elementwise modulo of param X and Y into Out
+
+  virtual ~ElementwiseModCompute() = default;
+};
+
+// class ElementwiseModActivationCompute
+//     : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+//  public:
+//   void Run() override;
+
+//   virtual ~ElementwiseModActivationCompute() = default;
+// };
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite

--- a/lite/kernels/arm/elementwise_compute_test.cc
+++ b/lite/kernels/arm/elementwise_compute_test.cc
@@ -14,6 +14,7 @@

 #include "lite/kernels/arm/elementwise_compute.h"
 #include <gtest/gtest.h>
+#include <cmath>
 #include <string>
 #include <vector>
 #include "lite/core/op_registry.h"
@@ -140,6 +141,119 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
  }
 }

+// Test reference for a floating-point elementwise mod with Y broadcast
+// over X: out = fmod(y + fmod(x, y), y), then an optional "relu".
+// X is walked as [batch, channels, num]; Y supplies one value per channel.
+template <typename dtype>
+void elementwise_fmod_compute_ref(const operators::ElementwiseParam& param,
+                                  const std::string act_type) {
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  const dtype* x_data = param.X->data<const dtype>();
+  const dtype* y_data = param.Y->data<const dtype>();
+  dtype* out_data = param.Out->mutable_data<dtype>();
+
+  // A negative axis means Y lines up with the trailing dimensions of X.
+  int axis = param.axis;
+  if (axis < 0) {
+    axis = x_dims.size() - y_dims.size();
+  }
+
+  // batch: dims of X before axis; channels: all of Y; num: the rest of X.
+  int batch = 1;
+  for (int d = 0; d < axis; ++d) {
+    batch *= x_dims[d];
+  }
+  int channels = 1;
+  for (int d = 0; d < y_dims.size(); ++d) {
+    channels *= y_dims[d];
+  }
+  int num = 1;
+  for (int d = y_dims.size() + axis; d < x_dims.size(); ++d) {
+    num *= x_dims[d];
+  }
+
+  // Every element of one channel slice shares the same divisor.
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      const int base = (b * channels + c) * num;
+      const dtype divisor = y_data[c];
+      for (int k = 0; k < num; ++k) {
+        const dtype x_val = x_data[base + k];
+        out_data[base + k] = fmod(divisor + fmod(x_val, divisor), divisor);
+      }
+    }
+  }
+
+  // Optional relu: replace non-positive results with zero.
+  if (act_type == "relu") {
+    const int total = batch * channels * num;
+    for (int idx = 0; idx < total; ++idx) {
+      out_data[idx] = out_data[idx] > 0.0f ? out_data[idx] : 0.0f;
+    }
+  }
+}
+
+// Integer test reference for elementwise mod: out = x % y[channel] with the
+// truncating semantics of the built-in %, then an optional "relu".
+// X is walked as [batch, channels, num]; Y supplies one value per channel,
+// and no Y value may be zero (integer % by zero is undefined behavior).
+template <typename dtype>
+void elementwise_imod_compute_ref(const operators::ElementwiseParam& param,
+                                  const std::string act_type) {
+  const dtype* x_data = param.X->data<const dtype>();
+  const dtype* y_data = param.Y->data<const dtype>();
+  dtype* out_data = param.Out->mutable_data<dtype>();
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  // A negative axis means Y lines up with the trailing dimensions of X.
+  int axis = param.axis;
+  if (axis < 0) {
+    axis = x_dims.size() - y_dims.size();
+  }
+  // batch: dims of X before axis; channels: all of Y; num: the rest of X.
+  int batch = 1;
+  int channels = 1;
+  int num = 1;
+  for (int i = 0; i < axis; ++i) {
+    batch *= x_dims[i];
+  }
+  for (int i = 0; i < y_dims.size(); ++i) {
+    channels *= y_dims[i];
+  }
+  for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
+    num *= x_dims[i];
+  }
+  for (int i = 0; i < batch; ++i) {
+    for (int j = 0; j < channels; ++j) {
+      const int offset = (i * channels + j) * num;
+      const dtype diny_data = y_data[j];
+      for (int k = 0; k < num; ++k) {
+        out_data[offset + k] = x_data[offset + k] % diny_data;
+      }
+    }
+  }
+  // Optional relu. The zero is of type dtype on purpose: comparing against
+  // and selecting 0.0f would round-trip integer results through float and
+  // corrupt values that float cannot represent exactly.
+  if (act_type == "relu") {
+    const dtype zero = 0;
+    const int total = batch * channels * num;
+    for (int i = 0; i < total; ++i) {
+      if (out_data[i] < zero) {
+        out_data[i] = zero;
+      }
+    }
+  }
+}
+
+template void elementwise_fmod_compute_ref<float>(  // float reference
+    const operators::ElementwiseParam& param, const std::string act_type);
+template void elementwise_imod_compute_ref<int32_t>(  // int32 reference
+    const operators::ElementwiseParam& param, const std::string act_type);
+template void elementwise_imod_compute_ref<int64_t>(  // int64 reference
+    const operators::ElementwiseParam& param, const std::string act_type);
+
 TEST(elementwise_add, compute) {
  ElementwiseAddCompute elementwise_add;
  operators::ElementwiseParam param;
@@ -685,7 +799,7 @@ TEST(fusion_elementwise_max_activation_arm, compute) {
                }
                for (int i = 0; i < y_dim.production(); i++) {
                  float sign = i % 2 == 0 ? 0.5f : -0.5f;
-                  y_data[i] = i * sign;
+                  y_data[i] = (i + 1) * sign;
                }
                param.X = &x;
                param.Y = &y;
@@ -708,6 +822,108 @@ TEST(fusion_elementwise_max_activation_arm, compute) {
  }
 }

+TEST(elementwise_mod_int64_arm, retrive_op) {  // registry lookup by name
+  auto elementwise_mod =
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kInt64)>(
+          "elementwise_mod");
+  ASSERT_FALSE(elementwise_mod.empty());  // at least one kernel was created
+  ASSERT_TRUE(elementwise_mod.front());   // and the first entry is non-null
+}
+
+TEST(elementwise_mod_int64_arm, init) {  // checks reported precision/target
+  ElementwiseModCompute<int64_t, PRECISION(kInt64)> elementwise_mod;
+  ASSERT_EQ(elementwise_mod.precision(), PRECISION(kInt64));
+  ASSERT_EQ(elementwise_mod.target(), TARGET(kARM));
+}
+
+TEST(elementwise_mod_int64_arm, compute) {  // kernel vs. reference sweep
+  ElementwiseModCompute<int64_t, PRECISION(kInt64)> elementwise_mod;
+  operators::ElementwiseParam param;
+  lite::Tensor x, y, output, output_ref;
+
+#if 1  // reduced sweep is active; the #else branch is a larger, disabled one
+  for (auto n : {1, 3, 4}) {
+    for (auto c : {1, 3, 4}) {
+      for (auto h : {1, 3, 4}) {
+        for (auto w : {1, 3, 4}) {
+          for (auto axis : {-1, 0, 1, 3}) {
+            for (auto yd : {std::vector<int64_t>({n}),
+                            std::vector<int64_t>({c}),
+                            std::vector<int64_t>({h}),
+                            std::vector<int64_t>({w}),
+                            std::vector<int64_t>({n, c}),
+                            std::vector<int64_t>({c, h}),
+                            std::vector<int64_t>({c, h, w}),
+                            std::vector<int64_t>({n, c, h, w})}) {
+#else
+  for (auto n : {1, 3, 4, 11}) {
+    for (auto c : {1, 3, 4, 11}) {
+      for (auto h : {1, 3, 4, 11}) {
+        for (auto w : {1, 3, 4, 11}) {
+          for (auto axis : {-1, 0, 1, 2, 3}) {
+            for (auto yd : {std::vector<int64_t>({n}),
+                            std::vector<int64_t>({c}),
+                            std::vector<int64_t>({h}),
+                            std::vector<int64_t>({w}),
+                            std::vector<int64_t>({n, c}),
+                            std::vector<int64_t>({c, h}),
+                            std::vector<int64_t>({h, w}),
+                            std::vector<int64_t>({n, c, h}),
+                            std::vector<int64_t>({c, h, w}),
+                            std::vector<int64_t>({n, c, h, w})}) {
+#endif
+              auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
+              auto y_dim = DDim(yd);
+              int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
+
+              if (axis_t + y_dim.size() > 4) continue;  // Y must fit in X
+              bool flag = false;
+              for (int i = 0; i < y_dim.size(); i++) {
+                if (x_dim[i + axis_t] != y_dim[i]) flag = true;
+              }
+              if (flag) continue;  // skip: Y's dims differ from X's at axis_t
+
+              x.Resize(x_dim);
+              y.Resize(y_dim);
+              output.Resize(x_dim);
+              output_ref.Resize(x_dim);
+              auto* x_data = x.mutable_data<int64_t>();
+              auto* y_data = y.mutable_data<int64_t>();
+              auto* output_data = output.mutable_data<int64_t>();
+              auto* output_ref_data = output_ref.mutable_data<int64_t>();
+              for (int i = 0; i < x_dim.production(); i++) {
+                x_data[i] = i + 1;  // 1, 2, 3, ...
+              }
+              for (int i = 0; i < y_dim.production(); i++) {
+                y_data[i] = y_dim.production() - i;  // counts down to 1
+              }
+              param.X = &x;
+              param.Y = &y;
+              param.axis = axis;
+              param.Out = &output;
+              elementwise_mod.SetParam(param);
+              elementwise_mod.Run();
+              param.Out = &output_ref;  // reference writes to its own tensor
+              elementwise_imod_compute_ref<int64_t>(param, "");
+              for (int i = 0; i < output.dims().production(); i++) {
+                if (std::abs(output_data[i] - output_ref_data[i]) > 1e-5 ||
+                    std::isnan(output_data[i]) ||
+                    std::isnan(output_ref_data[i])) {  // moot for int64_t
+                  LOG(FATAL) << "elementwise mod cmp error, i: " << i
+                             << ", x_data: " << x_data[i]
+                             << ", y_data: " << y_data[i]
+                             << ", output_data: " << output_data[i]
+                             << ", output_ref_data: " << output_ref_data[i];
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
@@ -719,3 +935,4 @@ USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_mod, kARM, kInt64, kNCHW, def);
--- a/lite/operators/elementwise_ops.cc
+++ b/lite/operators/elementwise_ops.cc
@@ -144,6 +144,7 @@ REGISTER_LITE_OP(elementwise_add, paddle::lite::operators::ElementwiseOp);
 REGISTER_LITE_OP(elementwise_mul, paddle::lite::operators::ElementwiseOp);
 REGISTER_LITE_OP(elementwise_max, paddle::lite::operators::ElementwiseOp);
 REGISTER_LITE_OP(elementwise_div, paddle::lite::operators::ElementwiseOp);
+REGISTER_LITE_OP(elementwise_mod, paddle::lite::operators::ElementwiseOp);

 // #ifdef LITE_WITH_TRAIN
 // REGISTER_LITE_OP(elementwise_sub_grad,