提交 a8b3c4f4 编写于 作者: 刘托

Merge branch 'refactor-eltwise' into 'master'

Refactor: Polish the Eltwise code and add some UTs.

See merge request !978
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <vector> #include <vector>
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/eltwise.h" #include "mace/ops/eltwise.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
...@@ -531,6 +532,100 @@ TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) { ...@@ -531,6 +532,100 @@ TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25}); {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
} }
namespace {
// Runs an Eltwise op on the GPU and feeds its output through an all-ones
// 3x3 SAME-padding Conv2D, then compares the conv result against `output`.
// Chaining a second op makes intermediate overflow / precision problems in
// the Eltwise GPU kernel visible in the final tensor.
//
// type         - element-wise operation under test
// shape0/input0, shape1/input1 - the two Eltwise operands (NHWC)
// output_shape/output          - expected shape/values AFTER the Conv2D
template <typename T>
void GPUOverflowTest(const ops::EltwiseType type,
                     const std::vector<index_t> &shape0,
                     const std::vector<T> &input0,
                     const std::vector<index_t> &shape1,
                     const std::vector<T> &input1,
                     const std::vector<index_t> &output_shape,
                     const std::vector<T> &output) {
  // Construct graph
  OpsTestNet net;
  // Add input data
  net.AddInputFromArray<DeviceType::GPU, T>("Input0", shape0, input0);
  net.AddInputFromArray<DeviceType::GPU, T>("Input1", shape1, input1);
  OpDefBuilder("Eltwise", "EltwiseTest")
      .AddIntArg("T", DataTypeToEnum<T>::v())
      .Input("Input0")
      .Input("Input1")
      .AddIntArg("type", static_cast<int>(type))
      .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
      .Output("EltOutput")
      .OutputShape(output_shape)
      .Finalize(net.AddNewOperatorDef());
  // All-ones OIHW filter. Use std::vector<T> (not hard-coded float) so the
  // helper stays type-consistent for every instantiation of T.
  net.AddInputFromArray<DeviceType::GPU, T>(
      "Filter",
      {output_shape.back(), shape0.back(), 3, 3},
      std::vector<T>(output_shape.back() * shape0.back() * 9, 1));
  OpDefBuilder("Conv2D", "Conv2D")
      .AddIntArg("T", DataTypeToEnum<T>::v())
      .Input("EltOutput")
      .Input("Filter")
      .Output("Output")
      .OutputShape(output_shape)
      .AddIntsArg("strides", {1, 1})
      .AddIntArg("padding", Padding::SAME)
      .AddIntsArg("dilations", {1, 1})
      .Finalize(net.AddNewOperatorDef());
  // Run
  net.RunOp(DeviceType::GPU);
  auto expected = net.CreateTensor<T>(output_shape, output);
  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
}
}  // namespace
// Exercises every supported Eltwise type on the GPU through the
// Eltwise -> Conv2D chain in GPUOverflowTest. Expected values are the
// element-wise result after an all-ones 3x3 SAME conv (corner windows sum
// 4 contributions, edge windows 6 — hence the 4/6 patterns below).
TEST_F(EltwiseOpTest, GPUOverflowTest) {
  // SUM, tensor + vector broadcast: eltwise = 2 everywhere; 2x2 spatial x
  // 2 channels => 4*2*2 = 16 per output element.
  GPUOverflowTest<float>(
      ops::EltwiseType::SUM, {1, 2, 2, 2}, std::vector<float>(8, 1),
      {1, 1, 1, 2}, {1, 1},
      {1, 2, 2, 1}, {16, 16, 16, 16});
  // SUB, batch-vector broadcast: batch 0 subtracts 1s (result 0),
  // batch 1 subtracts 2s (result -1 => conv gives -8).
  GPUOverflowTest<float>(
      ops::EltwiseType::SUB, {2, 2, 2, 2}, std::vector<float>(16, 1),
      {2, 1, 1, 2}, {1, 1, 2, 2},
      {2, 2, 2, 1}, {0, 0, 0, 0, -8, -8, -8, -8});
  // PROD, same-shape tensors: eltwise = 1; 3x2 single-channel conv yields
  // window sizes 4 (corners) and 6 (middle row).
  GPUOverflowTest<float>(
      ops::EltwiseType::PROD, {1, 3, 2, 1}, std::vector<float>(6, 1),
      {1, 3, 2, 1}, std::vector<float>(6, 1),
      {1, 3, 2, 1}, {4, 4, 6, 6, 4, 4});
  // DIV, same-shape tensors: 1/1 = 1, two batches of the 4/6 pattern.
  GPUOverflowTest<float>(
      ops::EltwiseType::DIV, {2, 3, 2, 1}, std::vector<float>(12, 1),
      {2, 3, 2, 1}, std::vector<float>(12, 1),
      {2, 3, 2, 1}, {4, 4, 6, 6, 4, 4, 4, 4, 6, 6, 4, 4});
  // MIN, vector broadcast: min(1, 1) = 1 => 8 per output element.
  GPUOverflowTest<float>(
      ops::EltwiseType::MIN, {1, 2, 2, 2}, std::vector<float>(8, 1),
      {1, 1, 1, 2}, {1, 1},
      {1, 2, 2, 1}, {8, 8, 8, 8});
  // MAX, batch-vector broadcast: batch 0 max = 1 (conv 8), batch 1 max = 2
  // (conv 16).
  GPUOverflowTest<float>(
      ops::EltwiseType::MAX, {2, 2, 2, 2}, std::vector<float>(16, 1),
      {2, 1, 1, 2}, {1, 1, 2, 2},
      {2, 2, 2, 1}, {8, 8, 8, 8, 16, 16, 16, 16});
  // NEG, unary (scalar operand unused): -1 everywhere => negated 4/6 pattern.
  GPUOverflowTest<float>(
      ops::EltwiseType::NEG, {1, 3, 2, 1}, std::vector<float>(6, 1),
      {1, 1, 1, 1}, {0},
      {1, 3, 2, 1}, {-4, -4, -6, -6, -4, -4});
  // ABS, unary: |-1| = 1, two batches of the 4/6 pattern.
  GPUOverflowTest<float>(
      ops::EltwiseType::ABS, {2, 3, 2, 1}, std::vector<float>(12, -1),
      {1, 1, 1, 1}, {0},
      {2, 3, 2, 1}, {4, 4, 6, 6, 4, 4, 4, 4, 6, 6, 4, 4});
  // SQR_DIFF, batch-vector broadcast: batch 0 (1-1)^2 = 0, batch 1
  // (1-2)^2 = 1 => conv 8.
  GPUOverflowTest<float>(
      ops::EltwiseType::SQR_DIFF, {2, 2, 2, 2}, std::vector<float>(16, 1),
      {2, 1, 1, 2}, {1, 1, 2, 2},
      {2, 2, 2, 1}, {0, 0, 0, 0, 8, 8, 8, 8});
  // POW, same-shape tensors: 1^1 = 1 => 4/6 pattern.
  GPUOverflowTest<float>(
      ops::EltwiseType::POW, {1, 3, 2, 1}, std::vector<float>(6, 1),
      {1, 3, 2, 1}, std::vector<float>(6, 1),
      {1, 3, 2, 1}, {4, 4, 6, 6, 4, 4});
  // FLOOR_DIV, batch-vector broadcast: batch 0 floor(1/1) = 1 (conv 8),
  // batch 1 floor(1/2) = 0.
  GPUOverflowTest<float>(
      ops::EltwiseType::FLOOR_DIV, {2, 2, 2, 2}, std::vector<float>(16, 1),
      {2, 1, 1, 2}, {1, 1, 2, 2},
      {2, 2, 2, 1}, {8, 8, 8, 8, 0, 0, 0, 0});
}
namespace { namespace {
template <typename T> template <typename T>
void RandomTensorScalar(const ops::EltwiseType type, void RandomTensorScalar(const ops::EltwiseType type,
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
__kernel void eltwise(OUT_OF_RANGE_PARAMS __kernel void eltwise(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3 GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input0, __read_only image2d_t input0,
#if INPUT_TYPE == 1 #if defined(INPUT_SCALAR)
__private const float value, __private const float value,
#else #else
__read_only image2d_t input1, __read_only image2d_t input1,
...@@ -28,14 +28,14 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS ...@@ -28,14 +28,14 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS
const int pos = mad24(chan_idx, width, width_idx); const int pos = mad24(chan_idx, width, width_idx);
DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(pos, hb));
#if INPUT_TYPE == 1 #if defined(INPUT_SCALAR)
DATA_TYPE4 in1 = (DATA_TYPE4)(value, value, value, value); DATA_TYPE4 in1 = (DATA_TYPE4)(value, value, value, value);
#elif INPUT_TYPE == 2 #elif defined(INPUT_VECTOR)
DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, 0));
#elif defined(INPUT_BATCH_VECTOR)
const int batch_idx = hb / height; const int batch_idx = hb / height;
DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, batch_idx)); DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, batch_idx));
#elif INPUT_TYPE == 3 #elif defined(INPUT_TENSOR_BC_CHAN)
DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, 0));
#elif INPUT_TYPE == 4
DATA_TYPE4 tmp = READ_IMAGET(input1, SAMPLER, (int2)(width_idx, hb)); DATA_TYPE4 tmp = READ_IMAGET(input1, SAMPLER, (int2)(width_idx, hb));
DATA_TYPE4 in1 = (DATA_TYPE4)(tmp.x, tmp.x, tmp.x, tmp.x); DATA_TYPE4 in1 = (DATA_TYPE4)(tmp.x, tmp.x, tmp.x, tmp.x);
#else #else
...@@ -89,11 +89,11 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS ...@@ -89,11 +89,11 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS
#endif #endif
#endif #endif
#if ((INPUT_TYPE == 1 || INPUT_TYPE == 4) && \ #if defined(NOT_DIVISIBLE_FOUR) && \
(ELTWISE_TYPE == 0 || ELTWISE_TYPE == 1 || ELTWISE_TYPE == 4 || \ ((ELTWISE_TYPE == 3 || ELTWISE_TYPE == 9 || ELTWISE_TYPE == 11) \
ELTWISE_TYPE == 5 || ELTWISE_TYPE == 8 || ELTWISE_TYPE == 9)) || \ || ((defined(INPUT_SCALAR) || defined(INPUT_TENSOR_BC_CHAN)) && \
((INPUT_TYPE != 1 || INPUT_TYPE != 4) && \ (ELTWISE_TYPE == 0 || ELTWISE_TYPE == 1 || ELTWISE_TYPE == 4 || \
(ELTWISE_TYPE == 3 || ELTWISE_TYPE == 9 || ELTWISE_TYPE == 11)) ELTWISE_TYPE == 5 || ELTWISE_TYPE == 8)))
const int remain_channel = channel - 4 * chan_idx; const int remain_channel = channel - 4 * chan_idx;
if (remain_channel < 4) { if (remain_channel < 4) {
switch (remain_channel) { switch (remain_channel) {
......
...@@ -67,25 +67,52 @@ MaceStatus EltwiseKernel<T>::Compute( ...@@ -67,25 +67,52 @@ MaceStatus EltwiseKernel<T>::Compute(
const Tensor *input1, const Tensor *input1,
Tensor *output) { Tensor *output) {
bool swapped = false; bool swapped = false;
if (input1 != nullptr) { std::string input1_type = "";
if (input1 == nullptr) {
input1_type = "INPUT_SCALAR";
} else {
MACE_CHECK(input0->dim_size() == input1->dim_size() || MACE_CHECK(input0->dim_size() == input1->dim_size() ||
input0->dim_size() == 1 || input1->dim_size() == 1) input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape"; << "Inputs of Eltwise op must be same shape";
MACE_CHECK(type_ != EltwiseType::EQUAL)
<< "Eltwise op on GPU does not support EQUAL";
// broadcast
if (input0->size() != input1->size()) { if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) { if (input0->size() < input1->size()) {
std::swap(input0, input1); std::swap(input0, input1);
swapped = true; swapped = true;
} }
if (input1->dim_size() == 1) { if (input1->dim_size() == 1
MACE_CHECK(input0->dim(3) == input1->dim(0)) || (input1->dim(0) == 1 && input1->dim(1) == 1
<< "Element-Wise op support broadcast on only-channel or non-channel dimension"; // NOLINT(whitespace/line_length) && input1->dim(2) == 1)) {
} else { // Tensor-Vector element wise
MACE_CHECK(((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) if (input0->dim(3) == input1->dim(input1->dim_size()-1)) {
&& input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 && input1_type = "INPUT_VECTOR";
input1->dim(2) == 1) || (input0->dim(0) == input1->dim(0) && } else {
input0->dim(1) == input1->dim(1) && input0->dim(2) == input1->dim(2) LOG(FATAL) << "Inputs not match the broadcast logic, "
&& input1->dim(3) == 1)) << MakeString(input0->shape()) << " vs "
<< "Element-Wise op support broadcast on only-channel or non-channel dimension"; // NOLINT(whitespace/line_length) << MakeString(input1->shape());
}
} else { // must be 4-D
if (input0->dim(0) == input1->dim(0)
&& input1->dim(1) == 1
&& input1->dim(2) == 1
&& input0->dim(3) == input1->dim(3)) {
input1_type = "INPUT_BATCH_VECTOR";
} else if (input0->dim(0) == input1->dim(0)
&& input0->dim(1) == input1->dim(1)
&& input0->dim(2) == input1->dim(2)
&& input1->dim(3) == 1) {
// broadcast on channel dimension
input1_type = "INPUT_TENSOR_BC_CHAN";
} else {
LOG(FATAL) << "Element-Wise op only support broadcast on"
" channel dimension:"
"Tensor-BatchVector(4D-[N,1,1,C]) "
"and Tensor-Tensor(4D-[N,H,W,1]). but got "
<< MakeString(input0->shape()) << " vs "
<< MakeString(input1->shape());
}
} }
} }
} }
...@@ -129,20 +156,11 @@ MaceStatus EltwiseKernel<T>::Compute( ...@@ -129,20 +156,11 @@ MaceStatus EltwiseKernel<T>::Compute(
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (input1 == nullptr) { if (!input1_type.empty()) {
built_options.emplace("-DINPUT_TYPE=1"); built_options.emplace("-D" + input1_type);
} else if (input0->size() != input1->size()) {
if (input0->dim(0) == input1->dim(0) && input0->dim(1) == input1->dim(1)
&& input0->dim(2) == input1->dim(2) && input1->dim(3) == 1) {
// only broadcast on channel
built_options.emplace("-DINPUT_TYPE=4");
} else if (input1->dim(0) == 1 || input1->dim_size() == 1) {
built_options.emplace("-DINPUT_TYPE=3");
} else {
built_options.emplace("-DINPUT_TYPE=2");
}
if (swapped) built_options.emplace("-DSWAPPED");
} }
if (swapped) built_options.emplace("-DSWAPPED");
if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR");
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
......
...@@ -159,16 +159,16 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() { ...@@ -159,16 +159,16 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() {
bool OpsTestNet::Setup(mace::DeviceType device) { bool OpsTestNet::Setup(mace::DeviceType device) {
NetDef net_def; NetDef net_def;
for (auto &op_def_ : op_defs_) { for (auto &op_def : op_defs_) {
net_def.add_op()->CopyFrom(op_def_); net_def.add_op()->CopyFrom(op_def);
for (auto input : op_def_.input()) { for (auto input : op_def.input()) {
if (ws_.GetTensor(input) != nullptr && if (ws_.GetTensor(input) != nullptr &&
!ws_.GetTensor(input)->is_weight()) { !ws_.GetTensor(input)->is_weight()) {
auto input_info = net_def.add_input_info(); auto input_info = net_def.add_input_info();
input_info->set_name(input); input_info->set_name(input);
auto data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( auto data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def_, "data_format", DataFormat::DF_NONE); op_def, "data_format", DataFormat::DF_NONE);
input_info->set_data_format(data_format); input_info->set_data_format(data_format);
auto &shape = ws_.GetTensor(input)->shape(); auto &shape = ws_.GetTensor(input)->shape();
for (auto d : shape) { for (auto d : shape) {
...@@ -176,16 +176,16 @@ bool OpsTestNet::Setup(mace::DeviceType device) { ...@@ -176,16 +176,16 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
} }
} }
} }
}
for (int i = 0; i < op_def_.output_size(); ++i) { auto op_def = op_defs_.back();
ws_.RemoveTensor(op_def_.output(i)); for (int i = 0; i < op_def.output_size(); ++i) {
auto output_info = net_def.add_output_info(); ws_.RemoveTensor(op_def.output(i));
output_info->set_name(op_def_.output(i)); auto output_info = net_def.add_output_info();
if (op_def_.output_type_size() == op_def_.output_size()) { output_info->set_name(op_def.output(i));
output_info->set_data_type(op_def_.output_type(i)); if (op_def.output_type_size() == op_def.output_size()) {
} else { output_info->set_data_type(op_def.output_type(i));
output_info->set_data_type(DataType::DT_FLOAT); } else {
} output_info->set_data_type(DataType::DT_FLOAT);
} }
} }
MemoryOptimizer mem_optimizer; MemoryOptimizer mem_optimizer;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册