Refactor eltwise

46baf92a · 李寅 · 7bdc8a4d · 46baf92a · 46baf92a · 46baf92a
5 changed files
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -42,48 +42,79 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);
-    const index_t batch = input->dim(0);
-    const index_t class_count = input->dim(1);
-    const index_t class_size = input->dim(2) * input->dim(3);
-    const index_t batch_size = class_count * class_size;
-
    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const float *input_data = input->data<float>();
    float *output_data = output->mutable_data<float>();

-    for (index_t b = 0; b < batch; ++b) {
+    // softmax for nchw image
+    if (input->dim_size() == 4) {
+      const index_t batch = input->dim(0);
+      const index_t class_count = input->dim(1);
+      const index_t class_size = input->dim(2) * input->dim(3);
+      const index_t batch_size = class_count * class_size;
+
+      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for
+        for (index_t k = 0; k < class_size; ++k) {
+          const float *input_ptr = input_data + b * batch_size + k;
+          float *output_ptr = output_data + b * batch_size + k;
+
+          float max_val = std::numeric_limits<float>::lowest();
+          index_t channel_offset = 0;
+          for (index_t c = 0; c < class_count; ++c) {
+            float data = input_ptr[channel_offset];
+            if (data > max_val) {
+              max_val = data;
+            }
+            channel_offset += class_size;
+          }
+
+          channel_offset = 0;
+          float sum = 0;
+          for (index_t c = 0; c < class_count; ++c) {
+            float exp_value = ::exp(input_ptr[channel_offset] - max_val);
+            sum += exp_value;
+            output_ptr[channel_offset] = exp_value;
+            channel_offset += class_size;
+          }
+
+          sum = std::max(sum, std::numeric_limits<float>::min());
+          channel_offset = 0;
+          for (index_t c = 0; c < class_count; ++c) {
+            output_ptr[channel_offset] /= sum;
+            channel_offset += class_size;
+          }
+        }  // k
+      }  // b
+    } else if (input->dim_size() == 2) {  // normal 2d softmax
+      const index_t class_size = input->dim(0);
+      const index_t class_count = input->dim(1);
 #pragma omp parallel for
      for (index_t k = 0; k < class_size; ++k) {
-        const float *input_ptr = input_data + b * batch_size + k;
-        float *output_ptr = output_data + b * batch_size + k;
+        const float *input_ptr = input_data + k * class_count;
+        float *output_ptr = output_data + k * class_count;

        float max_val = std::numeric_limits<float>::lowest();
-        index_t channel_offset = 0;
        for (index_t c = 0; c < class_count; ++c) {
-          float data = input_ptr[channel_offset];
-          if (data > max_val) {
-            max_val = data;
-          }
-          channel_offset += class_size;
+          max_val = std::max(max_val, input_ptr[c]);
        }

-        channel_offset = 0;
        float sum = 0;
        for (index_t c = 0; c < class_count; ++c) {
-          float exp_value = ::exp(input_ptr[channel_offset] - max_val);
+          float exp_value = ::exp(input_ptr[c] - max_val);
          sum += exp_value;
-          output_ptr[channel_offset] = exp_value;
-          channel_offset += class_size;
+          output_ptr[c] = exp_value;
        }

-        channel_offset = 0;
+        sum = std::max(sum, std::numeric_limits<float>::min());
        for (index_t c = 0; c < class_count; ++c) {
-          output_ptr[channel_offset] /= sum;
-          channel_offset += class_size;
+          output_ptr[c] /= sum;
        }
-      }  // k
-    }  // b
+      }
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }

    return MACE_SUCCESS;
  }

--- a/mace/ops/eltwise.h
+++ b/mace/ops/eltwise.h
@@ -30,7 +30,9 @@ class EltwiseOp : public Operator<D, T> {
            static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
                "type", static_cast<int>(kernels::EltwiseType::NONE))),
            OperatorBase::GetRepeatedArgs<float>("coeff"),
-            OperatorBase::GetOptionalArg<float>("value", 1.0)) {}
+            OperatorBase::GetOptionalArg<float>("value", 1.0),
+            static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
+                "data_format", 0))) {}

  MaceStatus Run(StatsFuture *future) override {
    const Tensor *input0 = this->Input(0);

--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -41,6 +41,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
        .Input("TInput")
        .AddIntArg("type", static_cast<int>(type))
        .AddFloatArg("value", x)
+        .AddIntArg("data_format", DataFormat::NCHW)
        .Output("TOutput")
        .Finalize(net.NewOperatorDef());
    // Run
@@ -84,15 +85,24 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
  net.AddInputFromArray<D, float>("Input1", shape1, input1);

  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
-    net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
-    OpDefBuilder("Eltwise", "EltwiseTest")
-        .Input("TInput0")
-        .Input("TInput1")
+    auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest")
        .AddIntArg("type", static_cast<int>(type))
        .AddFloatsArg("coeff", coeff)
-        .Output("TOutput")
-        .Finalize(net.NewOperatorDef());
+        .AddIntArg("data_format", DataFormat::NCHW)
+        .Output("TOutput");
+    if (shape0.size() > 1) {
+      net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
+      op_builder.Input("TInput0");
+    } else {
+      op_builder.Input("Input0");
+    }
+    if (shape1.size() > 1) {
+      net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
+      op_builder.Input("TInput1");
+    } else {
+      op_builder.Input("Input1");
+    }
+    op_builder.Finalize(net.NewOperatorDef());

    // Run
    net.RunOp(D);
@@ -214,6 +224,35 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
      kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
+
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
+      {1, 2, 3}, {2, 4, 6, 5, 7, 9});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
+      {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
+      {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5},
+      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
 }

 TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
@@ -322,6 +361,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
      .Input("TInput")
      .AddIntArg("type", static_cast<int>(type))
      .AddFloatArg("value", 0.1)
+      .AddIntArg("data_format", DataFormat::NCHW)
      .Output("TOutput")
      .Finalize(net.NewOperatorDef());
  // Run
@@ -375,6 +415,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
      .Input("TInput1")
      .AddIntArg("type", static_cast<int>(type))
      .AddFloatsArg("coeff", coeff)
+      .AddIntArg("data_format", DataFormat::NCHW)
      .Output("TOutput")
      .Finalize(net.NewOperatorDef());


--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -29,8 +29,12 @@ void Simple() {
  // Add input data
  net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
                                  {1, 1, 1, 1, 1, 2, 3, 4});
+  auto expected = CreateTensor<float>(
+      {1, 1, 2, 4},
+      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});

  if (D == DeviceType::CPU) {
+    // test 4d softmax
    net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
    OpDefBuilder("Softmax", "SoftmaxTest")
        .Input("InputNCHW")
@@ -40,6 +44,21 @@ void Simple() {
    // Run
    net.RunOp(D);
    net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
+
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+
+    // check 2d softmax
+    net.AddInputFromArray<D, float>("Input2d", {2, 4},
+                                    {1, 1, 1, 1, 1, 2, 3, 4});
+    OpDefBuilder("Softmax", "SoftmaxTest")
+        .Input("Input2d")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+    net.GetOutput("Output")->Reshape({1, 1, 2, 4});
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
@@ -55,15 +74,11 @@ void Simple() {
    // Transfer output
    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
                            kernels::BufferType::IN_OUT_CHANNEL);
+
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
-
-  auto expected = CreateTensor<float>(
-      {1, 1, 2, 4},
-      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
 }  // namespace