Commit 46baf92a authored by 李寅

Refactor eltwise

Parent 7bdc8a4d
@@ -17,8 +17,8 @@
#include <algorithm>
#include <memory>
#include <utility>
#include <vector>

#include "mace/core/future.h"
#include "mace/core/tensor.h"

@@ -44,70 +44,253 @@ enum EltwiseType {
  NONE = 10,
};
inline void TensorBroadcastEltwise(const EltwiseType type,
                                   const float *input0,
                                   const float *input1,
                                   const std::vector<float> &coeff,
                                   const index_t diff_size,
                                   const index_t common_size,
                                   const bool swapped,
                                   float *output) {
  switch (type) {
    case SUM:
      if (coeff.empty()) {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] + input1[i];
          }
        }
      } else {
        std::vector<float> coeff_copy = coeff;
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] * coeff_copy[0] +
                input1[i] * coeff_copy[1];
          }
        }
      }
      break;
    case SUB:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] - input1[i];
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input1[i] - input0[i + d * common_size];
          }
        }
      }
      break;
    case PROD:
#pragma omp parallel for collapse(2)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] = input0[i + d * common_size] * input1[i];
        }
      }
      break;
    case DIV:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] / input1[i];
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input1[i] / input0[i + d * common_size];
          }
        }
      }
      break;
    case MIN:
#pragma omp parallel for collapse(2)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
              std::min(input0[i + d * common_size], input1[i]);
        }
      }
      break;
    case MAX:
#pragma omp parallel for collapse(2)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
              std::max(input0[i + d * common_size], input1[i]);
        }
      }
      break;
    case SQR_DIFF:
#pragma omp parallel for collapse(2)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
              std::pow(input0[i + d * common_size] - input1[i], 2.f);
        }
      }
      break;
    case POW:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                std::pow(input0[i + d * common_size], input1[i]);
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                std::pow(input1[i], input0[i + d * common_size]);
          }
        }
      }
      break;
    case NEG:
#pragma omp parallel for
      for (index_t i = 0; i < diff_size * common_size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
#pragma omp parallel for
      for (index_t i = 0; i < diff_size * common_size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    default:
      LOG(FATAL) << "Eltwise op not support type " << type;
  }
}
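// Example (hypothetical values, for illustration only): input1 of length
// common_size is broadcast across the diff_size leading slices of input0.
//   float a[6] = {1, 2, 3, 4, 5, 6};
//   float b[3] = {10, 20, 30};
//   std::vector<float> out(6);
//   TensorBroadcastEltwise(SUM, a, b, {}, /*diff_size=*/2,
//                          /*common_size=*/3, /*swapped=*/false, out.data());
//   // out == {11, 22, 33, 14, 25, 36}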
// Multiplication is costly, so we specialize the following case.
inline void TensorEltwise(const EltwiseType type,
                          const float *input0,
                          const float *input1,
                          const std::vector<float> &coeff,
                          const index_t size,
                          const bool swapped,
                          float *output) {
  switch (type) {
    case SUM:
      if (coeff.empty()) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] + input1[i];
        }
      } else {
        std::vector<float> coeff_copy = coeff;
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1];
        }
      }
      break;
    case SUB:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] - input1[i];
        }
      } else {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1[i] - input0[i];
        }
      }
      break;
    case PROD:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = input0[i] * input1[i];
      }
      break;
    case DIV:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] / input1[i];
        }
      } else {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1[i] / input0[i];
        }
      }
      break;
    case MIN:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::min(input0[i], input1[i]);
      }
      break;
    case MAX:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::max(input0[i], input1[i]);
      }
      break;
    case SQR_DIFF:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::pow(input0[i] - input1[i], 2.f);
      }
      break;
    case POW:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = std::pow(input0[i], input1[i]);
        }
      } else {
        for (index_t i = 0; i < size; ++i) {
          output[i] = std::pow(input1[i], input0[i]);
        }
      }
      break;
    case NEG:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    default:
@@ -115,328 +298,304 @@ inline void TensorScalar(const EltwiseType type,
  }
}
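// Example (hypothetical values, for illustration only): equal-sized SUM with
// coefficients computes output[i] = coeff[0] * input0[i] + coeff[1] * input1[i];
// when the caller swapped the operands, the coefficients are swapped back.
//   float a[3] = {1, 2, 3};
//   float b[3] = {10, 20, 30};
//   std::vector<float> out(3);
//   TensorEltwise(SUM, a, b, {0.5f, 2.f}, 3, /*swapped=*/false, out.data());
//   // out == {20.5, 41, 61.5}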
// Multiplication is costly, so we specialize the following case.
inline void TensorScalarEltwise(const EltwiseType type,
                                const float *input0,
                                const float input1,
                                const std::vector<float> &coeff,
                                const index_t size,
                                const bool swapped,
                                float *output) {
  switch (type) {
    case SUM:
      if (coeff.empty()) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] + input1;
        }
      } else {
        std::vector<float> coeff_copy = coeff;
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1];
        }
      }
      break;
    case SUB:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] - input1;
        }
      } else {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1 - input0[i];
        }
      }
      break;
    case PROD:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = input0[i] * input1;
      }
      break;
    case DIV:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] / input1;
        }
      } else {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1 / input0[i];
        }
      }
      break;
    case MIN:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::min(input0[i], input1);
      }
      break;
    case MAX:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::max(input0[i], input1);
      }
      break;
    case SQR_DIFF:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::pow(input0[i] - input1, 2.f);
      }
      break;
    case POW:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = std::pow(input0[i], input1);
        }
      } else {
        for (index_t i = 0; i < size; ++i) {
          output[i] = std::pow(input1, input0[i]);
        }
      }
      break;
    case NEG:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    default:
      LOG(FATAL) << "Eltwise op not support type " << type;
  }
}
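// Example (hypothetical values, for illustration only): `swapped` flips the
// operand order of the asymmetric ops, so a swapped DIV computes
// input1 / input0[i].
//   float a[2] = {2, 4};
//   std::vector<float> out(2);
//   TensorScalarEltwise(DIV, a, 8.f, {}, 2, /*swapped=*/true, out.data());
//   // out == {4, 2}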
inline void TensorEltwisePerChannel(const EltwiseType type,
                                    const float *input0,
                                    const float *input1,
                                    const std::vector<float> &coeff,
                                    const index_t batch0,
                                    const index_t batch1,
                                    const index_t channel,
                                    const index_t image_size,
                                    const bool swapped,
                                    float *output) {
  switch (type) {
    case SUM:
      if (coeff.empty()) {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in0_ptr[i] + in1_ptr[c];
            }
          }
        }
      } else {
        std::vector<float> coeff_copy = coeff;
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] =
                  in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1];
            }
          }
        }
      }
      break;
    case SUB:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in0_ptr[i] - in1_ptr[c];
            }
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in1_ptr[c] - in0_ptr[i];
            }
          }
        }
      }
      break;
    case PROD:
#pragma omp parallel for collapse(2)
      for (index_t b = 0; b < batch0; ++b) {
        for (index_t c = 0; c < channel; ++c) {
          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
          float *out_ptr = output + ((b * channel) + c) * image_size;
          for (index_t i = 0; i < image_size; ++i) {
            out_ptr[i] = in0_ptr[i] * in1_ptr[c];
          }
        }
      }
      break;
    case DIV:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in0_ptr[i] / in1_ptr[c];
            }
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in1_ptr[c] / in0_ptr[i];
            }
          }
        }
      }
      break;
    case MIN:
#pragma omp parallel for collapse(2)
      for (index_t b = 0; b < batch0; ++b) {
        for (index_t c = 0; c < channel; ++c) {
          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
          float *out_ptr = output + ((b * channel) + c) * image_size;
          for (index_t i = 0; i < image_size; ++i) {
            out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]);
          }
        }
      }
      break;
    case MAX:
#pragma omp parallel for collapse(2)
      for (index_t b = 0; b < batch0; ++b) {
        for (index_t c = 0; c < channel; ++c) {
          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
          float *out_ptr = output + ((b * channel) + c) * image_size;
          for (index_t i = 0; i < image_size; ++i) {
            out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]);
          }
        }
      }
      break;
    case SQR_DIFF:
#pragma omp parallel for collapse(2)
      for (index_t b = 0; b < batch0; ++b) {
        for (index_t c = 0; c < channel; ++c) {
          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
          float *out_ptr = output + ((b * channel) + c) * image_size;
          for (index_t i = 0; i < image_size; ++i) {
            out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f);
          }
        }
      }
      break;
    case POW:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]);
            }
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]);
            }
          }
        }
      }
      break;
    case NEG:
#pragma omp parallel for
      for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
#pragma omp parallel for
      for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    default:
@@ -444,95 +603,109 @@ inline void TensorEltwise(const EltwiseType type,
  }
}
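// Example (hypothetical values, for illustration only): per-channel PROD on
// an NCHW tensor of shape {1, 2, 1, 2} against a 2-element channel vector;
// when batch1 > 1, input1 instead supplies a separate vector per batch.
//   float a[4] = {1, 2, 3, 4};    // batch 1, 2 channels, image_size 2
//   float scale[2] = {10, 100};   // one multiplier per channel
//   std::vector<float> out(4);
//   TensorEltwisePerChannel(PROD, a, scale, {}, /*batch0=*/1, /*batch1=*/1,
//                           /*channel=*/2, /*image_size=*/2,
//                           /*swapped=*/false, out.data());
//   // out == {10, 20, 300, 400}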
struct EltwiseFunctorBase {
  EltwiseFunctorBase(const EltwiseType type,
                     const std::vector<float> &coeff,
                     const float value,
                     const DataFormat data_format)
      : type_(type), coeff_(coeff), value_(value), data_format_(data_format) {}

  EltwiseType type_;
  std::vector<float> coeff_;
  float value_;
  DataFormat data_format_;
};

template <DeviceType D, typename T>
struct EltwiseFunctor;

template <>
struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
  EltwiseFunctor(const EltwiseType type,
                 const std::vector<float> &coeff,
                 const float value,
                 const DataFormat data_format)
      : EltwiseFunctorBase(type, coeff, value, data_format) {}

  MaceStatus operator()(const Tensor *input0,
                        const Tensor *input1,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);

    if (input1 == nullptr) {
      scalar_tensor_.Resize({});
      Tensor::MappingGuard guard(&scalar_tensor_);
      auto scalar_data = scalar_tensor_.mutable_data<float>();
      scalar_data[0] = value_;
      input1 = &scalar_tensor_;
    }

    bool swapped = false;
    if (input0->size() < input1->size()) {
      std::swap(input0, input1);
      swapped = true;
    }

    // check if we can broadcast tensor
    uint32_t rank_diff =
        static_cast<uint32_t>(input0->dim_size() - input1->dim_size());
    if (data_format_ == NCHW) {
      MACE_CHECK(
          input0->dim_size() == 4 &&
              (input1->dim_size() == 0 ||
               input1->dim_size() == 4 && input1->dim(1) == input0->dim(1) &&
                   (input1->dim(0) == input0->dim(0) ||
                    input1->dim(0) == 1) ||
               input1->dim_size() == 1 && input1->dim(0) == input0->dim(1)),
          "only support broadcast channel dimension");
    } else {
      if (rank_diff > 0 && rank_diff < input0->dim_size()) {
        for (uint32_t i = 0; i < input1->dim_size(); ++i) {
          MACE_CHECK(input0->dim(rank_diff + i) == input1->dim(i),
                     "Element-Wise op only support tail dimensions broadcast");
        }
      }
    }

    index_t common_size = input1->size();
    index_t diff_size = input0->size() / common_size;

    MACE_RETURN_IF_ERROR(output->ResizeLike(input0));

    Tensor::MappingGuard input0_guard(input0);
    Tensor::MappingGuard input1_guard(input1);
    Tensor::MappingGuard output_guard(output);

    const float *input0_ptr = input0->data<float>();
    const float *input1_ptr = input1->data<float>();
    float *output_ptr = output->mutable_data<float>();

    if (data_format_ == NCHW && input1->dim_size() > 0 &&
        input1->size() < input0->size()) {
      TensorEltwisePerChannel(
          type_, input0_ptr, input1_ptr, coeff_, input0->dim(0),
          input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1),
          input0->dim(2) * input0->dim(3), swapped, output_ptr);
    } else {
      if (input1->size() == input0->size()) {
        TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(),
                      swapped, output_ptr);
      } else if (input1->size() < input0->size()) {
        if (input1->size() > 1) {
          TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_,
                                 diff_size, common_size, swapped, output_ptr);
        } else {
          TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_,
                              input0->size(), swapped, output_ptr);
        }
      }
    }

    return MACE_SUCCESS;
  }

  Tensor scalar_tensor_;
};
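// Dispatch summary for the functor above, as implemented: the larger tensor
// always becomes input0 (with `swapped` recorded so SUB/DIV/POW can restore
// operand order). An NCHW input1 with a non-scalar shape takes the
// per-channel path; otherwise equal sizes use TensorEltwise, a one-element
// input1 uses TensorScalarEltwise, and the remaining tail-dimension
// broadcasts use TensorBroadcastEltwise.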
#ifdef MACE_ENABLE_OPENCL
@@ -540,13 +713,14 @@ template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
  EltwiseFunctor(const EltwiseType type,
                 const std::vector<float> &coeff,
                 const float value,
                 const DataFormat data_format)
      : EltwiseFunctorBase(type, coeff, value, data_format) {}

  MaceStatus operator()(const Tensor *input0,
                        const Tensor *input1,
                        Tensor *output,
                        StatsFuture *future);

  cl::Kernel kernel_;
  uint32_t kwg_size_;
...
@@ -42,48 +42,79 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const float *input_data = input->data<float>();
    float *output_data = output->mutable_data<float>();

    // softmax for nchw image
    if (input->dim_size() == 4) {
      const index_t batch = input->dim(0);
      const index_t class_count = input->dim(1);
      const index_t class_size = input->dim(2) * input->dim(3);
      const index_t batch_size = class_count * class_size;

      for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for
        for (index_t k = 0; k < class_size; ++k) {
          const float *input_ptr = input_data + b * batch_size + k;
          float *output_ptr = output_data + b * batch_size + k;

          float max_val = std::numeric_limits<float>::lowest();
          index_t channel_offset = 0;
          for (index_t c = 0; c < class_count; ++c) {
            float data = input_ptr[channel_offset];
            if (data > max_val) {
              max_val = data;
            }
            channel_offset += class_size;
          }

          channel_offset = 0;
          float sum = 0;
          for (index_t c = 0; c < class_count; ++c) {
            float exp_value = ::exp(input_ptr[channel_offset] - max_val);
            sum += exp_value;
            output_ptr[channel_offset] = exp_value;
            channel_offset += class_size;
          }

          sum = std::max(sum, std::numeric_limits<float>::min());
          channel_offset = 0;
          for (index_t c = 0; c < class_count; ++c) {
            output_ptr[channel_offset] /= sum;
            channel_offset += class_size;
          }
        }  // k
      }    // b
    } else if (input->dim_size() == 2) {  // normal 2d softmax
      const index_t class_size = input->dim(0);
      const index_t class_count = input->dim(1);
#pragma omp parallel for
      for (index_t k = 0; k < class_size; ++k) {
        const float *input_ptr = input_data + k * class_count;
        float *output_ptr = output_data + k * class_count;

        float max_val = std::numeric_limits<float>::lowest();
        for (index_t c = 0; c < class_count; ++c) {
          max_val = std::max(max_val, input_ptr[c]);
        }

        float sum = 0;
        for (index_t c = 0; c < class_count; ++c) {
          float exp_value = ::exp(input_ptr[c] - max_val);
          sum += exp_value;
          output_ptr[c] = exp_value;
        }

        sum = std::max(sum, std::numeric_limits<float>::min());
        for (index_t c = 0; c < class_count; ++c) {
          output_ptr[c] /= sum;
        }
      }
    } else {
      MACE_NOT_IMPLEMENTED;
    }

    return MACE_SUCCESS;
  }
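  // Both branches above compute, for each class vector x:
  //   softmax(x)_c = exp(x_c - max(x)) / sum_j exp(x_j - max(x)),
  // where subtracting max(x) guards against overflow and the denominator is
  // clamped to std::numeric_limits<float>::min() to avoid division by zero.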
...
@@ -30,7 +30,9 @@ class EltwiseOp : public Operator<D, T> {
            static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
                "type", static_cast<int>(kernels::EltwiseType::NONE))),
            OperatorBase::GetRepeatedArgs<float>("coeff"),
            OperatorBase::GetOptionalArg<float>("value", 1.0),
            static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
                "data_format", 0))) {}

  MaceStatus Run(StatsFuture *future) override {
    const Tensor *input0 = this->Input(0);
...
@@ -41,6 +41,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
        .Input("TInput")
        .AddIntArg("type", static_cast<int>(type))
        .AddFloatArg("value", x)
        .AddIntArg("data_format", DataFormat::NCHW)
        .Output("TOutput")
        .Finalize(net.NewOperatorDef());

  // Run
@@ -84,15 +85,24 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
  net.AddInputFromArray<D, float>("Input1", shape1, input1);

  if (D == DeviceType::CPU) {
    auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest")
                          .AddIntArg("type", static_cast<int>(type))
                          .AddFloatsArg("coeff", coeff)
                          .AddIntArg("data_format", DataFormat::NCHW)
                          .Output("TOutput");
    if (shape0.size() > 1) {
      net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
      op_builder.Input("TInput0");
    } else {
      op_builder.Input("Input0");
    }
    if (shape1.size() > 1) {
      net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
      op_builder.Input("TInput1");
    } else {
      op_builder.Input("Input1");
    }
    op_builder.Finalize(net.NewOperatorDef());

    // Run
    net.RunOp(D);
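    // Note on the conditional inputs above: rank-1 tensors are now fed to the
    // op directly, while only multi-dimensional inputs go through the
    // NHWC -> NCHW transform before the op runs.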
@@ -214,6 +224,35 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
      kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
      {1, 2, 3}, {2, 4, 6, 5, 7, 9});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
      {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
      {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
}

TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
@@ -322,6 +361,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
        .Input("TInput")
        .AddIntArg("type", static_cast<int>(type))
        .AddFloatArg("value", 0.1)
        .AddIntArg("data_format", DataFormat::NCHW)
        .Output("TOutput")
        .Finalize(net.NewOperatorDef());

  // Run
@@ -375,6 +415,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
        .Input("TInput1")
        .AddIntArg("type", static_cast<int>(type))
        .AddFloatsArg("coeff", coeff)
        .AddIntArg("data_format", DataFormat::NCHW)
        .Output("TOutput")
        .Finalize(net.NewOperatorDef());
...
@@ -29,8 +29,12 @@ void Simple() {
  // Add input data
  net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
                                  {1, 1, 1, 1, 1, 2, 3, 4});
  auto expected = CreateTensor<float>(
      {1, 1, 2, 4},
      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});

  if (D == DeviceType::CPU) {
    // test 4d softmax
    net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
    OpDefBuilder("Softmax", "SoftmaxTest")
        .Input("InputNCHW")
@@ -40,6 +44,21 @@ void Simple() {
    // Run
    net.RunOp(D);
    net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);

    // check 2d softmax
    net.AddInputFromArray<D, float>("Input2d", {2, 4},
                                    {1, 1, 1, 1, 1, 2, 3, 4});
    OpDefBuilder("Softmax", "SoftmaxTest")
        .Input("Input2d")
        .Output("Output")
        .Finalize(net.NewOperatorDef());

    // Run
    net.RunOp(D);
    net.GetOutput("Output")->Reshape({1, 1, 2, 4});
    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
@@ -55,15 +74,11 @@ void Simple() {
    // Transfer output
    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
                            kernels::BufferType::IN_OUT_CHANNEL);
    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
}
}  // namespace
...