From 46baf92a9d243347fa5dc511db79e0e71f756cd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?=
Date: Tue, 5 Jun 2018 15:28:05 +0800
Subject: [PATCH] Refactor eltwise

---
 mace/kernels/eltwise.h   | 780 ++++++++++++++++++++++++---------------
 mace/kernels/softmax.h   |  77 ++--
 mace/ops/eltwise.h       |   4 +-
 mace/ops/eltwise_test.cc |  55 ++-
 mace/ops/softmax_test.cc |  27 +-
 5 files changed, 603 insertions(+), 340 deletions(-)

diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index 44a60e38..82418618 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -17,8 +17,8 @@
 
 #include <algorithm>
 #include <memory>
-#include <vector>
 #include <utility>
+#include <vector>
 
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
@@ -44,70 +44,253 @@ enum EltwiseType {
   NONE = 10,
 };
 
-inline void TensorScalar(const EltwiseType type,
-                         const float *input0,
-                         const float value,
-                         const index_t size,
-                         float *output) {
+inline void TensorBroadcastEltwise(const EltwiseType type,
+                                   const float *input0,
+                                   const float *input1,
+                                   const std::vector<float> &coeff,
+                                   const index_t diff_size,
+                                   const index_t common_size,
+                                   const bool swapped,
+                                   float *output) {
   switch (type) {
     case SUM:
+      if (coeff.empty()) {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input0[i + d * common_size] + input1[i];
+          }
+        }
+      } else {
+        std::vector<float> coeff_copy = coeff;
+        if (swapped) {
+          std::swap(coeff_copy[0], coeff_copy[1]);
+        }
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input0[i + d * common_size] * coeff_copy[0] +
+                input1[i] * coeff_copy[1];
+          }
+        }
+      }
+      break;
+    case SUB:
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input0[i + d * common_size] - input1[i];
+          }
+        }
+      } else {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input1[i] - input0[i + d * common_size];
+          }
+        }
+      }
+      break;
+    case PROD:
+#pragma omp parallel for collapse(2)
+      for (index_t d = 0; d < diff_size; ++d) {
+        for (index_t i = 0; i < common_size; ++i) {
+          output[i + d * common_size] = input0[i + d * common_size] * input1[i];
+        }
+      }
+      break;
+    case DIV:
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input0[i + d * common_size] / input1[i];
+          }
+        }
+      } else {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input1[i] / input0[i + d * common_size];
+          }
+        }
+      }
+      break;
+    case MIN:
+#pragma omp parallel for collapse(2)
+      for (index_t d = 0; d < diff_size; ++d) {
+        for (index_t i = 0; i < common_size; ++i) {
+          output[i + d * common_size] =
+              std::min(input0[i + d * common_size], input1[i]);
+        }
+      }
+      break;
+    case MAX:
+#pragma omp parallel for collapse(2)
+      for (index_t d = 0; d < diff_size; ++d) {
+        for (index_t i = 0; i < common_size; ++i) {
+          output[i + d * common_size] =
+              std::max(input0[i + d * common_size], input1[i]);
+        }
+      }
+      break;
+    case SQR_DIFF:
+#pragma omp parallel for collapse(2)
+      for (index_t d = 0; d < diff_size; ++d) {
+        for (index_t i = 0; i < common_size; ++i) {
+          output[i + d * common_size] =
+              std::pow(input0[i + d * common_size] - input1[i], 2.f);
+        }
+      }
+      break;
+    case POW:
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                std::pow(input0[i + d * common_size], input1[i]);
+          }
+        }
+      } else {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                std::pow(input1[i], input0[i + d * common_size]);
+          }
+        }
+      }
+      break;
+    case NEG:
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] + value;
+      for (index_t i = 0; i < diff_size * common_size; ++i) {
+        output[i] = -input0[i];
+      }
+      break;
+    case ABS:
+#pragma omp parallel for
+      for (index_t i = 0; i < diff_size * common_size; ++i) {
+        output[i] = std::fabs(input0[i]);
+      }
+      break;
+    default:
+      LOG(FATAL) << "Eltwise op not support type " << type;
+  }
+}
+
+// Multiplication is costly, so we specialize the following case.
+inline void TensorEltwise(const EltwiseType type,
+                          const float *input0,
+                          const float *input1,
+                          const std::vector<float> &coeff,
+                          const index_t size,
+                          const bool swapped,
+                          float *output) {
+  switch (type) {
+    case SUM:
+      if (coeff.empty()) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] + input1[i];
+        }
+
+      } else {
+        std::vector<float> coeff_copy = coeff;
+        if (swapped) {
+          std::swap(coeff_copy[0], coeff_copy[1]);
+        }
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1];
+        }
       }
       break;
     case SUB:
+      if (!swapped) {
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] - value;
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] - input1[i];
+        }
+
+      } else {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input1[i] - input0[i];
+        }
       }
       break;
     case PROD:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] * value;
+        output[i] = input0[i] * input1[i];
       }
+
       break;
     case DIV:
+      if (!swapped) {
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] / value;
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] / input1[i];
+        }
+
+      } else {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input1[i] / input0[i];
+        }
       }
       break;
     case MIN:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = std::min(input0[i], value);
+        output[i] = std::min(input0[i], input1[i]);
       }
+
       break;
     case MAX:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = std::max(input0[i], value);
+        output[i] = std::max(input0[i], input1[i]);
      }
+
       break;
-    case NEG:
+    case SQR_DIFF:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = -input0[i];
+        output[i] = std::pow(input0[i] - input1[i], 2.f);
       }
+
       break;
-    case ABS:
+    case POW:
+      if (!swapped) {
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::abs(input0[i]);
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = std::pow(input0[i], input1[i]);
+        }
+      } else {
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = std::pow(input1[i], input0[i]);
+        }
       }
       break;
-    case SQR_DIFF:
+    case NEG:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = std::pow(input0[i] - value, 2.f);
+        output[i] = -input0[i];
       }
       break;
-    case POW:
+    case ABS:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = std::pow(input0[i], value);
+        output[i] = std::fabs(input0[i]);
       }
       break;
     default:
@@ -115,328 +298,304 @@ inline void TensorScalar(const EltwiseType type,
   }
 }
 
-inline void TensorBatchVector(const EltwiseType type,
-                              const float *input0,
-                              const float *input1,
-                              const index_t batch,
-                              const index_t channel,
-                              const index_t hw,
-                              const bool swapped,
-                              float *output) {
+// Multiplication is costly, so we specialize the following case.
+inline void TensorScalarEltwise(const EltwiseType type,
+                                const float *input0,
+                                const float input1,
+                                const std::vector<float> &coeff,
+                                const index_t size,
+                                const bool swapped,
+                                float *output) {
   switch (type) {
     case SUM:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = input0[idx0] + input1[idx1];
-          }
+      if (coeff.empty()) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] + input1;
+        }
+
+      } else {
+        std::vector<float> coeff_copy = coeff;
+        if (swapped) {
+          std::swap(coeff_copy[0], coeff_copy[1]);
+        }
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1];
         }
       }
       break;
     case SUB:
-      if (swapped) {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = b * channel + c;
-              output[idx0] = input1[idx1] - input0[idx0];
-            }
-          }
+      if (!swapped) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] - input1;
         }
+
       } else {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = b * channel + c;
-              output[idx0] = input0[idx0] - input1[idx1];
-            }
-          }
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input1 - input0[i];
         }
       }
       break;
     case PROD:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = input0[idx0] * input1[idx1];
-          }
-        }
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = input0[i] * input1;
       }
+
       break;
     case DIV:
-      if (swapped) {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = b * channel + c;
-              output[idx0] = input1[idx1] / input0[idx0];
-            }
-          }
+      if (!swapped) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] / input1;
        }
+
       } else {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = b * channel + c;
-              output[idx0] = input0[idx0] / input1[idx1];
-            }
-          }
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input1 / input0[i];
         }
       }
       break;
     case MIN:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = std::min(input0[idx0], input1[idx1]);
-          }
-        }
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::min(input0[i], input1);
       }
+
       break;
     case MAX:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = std::max(input0[idx0], input1[idx1]);
-          }
-        }
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::max(input0[i], input1);
       }
+
       break;
     case SQR_DIFF:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = std::pow(input0[idx0] - input1[idx1], 2.f);
-          }
-        }
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::pow(input0[i] - input1, 2.f);
       }
+
       break;
     case POW:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = std::pow(input0[idx0], input1[idx1]);
-          }
+      if (!swapped) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = std::pow(input0[i], input1);
         }
+      } else {
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = std::pow(input1, input0[i]);
+        }
+      }
+      break;
+    case NEG:
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = -input0[i];
+      }
+      break;
+    case ABS:
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::fabs(input0[i]);
       }
       break;
     default:
       LOG(FATAL) << "Eltwise op not support type " << type;
   }
 }
-inline void TensorVector(const EltwiseType type,
-                         const float *input0,
-                         const float *input1,
-                         const index_t batch,
-                         const index_t channel,
-                         const index_t hw,
-                         const bool swapped,
-                         float *output) {
+
+inline void TensorEltwisePerChannel(const EltwiseType type,
+                                    const float *input0,
+                                    const float *input1,
+                                    const std::vector<float> &coeff,
+                                    const index_t batch0,
+                                    const index_t batch1,
+                                    const index_t channel,
+                                    const index_t image_size,
+                                    const bool swapped,
+                                    float *output) {
   switch (type) {
     case SUM:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = input0[idx0] + input1[idx1];
+      if (coeff.empty()) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
+          for (index_t c = 0; c < channel; ++c) {
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in0_ptr[i] + in1_ptr[c];
+            }
+          }
+        }
+      } else {
+        std::vector<float> coeff_copy = coeff;
+        if (swapped) {
+          std::swap(coeff_copy[0], coeff_copy[1]);
+        }
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
+          for (index_t c = 0; c < channel; ++c) {
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] =
+                  in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1];
+            }
           }
         }
       }
       break;
     case SUB:
-      if (swapped) {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
           for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = c;
-              output[idx0] = input1[idx1] - input0[idx0];
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in0_ptr[i] - in1_ptr[c];
             }
           }
         }
       } else {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
           for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = c;
-              output[idx0] = input0[idx0] - input1[idx1];
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in1_ptr[c] - in0_ptr[i];
             }
           }
         }
       }
       break;
     case PROD:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+      for (index_t b = 0; b < batch0; ++b) {
         for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = input0[idx0] * input1[idx1];
+          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+          float *out_ptr = output + ((b * channel) + c) * image_size;
+          for (index_t i = 0; i < image_size; ++i) {
+            out_ptr[i] = in0_ptr[i] * in1_ptr[c];
           }
         }
       }
       break;
     case DIV:
-      if (swapped) {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = c;
-              output[idx0] = input1[idx1] / input0[idx0];
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in0_ptr[i] / in1_ptr[c];
             }
           }
        }
      } else {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
           for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = c;
-              output[idx0] = input0[idx0] / input1[idx1];
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in1_ptr[c] / in0_ptr[i];
            }
          }
         }
       }
       break;
     case MIN:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+      for (index_t b = 0; b < batch0; ++b) {
         for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = std::min(input0[idx0], input1[idx1]);
+          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+          float *out_ptr = output + ((b * channel) + c) * image_size;
+          for (index_t i = 0; i < image_size; ++i) {
+            out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]);
           }
         }
       }
       break;
     case MAX:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+      for (index_t b = 0; b < batch0; ++b) {
         for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = std::max(input0[idx0], input1[idx1]);
+          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+          float *out_ptr = output + ((b * channel) + c) * image_size;
+          for (index_t i = 0; i < image_size; ++i) {
+            out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]);
           }
         }
       }
       break;
     case SQR_DIFF:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+      for (index_t b = 0; b < batch0; ++b) {
         for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = std::pow(input0[idx0] - input1[idx1], 2.f);
+          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+          float *out_ptr = output + ((b * channel) + c) * image_size;
+          for (index_t i = 0; i < image_size; ++i) {
+            out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f);
           }
         }
       }
       break;
     case POW:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = std::pow(input0[idx0], input1[idx1]);
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
+          for (index_t c = 0; c < channel; ++c) {
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]);
+            }
+          }
+        }
+      } else {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
+          for (index_t c = 0; c < channel; ++c) {
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]);
+            }
           }
         }
       }
       break;
-    default:
-      LOG(FATAL) << "Eltwise op not support type " << type;
-  }
-}
-
-inline void TensorEltwise(const EltwiseType type,
-                          const float *input0,
-                          const float *input1,
-                          const index_t size,
-                          float *output) {
-  switch (type) {
-    case SUM:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] + input1[i];
-      }
-      break;
-    case SUB:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] - input1[i];
-      }
-      break;
-    case PROD:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] * input1[i];
-      }
-      break;
-    case DIV:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] / input1[i];
-      }
-      break;
-    case MIN:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::min(input0[i], input1[i]);
-      }
-      break;
-    case MAX:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::max(input0[i], input1[i]);
-      }
-      break;
-    case SQR_DIFF:
+    case NEG:
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::pow(input0[i] - input1[i], 2.f);
+      for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
+        output[i] = -input0[i];
       }
       break;
-    case POW:
+    case ABS:
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::pow(input0[i], input1[i]);
+      for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
+        output[i] = std::fabs(input0[i]);
       }
       break;
     default:
@@ -444,95 +603,109 @@ inline void TensorEltwise(const EltwiseType type,
   }
 }
 
-
 struct EltwiseFunctorBase {
   EltwiseFunctorBase(const EltwiseType type,
                      const std::vector<float> &coeff,
-                     const float value)
-      : type_(type), coeff_(coeff), value_(value) {}
+                     const float value,
+                     const DataFormat data_format)
+      : type_(type), coeff_(coeff), value_(value), data_format_(data_format) {}
 
   EltwiseType type_;
   std::vector<float> coeff_;
   float value_;
+  DataFormat data_format_;
 };
 
 template <DeviceType D, typename T>
 struct EltwiseFunctor;
 
 template <>
-struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
+struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
   EltwiseFunctor(const EltwiseType type,
                  const std::vector<float> &coeff,
-                 const float value)
-      : EltwiseFunctorBase(type, coeff, value) {}
+                 const float value,
+                 const DataFormat data_format)
+      : EltwiseFunctorBase(type, coeff, value, data_format) {}
 
   MaceStatus operator()(const Tensor *input0,
-                    const Tensor *input1,
-                    Tensor *output,
-                    StatsFuture *future) {
+                        const Tensor *input1,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
+
+    if (input1 == nullptr) {
+      scalar_tensor_.Resize({});
+      Tensor::MappingGuard guard(&scalar_tensor_);
+      auto scalar_data = scalar_tensor_.mutable_data<float>();
+      scalar_data[0] = value_;
+      input1 = &scalar_tensor_;
+    }
+
     bool swapped = false;
-    if (input1 != nullptr) {
-      MACE_CHECK(input0->dim_size() == input1->dim_size()
-                 || input0->dim_size() == 1
-                 || input1->dim_size() == 1)
-          << "Inputs of Eltwise op must be same shape";
-      if (input0->size() != input1->size()) {
-        if (input0->size() < input1->size()) {
-          std::swap(input0, input1);
-          swapped = true;
-        }
-        if (input1->dim_size() == 1) {
-          MACE_CHECK(input0->dim(1) == input1->dim(0))
-              << "Element-Wise op only support channel dimension broadcast";
-        } else {
-          MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1)
-                     && input0->dim(1) == input1->dim(1)
-                     && input1->dim(2) == 1
-                     && input1->dim(3) == 1)
-              << "Element-Wise op only support channel dimension broadcast";
+    if (input0->size() < input1->size()) {
+      std::swap(input0, input1);
+      swapped = true;
+    }
+
+    // check if we can broadcast tensor
+    uint32_t rank_diff =
+        static_cast<uint32_t>(input0->dim_size() - input1->dim_size());
+    if (data_format_ == NCHW) {
+      MACE_CHECK(
+          input0->dim_size() == 4 &&
+              (input1->dim_size() == 0 ||
+               input1->dim_size() == 4 && input1->dim(1) == input0->dim(1) &&
+                   (input1->dim(0) == input0->dim(0) || input1->dim(0) == 1) ||
+               input1->dim_size() == 1 && input1->dim(0) == input0->dim(1)),
+          "only support broadcast channel dimension");
+    } else {
+      if (rank_diff > 0 && rank_diff < input0->dim_size()) {
+        for (uint32_t i = 0; i < input1->dim_size(); ++i) {
+          MACE_CHECK(input0->dim(rank_diff + i) == input1->dim(i),
+                     "Element-Wise op only support tail dimensions broadcast");
+        }
       }
     }
+
+    index_t common_size = input1->size();
+    index_t diff_size = input0->size() / common_size;
+
     MACE_RETURN_IF_ERROR(output->ResizeLike(input0));
 
     Tensor::MappingGuard input0_guard(input0);
+    Tensor::MappingGuard input1_guard(input1);
     Tensor::MappingGuard output_guard(output);
 
     const float *input0_ptr = input0->data<float>();
+    const float *input1_ptr = input1->data<float>();
     float *output_ptr = output->mutable_data<float>();
-    const index_t size = input0->size();
-    if (input1 == nullptr) {
-      TensorScalar(type_, input0_ptr, value_, size, output_ptr);
+
+    if (data_format_ == NCHW && input1->dim_size() > 0 &&
+        input1->size() < input0->size()) {
+      TensorEltwisePerChannel(
+          type_, input0_ptr, input1_ptr, coeff_, input0->dim(0),
+          input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1),
+          input0->dim(2) * input0->dim(3), swapped, output_ptr);
+
     } else {
-      Tensor::MappingGuard input1_guard(input1);
-
-      const float *input1_ptr = input1->data<float>();
-      if (input1->size() != input0->size()) {
-        const index_t batch = input0->dim(0);
-        const index_t channel = input0->dim(1);
-        const index_t hw = input0->dim(2) * input0->dim(3);
-        if (input1->dim(0) == 1 || input1->dim_size() == 1)
-          TensorVector(type_, input0_ptr, input1_ptr,
-                       batch, channel, hw, swapped, output_ptr);
-        else
-          TensorBatchVector(type_, input0_ptr, input1_ptr,
-                            batch, channel, hw, swapped, output_ptr);
-      } else {
-        if (!coeff_.empty() && type_ == SUM) {
-#pragma omp parallel for
-          for (index_t i = 0; i < size; ++i) {
-            output_ptr[i] = coeff_[0] * input0_ptr[i] +
-                coeff_[1] * input1_ptr[i];
-          }
+      if (input1->size() == input0->size()) {
+        TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(),
+                      swapped, output_ptr);
+      } else if (input1->size() < input0->size()) {
+        if (input1->size() > 1) {
+          TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_,
+                                 diff_size, common_size, swapped, output_ptr);
        } else {
-          TensorEltwise(type_, input0_ptr, input1_ptr, size, output_ptr);
+          TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_,
+                              input0->size(), swapped, output_ptr);
         }
       }
     }
 
     return MACE_SUCCESS;
   }
+
+  Tensor scalar_tensor_;
 };
 
 #ifdef MACE_ENABLE_OPENCL
@@ -540,13 +713,14 @@ template <typename T>
 struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
   EltwiseFunctor(const EltwiseType type,
                  const std::vector<float> &coeff,
-                 const float value)
-      : EltwiseFunctorBase(type, coeff, value) {}
+                 const float value,
+                 const DataFormat data_format)
+      : EltwiseFunctorBase(type, coeff, value, data_format) {}
 
   MaceStatus operator()(const Tensor *input0,
-                    const Tensor *input1,
-                    Tensor *output,
-                    StatsFuture *future);
+                        const Tensor *input1,
+                        Tensor *output,
+                        StatsFuture *future);
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index ebcb7b40..406f87af 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -42,48 +42,79 @@ struct SoftmaxFunctor {
                         Tensor *output,
                         StatsFuture *future) {
     MACE_UNUSED(future);
-    const index_t batch = input->dim(0);
-    const index_t class_count = input->dim(1);
-    const index_t class_size = input->dim(2) * input->dim(3);
-    const index_t batch_size = class_count * class_size;
-
     Tensor::MappingGuard input_guard(input);
     Tensor::MappingGuard output_guard(output);
     const float *input_data = input->data<float>();
     float *output_data = output->mutable_data<float>();
 
-    for (index_t b = 0; b < batch; ++b) {
+    // softmax for nchw image
+    if (input->dim_size() == 4) {
+      const index_t batch = input->dim(0);
+      const index_t class_count = input->dim(1);
+      const index_t class_size = input->dim(2) * input->dim(3);
+      const index_t batch_size = class_count * class_size;
+
+      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for
+        for (index_t k = 0; k < class_size; ++k) {
+          const float *input_ptr = input_data + b * batch_size + k;
+          float *output_ptr = output_data + b * batch_size + k;
+
+          float max_val = std::numeric_limits<float>::lowest();
+          index_t channel_offset = 0;
+          for (index_t c = 0; c < class_count; ++c) {
+            float data = input_ptr[channel_offset];
+            if (data > max_val) {
+              max_val = data;
+            }
+            channel_offset += class_size;
+          }
+
+          channel_offset = 0;
+          float sum = 0;
+          for (index_t c = 0; c < class_count; ++c) {
+            float exp_value = ::exp(input_ptr[channel_offset] - max_val);
+            sum += exp_value;
+            output_ptr[channel_offset] = exp_value;
+            channel_offset += class_size;
+          }
+
+          sum = std::max(sum, std::numeric_limits<float>::min());
+          channel_offset = 0;
+          for (index_t c = 0; c < class_count; ++c) {
+            output_ptr[channel_offset] /= sum;
+            channel_offset += class_size;
+          }
+        }  // k
+      }  // b
+    } else if (input->dim_size() == 2) {  // normal 2d softmax
+      const index_t class_size = input->dim(0);
+      const index_t class_count = input->dim(1);
 #pragma omp parallel for
       for (index_t k = 0; k < class_size; ++k) {
-        const float *input_ptr = input_data + b * batch_size + k;
-        float *output_ptr = output_data + b * batch_size + k;
+        const float *input_ptr = input_data + k * class_count;
+        float *output_ptr = output_data + k * class_count;
 
         float max_val = std::numeric_limits<float>::lowest();
-        index_t channel_offset = 0;
         for (index_t c = 0; c < class_count; ++c) {
-          float data = input_ptr[channel_offset];
-          if (data > max_val) {
-            max_val = data;
-          }
-          channel_offset += class_size;
+          max_val = std::max(max_val, input_ptr[c]);
         }
 
-        channel_offset = 0;
         float sum = 0;
         for (index_t c = 0; c < class_count; ++c) {
-          float exp_value = ::exp(input_ptr[channel_offset] - max_val);
+          float exp_value = ::exp(input_ptr[c] - max_val);
           sum += exp_value;
-          output_ptr[channel_offset] = exp_value;
-          channel_offset += class_size;
+          output_ptr[c] = exp_value;
         }
 
-        channel_offset = 0;
+        sum = std::max(sum, std::numeric_limits<float>::min());
         for (index_t c = 0; c < class_count; ++c) {
-          output_ptr[channel_offset] /= sum;
-          channel_offset += class_size;
+          output_ptr[c] /= sum;
         }
-      }  // k
-    }  // b
+      }
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
 
     return MACE_SUCCESS;
   }
diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h
index 9cc800bf..8c88e9a2 100644
--- a/mace/ops/eltwise.h
+++ b/mace/ops/eltwise.h
@@ -30,7 +30,9 @@ class EltwiseOp : public Operator<D, T> {
             static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
                 "type", static_cast<int>(kernels::EltwiseType::NONE))),
             OperatorBase::GetRepeatedArgs<float>("coeff"),
-            OperatorBase::GetOptionalArg<float>("value", 1.0)) {}
+            OperatorBase::GetOptionalArg<float>("value", 1.0),
+            static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
+                "data_format", 0))) {}
 
   MaceStatus Run(StatsFuture *future) override {
     const Tensor *input0 = this->Input(0);
diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc
index e8fd81cf..37666b33 100644
--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -41,6 +41,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
         .Input("TInput")
         .AddIntArg("type", static_cast<int>(type))
         .AddFloatArg("value", x)
+        .AddIntArg("data_format", DataFormat::NCHW)
         .Output("TOutput")
         .Finalize(net.NewOperatorDef());
     // Run
@@ -84,15 +85,24 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
   net.AddInputFromArray<D, float>("Input1", shape1, input1);
 
   if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0", NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1", NCHW);
-    OpDefBuilder("Eltwise", "EltwiseTest")
-        .Input("TInput0")
-        .Input("TInput1")
+    auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest")
         .AddIntArg("type", static_cast<int>(type))
         .AddFloatsArg("coeff", coeff)
-        .Output("TOutput")
-        .Finalize(net.NewOperatorDef());
+        .AddIntArg("data_format", DataFormat::NCHW)
+        .Output("TOutput");
+    if (shape0.size() > 1) {
+      net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0", NCHW);
+      op_builder.Input("TInput0");
+    } else {
+      op_builder.Input("Input0");
+    }
+    if (shape1.size() > 1) {
+      net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1", NCHW);
+      op_builder.Input("TInput1");
+    } else {
+      op_builder.Input("Input1");
+    }
+    op_builder.Finalize(net.NewOperatorDef());
 
     // Run
     net.RunOp(D);
@@ -214,6 +224,35 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
       kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
       {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
+
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
+      {1, 2, 3}, {2, 4, 6, 5, 7, 9});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
+      {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
+      {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5},
+      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
 }
 
 TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
@@ -322,6 +361,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
       .Input("TInput")
       .AddIntArg("type", static_cast<int>(type))
      .AddFloatArg("value", 0.1)
+      .AddIntArg("data_format", DataFormat::NCHW)
      .Output("TOutput")
       .Finalize(net.NewOperatorDef());
   // Run
@@ -375,6 +415,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
       .Input("TInput1")
       .AddIntArg("type", static_cast<int>(type))
       .AddFloatsArg("coeff", coeff)
+      .AddIntArg("data_format", DataFormat::NCHW)
       .Output("TOutput")
       .Finalize(net.NewOperatorDef());
 
diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc
index 9997ee10..5468ca24 100644
--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -29,8 +29,12 @@ void Simple() {
   // Add input data
   net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
                                   {1, 1, 1, 1, 1, 2, 3, 4});
+  auto expected = CreateTensor<float>(
+      {1, 1, 2, 4},
+      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
 
   if (D == DeviceType::CPU) {
+    // test 4d softmax
     net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", NCHW);
     OpDefBuilder("Softmax", "SoftmaxTest")
         .Input("InputNCHW")
@@ -40,6 +44,21 @@ void Simple() {
     // Run
     net.RunOp(D);
     net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
+
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+
+    // check 2d softmax
+    net.AddInputFromArray<D, float>("Input2d", {2, 4},
+                                    {1, 1, 1, 1, 1, 2, 3, 4});
+    OpDefBuilder("Softmax", "SoftmaxTest")
+        .Input("Input2d")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+    net.GetOutput("Output")->Reshape({1, 1, 2, 4});
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
                             kernels::BufferType::IN_OUT_CHANNEL);
@@ -55,15 +74,11 @@ void Simple() {
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
                             kernels::BufferType::IN_OUT_CHANNEL);
+
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
   } else {
     MACE_NOT_IMPLEMENTED;
  }
-
-  auto expected = CreateTensor<float>(
-      {1, 1, 2, 4},
-      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
 }  // namespace
-- 
GitLab
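
Editor's note on the broadcasting scheme (not part of the patch itself): the refactored CPU path dispatches every eltwise call to one of three kernels -- TensorEltwise for equal-sized inputs, TensorScalarEltwise when the smaller input is a single value, and TensorBroadcastEltwise when the smaller input matches the trailing dimensions of the larger one, which the functor decomposes into diff_size repetitions of a common_size span. Below is a minimal standalone sketch of that decomposition; it is plain C++ with illustrative names (BroadcastSum), no OpenMP and no MACE types, not the MACE implementation.

  #include <cstdio>
  #include <vector>

  // input0 is a flattened tensor of shape [diff..., common...]; input1 covers
  // only the trailing "common" dimensions, so the same input1 values are
  // reused for every leading index d -- the indexing TensorBroadcastEltwise
  // uses above.
  void BroadcastSum(const std::vector<float> &input0,
                    const std::vector<float> &input1,
                    std::vector<float> *output) {
    const size_t common_size = input1.size();
    const size_t diff_size = input0.size() / common_size;
    output->resize(input0.size());
    for (size_t d = 0; d < diff_size; ++d) {
      for (size_t i = 0; i < common_size; ++i) {
        (*output)[i + d * common_size] =
            input0[i + d * common_size] + input1[i];
      }
    }
  }

  int main() {
    // shape0 = {2, 3}, shape1 = {3}: input1 broadcasts over the first axis.
    std::vector<float> a = {1, 2, 3, 4, 5, 6};
    std::vector<float> b = {10, 20, 30};
    std::vector<float> out;
    BroadcastSum(a, b, &out);
    for (float v : out) std::printf("%g ", v);  // prints: 11 22 33 14 25 36
    std::printf("\n");
    return 0;
  }

The swapped flag in the patch exists because the functor may exchange input0 and input1 so that input0 is always the larger tensor; for the non-commutative ops (SUB, DIV, POW) the kernels then evaluate the operands in reverse order.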
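A second note, on the new 2-D branch of SoftmaxFunctor: it applies the usual max-subtracted softmax independently to each row, and the expected tensor in softmax_test.cc follows directly from that definition. A hedged sketch for checking the test constants (plain C++, not the MACE code):

  #include <algorithm>
  #include <cmath>
  #include <cstdio>
  #include <vector>

  // Row-wise softmax with the maximum subtracted for numerical stability,
  // mirroring the 2d branch added to SoftmaxFunctor in this patch.
  std::vector<float> Softmax(const std::vector<float> &row) {
    const float max_val = *std::max_element(row.begin(), row.end());
    std::vector<float> out(row.size());
    float sum = 0.f;
    for (size_t i = 0; i < row.size(); ++i) {
      out[i] = std::exp(row[i] - max_val);
      sum += out[i];
    }
    for (float &v : out) v /= sum;
    return out;
  }

  int main() {
    // Second row of the test input, {1, 2, 3, 4}; the first row {1, 1, 1, 1}
    // trivially yields 0.25 everywhere.
    for (float v : Softmax({1, 2, 3, 4})) std::printf("%.8f ", v);
    // prints approximately: 0.03205860 0.08714432 0.23688282 0.64391426
    std::printf("\n");
    return 0;
  }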