Commit d6adf881 authored by 李寅

Implement argmax and equal ops;

Revise memory allocation logic to make non-float output possible
Parent 7ac05858
......@@ -7,22 +7,25 @@ Operator lists
:header: "Operator","Android NN","Supported","Remark"
"AVERAGE_POOL_2D","Y","Y",""
"ARGMAX","","Y","Only CPU and tensorflow is supported"
"BATCH_NORM","","Y","Fusion with activation is supported"
"BATCH_TO_SPACE_ND","Y","Y",""
"BIAS_ADD","","Y",""
"CAST","","Y","Only CPU and tensorflow model is supported"
"CHANNEL_SHUFFLE","","Y",""
"CONCATENATION","Y","Y","Only support channel axis concatenation"
"CONV_2D","Y","Y","Fusion with BN and activation layer is supported"
"DECONV_2D","N","Y","Only tensorflow model is supported"
"DECONV_2D","","Y","Only tensorflow model is supported"
"DEPTHWISE_CONV_2D","Y","Y","Only multiplier = 1 is supported; Fusion is supported"
"DEPTH_TO_SPACE","Y","Y",""
"DEQUANTIZE","Y","Y","Model quantization will be supported later"
"ELEMENT_WISE","Y","Y","ADD/MUL/DIV/MIN/MAX/NEG/ABS/SQR_DIFF/POW"
"EMBEDDING_LOOKUP","Y","",""
"ELEMENT_WISE","Y","Y","ADD/MUL/DIV/MIN/MAX/NEG/ABS/SQR_DIFF/POW/RSQRT/EQUAL"
"EMBEDDING_LOOKUP","Y","Y","Only support channel axis concatenation"
"FLOOR","Y","",""
"FULLY_CONNECTED","Y","Y",""
"GROUP_CONV_2D","","","Caffe model with group count = channel count is supported"
"HASHTABLE_LOOKUP","Y","",""
"IDENTITY","","Y","Only tensorflow model is supported"
"L2_NORMALIZATION","Y","",""
"L2_POOL_2D","Y","",""
"LOCAL_RESPONSE_NORMALIZATION","Y","Y",""
......@@ -31,9 +34,10 @@ Operator lists
"LSTM","Y","",""
"MATMUL","","Y",""
"MAX_POOL_2D","Y","Y",""
"PAD", "N","Y",""
"PAD", "Y","Y",""
"PSROI_ALIGN","","Y",""
"PRELU","","Y","Only caffe model is supported"
"REDUCE_MEAN","Y","Y","Only tensorflow model is supported"
"RELU","Y","Y",""
"RELU1","Y","Y",""
"RELU6","Y","Y",""
......@@ -42,9 +46,14 @@ Operator lists
"RESIZE_BILINEAR","Y","Y",""
"RNN","Y","",""
"RPN_PROPOSAL_LAYER","","Y",""
"SLICE","N","Y","Only support channel axis slice"
"SHAPE","","Y","Only CPU and tensorflow is supported"
"STACK","","Y","Only CPU and tensorflow is supported"
"STRIDEDSLICE","Y","Y","Only CPU and tensorflow is supported"
"SLICE","","Y","In tensorflow, this op is equivalent to SPLIT; Only support channel axis slice"
"SOFTMAX","Y","Y",""
"SPACE_TO_BATCH_ND","Y", "Y",""
"SPACE_TO_DEPTH","Y","Y",""
"SQEEZE","Y","Y","Only CPU and tensorflow is supported"
"SVDF","Y","",""
"TANH","Y","Y",""
"TRANSPOSE","Y","Y","Only CPU and tensorflow is supported"
......@@ -264,7 +264,6 @@ MaceStatus MaceEngine::Impl::Run(
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(!shape.empty()) << "Output's shape must not be empty";
MACE_CHECK(shape == output.second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(output.second.shape())
......
......@@ -76,6 +76,7 @@ namespace ops {
// Keep in lexicographical order
extern void Register_Activation(OperatorRegistry *op_registry);
extern void Register_AddN(OperatorRegistry *op_registry);
extern void Register_ArgMax(OperatorRegistry *op_registry);
extern void Register_BatchNorm(OperatorRegistry *op_registry);
extern void Register_BatchToSpaceND(OperatorRegistry *op_registry);
extern void Register_BiasAdd(OperatorRegistry *op_registry);
......@@ -124,6 +125,7 @@ OperatorRegistry::OperatorRegistry() {
// Keep in lexicographical order
ops::Register_Activation(this);
ops::Register_AddN(this);
ops::Register_ArgMax(this);
ops::Register_BatchNorm(this);
ops::Register_BatchToSpaceND(this);
ops::Register_BiasAdd(this);
......
......@@ -157,6 +157,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
}
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
// TODO(liyin): memory blocks should not have a concept of type, but to be
// consistent with GPU, all memory blocks use float/half as the unit
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (device_type == DeviceType::GPU) {
// TODO(liuqi): refactor based on PB
......@@ -191,8 +193,15 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
auto mem_ids = op.mem_id();
int count = mem_ids.size();
for (int i = 0; i < count; ++i) {
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = dtype;
}
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype));
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
output_type));
tensor->SetSourceOpName(op.name());
if (device_type == DeviceType::GPU) {
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
......
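For context, the per-output type selection added above follows a simple fallback rule. A minimal sketch (illustrative only; it reuses the OperatorDef/DataType names from the code above and is not the actual Workspace implementation):

// Each preallocated output uses its declared output_type when one is
// present; otherwise it falls back to the net-wide default dtype,
// which keeps existing float-only models working unchanged.
DataType ResolveOutputType(const OperatorDef &op, int i, DataType dtype) {
  return i < op.output_type_size() ? op.output_type(i) : dtype;
}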
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARGMAX_H_
#define MACE_KERNELS_ARGMAX_H_
#include <algorithm>
#include <functional>
#include <limits>
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct ArgMaxFunctor {
MaceStatus operator()(const Tensor *input,
const Tensor *axis,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_CHECK(input->dim_size() > 0, "ArgMax input should not be a scalar");
MACE_CHECK(axis->dim_size() == 0, "Mace argmax only supports scalar axis");
Tensor::MappingGuard axis_guard(axis);
int axis_value = axis->data<int32_t>()[0];
if (axis_value < 0) {
axis_value += input->dim_size();
}
MACE_CHECK(axis_value == input->dim_size() - 1,
"Mace argmax only supports last dimension as axis");
std::vector<index_t> output_shape(input->dim_size() - 1);
for (index_t d = 0; d < input->dim_size() - 1; ++d) {
output_shape[d] = input->dim(d < axis_value ? d : d + 1);
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<T>();
auto output_data = output->mutable_data<int32_t>();
index_t outer_size = output->size();
index_t inner_size = input->dim(axis_value);
#pragma omp parallel for
for (index_t i = 0; i < outer_size; ++i) {
int idx = 0;
T max_value = std::numeric_limits<T>::lowest();
const T *input_ptr = input_data + i * inner_size;
for (index_t j = 0; j < inner_size; ++j) {
if (input_ptr[j] > max_value) {
max_value = input_ptr[j];
idx = j;
}
}
output_data[i] = idx;
}
return MACE_SUCCESS;
}
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_ARGMAX_H_
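A minimal standalone illustration of the flattening used by ArgMaxFunctor above, with outer_size rows of inner_size elements (plain C++; the values are borrowed from the Matrix test later in this commit):

#include <cstdint>
#include <vector>

int main() {
  // {2, 3} input, axis = -1: outer_size = 2, inner_size = 3.
  std::vector<float> input = {4.f, 5.f, 6.f, 9.f, 8.f, 7.f};
  const int64_t outer_size = 2, inner_size = 3;
  std::vector<int32_t> output(outer_size);
  for (int64_t i = 0; i < outer_size; ++i) {
    int32_t idx = 0;
    float max_value = input[i * inner_size];
    for (int64_t j = 1; j < inner_size; ++j) {
      if (input[i * inner_size + j] > max_value) {
        max_value = input[i * inner_size + j];
        idx = static_cast<int32_t>(j);
      }
    }
    output[i] = idx;  // output = {2, 0}: the row maxima are 6 and 9
  }
  return 0;
}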
......@@ -16,10 +16,10 @@
#define MACE_KERNELS_ELTWISE_H_
#include <algorithm>
#include <functional>
#include <memory>
#include <utility>
#include <vector>
#include <functional>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
......@@ -42,9 +42,12 @@ enum EltwiseType {
ABS = 7,
SQR_DIFF = 8,
POW = 9,
NONE = 10,
EQUAL = 10,
NONE = 11,
};
static bool IsLogicalType(EltwiseType type) { return type == EQUAL; }
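Logical types are why the kernels below gain a separate DstType parameter: EQUAL produces 0/1 results, which are stored as int32 because there is no bool tensor type. In isolation (illustrative only):

#include <cstdint>
int main() {
  float a[2] = {1.f, 2.f}, b[2] = {1.f, 0.f};
  int32_t out[2];
  for (int i = 0; i < 2; ++i) out[i] = a[i] == b[i];  // out = {1, 0}
  return 0;
}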
inline index_t GetIndex(const std::vector<index_t> &shape,
const std::vector<index_t> &index) {
index_t idx = 0;
......@@ -68,22 +71,19 @@ inline void IncreaseIndex(const std::vector<index_t> &shape,
}
}
inline void TensorGeneralBroadcastEltwise(const EltwiseType type,
const float *input0,
const float *input1,
const std::vector<float> &coeff,
const bool swapped,
const std::vector<index_t>
&input0_shape,
const std::vector<index_t>
&input1_shape,
const std::vector<index_t>
&output_shape,
float *output) {
const index_t output_size = std::accumulate(output_shape.begin(),
output_shape.end(),
1,
std::multiplies<index_t>());
template <typename T, typename DstType>
inline void TensorGeneralBroadcastEltwise(
const EltwiseType type,
const T *input0,
const T *input1,
const std::vector<float> &coeff,
const bool swapped,
const std::vector<index_t> &input0_shape,
const std::vector<index_t> &input1_shape,
const std::vector<index_t> &output_shape,
DstType *output) {
const index_t output_size = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<index_t>());
std::vector<index_t> out_index(output_shape.size(), 0);
switch (type) {
case SUM:
......@@ -191,19 +191,28 @@ inline void TensorGeneralBroadcastEltwise(const EltwiseType type,
}
}
break;
case EQUAL:
for (index_t i = 0; i < output_size; ++i) {
const index_t idx0 = GetIndex(input0_shape, out_index);
const index_t idx1 = GetIndex(input1_shape, out_index);
output[i] = input1[idx1] == input0[idx0];
IncreaseIndex(output_shape, &out_index);
}
break;
default:
LOG(FATAL) << "Eltwise op does not support type " << type;
}
}
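To make the broadcast index walk above concrete, here is a self-contained mirror of the two helpers plus a SUM example (hypothetical names; the values match the TensorGeneralBroadcast SUM test later in this commit):

#include <vector>
using index_t = long long;

// Size-1 dimensions are skipped, which pins them to offset 0 (broadcast).
index_t GetIndexSketch(const std::vector<index_t> &shape,
                       const std::vector<index_t> &index) {
  index_t idx = 0;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] > 1) idx = idx * shape[i] + index[i];
  }
  return idx;
}

// Odometer-style increment of the multi-dimensional output index.
void IncreaseIndexSketch(const std::vector<index_t> &shape,
                         std::vector<index_t> *index) {
  for (index_t i = static_cast<index_t>(shape.size()) - 1; i >= 0; --i) {
    ++(*index)[i];
    if ((*index)[i] >= shape[i]) (*index)[i] -= shape[i];
    else break;
  }
}

int main() {
  // input0 of shape {2, 3} broadcast-added with input1 of shape {2, 1}.
  std::vector<index_t> out_shape = {2, 3}, in1_shape = {2, 1};
  std::vector<index_t> out_index(2, 0);
  float in0[6] = {1, 2, 3, 4, 5, 6}, in1[2] = {1, 2}, out[6];
  for (int i = 0; i < 6; ++i) {
    out[i] = in0[i] + in1[GetIndexSketch(in1_shape, out_index)];
    IncreaseIndexSketch(out_shape, &out_index);
  }
  // out = {2, 3, 4, 6, 7, 8}
  return 0;
}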
template <typename T, typename DstType>
inline void TensorBroadcastEltwise(const EltwiseType type,
const float *input0,
const float *input1,
const T *input0,
const T *input1,
const std::vector<float> &coeff,
const index_t diff_size,
const index_t common_size,
const bool swapped,
float *output) {
DstType *output) {
switch (type) {
case SUM:
if (coeff.empty()) {
......@@ -333,19 +342,29 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
input0[i + d * common_size] == input1[i];
}
}
break;
default:
LOG(FATAL) << "Eltwise op does not support type " << type;
}
}
// Multiplication is costly, so we specialize the following case.
template <typename T, typename DstType>
inline void TensorEltwise(const EltwiseType type,
const float *input0,
const float *input1,
const T *input0,
const T *input1,
const std::vector<float> &coeff,
const index_t size,
const bool swapped,
float *output) {
DstType *output) {
switch (type) {
case SUM:
if (coeff.empty()) {
......@@ -445,19 +464,26 @@ inline void TensorEltwise(const EltwiseType type,
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] == input1[i];
}
break;
default:
LOG(FATAL) << "Eltwise op does not support type " << type;
}
}
// Multiplication is costly, so we specialize the following case.
template <typename T, typename DstType>
inline void TensorScalarEltwise(const EltwiseType type,
const float *input0,
const float input1,
const T *input0,
const T input1,
const std::vector<float> &coeff,
const index_t size,
const bool swapped,
float *output) {
DstType *output) {
switch (type) {
case SUM:
if (coeff.empty()) {
......@@ -556,31 +582,39 @@ inline void TensorScalarEltwise(const EltwiseType type,
for (index_t i = 0; i < size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] == input1;
}
break;
default:
LOG(FATAL) << "Eltwise op does not support type " << type;
}
}
template <typename T, typename DstType>
inline void TensorEltwisePerChannel(const EltwiseType type,
const float *input0,
const float *input1,
const T *input0,
const T *input1,
const std::vector<float> &coeff,
const index_t batch0,
const index_t batch1,
const index_t channel,
const index_t image_size,
const bool swapped,
float *output) {
DstType *output) {
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] + in1_ptr[c];
}
......@@ -594,9 +628,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] =
in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1];
......@@ -610,9 +644,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] - in1_ptr[c];
}
......@@ -622,9 +656,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in1_ptr[c] - in0_ptr[i];
}
......@@ -636,9 +670,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] * in1_ptr[c];
}
......@@ -650,9 +684,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] / in1_ptr[c];
}
......@@ -662,9 +696,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in1_ptr[c] / in0_ptr[i];
}
......@@ -676,9 +710,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]);
}
......@@ -689,9 +723,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]);
}
......@@ -702,9 +736,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f);
}
......@@ -716,9 +750,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]);
}
......@@ -728,9 +762,9 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]);
}
......@@ -750,6 +784,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] == in1_ptr[c];
}
}
}
break;
default:
LOG(FATAL) << "Eltwise op does not support type " << type;
}
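All per-channel cases above share one indexing scheme: input0 is laid out NCHW with image_size = H * W, while input1 carries one value per channel (optionally per batch). A minimal SUM-only illustration (plain C++, N = 1; not the MACE code):

#include <vector>

int main() {
  const int channel = 2, image_size = 3;
  std::vector<float> in0 = {1, 2, 3, 4, 5, 6}, in1 = {1, 2};
  std::vector<float> out(in0.size());
  for (int c = 0; c < channel; ++c)
    for (int i = 0; i < image_size; ++i)
      out[c * image_size + i] = in0[c * image_size + i] + in1[c];
  // out = {2, 3, 4, 6, 7, 8}
  return 0;
}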
......@@ -769,30 +816,17 @@ struct EltwiseFunctorBase {
};
template <DeviceType D, typename T>
struct EltwiseFunctor;
template <>
struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff,
const float value,
const float value,  // kept as float since it comes from an arg
const DataFormat data_format)
: EltwiseFunctorBase(type, coeff, value, data_format) {}
MaceStatus operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
if (input1 == nullptr) {
scalar_tensor_.Resize({});
Tensor::MappingGuard guard(&scalar_tensor_);
auto scalar_data = scalar_tensor_.mutable_data<float>();
scalar_data[0] = value_;
input1 = &scalar_tensor_;
}
template <typename DstType>
MaceStatus DoEltwise(const Tensor *input0,
const Tensor *input1,
Tensor *output) {
bool swapped = false;
if (input0->size() < input1->size()) {
std::swap(input0, input1);
......@@ -804,20 +838,16 @@ struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
static_cast<uint32_t>(input0->dim_size() - input1->dim_size());
if (data_format_ == NCHW) {
MACE_CHECK(
(input0->dim_size() == 4)
&& ((input1->dim_size() == 0)
|| (input1->dim_size() == 4
&& input1->dim(1) == input0->dim(1)
&& (input1->dim(0) == input0->dim(0)
|| input1->dim(0) == 1))
|| (input1->dim_size() == 1
&& input1->dim(0) == input0->dim(1))),
(input0->dim_size() == 4) &&
((input1->dim_size() == 0) ||
(input1->dim_size() == 4 && input1->dim(1) == input0->dim(1) &&
(input1->dim(0) == input0->dim(0) || input1->dim(0) == 1)) ||
(input1->dim_size() == 1 && input1->dim(0) == input0->dim(1))),
"only support broadcast channel dimension");
} else {
for (uint32_t i = 0; i < input1->dim_size(); ++i) {
MACE_CHECK(input0->dim(rank_diff + i) == 1
|| input1->dim(i) == 1
|| input0->dim(rank_diff + i) == input1->dim(i),
MACE_CHECK(input0->dim(rank_diff + i) == 1 || input1->dim(i) == 1 ||
input0->dim(rank_diff + i) == input1->dim(i),
"Element-Wise op only support tail dimensions broadcast");
}
}
......@@ -825,14 +855,14 @@ struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
Tensor::MappingGuard input0_guard(input0);
Tensor::MappingGuard input1_guard(input1);
const float *input0_ptr = input0->data<float>();
const float *input1_ptr = input1->data<float>();
const T *input0_ptr = input0->data<T>();
const T *input1_ptr = input1->data<T>();
if (data_format_ == NCHW && input1->dim_size() > 0 &&
input1->size() < input0->size()) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input0));
Tensor::MappingGuard output_guard(output);
float *output_ptr = output->mutable_data<float>();
DstType *output_ptr = output->mutable_data<DstType>();
TensorEltwisePerChannel(
type_, input0_ptr, input1_ptr, coeff_, input0->dim(0),
input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1),
......@@ -841,8 +871,7 @@ struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
} else {
const std::vector<index_t> &input0_shape = input0->shape();
std::vector<index_t> input1_shape(rank_diff, 1);
input1_shape.insert(input1_shape.end(),
input1->shape().begin(),
input1_shape.insert(input1_shape.end(), input1->shape().begin(),
input1->shape().end());
std::vector<index_t> output_shape(input0->dim_size(), 0);
......@@ -851,27 +880,21 @@ struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard output_guard(output);
float *output_ptr = output->mutable_data<float>();
DstType *output_ptr = output->mutable_data<DstType>();
bool need_general_broadcast = false;
for (uint32_t i = 0; i < input1->dim_size(); ++i) {
if ((input0->dim(rank_diff + i) == 1 && input1->dim(i) > 1)
|| (input0->dim(rank_diff + i) > 1 && input1->dim(i) == 1)) {
if ((input0->dim(rank_diff + i) == 1 && input1->dim(i) > 1) ||
(input0->dim(rank_diff + i) > 1 && input1->dim(i) == 1)) {
need_general_broadcast = true;
break;
}
}
if (need_general_broadcast) {
TensorGeneralBroadcastEltwise(type_,
input0_ptr,
input1_ptr,
coeff_,
swapped,
input0_shape,
input1_shape,
output_shape,
output_ptr);
TensorGeneralBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_,
swapped, input0_shape, input1_shape,
output_shape, output_ptr);
} else if (input1->size() == input0->size()) {
TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(),
swapped, output_ptr);
......@@ -891,6 +914,28 @@ struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
return MACE_SUCCESS;
}
MaceStatus operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
if (input1 == nullptr) {
scalar_tensor_.Resize({});
Tensor::MappingGuard guard(&scalar_tensor_);
auto scalar_data = scalar_tensor_.mutable_data<T>();
scalar_data[0] = static_cast<T>(value_);
input1 = &scalar_tensor_;
}
if (IsLogicalType(type_)) {
// as we do not have a bool tensor type, we use int32 instead
return DoEltwise<int32_t>(input0, input1, output);
} else {
return DoEltwise<T>(input0, input1, output);
}
}
Tensor scalar_tensor_;
};
......
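Taken together, operator() above materializes a missing second input as a one-element scalar tensor and routes logical types to an int32 destination. A hedged standalone sketch of that flow (hypothetical helper covering the EQUAL-with-scalar path only):

#include <cstdint>
#include <vector>

// Mirrors the EQUAL + scalar fallback path: scalar_value plays the
// role of scalar_tensor_, and the result type is int32_t because
// EQUAL is a logical type.
template <typename T>
std::vector<int32_t> EqualScalarSketch(const std::vector<T> &in0,
                                       T scalar_value) {
  std::vector<int32_t> out(in0.size());
  for (size_t i = 0; i < in0.size(); ++i)
    out[i] = in0[i] == scalar_value;
  return out;
}
// e.g. EqualScalarSketch<float>({1.f, 3.f, 3.f}, 3.f) returns {0, 1, 1}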
......@@ -67,6 +67,26 @@ struct StridedSliceFunctor {
const T *input_data = input->data<T>();
const int32_t *begin_indices_data = begin_indices->data<int32_t>();
const int32_t *end_indices_data = end_indices->data<int32_t>();
const int32_t *strides_data = strides->data<int32_t>();
std::vector<int32_t> pad_begin_indices(input->dim_size(), 0);
std::vector<int32_t> pad_end_indices(input->dim_size(), 0);
std::vector<int32_t> pad_strides_indices(input->dim_size(), 1);
if (begin_indices->size() < input->dim_size()) {
for (index_t i = 0; i < begin_indices->size(); ++i) {
pad_begin_indices[i] = begin_indices_data[i];
pad_end_indices[i] = end_indices_data[i];
pad_strides_indices[i] = strides_data[i];
}
for (index_t i = begin_indices->size(); i < input->dim_size(); ++i) {
pad_end_indices[i] = input->dim(i);
}
begin_indices_data = pad_begin_indices.data();
end_indices_data = pad_end_indices.data();
strides_data = pad_strides_indices.data();
}
std::vector<int32_t> slice_end_data;
if (is_slice_) {
// if this op is Slice, end_indices_data actually holds sizes, not end indices
......@@ -80,7 +100,6 @@ struct StridedSliceFunctor {
}
end_indices_data = slice_end_data.data();
}
const int32_t *strides_data = strides->data<int32_t>();
std::vector<index_t> output_shape;
std::vector<index_t> real_begin_indices(input->dim_size(), 0);
......
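The block added above lets begin/end/strides carry fewer entries than the input rank; missing trailing dimensions default to the full range with stride 1. A standalone illustration (plain C++; the shapes match the new rank-2 test later in this commit):

#include <vector>

int main() {
  std::vector<int> dims = {2, 3};
  // The caller supplies indices for dim 0 only.
  std::vector<int> begin = {0}, end = {2}, strides = {1};
  std::vector<int> pad_begin(dims.size(), 0), pad_end(dims.size(), 0),
      pad_strides(dims.size(), 1);
  for (size_t i = 0; i < begin.size(); ++i) {
    pad_begin[i] = begin[i];
    pad_end[i] = end[i];
    pad_strides[i] = strides[i];
  }
  for (size_t i = begin.size(); i < dims.size(); ++i)
    pad_end[i] = dims[i];  // take the whole trailing dimension
  // pad_begin = {0, 0}, pad_end = {2, 3}, pad_strides = {1, 1}
  return 0;
}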
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/argmax.h"
namespace mace {
namespace ops {
void Register_ArgMax(OperatorRegistry *op_registry) {
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ArgMax")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ArgMaxOp<DeviceType::CPU, float>);
}
} // namespace ops
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARGMAX_H_
#define MACE_OPS_ARGMAX_H_
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/argmax.h"
namespace mace {
namespace ops {
template<DeviceType D, class T>
class ArgMaxOp : public Operator<D, T> {
public:
ArgMaxOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(0);
const Tensor *axis = this->Input(1);
Tensor *output = this->Output(0);
return functor_(input, axis, output, future);
}
private:
kernels::ArgMaxFunctor<D, T> functor_;
MACE_OP_INPUT_TAGS(INPUT, AXIS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARGMAX_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class ArgMaxOpTest : public OpsTestBase {};
namespace {
template <DeviceType D>
void ArgMaxTest(const std::vector<index_t> &input_shape,
const std::vector<float> &input,
const std::vector<index_t> &output_shape,
const std::vector<int32_t> &output) {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>("Input", input_shape, input);
net.AddInputFromArray<D, int32_t>("axis", {}, {-1});
if (D == DeviceType::CPU) {
OpDefBuilder("ArgMax", "ArgMaxTest")
.Input("Input")
.Input("axis")
.Output("Output")
.OutputType({DT_INT32})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
} else {
MACE_NOT_IMPLEMENTED;
}
// Check
auto expected = CreateTensor<int32_t>(output_shape, output);
ExpectTensorNear<int32_t>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
TEST_F(ArgMaxOpTest, Vector) { ArgMaxTest<CPU>({3}, {-3, -1, -2}, {}, {1}); }
TEST_F(ArgMaxOpTest, Matrix) {
ArgMaxTest<CPU>({3, 3}, {4, 5, 6, 9, 8, 7, 1, 2, 3}, {3}, {2, 0, 2});
}
TEST_F(ArgMaxOpTest, HighRank) {
ArgMaxTest<CPU>({1, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
{1, 2, 2}, {2, 2, 2, 2});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -22,11 +22,11 @@
namespace mace {
namespace ops {
template <DeviceType D, typename DT>
class CastOp : public Operator<D, DT> {
template <DeviceType D, typename SrcType>
class CastOp : public Operator<D, SrcType> {
public:
CastOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, DT>(op_def, ws) {}
: Operator<D, SrcType>(op_def, ws) {}
MaceStatus Run(StatsFuture *future) override {
MACE_UNUSED(future);
......@@ -36,17 +36,16 @@ class CastOp : public Operator<D, DT> {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
auto src_dtype = input->dtype();
auto output_data = output->mutable_data<DT>();
auto dst_dtype = output->dtype();
#define MACE_CAST_COPY \
auto input_data = input->data<T>(); \
auto output_data = output->mutable_data<T>(); \
auto input_data = input->data<SrcType>(); \
for (index_t i = 0; i < output->size(); ++i) { \
output_data[i] = static_cast<DT>(input_data[i]); \
output_data[i] = static_cast<T>(input_data[i]); \
}
MACE_RUN_WITH_TYPE_ENUM(src_dtype, MACE_CAST_COPY);
MACE_RUN_WITH_TYPE_ENUM(dst_dtype, MACE_CAST_COPY);
return MACE_SUCCESS;
}
......
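Note the dispatch flip above: the copy loop is now instantiated per destination type (dst_dtype), while the input is always read as the op's SrcType. A self-contained sketch of the same idea, assuming nothing about the real MACE_RUN_WITH_TYPE_ENUM macro beyond what the expansion above shows:

#include <cstdint>

// The destination type varies per call; the source type is fixed by
// the op template parameter, mirroring CastOp<D, SrcType>::Run above.
template <typename SrcType, typename DstType>
void CastSketch(const SrcType *in, DstType *out, int n) {
  for (int i = 0; i < n; ++i) out[i] = static_cast<DstType>(in[i]);
}

int main() {
  float in[3] = {1.1f, 2.2f, 3.3f};
  int32_t out[3];
  CastSketch(in, out, 3);  // out = {1, 2, 3}
  return 0;
}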
......@@ -30,8 +30,9 @@ void TestCast(const std::vector<index_t> &input_shape,
OpsTestNet net;
OpDefBuilder("Cast", "CastTest")
.Input("Input")
.OutputType({DataTypeToEnum<DstType>::v()})
.Output("Output")
.AddIntArg("T", DataTypeToEnum<DstType>::v())
.AddIntArg("T", DataTypeToEnum<SrcType>::v())
.Finalize(net.NewOperatorDef());
// Add input data
......@@ -55,10 +56,12 @@ void TestCast(const std::vector<index_t> &input_shape,
TEST_F(CastOpTest, TestCastFromFloatToInt32) {
TestCast<float, int32_t>({1, 2, 3}, {1.1, 2.2, 3.3, 4.4, 5.5, 6.6});
TestCast<float, int32_t>({}, {3.3});
}
TEST_F(CastOpTest, TestCastFromInt32ToFloat) {
TestCast<int32_t, float>({1, 2, 3}, {1, 2, 3, 4, 5, 6});
TestCast<int32_t, float>({}, {3});
}
} // namespace test
......
......@@ -23,6 +23,11 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
.TypeConstraint<float>("T")
.Build(),
EltwiseOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::CPU)
.TypeConstraint<int32_t>("T")
.Build(),
EltwiseOp<DeviceType::CPU, int32_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
......
......@@ -23,30 +23,63 @@ namespace test {
class EltwiseOpTest : public OpsTestBase {};
namespace {
template <DeviceType D, typename T>
template <DeviceType D, typename T, typename DstType>
void SimpleScalarScalar(const kernels::EltwiseType type,
const T input,
const float x,
const DstType output) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>("Input", {}, {input});
if (D == DeviceType::CPU) {
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("Input")
.AddIntArg("T", DataTypeToEnum<T>::v())
.AddIntArg("type", static_cast<int>(type))
.AddFloatArg("value", x)
.OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
} else {
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<DstType>({}, {output});
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
}
template <DeviceType D, typename T, typename DstType>
void SimpleTensorScalar(const kernels::EltwiseType type,
const std::vector<index_t> &shape,
const std::vector<float> &input,
const std::vector<T> &input,
const float x,
const std::vector<float> &output) {
const std::vector<DstType> &output) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>("Input", shape, input);
net.AddInputFromArray<D, T>("Input", shape, input);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "TInput", NCHW);
net.TransformDataFormat<D, T>("Input", NHWC, "TInput", NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput")
.AddIntArg("T", DataTypeToEnum<T>::v())
.AddIntArg("type", static_cast<int>(type))
.AddFloatArg("value", x)
.AddIntArg("data_format", DataFormat::NCHW)
.OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
.Output("TOutput")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("TOutput", NCHW, "Output", NHWC);
net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
} else {
BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -60,44 +93,47 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImg", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ImageToBuffer<D, DstType>(&net, "OutputImg", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
}
auto expected = CreateTensor<float>(shape, output);
auto expected = CreateTensor<DstType>(shape, output);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
}
template <DeviceType D, typename T>
template <DeviceType D, typename T, typename DstType>
void SimpleTensorEltwise(const kernels::EltwiseType type,
const std::vector<index_t> &shape0,
const std::vector<float> &input0,
const std::vector<T> &input0,
const std::vector<index_t> &shape1,
const std::vector<float> &input1,
const std::vector<float> &output,
const std::vector<T> &input1,
const std::vector<DstType> &output,
const std::vector<float> &coeff = {}) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>("Input0", shape0, input0);
net.AddInputFromArray<D, float>("Input1", shape1, input1);
net.AddInputFromArray<D, T>("Input0", shape0, input0);
net.AddInputFromArray<D, T>("Input1", shape1, input1);
if (D == DeviceType::CPU) {
auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput");
auto op_builder =
OpDefBuilder("Eltwise", "EltwiseTest")
.AddIntArg("T", DataTypeToEnum<T>::v())
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.AddIntArg("data_format", DataFormat::NCHW)
.OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
.Output("TOutput");
if (shape0.size() > 1) {
net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
net.TransformDataFormat<D, T>("Input0", NHWC, "TInput0", NCHW);
op_builder.Input("TInput0");
} else {
op_builder.Input("Input0");
}
if (shape1.size() > 1) {
net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
net.TransformDataFormat<D, T>("Input1", NHWC, "TInput1", NCHW);
op_builder.Input("TInput1");
} else {
op_builder.Input("Input1");
......@@ -106,7 +142,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("TOutput", NCHW, "Output", NHWC);
net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
} else {
BufferToImage<D, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -123,42 +159,45 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImg", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ImageToBuffer<D, DstType>(&net, "OutputImg", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
}
std::vector<index_t> output_shape = shape0;
if (input0.size() < input1.size()) {
output_shape = shape1;
}
auto expected = CreateTensor<float>(output_shape, output);
auto expected = CreateTensor<DstType>(output_shape, output);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
}
template<DeviceType D, typename T>
template <DeviceType D, typename T, typename DstType>
void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
const std::vector<index_t> &shape0,
const std::vector<float> &input0,
const std::vector<T> &input0,
const std::vector<index_t> &shape1,
const std::vector<float> &input1,
const std::vector<T> &input1,
const std::vector<index_t> &output_shape,
const std::vector<float> &output,
const std::vector<DstType> &output,
const std::vector<float> &coeff = {}) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>("Input0", shape0, input0);
net.AddInputFromArray<D, float>("Input1", shape1, input1);
net.AddInputFromArray<D, T>("Input0", shape0, input0);
net.AddInputFromArray<D, T>("Input1", shape1, input1);
if (D == DeviceType::CPU) {
auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest")
.Input("Input0")
.Input("Input1")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.Output("Output");
auto op_builder =
OpDefBuilder("Eltwise", "EltwiseTest")
.AddIntArg("T", DataTypeToEnum<T>::v())
.Input("Input0")
.Input("Input1")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
.Output("Output");
op_builder.Finalize(net.NewOperatorDef());
// Run
......@@ -167,214 +206,248 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>(output_shape, output);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
auto expected = CreateTensor<DstType>(output_shape, output);
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
TEST_F(EltwiseOpTest, CPUSimpleScalarScalar) {
SimpleScalarScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUM, 1, 2, 3);
SimpleScalarScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, 1, 2, -1);
SimpleScalarScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::PROD, 1, 2, 2);
SimpleScalarScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, 1, 2, 0.5);
SimpleScalarScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::MIN, 1, 2, 1);
SimpleScalarScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::MAX, 1, 2, 2);
SimpleScalarScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::NEG, 1, 2, -1);
SimpleScalarScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::ABS, -1, 3, 1);
SimpleScalarScalar<DeviceType::CPU, int32_t, int32_t>(
kernels::EltwiseType::EQUAL, 1, 3, 0);
SimpleScalarScalar<DeviceType::CPU, int32_t, int32_t>(
kernels::EltwiseType::EQUAL, 3, 3, 1);
}
TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) {
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1, {2});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::SUB,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::PROD,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
2, {2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::DIV,
{1, 1, 2, 3}, {2, 4, 6, 8, 10, 12},
2, {1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::MIN,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::MAX,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
3, {3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::NEG,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
3, {-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::CPU, float>(
SimpleTensorScalar<DeviceType::CPU, float, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1, {2});
SimpleTensorScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
{0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 2,
{2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, {1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}, 2,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
{1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
{3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::NEG, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
{-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::SQR_DIFF,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {0, 1, 4, 9, 16, 25});
SimpleTensorScalar<DeviceType::CPU, float, float>(
kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
{0, 1, 4, 9, 16, 25});
SimpleTensorScalar<DeviceType::CPU, int32_t, int32_t>(
kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
{0, 0, 1, 0, 0, 0});
}
TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) {
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1, {2});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUB,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::PROD,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
2, {2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::DIV,
{1, 1, 2, 3}, {2, 4, 6, 8, 10, 12},
2, {1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MIN,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MAX,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
3, {3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::NEG,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
3, {-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::GPU, float>(
SimpleTensorScalar<DeviceType::GPU, float, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1, {2});
SimpleTensorScalar<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
{0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::GPU, float, float>(
kernels::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 2,
{2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::GPU, float, float>(
kernels::EltwiseType::DIV, {1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}, 2,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float, float>(
kernels::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
{1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::GPU, float, float>(
kernels::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
{3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float, float>(
kernels::EltwiseType::NEG, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
{-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::GPU, float, float>(
kernels::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SQR_DIFF,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {0, 1, 4, 9, 16, 25});
SimpleTensorScalar<DeviceType::GPU, float, float>(
kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
{0, 1, 4, 9, 16, 25});
}
TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
{1, 2, 3}, {2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
SimpleTensorEltwise<DeviceType::CPU, int32_t, int32_t>(
kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 1, 3}, {1, 2, 3}, {1, 1, 1, 0, 0, 0});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
{1, 2, 3}, {2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
SimpleTensorEltwise<DeviceType::CPU, int32_t, int32_t>(
kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
{1, 2, 3}, {1, 1, 1, 0, 0, 0});
}
TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
{1, 2, 3}, {2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
}
TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) {
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.1, 0.1});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
{1, 2, 3, 4, 5}, {0, 0, 0, 0, 0});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::CPU, float>(
SimpleTensorEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
SimpleTensorEltwise<DeviceType::CPU, int32_t, int32_t>(
kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
}
TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.1, 0.1});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
{1, 2, 3, 4, 5}, {0, 0, 0, 0, 0});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::GPU, float>(
SimpleTensorEltwise<DeviceType::GPU, float, float>(
kernels::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
......@@ -595,27 +668,30 @@ TEST_F(EltwiseOpTest, RandomTensorTensorHalf) {
}
TEST_F(EltwiseOpTest, TensorGeneralBroadcast) {
TensorGeneralBroadcastEltwise<DeviceType::CPU, float>(
TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
{1, 2}, {1, 1, 2, 3}, {2, 3, 4, 6, 7, 8});
TensorGeneralBroadcastEltwise<DeviceType::CPU, float>(
TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
{1, 2}, {1, 1, 2, 3}, {0, 1, 2, 2, 3, 4});
TensorGeneralBroadcastEltwise<DeviceType::CPU, float>(
TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {1, 2, 3, 8, 10, 12});
TensorGeneralBroadcastEltwise<DeviceType::CPU, float>(
TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
{1, 2}, {1, 1, 2, 3}, {1, 2, 3, 2, 2.5, 3});
TensorGeneralBroadcastEltwise<DeviceType::CPU, float>(
TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
{1, 2}, {1, 1, 2, 3}, {1, 1, 1, 2, 2, 2});
TensorGeneralBroadcastEltwise<DeviceType::CPU, float>(
TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
{1, 2}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
TensorGeneralBroadcastEltwise<DeviceType::CPU, float>(
TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {0, 1, 4, 4, 9, 16});
TensorGeneralBroadcastEltwise<DeviceType::CPU, int32_t, int32_t>(
kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {1, 0, 0, 0, 0, 0});
}
} // namespace test
......
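The eltwise test helpers above gained a second data-type template argument because EQUAL consumes numeric inputs but produces integer outputs. A minimal NumPy sketch of the comparison semantics the new tests exercise (illustrative only, not the MACE kernel):

import numpy as np

def eltwise_equal(a, b):
    # Broadcast the inputs against each other, then materialize the boolean
    # mask as int32 -- the reason the helpers take a separate output type.
    return np.equal(a, b).astype(np.int32)

x = np.arange(1, 7, dtype=np.int32).reshape(1, 1, 2, 3)
y = np.array([1, 2], dtype=np.int32).reshape(1, 1, 2, 1)
print(eltwise_equal(x, x).flatten())  # [1 1 1 1 1 1], as in the simple test
print(eltwise_equal(x, y).flatten())  # [1 0 0 0 0 0], as in the broadcast test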
......@@ -23,6 +23,11 @@ void Register_Identity(OperatorRegistry *op_registry) {
.TypeConstraint<float>("T")
.Build(),
IdentityOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity")
.Device(DeviceType::CPU)
.TypeConstraint<int32_t>("T")
.Build(),
IdentityOp<DeviceType::CPU, int32_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity")
......
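Registering a second Identity kernel constrained to int32_t lets integer tensors (for example, the indices produced by the new ArgMax op) pass through the graph unchanged. A toy sketch of type-constrained registration; the names here are hypothetical, not the MACE registry API:

OP_REGISTRY = {}

def register(name, device, dtype, kernel):
    # One op name may map to several kernels, keyed by (device, dtype).
    OP_REGISTRY[(name, device, dtype)] = kernel

register("Identity", "CPU", "float32", lambda x: x)
register("Identity", "CPU", "int32", lambda x: x)  # the newly added variant

print(OP_REGISTRY[("Identity", "CPU", "int32")]([1, 2, 3]))  # [1, 2, 3]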
......@@ -38,12 +38,12 @@ void TestStridedSlice(const std::vector<index_t> &input_shape,
OpsTestNet net;
net.AddInputFromArray<CPU, float>("Input", input_shape, input);
net.AddInputFromArray<CPU, int32_t>(
"BeginIndices", {static_cast<int32_t>(input_shape.size())},
"BeginIndices", {static_cast<int32_t>(begin_indices.size())},
begin_indices);
net.AddInputFromArray<CPU, int32_t>(
"EndIndices", {static_cast<int32_t>(input_shape.size())}, end_indices);
"EndIndices", {static_cast<int32_t>(end_indices.size())}, end_indices);
net.AddInputFromArray<CPU, int32_t>(
"Strides", {static_cast<int32_t>(input_shape.size())}, strides);
"Strides", {static_cast<int32_t>(strides.size())}, strides);
OpDefBuilder("StridedSlice", "StridedSliceOpTest")
.Input("Input")
......@@ -130,6 +130,8 @@ TEST_F(StridedSliceOpTest, TestStridedSliceRank1) {
TEST_F(StridedSliceOpTest, TestStridedSliceRank2) {
TestStridedSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0, 0}, {2, 3}, {1, 1}, 0, 0, 0,
0, 0, {2, 3}, {1, 2, 3, 4, 5, 6});
TestStridedSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0}, {2}, {1}, 0, 0, 0,
0, 0, {2, 3}, {1, 2, 3, 4, 5, 6});
TestStridedSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1}, {2, 3}, {1, 1}, 0, 0, 0,
0, 0, {1, 2}, {5, 6});
TestStridedSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0, 0}, {2, 3}, {1, 2}, 0, 0, 0,
......
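The shape fix above matters for the new rank-1 test case: BeginIndices/EndIndices/Strides may carry fewer entries than the input rank, so sizing those tensors by input_shape.size() would mis-declare them. A NumPy sketch of slicing with fewer index triples than dimensions (illustrative semantics, not the MACE op):

import numpy as np

def strided_slice(x, begin, end, strides):
    # Dimensions without an explicit (begin, end, stride) triple are kept whole.
    return x[tuple(slice(b, e, s) for b, e, s in zip(begin, end, strides))]

x = np.arange(1, 7).reshape(2, 3)
print(strided_slice(x, [0], [2], [1]))  # [[1 2 3] [4 5 6]], as in the rank-2 test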
......@@ -66,11 +66,13 @@ class EltwiseType(Enum):
ABS = 7
SQR_DIFF = 8
POW = 9
EQUAL = 10
MaceSupportedOps = [
'Activation',
'AddN',
'ArgMax',
'BatchNorm',
'BatchToSpaceND',
'BiasAdd',
......
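EQUAL extends the converter-side eltwise enum; its numeric value must stay in lockstep with the kernel-side enum, since the integer is what gets stored in the op's type argument (as the type_arg.i comparisons further down suggest). A trimmed sketch, with the earlier members elided:

from enum import Enum

class EltwiseType(Enum):
    # ... members 0-6 elided ...
    ABS = 7
    SQR_DIFF = 8
    POW = 9
    EQUAL = 10  # must match the C++ kernels' EltwiseType::EQUAL

print(EltwiseType.EQUAL.value)  # 10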
......@@ -62,6 +62,7 @@ TFSupportedOps = [
'Square',
'SquaredDifference',
'Rsqrt',
'Equal',
'Relu',
'Relu6',
'Tanh',
......@@ -93,6 +94,7 @@ TFSupportedOps = [
'Stack',
'Pack',
'Cast',
'ArgMax',
]
TFOpType = Enum('TFOpType', [(op, op) for op in TFSupportedOps], type=str)
......@@ -125,7 +127,8 @@ class TensorflowConverter(base_converter.ConverterInterface):
TFOpType.RealDiv.name: EltwiseType.DIV,
TFOpType.SquaredDifference.name: EltwiseType.SQR_DIFF,
TFOpType.Square.name: EltwiseType.POW,
TFOpType.Rsqrt.name: EltwiseType.POW
TFOpType.Rsqrt.name: EltwiseType.POW,
TFOpType.Equal.name: EltwiseType.EQUAL,
}
activation_type = {
TFOpType.Relu.name: ActivationType.RELU,
......@@ -153,6 +156,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
TFOpType.SquaredDifference.name: self.convert_elementwise,
TFOpType.Square.name: self.convert_elementwise,
TFOpType.Rsqrt.name: self.convert_elementwise,
TFOpType.Equal.name: self.convert_elementwise,
TFOpType.Relu.name: self.convert_activation,
TFOpType.Relu6.name: self.convert_activation,
TFOpType.Tanh.name: self.convert_activation,
......@@ -183,7 +187,8 @@ class TensorflowConverter(base_converter.ConverterInterface):
TFOpType.Slice.name: self.convert_slice,
TFOpType.Pack.name: self.convert_stack,
TFOpType.Stack.name: self.convert_stack,
TFOpType.Cast.name: self.convert_cast
TFOpType.Cast.name: self.convert_cast,
TFOpType.ArgMax.name: self.convert_argmax,
}
self._option = option
self._mace_net_def = mace_pb2.NetDef()
......@@ -376,18 +381,29 @@ class TensorflowConverter(base_converter.ConverterInterface):
if type_arg.i != EltwiseType.NEG.value \
and type_arg.i != EltwiseType.ABS.value:
if len(tf_op.inputs[0].shape) == 0:
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_value_str
value_arg.f = tf_op.inputs[0].eval().astype(np.float32)
self._skip_tensor.add(tf_op.inputs[0].name)
del op.input[0]
elif len(tf_op.inputs) > 1 and len(tf_op.inputs[1].shape) == 0:
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_value_str
value_arg.f = tf_op.inputs[1].eval().astype(np.float32)
self._skip_tensor.add(tf_op.inputs[1].name)
del op.input[1]
try:
def is_commutative(eltwise_type):
return EltwiseType(eltwise_type) in [
EltwiseType.SUM, EltwiseType.PROD,
EltwiseType.MAX, EltwiseType.MIN]
if len(tf_op.inputs) > 1 and len(tf_op.inputs[1].shape) == 0:
scalar = tf_op.inputs[1].eval().astype(np.float32)
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_value_str
value_arg.f = scalar
self._skip_tensor.add(tf_op.inputs[1].name)
del op.input[1]
elif len(tf_op.inputs[0].shape) == 0 and \
is_commutative(type_arg.i):
scalar = tf_op.inputs[0].eval().astype(np.float32)
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_value_str
value_arg.f = scalar
self._skip_tensor.add(tf_op.inputs[0].name)
del op.input[0]
except tf.errors.InvalidArgumentError:
pass
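The rewritten scalar folding above does two things: it wraps the constant evaluation in try/except (InvalidArgumentError is presumably raised when the scalar input is not an evaluable constant, so such graphs no longer abort conversion), and it only folds a scalar first operand for commutative types, because a single value argument cannot encode operand order. A NumPy illustration of why SUB/DIV are excluded:

import numpy as np

t = np.array([1.0, 2.0, 3.0])
print(5.0 - t)  # [ 4.  3.  2.]
print(t - 5.0)  # [-4. -3. -2.]  -> order matters, unlike SUM/PROD/MIN/MAX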
def convert_biasadd(self, tf_op):
op = self.convert_general_op(tf_op)
......@@ -550,7 +566,13 @@ class TensorflowConverter(base_converter.ConverterInterface):
transpose_a_arg.name = MaceKeyword.mace_transpose_a_str
transpose_a_arg.i = int(adj_x)
except ValueError:
pass
try:
transpose_a = tf_op.get_attr('transpose_a')
transpose_a_arg = op.arg.add()
transpose_a_arg.name = MaceKeyword.mace_transpose_a_str
transpose_a_arg.i = int(transpose_a)
except ValueError:
pass
try:
adj_y = tf_op.get_attr('adj_y')
......@@ -558,7 +580,13 @@ class TensorflowConverter(base_converter.ConverterInterface):
transpose_b_arg.name = MaceKeyword.mace_transpose_b_str
transpose_b_arg.i = int(adj_y)
except ValueError:
pass
try:
transpose_b = tf_op.get_attr('transpose_b')
transpose_b_arg = op.arg.add()
transpose_b_arg.name = MaceKeyword.mace_transpose_b_str
transpose_b_arg.i = int(transpose_b)
except ValueError:
pass
def convert_shape(self, tf_op):
op = self.convert_general_op(tf_op)
......@@ -689,14 +717,18 @@ class TensorflowConverter(base_converter.ConverterInterface):
op = self.convert_general_op(tf_op)
op.type = MaceOp.Cast.name
data_type_arg = ConverterUtil.get_arg(op, 'T')
try:
dtype = tf_op.get_attr('DstT')
if dtype == tf.int32:
data_type_arg.i = mace_pb2.DT_INT32
op.output_type.extend([mace_pb2.DT_INT32])
elif dtype == tf.float32:
data_type_arg.i = self._option.data_type
op.output_type.extend([self._option.data_type])
else:
mace_check(False, "data type %s not supported" % dtype)
except ValueError:
data_type_arg.i = self._option.data_type
op.output_type.extend([self._option.data_type])
def convert_argmax(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.ArgMax.name
op.output_type.extend([mace_pb2.DT_INT32])
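convert_argmax pins the op's output type to DT_INT32, consistent with ArgMax producing indices rather than values. The NumPy analogue:

import numpy as np

x = np.array([[0.1, 0.9, 0.3],
              [0.7, 0.2, 0.5]], dtype=np.float32)
print(np.argmax(x, axis=1))  # [1 0] -- integer indices, stored as int32 here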
......@@ -48,6 +48,10 @@ namespace str_util {
std::vector<std::string> Split(const std::string &str, char delims) {
std::vector<std::string> result;
if (str.empty()) {
result.push_back("");
return result;
}
std::string tmp = str;
while (!tmp.empty()) {
size_t next_offset = tmp.find(delims);
......
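With the early return above, Split("") yields a single empty token instead of an empty vector, so callers that expect at least one field no longer see an empty result (presumably the motivation for the change). Python's str.split with an explicit separator follows the same convention:

print("".split(","))     # [''] -- one empty field, matching the fixed Split
print("a,b".split(","))  # ['a', 'b']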
......@@ -773,11 +773,17 @@ def tuning_run(abi,
(phone_data_dir, os.path.basename(opencl_binary_file)),
])
adb_cmd = ' '.join(adb_cmd)
adb_cmd_file = "%s/%s" % (phone_data_dir, 'cmd_file')
with open('/tmp/mace_cmd_file', 'w') as cmd_file:
cmd_file.write(adb_cmd)
adb_push('/tmp/mace_cmd_file', adb_cmd_file, serialno)
sh.adb(
"-s",
serialno,
"shell",
adb_cmd,
"sh",
adb_cmd_file,
_tty_in=True,
_out=process_output,
_err_to_out=True)
......@@ -1159,10 +1165,7 @@ def benchmark_model(abi,
phone_data_dir,
serialno)
sh.adb(
"-s",
serialno,
"shell",
adb_cmd = [
"LD_LIBRARY_PATH=%s" % phone_data_dir,
"MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
"MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
......@@ -1185,6 +1188,19 @@ def benchmark_model(abi,
"--model_file=%s" % mace_model_phone_path,
"--opencl_binary_file=%s/%s" %
(phone_data_dir, os.path.basename(opencl_binary_file)),
]
adb_cmd = ' '.join(adb_cmd)
adb_cmd_file = "%s/%s" % (phone_data_dir, 'cmd_file')
with open('/tmp/mace_cmd_file', 'w') as cmd_file:
cmd_file.write(adb_cmd)
adb_push('/tmp/mace_cmd_file', adb_cmd_file, serialno)
sh.adb(
"-s",
serialno,
"shell",
"sh",
adb_cmd_file,
_fg=True)
print("Benchmark done!\n")
......
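Both tuning_run and benchmark_model now write the full command line to a file, push it to the device, and execute it with sh, rather than passing one long string to adb shell; a likely motivation is sidestepping argument-length and quoting pitfalls. A standalone sketch of the pattern, where the paths, default directory, and helper name are illustrative:

import subprocess

def run_via_cmd_file(serialno, args, phone_data_dir="/data/local/tmp/mace_run"):
    local_path = "/tmp/mace_cmd_file"
    remote_path = "%s/cmd_file" % phone_data_dir
    with open(local_path, "w") as f:
        f.write(" ".join(args))  # the whole env-var + binary invocation
    subprocess.check_call(["adb", "-s", serialno, "push", local_path, remote_path])
    subprocess.check_call(["adb", "-s", serialno, "shell", "sh", remote_path])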