Commit d6adf881, authored by 李寅

Implement ArgMax and Equal ops;

Revise memory allocation logic to make non-float output possible.
Parent 7ac05858
@@ -7,22 +7,25 @@ Operator lists
    :header: "Operator","Android NN","Supported","Remark"

     "AVERAGE_POOL_2D","Y","Y",""
+    "ARGMAX","","Y","Only CPU and TensorFlow are supported"
     "BATCH_NORM","","Y","Fusion with activation is supported"
     "BATCH_TO_SPACE_ND","Y","Y",""
     "BIAS_ADD","","Y",""
+    "CAST","","Y","Only CPU and TensorFlow models are supported"
     "CHANNEL_SHUFFLE","","Y",""
     "CONCATENATION","Y","Y","Only channel-axis concatenation is supported"
     "CONV_2D","Y","Y","Fusion with BN and activation layer is supported"
-    "DECONV_2D","N","Y","Only tensorflow model is supported"
+    "DECONV_2D","","Y","Only TensorFlow models are supported"
     "DEPTHWISE_CONV_2D","Y","Y","Only multiplier = 1 is supported; fusion is supported"
     "DEPTH_TO_SPACE","Y","Y",""
     "DEQUANTIZE","Y","Y","Model quantization will be supported later"
-    "ELEMENT_WISE","Y","Y","ADD/MUL/DIV/MIN/MAX/NEG/ABS/SQR_DIFF/POW"
+    "ELEMENT_WISE","Y","Y","ADD/MUL/DIV/MIN/MAX/NEG/ABS/SQR_DIFF/POW/RSQRT/EQUAL"
-    "EMBEDDING_LOOKUP","Y","",""
+    "EMBEDDING_LOOKUP","Y","Y","Only channel-axis concatenation is supported"
     "FLOOR","Y","",""
     "FULLY_CONNECTED","Y","Y",""
     "GROUP_CONV_2D","","","Caffe models with group count = channel count are supported"
     "HASHTABLE_LOOKUP","Y","",""
+    "IDENTITY","","Y","Only TensorFlow models are supported"
     "L2_NORMALIZATION","Y","",""
     "L2_POOL_2D","Y","",""
     "LOCAL_RESPONSE_NORMALIZATION","Y","Y",""
@@ -31,9 +34,10 @@ Operator lists
     "LSTM","Y","",""
     "MATMUL","","Y",""
     "MAX_POOL_2D","Y","Y",""
-    "PAD", "N","Y",""
+    "PAD", "Y","Y",""
     "PSROI_ALIGN","","Y",""
     "PRELU","","Y","Only Caffe models are supported"
+    "REDUCE_MEAN","Y","Y","Only TensorFlow models are supported"
     "RELU","Y","Y",""
     "RELU1","Y","Y",""
     "RELU6","Y","Y",""
@@ -42,9 +46,14 @@ Operator lists
     "RESIZE_BILINEAR","Y","Y",""
     "RNN","Y","",""
     "RPN_PROPOSAL_LAYER","","Y",""
+    "SHAPE","","Y","Only CPU and TensorFlow are supported"
+    "STACK","","Y","Only CPU and TensorFlow are supported"
+    "STRIDEDSLICE","Y","Y","Only CPU and TensorFlow are supported"
-    "SLICE","N","Y","Only support channel axis slice"
+    "SLICE","","Y","In TensorFlow this op is equivalent to SPLIT; only channel-axis slicing is supported"
     "SOFTMAX","Y","Y",""
     "SPACE_TO_BATCH_ND","Y", "Y",""
     "SPACE_TO_DEPTH","Y","Y",""
+    "SQUEEZE","Y","Y","Only CPU and TensorFlow are supported"
     "SVDF","Y","",""
     "TANH","Y","Y",""
+    "TRANSPOSE","Y","Y","Only CPU and TensorFlow are supported"
@@ -264,7 +264,6 @@ MaceStatus MaceEngine::Impl::Run(
    auto shape = output_tensor->shape();
    int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
                                          std::multiplies<int64_t>());
-   MACE_CHECK(!shape.empty()) << "Output's shape must greater than 0";
    MACE_CHECK(shape == output.second.shape())
        << "Output shape mismatch: "
        << MakeString<int64_t>(output.second.shape())
@@ -76,6 +76,7 @@ namespace ops {
// Keep in lexicographical order
extern void Register_Activation(OperatorRegistry *op_registry);
extern void Register_AddN(OperatorRegistry *op_registry);
+extern void Register_ArgMax(OperatorRegistry *op_registry);
extern void Register_BatchNorm(OperatorRegistry *op_registry);
extern void Register_BatchToSpaceND(OperatorRegistry *op_registry);
extern void Register_BiasAdd(OperatorRegistry *op_registry);
@@ -124,6 +125,7 @@ OperatorRegistry::OperatorRegistry() {
  // Keep in lexicographical order
  ops::Register_Activation(this);
  ops::Register_AddN(this);
+ ops::Register_ArgMax(this);
  ops::Register_BatchNorm(this);
  ops::Register_BatchToSpaceND(this);
  ops::Register_BiasAdd(this);
@@ -157,6 +157,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
      }
    }
    MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
+   // TODO(liyin): memory blocks should not have a concept of data type, but to
+   // be consistent with the GPU path, all memory blocks use float/half as the unit.
    for (auto &mem_block : net_def.mem_arena().mem_block()) {
      if (device_type == DeviceType::GPU) {
        // TODO(liuqi): refactor based on PB
@@ -191,8 +193,15 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
      auto mem_ids = op.mem_id();
      int count = mem_ids.size();
      for (int i = 0; i < count; ++i) {
+       DataType output_type;
+       if (i < op.output_type_size()) {
+         output_type = op.output_type(i);
+       } else {
+         output_type = dtype;
+       }
        std::unique_ptr<Tensor> tensor
-           (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype));
+           (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
+                       output_type));
        tensor->SetSourceOpName(op.name());
        if (device_type == DeviceType::GPU) {
          VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
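To make the new per-output data type concrete, here is a minimal sketch (not part of the commit; ResolveOutputType is a hypothetical helper name) of the rule the hunk above implements: prefer the dtype recorded by the converter in op.output_type(i), and otherwise fall back to the model-wide default float/half type.

#include "mace/core/operator.h"  // declares OperatorDef and DataType, as used above

namespace mace {

DataType ResolveOutputType(const OperatorDef &op, int i, DataType default_dtype) {
  // Ops such as Cast and ArgMax (see the converter changes below) record an
  // explicit per-output dtype; other ops leave output_type empty and get the
  // default data type.
  return i < op.output_type_size() ? op.output_type(i) : default_dtype;
}

}  // namespace mace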
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_KERNELS_ARGMAX_H_
#define MACE_KERNELS_ARGMAX_H_

#include <algorithm>
#include <functional>
#include <limits>
#include <memory>
#include <vector>

#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

template <DeviceType D, typename T>
struct ArgMaxFunctor {
  MaceStatus operator()(const Tensor *input,
                        const Tensor *axis,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);
    MACE_CHECK(input->dim_size() > 0, "ArgMax input should not be a scalar");
    MACE_CHECK(axis->dim_size() == 0, "Mace argmax only supports scalar axis");
    Tensor::MappingGuard axis_guard(axis);
    int axis_value = axis->data<int32_t>()[0];
    if (axis_value < 0) {
      axis_value += input->dim_size();
    }
    MACE_CHECK(axis_value == input->dim_size() - 1,
               "Mace argmax only supports last dimension as axis");

    std::vector<index_t> output_shape(input->dim_size() - 1);
    for (index_t d = 0; d < input->dim_size() - 1; ++d) {
      output_shape[d] = input->dim(d < axis_value ? d : d + 1);
    }
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    auto input_data = input->data<T>();
    auto output_data = output->mutable_data<int32_t>();

    index_t outer_size = output->size();
    index_t inner_size = input->dim(axis_value);

#pragma omp parallel for
    for (index_t i = 0; i < outer_size; ++i) {
      int idx = 0;
      T max_value = std::numeric_limits<T>::lowest();
      const T *input_ptr = input_data + i * inner_size;
      for (index_t j = 0; j < inner_size; ++j) {
        if (input_ptr[j] > max_value) {
          max_value = input_ptr[j];
          idx = j;
        }
      }
      output_data[i] = idx;
    }

    return MACE_SUCCESS;
  }
};

}  // namespace kernels
}  // namespace mace

#endif  // MACE_KERNELS_ARGMAX_H_
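For reference, a minimal standalone check of the last-axis arg-max semantics the functor above implements (plain C++ with no MACE types; the data mirrors the first two rows of the Matrix unit test further below):

#include <cassert>
#include <cstddef>

int main() {
  // Two rows of three values; the arg max over the last axis yields one
  // index per row, exactly like the flattened outer/inner loops above.
  const float input[2][3] = {{4, 5, 6}, {9, 8, 7}};
  int output[2];
  for (std::size_t i = 0; i < 2; ++i) {
    std::size_t idx = 0;
    for (std::size_t j = 1; j < 3; ++j) {
      if (input[i][j] > input[i][idx]) idx = j;
    }
    output[i] = static_cast<int>(idx);
  }
  assert(output[0] == 2);  // max of {4, 5, 6} is at index 2
  assert(output[1] == 0);  // max of {9, 8, 7} is at index 0
  return 0;
}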
This diff is collapsed.
@@ -67,6 +67,26 @@ struct StridedSliceFunctor {
    const T *input_data = input->data<T>();
    const int32_t *begin_indices_data = begin_indices->data<int32_t>();
    const int32_t *end_indices_data = end_indices->data<int32_t>();
+   const int32_t *strides_data = strides->data<int32_t>();
+   std::vector<int32_t> pad_begin_indices(input->dim_size(), 0);
+   std::vector<int32_t> pad_end_indices(input->dim_size(), 0);
+   std::vector<int32_t> pad_strides_indices(input->dim_size(), 1);
+
+   if (begin_indices->size() < input->dim_size()) {
+     for (index_t i = 0; i < begin_indices->size(); ++i) {
+       pad_begin_indices[i] = begin_indices_data[i];
+       pad_end_indices[i] = end_indices_data[i];
+       pad_strides_indices[i] = strides_data[i];
+     }
+     for (index_t i = begin_indices->size(); i < input->dim_size(); ++i) {
+       pad_end_indices[i] = input->dim(i);
+     }
+     begin_indices_data = pad_begin_indices.data();
+     end_indices_data = pad_end_indices.data();
+     strides_data = pad_strides_indices.data();
+   }
+
    std::vector<int32_t> slice_end_data;
    if (is_slice_) {
      // if this op is Slice, end_indices_data actually holds the sizes
@@ -80,7 +100,6 @@ struct StridedSliceFunctor {
      }
      end_indices_data = slice_end_data.data();
    }
-   const int32_t *strides_data = strides->data<int32_t>();

    std::vector<index_t> output_shape;
    std::vector<index_t> real_begin_indices(input->dim_size(), 0);
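A short sketch of the rank-padding rule added above (PadSliceParams is a hypothetical free function, not MACE API): when begin/end/strides carry fewer entries than the input rank, the missing trailing dimensions default to begin 0, end = dim size, stride 1, which is what the new rank-1-parameters case in the StridedSlice test below relies on.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the padding above: extend begin/end/strides
// with "take the whole dimension" defaults until they match the input rank.
void PadSliceParams(const std::vector<int64_t> &input_dims,
                    std::vector<int32_t> *begin,
                    std::vector<int32_t> *end,
                    std::vector<int32_t> *strides) {
  for (std::size_t i = begin->size(); i < input_dims.size(); ++i) {
    begin->push_back(0);                                  // start of dimension i
    end->push_back(static_cast<int32_t>(input_dims[i]));  // full extent
    strides->push_back(1);                                // unit stride
  }
}

// For an input of shape {2, 3} with begin={0}, end={2}, strides={1}, the
// padded parameters become begin={0, 0}, end={2, 3}, strides={1, 1}, i.e.
// the whole input is selected.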
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/argmax.h"

namespace mace {
namespace ops {

void Register_ArgMax(OperatorRegistry *op_registry) {
  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ArgMax")
                                          .Device(DeviceType::CPU)
                                          .TypeConstraint<float>("T")
                                          .Build(),
                         ArgMaxOp<DeviceType::CPU, float>);
}

}  // namespace ops
}  // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_ARGMAX_H_
#define MACE_OPS_ARGMAX_H_

#include <vector>

#include "mace/core/operator.h"
#include "mace/kernels/argmax.h"

namespace mace {
namespace ops {

template <DeviceType D, class T>
class ArgMaxOp : public Operator<D, T> {
 public:
  ArgMaxOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<D, T>(operator_def, ws) {}

  MaceStatus Run(StatsFuture *future) override {
    const Tensor *input = this->Input(0);
    const Tensor *axis = this->Input(1);
    Tensor *output = this->Output(0);
    return functor_(input, axis, output, future);
  }

 private:
  kernels::ArgMaxFunctor<D, T> functor_;

  MACE_OP_INPUT_TAGS(INPUT, AXIS);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};

}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_ARGMAX_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"

namespace mace {
namespace ops {
namespace test {

class ArgMaxOpTest : public OpsTestBase {};

namespace {
template <DeviceType D>
void ArgMaxTest(const std::vector<index_t> &input_shape,
                const std::vector<float> &input,
                const std::vector<index_t> &output_shape,
                const std::vector<int32_t> &output) {
  OpsTestNet net;

  // Add input data
  net.AddInputFromArray<D, float>("Input", input_shape, input);
  net.AddInputFromArray<D, int32_t>("axis", {}, {-1});

  if (D == DeviceType::CPU) {
    OpDefBuilder("ArgMax", "ArgMaxTest")
        .Input("Input")
        .Input("axis")
        .Output("Output")
        .OutputType({DT_INT32})
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
  } else {
    MACE_NOT_IMPLEMENTED;
  }

  // Check
  auto expected = CreateTensor<int32_t>(output_shape, output);
  ExpectTensorNear<int32_t>(*expected, *net.GetOutput("Output"), 1e-5);
}
}  // namespace

TEST_F(ArgMaxOpTest, Vector) { ArgMaxTest<CPU>({3}, {-3, -1, -2}, {}, {1}); }

TEST_F(ArgMaxOpTest, Matrix) {
  ArgMaxTest<CPU>({3, 3}, {4, 5, 6, 9, 8, 7, 1, 2, 3}, {3}, {2, 0, 2});
}

TEST_F(ArgMaxOpTest, HighRank) {
  ArgMaxTest<CPU>({1, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
                  {1, 2, 2}, {2, 2, 2, 2});
}

}  // namespace test
}  // namespace ops
}  // namespace mace
@@ -22,11 +22,11 @@
namespace mace {
namespace ops {

-template <DeviceType D, typename DT>
-class CastOp : public Operator<D, DT> {
+template <DeviceType D, typename SrcType>
+class CastOp : public Operator<D, SrcType> {
 public:
  CastOp(const OperatorDef &op_def, Workspace *ws)
-     : Operator<D, DT>(op_def, ws) {}
+     : Operator<D, SrcType>(op_def, ws) {}

  MaceStatus Run(StatsFuture *future) override {
    MACE_UNUSED(future);
@@ -36,17 +36,16 @@ class CastOp : public Operator<D, DT> {
    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
-   auto src_dtype = input->dtype();
-   auto output_data = output->mutable_data<DT>();
+   auto dst_dtype = output->dtype();

#define MACE_CAST_COPY \
-   auto input_data = input->data<T>(); \
+   auto output_data = output->mutable_data<T>(); \
+   auto input_data = input->data<SrcType>(); \
    for (index_t i = 0; i < output->size(); ++i) { \
-     output_data[i] = static_cast<DT>(input_data[i]); \
+     output_data[i] = static_cast<T>(input_data[i]); \
    }

-   MACE_RUN_WITH_TYPE_ENUM(src_dtype, MACE_CAST_COPY);
+   MACE_RUN_WITH_TYPE_ENUM(dst_dtype, MACE_CAST_COPY);

    return MACE_SUCCESS;
  }
@@ -30,8 +30,9 @@ void TestCast(const std::vector<index_t> &input_shape,
  OpsTestNet net;
  OpDefBuilder("Cast", "CastTest")
      .Input("Input")
+     .OutputType({DataTypeToEnum<DstType>::v()})
      .Output("Output")
-     .AddIntArg("T", DataTypeToEnum<DstType>::v())
+     .AddIntArg("T", DataTypeToEnum<SrcType>::v())
      .Finalize(net.NewOperatorDef());

  // Add input data
@@ -55,10 +56,12 @@ void TestCast(const std::vector<index_t> &input_shape,
TEST_F(CastOpTest, TestCastFromFloatToInt32) {
  TestCast<float, int32_t>({1, 2, 3}, {1.1, 2.2, 3.3, 4.4, 5.5, 6.6});
+ TestCast<float, int32_t>({}, {3.3});
}

TEST_F(CastOpTest, TestCastFromInt32ToFloat) {
  TestCast<int32_t, float>({1, 2, 3}, {1, 2, 3, 4, 5, 6});
+ TestCast<int32_t, float>({}, {3});
}

}  // namespace test
@@ -23,6 +23,11 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
                                          .TypeConstraint<float>("T")
                                          .Build(),
                         EltwiseOp<DeviceType::CPU, float>);
+ MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
+                                         .Device(DeviceType::CPU)
+                                         .TypeConstraint<int32_t>("T")
+                                         .Build(),
+                        EltwiseOp<DeviceType::CPU, int32_t>);

#ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
This diff is collapsed.
@@ -23,6 +23,11 @@ void Register_Identity(OperatorRegistry *op_registry) {
                                          .TypeConstraint<float>("T")
                                          .Build(),
                         IdentityOp<DeviceType::CPU, float>);
+ MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity")
+                                         .Device(DeviceType::CPU)
+                                         .TypeConstraint<int32_t>("T")
+                                         .Build(),
+                        IdentityOp<DeviceType::CPU, int32_t>);

#ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity")
@@ -38,12 +38,12 @@ void TestStridedSlice(const std::vector<index_t> &input_shape,
  OpsTestNet net;
  net.AddInputFromArray<CPU, float>("Input", input_shape, input);
  net.AddInputFromArray<CPU, int32_t>(
-     "BeginIndices", {static_cast<int32_t>(input_shape.size())},
+     "BeginIndices", {static_cast<int32_t>(begin_indices.size())},
      begin_indices);
  net.AddInputFromArray<CPU, int32_t>(
-     "EndIndices", {static_cast<int32_t>(input_shape.size())}, end_indices);
+     "EndIndices", {static_cast<int32_t>(end_indices.size())}, end_indices);
  net.AddInputFromArray<CPU, int32_t>(
-     "Strides", {static_cast<int32_t>(input_shape.size())}, strides);
+     "Strides", {static_cast<int32_t>(strides.size())}, strides);

  OpDefBuilder("StridedSlice", "StridedSliceOpTest")
      .Input("Input")
@@ -130,6 +130,8 @@ TEST_F(StridedSliceOpTest, TestStridedSliceRank1) {
TEST_F(StridedSliceOpTest, TestStridedSliceRank2) {
  TestStridedSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0, 0}, {2, 3}, {1, 1}, 0, 0, 0,
                   0, 0, {2, 3}, {1, 2, 3, 4, 5, 6});
+ TestStridedSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0}, {2}, {1}, 0, 0, 0,
+                  0, 0, {2, 3}, {1, 2, 3, 4, 5, 6});
  TestStridedSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1}, {2, 3}, {1, 1}, 0, 0, 0,
                   0, 0, {1, 2}, {5, 6});
  TestStridedSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0, 0}, {2, 3}, {1, 2}, 0, 0, 0,
@@ -66,11 +66,13 @@ class EltwiseType(Enum):
    ABS = 7
    SQR_DIFF = 8
    POW = 9
+   EQUAL = 10


MaceSupportedOps = [
    'Activation',
    'AddN',
+   'ArgMax',
    'BatchNorm',
    'BatchToSpaceND',
    'BiasAdd',
@@ -62,6 +62,7 @@ TFSupportedOps = [
    'Square',
    'SquaredDifference',
    'Rsqrt',
+   'Equal',
    'Relu',
    'Relu6',
    'Tanh',
@@ -93,6 +94,7 @@ TFSupportedOps = [
    'Stack',
    'Pack',
    'Cast',
+   'ArgMax',
]

TFOpType = Enum('TFOpType', [(op, op) for op in TFSupportedOps], type=str)
@@ -125,7 +127,8 @@ class TensorflowConverter(base_converter.ConverterInterface):
            TFOpType.RealDiv.name: EltwiseType.DIV,
            TFOpType.SquaredDifference.name: EltwiseType.SQR_DIFF,
            TFOpType.Square.name: EltwiseType.POW,
-           TFOpType.Rsqrt.name: EltwiseType.POW
+           TFOpType.Rsqrt.name: EltwiseType.POW,
+           TFOpType.Equal.name: EltwiseType.EQUAL,
        }
        activation_type = {
            TFOpType.Relu.name: ActivationType.RELU,
@@ -153,6 +156,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
            TFOpType.SquaredDifference.name: self.convert_elementwise,
            TFOpType.Square.name: self.convert_elementwise,
            TFOpType.Rsqrt.name: self.convert_elementwise,
+           TFOpType.Equal.name: self.convert_elementwise,
            TFOpType.Relu.name: self.convert_activation,
            TFOpType.Relu6.name: self.convert_activation,
            TFOpType.Tanh.name: self.convert_activation,
@@ -183,7 +187,8 @@ class TensorflowConverter(base_converter.ConverterInterface):
            TFOpType.Slice.name: self.convert_slice,
            TFOpType.Pack.name: self.convert_stack,
            TFOpType.Stack.name: self.convert_stack,
-           TFOpType.Cast.name: self.convert_cast
+           TFOpType.Cast.name: self.convert_cast,
+           TFOpType.ArgMax.name: self.convert_argmax,
        }

        self._option = option
        self._mace_net_def = mace_pb2.NetDef()
@@ -376,18 +381,29 @@ class TensorflowConverter(base_converter.ConverterInterface):
        if type_arg.i != EltwiseType.NEG.value \
                and type_arg.i != EltwiseType.ABS.value:
-           if len(tf_op.inputs[0].shape) == 0:
-               value_arg = op.arg.add()
-               value_arg.name = MaceKeyword.mace_value_str
-               value_arg.f = tf_op.inputs[0].eval().astype(np.float32)
-               self._skip_tensor.add(tf_op.inputs[0].name)
-               del op.input[0]
-           elif len(tf_op.inputs) > 1 and len(tf_op.inputs[1].shape) == 0:
-               value_arg = op.arg.add()
-               value_arg.name = MaceKeyword.mace_value_str
-               value_arg.f = tf_op.inputs[1].eval().astype(np.float32)
-               self._skip_tensor.add(tf_op.inputs[1].name)
-               del op.input[1]
+           try:
+               def is_commutative(eltwise_type):
+                   return EltwiseType(eltwise_type) in [
+                       EltwiseType.SUM, EltwiseType.PROD,
+                       EltwiseType.MAX, EltwiseType.MIN]
+
+               if len(tf_op.inputs) > 1 and len(tf_op.inputs[1].shape) == 0:
+                   scalar = tf_op.inputs[1].eval().astype(np.float32)
+                   value_arg = op.arg.add()
+                   value_arg.name = MaceKeyword.mace_value_str
+                   value_arg.f = scalar
+                   self._skip_tensor.add(tf_op.inputs[1].name)
+                   del op.input[1]
+               elif len(tf_op.inputs[0].shape) == 0 and \
+                       is_commutative(type_arg.i):
+                   scalar = tf_op.inputs[0].eval().astype(np.float32)
+                   value_arg = op.arg.add()
+                   value_arg.name = MaceKeyword.mace_value_str
+                   value_arg.f = scalar
+                   self._skip_tensor.add(tf_op.inputs[0].name)
+                   del op.input[0]
+           except tf.errors.InvalidArgumentError:
+               pass

    def convert_biasadd(self, tf_op):
        op = self.convert_general_op(tf_op)
@@ -550,7 +566,13 @@ class TensorflowConverter(base_converter.ConverterInterface):
            transpose_a_arg.name = MaceKeyword.mace_transpose_a_str
            transpose_a_arg.i = int(adj_x)
        except ValueError:
-           pass
+           try:
+               transpose_a = tf_op.get_attr('transpose_a')
+               transpose_a_arg = op.arg.add()
+               transpose_a_arg.name = MaceKeyword.mace_transpose_a_str
+               transpose_a_arg.i = int(transpose_a)
+           except ValueError:
+               pass

        try:
            adj_y = tf_op.get_attr('adj_y')
@@ -558,7 +580,13 @@ class TensorflowConverter(base_converter.ConverterInterface):
            transpose_b_arg.name = MaceKeyword.mace_transpose_b_str
            transpose_b_arg.i = int(adj_y)
        except ValueError:
-           pass
+           try:
+               transpose_b = tf_op.get_attr('transpose_b')
+               transpose_b_arg = op.arg.add()
+               transpose_b_arg.name = MaceKeyword.mace_transpose_b_str
+               transpose_b_arg.i = int(transpose_b)
+           except ValueError:
+               pass

    def convert_shape(self, tf_op):
        op = self.convert_general_op(tf_op)
@@ -689,14 +717,18 @@ class TensorflowConverter(base_converter.ConverterInterface):
        op = self.convert_general_op(tf_op)
        op.type = MaceOp.Cast.name
-       data_type_arg = ConverterUtil.get_arg(op, 'T')

        try:
            dtype = tf_op.get_attr('DstT')
            if dtype == tf.int32:
-               data_type_arg.i = mace_pb2.DT_INT32
+               op.output_type.extend([mace_pb2.DT_INT32])
            elif dtype == tf.float32:
-               data_type_arg.i = self._option.data_type
+               op.output_type.extend([self._option.data_type])
            else:
                mace_check(False, "data type %s not supported" % dtype)
        except ValueError:
-           data_type_arg.i = self._option.data_type
+           op.output_type.extend([self._option.data_type])
+
+   def convert_argmax(self, tf_op):
+       op = self.convert_general_op(tf_op)
+       op.type = MaceOp.ArgMax.name
+       op.output_type.extend([mace_pb2.DT_INT32])
@@ -48,6 +48,10 @@ namespace str_util {

std::vector<std::string> Split(const std::string &str, char delims) {
  std::vector<std::string> result;
+ if (str.empty()) {
+   result.push_back("");
+   return result;
+ }
  std::string tmp = str;
  while (!tmp.empty()) {
    size_t next_offset = tmp.find(delims);
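A quick check of the new contract (a sketch, not part of the commit; it only declares the patched function and asserts the empty-string case shown above):

#include <cassert>
#include <string>
#include <vector>

namespace mace {
namespace str_util {
// Declaration of the function patched above; the definition lives in the
// MACE string utilities.
std::vector<std::string> Split(const std::string &str, char delims);
}  // namespace str_util
}  // namespace mace

int main() {
  // After the fix, splitting an empty string yields one empty token instead
  // of an empty vector, so callers may safely index the first element.
  std::vector<std::string> parts = mace::str_util::Split("", ',');
  assert(parts.size() == 1 && parts[0].empty());
  return 0;
}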
@@ -773,11 +773,17 @@ def tuning_run(abi,
                (phone_data_dir, os.path.basename(opencl_binary_file)),
            ])
        adb_cmd = ' '.join(adb_cmd)
+       adb_cmd_file = "%s/%s" % (phone_data_dir, 'cmd_file')
+       with open('/tmp/mace_cmd_file', 'w') as cmd_file:
+           cmd_file.write(adb_cmd)
+       adb_push('/tmp/mace_cmd_file', adb_cmd_file, serialno)
+
        sh.adb(
            "-s",
            serialno,
            "shell",
-           adb_cmd,
+           "sh",
+           adb_cmd_file,
            _tty_in=True,
            _out=process_output,
            _err_to_out=True)
@@ -1159,10 +1165,7 @@ def benchmark_model(abi,
            phone_data_dir,
            serialno)
-       sh.adb(
-           "-s",
-           serialno,
-           "shell",
+       adb_cmd = [
            "LD_LIBRARY_PATH=%s" % phone_data_dir,
            "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
            "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
@@ -1185,6 +1188,19 @@ def benchmark_model(abi,
            "--model_file=%s" % mace_model_phone_path,
            "--opencl_binary_file=%s/%s" %
            (phone_data_dir, os.path.basename(opencl_binary_file)),
+       ]
+       adb_cmd = ' '.join(adb_cmd)
+       adb_cmd_file = "%s/%s" % (phone_data_dir, 'cmd_file')
+       with open('/tmp/mace_cmd_file', 'w') as cmd_file:
+           cmd_file.write(adb_cmd)
+       adb_push('/tmp/mace_cmd_file', adb_cmd_file, serialno)
+
+       sh.adb(
+           "-s",
+           serialno,
+           "shell",
+           "sh",
+           adb_cmd_file,
            _fg=True)

        print("Benchmark done!\n")