Support int32 input data type.

1. Support int32 input data type. 2. Support GatherV2 op. 3. Add transpose to ExpandDims op.

Support int32 input data type.
1. Support int32 input data type. 2. Support GatherV2 op. 3. Add transpose to ExpandDims op.
80d1c9dd · liuqi · ad4953cb · 80d1c9dd · 80d1c9dd · 80d1c9dd
17 changed files
--- a/.travis.yml
+++ b/.travis.yml
@@ -114,7 +114,7 @@ jobs:
        - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=False --target_abis=armeabi-v7a || exit 1
        - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=False --target_abis=armeabi-v7a || exit 1
        - echo 'Extra Test'
-        - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=False --target_abis=armeabi-v7a || exit 1
+        - python tools/bazel_adb_run.py --target="//mace/utils:utils_test" --run_target=False --target_abis=armeabi-v7a || exit 1
      env: TYPE=Extra-Test-ARMEABI-v7a
      os: linux
      dist: xenial

--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -332,18 +332,17 @@ int Main(int argc, char **argv) {
  std::map<std::string, mace::MaceTensor> inputs;
  std::map<std::string, mace::MaceTensor> outputs;
  for (size_t i = 0; i < input_count; ++i) {
-    // Allocate input and output
+    // only support float and int32, use char for generalization
    int64_t input_size =
-        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 1,
+        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 4,
                        std::multiplies<int64_t>());
-    auto buffer_in = std::shared_ptr<float>(new float[input_size],
-                                            std::default_delete<float[]>());
+    auto buffer_in = std::shared_ptr<char>(new char[input_size],
+                                            std::default_delete<char[]>());
    // load input
    std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
                          std::ios::in | std::ios::binary);
    if (in_file.is_open()) {
-      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
-                   input_size * sizeof(float));
+      in_file.read(buffer_in.get(), input_size);
      in_file.close();
    } else {
      LOG(INFO) << "Open input file failed";
@@ -354,12 +353,13 @@ int Main(int argc, char **argv) {
  }

  for (size_t i = 0; i < output_count; ++i) {
+    // only support float and int32, use char for generalization
    int64_t output_size =
        std::accumulate(output_shape_vec[i].begin(),
-                        output_shape_vec[i].end(), 1,
+                        output_shape_vec[i].end(), 4,
                        std::multiplies<int64_t>());
-    auto buffer_out = std::shared_ptr<float>(new float[output_size],
-                                             std::default_delete<float[]>());
+    auto buffer_out = std::shared_ptr<char>(new char[output_size],
+                                            std::default_delete<char[]>());
    outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
                                                buffer_out,
                                                output_data_formats[i]);

--- a/mace/core/memory_optimizer.cc
+++ b/mace/core/memory_optimizer.cc
@@ -33,7 +33,7 @@ namespace mace {

 // Returns true when op_type is one of the op names listed in kReuseOp.
 bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kReuseOp = {
-      "Reshape", "Identity", "Squeeze", "ExpandDims"
+      "Reshape", "Identity", "Squeeze"  // "ExpandDims" dropped: ExpandDimsOp now resizes and fills its own output
  };
  return kReuseOp.count(op_type) == 1;
 }

--- a/mace/examples/cli/example.cc
+++ b/mace/examples/cli/example.cc
@@ -267,6 +267,7 @@ bool RunModel(const std::vector<std::string> &input_names,
        std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
                        std::multiplies<int64_t>());
    inputs_size[input_names[i]] = input_size;
+    // Only support float and int32 data type
    auto buffer_in = std::shared_ptr<float>(new float[input_size],
                                            std::default_delete<float[]>());
    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,
@@ -277,6 +278,7 @@ bool RunModel(const std::vector<std::string> &input_names,
    int64_t output_size =
        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
                        std::multiplies<int64_t>());
+    // Only support float and int32 data type
    auto buffer_out = std::shared_ptr<float>(new float[output_size],
                                             std::default_delete<float[]>());
    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,

--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -284,13 +284,13 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
 // State that MaceTensor holds behind its impl_ pointer.
 class MaceTensor::Impl {
 public:
  std::vector<int64_t> shape;
-  std::shared_ptr<float> data;
+  std::shared_ptr<void> data;  // type-erased buffer; typed views are produced by static_pointer_cast in the accessors
  DataFormat format;
  int64_t buffer_size;
 };

 MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
-                       std::shared_ptr<float> data,
+                       std::shared_ptr<void> data,
                       const DataFormat format) {
  MACE_CHECK_NOTNULL(data.get());
  MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC
@@ -345,9 +345,21 @@ MaceTensor::~MaceTensor() = default;

 const std::vector<int64_t> &MaceTensor::shape() const { return impl_->shape; }

-const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
+const std::shared_ptr<float> MaceTensor::data() const {  // const overload: float view of the stored buffer
+  return std::static_pointer_cast<float>(impl_->data);  // pointer cast only; element values are not converted
+}
+
+std::shared_ptr<float> MaceTensor::data() {  // non-const overload: float view of the stored buffer
+  return std::static_pointer_cast<float>(impl_->data);  // pointer cast only; element values are not converted
+}
+
+std::shared_ptr<void> MaceTensor::raw_data() const {  // untyped handle consumed by the const data<T>() template
+  return impl_->data;
+}

-std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
+std::shared_ptr<void> MaceTensor::raw_mutable_data() {  // untyped handle consumed by the non-const data<T>() template
+  return impl_->data;
+}

 DataFormat MaceTensor::data_format() const {
  return impl_->format;
@@ -466,8 +478,9 @@ MaceStatus MaceEngine::Impl::Init(
                 << "' does not belong to model's inputs: "
                 << MakeString(MapKeys(input_info_map_));
    }
+    DataType input_dt = input_info_map_[input_name].data_type();
    Tensor *input_tensor =
-        ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT);
+        ws_->CreateTensor(input_name, device_->allocator(), input_dt);
    // Resize to possible largest shape to avoid resize during running.
    std::vector<index_t> shape(input_info_map_[input_name].dims_size());
    for (int i = 0; i < input_info_map_[input_name].dims_size(); ++i) {
@@ -485,8 +498,9 @@ MaceStatus MaceEngine::Impl::Init(
                 << MakeString(MapKeys(output_info_map_));
    }
 #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+    DataType output_dt = output_info_map_[output_name].data_type();
    Tensor *output_tensor =
-        ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
+        ws_->CreateTensor(output_name, device_->allocator(), output_dt);
    output_tensor->set_data_format(NHWC);
 #endif
  }
@@ -572,54 +586,71 @@ MaceStatus MaceEngine::Impl::TransposeInput(
    Tensor *input_tensor) {
  bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
  DataFormat data_format = DataFormat::DF_NONE;
+  DataType input_dt = input_tensor->dtype();
  if (has_data_format) {
+    std::vector<int> dst_dims;
    if (device_->device_type() == DeviceType::CPU &&
        input.second.shape().size() == 4 &&
        input.second.data_format() == NHWC &&
        !is_quantized_model_) {
      VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
      input_tensor->set_data_format(DataFormat::NCHW);
-      std::vector<int> dst_dims = {0, 3, 1, 2};
-      std::vector<index_t> output_shape =
-          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
-      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
-      Tensor::MappingGuard input_guard(input_tensor);
-      float *input_data = input_tensor->mutable_data<float>();
-      return ops::Transpose(input.second.data().get(),
-                            input.second.shape(),
-                            dst_dims,
-                            input_data);
+      dst_dims = {0, 3, 1, 2};
    } else if (
        (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
            input.second.shape().size() == 4 &&
            input.second.data_format() == DataFormat::NCHW) {
      VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
-      std::vector<int> dst_dims = {0, 2, 3, 1};
      input_tensor->set_data_format(DataFormat::NHWC);
+      dst_dims = {0, 2, 3, 1};
+    }
+    if (!dst_dims.empty()) {
      std::vector<index_t> output_shape =
          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
      Tensor::MappingGuard input_guard(input_tensor);
-      float *input_data = input_tensor->mutable_data<float>();
-      return ops::Transpose(input.second.data().get(),
-                            input.second.shape(),
-                            dst_dims,
-                            input_data);
+      if (input_dt == DataType::DT_FLOAT) {
+        auto input_data = input_tensor->mutable_data<float>();
+        return ops::Transpose(input.second.data<float>().get(),
+                              input.second.shape(),
+                              dst_dims,
+                              input_data,
+                              input_dt);
+      } else if (input_dt == DataType::DT_INT32) {
+        auto input_data = input_tensor->mutable_data<int>();
+        return ops::Transpose(input.second.data<int>().get(),
+                              input.second.shape(),
+                              dst_dims,
+                              input_data,
+                              input_dt);
+      } else {
+        LOG(FATAL) << "MACE do not support the input data type: " << input_dt;
+      }
    }
+
    data_format = input.second.data_format();
  }
  input_tensor->set_data_format(data_format);
  MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
  Tensor::MappingGuard input_guard(input_tensor);
-  float *input_data = input_tensor->mutable_data<float>();
-  memcpy(input_data, input.second.data().get(),
-         input_tensor->size() * sizeof(float));
+  if (input_dt == DataType::DT_FLOAT) {
+    auto input_data = input_tensor->mutable_data<float>();
+    memcpy(input_data, input.second.data().get(),
+           input_tensor->size() * sizeof(float));
+  } else if (input_dt == DataType::DT_INT32) {
+    auto input_data = input_tensor->mutable_data<int>();
+    memcpy(input_data, input.second.data().get(),
+           input_tensor->size() * sizeof(int));
+  } else {
+    LOG(FATAL) << "MACE do not support the input data type: " << input_dt;
+  }
  return MaceStatus::MACE_SUCCESS;
 }

 MaceStatus MaceEngine::Impl::TransposeOutput(
    const mace::Tensor *output_tensor,
    std::pair<const std::string, mace::MaceTensor> *output) {
+  DataType output_dt = output_tensor->dtype();
  // save output
  if (output_tensor != nullptr && output->second.data() != nullptr) {
    if (output_tensor->data_format() != DataFormat::DF_NONE &&
@@ -655,11 +686,23 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
        << output->second.impl_->buffer_size;
      output->second.impl_->shape = shape;
      Tensor::MappingGuard output_guard(output_tensor);
-      const float *output_data = output_tensor->data<float>();
-      return ops::Transpose(output_data,
-                            output_tensor->shape(),
-                            dst_dims,
-                            output->second.data().get());
+      if (output_dt == DataType::DT_FLOAT) {
+        auto output_data = output_tensor->data<float>();
+        return ops::Transpose(output_data,
+                              output_tensor->shape(),
+                              dst_dims,
+                              output->second.data<float>().get());
+      } else if (output_dt == DataType::DT_INT32) {
+        auto output_data = output_tensor->data<int>();
+        return ops::Transpose(output_data,
+                              output_tensor->shape(),
+                              dst_dims,
+                              output->second.data<int>().get(),
+                              output_dt);
+      } else {
+        LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
+        return MaceStatus::MACE_INVALID_ARGS;
+      }
    } else {
      Tensor::MappingGuard output_guard(output_tensor);
      auto shape = output_tensor->shape();
@@ -670,8 +713,17 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
        << MakeString<int64_t>(shape) << " vs buffer size "
        << output->second.impl_->buffer_size;
      output->second.impl_->shape = shape;
-      std::memcpy(output->second.data().get(), output_tensor->data<float>(),
-                  output_size * sizeof(float));
+      if (output_dt == DataType::DT_FLOAT) {
+        std::memcpy(output->second.data<float>().get(),
+                    output_tensor->data<float>(),
+                    output_size * sizeof(float));
+      } else if (output_dt == DataType::DT_INT32) {
+        std::memcpy(output->second.data<int>().get(),
+            output_tensor->data<int>(),
+            output_size * sizeof(int));
+      } else {
+        LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
+      }
      return MaceStatus::MACE_SUCCESS;
    }
  } else {

--- a/mace/ops/common/transpose.cc
+++ b/mace/ops/common/transpose.cc
@@ -14,19 +14,14 @@

 #include "mace/ops/common/transpose.h"

-#include <algorithm>
-
 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
 #endif

-#include "mace/core/types.h"
-#include "mace/utils/logging.h"
-
 namespace mace {
 namespace ops {

-namespace {
+namespace transpose {
 void TransposeNHWCToNCHWC3(const float *input,
                           float *output,
                           const index_t height,
@@ -100,119 +95,44 @@ void TransposeNCHWToNHWCC2(const float *input,
 #endif
  }
 }
-}  // namespace

-MaceStatus Transpose(const float *input,
-                     const std::vector<int64_t> &input_shape,
-                     const std::vector<int> &dst_dims,
-                     float *output) {
-  MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
-               (input_shape.size() == 4 && dst_dims.size() == 4),
-             "Only support 2D or 4D transpose");
+void TransposeNHWCToNCHWC3(const int *input,  // int overload: one height x width image, 3 interleaved channels -> 3 planes
+                           int *output,
+                           const index_t height,
+                           const index_t width) {
+  index_t image_size = height * width;  // element count of one output channel plane
 
-  std::vector<index_t> output_shape;
-  for (size_t i = 0; i < dst_dims.size(); ++i) {
-    output_shape.push_back(input_shape[dst_dims[i]]);
-  }
+#pragma omp parallel for
+  for (index_t h = 0; h < height; ++h) {
+    index_t in_offset = h * width * 3;  // start of row h in the interleaved input
+    index_t out_offset = h * width;  // start of row h inside each output plane
 
-  if (input_shape.size() == 2) {
-    MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform");
-    index_t height = input_shape[0];
-    index_t width = input_shape[1];
-    index_t stride_i = height;
-    index_t stride_j = width;
-    index_t tile_size = height > 512 || width > 512 ? 64 : 32;
-#pragma omp parallel for collapse(2)
-    for (index_t i = 0; i < height; i += tile_size) {
-      for (index_t j = 0; j < width; j += tile_size) {
-        index_t end_i = std::min(i + tile_size, height);
-        index_t end_j = std::min(j + tile_size, width);
-        for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
-          for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
-            output[tile_j * stride_i + tile_i] =
-              input[tile_i * stride_j + tile_j];
-          }
-        }
+    for (index_t w = 0; w < width; ++w) {
+      for (index_t c = 0; c < 3; ++c) {
+        output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c];  // plane c, row h, column w <- pixel (h, w), channel c
      }
    }
-  } else if (input_shape.size() == 4) {
-    std::vector<int> transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2};
-    std::vector<int> transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1};
-    index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3];
-
-    if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3) {
-      for (index_t b = 0; b < input_shape[0]; ++b) {
-        TransposeNHWCToNCHWC3(input + b * batch_size,
-                              output + b * batch_size,
-                              input_shape[1],
-                              input_shape[2]);
-      }
-    } else if (dst_dims == transpose_order_from_NCHW_to_NHWC
-      && input_shape[1] == 2) {
-      for (index_t b = 0; b < input_shape[0]; ++b) {
-        TransposeNCHWToNHWCC2(input + b * batch_size,
-                              output + b * batch_size,
-                              input_shape[2],
-                              input_shape[3]);
-      }
-    } else if (dst_dims == std::vector<int>{0, 2, 1, 3}) {
-      index_t height = input_shape[1];
-      index_t width = input_shape[2];
-      index_t channel = input_shape[3];
-      index_t channel_raw_size = channel * sizeof(float);
-      index_t stride_i = height;
-      index_t stride_j = width;
-      index_t tile_size = std::max(static_cast<index_t>(1),
-                                   static_cast<index_t>(std::sqrt(
-                                     8 * 1024 / channel)));
-#pragma omp parallel for collapse(2)
-      for (index_t i = 0; i < height; i += tile_size) {
-        for (index_t j = 0; j < width; j += tile_size) {
-          index_t end_i = std::min(i + tile_size, height);
-          index_t end_j = std::min(j + tile_size, width);
-          for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
-            for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
-              memcpy(output + (tile_j * stride_i + tile_i) * channel,
-                     input + (tile_i * stride_j + tile_j) * channel,
-                     channel_raw_size);
-            }
-          }
-        }
-      }
-    } else {
-      std::vector<index_t>
-        in_stride{input_shape[1] * input_shape[2] * input_shape[3],
-                  input_shape[2] * input_shape[3], input_shape[3], 1};
-      std::vector<index_t>
-        out_stride{output_shape[1] * output_shape[2] * output_shape[3],
-                   output_shape[2] * output_shape[3], output_shape[3], 1};
+  }
+}

-      std::vector<index_t> idim(4, 0);
-      std::vector<index_t> odim(4, 0);
-      for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
-        for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
-          for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
-            for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
-              idim[dst_dims[0]] = odim[0];
-              idim[dst_dims[1]] = odim[1];
-              idim[dst_dims[2]] = odim[2];
-              idim[dst_dims[3]] = odim[3];
+void TransposeNCHWToNHWCC2(const int *input,  // int overload: 2 height x width planes -> one image with 2 interleaved channels
+                           int *output,
+                           const index_t height,
+                           const index_t width) {
+  index_t image_size = height * width;  // element count of one input channel plane
+#pragma omp parallel for
+  for (index_t h = 0; h < height; ++h) {
+    index_t in_offset = h * width;  // start of row h inside each input plane
+    index_t out_offset = h * width * 2;  // start of row h in the interleaved output
 
-              output[odim[0] * out_stride[0] + odim[1] * out_stride[1]
-                + odim[2] * out_stride[2] + odim[3]] =
-                input[idim[0] * in_stride[0] + idim[1] * in_stride[1]
-                  + idim[2] * in_stride[2] + idim[3]];
-            }
-          }
-        }
+    for (index_t w = 0; w < width; ++w) {
+      for (index_t c = 0; c < 2; ++c) {
+        output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w];  // pixel (h, w), channel c <- plane c, row h, column w
      }
    }
-  } else {
-    MACE_NOT_IMPLEMENTED;
  }
-
-  return MaceStatus::MACE_SUCCESS;
 }
+}  // namespace transpose

 }  // namespace ops
 }  // namespace mace
--- a/mace/ops/common/transpose.h
+++ b/mace/ops/common/transpose.h
@@ -15,17 +15,154 @@
 #ifndef MACE_OPS_COMMON_TRANSPOSE_H_
 #define MACE_OPS_COMMON_TRANSPOSE_H_

+#include <algorithm>
 #include <vector>

 #include "mace/public/mace.h"
+#include "mace/core/tensor.h"

 namespace mace {
 namespace ops {
+namespace transpose {  // layout-specific kernels called by the Transpose template defined in this header
 
-MaceStatus Transpose(const float *input,
+void TransposeNHWCToNCHWC3(const float *input,  // float overload, defined in transpose.cc; processes one height x width image per call
+                           float *output,
+                           const index_t height,
+                           const index_t width);
+
+void TransposeNHWCToNCHWC3(const int *input,  // int overload: 3 interleaved channels -> 3 separate planes
+                           int *output,
+                           const index_t height,
+                           const index_t width);
+
+void TransposeNCHWToNHWCC2(const float *input,  // float overload, defined in transpose.cc; processes one height x width image per call
+                           float *output,
+                           const index_t height,
+                           const index_t width);
+
+void TransposeNCHWToNHWCC2(const int *input,  // int overload: 2 separate planes -> 2 interleaved channels
+                           int *output,
+                           const index_t height,
+                           const index_t width);
+}  // namespace transpose
+
+// Transposes a dense 2-D or 4-D array of T.
+//
+// input       - source elements, laid out according to input_shape.
+// input_shape - extents of the source array; must have 2 or 4 entries.
+// dst_dims    - axis order of the result: output axis i takes its extent
+//               and data from input axis dst_dims[i]. For 2-D input only
+//               {1, 0} is accepted.
+// output      - destination buffer with room for every input element.
+// data_type   - runtime tag for T; the dedicated C3/C2 kernels are selected
+//               only for DT_FLOAT and DT_INT32.
+//
+// Returns MACE_SUCCESS; invalid arguments fail through MACE_CHECK.
+// NOTE(review): the C3/C2 kernels are overloaded for float and int only, so
+// instantiating this template with another T presumably fails to compile
+// even though the calls are guarded by data_type at run time - confirm
+// before adding element types.
+template <typename T>
+MaceStatus Transpose(const T *input,
                      const std::vector<int64_t> &input_shape,
                      const std::vector<int> &dst_dims,
-                     float *output);
+                     T *output,
+                     DataType data_type = DataType::DT_FLOAT) {
+  MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
+      (input_shape.size() == 4 && dst_dims.size() == 4),
+             "Only support 2D or 4D transpose");
+
+  // Reject axes outside [0, rank) before they are used to index input_shape.
+  const int rank = static_cast<int>(input_shape.size());
+  std::vector<index_t> output_shape;
+  for (size_t i = 0; i < dst_dims.size(); ++i) {
+    MACE_CHECK(dst_dims[i] >= 0 && dst_dims[i] < rank,
+               "transpose axis is out of bound: ", dst_dims[i]);
+    output_shape.push_back(input_shape[dst_dims[i]]);
+  }
+
+  if (input_shape.size() == 2) {
+    // Plain matrix transpose, walked tile by tile.
+    MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform");
+    index_t height = input_shape[0];
+    index_t width = input_shape[1];
+    index_t stride_i = height;
+    index_t stride_j = width;
+    index_t tile_size = height > 512 || width > 512 ? 64 : 32;
+#pragma omp parallel for collapse(2)
+    for (index_t i = 0; i < height; i += tile_size) {
+      for (index_t j = 0; j < width; j += tile_size) {
+        index_t end_i = std::min(i + tile_size, height);
+        index_t end_j = std::min(j + tile_size, width);
+        for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
+          for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
+            output[tile_j * stride_i + tile_i] =
+                input[tile_i * stride_j + tile_j];
+          }
+        }
+      }
+    }
+  } else if (input_shape.size() == 4) {
+    std::vector<int> transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2};
+    std::vector<int> transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1};
+    // Element count of one entry along axis 0.
+    index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3];
+    bool supported_dt = (data_type == DataType::DT_FLOAT ||
+        data_type == DataType::DT_INT32);
+
+    if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3 &&
+        supported_dt) {
+      // Dedicated kernel for 3-channel NHWC -> NCHW, one image per call.
+      for (index_t b = 0; b < input_shape[0]; ++b) {
+        transpose::TransposeNHWCToNCHWC3(input + b * batch_size,
+                                         output + b * batch_size,
+                                         input_shape[1],
+                                         input_shape[2]);
+      }
+    } else if (dst_dims == transpose_order_from_NCHW_to_NHWC
+        && input_shape[1] == 2 && supported_dt) {
+      // Dedicated kernel for 2-channel NCHW -> NHWC, one image per call.
+      for (index_t b = 0; b < input_shape[0]; ++b) {
+        transpose::TransposeNCHWToNHWCC2(input + b * batch_size,
+                                         output + b * batch_size,
+                                         input_shape[2],
+                                         input_shape[3]);
+      }
+    } else if (dst_dims == std::vector<int>{0, 2, 1, 3}) {
+      // Swap of the two middle axes: the innermost run of `channel`
+      // elements stays contiguous, so it is moved with one memcpy.
+      index_t height = input_shape[1];
+      index_t width = input_shape[2];
+      index_t channel = input_shape[3];
+      index_t channel_raw_size = channel * sizeof(T);
+      index_t stride_i = height;
+      index_t stride_j = width;
+      // An empty innermost axis would otherwise divide by zero; any positive
+      // tile size is fine in that case because nothing is copied.
+      index_t tile_size = std::max(static_cast<index_t>(1),
+                                   static_cast<index_t>(std::sqrt(
+                                       8 * 1024 /
+                                       std::max<index_t>(channel, 1))));
+#pragma omp parallel for collapse(2)
+      for (index_t i = 0; i < height; i += tile_size) {
+        for (index_t j = 0; j < width; j += tile_size) {
+          index_t end_i = std::min(i + tile_size, height);
+          index_t end_j = std::min(j + tile_size, width);
+          for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
+            for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
+              memcpy(output + (tile_j * stride_i + tile_i) * channel,
+                     input + (tile_i * stride_j + tile_j) * channel,
+                     channel_raw_size);
+            }
+          }
+        }
+      }
+    } else {
+      // Generic fallback: visit every output coordinate and map it back to
+      // the input coordinate through dst_dims.
+      std::vector<index_t>
+          in_stride{input_shape[1] * input_shape[2] * input_shape[3],
+                    input_shape[2] * input_shape[3], input_shape[3], 1};
+      std::vector<index_t>
+          out_stride{output_shape[1] * output_shape[2] * output_shape[3],
+                     output_shape[2] * output_shape[3], output_shape[3], 1};
+
+      std::vector<index_t> idim(4, 0);
+      std::vector<index_t> odim(4, 0);
+      for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
+        for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
+          for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
+            for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
+              idim[dst_dims[0]] = odim[0];
+              idim[dst_dims[1]] = odim[1];
+              idim[dst_dims[2]] = odim[2];
+              idim[dst_dims[3]] = odim[3];
+
+              output[odim[0] * out_stride[0] + odim[1] * out_stride[1]
+                  + odim[2] * out_stride[2] + odim[3]] =
+                  input[idim[0] * in_stride[0] + idim[1] * in_stride[1]
+                      + idim[2] * in_stride[2] + idim[3]];
+            }
+          }
+        }
+      }
+    }
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+

 }  // namespace ops
 }  // namespace mace

--- a/mace/ops/expand_dims.cc
+++ b/mace/ops/expand_dims.cc
@@ -14,6 +14,8 @@


 #include "mace/core/operator.h"
+#include "mace/ops/common/transpose.h"
+#include "mace/utils/math.h"

 namespace mace {
 namespace ops {
@@ -33,21 +35,35 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
    const Tensor *input = this->Input(0);
    Tensor *output = this->Output(0);
    index_t input_dims_size = input->dim_size();
-    if ( axis_ < 0 ) {
+    if (axis_ < 0) {
      axis_ += input_dims_size + 1;
    }
    MACE_CHECK(axis_ >= 0 && axis_ <= input_dims_size,
               "axis is out of bound: ", axis_);
    const std::vector<index_t> input_shape = input->shape();
-    std::vector<index_t> output_shape;
-    output_shape.insert(output_shape.end(), input_shape.begin(),
-                        input_shape.begin() + axis_);
-    output_shape.insert(output_shape.end(), 1);
-    output_shape.insert(output_shape.end(), input_shape.begin() + axis_,
-                        input_shape.end());
+    std::vector<index_t> output_shape(input_shape);
+    output_shape.insert(output_shape.begin() + axis_, 1);

-    output->ReuseTensorBuffer(*input);
-    output->Reshape(output_shape);
+    bool has_data_format = Operation::GetOptionalArg<int>(
+        "has_data_format", 0) == 1;
+    if (has_data_format && output_shape.size() == 4) {
+      // only tensorflow support expand dim, so the default format is NHWC
+      // transform NHWC to NCHW
+      auto t_output_shape = TransposeShape<int64_t, int64_t>(output_shape,
+          {0, 3, 1, 2});
+      output->Resize(t_output_shape);
+      Tensor::MappingGuard input_guard(input);
+      Tensor::MappingGuard output_guard(output);
+      auto input_data = input->data<T>();
+      auto output_data = output->mutable_data<T>();
+
+      Transpose(input_data, output_shape, {0, 3, 1, 2}, output_data);
+    } else {
+      output->Resize(output_shape);
+      Tensor::MappingGuard input_guard(input);
+      auto input_data = input->data<T>();
+      output->Copy<T>(input_data, input->size());
+    }

    return MaceStatus::MACE_SUCCESS;
  }
@@ -62,11 +78,6 @@ void RegisterExpandDims(OpRegistryBase *op_registry) {

  MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
                   DeviceType::CPU, int32_t);
-
-#ifdef MACE_ENABLE_QUANTIZE
-  MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
-                   DeviceType::CPU, uint8_t);
-#endif  // MACE_ENABLE_QUANTIZE
 }

 }  // namespace ops

--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -326,7 +326,7 @@ class MACE_API MaceTensor {
  //        of shared_ptr and manage the life cycle of the buffer by yourself.
  //        For example, std::shared_ptr<float>(raw_buffer, [](float *){});
  MaceTensor(const std::vector<int64_t> &shape,
-             std::shared_ptr<float> data,
+             std::shared_ptr<void> data,
             const DataFormat format = DataFormat::NHWC);
  MaceTensor();
  MaceTensor(const MaceTensor &other);
@@ -339,8 +339,20 @@ class MACE_API MaceTensor {
  const std::vector<int64_t> &shape() const;
  const std::shared_ptr<float> data() const;
  std::shared_ptr<float> data();
+  template <typename T>  // const overload: view of the buffer as elements of T
+  const std::shared_ptr<T> data() const {
+    return std::static_pointer_cast<T>(raw_data());  // unchecked cast: T is not compared with the type actually stored
+  }
+  template <typename T>  // non-const overload: view of the buffer as elements of T
+  std::shared_ptr<T> data() {
+    return std::static_pointer_cast<T>(raw_mutable_data());  // unchecked cast: T is not compared with the type actually stored
+  }
  DataFormat data_format() const;

+ private:
+  std::shared_ptr<void> raw_data() const;
+  std::shared_ptr<void> raw_mutable_data();
+
 private:
  class Impl;
  std::unique_ptr<Impl> impl_;

--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
@@ -47,6 +47,11 @@ data_format_map = {
    'OIHW': cvt.DataFormat.OIHW,
 }

+data_type_map = {  # token from --input_data_types / --output_data_types -> mace_pb2 data type; main() indexes it directly, so an unknown token raises KeyError
+    'float32': mace_pb2.DT_FLOAT,
+    'int32': mace_pb2.DT_INT32,
+}
+

 def parse_data_type(data_type, device_type):
    if device_type == cvt.DeviceType.CPU.value or \
@@ -141,6 +146,7 @@ def main(unused_args):
    option.data_type = parse_data_type(FLAGS.data_type, option.device)

    input_node_names = FLAGS.input_node.split(',')
+    input_data_types = FLAGS.input_data_types.split(',')
    input_node_shapes = FLAGS.input_shape.split(':')
    input_node_formats = FLAGS.input_data_formats.split(",")
    if FLAGS.input_range:
@@ -152,10 +158,8 @@ def main(unused_args):
    for i in six.moves.range(len(input_node_names)):
        input_node = cvt.NodeInfo()
        input_node.name = input_node_names[i]
-        if len(input_node_formats) == 1:
-            input_node.data_format = data_format_map[input_node_formats[0]]
-        else:
-            input_node.data_format = data_format_map[input_node_formats[i]]
+        input_node.data_type = data_type_map[input_data_types[i]]
+        input_node.data_format = data_format_map[input_node_formats[i]]
        input_node.shape = parse_int_array_from_str(input_node_shapes[i])
        if input_node.data_format == cvt.DataFormat.NCHW and\
                len(input_node.shape) == 4:
@@ -166,6 +170,7 @@ def main(unused_args):
        option.add_input_node(input_node)

    output_node_names = FLAGS.output_node.split(',')
+    output_data_types = FLAGS.output_data_types.split(',')
    output_node_shapes = FLAGS.output_shape.split(':')
    output_node_formats = FLAGS.output_data_formats.split(",")
    if len(output_node_names) != len(output_node_shapes):
@@ -173,10 +178,8 @@ def main(unused_args):
    for i in six.moves.range(len(output_node_names)):
        output_node = cvt.NodeInfo()
        output_node.name = output_node_names[i]
-        if len(output_node_formats) == 1:
-            output_node.data_format = data_format_map[output_node_formats[0]]
-        else:
-            output_node.data_format = data_format_map[output_node_formats[i]]
+        output_node.data_type = data_type_map[output_data_types[i]]
+        output_node.data_format = data_format_map[output_node_formats[i]]
        output_node.shape = parse_int_array_from_str(output_node_shapes[i])
        if output_node.data_format == cvt.DataFormat.NCHW and\
                len(output_node.shape) == 4:
@@ -290,6 +293,11 @@ def parse_args():
        type=str,
        default="input_node",
        help="e.g., input_node")
+    parser.add_argument(
+        "--input_data_types",
+        type=str,
+        default="float32",
+        help="comma-separated, one per input node, e.g., float32,int32")
    parser.add_argument(
        "--input_data_formats",
        type=str,
@@ -297,6 +305,11 @@ def parse_args():
        help="e.g., NHWC,NONE")
    parser.add_argument(
        "--output_node", type=str, default="softmax", help="e.g., softmax")
+    parser.add_argument(
+        "--output_data_types",
+        type=str,
+        default="float32",
+        help="comma-separated, one per output node, e.g., float32,int32")
    parser.add_argument(
        "--output_data_formats",
        type=str,

--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -298,6 +298,7 @@ class NodeInfo(object):

    def __init__(self):
        self._name = None
+        self._data_type = mace_pb2.DT_FLOAT
        self._shape = []
        self._data_format = DataFormat.NHWC
        self._range = [-1.0, 1.0]
@@ -306,6 +307,10 @@ class NodeInfo(object):
    def name(self):
        return self._name

+    @property
+    def data_type(self):
+        return self._data_type
+
    @property
    def shape(self):
        return self._shape
@@ -322,6 +327,10 @@ class NodeInfo(object):
    def name(self, name):
        self._name = name

+    @data_type.setter
+    def data_type(self, data_type):
+        self._data_type = data_type
+
    @shape.setter
    def shape(self, shape):
        self._shape = shape

--- a/mace/python/tools/converter_tool/tensorflow_converter.py
+++ b/mace/python/tools/converter_tool/tensorflow_converter.py
@@ -102,6 +102,7 @@ TFSupportedOps = [
    'Mean',
    'Const',
    'Gather',
+    'GatherV2',
    'StridedSlice',
    'Slice',
    'ReverseV2',
@@ -241,6 +242,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
            TFOpType.Mean.name: self.convert_mean,
            TFOpType.Const.name: self.convert_nop,
            TFOpType.Gather.name: self.convert_gather,
+            TFOpType.GatherV2.name: self.convert_gather,
            TFOpType.StridedSlice.name: self.convert_stridedslice,
            TFOpType.Slice.name: self.convert_slice,
            TFOpType.ReverseV2.name: self.convert_reverse,
@@ -838,16 +840,11 @@ class TensorflowConverter(base_converter.ConverterInterface):
        op = self.convert_general_op(tf_op)
        op.type = MaceOp.ExpandDims.name

+        axis_value = tf_op.inputs[1].eval().astype(np.int32)
        axis_arg = op.arg.add()
        axis_arg.name = MaceKeyword.mace_axis_str
-        try:
-            axis_value = tf_op.get_attr('dim')
-        except ValueError:
-            try:
-                axis_value = tf_op.get_attr('axis')
-            except ValueError:
-                axis_value = 0
        axis_arg.i = axis_value
+        del op.input[1]

    def convert_squeeze(self, tf_op):
        op = self.convert_general_op(tf_op)

--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -323,7 +323,7 @@ class Transformer(base_converter.ConverterInterface):
            input_info.name = input_node.name
            input_info.data_format = input_node.data_format.value
            input_info.dims.extend(input_node.shape)
-            input_info.data_type = mace_pb2.DT_FLOAT
+            input_info.data_type = input_node.data_type

        output_nodes = self._option.check_nodes.values()
        for output_node in output_nodes:
@@ -332,7 +332,7 @@ class Transformer(base_converter.ConverterInterface):
            output_info.data_format = output_node.data_format.value
            output_info.dims.extend(
                self._producer[output_node.name].output_shape[0].dims)
-            output_info.data_type = mace_pb2.DT_FLOAT
+            output_info.data_type = output_node.data_type

        return False


--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -317,17 +317,18 @@ bool RunModel(const std::string &model_name,
  std::map<std::string, mace::MaceTensor> outputs;
  for (size_t i = 0; i < input_count; ++i) {
    // Allocate input and output
+    // Only float and int32 inputs are supported; both are 4 bytes wide, so
+    // the buffer is sized in bytes (4 * element count) and held as char.
    int64_t input_size =
-        std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
+        std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 4,
                        std::multiplies<int64_t>());
-    auto buffer_in = std::shared_ptr<float>(new float[input_size],
-                                            std::default_delete<float[]>());
+    auto buffer_in = std::shared_ptr<char>(new char[input_size],
+                                           std::default_delete<char[]>());
    // load input
    std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
                          std::ios::in | std::ios::binary);
    if (in_file.is_open()) {
-      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
-                   input_size * sizeof(float));
+      in_file.read(buffer_in.get(), input_size);
      in_file.close();
    } else {
      LOG(INFO) << "Open input file failed";
@@ -338,11 +339,12 @@ bool RunModel(const std::string &model_name,
  }

  for (size_t i = 0; i < output_count; ++i) {
+    // Only float/int32 outputs are supported: 4 bytes each, held as char.
    int64_t output_size =
-        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
+        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4,
                        std::multiplies<int64_t>());
-    auto buffer_out = std::shared_ptr<float>(new float[output_size],
-                                             std::default_delete<float[]>());
+    auto buffer_out = std::shared_ptr<char>(new char[output_size],
+                                            std::default_delete<char[]>());
    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
        output_data_formats[i]);
  }
@@ -454,12 +456,12 @@ bool RunModel(const std::string &model_name,
    std::string output_name =
        FLAGS_output_file + "_" + FormatName(output_names[i]);
    std::ofstream out_file(output_name, std::ios::binary);
+    // Only float and int32 are supported, so size is 4 bytes per element.
    int64_t output_size =
-        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
+        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4,
                        std::multiplies<int64_t>());
    out_file.write(
-        reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
-        output_size * sizeof(float));
+        outputs[output_names[i]].data<char>().get(), output_size);
    out_file.flush();
    out_file.close();
    LOG(INFO) << "Write output file " << output_name << " with size "
@@ -524,6 +526,7 @@ int Main(int argc, char **argv) {

  // get cpu capability
  Capability cpu_capability = GetCapability(DeviceType::CPU);
+  float cpu_float32_performance = cpu_capability.float32_performance.exec_time;

  bool ret = false;
  for (int i = 0; i < FLAGS_restart_round; ++i) {
@@ -531,7 +534,7 @@ int Main(int argc, char **argv) {
    ret = RunModel(FLAGS_model_name,
        input_names, input_shape_vec, input_data_formats,
        output_names, output_shape_vec, output_data_formats,
-        cpu_capability.float32_performance.exec_time);
+        cpu_float32_performance);
  }
  if (ret) {
    return 0;

--- a/tools/common.py
+++ b/tools/common.py
@@ -397,6 +397,7 @@ class YAMLKeyword(object):
    runtime = 'runtime'
    data_type = 'data_type'
    input_data_types = 'input_data_types'
+    output_data_types = 'output_data_types'
    input_data_formats = 'input_data_formats'
    output_data_formats = 'output_data_formats'
    limit_opencl_kernel_time = 'limit_opencl_kernel_time'

--- a/tools/converter.py
+++ b/tools/converter.py
@@ -65,13 +65,13 @@ RuntimeTypeStrs = [
    "cpu+gpu"
 ]

-InputDataTypeStrs = [
+InOutDataTypeStrs = [
    "int32",
    "float32",
 ]

-InputDataType = Enum('InputDataType',
-                     [(ele, ele) for ele in InputDataTypeStrs],
+InOutDataType = Enum('InOutDataType',
+                     [(ele, ele) for ele in InOutDataTypeStrs],
                      type=str)

 FPDataTypeStrs = [
@@ -410,17 +410,23 @@ def format_model_config(flags):
                else:
                    subgraph[key] = []

-            input_data_types = subgraph.get(YAMLKeyword.input_data_types, "")
-            if input_data_types:
-                if not isinstance(input_data_types, list):
-                    subgraph[YAMLKeyword.input_data_types] = [input_data_types]
-                for input_data_type in subgraph[YAMLKeyword.input_data_types]:
-                    mace_check(input_data_type in InputDataTypeStrs,
-                               ModuleName.YAML_CONFIG,
-                               "'input_data_types' must be in "
-                               + str(InputDataTypeStrs))
-            else:
-                subgraph[YAMLKeyword.input_data_types] = []
+            for key in [YAMLKeyword.input_data_types,
+                        YAMLKeyword.output_data_types]:
+                if key == YAMLKeyword.input_data_types:
+                    count = input_size
+                else:
+                    count = output_size
+                data_types = subgraph.get(key, "")
+                if data_types:
+                    if not isinstance(data_types, list):
+                        subgraph[key] = [data_types] * count
+                    for data_type in subgraph[key]:
+                        mace_check(data_type in InOutDataTypeStrs,
+                                   ModuleName.YAML_CONFIG,
+                                   key + " must be in "
+                                   + str(InOutDataTypeStrs))
+                else:
+                    subgraph[key] = [InOutDataType.float32] * count

            input_data_formats = subgraph.get(YAMLKeyword.input_data_formats,
                                              [])
@@ -722,8 +728,10 @@ def convert_model(configs, cl_mem_type):
            model_config[YAMLKeyword.model_sha256_checksum],
            model_config[YAMLKeyword.weight_sha256_checksum],
            ",".join(subgraphs[0][YAMLKeyword.input_tensors]),
+            ",".join(subgraphs[0][YAMLKeyword.input_data_types]),
            ",".join(subgraphs[0][YAMLKeyword.input_data_formats]),
            ",".join(subgraphs[0][YAMLKeyword.output_tensors]),
+            ",".join(subgraphs[0][YAMLKeyword.output_data_types]),
            ",".join(subgraphs[0][YAMLKeyword.output_data_formats]),
            ",".join(subgraphs[0][YAMLKeyword.check_tensors]),
            runtime,

--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -480,8 +480,10 @@ def gen_model_code(model_codegen_dir,
                   model_sha256_checksum,
                   weight_sha256_checksum,
                   input_nodes,
+                   input_data_types,
                   input_data_formats,
                   output_nodes,
+                   output_data_types,
                   output_data_formats,
                   check_nodes,
                   runtime,
@@ -515,8 +517,10 @@ def gen_model_code(model_codegen_dir,
              "--model_checksum=%s" % model_sha256_checksum,
              "--weight_checksum=%s" % weight_sha256_checksum,
              "--input_node=%s" % input_nodes,
+              "--input_data_types=%s" % input_data_types,
              "--input_data_formats=%s" % input_data_formats,
              "--output_node=%s" % output_nodes,
+              "--output_data_types=%s" % output_data_types,
              "--output_data_formats=%s" % output_data_formats,
              "--check_node=%s" % check_nodes,
              "--runtime=%s" % runtime,