Xiaomi / Mace, commit 5967c7ab
Authored Apr 08, 2019 by 李寅
Merge branch 'embedding' into 'master'
Support int32 input data type. See merge request !1047
Parents: c35775c7, 80d1c9dd
Showing 16 changed files with 379 additions and 210 deletions (+379, -210)
mace/benchmark/benchmark_model.cc                          +9    -9
mace/core/memory_optimizer.cc                              +1    -1
mace/examples/cli/example.cc                               +2    -0
mace/libmace/mace.cc                                       +84   -32
mace/ops/common/transpose.cc                               +28   -108
mace/ops/common/transpose.h                                +139  -2
mace/ops/expand_dims.cc                                    +25   -14
mace/public/mace.h                                         +13   -1
mace/python/tools/converter.py                             +21   -8
mace/python/tools/converter_tool/base_converter.py         +9    -0
mace/python/tools/converter_tool/tensorflow_converter.py   +4    -7
mace/python/tools/converter_tool/transformer.py            +2    -2
mace/tools/validation/mace_run.cc                          +15   -12
tools/common.py                                            +1    -0
tools/converter.py                                         +22   -14
tools/sh_commands.py                                       +4    -0
mace/benchmark/benchmark_model.cc

@@ -335,18 +335,17 @@ int Main(int argc, char **argv) {
   std::map<std::string, mace::MaceTensor> inputs;
   std::map<std::string, mace::MaceTensor> outputs;
   for (size_t i = 0; i < input_count; ++i) {
     // Allocate input and output
+    // only support float and int32, use char for generalization
     int64_t input_size =
-        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(),
-                        1, std::multiplies<int64_t>());
+        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(),
+                        4, std::multiplies<int64_t>());
-    auto buffer_in = std::shared_ptr<float>(new float[input_size],
-                                            std::default_delete<float[]>());
+    auto buffer_in = std::shared_ptr<char>(new char[input_size],
+                                           std::default_delete<char[]>());
     // load input
     std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
                           std::ios::in | std::ios::binary);
     if (in_file.is_open()) {
-      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
-                   input_size * sizeof(float));
+      in_file.read(buffer_in.get(), input_size);
       in_file.close();
     } else {
       LOG(INFO) << "Open input file failed";

@@ -357,12 +356,13 @@ int Main(int argc, char **argv) {
   }
   for (size_t i = 0; i < output_count; ++i) {
+    // only support float and int32, use char for generalization
     int64_t output_size =
-        std::accumulate(output_shape_vec[i].begin(), output_shape_vec[i].end(),
-                        1, std::multiplies<int64_t>());
+        std::accumulate(output_shape_vec[i].begin(), output_shape_vec[i].end(),
+                        4, std::multiplies<int64_t>());
-    auto buffer_out = std::shared_ptr<float>(new float[output_size],
-                                             std::default_delete<float[]>());
+    auto buffer_out = std::shared_ptr<char>(new char[output_size],
+                                            std::default_delete<char[]>());
     outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i], buffer_out,
                                                 output_data_formats[i]);
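The size arithmetic above is the easy-to-miss part of this hunk: seeding std::accumulate with 4 instead of 1 turns input_size into a byte count (float and int32 are both 4 bytes wide), so a single char buffer can back either element type. Below is a minimal standalone sketch of the same idea; the shape and the file name "input.bin" are hypothetical, not taken from this commit.

#include <cstdint>
#include <cstdio>
#include <fstream>
#include <functional>
#include <memory>
#include <numeric>
#include <vector>

int main() {
  std::vector<int64_t> shape = {1, 3, 4, 4};  // hypothetical input shape

  // Seeding with 4 == sizeof(float) == sizeof(int32_t) makes the product a
  // byte count, so one char buffer serves both element types.
  int64_t byte_size = std::accumulate(shape.begin(), shape.end(),
                                      static_cast<int64_t>(4),
                                      std::multiplies<int64_t>());
  auto buffer = std::shared_ptr<char>(new char[byte_size](),
                                      std::default_delete<char[]>());

  std::ifstream in("input.bin", std::ios::binary);  // hypothetical file
  if (in.is_open()) {
    // The buffer is raw bytes, so no reinterpret_cast or sizeof scaling.
    in.read(buffer.get(), byte_size);
  }

  // Reinterpret the same bytes as whichever type the model declares.
  const int32_t *as_int32 = reinterpret_cast<const int32_t *>(buffer.get());
  std::printf("first element as int32: %d\n", as_int32[0]);
  return 0;
}

The tool then hands the char buffer to MaceTensor unchanged; the engine decides how to interpret the bytes from the model's declared input data type.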
mace/core/memory_optimizer.cc

@@ -33,7 +33,7 @@ namespace mace {
 bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
   static const std::unordered_set<std::string> kReuseOp = {
-      "Reshape", "Identity", "Squeeze", "ExpandDims"};
+      "Reshape", "Identity", "Squeeze"};
   return kReuseOp.count(op_type) == 1;
 }
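For context, IsMemoryReuseOp marks ops whose output may alias the input's buffer. ExpandDims leaves the set because, after this commit, it can physically rearrange data (NHWC to NCHW, see mace/ops/expand_dims.cc below) rather than only reshaping in place. A reduced sketch of the check after the change:

#include <iostream>
#include <string>
#include <unordered_set>

// Reduced sketch of MemoryOptimizer::IsMemoryReuseOp after this commit.
bool IsMemoryReuseOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kReuseOp = {
      "Reshape", "Identity", "Squeeze"};
  return kReuseOp.count(op_type) == 1;
}

int main() {
  std::cout << std::boolalpha
            << IsMemoryReuseOp("Reshape") << "\n"      // true
            << IsMemoryReuseOp("ExpandDims") << "\n";  // now false
  return 0;
}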
mace/examples/cli/example.cc

@@ -269,6 +269,7 @@ bool RunModel(const std::vector<std::string> &input_names,
         std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
                         std::multiplies<int64_t>());
     inputs_size[input_names[i]] = input_size;
+    // Only support float and int32 data type
     auto buffer_in = std::shared_ptr<float>(new float[input_size],
                                             std::default_delete<float[]>());
     inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,

@@ -279,6 +280,7 @@ bool RunModel(const std::vector<std::string> &input_names,
     int64_t output_size =
         std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
                         std::multiplies<int64_t>());
+    // Only support float and int32 data type
     auto buffer_out = std::shared_ptr<float>(new float[output_size],
                                              std::default_delete<float[]>());
     outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
mace/libmace/mace.cc

@@ -284,13 +284,13 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
 class MaceTensor::Impl {
  public:
   std::vector<int64_t> shape;
-  std::shared_ptr<float> data;
+  std::shared_ptr<void> data;
   DataFormat format;
   int64_t buffer_size;
 };

 MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
-                       std::shared_ptr<float> data,
+                       std::shared_ptr<void> data,
                        const DataFormat format) {
   MACE_CHECK_NOTNULL(data.get());
   MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC

@@ -345,9 +345,21 @@ MaceTensor::~MaceTensor() = default;
 const std::vector<int64_t> &MaceTensor::shape() const { return impl_->shape; }

-const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
+const std::shared_ptr<float> MaceTensor::data() const {
+  return std::static_pointer_cast<float>(impl_->data);
+}
+
+std::shared_ptr<float> MaceTensor::data() {
+  return std::static_pointer_cast<float>(impl_->data);
+}
+
+std::shared_ptr<void> MaceTensor::raw_data() const {
+  return impl_->data;
+}

-std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
+std::shared_ptr<void> MaceTensor::raw_mutable_data() {
+  return impl_->data;
+}

 DataFormat MaceTensor::data_format() const {
   return impl_->format;

@@ -466,8 +478,9 @@ MaceStatus MaceEngine::Impl::Init(
                  << "' does not belong to model's inputs: "
                  << MakeString(MapKeys(input_info_map_));
     }
+    DataType input_dt = input_info_map_[input_name].data_type();
     Tensor *input_tensor =
-        ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT);
+        ws_->CreateTensor(input_name, device_->allocator(), input_dt);
     // Resize to possible largest shape to avoid resize during running.
     std::vector<index_t> shape(input_info_map_[input_name].dims_size());
     for (int i = 0; i < input_info_map_[input_name].dims_size(); ++i) {

@@ -485,8 +498,9 @@ MaceStatus MaceEngine::Impl::Init(
                  << MakeString(MapKeys(output_info_map_));
     }
 #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+    DataType output_dt = output_info_map_[output_name].data_type();
     Tensor *output_tensor =
-        ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
+        ws_->CreateTensor(output_name, device_->allocator(), output_dt);
     output_tensor->set_data_format(NHWC);
 #endif
   }

@@ -572,54 +586,71 @@ MaceStatus MaceEngine::Impl::TransposeInput(
     Tensor *input_tensor) {
   bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
   DataFormat data_format = DataFormat::DF_NONE;
+  DataType input_dt = input_tensor->dtype();
   if (has_data_format) {
+    std::vector<int> dst_dims;
     if (device_->device_type() == DeviceType::CPU &&
         input.second.shape().size() == 4 &&
         input.second.data_format() == NHWC &&
         !is_quantized_model_) {
       VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
       input_tensor->set_data_format(DataFormat::NCHW);
-      std::vector<int> dst_dims = {0, 3, 1, 2};
-      std::vector<index_t> output_shape =
-          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
-      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
-      Tensor::MappingGuard input_guard(input_tensor);
-      float *input_data = input_tensor->mutable_data<float>();
-      return ops::Transpose(input.second.data().get(), input.second.shape(),
-                            dst_dims, input_data);
+      dst_dims = {0, 3, 1, 2};
     } else if ((is_quantized_model_ ||
                 device_->device_type() == DeviceType::GPU) &&
                input.second.shape().size() == 4 &&
               input.second.data_format() == DataFormat::NCHW) {
       VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
-      std::vector<int> dst_dims = {0, 2, 3, 1};
       input_tensor->set_data_format(DataFormat::NHWC);
+      dst_dims = {0, 2, 3, 1};
     }
+    if (!dst_dims.empty()) {
       std::vector<index_t> output_shape =
           TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
       MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
       Tensor::MappingGuard input_guard(input_tensor);
-      float *input_data = input_tensor->mutable_data<float>();
-      return ops::Transpose(input.second.data().get(), input.second.shape(),
-                            dst_dims, input_data);
+      if (input_dt == DataType::DT_FLOAT) {
+        auto input_data = input_tensor->mutable_data<float>();
+        return ops::Transpose(input.second.data<float>().get(),
+                              input.second.shape(), dst_dims, input_data,
+                              input_dt);
+      } else if (input_dt == DataType::DT_INT32) {
+        auto input_data = input_tensor->mutable_data<int>();
+        return ops::Transpose(input.second.data<int>().get(),
+                              input.second.shape(), dst_dims, input_data,
+                              input_dt);
+      } else {
+        LOG(FATAL) << "MACE do not support the input data type: " << input_dt;
+      }
+    }
     data_format = input.second.data_format();
   }
   input_tensor->set_data_format(data_format);
   MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
   Tensor::MappingGuard input_guard(input_tensor);
-  float *input_data = input_tensor->mutable_data<float>();
-  memcpy(input_data, input.second.data().get(),
-         input_tensor->size() * sizeof(float));
+  if (input_dt == DataType::DT_FLOAT) {
+    auto input_data = input_tensor->mutable_data<float>();
+    memcpy(input_data, input.second.data().get(),
+           input_tensor->size() * sizeof(float));
+  } else if (input_dt == DataType::DT_INT32) {
+    auto input_data = input_tensor->mutable_data<int>();
+    memcpy(input_data, input.second.data().get(),
+           input_tensor->size() * sizeof(int));
+  } else {
+    LOG(FATAL) << "MACE do not support the input data type: " << input_dt;
+  }
   return MaceStatus::MACE_SUCCESS;
 }

 MaceStatus MaceEngine::Impl::TransposeOutput(
     const mace::Tensor *output_tensor,
     std::pair<const std::string, mace::MaceTensor> *output) {
+  DataType output_dt = output_tensor->dtype();
   // save output
   if (output_tensor != nullptr && output->second.data() != nullptr) {
     if (output_tensor->data_format() != DataFormat::DF_NONE &&

@@ -655,11 +686,23 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
                  << output->second.impl_->buffer_size;
       output->second.impl_->shape = shape;
       Tensor::MappingGuard output_guard(output_tensor);
-      const float *output_data = output_tensor->data<float>();
-      return ops::Transpose(output_data, output_tensor->shape(), dst_dims,
-                            output->second.data().get());
+      if (output_dt == DataType::DT_FLOAT) {
+        auto output_data = output_tensor->data<float>();
+        return ops::Transpose(output_data, output_tensor->shape(), dst_dims,
+                              output->second.data<float>().get());
+      } else if (output_dt == DataType::DT_INT32) {
+        auto output_data = output_tensor->data<int>();
+        return ops::Transpose(output_data, output_tensor->shape(), dst_dims,
+                              output->second.data<int>().get(), output_dt);
+      } else {
+        LOG(FATAL) << "MACE do not support the output data type: "
+                   << output_dt;
+        return MaceStatus::MACE_INVALID_ARGS;
+      }
     } else {
       Tensor::MappingGuard output_guard(output_tensor);
       auto shape = output_tensor->shape();

@@ -670,8 +713,17 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
                  << MakeString<int64_t>(shape) << " vs buffer size "
                  << output->second.impl_->buffer_size;
       output->second.impl_->shape = shape;
-      std::memcpy(output->second.data().get(), output_tensor->data<float>(),
-                  output_size * sizeof(float));
+      if (output_dt == DataType::DT_FLOAT) {
+        std::memcpy(output->second.data<float>().get(),
+                    output_tensor->data<float>(),
+                    output_size * sizeof(float));
+      } else if (output_dt == DataType::DT_INT32) {
+        std::memcpy(output->second.data<int>().get(),
+                    output_tensor->data<int>(),
+                    output_size * sizeof(int));
+      } else {
+        LOG(FATAL) << "MACE do not support the output data type: "
+                   << output_dt;
+      }
       return MaceStatus::MACE_SUCCESS;
     }
   } else {
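The Impl change above is the heart of the patch: storage becomes std::shared_ptr<void>, and typed views are produced with std::static_pointer_cast, which shares the original control block, so the deleter captured at allocation still runs regardless of the erased pointer type. A self-contained sketch of the pattern, independent of MACE:

#include <cstdint>
#include <iostream>
#include <memory>

int main() {
  // Type-erased storage, as in MaceTensor::Impl::data after this commit.
  // The array deleter is captured in the control block at construction,
  // so ownership through shared_ptr<void> stays well-defined.
  std::shared_ptr<void> data(new int32_t[4]{1, 2, 3, 4},
                             std::default_delete<int32_t[]>());

  // Typed view: shares the control block; no copy, no new allocation.
  std::shared_ptr<int32_t> as_int = std::static_pointer_cast<int32_t>(data);
  std::cout << as_int.get()[2] << "\n";   // prints 3
  std::cout << data.use_count() << "\n";  // prints 2
  return 0;
}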
mace/ops/common/transpose.cc

@@ -14,19 +14,14 @@
 #include "mace/ops/common/transpose.h"
 #include <algorithm>
 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
 #endif
 #include "mace/core/types.h"
 #include "mace/utils/logging.h"

 namespace mace {
 namespace ops {
-namespace {
+namespace transpose {

 void TransposeNHWCToNCHWC3(const float *input,
                            float *output,
                            const index_t height,

@@ -100,119 +95,44 @@ void TransposeNCHWToNHWCC2(const float *input,
 #endif
   }
 }
-}  // namespace
-
-MaceStatus Transpose(const float *input,
-                     const std::vector<int64_t> &input_shape,
-                     const std::vector<int> &dst_dims,
-                     float *output) {
-  MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
-                 (input_shape.size() == 4 && dst_dims.size() == 4),
-             "Only support 2D or 4D transpose");
...
-  return MaceStatus::MACE_SUCCESS;
-}

+void TransposeNHWCToNCHWC3(const int *input,
+                           int *output,
+                           const index_t height,
+                           const index_t width) {
+  index_t image_size = height * width;
+
+#pragma omp parallel for
+  for (index_t h = 0; h < height; ++h) {
+    index_t in_offset = h * width * 3;
+    index_t out_offset = h * width;
+    for (index_t w = 0; w < width; ++w) {
+      for (index_t c = 0; c < 3; ++c) {
+        output[out_offset + c * image_size + w] =
+            input[in_offset + w * 3 + c];
+      }
+    }
+  }
+}
+
+void TransposeNCHWToNHWCC2(const int *input,
+                           int *output,
+                           const index_t height,
+                           const index_t width) {
+  index_t image_size = height * width;
+
+#pragma omp parallel for
+  for (index_t h = 0; h < height; ++h) {
+    index_t in_offset = h * width;
+    index_t out_offset = h * width * 2;
+    for (index_t w = 0; w < width; ++w) {
+      for (index_t c = 0; c < 2; ++c) {
+        output[out_offset + w * 2 + c] =
+            input[in_offset + c * image_size + w];
+      }
+    }
+  }
+}
+
+}  // namespace transpose
 }  // namespace ops
 }  // namespace mace
mace/ops/common/transpose.h

@@ -15,17 +15,154 @@
 #ifndef MACE_OPS_COMMON_TRANSPOSE_H_
 #define MACE_OPS_COMMON_TRANSPOSE_H_

 #include <algorithm>
 #include <vector>

 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"

 namespace mace {
 namespace ops {
+namespace transpose {

-MaceStatus Transpose(const float *input,
-                     const std::vector<int64_t> &input_shape,
-                     const std::vector<int> &dst_dims,
-                     float *output);
+void TransposeNHWCToNCHWC3(const float *input,
+                           float *output,
+                           const index_t height,
+                           const index_t width);
+
+void TransposeNHWCToNCHWC3(const int *input,
+                           int *output,
+                           const index_t height,
+                           const index_t width);
+
+void TransposeNCHWToNHWCC2(const float *input,
+                           float *output,
+                           const index_t height,
+                           const index_t width);
+
+void TransposeNCHWToNHWCC2(const int *input,
+                           int *output,
+                           const index_t height,
+                           const index_t width);
+
+}  // namespace transpose
+
+template <typename T>
+MaceStatus Transpose(const T *input,
+                     const std::vector<int64_t> &input_shape,
+                     const std::vector<int> &dst_dims,
+                     T *output,
+                     DataType data_type = DataType::DT_FLOAT) {
+  MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
+                 (input_shape.size() == 4 && dst_dims.size() == 4),
+             "Only support 2D or 4D transpose");
+
+  std::vector<index_t> output_shape;
+  for (size_t i = 0; i < dst_dims.size(); ++i) {
+    output_shape.push_back(input_shape[dst_dims[i]]);
+  }
+
+  if (input_shape.size() == 2) {
+    MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform");
+    index_t height = input_shape[0];
+    index_t width = input_shape[1];
+    index_t stride_i = height;
+    index_t stride_j = width;
+    index_t tile_size = height > 512 || width > 512 ? 64 : 32;
+#pragma omp parallel for collapse(2)
+    for (index_t i = 0; i < height; i += tile_size) {
+      for (index_t j = 0; j < width; j += tile_size) {
+        index_t end_i = std::min(i + tile_size, height);
+        index_t end_j = std::min(j + tile_size, width);
+        for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
+          for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
+            output[tile_j * stride_i + tile_i] =
+                input[tile_i * stride_j + tile_j];
+          }
+        }
+      }
+    }
+  } else if (input_shape.size() == 4) {
+    std::vector<int> transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2};
+    std::vector<int> transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1};
+    index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3];
+    bool supported_dt = (data_type == DataType::DT_FLOAT ||
+                         data_type == DataType::DT_INT32);
+    if (dst_dims == transpose_order_from_NHWC_to_NCHW &&
+        input_shape[3] == 3 && supported_dt) {
+      for (index_t b = 0; b < input_shape[0]; ++b) {
+        transpose::TransposeNHWCToNCHWC3(input + b * batch_size,
+                                         output + b * batch_size,
+                                         input_shape[1],
+                                         input_shape[2]);
+      }
+    } else if (dst_dims == transpose_order_from_NCHW_to_NHWC &&
+               input_shape[1] == 2 && supported_dt) {
+      for (index_t b = 0; b < input_shape[0]; ++b) {
+        transpose::TransposeNCHWToNHWCC2(input + b * batch_size,
+                                         output + b * batch_size,
+                                         input_shape[2],
+                                         input_shape[3]);
+      }
+    } else if (dst_dims == std::vector<int>{0, 2, 1, 3}) {
+      index_t height = input_shape[1];
+      index_t width = input_shape[2];
+      index_t channel = input_shape[3];
+      index_t channel_raw_size = channel * sizeof(T);
+      index_t stride_i = height;
+      index_t stride_j = width;
+      index_t tile_size = std::max(static_cast<index_t>(1),
+                                   static_cast<index_t>(
+                                       std::sqrt(8 * 1024 / channel)));
+#pragma omp parallel for collapse(2)
+      for (index_t i = 0; i < height; i += tile_size) {
+        for (index_t j = 0; j < width; j += tile_size) {
+          index_t end_i = std::min(i + tile_size, height);
+          index_t end_j = std::min(j + tile_size, width);
+          for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
+            for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
+              memcpy(output + (tile_j * stride_i + tile_i) * channel,
+                     input + (tile_i * stride_j + tile_j) * channel,
+                     channel_raw_size);
+            }
+          }
+        }
+      }
+    } else {
+      std::vector<index_t> in_stride{
+          input_shape[1] * input_shape[2] * input_shape[3],
+          input_shape[2] * input_shape[3], input_shape[3], 1};
+      std::vector<index_t> out_stride{
+          output_shape[1] * output_shape[2] * output_shape[3],
+          output_shape[2] * output_shape[3], output_shape[3], 1};
+
+      std::vector<index_t> idim(4, 0);
+      std::vector<index_t> odim(4, 0);
+      for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
+        for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
+          for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
+            for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
+              idim[dst_dims[0]] = odim[0];
+              idim[dst_dims[1]] = odim[1];
+              idim[dst_dims[2]] = odim[2];
+              idim[dst_dims[3]] = odim[3];
+              output[odim[0] * out_stride[0] + odim[1] * out_stride[1] +
+                     odim[2] * out_stride[2] + odim[3]] =
+                  input[idim[0] * in_stride[0] + idim[1] * in_stride[1] +
+                        idim[2] * in_stride[2] + idim[3]];
+            }
+          }
+        }
+      }
+    }
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}

 }  // namespace ops
 }  // namespace mace
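The fallback branch of the new Transpose<T> template is plain stride arithmetic: enumerate output coordinates in order and map each back through dst_dims to an input coordinate. Below is a standalone sketch of just that path; the helper name TransposeGeneric, the shape, and the permutation are illustrative, not part of the commit.

#include <cstdint>
#include <iostream>
#include <vector>

// Generic 4-D permutation, mirroring the fallback branch of Transpose<T>:
// idim[dst_dims[k]] = odim[k] recovers the source coordinate for each
// output coordinate, and row-major strides flatten both into offsets.
template <typename T>
void TransposeGeneric(const T *input, const std::vector<int64_t> &in_shape,
                      const std::vector<int> &dst_dims, T *output) {
  std::vector<int64_t> out_shape(4);
  for (int k = 0; k < 4; ++k) out_shape[k] = in_shape[dst_dims[k]];
  std::vector<int64_t> in_stride{in_shape[1] * in_shape[2] * in_shape[3],
                                 in_shape[2] * in_shape[3], in_shape[3], 1};
  std::vector<int64_t> out_stride{out_shape[1] * out_shape[2] * out_shape[3],
                                  out_shape[2] * out_shape[3],
                                  out_shape[3], 1};
  std::vector<int64_t> idim(4), odim(4);
  for (odim[0] = 0; odim[0] < out_shape[0]; ++odim[0])
    for (odim[1] = 0; odim[1] < out_shape[1]; ++odim[1])
      for (odim[2] = 0; odim[2] < out_shape[2]; ++odim[2])
        for (odim[3] = 0; odim[3] < out_shape[3]; ++odim[3]) {
          for (int k = 0; k < 4; ++k) idim[dst_dims[k]] = odim[k];
          output[odim[0] * out_stride[0] + odim[1] * out_stride[1] +
                 odim[2] * out_stride[2] + odim[3]] =
              input[idim[0] * in_stride[0] + idim[1] * in_stride[1] +
                    idim[2] * in_stride[2] + idim[3]];
        }
}

int main() {
  // 1x2x2x3 NHWC -> NCHW (dst_dims {0, 3, 1, 2}) with int32 elements.
  std::vector<int64_t> shape{1, 2, 2, 3};
  int32_t in[12], out[12];
  for (int i = 0; i < 12; ++i) in[i] = i;
  TransposeGeneric(in, shape, {0, 3, 1, 2}, out);
  for (int i = 0; i < 12; ++i) std::cout << out[i] << " ";
  std::cout << "\n";  // 0 3 6 9 1 4 7 10 2 5 8 11
  return 0;
}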
mace/ops/expand_dims.cc

@@ -14,6 +14,8 @@
 #include "mace/core/operator.h"
+#include "mace/ops/common/transpose.h"
+#include "mace/utils/math.h"

 namespace mace {
 namespace ops {

@@ -33,21 +35,35 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
     const Tensor *input = this->Input(0);
     Tensor *output = this->Output(0);
     index_t input_dims_size = input->dim_size();
     if (axis_ < 0) {
       axis_ += input_dims_size + 1;
     }
     MACE_CHECK(axis_ >= 0 && axis_ <= input_dims_size,
                "axis is out of bound: ", axis_);
     const std::vector<index_t> input_shape = input->shape();
-    std::vector<index_t> output_shape;
-    output_shape.insert(output_shape.end(), input_shape.begin(),
-                        input_shape.begin() + axis_);
-    output_shape.insert(output_shape.end(), 1);
-    output_shape.insert(output_shape.end(), input_shape.begin() + axis_,
-                        input_shape.end());
+    std::vector<index_t> output_shape(input_shape);
+    output_shape.insert(output_shape.begin() + axis_, 1);

-    output->ReuseTensorBuffer(*input);
-    output->Reshape(output_shape);
+    bool has_data_format =
+        Operation::GetOptionalArg<int>("has_data_format", 0) == 1;
+    if (has_data_format && output_shape.size() == 4) {
+      // only tensorflow support expand dim, so the default format is NHWC
+      // transform NHWC to NCHW
+      auto t_output_shape =
+          TransposeShape<int64_t, int64_t>(output_shape, {0, 3, 1, 2});
+      output->Resize(t_output_shape);
+      Tensor::MappingGuard input_guard(input);
+      Tensor::MappingGuard output_guard(output);
+      auto input_data = input->data<T>();
+      auto output_data = output->mutable_data<T>();
+      Transpose(input_data, output_shape, {0, 3, 1, 2}, output_data);
+    } else {
+      output->Resize(output_shape);
+      Tensor::MappingGuard input_guard(input);
+      auto input_data = input->data<T>();
+      output->Copy<T>(input_data, input->size());
+    }

     return MaceStatus::MACE_SUCCESS;
   }

@@ -62,11 +78,6 @@ void RegisterExpandDims(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
                    DeviceType::CPU, int32_t);

 #ifdef MACE_ENABLE_QUANTIZE
   MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
                    DeviceType::CPU, uint8_t);
 #endif  // MACE_ENABLE_QUANTIZE
 }

 }  // namespace ops
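The new shape computation collapses the old three-insert sequence into one: normalize a negative axis, then insert a 1 into a copy of the input shape. A reduced sketch of that logic with two worked cases (the helper name ExpandDimsShape is illustrative):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Reduced sketch of the ExpandDims shape logic after this commit:
// a negative axis counts from the end (inclusive of the new dimension),
// and the output is the input shape with a 1 inserted at axis.
std::vector<int64_t> ExpandDimsShape(std::vector<int64_t> shape, int axis) {
  int64_t dims = static_cast<int64_t>(shape.size());
  if (axis < 0) axis += dims + 1;
  assert(axis >= 0 && axis <= dims && "axis is out of bound");
  shape.insert(shape.begin() + axis, 1);
  return shape;
}

int main() {
  for (int64_t d : ExpandDimsShape({2, 3}, -1)) std::cout << d << " ";
  std::cout << "\n";  // 2 3 1
  for (int64_t d : ExpandDimsShape({2, 3}, 0)) std::cout << d << " ";
  std::cout << "\n";  // 1 2 3
  return 0;
}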
mace/public/mace.h

@@ -326,7 +326,7 @@ class MACE_API MaceTensor {
   // of shared_ptr and manage the life cycle of the buffer by yourself.
   // For example, std::shared_ptr<float>(raw_buffer, [](float *){});
   MaceTensor(const std::vector<int64_t> &shape,
-             std::shared_ptr<float> data,
+             std::shared_ptr<void> data,
              const DataFormat format = DataFormat::NHWC);
   MaceTensor();
   MaceTensor(const MaceTensor &other);

@@ -339,8 +339,20 @@ class MACE_API MaceTensor {
   const std::vector<int64_t> &shape() const;
   const std::shared_ptr<float> data() const;
   std::shared_ptr<float> data();
+  template <typename T>
+  const std::shared_ptr<T> data() const {
+    return std::static_pointer_cast<T>(raw_data());
+  }
+  template <typename T>
+  std::shared_ptr<T> data() {
+    return std::static_pointer_cast<T>(raw_mutable_data());
+  }
   DataFormat data_format() const;

+ private:
+  std::shared_ptr<void> raw_data() const;
+  std::shared_ptr<void> raw_mutable_data();
+
  private:
   class Impl;
   std::unique_ptr<Impl> impl_;
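From the caller's side, the visible change is the shared_ptr<void> constructor plus the template accessors: existing float code compiles unchanged through data(), while int32 models use data<int>(). A hedged usage sketch against the patched header follows; the 1x128 shape, the value written, and the choice of DF_NONE are illustrative, not taken from this commit.

#include <cstdint>
#include <memory>
#include <vector>

#include "mace/public/mace.h"

int main() {
  // An int32 input, e.g. embedding ids; the 1x128 shape is hypothetical.
  std::vector<int64_t> shape = {1, 128};
  auto buffer = std::shared_ptr<int32_t>(new int32_t[128](),
                                         std::default_delete<int32_t[]>());
  buffer.get()[0] = 42;

  // shared_ptr<int32_t> converts implicitly to the new shared_ptr<void>
  // parameter; the array deleter travels with the shared control block.
  mace::MaceTensor tensor(shape, buffer, mace::DataFormat::DF_NONE);

  // Typed view through the new template accessor.
  std::shared_ptr<int> ids = tensor.data<int>();
  return ids.get()[0] == 42 ? 0 : 1;
}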
mace/python/tools/converter.py

@@ -47,6 +47,11 @@ data_format_map = {
     'OIHW': cvt.DataFormat.OIHW,
 }

+data_type_map = {
+    'float32': mace_pb2.DT_FLOAT,
+    'int32': mace_pb2.DT_INT32,
+}
+
 def parse_data_type(data_type, device_type):
     if device_type == cvt.DeviceType.CPU.value or \

@@ -141,6 +146,7 @@ def main(unused_args):
     option.data_type = parse_data_type(FLAGS.data_type, option.device)

     input_node_names = FLAGS.input_node.split(',')
+    input_data_types = FLAGS.input_data_types.split(',')
     input_node_shapes = FLAGS.input_shape.split(':')
     input_node_formats = FLAGS.input_data_formats.split(",")
     if FLAGS.input_range:

@@ -152,10 +158,8 @@ def main(unused_args):
     for i in six.moves.range(len(input_node_names)):
         input_node = cvt.NodeInfo()
         input_node.name = input_node_names[i]
-        if len(input_node_formats) == 1:
-            input_node.data_format = data_format_map[input_node_formats[0]]
-        else:
-            input_node.data_format = data_format_map[input_node_formats[i]]
+        input_node.data_type = data_type_map[input_data_types[i]]
+        input_node.data_format = data_format_map[input_node_formats[i]]
         input_node.shape = parse_int_array_from_str(input_node_shapes[i])
         if input_node.data_format == cvt.DataFormat.NCHW and\
                 len(input_node.shape) == 4:

@@ -166,6 +170,7 @@ def main(unused_args):
         option.add_input_node(input_node)

     output_node_names = FLAGS.output_node.split(',')
+    output_data_types = FLAGS.output_data_types.split(',')
     output_node_shapes = FLAGS.output_shape.split(':')
     output_node_formats = FLAGS.output_data_formats.split(",")
     if len(output_node_names) != len(output_node_shapes):

@@ -173,10 +178,8 @@ def main(unused_args):
     for i in six.moves.range(len(output_node_names)):
         output_node = cvt.NodeInfo()
         output_node.name = output_node_names[i]
-        if len(output_node_formats) == 1:
-            output_node.data_format = data_format_map[output_node_formats[0]]
-        else:
-            output_node.data_format = data_format_map[output_node_formats[i]]
+        output_node.data_type = data_type_map[output_data_types[i]]
+        output_node.data_format = data_format_map[output_node_formats[i]]
         output_node.shape = parse_int_array_from_str(output_node_shapes[i])
         if output_node.data_format == cvt.DataFormat.NCHW and\
                 len(output_node.shape) == 4:

@@ -290,6 +293,11 @@ def parse_args():
         type=str,
         default="input_node",
         help="e.g., input_node")
+    parser.add_argument(
+        "--input_data_types",
+        type=str,
+        default="float32",
+        help="e.g., float32|int32")
     parser.add_argument(
         "--input_data_formats",
         type=str,

@@ -297,6 +305,11 @@ def parse_args():
         help="e.g., NHWC,NONE")
     parser.add_argument(
         "--output_node", type=str, default="softmax", help="e.g., softmax")
+    parser.add_argument(
+        "--output_data_types",
+        type=str,
+        default="float32",
+        help="e.g., float32|int32")
     parser.add_argument(
         "--output_data_formats",
         type=str,
mace/python/tools/converter_tool/base_converter.py

@@ -298,6 +298,7 @@ class NodeInfo(object):
     def __init__(self):
         self._name = None
+        self._data_type = mace_pb2.DT_FLOAT
         self._shape = []
         self._data_format = DataFormat.NHWC
         self._range = [-1.0, 1.0]

@@ -306,6 +307,10 @@ class NodeInfo(object):
     def name(self):
         return self._name

+    @property
+    def data_type(self):
+        return self._data_type
+
     @property
     def shape(self):
         return self._shape

@@ -322,6 +327,10 @@ class NodeInfo(object):
     def name(self, name):
         self._name = name

+    @data_type.setter
+    def data_type(self, data_type):
+        self._data_type = data_type
+
     @shape.setter
     def shape(self, shape):
         self._shape = shape
mace/python/tools/converter_tool/tensorflow_converter.py

@@ -102,6 +102,7 @@ TFSupportedOps = [
     'Mean',
     'Const',
     'Gather',
+    'GatherV2',
     'StridedSlice',
     'Slice',
     'ReverseV2',

@@ -241,6 +242,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
             TFOpType.Mean.name: self.convert_mean,
             TFOpType.Const.name: self.convert_nop,
             TFOpType.Gather.name: self.convert_gather,
+            TFOpType.GatherV2.name: self.convert_gather,
             TFOpType.StridedSlice.name: self.convert_stridedslice,
             TFOpType.Slice.name: self.convert_slice,
             TFOpType.ReverseV2.name: self.convert_reverse,

@@ -838,16 +840,11 @@ class TensorflowConverter(base_converter.ConverterInterface):
         op = self.convert_general_op(tf_op)
         op.type = MaceOp.ExpandDims.name

+        axis_value = tf_op.inputs[1].eval().astype(np.int32)
         axis_arg = op.arg.add()
         axis_arg.name = MaceKeyword.mace_axis_str
-        try:
-            axis_value = tf_op.get_attr('dim')
-        except ValueError:
-            try:
-                axis_value = tf_op.get_attr('axis')
-            except ValueError:
-                axis_value = 0
         axis_arg.i = axis_value
+        del op.input[1]

     def convert_squeeze(self, tf_op):
         op = self.convert_general_op(tf_op)
mace/python/tools/converter_tool/transformer.py

@@ -323,7 +323,7 @@ class Transformer(base_converter.ConverterInterface):
             input_info.name = input_node.name
             input_info.data_format = input_node.data_format.value
             input_info.dims.extend(input_node.shape)
-            input_info.data_type = mace_pb2.DT_FLOAT
+            input_info.data_type = input_node.data_type

         output_nodes = self._option.check_nodes.values()
         for output_node in output_nodes:

@@ -332,7 +332,7 @@ class Transformer(base_converter.ConverterInterface):
             output_info.data_format = output_node.data_format.value
             output_info.dims.extend(
                 self._producer[output_node.name].output_shape[0].dims)
-            output_info.data_type = mace_pb2.DT_FLOAT
+            output_info.data_type = output_node.data_type

         return False
mace/tools/validation/mace_run.cc

@@ -269,17 +269,18 @@ bool RunModel(const std::string &model_name,
   std::map<std::string, mace::MaceTensor> outputs;
   for (size_t i = 0; i < input_count; ++i) {
     // Allocate input and output
+    // only support float and int32, use char for generalization
+    // sizeof(int) == 4, sizeof(float) == 4
     int64_t input_size =
-        std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
-                        std::multiplies<int64_t>());
+        std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 4,
+                        std::multiplies<int64_t>());
-    auto buffer_in = std::shared_ptr<float>(new float[input_size],
-                                            std::default_delete<float[]>());
+    auto buffer_in = std::shared_ptr<char>(new char[input_size],
+                                           std::default_delete<char[]>());
     // load input
     std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
                           std::ios::in | std::ios::binary);
     if (in_file.is_open()) {
-      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
-                   input_size * sizeof(float));
+      in_file.read(buffer_in.get(), input_size);
       in_file.close();
     } else {
       LOG(INFO) << "Open input file failed";

@@ -290,11 +291,12 @@ bool RunModel(const std::string &model_name,
   }
   for (size_t i = 0; i < output_count; ++i) {
+    // only support float and int32, use char for generalization
     int64_t output_size =
-        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
-                        std::multiplies<int64_t>());
+        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4,
+                        std::multiplies<int64_t>());
-    auto buffer_out = std::shared_ptr<float>(new float[output_size],
-                                             std::default_delete<float[]>());
+    auto buffer_out = std::shared_ptr<char>(new char[output_size],
+                                            std::default_delete<char[]>());
     outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
                                                 output_data_formats[i]);
   }

@@ -408,12 +410,12 @@ bool RunModel(const std::string &model_name,
       std::string output_name =
           FLAGS_output_file + "_" + FormatName(output_names[i]);
       std::ofstream out_file(output_name, std::ios::binary);
+      // only support float and int32
       int64_t output_size =
-          std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
-                          std::multiplies<int64_t>());
+          std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4,
+                          std::multiplies<int64_t>());
-      out_file.write(
-          reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
-          output_size * sizeof(float));
+      out_file.write(outputs[output_names[i]].data<char>().get(),
+                     output_size);
       out_file.flush();
       out_file.close();
       LOG(INFO) << "Write output file " << output_name << " with size "

@@ -478,6 +480,7 @@ int Main(int argc, char **argv) {
   // get cpu capability
   Capability cpu_capability = GetCapability(DeviceType::CPU);
+  float cpu_float32_performance = cpu_capability.float32_performance.exec_time;

   bool ret = false;
   for (int i = 0; i < FLAGS_restart_round; ++i) {

@@ -485,7 +488,7 @@ int Main(int argc, char **argv) {
     ret = RunModel(FLAGS_model_name, input_names, input_shape_vec,
                    input_data_formats, output_names, output_shape_vec,
-                   output_data_formats,
-                   cpu_capability.float32_performance.exec_time);
+                   output_data_formats, cpu_float32_performance);
   }
   if (ret) {
     return 0;
tools/common.py

@@ -397,6 +397,7 @@ class YAMLKeyword(object):
     runtime = 'runtime'
     data_type = 'data_type'
     input_data_types = 'input_data_types'
+    output_data_types = 'output_data_types'
     input_data_formats = 'input_data_formats'
     output_data_formats = 'output_data_formats'
     limit_opencl_kernel_time = 'limit_opencl_kernel_time'
tools/converter.py

@@ -65,13 +65,13 @@ RuntimeTypeStrs = [
     "cpu+gpu"
 ]

-InputDataTypeStrs = [
+InOutDataTypeStrs = [
     "int32",
     "float32",
 ]

-InputDataType = Enum('InputDataType',
-                     [(ele, ele) for ele in InputDataTypeStrs],
-                     type=str)
+InOutDataType = Enum('InputDataType',
+                     [(ele, ele) for ele in InOutDataTypeStrs],
+                     type=str)

 FPDataTypeStrs = [

@@ -410,17 +410,23 @@ def format_model_config(flags):
             else:
                 subgraph[key] = []

-            input_data_types = subgraph.get(YAMLKeyword.input_data_types, "")
-            if input_data_types:
-                if not isinstance(input_data_types, list):
-                    subgraph[YAMLKeyword.input_data_types] = [input_data_types]
-                for input_data_type in subgraph[YAMLKeyword.input_data_types]:
-                    mace_check(input_data_type in InputDataTypeStrs,
-                               ModuleName.YAML_CONFIG,
-                               "'input_data_types' must be in "
-                               + str(InputDataTypeStrs))
-            else:
-                subgraph[YAMLKeyword.input_data_types] = []
+            for key in [YAMLKeyword.input_data_types,
+                        YAMLKeyword.output_data_types]:
+                if key == YAMLKeyword.input_data_types:
+                    count = input_size
+                else:
+                    count = output_size
+                data_types = subgraph.get(key, "")
+                if data_types:
+                    if not isinstance(data_types, list):
+                        subgraph[key] = [data_types] * count
+                    for data_type in subgraph[key]:
+                        mace_check(data_type in InOutDataTypeStrs,
+                                   ModuleName.YAML_CONFIG,
+                                   key + " must be in "
+                                   + str(InOutDataTypeStrs))
+                else:
+                    subgraph[key] = [InOutDataType.float32] * count

             input_data_formats = subgraph.get(YAMLKeyword.input_data_formats,
                                               [])

@@ -722,8 +728,10 @@ def convert_model(configs, cl_mem_type):
             model_config[YAMLKeyword.model_sha256_checksum],
             model_config[YAMLKeyword.weight_sha256_checksum],
             ",".join(subgraphs[0][YAMLKeyword.input_tensors]),
+            ",".join(subgraphs[0][YAMLKeyword.input_data_types]),
             ",".join(subgraphs[0][YAMLKeyword.input_data_formats]),
             ",".join(subgraphs[0][YAMLKeyword.output_tensors]),
+            ",".join(subgraphs[0][YAMLKeyword.output_data_types]),
             ",".join(subgraphs[0][YAMLKeyword.output_data_formats]),
             ",".join(subgraphs[0][YAMLKeyword.check_tensors]),
             runtime,
tools/sh_commands.py

@@ -484,8 +484,10 @@ def gen_model_code(model_codegen_dir,
                    model_sha256_checksum,
                    weight_sha256_checksum,
                    input_nodes,
+                   input_data_types,
                    input_data_formats,
                    output_nodes,
+                   output_data_types,
                    output_data_formats,
                    check_nodes,
                    runtime,

@@ -519,8 +521,10 @@ def gen_model_code(model_codegen_dir,
             "--model_checksum=%s" % model_sha256_checksum,
             "--weight_checksum=%s" % weight_sha256_checksum,
             "--input_node=%s" % input_nodes,
+            "--input_data_types=%s" % input_data_types,
             "--input_data_formats=%s" % input_data_formats,
             "--output_node=%s" % output_nodes,
+            "--output_data_types=%s" % output_data_types,
             "--output_data_formats=%s" % output_data_formats,
             "--check_node=%s" % check_nodes,
             "--runtime=%s" % runtime,