diff --git a/.travis.yml b/.travis.yml
index b0ba55e06293a857defd7943f58ba3e5ed339c46..889c11d3d0557877463188a57d73e76f932ebca0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -114,7 +114,7 @@ jobs:
         - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=False --target_abis=armeabi-v7a || exit 1
         - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=False --target_abis=armeabi-v7a || exit 1
         - echo 'Extra Test'
-        - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=False --target_abis=armeabi-v7a || exit 1
+        - python tools/bazel_adb_run.py --target="//mace/utils:utils_test" --run_target=False --target_abis=armeabi-v7a || exit 1
       env: TYPE=Extra-Test-ARMEABI-v7a
       os: linux
       dist: xenial
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index adb267f3c8bb5361e5b4f929d3888b37b1c014f2..685ad8f2e3edca492b58b66ffc4e0d149146c33c 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -332,18 +332,17 @@ int Main(int argc, char **argv) {
   std::map<std::string, mace::MaceTensor> inputs;
   std::map<std::string, mace::MaceTensor> outputs;
   for (size_t i = 0; i < input_count; ++i) {
-    // Allocate input and output
+    // Only float and int32 (4-byte elements) are supported; use a byte buffer.
     int64_t input_size =
-        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 1,
+        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 4,
                         std::multiplies<int64_t>());
-    auto buffer_in = std::shared_ptr<float>(new float[input_size],
-                                            std::default_delete<float[]>());
+    auto buffer_in = std::shared_ptr<char>(new char[input_size],
+                                            std::default_delete<char[]>());
     // load input
     std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
                           std::ios::in | std::ios::binary);
     if (in_file.is_open()) {
-      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
-                   input_size * sizeof(float));
+      in_file.read(buffer_in.get(), input_size);
       in_file.close();
     } else {
       LOG(INFO) << "Open input file failed";
@@ -354,12 +353,13 @@ int Main(int argc, char **argv) {
   }
 
   for (size_t i = 0; i < output_count; ++i) {
+    // Only float and int32 (4-byte elements) are supported; use a byte buffer.
     int64_t output_size =
         std::accumulate(output_shape_vec[i].begin(),
-                        output_shape_vec[i].end(), 1,
+                        output_shape_vec[i].end(), 4,
                         std::multiplies<int64_t>());
-    auto buffer_out = std::shared_ptr<float>(new float[output_size],
-                                             std::default_delete<float[]>());
+    auto buffer_out = std::shared_ptr<char>(new char[output_size],
+                                            std::default_delete<char[]>());
     outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
                                                 buffer_out,
                                                 output_data_formats[i]);
diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc
index 0136e3f130e679a13f3e50ccc86c6c4bfda1e4f6..7f86d0eb426d5c5834f9d498f9554c73a0602df0 100644
--- a/mace/core/memory_optimizer.cc
+++ b/mace/core/memory_optimizer.cc
@@ -33,7 +33,7 @@ namespace mace {
 
 bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
   static const std::unordered_set<std::string> kReuseOp = {
-      "Reshape", "Identity", "Squeeze", "ExpandDims"
+      "Reshape", "Identity", "Squeeze"
   };
   return kReuseOp.count(op_type) == 1;
 }
diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc
index 26f615d132421011207429be6cffc516751863bb..845f9ff6f83f3814a9d00face7e3573f134f5e14 100644
--- a/mace/examples/cli/example.cc
+++ b/mace/examples/cli/example.cc
@@ -267,6 +267,7 @@ bool RunModel(const std::vector<std::string> &input_names,
         std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
                         std::multiplies<int64_t>());
     inputs_size[input_names[i]] = input_size;
+    // Only the float and int32 data types are supported.
     auto buffer_in = std::shared_ptr<float>(new float[input_size],
                                             std::default_delete<float[]>());
     inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,
@@ -277,6 +278,7 @@ bool RunModel(const std::vector<std::string> &input_names,
     int64_t output_size =
         std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
                         std::multiplies<int64_t>());
+    // Only the float and int32 data types are supported.
     auto buffer_out = std::shared_ptr<float>(new float[output_size],
                                              std::default_delete<float[]>());
     outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 3476fef1486d9bf95d8dad01eb3f2f73a8520115..0a44eb97e3969329721e4aa7c95cc8340541b3b8 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -284,13 +284,13 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
 class MaceTensor::Impl {
  public:
   std::vector<int64_t> shape;
-  std::shared_ptr<float> data;
+  std::shared_ptr<void> data;
   DataFormat format;
   int64_t buffer_size;
 };
 
 MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
-                       std::shared_ptr<float> data,
+                       std::shared_ptr<void> data,
                        const DataFormat format) {
   MACE_CHECK_NOTNULL(data.get());
   MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC
@@ -345,9 +345,21 @@ MaceTensor::~MaceTensor() = default;
 
 const std::vector<int64_t> &MaceTensor::shape() const { return impl_->shape; }
 
-const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
+const std::shared_ptr<float> MaceTensor::data() const {
+  return std::static_pointer_cast<float>(impl_->data);
+}
+
+std::shared_ptr<float> MaceTensor::data() {
+  return std::static_pointer_cast<float>(impl_->data);
+}
+
+std::shared_ptr<void> MaceTensor::raw_data() const {
+  return impl_->data;
+}
 
-std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
+std::shared_ptr<void> MaceTensor::raw_mutable_data() {
+  return impl_->data;
+}
 
 DataFormat MaceTensor::data_format() const {
   return impl_->format;
@@ -466,8 +478,9 @@ MaceStatus MaceEngine::Impl::Init(
                  << "' does not belong to model's inputs: "
                  << MakeString(MapKeys(input_info_map_));
     }
+    DataType input_dt = input_info_map_[input_name].data_type();
     Tensor *input_tensor =
-        ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT);
+        ws_->CreateTensor(input_name, device_->allocator(), input_dt);
     // Resize to possible largest shape to avoid resize during running.
     std::vector<index_t> shape(input_info_map_[input_name].dims_size());
     for (int i = 0; i < input_info_map_[input_name].dims_size(); ++i) {
@@ -485,8 +498,9 @@ MaceStatus MaceEngine::Impl::Init(
                  << MakeString(MapKeys(output_info_map_));
     }
 #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+    DataType output_dt = output_info_map_[output_name].data_type();
     Tensor *output_tensor =
-        ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
+        ws_->CreateTensor(output_name, device_->allocator(), output_dt);
     output_tensor->set_data_format(NHWC);
 #endif
   }
@@ -572,54 +586,71 @@ MaceStatus MaceEngine::Impl::TransposeInput(
     Tensor *input_tensor) {
   bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
   DataFormat data_format = DataFormat::DF_NONE;
+  DataType input_dt = input_tensor->dtype();
   if (has_data_format) {
+    std::vector<int> dst_dims;
     if (device_->device_type() == DeviceType::CPU &&
         input.second.shape().size() == 4 &&
         input.second.data_format() == NHWC &&
         !is_quantized_model_) {
       VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
       input_tensor->set_data_format(DataFormat::NCHW);
-      std::vector<int> dst_dims = {0, 3, 1, 2};
-      std::vector<index_t> output_shape =
-          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
-      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
-      Tensor::MappingGuard input_guard(input_tensor);
-      float *input_data = input_tensor->mutable_data<float>();
-      return ops::Transpose(input.second.data().get(),
-                            input.second.shape(),
-                            dst_dims,
-                            input_data);
+      dst_dims = {0, 3, 1, 2};
     } else if (
         (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
             input.second.shape().size() == 4 &&
             input.second.data_format() == DataFormat::NCHW) {
       VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
-      std::vector<int> dst_dims = {0, 2, 3, 1};
       input_tensor->set_data_format(DataFormat::NHWC);
+      dst_dims = {0, 2, 3, 1};
+    }
+    if (!dst_dims.empty()) {
       std::vector<index_t> output_shape =
           TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
       MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
       Tensor::MappingGuard input_guard(input_tensor);
-      float *input_data = input_tensor->mutable_data<float>();
-      return ops::Transpose(input.second.data().get(),
-                            input.second.shape(),
-                            dst_dims,
-                            input_data);
+      if (input_dt == DataType::DT_FLOAT) {
+        auto input_data = input_tensor->mutable_data<float>();
+        return ops::Transpose(input.second.data<float>().get(),
+                              input.second.shape(),
+                              dst_dims,
+                              input_data,
+                              input_dt);
+      } else if (input_dt == DataType::DT_INT32) {
+        auto input_data = input_tensor->mutable_data<int>();
+        return ops::Transpose(input.second.data<int>().get(),
+                              input.second.shape(),
+                              dst_dims,
+                              input_data,
+                              input_dt);
+      } else {
+        LOG(FATAL) << "MACE do not support the input data type: " << input_dt;
+      }
     }
+
     data_format = input.second.data_format();
   }
   input_tensor->set_data_format(data_format);
   MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
   Tensor::MappingGuard input_guard(input_tensor);
-  float *input_data = input_tensor->mutable_data<float>();
-  memcpy(input_data, input.second.data().get(),
-         input_tensor->size() * sizeof(float));
+  if (input_dt == DataType::DT_FLOAT) {
+    auto input_data = input_tensor->mutable_data<float>();
+    memcpy(input_data, input.second.data().get(),
+           input_tensor->size() * sizeof(float));
+  } else if (input_dt == DataType::DT_INT32) {
+    auto input_data = input_tensor->mutable_data<int>();
+    memcpy(input_data, input.second.data().get(),
+           input_tensor->size() * sizeof(int));
+  } else {
+    LOG(FATAL) << "MACE do not support the input data type: " << input_dt;
+  }
   return MaceStatus::MACE_SUCCESS;
 }
 
 MaceStatus MaceEngine::Impl::TransposeOutput(
     const mace::Tensor *output_tensor,
     std::pair<const std::string, mace::MaceTensor> *output) {
+  DataType output_dt = output_tensor ? output_tensor->dtype() : DT_FLOAT;
   // save output
   if (output_tensor != nullptr && output->second.data() != nullptr) {
     if (output_tensor->data_format() != DataFormat::DF_NONE &&
@@ -655,11 +686,23 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
         << output->second.impl_->buffer_size;
       output->second.impl_->shape = shape;
       Tensor::MappingGuard output_guard(output_tensor);
-      const float *output_data = output_tensor->data<float>();
-      return ops::Transpose(output_data,
-                            output_tensor->shape(),
-                            dst_dims,
-                            output->second.data().get());
+      if (output_dt == DataType::DT_FLOAT) {
+        auto output_data = output_tensor->data<float>();
+        return ops::Transpose(output_data,
+                              output_tensor->shape(),
+                              dst_dims,
+                              output->second.data<float>().get());
+      } else if (output_dt == DataType::DT_INT32) {
+        auto output_data = output_tensor->data<int>();
+        return ops::Transpose(output_data,
+                              output_tensor->shape(),
+                              dst_dims,
+                              output->second.data<int>().get(),
+                              output_dt);
+      } else {
+        LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
+        return MaceStatus::MACE_INVALID_ARGS;
+      }
     } else {
       Tensor::MappingGuard output_guard(output_tensor);
       auto shape = output_tensor->shape();
@@ -670,8 +713,17 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
         << MakeString<int64_t>(shape) << " vs buffer size "
         << output->second.impl_->buffer_size;
       output->second.impl_->shape = shape;
-      std::memcpy(output->second.data().get(), output_tensor->data<float>(),
-                  output_size * sizeof(float));
+      if (output_dt == DataType::DT_FLOAT) {
+        std::memcpy(output->second.data<float>().get(),
+                    output_tensor->data<float>(),
+                    output_size * sizeof(float));
+      } else if (output_dt == DataType::DT_INT32) {
+        std::memcpy(output->second.data<int>().get(),
+            output_tensor->data<int>(),
+            output_size * sizeof(int));
+      } else {
+        LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
+      }
       return MaceStatus::MACE_SUCCESS;
     }
   } else {
diff --git a/mace/ops/common/transpose.cc b/mace/ops/common/transpose.cc
index 469456a1c4424445ba836261c0f9bd71db878155..79a7a6be064368f34864fee115af6d7735b50a83 100644
--- a/mace/ops/common/transpose.cc
+++ b/mace/ops/common/transpose.cc
@@ -14,19 +14,14 @@
 
 #include "mace/ops/common/transpose.h"
 
-#include <algorithm>
-
 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
 #endif
 
-#include "mace/core/types.h"
-#include "mace/utils/logging.h"
-
 namespace mace {
 namespace ops {
 
-namespace {
+namespace transpose {
 void TransposeNHWCToNCHWC3(const float *input,
                            float *output,
                            const index_t height,
@@ -100,119 +95,44 @@ void TransposeNCHWToNHWCC2(const float *input,
 #endif
   }
 }
-}  // namespace
 
-MaceStatus Transpose(const float *input,
-                     const std::vector<int64_t> &input_shape,
-                     const std::vector<int> &dst_dims,
-                     float *output) {
-  MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
-               (input_shape.size() == 4 && dst_dims.size() == 4),
-             "Only support 2D or 4D transpose");
+void TransposeNHWCToNCHWC3(const int *input,
+                           int *output,
+                           const index_t height,
+                           const index_t width) {
+  index_t image_size = height * width;
 
-  std::vector<index_t> output_shape;
-  for (size_t i = 0; i < dst_dims.size(); ++i) {
-    output_shape.push_back(input_shape[dst_dims[i]]);
-  }
+#pragma omp parallel for
+  for (index_t h = 0; h < height; ++h) {
+    index_t in_offset = h * width * 3;
+    index_t out_offset = h * width;
 
-  if (input_shape.size() == 2) {
-    MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform");
-    index_t height = input_shape[0];
-    index_t width = input_shape[1];
-    index_t stride_i = height;
-    index_t stride_j = width;
-    index_t tile_size = height > 512 || width > 512 ? 64 : 32;
-#pragma omp parallel for collapse(2)
-    for (index_t i = 0; i < height; i += tile_size) {
-      for (index_t j = 0; j < width; j += tile_size) {
-        index_t end_i = std::min(i + tile_size, height);
-        index_t end_j = std::min(j + tile_size, width);
-        for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
-          for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
-            output[tile_j * stride_i + tile_i] =
-              input[tile_i * stride_j + tile_j];
-          }
-        }
+    for (index_t w = 0; w < width; ++w) {
+      for (index_t c = 0; c < 3; ++c) {
+        output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c];
       }
     }
-  } else if (input_shape.size() == 4) {
-    std::vector<int> transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2};
-    std::vector<int> transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1};
-    index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3];
-
-    if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3) {
-      for (index_t b = 0; b < input_shape[0]; ++b) {
-        TransposeNHWCToNCHWC3(input + b * batch_size,
-                              output + b * batch_size,
-                              input_shape[1],
-                              input_shape[2]);
-      }
-    } else if (dst_dims == transpose_order_from_NCHW_to_NHWC
-      && input_shape[1] == 2) {
-      for (index_t b = 0; b < input_shape[0]; ++b) {
-        TransposeNCHWToNHWCC2(input + b * batch_size,
-                              output + b * batch_size,
-                              input_shape[2],
-                              input_shape[3]);
-      }
-    } else if (dst_dims == std::vector<int>{0, 2, 1, 3}) {
-      index_t height = input_shape[1];
-      index_t width = input_shape[2];
-      index_t channel = input_shape[3];
-      index_t channel_raw_size = channel * sizeof(float);
-      index_t stride_i = height;
-      index_t stride_j = width;
-      index_t tile_size = std::max(static_cast<index_t>(1),
-                                   static_cast<index_t>(std::sqrt(
-                                     8 * 1024 / channel)));
-#pragma omp parallel for collapse(2)
-      for (index_t i = 0; i < height; i += tile_size) {
-        for (index_t j = 0; j < width; j += tile_size) {
-          index_t end_i = std::min(i + tile_size, height);
-          index_t end_j = std::min(j + tile_size, width);
-          for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
-            for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
-              memcpy(output + (tile_j * stride_i + tile_i) * channel,
-                     input + (tile_i * stride_j + tile_j) * channel,
-                     channel_raw_size);
-            }
-          }
-        }
-      }
-    } else {
-      std::vector<index_t>
-        in_stride{input_shape[1] * input_shape[2] * input_shape[3],
-                  input_shape[2] * input_shape[3], input_shape[3], 1};
-      std::vector<index_t>
-        out_stride{output_shape[1] * output_shape[2] * output_shape[3],
-                   output_shape[2] * output_shape[3], output_shape[3], 1};
+  }
+}
 
-      std::vector<index_t> idim(4, 0);
-      std::vector<index_t> odim(4, 0);
-      for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
-        for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
-          for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
-            for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
-              idim[dst_dims[0]] = odim[0];
-              idim[dst_dims[1]] = odim[1];
-              idim[dst_dims[2]] = odim[2];
-              idim[dst_dims[3]] = odim[3];
+void TransposeNCHWToNHWCC2(const int *input,
+                           int *output,
+                           const index_t height,
+                           const index_t width) {
+  index_t image_size = height * width;
+#pragma omp parallel for
+  for (index_t h = 0; h < height; ++h) {
+    index_t in_offset = h * width;
+    index_t out_offset = h * width * 2;
 
-              output[odim[0] * out_stride[0] + odim[1] * out_stride[1]
-                + odim[2] * out_stride[2] + odim[3]] =
-                input[idim[0] * in_stride[0] + idim[1] * in_stride[1]
-                  + idim[2] * in_stride[2] + idim[3]];
-            }
-          }
-        }
+    for (index_t w = 0; w < width; ++w) {
+      for (index_t c = 0; c < 2; ++c) {
+        output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w];
       }
     }
-  } else {
-    MACE_NOT_IMPLEMENTED;
   }
-
-  return MaceStatus::MACE_SUCCESS;
 }
+}  // namespace transpose
 
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/common/transpose.h b/mace/ops/common/transpose.h
index 5f8e23490698ab71439d2486bc32d269e8d5ee0b..4d2e5a519e680276884fb95ad6edf088738c99d0 100644
--- a/mace/ops/common/transpose.h
+++ b/mace/ops/common/transpose.h
@@ -15,17 +15,175 @@
 #ifndef MACE_OPS_COMMON_TRANSPOSE_H_
 #define MACE_OPS_COMMON_TRANSPOSE_H_
 
+#include <algorithm>
+#include <cmath>
+#include <cstring>
 #include <vector>
 
 #include "mace/public/mace.h"
+#include "mace/core/tensor.h"
 
 namespace mace {
 namespace ops {
+namespace transpose {
 
-MaceStatus Transpose(const float *input,
+void TransposeNHWCToNCHWC3(const float *input,
+                           float *output,
+                           const index_t height,
+                           const index_t width);
+
+void TransposeNHWCToNCHWC3(const int *input,
+                           int *output,
+                           const index_t height,
+                           const index_t width);
+
+void TransposeNCHWToNHWCC2(const float *input,
+                           float *output,
+                           const index_t height,
+                           const index_t width);
+
+void TransposeNCHWToNHWCC2(const int *input,
+                           int *output,
+                           const index_t height,
+                           const index_t width);
+}  // namespace transpose
+
+// Permutes the dimensions of a densely packed 2D or 4D array.
+//
+//   input       source elements, laid out according to input_shape
+//   input_shape extents of the source; must have 2 or 4 entries
+//   dst_dims    permutation: output dimension i is input dimension
+//               dst_dims[i]; a 2D input only accepts {1, 0}
+//   output      destination buffer with room for every input element
+//   data_type   gates the specialised NHWC->NCHW (3 channels) and
+//               NCHW->NHWC (2 channels) paths, which are only taken for
+//               DT_FLOAT and DT_INT32
+//
+// T has to be float or int, because the helpers in namespace transpose are
+// declared for those two element types only.
+// Returns MaceStatus::MACE_SUCCESS once the copy has completed.
+template <typename T>
+MaceStatus Transpose(const T *input,
                      const std::vector<int64_t> &input_shape,
                      const std::vector<int> &dst_dims,
-                     float *output);
+                     T *output,
+                     DataType data_type = DataType::DT_FLOAT) {
+  MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
+      (input_shape.size() == 4 && dst_dims.size() == 4),
+             "Only support 2D or 4D transpose");
+
+  std::vector<index_t> output_shape;
+  for (size_t i = 0; i < dst_dims.size(); ++i) {
+    output_shape.push_back(input_shape[dst_dims[i]]);
+  }
+
+  if (input_shape.size() == 2) {
+    MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform");
+    index_t height = input_shape[0];
+    index_t width = input_shape[1];
+    index_t stride_i = height;
+    index_t stride_j = width;
+    index_t tile_size = height > 512 || width > 512 ? 64 : 32;
+#pragma omp parallel for collapse(2)
+    for (index_t i = 0; i < height; i += tile_size) {
+      for (index_t j = 0; j < width; j += tile_size) {
+        index_t end_i = std::min(i + tile_size, height);
+        index_t end_j = std::min(j + tile_size, width);
+        for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
+          for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
+            output[tile_j * stride_i + tile_i] =
+                input[tile_i * stride_j + tile_j];
+          }
+        }
+      }
+    }
+  } else if (input_shape.size() == 4) {
+    std::vector<int> transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2};
+    std::vector<int> transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1};
+    index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3];
+    bool supported_dt = (data_type == DataType::DT_FLOAT ||
+        data_type == DataType::DT_INT32);
+
+    if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3 &&
+        supported_dt) {
+      for (index_t b = 0; b < input_shape[0]; ++b) {
+        transpose::TransposeNHWCToNCHWC3(input + b * batch_size,
+                                         output + b * batch_size,
+                                         input_shape[1],
+                                         input_shape[2]);
+      }
+    } else if (dst_dims == transpose_order_from_NCHW_to_NHWC
+        && input_shape[1] == 2 && supported_dt) {
+      for (index_t b = 0; b < input_shape[0]; ++b) {
+        transpose::TransposeNCHWToNHWCC2(input + b * batch_size,
+                                         output + b * batch_size,
+                                         input_shape[2],
+                                         input_shape[3]);
+      }
+    } else if (dst_dims == std::vector<int>{0, 2, 1, 3}) {
+      // NOTE(review): dimension 0 is never indexed on this path, so only the
+      // first batch is copied -- confirm callers always pass batch == 1.
+      index_t height = input_shape[1];
+      index_t width = input_shape[2];
+      index_t channel = input_shape[3];
+      index_t channel_raw_size = channel * sizeof(T);
+      index_t stride_i = height;
+      index_t stride_j = width;
+      // Clamp the divisor so that a zero-channel tensor cannot cause an
+      // integer division by zero.
+      index_t tile_size = std::max(
+          static_cast<index_t>(1),
+          static_cast<index_t>(std::sqrt(
+              8 * 1024 / std::max(static_cast<index_t>(1), channel))));
+#pragma omp parallel for collapse(2)
+      for (index_t i = 0; i < height; i += tile_size) {
+        for (index_t j = 0; j < width; j += tile_size) {
+          index_t end_i = std::min(i + tile_size, height);
+          index_t end_j = std::min(j + tile_size, width);
+          for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
+            for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
+              std::memcpy(output + (tile_j * stride_i + tile_i) * channel,
+                          input + (tile_i * stride_j + tile_j) * channel,
+                          channel_raw_size);
+            }
+          }
+        }
+      }
+    } else {
+      std::vector<index_t>
+          in_stride{input_shape[1] * input_shape[2] * input_shape[3],
+                    input_shape[2] * input_shape[3], input_shape[3], 1};
+      std::vector<index_t>
+          out_stride{output_shape[1] * output_shape[2] * output_shape[3],
+                     output_shape[2] * output_shape[3], output_shape[3], 1};
+
+      std::vector<index_t> idim(4, 0);
+      std::vector<index_t> odim(4, 0);
+      for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
+        for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
+          for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
+            for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
+              idim[dst_dims[0]] = odim[0];
+              idim[dst_dims[1]] = odim[1];
+              idim[dst_dims[2]] = odim[2];
+              idim[dst_dims[3]] = odim[3];
+
+              output[odim[0] * out_stride[0] + odim[1] * out_stride[1]
+                  + odim[2] * out_stride[2] + odim[3]] =
+                  input[idim[0] * in_stride[0] + idim[1] * in_stride[1]
+                      + idim[2] * in_stride[2] + idim[3]];
+            }
+          }
+        }
+      }
+    }
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
 
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/expand_dims.cc b/mace/ops/expand_dims.cc
index 5d7ad1bade6fe8f0dbdbfebedbea892c9d354b28..2d99d7a742659549c750fc1246449f35701f2277 100644
--- a/mace/ops/expand_dims.cc
+++ b/mace/ops/expand_dims.cc
@@ -14,6 +14,8 @@
 
 
 #include "mace/core/operator.h"
+#include "mace/ops/common/transpose.h"
+#include "mace/utils/math.h"
 
 namespace mace {
 namespace ops {
@@ -33,21 +35,35 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
     const Tensor *input = this->Input(0);
     Tensor *output = this->Output(0);
     index_t input_dims_size = input->dim_size();
-    if ( axis_ < 0 ) {
+    if (axis_ < 0) {
       axis_ += input_dims_size + 1;
     }
     MACE_CHECK(axis_ >= 0 && axis_ <= input_dims_size,
                "axis is out of bound: ", axis_);
     const std::vector<index_t> input_shape = input->shape();
-    std::vector<index_t> output_shape;
-    output_shape.insert(output_shape.end(), input_shape.begin(),
-                        input_shape.begin() + axis_);
-    output_shape.insert(output_shape.end(), 1);
-    output_shape.insert(output_shape.end(), input_shape.begin() + axis_,
-                        input_shape.end());
+    std::vector<index_t> output_shape(input_shape);
+    output_shape.insert(output_shape.begin() + axis_, 1);
 
-    output->ReuseTensorBuffer(*input);
-    output->Reshape(output_shape);
+    bool has_data_format = Operation::GetOptionalArg<int>(
+        "has_data_format", 0) == 1;
+    if (has_data_format && output_shape.size() == 4) {
+      // Only TensorFlow supports ExpandDims, so the data arrives as NHWC;
+      // transpose it to NCHW and propagate any Resize/Transpose failure.
+      auto t_output_shape = TransposeShape<int64_t, int64_t>(output_shape,
+          {0, 3, 1, 2});
+      MACE_RETURN_IF_ERROR(output->Resize(t_output_shape));
+      Tensor::MappingGuard input_guard(input);
+      Tensor::MappingGuard output_guard(output);
+      auto input_data = input->data<T>();
+      auto output_data = output->mutable_data<T>();
+
+      return Transpose(input_data, output_shape, {0, 3, 1, 2}, output_data);
+    } else {
+      MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+      Tensor::MappingGuard input_guard(input);
+      auto input_data = input->data<T>();
+      output->Copy<T>(input_data, input->size());
+    }
 
     return MaceStatus::MACE_SUCCESS;
   }
@@ -62,11 +78,6 @@ void RegisterExpandDims(OpRegistryBase *op_registry) {
 
   MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
                    DeviceType::CPU, int32_t);
-
-#ifdef MACE_ENABLE_QUANTIZE
-  MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
-                   DeviceType::CPU, uint8_t);
-#endif  // MACE_ENABLE_QUANTIZE
 }
 
 }  // namespace ops
diff --git a/mace/public/mace.h b/mace/public/mace.h
index c265401ed3ca3f0eb88a51ed03ab206aa2c7c2b3..8cc251132d9d2ee26ecf70b2684e7eee25f50f15 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -326,7 +326,7 @@ class MACE_API MaceTensor {
   //        of shared_ptr and manage the life cycle of the buffer by yourself.
   //        For example, std::shared_ptr<float>(raw_buffer, [](float *){});
   MaceTensor(const std::vector<int64_t> &shape,
-             std::shared_ptr<float> data,
+             std::shared_ptr<void> data,
              const DataFormat format = DataFormat::NHWC);
   MaceTensor();
   MaceTensor(const MaceTensor &other);
@@ -339,8 +339,20 @@ class MACE_API MaceTensor {
   const std::vector<int64_t> &shape() const;
   const std::shared_ptr<float> data() const;
   std::shared_ptr<float> data();
+  template <typename T>
+  const std::shared_ptr<T> data() const {
+    return std::static_pointer_cast<T>(raw_data());
+  }
+  template <typename T>
+  std::shared_ptr<T> data() {
+    return std::static_pointer_cast<T>(raw_mutable_data());
+  }
   DataFormat data_format() const;
 
+ private:
+  std::shared_ptr<void> raw_data() const;
+  std::shared_ptr<void> raw_mutable_data();
+
  private:
   class Impl;
   std::unique_ptr<Impl> impl_;
diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py
index 0de68ce4f6af1c0ae6c995e77738015b998dafba..446321a447703414ba00e51d74745c5df635ee69 100644
--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
@@ -47,6 +47,11 @@ data_format_map = {
     'OIHW': cvt.DataFormat.OIHW,
 }
 
+data_type_map = {
+    'float32': mace_pb2.DT_FLOAT,
+    'int32': mace_pb2.DT_INT32,
+}
+
 
 def parse_data_type(data_type, device_type):
     if device_type == cvt.DeviceType.CPU.value or \
@@ -141,6 +146,7 @@ def main(unused_args):
     option.data_type = parse_data_type(FLAGS.data_type, option.device)
 
     input_node_names = FLAGS.input_node.split(',')
+    input_data_types = FLAGS.input_data_types.split(',')
     input_node_shapes = FLAGS.input_shape.split(':')
     input_node_formats = FLAGS.input_data_formats.split(",")
     if FLAGS.input_range:
@@ -152,10 +158,8 @@ def main(unused_args):
     for i in six.moves.range(len(input_node_names)):
         input_node = cvt.NodeInfo()
         input_node.name = input_node_names[i]
-        if len(input_node_formats) == 1:
-            input_node.data_format = data_format_map[input_node_formats[0]]
-        else:
-            input_node.data_format = data_format_map[input_node_formats[i]]
+        input_node.data_type = data_type_map[input_data_types[i]]
+        input_node.data_format = data_format_map[input_node_formats[i]]
         input_node.shape = parse_int_array_from_str(input_node_shapes[i])
         if input_node.data_format == cvt.DataFormat.NCHW and\
                 len(input_node.shape) == 4:
@@ -166,6 +170,7 @@ def main(unused_args):
         option.add_input_node(input_node)
 
     output_node_names = FLAGS.output_node.split(',')
+    output_data_types = FLAGS.output_data_types.split(',')
     output_node_shapes = FLAGS.output_shape.split(':')
     output_node_formats = FLAGS.output_data_formats.split(",")
     if len(output_node_names) != len(output_node_shapes):
@@ -173,10 +178,8 @@ def main(unused_args):
     for i in six.moves.range(len(output_node_names)):
         output_node = cvt.NodeInfo()
         output_node.name = output_node_names[i]
-        if len(output_node_formats) == 1:
-            output_node.data_format = data_format_map[output_node_formats[0]]
-        else:
-            output_node.data_format = data_format_map[output_node_formats[i]]
+        output_node.data_type = data_type_map[output_data_types[i]]
+        output_node.data_format = data_format_map[output_node_formats[i]]
         output_node.shape = parse_int_array_from_str(output_node_shapes[i])
         if output_node.data_format == cvt.DataFormat.NCHW and\
                 len(output_node.shape) == 4:
@@ -290,6 +293,11 @@ def parse_args():
         type=str,
         default="input_node",
         help="e.g., input_node")
+    parser.add_argument(
+        "--input_data_types",
+        type=str,
+        default="float32",
+        help="e.g., float32|int32")
     parser.add_argument(
         "--input_data_formats",
         type=str,
@@ -297,6 +305,11 @@ def parse_args():
         help="e.g., NHWC,NONE")
     parser.add_argument(
         "--output_node", type=str, default="softmax", help="e.g., softmax")
+    parser.add_argument(
+        "--output_data_types",
+        type=str,
+        default="float32",
+        help="e.g., float32|int32")
     parser.add_argument(
         "--output_data_formats",
         type=str,
diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py
index 7fc877d662a90bc4d6030daab3843b27cb801f80..4ed7156a089a6568bfeb98a4cd40c2e87c7fc67c 100644
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -298,6 +298,7 @@ class NodeInfo(object):
 
     def __init__(self):
         self._name = None
+        self._data_type = mace_pb2.DT_FLOAT
         self._shape = []
         self._data_format = DataFormat.NHWC
         self._range = [-1.0, 1.0]
@@ -306,6 +307,10 @@ class NodeInfo(object):
     def name(self):
         return self._name
 
+    @property
+    def data_type(self):
+        return self._data_type
+
     @property
     def shape(self):
         return self._shape
@@ -322,6 +327,10 @@ class NodeInfo(object):
     def name(self, name):
         self._name = name
 
+    @data_type.setter
+    def data_type(self, data_type):
+        self._data_type = data_type
+
     @shape.setter
     def shape(self, shape):
         self._shape = shape
diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py
index ec255e3a90296a04d8538c1ff464edb097fe5193..53d57151f02ecd82f4a1d0504fc957542c1011e7 100644
--- a/mace/python/tools/converter_tool/tensorflow_converter.py
+++ b/mace/python/tools/converter_tool/tensorflow_converter.py
@@ -102,6 +102,7 @@ TFSupportedOps = [
     'Mean',
     'Const',
     'Gather',
+    'GatherV2',
     'StridedSlice',
     'Slice',
     'ReverseV2',
@@ -241,6 +242,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
             TFOpType.Mean.name: self.convert_mean,
             TFOpType.Const.name: self.convert_nop,
             TFOpType.Gather.name: self.convert_gather,
+            TFOpType.GatherV2.name: self.convert_gather,
             TFOpType.StridedSlice.name: self.convert_stridedslice,
             TFOpType.Slice.name: self.convert_slice,
             TFOpType.ReverseV2.name: self.convert_reverse,
@@ -838,16 +840,11 @@ class TensorflowConverter(base_converter.ConverterInterface):
         op = self.convert_general_op(tf_op)
         op.type = MaceOp.ExpandDims.name
 
+        axis_value = tf_op.inputs[1].eval().astype(np.int32)
         axis_arg = op.arg.add()
         axis_arg.name = MaceKeyword.mace_axis_str
-        try:
-            axis_value = tf_op.get_attr('dim')
-        except ValueError:
-            try:
-                axis_value = tf_op.get_attr('axis')
-            except ValueError:
-                axis_value = 0
         axis_arg.i = axis_value
+        del op.input[1]
 
     def convert_squeeze(self, tf_op):
         op = self.convert_general_op(tf_op)
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 1083e23545767725e2f4e0d9c394d790fd5d0dd3..8fd513e882715757754e3a6997991a67fd4c1cdb 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -323,7 +323,7 @@ class Transformer(base_converter.ConverterInterface):
             input_info.name = input_node.name
             input_info.data_format = input_node.data_format.value
             input_info.dims.extend(input_node.shape)
-            input_info.data_type = mace_pb2.DT_FLOAT
+            input_info.data_type = input_node.data_type
 
         output_nodes = self._option.check_nodes.values()
         for output_node in output_nodes:
@@ -332,7 +332,7 @@ class Transformer(base_converter.ConverterInterface):
             output_info.data_format = output_node.data_format.value
             output_info.dims.extend(
                 self._producer[output_node.name].output_shape[0].dims)
-            output_info.data_type = mace_pb2.DT_FLOAT
+            output_info.data_type = output_node.data_type
 
         return False
 
diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc
index 0653304fde80b275217eba9332ab4a121c169a9a..8d1c0e28164ef4a375c7308aeca72cb5b22e0534 100644
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -317,17 +317,18 @@ bool RunModel(const std::string &model_name,
   std::map<std::string, mace::MaceTensor> outputs;
   for (size_t i = 0; i < input_count; ++i) {
     // Allocate input and output
+    // Only float and int32 are supported; sizeof(int) == sizeof(float) == 4,
+    // so size the buffer in bytes. 4LL keeps the accumulator 64-bit wide.
     int64_t input_size =
-        std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
+        std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 4LL,
                         std::multiplies<int64_t>());
-    auto buffer_in = std::shared_ptr<float>(new float[input_size],
-                                            std::default_delete<float[]>());
+    auto buffer_in = std::shared_ptr<char>(new char[input_size],
+                                           std::default_delete<char[]>());
     // load input
     std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
                           std::ios::in | std::ios::binary);
     if (in_file.is_open()) {
-      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
-                   input_size * sizeof(float));
+      in_file.read(buffer_in.get(), input_size);
       in_file.close();
     } else {
       LOG(INFO) << "Open input file failed";
@@ -338,11 +339,12 @@ bool RunModel(const std::string &model_name,
   }
 
   for (size_t i = 0; i < output_count; ++i) {
+    // Only float and int32 (4 bytes each); 4LL keeps the byte count 64-bit.
     int64_t output_size =
-        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
+        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4LL,
                         std::multiplies<int64_t>());
-    auto buffer_out = std::shared_ptr<float>(new float[output_size],
-                                             std::default_delete<float[]>());
+    auto buffer_out = std::shared_ptr<char>(new char[output_size],
+                                            std::default_delete<char[]>());
     outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
         output_data_formats[i]);
   }
@@ -454,12 +456,12 @@ bool RunModel(const std::string &model_name,
     std::string output_name =
         FLAGS_output_file + "_" + FormatName(output_names[i]);
     std::ofstream out_file(output_name, std::ios::binary);
+    // Only float and int32 (4 bytes each); 4LL keeps the byte count 64-bit.
     int64_t output_size =
-        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
+        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4LL,
                         std::multiplies<int64_t>());
     out_file.write(
-        reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
-        output_size * sizeof(float));
+        outputs[output_names[i]].data<char>().get(), output_size);
     out_file.flush();
     out_file.close();
     LOG(INFO) << "Write output file " << output_name << " with size "
@@ -524,6 +526,7 @@ int Main(int argc, char **argv) {
 
   // get cpu capability
   Capability cpu_capability = GetCapability(DeviceType::CPU);
+  float cpu_float32_performance = cpu_capability.float32_performance.exec_time;
 
   bool ret = false;
   for (int i = 0; i < FLAGS_restart_round; ++i) {
@@ -531,7 +534,7 @@ int Main(int argc, char **argv) {
     ret = RunModel(FLAGS_model_name,
         input_names, input_shape_vec, input_data_formats,
         output_names, output_shape_vec, output_data_formats,
-        cpu_capability.float32_performance.exec_time);
+        cpu_float32_performance);
   }
   if (ret) {
     return 0;
diff --git a/tools/common.py b/tools/common.py
index 82a25e5d5e6c04c1db474f93cf7dd21c3d1d48d3..0884319ff9f369c0d05271141e16935cdbf57a56 100644
--- a/tools/common.py
+++ b/tools/common.py
@@ -397,6 +397,7 @@ class YAMLKeyword(object):
     runtime = 'runtime'
     data_type = 'data_type'
     input_data_types = 'input_data_types'
+    output_data_types = 'output_data_types'
     input_data_formats = 'input_data_formats'
     output_data_formats = 'output_data_formats'
     limit_opencl_kernel_time = 'limit_opencl_kernel_time'
diff --git a/tools/converter.py b/tools/converter.py
index 7422a4b52fdad97567abdd1d6b962221ff1aecde..5d8c0c5fcdf718f9f4a91e0b7849aab2e5e2d80a 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -65,13 +65,13 @@ RuntimeTypeStrs = [
     "cpu+gpu"
 ]
 
-InputDataTypeStrs = [
+InOutDataTypeStrs = [
     "int32",
     "float32",
 ]
 
-InputDataType = Enum('InputDataType',
-                     [(ele, ele) for ele in InputDataTypeStrs],
+InOutDataType = Enum('InputDataType',
+                     [(ele, ele) for ele in InOutDataTypeStrs],
                      type=str)
 
 FPDataTypeStrs = [
@@ -410,17 +410,23 @@ def format_model_config(flags):
                 else:
                     subgraph[key] = []
 
-            input_data_types = subgraph.get(YAMLKeyword.input_data_types, "")
-            if input_data_types:
-                if not isinstance(input_data_types, list):
-                    subgraph[YAMLKeyword.input_data_types] = [input_data_types]
-                for input_data_type in subgraph[YAMLKeyword.input_data_types]:
-                    mace_check(input_data_type in InputDataTypeStrs,
-                               ModuleName.YAML_CONFIG,
-                               "'input_data_types' must be in "
-                               + str(InputDataTypeStrs))
-            else:
-                subgraph[YAMLKeyword.input_data_types] = []
+            for key in [YAMLKeyword.input_data_types,
+                        YAMLKeyword.output_data_types]:
+                if key == YAMLKeyword.input_data_types:
+                    count = input_size
+                else:
+                    count = output_size
+                data_types = subgraph.get(key, "")
+                if data_types:
+                    if not isinstance(data_types, list):
+                        subgraph[key] = [data_types] * count
+                    for data_type in subgraph[key]:
+                        mace_check(data_type in InOutDataTypeStrs,
+                                   ModuleName.YAML_CONFIG,
+                                   key + " must be in "
+                                   + str(InOutDataTypeStrs))
+                else:
+                    subgraph[key] = [InOutDataType.float32] * count
 
             input_data_formats = subgraph.get(YAMLKeyword.input_data_formats,
                                               [])
@@ -722,8 +728,10 @@ def convert_model(configs, cl_mem_type):
             model_config[YAMLKeyword.model_sha256_checksum],
             model_config[YAMLKeyword.weight_sha256_checksum],
             ",".join(subgraphs[0][YAMLKeyword.input_tensors]),
+            ",".join(subgraphs[0][YAMLKeyword.input_data_types]),
             ",".join(subgraphs[0][YAMLKeyword.input_data_formats]),
             ",".join(subgraphs[0][YAMLKeyword.output_tensors]),
+            ",".join(subgraphs[0][YAMLKeyword.output_data_types]),
             ",".join(subgraphs[0][YAMLKeyword.output_data_formats]),
             ",".join(subgraphs[0][YAMLKeyword.check_tensors]),
             runtime,
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 02b8ffbaee2dcf2d979432fab3195a07b5a40591..969ceda6405cc25e8a41c7e6b1d803d414ff4f5b 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -480,8 +480,10 @@ def gen_model_code(model_codegen_dir,
                    model_sha256_checksum,
                    weight_sha256_checksum,
                    input_nodes,
+                   input_data_types,
                    input_data_formats,
                    output_nodes,
+                   output_data_types,
                    output_data_formats,
                    check_nodes,
                    runtime,
@@ -515,8 +517,10 @@ def gen_model_code(model_codegen_dir,
               "--model_checksum=%s" % model_sha256_checksum,
               "--weight_checksum=%s" % weight_sha256_checksum,
               "--input_node=%s" % input_nodes,
+              "--input_data_types=%s" % input_data_types,
               "--input_data_formats=%s" % input_data_formats,
               "--output_node=%s" % output_nodes,
+              "--output_data_types=%s" % output_data_types,
               "--output_data_formats=%s" % output_data_formats,
               "--check_node=%s" % check_nodes,
               "--runtime=%s" % runtime,