diff --git a/README.md b/README.md index eb431878d9e860f19708511958fff19d4b2e44f0..520b5de0ba49e46d49c8afd965b140fc25a0f573 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,17 @@ mobile heterogeneous computing platforms. The design is focused on the following targets: * Performance - * The runtime is highly optimized with NEON, OpenCL and HVX. Except for the - inference speed, the initialization speed is also intensively optimized. + * The runtime is highly optimized with NEON, OpenCL and Hexagon, and the + [Winograd algorithm](https://arxiv.org/abs/1509.09308) is introduced to + speed up convolution operations. Besides the inference speed, the + initialization speed is also intensively optimized. * Power consumption - * Chip dependent power options are included as advanced API. + * Chip-dependent power options such as big.LITTLE scheduling and Adreno GPU + hints are included as advanced APIs. * Memory usage and library footprint * Graph level memory allocation optimization and buffer reuse are supported. + The core library keeps external dependencies to a minimum to keep the + library footprint small. * Model protection * Model protection has been one of the highest priority features from the beginning of the design. Various techniques are introduced, like converting models to C++ @@ -28,31 +33,34 @@ targets: architectures with limited performance. ## Getting Started +* [Introduction](docs/getting_started/introduction.rst) +* [How to build](docs/getting_started/how_to_build.rst) +* [Create a model deployment file](docs/getting_started/create_a_model_deployment.rst) ## Performance -[MiAI Model Zoo](http://v9.git.n.xiaomi.com/deep-computing/mace-models) contains -several common neural networks models and built daily against several mobile +[MiAI Compute Engine Model Zoo](http://v9.git.n.xiaomi.com/deep-computing/mace-models) contains +several common neural network models, built daily against a list of mobile phones. The benchmark results can be found on the CI result page. ## Communication * GitHub issues: bug reports, usage issues, feature requests -* Gitter or Slack: -* QQ群: +* Gitter: +* QQ group: 756046893 ## Contributing All kinds of contributions are welcome. For bug reports and feature requests, please just open an issue without hesitation. For code contributions, it is strongly suggested to open an issue for discussion first. For more details, -please refer to [this guide](docs). +please refer to [the contribution guide](docs/development/contributing.md). ## License [Apache License 2.0](LICENSE). ## Acknowledgement -*MiAI Compute Engine* depends on several open source projects located in +MiAI Compute Engine depends on several open source projects located in the [third_party](mace/third_party) directory. Particularly, we learned a lot from the following projects during the development: -* [nnlib](https://source.codeaurora.org/quic/hexagon_nn/nnlib): the DSP runtime +* [Qualcomm Hexagon NN Offload Framework](https://source.codeaurora.org/quic/hexagon_nn/nnlib): the Hexagon DSP runtime depends on this library. * [TensorFlow](https://github.com/tensorflow/tensorflow), [Caffe](https://github.com/BVLC/caffe),
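Note on the Winograd reference in the README hunk above: the algorithm lowers the multiplication count of small-kernel convolutions by transforming input and filter tiles before an elementwise product. A minimal standalone sketch of the 1-D F(2,3) case — two outputs of a 3-tap filter in four multiplications instead of six — using hypothetical values, not code from this patch:

#include <cstdio>

int main() {
  float d[4] = {1, 2, 3, 4};         // input tile
  float g[3] = {0.5f, -1.0f, 2.0f};  // filter taps
  // transformed products (the only multiplications)
  float m1 = (d[0] - d[2]) * g[0];
  float m2 = (d[1] + d[2]) * (g[0] + g[1] + g[2]) / 2;
  float m3 = (d[2] - d[1]) * (g[0] - g[1] + g[2]) / 2;
  float m4 = (d[1] - d[3]) * g[2];
  // inverse transform recovers the two direct-convolution outputs
  printf("winograd: %g %g\n", m1 + m2 + m3, m2 - m3 - m4);
  printf("direct:   %g %g\n",
         d[0] * g[0] + d[1] * g[1] + d[2] * g[2],
         d[1] * g[0] + d[2] * g[1] + d[3] * g[2]);
  return 0;
}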
diff --git a/RELEASE.md b/RELEASE.md index 7ac3259665b5a03bfa967e94dba8f2c85fce72fb..54b6aa295cb0afc868f6a8ffffad7e49c2549794 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -5,10 +5,12 @@ v0.6.0 (2018-04-04) ------ 1. Change mace header interfaces, only including necessary methods. +v0.6.3 (2018-05-21) +------ +1. Support `float` data_type when running on GPU v0.7.0 (2018-05-18) ------ 1. Change interfaces that report error types 2. Improve CPU performance -3. Merge cpu/gpu engine to one - +3. Merge cpu/gpu engine to one \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 8a8c089c493baa0982fbd68e8fa815ed68ea3e45..2b83834a39485e5f3fbc519538e59643402aa081 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,8 +1,6 @@ FROM ubuntu:16.04 # Update source -# Looks like mirrors.163.com does not work in xiaomi network -# RUN sed -i 's/http:\/\/archive\.ubuntu\.com\/ubuntu\//http:\/\/mirrors\.163\.com\/ubuntu\//g' /etc/apt/sources.list RUN apt-get update -y ## Basic tools diff --git a/docker/gitlab-runner/Dockerfile b/docker/gitlab-runner/Dockerfile index 13984e4cde4c1caea4f96e3b0c36b4f8d1f9e60e..1db7869dc29cba9a428b9343f3d48ac2d6bb4647 100644 --- a/docker/gitlab-runner/Dockerfile +++ b/docker/gitlab-runner/Dockerfile @@ -1,8 +1,6 @@ FROM cr.d.xiaomi.net/mace/mace-dev:latest # Update source -# Looks like mirrors.163.com does not work in xiaomi network -# RUN sed -i 's/http:\/\/archive\.ubuntu\.com\/ubuntu\//http:\/\/mirrors\.163\.com\/ubuntu\//g' /etc/apt/sources.list RUN apt-get update -y # Install gitlab runner diff --git a/docs/getting_started/mace-arch.png b/docs/getting_started/mace-arch.png index bfa4928b83df9604273a3bca4633bad40c0299ad..b95f932d289da857cd6c18c05329bdd36961b777 100644 Binary files a/docs/getting_started/mace-arch.png and b/docs/getting_started/mace-arch.png differ diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/kernels/arm/conv_2d_neon.h index 5d2d5f9adb96571ee0a7def3a527cdb23c192d5d..b35429baf035b87950b500c673e8e6260a38f469 100644 --- a/mace/kernels/arm/conv_2d_neon.h +++ b/mace/kernels/arm/conv_2d_neon.h @@ -65,6 +65,18 @@ extern void Conv2dNeonK7x7S3(const float *input, const index_t *out_shape, float *output); +extern void Conv2dNeonK1x15S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +extern void Conv2dNeonK15x1S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + } // namespace kernels } // namespace mace diff --git a/mace/kernels/arm/conv_2d_neon_15x1.cc b/mace/kernels/arm/conv_2d_neon_15x1.cc new file mode 100644 index 0000000000000000000000000000000000000000..80dda31493b1ba3f157dd6333848d13f6c247001 --- /dev/null +++ b/mace/kernels/arm/conv_2d_neon_15x1.cc @@ -0,0 +1,163 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
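Annotation on the new Conv2dNeonK15x1S1 kernel below: the NEON path keeps four consecutive output rows in one q-register and builds the shifted row windows vi1..vi14 with vextq_f32 instead of reloading from memory for each of the 15 vertical taps. A minimal standalone sketch of that shifting trick, with hypothetical data (not code from this patch):

#include <arm_neon.h>
#include <cstdio>

int main() {
  float col[8] = {0, 1, 2, 3, 4, 5, 6, 7};  // one input sample per row
  float32x4_t v0 = vld1q_f32(col);          // rows 0..3
  float32x4_t v4 = vld1q_f32(col + 4);      // rows 4..7
  float32x4_t v1 = vextq_f32(v0, v4, 1);    // rows 1..4, no extra load
  float out[4];
  vst1q_f32(out, v1);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
  return 0;
}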
+ +#if defined(MACE_ENABLE_NEON) +#include <arm_neon.h> +#endif + +#include "mace/kernels/arm/conv_2d_neon.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace kernels { + +inline void Conv2dCPUK15x1Calc(const float *in_ptr, + const float *filter_ptr, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t w, + const index_t tile_width, + const index_t out_image_size, + float *out_ptr, + const index_t io, + const int stride) { + for (index_t ih = 0; ih < out_height; ++ih) { + for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) { + for (int i = 0; i < 15; ++i) { + for (int j = 0; j < 1; ++j) { + out_ptr[io * out_image_size + ih * out_width + w + iw] + += in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] + * filter_ptr[io * in_channels * 15 + i * 1 + j]; + } + } + } + } +} + + +// Ho = 4, Wo = 1, Co = 1 +void Conv2dNeonK15x1S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_width = + out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3]; + +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t w = 0; w < out_shape[3]; w += tile_width) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter + m * in_channels * 15 + c * 15; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + /* load filter (1 outch x 1 height x 4 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { + // load output + index_t out_offset = h * out_width + w + wt; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo = {out_ptr_base[out_offset], + out_ptr_base[out_offset + out_width], + out_ptr_base[out_offset + 2 * out_width], + out_ptr_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w + wt; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], + in_ptr_base[in_offset + 13 * in_width], + in_ptr_base[in_offset + 14 * in_width], + in_ptr_base[in_offset + 15 * in_width]}; + float32x4_t vi16 =
{in_ptr_base[in_offset + 16 * in_width], + in_ptr_base[in_offset + 17 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + float32x4_t vi7 = vextq_f32(vi4, vi8, 3); + float32x4_t vi9 = vextq_f32(vi8, vi12, 1); + float32x4_t vi10 = vextq_f32(vi8, vi12, 2); + float32x4_t vi11 = vextq_f32(vi8, vi12, 3); + float32x4_t vi13 = vextq_f32(vi12, vi16, 1); + float32x4_t vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + out_ptr_base[out_offset] = vo[0]; + out_ptr_base[out_offset + out_width] = vo[1]; + out_ptr_base[out_offset + 2 * out_width] = vo[2]; + out_ptr_base[out_offset + 3 * out_width] = vo[3]; + } // wt + } // h +#else + Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels, + out_height, out_width, w, tile_width, + out_image_size, out_ptr_base, 0, 1); +#endif + } // c + } // w + } // m + } // b +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/conv_2d_neon_1x15.cc b/mace/kernels/arm/conv_2d_neon_1x15.cc new file mode 100644 index 0000000000000000000000000000000000000000..0dd39fba40cb9574f14ac9a0a918d21d6c8caec7 --- /dev/null +++ b/mace/kernels/arm/conv_2d_neon_1x15.cc @@ -0,0 +1,149 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
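Annotation on the new Conv2dNeonK1x15S1 kernel below: a 15-tap filter does not fill four 4-lane registers, so the last register is loaded from offset 11 (vld1q_f32(filter_ptr + 11)) and its lane 0 is never referenced — tap 11 is consumed from lane 3 of vf2, and taps 12..14 from lanes 1..3 of vf3, which is why the accumulation starts at vget_low_f32(vf3) lane 1. A small standalone check of that lane bookkeeping (hypothetical values, not code from this patch):

#include <arm_neon.h>
#include <assert.h>

int main() {
  float taps[15];
  for (int i = 0; i < 15; ++i) taps[i] = (float)i;
  float32x4_t vf2 = vld1q_f32(taps + 8);    // taps 8..11
  float32x4_t vf3 = vld1q_f32(taps + 11);   // taps 11..14, lane 0 unused
  assert(vgetq_lane_f32(vf2, 3) == 11.0f);  // tap 11 comes from vf2
  assert(vget_lane_f32(vget_low_f32(vf3), 1) == 12.0f);   // tap 12
  assert(vget_lane_f32(vget_high_f32(vf3), 0) == 13.0f);  // tap 13
  return 0;
}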
+ +#if defined(MACE_ENABLE_NEON) +#include <arm_neon.h> +#endif + +#include "mace/kernels/arm/conv_2d_neon.h" +#include "mace/utils/utils.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace kernels { + +inline void Conv2dCPUK1x15Calc(const float *in_ptr, + const float *filter_ptr, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t h, + const index_t tile_height, + const index_t out_width, + const index_t out_image_size, + float *out_ptr, + const index_t io, + const int stride) { + for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) { + for (index_t iw = 0; iw < out_width; ++iw) { + for (int i = 0; i < 1; ++i) { + for (int j = 0; j < 15; ++j) { + out_ptr[io * out_image_size + (h + ih) * out_width + iw] + += in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] + * filter_ptr[io * in_channels * 15 + i * 15 + j]; + } + } + } + } +} + + +// Ho = 1, Wo = 4, Co = 1 +void Conv2dNeonK1x15S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_height = + out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2]; + +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t h = 0; h < out_shape[2]; h += tile_height) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter + m * in_channels * 15 + c * 15; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo; + // load output + index_t out_offset = (h + ht) * out_width + w; + vo = vld1q_f32(out_ptr_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, + vi10, vi11, vi12, vi13, vi14, vi16; + // input offset + index_t in_offset = (h + ht) * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi12 = vld1q_f32(in_ptr_base + in_offset + 12); + vi16 = vld1q_f32(in_ptr_base + in_offset + 16); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + vi7 = vextq_f32(vi4, vi8, 3); + vi9 = vextq_f32(vi8, vi12, 1); + vi10 = vextq_f32(vi8, vi12, 2); + vi11 = vextq_f32(vi8, vi12, 3); + vi13 = vextq_f32(vi12, vi16, 1); + vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1,
vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + vst1q_f32(out_ptr_base + out_offset, vo); + } // w + } // ht +#else + Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels, + out_height, h, tile_height, out_width, + out_image_size, out_ptr_base, 0, 1); +#endif + } // c + } // h + } // m + } // b +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/conv_winograd.cc b/mace/kernels/arm/conv_winograd.cc index e73061e3a1160bebed2da0ac17cf0a5474ae00f5..6a3b520b7a579b69d6dfa9378c47fab92dc765cd 100644 --- a/mace/kernels/arm/conv_winograd.cc +++ b/mace/kernels/arm/conv_winograd.cc @@ -24,7 +24,7 @@ namespace mace { namespace kernels { namespace { -// NCHW => TNCB (T: in tile pixels, B: tile indices) +// NCHW => NTCB (T: in tile pixels, B: tile indices) void TransformInput4x4(const float *input, const index_t batch, const index_t in_height, @@ -32,87 +32,95 @@ void TransformInput4x4(const float *input, const index_t in_channels, const index_t tile_count, float *output) { - const index_t stride = batch * in_channels * tile_count; + const index_t stride = in_channels * tile_count; const index_t in_height_width = in_height * in_width; + const index_t input_batch_size = in_height_width * in_channels; + const index_t output_batch_size = 16 * in_channels * tile_count; -#pragma omp parallel for - for (index_t nc = 0; nc < batch * in_channels; ++nc) { - index_t tile_index = nc * tile_count; - for (index_t h = 0; h < in_height - 2; h += 2) { - for (index_t w = 0; w < in_width - 2; w += 2) { - float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, - d15; - float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - - // load tile data - const index_t tile_offset = nc * in_height_width + h * in_width + w; - d0 = input[tile_offset]; - d1 = input[tile_offset + 1]; - d2 = input[tile_offset + 2]; - d3 = input[tile_offset + 3]; - - d4 = input[tile_offset + in_width]; - d5 = input[tile_offset + in_width + 1]; - d6 = input[tile_offset + in_width + 2]; - d7 = input[tile_offset + in_width + 3]; - - d8 = input[tile_offset + 2 * in_width]; - d9 = input[tile_offset + 2 * in_width + 1]; - d10 = input[tile_offset + 2 * in_width + 2]; - d11 = input[tile_offset + 2 * in_width + 3]; - - d12 = input[tile_offset + 3 * in_width]; - d13 = input[tile_offset + 3 * in_width + 1]; - d14 = input[tile_offset + 3 * in_width + 2]; - d15 = input[tile_offset + 3 * in_width + 3]; - - // s = BT * d * B - s0 = (d0 - d8) - (d2 - d10); - s1 = (d1 - d9) + (d2 - d10); - s2 = (d2 - d10) - (d1 - d9); - s3 = (d1 - d9) - (d3 - d11); - s4 = (d4 + d8) - (d6 + d10); - s5 = (d5 + d9) + (d6 + d10); - s6 = (d6 + d10) - (d5 + d9); - s7 = (d5 + d9) - (d7 + d11); - s8 = (d8 - d4) - (d10 - d6); - s9 = (d9 - d5) + (d10 - d6); - s10 = (d10 - d6) - (d9 - d5); - s11 = (d9 - d5) - (d11 - d7); - s12 = (d4 - d12) 
- (d6 - d14); - s13 = (d5 - d13) + (d6 - d14); - s14 = (d6 - d14) - (d5 - d13); - s15 = (d5 - d13) - (d7 - d15); - - // store output - output[tile_index + 0 * stride] = s0; - output[tile_index + 1 * stride] = s1; - output[tile_index + 2 * stride] = s2; - output[tile_index + 3 * stride] = s3; - - output[tile_index + 4 * stride] = s4; - output[tile_index + 5 * stride] = s5; - output[tile_index + 6 * stride] = s6; - output[tile_index + 7 * stride] = s7; - - output[tile_index + 8 * stride] = s8; - output[tile_index + 9 * stride] = s9; - output[tile_index + 10 * stride] = s10; - output[tile_index + 11 * stride] = s11; - - output[tile_index + 12 * stride] = s12; - output[tile_index + 13 * stride] = s13; - output[tile_index + 14 * stride] = s14; - output[tile_index + 15 * stride] = s15; - - ++tile_index; +#pragma omp parallel for collapse(2) + for (index_t n = 0; n < batch; ++n) { + for (index_t c = 0; c < in_channels; ++c) { + index_t tile_index = 0; + for (index_t h = 0; h < in_height - 2; h += 2) { + for (index_t w = 0; w < in_width - 2; w += 2) { + float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, + d15; + float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + // load tile data + const float *input_ptr = + input + n * input_batch_size + c * in_height_width + h * in_width + + w; + d0 = input_ptr[0]; + d1 = input_ptr[1]; + d2 = input_ptr[2]; + d3 = input_ptr[3]; + + d4 = input_ptr[in_width]; + d5 = input_ptr[in_width + 1]; + d6 = input_ptr[in_width + 2]; + d7 = input_ptr[in_width + 3]; + + d8 = input_ptr[2 * in_width]; + d9 = input_ptr[2 * in_width + 1]; + d10 = input_ptr[2 * in_width + 2]; + d11 = input_ptr[2 * in_width + 3]; + + d12 = input_ptr[3 * in_width]; + d13 = input_ptr[3 * in_width + 1]; + d14 = input_ptr[3 * in_width + 2]; + d15 = input_ptr[3 * in_width + 3]; + + // s = BT * d * B + s0 = (d0 - d8) - (d2 - d10); + s1 = (d1 - d9) + (d2 - d10); + s2 = (d2 - d10) - (d1 - d9); + s3 = (d1 - d9) - (d3 - d11); + s4 = (d4 + d8) - (d6 + d10); + s5 = (d5 + d9) + (d6 + d10); + s6 = (d6 + d10) - (d5 + d9); + s7 = (d5 + d9) - (d7 + d11); + s8 = (d8 - d4) - (d10 - d6); + s9 = (d9 - d5) + (d10 - d6); + s10 = (d10 - d6) - (d9 - d5); + s11 = (d9 - d5) - (d11 - d7); + s12 = (d4 - d12) - (d6 - d14); + s13 = (d5 - d13) + (d6 - d14); + s14 = (d6 - d14) - (d5 - d13); + s15 = (d5 - d13) - (d7 - d15); + + // store output + float *output_ptr = + output + n * output_batch_size + c * tile_count + tile_index; + output_ptr[0] = s0; + output_ptr[1 * stride] = s1; + output_ptr[2 * stride] = s2; + output_ptr[3 * stride] = s3; + + output_ptr[4 * stride] = s4; + output_ptr[5 * stride] = s5; + output_ptr[6 * stride] = s6; + output_ptr[7 * stride] = s7; + + output_ptr[8 * stride] = s8; + output_ptr[9 * stride] = s9; + output_ptr[10 * stride] = s10; + output_ptr[11 * stride] = s11; + + output_ptr[12 * stride] = s12; + output_ptr[13 * stride] = s13; + output_ptr[14 * stride] = s14; + output_ptr[15 * stride] = s15; + + ++tile_index; + } } } } } -// NCHW => TNCB (T: in tile pixels, B: tile indices) +// NCHW => NTCB (T: in tile pixels, B: tile indices) /** * BT = ⎡1 0 -21/4 0 21/4 0 -1 0⎤ @@ -146,85 +154,94 @@ void TransformInput8x8(const float *input, const index_t in_channels, const index_t tile_count, float *output) { - const index_t stride = batch * in_channels * tile_count; + const index_t stride = in_channels * tile_count; const index_t in_height_width = in_height * in_width; + const index_t input_batch_size = in_height_width * in_channels; + const index_t 
output_batch_size = 64 * in_channels * tile_count; -#pragma omp parallel for - for (index_t nc = 0; nc < batch * in_channels; ++nc) { - index_t tile_index = nc * tile_count; - float s[8][8]; - for (index_t h = 0; h < in_height - 2; h += 6) { - for (index_t w = 0; w < in_width - 2; w += 6) { - index_t tile_offset = nc * in_height_width + h * in_width + w; - for (int i = 0; i < 8; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = input[tile_offset]; - d1 = input[tile_offset + 1]; - d2 = input[tile_offset + 2]; - d3 = input[tile_offset + 3]; - d4 = input[tile_offset + 4]; - d5 = input[tile_offset + 5]; - d6 = input[tile_offset + 6]; - d7 = input[tile_offset + 7]; - - s[i][0] = d0 - d6 + (d4 - d2) * 5.25; - s[i][7] = d7 - d1 + (d3 - d5) * 5.25; - - float u = d2 + d6 - d4 * 4.25; - float v = d1 + d5 - d3 * 4.25; - s[i][1] = u + v; - s[i][2] = u - v; - - u = d6 + d2 * 0.25 - d4 * 1.25; - v = d1 * 0.5 - d3 * 2.5 + d5 * 2; - s[i][3] = u + v; - s[i][4] = u - v; - - u = d6 + (d2 - d4 * 1.25) * 4; - v = d1 * 2 - d3 * 2.5 + d5 * 0.5; - s[i][5] = u + v; - s[i][6] = u - v; - - tile_offset += in_width; - } +#pragma omp parallel for collapse(2) + for (index_t n = 0; n < batch; ++n) { + for (index_t c = 0; c < in_channels; ++c) { + index_t tile_index = 0; + float s[8][8]; + for (index_t h = 0; h < in_height - 2; h += 6) { + for (index_t w = 0; w < in_width - 2; w += 6) { + const float *input_ptr = + input + n * input_batch_size + c * in_height_width + h * in_width + + w; + + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = input_ptr[0]; + d1 = input_ptr[1]; + d2 = input_ptr[2]; + d3 = input_ptr[3]; + d4 = input_ptr[4]; + d5 = input_ptr[5]; + d6 = input_ptr[6]; + d7 = input_ptr[7]; + + s[i][0] = d0 - d6 + (d4 - d2) * 5.25; + s[i][7] = d7 - d1 + (d3 - d5) * 5.25; + + float u = d2 + d6 - d4 * 4.25; + float v = d1 + d5 - d3 * 4.25; + s[i][1] = u + v; + s[i][2] = u - v; + + u = d6 + d2 * 0.25 - d4 * 1.25; + v = d1 * 0.5 - d3 * 2.5 + d5 * 2; + s[i][3] = u + v; + s[i][4] = u - v; + + u = d6 + (d2 - d4 * 1.25) * 4; + v = d1 * 2 - d3 * 2.5 + d5 * 0.5; + s[i][5] = u + v; + s[i][6] = u - v; + + input_ptr += in_width; + } - for (int i = 0; i < 8; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = s[0][i]; - d1 = s[1][i]; - d2 = s[2][i]; - d3 = s[3][i]; - d4 = s[4][i]; - d5 = s[5][i]; - d6 = s[6][i]; - d7 = s[7][i]; - - output[tile_index + i * stride] = d0 - d6 + (d4 - d2) * 5.25; - output[tile_index + (56 + i) * stride] = d7 - d1 + (d3 - d5) * 5.25; - - float u = d2 + d6 - d4 * 4.25; - float v = d1 + d5 - d3 * 4.25; - output[tile_index + (8 + i) * stride] = u + v; - output[tile_index + (16 + i) * stride] = u - v; - - u = d6 + d2 * 0.25 - d4 * 1.25; - v = d1 * 0.5 - d3 * 2.5 + d5 * 2; - output[tile_index + (24 + i) * stride] = u + v; - output[tile_index + (32 + i) * stride] = u - v; - - u = d6 + (d2 - d4 * 1.25) * 4; - v = d1 * 2 - d3 * 2.5 + d5 * 0.5; - output[tile_index + (40 + i) * stride] = u + v; - output[tile_index + (48 + i) * stride] = u - v; - } + float *output_ptr = + output + n * output_batch_size + c * tile_count + tile_index; + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = s[0][i]; + d1 = s[1][i]; + d2 = s[2][i]; + d3 = s[3][i]; + d4 = s[4][i]; + d5 = s[5][i]; + d6 = s[6][i]; + d7 = s[7][i]; + + output_ptr[i * stride] = d0 - d6 + (d4 - d2) * 5.25; + output_ptr[(56 + i) * stride] = d7 - d1 + (d3 - d5) * 5.25; + + float u = d2 + d6 - d4 * 4.25; + float v = d1 + d5 - d3 * 4.25; + output_ptr[(8 + i) * stride] = u + v; + output_ptr[(16 + i) * 
stride] = u - v; + + u = d6 + d2 * 0.25 - d4 * 1.25; + v = d1 * 0.5 - d3 * 2.5 + d5 * 2; + output_ptr[(24 + i) * stride] = u + v; + output_ptr[(32 + i) * stride] = u - v; + + u = d6 + (d2 - d4 * 1.25) * 4; + v = d1 * 2 - d3 * 2.5 + d5 * 0.5; + output_ptr[(40 + i) * stride] = u + v; + output_ptr[(48 + i) * stride] = u - v; + } - ++tile_index; + ++tile_index; + } } } } } -// TOC * TNCB => TNOB +// TOC * NTCB => NTOB void BatchGemm(const float *input, const float *filter, index_t batch, @@ -233,12 +250,13 @@ void BatchGemm(const float *input, index_t tile_count, int out_tile_size, float *output) { - const index_t in_stride = batch * in_channels * tile_count; - const index_t in_channels_tile_count = in_channels * tile_count; const index_t filter_stride = out_channels * in_channels; - const index_t out_stride = batch * out_channels * tile_count; - const index_t out_channels_tile_count = out_channels * tile_count; const int in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); + const index_t in_batch_size = in_tile_area * in_channels * tile_count; + const index_t in_stride = in_channels * tile_count; + const index_t out_batch_size = in_tile_area * out_channels * tile_count; + const index_t out_stride = out_channels * tile_count; + if (batch == 1) { Gemm(filter, input, @@ -248,12 +266,13 @@ void BatchGemm(const float *input, tile_count, output); } else { - for (int i = 0; i < in_tile_area; ++i) { - for (int b = 0; b < batch; ++b) { +#pragma omp parallel for collapse(2) + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < in_tile_area; ++i) { const float - *in_ptr = input + i * in_stride + b * in_channels_tile_count; + *in_ptr = input + b * in_batch_size + i * in_stride; const float *filter_ptr = filter + i * filter_stride; - float *out_ptr = output + i * out_stride + b * out_channels_tile_count; + float *out_ptr = output + b * out_batch_size + i * out_stride; Gemm(filter_ptr, in_ptr, 1, @@ -266,7 +285,7 @@ void BatchGemm(const float *input, } } -// TNOB => ToNOB => NOHoWo +// NTOB => NToOB => NOHoWo void TransformOutput4x4(const float *input, index_t batch, index_t out_height, @@ -274,65 +293,74 @@ void TransformOutput4x4(const float *input, index_t out_channels, index_t tile_count, float *output) { - const index_t in_stride = batch * out_channels * tile_count; - -#pragma omp parallel for - for (index_t nm = 0; nm < batch * out_channels; ++nm) { - index_t tile_offset = nm * tile_count; - for (index_t h = 0; h < out_height; h += 2) { - for (index_t w = 0; w < out_width; w += 2) { - float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, - d15; - float s0, s1, s2, s3, s4, s5, s6, s7; - float v0, v1, v2, v3; - - d0 = input[tile_offset + 0 * in_stride]; - d1 = input[tile_offset + 1 * in_stride]; - d2 = input[tile_offset + 2 * in_stride]; - d3 = input[tile_offset + 3 * in_stride]; - - d4 = input[tile_offset + 4 * in_stride]; - d5 = input[tile_offset + 5 * in_stride]; - d6 = input[tile_offset + 6 * in_stride]; - d7 = input[tile_offset + 7 * in_stride]; - - d8 = input[tile_offset + 8 * in_stride]; - d9 = input[tile_offset + 9 * in_stride]; - d10 = input[tile_offset + 10 * in_stride]; - d11 = input[tile_offset + 11 * in_stride]; - - d12 = input[tile_offset + 12 * in_stride]; - d13 = input[tile_offset + 13 * in_stride]; - d14 = input[tile_offset + 14 * in_stride]; - d15 = input[tile_offset + 15 * in_stride]; - - s0 = d0 + d1 + d2; - s1 = d1 - d2 - d3; - s2 = d4 + d5 + d6; - s3 = d5 - d6 - d7; - s4 = d8 + d9 + d10; - s5 = d9 - d10 - d11; - s6 = d12 + d13 + d14; - s7 = d13 - d14 - 
d15; - - v0 = s0 + s2 + s4; - v1 = s1 + s3 + s5; - v2 = s2 - s4 - s6; - v3 = s3 - s5 - s7; - - index_t out_offset = nm * out_height * out_width + h * out_width + w; - output[out_offset] = v0; - output[out_offset + 1] = v1; - output[out_offset + out_width] = v2; - output[out_offset + out_width + 1] = v3; - - ++tile_offset; + const index_t stride = out_channels * tile_count; + const index_t input_batch_size = 16 * stride; + const index_t out_image_size = out_height * out_width; + const index_t output_batch_size = out_channels * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t n = 0; n < batch; ++n) { + for (index_t m = 0; m < out_channels; ++m) { + index_t tile_offset = 0; + for (index_t h = 0; h < out_height; h += 2) { + for (index_t w = 0; w < out_width; w += 2) { + float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, + d15; + float s0, s1, s2, s3, s4, s5, s6, s7; + float v0, v1, v2, v3; + + const float *input_ptr = + input + n * input_batch_size + m * tile_count + tile_offset; + d0 = input_ptr[0]; + d1 = input_ptr[1 * stride]; + d2 = input_ptr[2 * stride]; + d3 = input_ptr[3 * stride]; + + d4 = input_ptr[4 * stride]; + d5 = input_ptr[5 * stride]; + d6 = input_ptr[6 * stride]; + d7 = input_ptr[7 * stride]; + + d8 = input_ptr[8 * stride]; + d9 = input_ptr[9 * stride]; + d10 = input_ptr[10 * stride]; + d11 = input_ptr[11 * stride]; + + d12 = input_ptr[12 * stride]; + d13 = input_ptr[13 * stride]; + d14 = input_ptr[14 * stride]; + d15 = input_ptr[15 * stride]; + + s0 = d0 + d1 + d2; + s1 = d1 - d2 - d3; + s2 = d4 + d5 + d6; + s3 = d5 - d6 - d7; + s4 = d8 + d9 + d10; + s5 = d9 - d10 - d11; + s6 = d12 + d13 + d14; + s7 = d13 - d14 - d15; + + v0 = s0 + s2 + s4; + v1 = s1 + s3 + s5; + v2 = s2 - s4 - s6; + v3 = s3 - s5 - s7; + + float *output_ptr = + output + n * output_batch_size + m * out_image_size + h * out_width + + w; + output_ptr[0] = v0; + output_ptr[1] = v1; + output_ptr[out_width] = v2; + output_ptr[out_width + 1] = v3; + + ++tile_offset; + } } } } } -// TNOB => ToNOB => NOHoWo +// NTOB => NToOB => NOHoWo /** * AT = ⎡1 1 1 1 1 32 32 0⎤ @@ -362,72 +390,81 @@ void TransformOutput8x8(const float *input, index_t out_channels, index_t tile_count, float *output) { - const index_t in_stride = batch * out_channels * tile_count; - -#pragma omp parallel for - for (index_t nm = 0; nm < batch * out_channels; ++nm) { - index_t tile_offset = nm * tile_count; - float s[8][6]; - for (index_t h = 0; h < out_height; h += 6) { - for (index_t w = 0; w < out_width; w += 6) { - index_t tile_offset_tmp = tile_offset; - for (int i = 0; i < 8; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = input[tile_offset_tmp + 0 * in_stride]; - d1 = input[tile_offset_tmp + 1 * in_stride]; - d2 = input[tile_offset_tmp + 2 * in_stride]; - d3 = input[tile_offset_tmp + 3 * in_stride]; - d4 = input[tile_offset_tmp + 4 * in_stride]; - d5 = input[tile_offset_tmp + 5 * in_stride]; - d6 = input[tile_offset_tmp + 6 * in_stride]; - d7 = input[tile_offset_tmp + 7 * in_stride]; - - float u = d1 + d2; - float v = d1 - d2; - float w = d3 + d4; - float x = d3 - d4; - float y = d5 + d6; - float z = d5 - d6; - - s[i][0] = d0 + u + w + y * 32; - s[i][1] = v + x + x + z * 16; - s[i][2] = u + w * 4 + y * 8; - s[i][3] = v + x * 8 + z * 4; - s[i][4] = u + w * 16 + y + y; - s[i][5] = v + x * 32 + z + d7; - - tile_offset_tmp += 8 * in_stride; - } + const index_t stride = out_channels * tile_count; + const index_t input_batch_size = 64 * stride; + const index_t out_image_size = out_height * out_width; + 
const index_t output_batch_size = out_channels * out_image_size; - index_t out_offset = nm * out_height * out_width + h * out_width + w; - - for (int i = 0; i < 6; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = s[0][i]; - d1 = s[1][i]; - d2 = s[2][i]; - d3 = s[3][i]; - d4 = s[4][i]; - d5 = s[5][i]; - d6 = s[6][i]; - d7 = s[7][i]; - - float u = d1 + d2; - float v = d1 - d2; - float w = d3 + d4; - float x = d3 - d4; - float y = d5 + d6; - float z = d5 - d6; - - output[out_offset + 0 * out_width + i] = d0 + u + w + y * 32; - output[out_offset + 1 * out_width + i] = v + x + x + z * 16; - output[out_offset + 2 * out_width + i] = u + w * 4 + y * 8; - output[out_offset + 3 * out_width + i] = v + x * 8 + z * 4; - output[out_offset + 4 * out_width + i] = u + w * 16 + y + y; - output[out_offset + 5 * out_width + i] = v + x * 32 + z + d7; - } +#pragma omp parallel for collapse(2) + for (index_t n = 0; n < batch; ++n) { + for (index_t m = 0; m < out_channels; ++m) { + index_t tile_offset = 0; + float s[8][6]; + for (index_t h = 0; h < out_height; h += 6) { + for (index_t w = 0; w < out_width; w += 6) { + const float *input_ptr = + input + n * input_batch_size + m * tile_count + tile_offset; + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + + d0 = input_ptr[0]; + d1 = input_ptr[1 * stride]; + d2 = input_ptr[2 * stride]; + d3 = input_ptr[3 * stride]; + d4 = input_ptr[4 * stride]; + d5 = input_ptr[5 * stride]; + d6 = input_ptr[6 * stride]; + d7 = input_ptr[7 * stride]; + + float u = d1 + d2; + float v = d1 - d2; + float w = d3 + d4; + float x = d3 - d4; + float y = d5 + d6; + float z = d5 - d6; + + s[i][0] = d0 + u + w + y * 32; + s[i][1] = v + x + x + z * 16; + s[i][2] = u + w * 4 + y * 8; + s[i][3] = v + x * 8 + z * 4; + s[i][4] = u + w * 16 + y + y; + s[i][5] = v + x * 32 + z + d7; + + input_ptr += 8 * stride; + } - ++tile_offset; + float *output_ptr = + output + n * output_batch_size + m * out_image_size + h * out_width + + w; + + for (int i = 0; i < 6; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = s[0][i]; + d1 = s[1][i]; + d2 = s[2][i]; + d3 = s[3][i]; + d4 = s[4][i]; + d5 = s[5][i]; + d6 = s[6][i]; + d7 = s[7][i]; + + float u = d1 + d2; + float v = d1 - d2; + float w = d3 + d4; + float x = d3 - d4; + float y = d5 + d6; + float z = d5 - d6; + + output_ptr[i] = d0 + u + w + y * 32; + output_ptr[1 * out_width + i] = v + x + x + z * 16; + output_ptr[2 * out_width + i] = u + w * 4 + y * 8; + output_ptr[3 * out_width + i] = v + x * 8 + z * 4; + output_ptr[4 * out_width + i] = u + w * 16 + y + y; + output_ptr[5 * out_width + i] = v + x * 32 + z + d7; + } + + ++tile_offset; + } } } } @@ -448,7 +485,7 @@ void TransformFilter4x4(const float *filter, for (index_t c = 0; c < in_channels; ++c) { float g0, g1, g2, g3, g4, g5, g6, g7, g8; float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; + s15; // load filter index_t filter_offset = (m * in_channels + c) * 9; @@ -537,14 +574,14 @@ void TransformFilter8x8(const float *filter, const index_t stride = out_channels * in_channels; const float G[8][3] = { - {1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} + {1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f 
/ 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f} }; #pragma omp parallel for collapse(2) @@ -575,7 +612,7 @@ void TransformFilter8x8(const float *filter, for (int i = 0; i < 8; ++i) { for (int j = 0; j < 8; ++j) { output[output_offset + (i * 8 + j) * stride] = - G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; + G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; } } } diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index eb374960cab4eb5d14a29316c4d78ab6fb2c324b..7a0b8328bb84ed088dcf532295928b5a6040e658 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -363,6 +363,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1; bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7 && stride_h == 3 && stride_w == 3 && dilation_h == 1 && dilation_w == 1; + bool use_neon_1x15_s1 = filter_h == 1 && filter_w == 15 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + bool use_neon_15x1_s1 = filter_h == 15 && filter_w == 1 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; std::vector<index_t> transformed_input_shape; std::vector<index_t> transformed_output_shape; @@ -402,24 +406,26 @@ tile_count}); transformed_filter_shape.insert(transformed_filter_shape.end(), {in_tile_area, channels, input_channels}); - } else if (use_neon_3x3_s1) { - extra_output_height = RoundUp<index_t>(height, 2); - extra_input_height = - std::max(padded_input_height, extra_output_height + 2); - extra_output_width = RoundUp<index_t>(width, 4); - extra_input_width = std::max(padded_input_width, extra_output_width + 2); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); + } else { + index_t tile_h, tile_w; + if (use_neon_1x1_s1) { + tile_h = 1; + tile_w = 1; + } else if (use_neon_3x3_s1) { + tile_h = 2; + tile_w = 4; + } else if (use_neon_15x1_s1) { + tile_h = 4; + tile_w = 1; + } else { + tile_h = 1; + tile_w = 4; } - } else if (!use_neon_1x1_s1) { - extra_output_height = height; + extra_output_height = RoundUp<index_t>(height, tile_h); extra_input_height = std::max(padded_input_height, (extra_output_height - 1) * stride_h + (filter_h - 1) * dilation_h + 1); - extra_output_width = RoundUp<index_t>(width, 4); + extra_output_width = RoundUp<index_t>(width, tile_w); extra_input_width = std::max(padded_input_width, (extra_output_width - 1) * stride_w + (filter_w - 1) * dilation_w + 1); @@ -584,6 +590,22 @@ struct Conv2dFunctor : Conv2dFunctorBase { extra_output_shape, pad_output); }; + } else if (use_neon_1x15_s1) { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK1x15S1(pad_input, + filter_data, + extra_input_shape, + extra_output_shape, + pad_output); + }; + } else if (use_neon_15x1_s1) { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK15x1S1(pad_input, + filter_data, + extra_input_shape, + extra_output_shape, + pad_output); + }; } else { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dGeneral(pad_input,
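Annotation on the unified tiling dispatch in the conv_2d.h hunk above: each NEON kernel declares its register tile (tile_h x tile_w), the output extent is rounded up to the tile, and the input is padded so that (extra_output - 1) * stride + (filter - 1) * dilation + 1 rows/columns are available. A hypothetical numeric check for the new 15x1 S1 kernel (tile_h = 4, tile_w = 1), assuming a 33x17 output and no pre-existing extra padding (RoundUpTo is a stand-in helper, not the project's RoundUp):

#include <cstdio>

static int RoundUpTo(int v, int factor) {  // hypothetical helper
  return (v + factor - 1) / factor * factor;
}

int main() {
  const int height = 33, width = 17;
  const int stride_h = 1, filter_h = 15, dilation_h = 1;
  const int tile_h = 4, tile_w = 1;
  const int extra_output_height = RoundUpTo(height, tile_h);  // 36
  const int extra_input_height =
      (extra_output_height - 1) * stride_h + (filter_h - 1) * dilation_h + 1;  // 50
  const int extra_output_width = RoundUpTo(width, tile_w);    // 17
  printf("%d %d %d\n", extra_output_height, extra_input_height,
         extra_output_width);
  return 0;
}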
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index bd21547d2cf8294913781f1c1cb6bb3828170edb..ac8c99131c4132cf4375ac06fdc443db39912edc 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -43,6 +43,7 @@ struct SoftmaxFunctor { const index_t batch = input->dim(0); const index_t class_count = input->dim(1); const index_t class_size = input->dim(2) * input->dim(3); + const index_t batch_size = class_count * class_size; Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); float *output_data = output->mutable_data<float>(); for (index_t b = 0; b < batch; ++b) { - std::vector<float> - max_val(class_size, std::numeric_limits<float>::lowest()); - std::vector<float> sum_val(class_size, 0.f); - - // calculate max for each class - for (index_t c = 0; c < class_count; ++c) { - const float - *input_ptr = input_data + (b * class_count + c) * class_size; - for (index_t k = 0; k < class_size; ++k) { - max_val[k] = std::max(max_val[k], input_ptr[k]); - } - } - - // calculate data - max for each class #pragma omp parallel for - for (index_t c = 0; c < class_count; ++c) { - const float - *input_ptr = input_data + (b * class_count + c) * class_size; - float *output_ptr = output_data + (b * class_count + c) * class_size; - for (index_t k = 0; k < class_size; ++k) { - output_ptr[k] = ::exp(input_ptr[k] - max_val[k]); + for (index_t k = 0; k < class_size; ++k) { + const float *input_ptr = input_data + b * batch_size + k; + float *output_ptr = output_data + b * batch_size + k; + + float max_val = std::numeric_limits<float>::lowest(); + index_t channel_offset = 0; + for (index_t c = 0; c < class_count; ++c) { + float data = input_ptr[channel_offset]; + if (data > max_val) { + max_val = data; + } + channel_offset += class_size; } - } - // calculate sum for each class - for (index_t c = 0; c < class_count; ++c) { - float *output_ptr = output_data + (b * class_count + c) * class_size; - for (index_t k = 0; k < class_size; ++k) { - sum_val[k] += output_ptr[k]; + channel_offset = 0; + float sum = 0; + for (index_t c = 0; c < class_count; ++c) { + float exp_value = ::exp(input_ptr[channel_offset] - max_val); + sum += exp_value; + output_ptr[channel_offset] = exp_value; + channel_offset += class_size; } - } - // calculate (data - max) / sum for each class - for (index_t c = 0; c < class_count; ++c) { - float *output_ptr = output_data + (b * class_count + c) * class_size; - for (index_t k = 0; k < class_size; ++k) { - output_ptr[k] /= sum_val[k]; + channel_offset = 0; + for (index_t c = 0; c < class_count; ++c) { + output_ptr[channel_offset] /= sum; + channel_offset += class_size; } - } - } + } // k + } // b } };
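Annotation on the softmax rewrite above: the new code makes a single OpenMP-parallel pass over spatial positions and walks the channel axis with stride class_size (the NCHW channel pitch), keeping one max and one sum per pixel instead of per-class temporary vectors. A minimal scalar sketch of the per-pixel computation, for a hypothetical 1x3x2x2 tensor (not code from this patch):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const int class_count = 3, class_size = 4;  // channels, H*W
  float data[12];
  for (int i = 0; i < 12; ++i) data[i] = 0.1f * i;
  for (int k = 0; k < class_size; ++k) {      // one pixel at a time
    float max_val = data[k];
    for (int c = 1; c < class_count; ++c)
      max_val = std::max(max_val, data[c * class_size + k]);
    float sum = 0.f;
    for (int c = 0; c < class_count; ++c) {
      float e = std::exp(data[c * class_size + k] - max_val);
      data[c * class_size + k] = e;
      sum += e;
    }
    for (int c = 0; c < class_count; ++c)
      data[c * class_size + k] /= sum;        // probabilities per pixel
  }
  printf("pixel 0 probs: %f %f %f\n", data[0], data[4], data[8]);
  return 0;
}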
diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.h index 3f49ee9c4f6198548f2178b70a70a64efa340086..3e52de1a105ff96c2fc93b6f0ce12f70078f4aa1 100644 --- a/mace/kernels/transpose.h +++ b/mace/kernels/transpose.h @@ -15,6 +15,10 @@ #ifndef MACE_KERNELS_TRANSPOSE_H_ #define MACE_KERNELS_TRANSPOSE_H_ +#if defined(MACE_ENABLE_NEON) +#include <arm_neon.h> +#endif + #include <vector> #include "mace/core/future.h" @@ -25,6 +29,65 @@ namespace mace { namespace kernels { +static void TransposeNHWCToNCHWC3(const float *input, + float *output, + const index_t height, + const index_t width) { + index_t image_size = height * width; + +#pragma omp parallel for + for (index_t h = 0; h < height; ++h) { + index_t in_offset = h * width * 3; + index_t out_offset = h * width; + + index_t w; + for (w = 0; w + 3 < width; w += 4) { + float32x4x3_t vi = vld3q_f32(input + in_offset); + vst1q_f32(output + out_offset, vi.val[0]); + vst1q_f32(output + out_offset + image_size, vi.val[1]); + vst1q_f32(output + out_offset + image_size * 2, vi.val[2]); + + in_offset += 12; + out_offset += 4; + } + for (; w < width; ++w) { + for (index_t c = 0; c < 3; ++c) { + output[h * width + image_size * c + w] = + input[h * width * 3 + w * 3 + c]; + } + } + } +} + +static void TransposeNCHWToNHWCC2(const float *input, + float *output, + const index_t height, + const index_t width) { + index_t image_size = height * width; +#pragma omp parallel for + for (index_t h = 0; h < height; ++h) { + index_t in_offset = h * width; + index_t out_offset = h * width * 2; + + index_t w; + for (w = 0; w + 3 < width; w += 4) { + float32x4_t vi0 = vld1q_f32(input + in_offset); + float32x4_t vi1 = vld1q_f32(input + in_offset + image_size); + float32x4x2_t vi = {vi0, vi1}; + vst2q_f32(output + out_offset, vi); + + in_offset += 4; + out_offset += 8; + } + for (; w < width; ++w) { + for (index_t c = 0; c < 2; ++c) { + output[h * width * 2 + w * 2 + c] = + input[h * width + image_size * c + w]; + } + } + } +} + template <DeviceType D, typename T> struct TransposeFunctor { explicit TransposeFunctor(const std::vector<int> &dims) : dims_(dims) {} @@ -48,28 +111,48 @@ } } } else if (input->dim_size() == 4) { - std::vector<index_t> - in_stride{input_shape[1] * input_shape[2] * input_shape[3], - input_shape[2] * input_shape[3], input_shape[3], 1}; - std::vector<index_t> - out_stride{output_shape[1] * output_shape[2] * output_shape[3], - output_shape[2] * output_shape[3], output_shape[3], 1}; - - std::vector<index_t> idim(4, 0); - std::vector<index_t> odim(4, 0); - for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) { - for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) { - for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) { - for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) { - idim[dims_[0]] = odim[0]; - idim[dims_[1]] = odim[1]; - idim[dims_[2]] = odim[2]; - idim[dims_[3]] = odim[3]; - - output_data[odim[0] * out_stride[0] + odim[1] * out_stride[1] - + odim[2] * out_stride[2] + odim[3]] = - input_data[idim[0] * in_stride[0] + idim[1] * in_stride[1] - + idim[2] * in_stride[2] + idim[3]]; + std::vector<int> transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2}; + std::vector<int> transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1}; + index_t batch_size = input->dim(1) * input->dim(2) * input->dim(3); + if (dims_ == transpose_order_from_NHWC_to_NCHW && input->dim(3) == 3) { + for (index_t b = 0; b < input->dim(0); ++b) { + TransposeNHWCToNCHWC3(input_data + b * batch_size, + output_data + b * batch_size, + input->dim(1), + input->dim(2)); + } + } else if (dims_ == transpose_order_from_NCHW_to_NHWC + && input->dim(1) == 2) { + for (index_t b = 0; b < input->dim(0); ++b) { + TransposeNCHWToNHWCC2(input_data + b * batch_size, + output_data + b * batch_size, + input->dim(2), + input->dim(3)); + } + } else { + std::vector<index_t> + in_stride{input_shape[1] * input_shape[2] * input_shape[3], + input_shape[2] * input_shape[3], input_shape[3], 1}; + std::vector<index_t> + out_stride{output_shape[1] * output_shape[2] * output_shape[3], + output_shape[2] * output_shape[3], output_shape[3], 1}; + + std::vector<index_t> idim(4, 0); + std::vector<index_t> odim(4, 0); + for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) { + for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) { + for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) { + for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) { + idim[dims_[0]] = odim[0]; + idim[dims_[1]] = odim[1]; + idim[dims_[2]] = odim[2]; + idim[dims_[3]] = odim[3]; + + output_data[odim[0] * out_stride[0] + odim[1] * out_stride[1] + + odim[2] * out_stride[2] + odim[3]] = + input_data[idim[0] * in_stride[0] + idim[1] * in_stride[1] + + idim[2] * in_stride[2] + idim[3]]; + } } } }
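Annotation on the specialized transposes in the transpose.h hunk above: the C=3 NHWC-to-NCHW fast path deinterleaves three channels of four pixels with one structured load (vld3q_f32), and the C=2 reverse path re-interleaves with vst2q_f32. A minimal demo of the structured load, with hypothetical data (not code from this patch):

#include <arm_neon.h>
#include <cstdio>

int main() {
  // four RGB pixels, channel-interleaved as in NHWC
  float nhwc[12] = {0, 100, 200, 1, 101, 201, 2, 102, 202, 3, 103, 203};
  float32x4x3_t v = vld3q_f32(nhwc);  // val[0]=R plane, val[1]=G, val[2]=B
  float r[4], g[4];
  vst1q_f32(r, v.val[0]);
  vst1q_f32(g, v.val[1]);
  printf("R: %g %g %g %g  G: %g %g %g %g\n",
         r[0], r[1], r[2], r[3], g[0], g[1], g[2], g[3]);
  return 0;
}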
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index a208653333bdea04dfa81303cbd9b78a5b8aa5a8..4a5d80e4cfbfabcd7d948d874fdc8c2f144fbfd4 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -165,6 +165,13 @@ BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32); BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128); BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024); +BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32); +BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32); + +BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); +BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); +BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index ea50b0c17c2155adb974485b4fef8b4d275686ef..543e2ac906aed88c1bb904c592dab6bfa7708482 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -779,11 +779,17 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) { TestHalfComplexConvNxNS12({32, 32}, {15, 1, 256, 2}, {1, 1}); + TestHalfComplexConvNxNS12({64, 64}, {15, 1, 64, 2}, + {1, 1}); + TestHalfComplexConvNxNS12({256, 256}, {15, 1, 32, 2}, + {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) { TestHalfComplexConvNxNS12({32, 32}, {1, 15, 256, 2}, {1, 1}); + TestHalfComplexConvNxNS12({256, 256}, {1, 15, 32, 2}, + {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) { diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc index a86549ed9cc4206b00d9276df524e95d491acad7..24e6f2ffe44499de11da8fd2eb22a1010401c6b6 100644 --- a/mace/ops/transpose_benchmark.cc +++ b/mace/ops/transpose_benchmark.cc @@ -83,6 +83,9 @@ void TransposeBenchmark(int iters, #define BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \ BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU); + +BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2); +BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1); BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2); BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1); BM_TRANSPOSE2D(128, 128); diff --git a/mace/ops/transpose_test.cc b/mace/ops/transpose_test.cc index 0faacc9111c4e904a6bd2a95b44b835376ae9987..3a4b5729e349889f0ad725255f46fa0cbf5a90b9 100644 --- a/mace/ops/transpose_test.cc +++ b/mace/ops/transpose_test.cc @@ -37,16 +37,51 @@ void TransposeNCHWTest(const std::vector<index_t> &input_shape) { // Run on cpu net.RunOp(); - net.FillNHWCInputToNCHWInput("InputNCHW", "Input"); + net.TransformDataFormat<DeviceType::CPU, float>("Input", + DataFormat::NHWC, + "InputNCHW", + DataFormat::NCHW); ExpectTensorNear<float>(*net.GetOutput("InputNCHW"), *net.GetOutput("Output")); } + +void TransposeNHWCTest(const std::vector<index_t> &input_shape) { + // Construct graph + OpsTestNet net; + // Add input data + net.AddRandomInput<DeviceType::CPU, float>("Input", input_shape); + + OpDefBuilder("Transpose", "TransposeNHWCTest") + .Input("Input") + .Output("Output") + .AddIntsArg("dims", {0, 2, 3, 1}) + .Finalize(net.NewOperatorDef()); + + // Run on cpu + net.RunOp(); + + net.TransformDataFormat<DeviceType::CPU, float>("Input", + DataFormat::NCHW, + "InputNHWC", + DataFormat::NHWC); + + ExpectTensorNear<float>(*net.GetOutput("InputNHWC"), + *net.GetOutput("Output")); +} } // namespace -TEST_F(TransposeOpTest, NCHW) { +TEST_F(TransposeOpTest, NHWC_to_NCHW) { TransposeNCHWTest({3, 64, 64, 128}); TransposeNCHWTest({1, 64, 48, 128}); + TransposeNCHWTest({1, 512, 512, 3}); + TransposeNCHWTest({2, 512, 512, 3}); +} + +TEST_F(TransposeOpTest, NCHW_to_NHWC) { + TransposeNHWCTest({1, 2, 512, 512}); + TransposeNHWCTest({1, 3, 512, 512}); + TransposeNHWCTest({2, 2, 512, 512});
} TEST_F(TransposeOpTest, Rank2) { diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 0de6190717a2a3628efff28b73825a2638b8da8e..164b54e45595891752a6302a8883b9271e2bc7aa 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -40,11 +40,6 @@ FLAGS = None device_type_map = {'cpu': cvt.DeviceType.CPU.value, 'gpu': cvt.DeviceType.GPU.value, 'dsp': cvt.DeviceType.HEXAGON.value} -device_data_type_map = { - cvt.DeviceType.CPU.value: mace_pb2.DT_FLOAT, - cvt.DeviceType.GPU.value: mace_pb2.DT_HALF, - cvt.DeviceType.HEXAGON.value: mace_pb2.DT_UINT8 -} def file_checksum(fname): @@ -129,6 +124,17 @@ def main(unused_args): FLAGS.weight_file) output_graph_def = converter.run() + + if FLAGS.gpu_data_type == 'half': + gpu_data_type = mace_pb2.DT_HALF + else: + gpu_data_type = mace_pb2.DT_FLOAT + device_data_type_map = { + cvt.DeviceType.CPU.value: mace_pb2.DT_FLOAT, + cvt.DeviceType.GPU.value: gpu_data_type, + cvt.DeviceType.HEXAGON.value: mace_pb2.DT_UINT8 + } + print("Transform model to one that can better run on device") if not FLAGS.runtime: cpu_graph_def = copy.deepcopy(output_graph_def) @@ -180,7 +186,7 @@ def main(unused_args): tensor_util.rename_tensor(output_graph_def) tensor_infos, model_data = tensor_util.get_tensor_info_and_model_data( - output_graph_def, FLAGS.runtime) + output_graph_def, FLAGS.runtime, FLAGS.gpu_data_type) source_converter_lib.convert_to_source( output_graph_def, model_checksum, weight_checksum, FLAGS.template, @@ -194,7 +200,10 @@ def main(unused_args): f.write(bytearray(model_data)) if FLAGS.model_load_type == 'pb': - tensor_util.del_tensor_data(output_graph_def, FLAGS.runtime) + tensor_util.del_tensor_data( + output_graph_def, FLAGS.runtime, FLAGS.gpu_data_type) + tensor_util.update_tensor_data_type( + output_graph_def, FLAGS.runtime, FLAGS.gpu_data_type) with open(FLAGS.pb_output, "wb") as f: f.write(output_graph_def.SerializeToString()) # with open(FLAGS.pb_output + '_txt', "wb") as f: @@ -253,8 +262,6 @@ def parse_args(): help="e.g., input_node") parser.add_argument( "--output_node", type=str, default="softmax", help="e.g., softmax") - parser.add_argument( - "--output_type", type=str, default="pb", help="output type: source/pb") parser.add_argument( "--template", type=str, default="", help="template path") parser.add_argument( @@ -293,6 +300,8 @@ def parse_args(): default="source", help="[source|pb] Load models in generated `source` code" + "or `pb` file.") + parser.add_argument( + "--gpu_data_type", type=str, default="half", help="half/float") return parser.parse_known_args() diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 709d1feeaef6f4fe918b9d9c9334b9e685d616ec..be6e67529166d6678df326af14c369124fff7e08 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -153,14 +153,15 @@ class TransformerRule(Enum): TRANSFORM_GPU_WINOGRAD = 8 TRANSFORM_ADD_TO_BIASADD = 9 FOLD_BIASADD = 10 - FOLD_ACTIVATION = 11 - TRANSPOSE_FILTERS = 12 - RESHAPE_FC_WEIGHT = 13 - TRANSPOSE_DATA_FORMAT = 14 - TRANSFORM_GLOBAL_CONV_TO_FC = 15 - TRANSFORM_BUFFER_IMAGE = 16 - ADD_DEVICE_AND_DATA_TYPE = 17 - SORT_BY_EXECUTION = 18 + FLATTEN_ATROUS_CONV = 11 + FOLD_ACTIVATION = 12 + TRANSPOSE_FILTERS = 13 + RESHAPE_FC_WEIGHT = 14 + TRANSPOSE_DATA_FORMAT = 15 + TRANSFORM_GLOBAL_CONV_TO_FC = 16 + TRANSFORM_BUFFER_IMAGE = 17 + ADD_DEVICE_AND_DATA_TYPE = 18 + SORT_BY_EXECUTION = 19 class 
ConverterInterface(object): @@ -218,6 +219,7 @@ class ConverterOption(object): TransformerRule.TRANSFORM_GPU_WINOGRAD, TransformerRule.TRANSFORM_ADD_TO_BIASADD, TransformerRule.FOLD_BIASADD, + TransformerRule.FLATTEN_ATROUS_CONV, TransformerRule.FOLD_ACTIVATION, TransformerRule.TRANSPOSE_FILTERS, TransformerRule.TRANSPOSE_DATA_FORMAT, diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py index 2917562df353872b4a4b5b9f1a9ad398a1e992e3..8f05e61c02d496716889f57609fc69c0f642e184 100644 --- a/mace/python/tools/converter_tool/tensorflow_converter.py +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -16,6 +16,7 @@ import math import numpy as np import tensorflow as tf +from enum import Enum from mace.proto import mace_pb2 from mace.python.tools.converter_tool import base_converter @@ -41,6 +42,50 @@ tf_epsilon_str = 'epsilon' tf_align_corners = 'align_corners' tf_block_size = 'block_size' +TFSupportedOps = [ + 'Conv2D', + 'DepthwiseConv2dNative', + 'Conv2DBackpropInput', + 'BiasAdd', + 'Add', + 'Sub', + 'Mul', + 'Div', + 'Min', + 'Max', + 'Neg', + 'Abs', + 'RealDiv', + 'SquaredDifference', + 'Pow', + 'Relu', + 'Relu6', + 'Tanh', + 'Sigmoid', + 'FusedBatchNorm', + 'AvgPool', + 'MaxPool', + 'Squeeze', + 'MatMul', + 'Identity', + 'Reshape', + 'Shape', + 'Transpose', + 'Softmax', + 'ResizeBilinear', + 'Placeholder', + 'SpaceToBatchND', + 'BatchToSpaceND', + 'DepthToSpace', + 'SpaceToDepth', + 'Pad', + 'ConcatV2', + 'Mean', + 'Const', +] + +TFOpType = Enum('TFOpType', [(op, op) for op in TFSupportedOps], type=str) + class TensorflowConverter(base_converter.ConverterInterface): """A class for convert tensorflow frozen model to mace model. @@ -53,71 +98,70 @@ class TensorflowConverter(base_converter.ConverterInterface): 'FULL': PaddingMode.FULL } pooling_type_mode = { - 'AvgPool': PoolingType.AVG, - 'MaxPool': PoolingType.MAX + TFOpType.AvgPool.name: PoolingType.AVG, + TFOpType.MaxPool.name: PoolingType.MAX } eltwise_type = { - 'Add': EltwiseType.SUM, - 'Sub': EltwiseType.SUB, - 'Mul': EltwiseType.PROD, - 'Div': EltwiseType.DIV, - 'Min': EltwiseType.MIN, - 'Max': EltwiseType.MAX, - 'Neg': EltwiseType.NEG, - 'Abs': EltwiseType.ABS, - 'RealDiv': EltwiseType.DIV, - 'SquaredDifference': EltwiseType.SQR_DIFF, - 'Pow': EltwiseType.POW + TFOpType.Add.name: EltwiseType.SUM, + TFOpType.Sub.name: EltwiseType.SUB, + TFOpType.Mul.name: EltwiseType.PROD, + TFOpType.Div.name: EltwiseType.DIV, + TFOpType.Min.name: EltwiseType.MIN, + TFOpType.Max.name: EltwiseType.MAX, + TFOpType.Neg.name: EltwiseType.NEG, + TFOpType.Abs.name: EltwiseType.ABS, + TFOpType.RealDiv.name: EltwiseType.DIV, + TFOpType.SquaredDifference.name: EltwiseType.SQR_DIFF, + TFOpType.Pow.name: EltwiseType.POW } activation_type = { - 'Relu': ActivationType.RELU, - 'Relu6': ActivationType.RELUX, - 'Tanh': ActivationType.TANH, - 'Sigmoid': ActivationType.SIGMOID + TFOpType.Relu.name: ActivationType.RELU, + TFOpType.Relu6.name: ActivationType.RELUX, + TFOpType.Tanh.name: ActivationType.TANH, + TFOpType.Sigmoid.name: ActivationType.SIGMOID } def __init__(self, option, src_model_file): self._op_converters = { - 'Conv2D': self.convert_conv2d, - 'DepthwiseConv2dNative': self.convert_conv2d, - 'Conv2DBackpropInput': self.convert_conv2d, - 'BiasAdd': self.convert_biasadd, - 'Add': self.convert_add, - 'Sub': self.convert_elementwise, - 'Mul': self.convert_elementwise, - 'Div': self.convert_elementwise, - 'Min': self.convert_elementwise, - 'Max': self.convert_elementwise, - 
-            'Neg': self.convert_elementwise,
-            'Abs': self.convert_elementwise,
-            'RealDiv': self.convert_elementwise,
-            'SquaredDifference': self.convert_elementwise,
-            'Pow': self.convert_elementwise,
-            'Relu': self.convert_activation,
-            'Relu6': self.convert_activation,
-            'Tanh': self.convert_activation,
-            'Sigmoid': self.convert_activation,
-            'FusedBatchNorm': self.convert_fused_batchnorm,
-            'AvgPool': self.convert_pooling,
-            'MaxPool': self.convert_pooling,
-            'Squeeze': self.convert_identity,
-            'MatMul': self.convert_matmul,
-            'Identity': self.convert_identity,
-            'Reshape': self.convert_reshape,
-            'Shape': self.convert_nop,
-            'Transpose': self.convert_transpose,
-            'Softmax': self.convert_softmax,
-            'ResizeBilinear': self.convert_resize_bilinear,
-            'Placeholder': self.convert_nop,
-            'SpaceToBatchND': self.convert_space_batch,
-            'BatchToSpaceND': self.convert_space_batch,
-            'DepthToSpace': self.convert_space_depth,
-            'SpaceToDepth': self.convert_space_depth,
-            'Pad': self.convert_pad,
-            'ConcatV2': self.convert_concat,
-            'Mean': self.convert_mean,
-            # Const converter_tool should be placed at the end
-            'Const': self.convert_tensor,
+            TFOpType.Conv2D.name: self.convert_conv2d,
+            TFOpType.DepthwiseConv2dNative.name: self.convert_conv2d,
+            TFOpType.Conv2DBackpropInput.name: self.convert_conv2d,
+            TFOpType.BiasAdd.name: self.convert_biasadd,
+            TFOpType.Add.name: self.convert_add,
+            TFOpType.Sub.name: self.convert_elementwise,
+            TFOpType.Mul.name: self.convert_elementwise,
+            TFOpType.Div.name: self.convert_elementwise,
+            TFOpType.Min.name: self.convert_elementwise,
+            TFOpType.Max.name: self.convert_elementwise,
+            TFOpType.Neg.name: self.convert_elementwise,
+            TFOpType.Abs.name: self.convert_elementwise,
+            TFOpType.RealDiv.name: self.convert_elementwise,
+            TFOpType.SquaredDifference.name: self.convert_elementwise,
+            TFOpType.Pow.name: self.convert_elementwise,
+            TFOpType.Relu.name: self.convert_activation,
+            TFOpType.Relu6.name: self.convert_activation,
+            TFOpType.Tanh.name: self.convert_activation,
+            TFOpType.Sigmoid.name: self.convert_activation,
+            TFOpType.FusedBatchNorm.name: self.convert_fused_batchnorm,
+            TFOpType.AvgPool.name: self.convert_pooling,
+            TFOpType.MaxPool.name: self.convert_pooling,
+            TFOpType.Squeeze.name: self.convert_identity,
+            TFOpType.MatMul.name: self.convert_matmul,
+            TFOpType.Identity.name: self.convert_identity,
+            TFOpType.Reshape.name: self.convert_reshape,
+            TFOpType.Shape.name: self.convert_nop,
+            TFOpType.Transpose.name: self.convert_transpose,
+            TFOpType.Softmax.name: self.convert_softmax,
+            TFOpType.ResizeBilinear.name: self.convert_resize_bilinear,
+            TFOpType.Placeholder.name: self.convert_nop,
+            TFOpType.SpaceToBatchND.name: self.convert_space_batch,
+            TFOpType.BatchToSpaceND.name: self.convert_space_batch,
+            TFOpType.DepthToSpace.name: self.convert_space_depth,
+            TFOpType.SpaceToDepth.name: self.convert_space_depth,
+            TFOpType.Pad.name: self.convert_pad,
+            TFOpType.ConcatV2.name: self.convert_concat,
+            TFOpType.Mean.name: self.convert_mean,
+            TFOpType.Const.name: self.convert_nop,
         }
         self._option = option
         self._mace_net_def = mace_pb2.NetDef()
@@ -180,24 +224,29 @@ class TensorflowConverter(base_converter.ConverterInterface):
                        "Mace does not support tensorflow op type %s yet"
                        % tf_op.type)
             self._op_converters[tf_op.type](tf_op)
+        self.convert_tensors()
 
-    def convert_tensor(self, tf_op):
-        output_name = tf_op.outputs[0].name
-        if output_name not in self._skip_tensor:
-            tensor = self._mace_net_def.tensors.add()
-            tensor.name = tf_op.outputs[0].name
-            tf_tensor = tf_op.outputs[0].eval()
-            tensor.dims.extend(list(tf_tensor.shape))
-
-            tf_dt = tf_op.get_attr('dtype')
-            if tf_dt == tf.float32:
-                tensor.data_type = mace_pb2.DT_FLOAT
-                tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
-            elif tf_dt == tf.int32:
-                tensor.data_type = mace_pb2.DT_INT32
-                tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
-            else:
-                mace_check(False, "Not supported tensor type: %s" % tf_dt.name)
+    def convert_tensors(self):
+        for tf_op in self._tf_graph.get_operations():
+            if tf_op.type != TFOpType.Const.name:
+                continue
+            output_name = tf_op.outputs[0].name
+            if output_name not in self._skip_tensor:
+                tensor = self._mace_net_def.tensors.add()
+                tensor.name = tf_op.outputs[0].name
+                tf_tensor = tf_op.outputs[0].eval()
+                tensor.dims.extend(list(tf_tensor.shape))
+
+                tf_dt = tf_op.get_attr('dtype')
+                if tf_dt == tf.float32:
+                    tensor.data_type = mace_pb2.DT_FLOAT
+                    tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
+                elif tf_dt == tf.int32:
+                    tensor.data_type = mace_pb2.DT_INT32
+                    tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
+                else:
+                    mace_check(False,
+                               "Unsupported tensor type: %s" % tf_dt.name)
 
     def add_tensor(self, name, shape, data_type, value):
         tensor = self._mace_net_def.tensors.add()
@@ -229,9 +278,9 @@ class TensorflowConverter(base_converter.ConverterInterface):
 
     def convert_conv2d(self, tf_op):
         op = self.convert_general_op(tf_op)
-        if tf_op.type == 'DepthwiseConv2dNative':
+        if tf_op.type == TFOpType.DepthwiseConv2dNative.name:
             op.type = MaceOp.DepthwiseConv2d.name
-        elif tf_op.type == 'Conv2DBackpropInput':
+        elif tf_op.type == TFOpType.Conv2DBackpropInput.name:
             op.type = MaceOp.Deconv2D.name
         else:
             op.type = MaceOp.Conv2D.name
@@ -274,7 +323,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
         type_arg.name = MaceKeyword.mace_activation_type_str
         type_arg.s = self.activation_type[tf_op.type].name
 
-        if tf_op.type == 'Relu6':
+        if tf_op.type == TFOpType.Relu6.name:
             limit_arg = op.arg.add()
             limit_arg.name = MaceKeyword.mace_activation_max_limit_str
             limit_arg.f = 6.0
@@ -335,7 +384,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
         size_arg.name = MaceKeyword.mace_resize_size_str
         size_value = tf_op.inputs[1].eval().astype(np.int32)
         size_arg.ints.extend(size_value)
-        self._skip_tensor.update(tf_op.inputs[1].name)
+        self._skip_tensor.add(tf_op.inputs[1].name)
         align_corners_arg = op.arg.add()
         align_corners_arg.name = MaceKeyword.mace_align_corners_str
         align_corners_arg.i = tf_op.get_attr(tf_align_corners)
@@ -357,7 +406,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
         size_arg.ints.extend(size_value)
 
         crops_or_paddings_arg = op.arg.add()
-        if op.type == 'BatchToSpaceND':
+        if op.type == TFOpType.BatchToSpaceND.name:
             op.type = MaceOp.BatchToSpaceND.name
             crops_or_paddings_arg.name = \
                 MaceKeyword.mace_batch_to_space_crops_str
@@ -367,12 +416,12 @@ class TensorflowConverter(base_converter.ConverterInterface):
         crops_or_paddings_value = tf_op.inputs[2].eval().astype(np.int32).flat
         crops_or_paddings_arg.ints.extend(crops_or_paddings_value)
 
-        self._skip_tensor.update(tf_op.inputs[1].name)
-        self._skip_tensor.update(tf_op.inputs[2].name)
+        self._skip_tensor.add(tf_op.inputs[1].name)
+        self._skip_tensor.add(tf_op.inputs[2].name)
 
     def convert_space_depth(self, tf_op):
         op = self.convert_general_op(tf_op)
-        if op.type == 'SpaceToDepth':
+        if op.type == TFOpType.SpaceToDepth.name:
             op.type = MaceOp.SpaceToDepth.name
         else:
             op.type = MaceOp.DepthToSpace.name
@@ -390,14 +439,14 @@ class TensorflowConverter(base_converter.ConverterInterface):
         paddings_arg.name = MaceKeyword.mace_paddings_str
         paddings_value = tf_op.inputs[1].eval().astype(np.int32).flat
         paddings_arg.ints.extend(paddings_value)
-        self._skip_tensor.update(tf_op.inputs[1].name)
+        self._skip_tensor.add(tf_op.inputs[1].name)
 
         if len(tf_op.inputs) == 3:
             constant_value_arg = op.arg.add()
             constant_value_arg.name = MaceKeyword.mace_constant_value_str
             constant_value = tf_op.inputs[2].eval().astype(np.int32).flat[0]
             constant_value_arg.i = constant_value
-            self._skip_tensor.update(tf_op.inputs[2].name)
+            self._skip_tensor.add(tf_op.inputs[2].name)
 
     def convert_concat(self, tf_op):
         op = self.convert_general_op(tf_op)
@@ -412,7 +461,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
         mace_check(axis == 3,
                    "only support concat at channel dimension")
 
-        self._skip_tensor.update(tf_op.inputs[-1].name)
+        self._skip_tensor.add(tf_op.inputs[-1].name)
 
     def convert_matmul(self, tf_op):
         op = self.convert_general_op(tf_op)
@@ -426,13 +475,13 @@ class TensorflowConverter(base_converter.ConverterInterface):
         shape_arg = op.arg.add()
         shape_arg.name = MaceKeyword.mace_shape_str
         shape_value = []
-        if tf_op.inputs[1].op.type == 'Const':
+        if tf_op.inputs[1].op.type == TFOpType.Const.name:
             shape_value = list(tf_op.inputs[1].eval().astype(np.int32))
             for i in xrange(len(shape_value)):
                 if shape_value[i] == -1:
                     shape_value[i] = 1
-            self._skip_tensor.update(tf_op.inputs[-1].name)
-        elif tf_op.inputs[1].op.type == 'Shape':
+            self._skip_tensor.add(tf_op.inputs[-1].name)
+        elif tf_op.inputs[1].op.type == TFOpType.Shape.name:
             shape_value = list(tf_op.inputs[1].op.inputs[0].shape.as_list())
         shape_arg.ints.extend(shape_value)
 
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 4b7e098d3883bb295fe995795c051abc0a657835..0fa5ddd967f026757e886a39f2e84f5a63c975bf 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -66,6 +66,7 @@ class Transformer(base_converter.ConverterInterface):
             TransformerRule.TRANSFORM_ADD_TO_BIASADD,
             TransformerRule.FOLD_BIASADD,
+            TransformerRule.FLATTEN_ATROUS_CONV,
             TransformerRule.FOLD_ACTIVATION,
             TransformerRule.TRANSPOSE_FILTERS,
             TransformerRule.TRANSPOSE_DATA_FORMAT,
             TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC,
@@ -93,6 +94,7 @@ class Transformer(base_converter.ConverterInterface):
             TransformerRule.TRANSFORM_ADD_TO_BIASADD:
                 self.transform_add_to_biasadd,
             TransformerRule.FOLD_BIASADD: self.fold_biasadd,
+            TransformerRule.FLATTEN_ATROUS_CONV: self.flatten_atrous_conv,
             TransformerRule.FOLD_ACTIVATION: self.fold_activation,
             TransformerRule.TRANSPOSE_FILTERS: self.transpose_filters,
             TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
@@ -616,6 +618,65 @@ class Transformer(base_converter.ConverterInterface):
 
         return False
 
+    def flatten_atrous_conv(self):
+        if self._option.device != DeviceType.GPU.value:
+            return
+
+        net = self._model
+        for op in net.op:
+            if (op.type == MaceOp.SpaceToBatchND.name
+                    and len(self._consumers.get(op.output[0], [])) == 1):
+                conv_op = self._consumers.get(op.output[0])[0]
+                if (conv_op.type == MaceOp.Conv2D.name
+                        or conv_op.type == MaceOp.DepthwiseConv2d.name) \
+                        and len(self._consumers.get(conv_op.output[0], [])) == 1:  # noqa
+                    b2s_op = self._consumers.get(conv_op.output[0])[0]
+                    if b2s_op.type == MaceOp.BatchToSpaceND.name:
+                        print "Flatten atrous convolution"
+                        # Add args.
+                        padding_arg_values = ConverterUtil.get_arg(
+                            op,
+                            MaceKeyword.mace_paddings_str).ints
+                        blocks_arg_values = ConverterUtil.get_arg(
+                            b2s_op,
+                            MaceKeyword.mace_space_batch_block_shape_str).ints
+                        dilation_arg = ConverterUtil.get_arg(
+                            conv_op,
+                            MaceKeyword.mace_dilations_str)
+                        if dilation_arg is None:
+                            dilation_arg = conv_op.arg.add()
+                            dilation_arg.name = MaceKeyword.mace_dilations_str
+                        dilation_arg.ints[:] = blocks_arg_values
+
+                        padding_arg = ConverterUtil.get_arg(
+                            conv_op,
+                            MaceKeyword.mace_padding_str)
+                        if padding_arg is None:
+                            padding_arg = conv_op.arg.add()
+                            padding_arg.name = MaceKeyword.mace_padding_str
+                        if len(padding_arg_values) > 0 \
+                                and padding_arg_values[0] > 0:
+                            padding_arg.i = PaddingMode.SAME.value
+                        else:
+                            padding_arg.i = PaddingMode.VALID.value
+
+                        strides_arg = ConverterUtil.get_arg(
+                            conv_op,
+                            MaceKeyword.mace_strides_str)
+                        if strides_arg is None:
+                            strides_arg = conv_op.arg.add()
+                            strides_arg.name = MaceKeyword.mace_strides_str
+                        strides_arg.ints[:] = [1, 1]
+
+                        # update output shape
+                        conv_op.output_shape[0].dims[:] = \
+                            b2s_op.output_shape[0].dims[:]
+
+                        self.safe_remove_node(op, None)
+                        self.safe_remove_node(b2s_op, conv_op)
+                        return True
+        return False
+
     def fold_activation(self):
         net = self._model
         for op in net.op:
diff --git a/mace/python/tools/source_converter_lib.py b/mace/python/tools/source_converter_lib.py
index 0fbf32fb31b0fc3f7ad8aa6dd7f3544c6dc78916..145d1d4127801cfd4c9387ca4b4078be8a87d72d 100644
--- a/mace/python/tools/source_converter_lib.py
+++ b/mace/python/tools/source_converter_lib.py
@@ -27,7 +27,6 @@ def convert_to_source(net_def, model_checksum, weight_checksum,
                       template_dir, obfuscate, model_tag, output,
                       runtime, embed_model_data, winograd_conv,
                       model_load_type, tensor_infos,
                       model_data):
-    # Capture our current directory
     print template_dir
 
diff --git a/mace/python/tools/tensor_util.py b/mace/python/tools/tensor_util.py
index e94a10ca92042e745900c8d2f7c8cd7faf19d1a6..62d33052982fe19784385cf2d32cd3faad146233 100644
--- a/mace/python/tools/tensor_util.py
+++ b/mace/python/tools/tensor_util.py
@@ -105,11 +105,11 @@ def rename_tensor(net_def):
 
 
 class TensorInfo:
-    def __init__(self, id, t, runtime):
+    def __init__(self, id, t, runtime, gpu_data_type):
         self.id = id
         self.data_type = mace_pb2.DataType.Name(t.data_type)
         if t.data_type == mace_pb2.DT_FLOAT:
-            if runtime == 'gpu':
+            if runtime == 'gpu' and gpu_data_type == 'half':
                 self.data_type = mace_pb2.DT_HALF
                 self.data = bytearray(
                     np.array(t.float_data).astype(np.float16).tobytes())
@@ -127,13 +127,13 @@ class TensorInfo:
             raise Exception('Tensor data type %s not supported' % t.data_type)
 
 
-def get_tensor_info_and_model_data(net_def, runtime):
+def get_tensor_info_and_model_data(net_def, runtime, gpu_data_type):
     model_data = []
     offset = 0
     counter = 0
     tensor_infos = []
     for t in net_def.tensors:
-        tensor_info = TensorInfo(counter, t, runtime)
+        tensor_info = TensorInfo(counter, t, runtime, gpu_data_type)
         tensor_infos.append(tensor_info)
         # align
         if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0:
@@ -156,15 +156,17 @@ def get_tensor_info_and_model_data(net_def, runtime):
     return tensor_infos, model_data
 
 
-def del_tensor_data(net_def, runtime):
+def del_tensor_data(net_def, runtime, gpu_data_type):
     for t in net_def.tensors:
         if t.data_type == mace_pb2.DT_FLOAT:
             del t.float_data[:]
-            if runtime == 'gpu':
-                t.data_type = mace_pb2.DT_HALF
-            else:
-                t.data_type = mace_pb2.DT_FLOAT
         elif t.data_type == mace_pb2.DT_INT32:
             del t.int32_data[:]
         elif t.data_type == mace_pb2.DT_UINT8:
             del t.int32_data[:]
+
+def update_tensor_data_type(net_def, runtime, gpu_data_type):
+    for t in net_def.tensors:
+        if t.data_type == mace_pb2.DT_FLOAT and runtime == 'gpu' \
+                and gpu_data_type == 'half':
+            t.data_type = mace_pb2.DT_HALF
diff --git a/tools/mace_tools.py b/tools/mace_tools.py
index c68ea22532d00da22de0aaca6dcfa7953112fd7d..7bc15c7dac93f03a4596b4b152707c9222a6e87e 100644
--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -538,6 +538,11 @@ def parse_args():
         default="source",
         help="[source|pb] Load models in generated `source` code"
              + "or `pb` file.")
+    parser.add_argument(
+        "--gpu_data_type",
+        type=str,
+        default="half",
+        help="[half|float] Data type used for GPU weights and computation.")
     return parser.parse_known_args()
 
 
@@ -809,7 +814,8 @@ def main(unused_args):
                 model_config["fast_conv"],
                 model_config["obfuscate"],
                 model_output_base_dir,
-                FLAGS.model_load_type)
+                FLAGS.model_load_type,
+                FLAGS.gpu_data_type)
 
     for target_abi in configs["target_abis"]:
         for target_soc in target_socs:
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 464ede8e9d19e818edcf70c1ac7b4bb297e852ab..065d40f057a5f9fab179415c83b8dfd4fe892f5d 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -470,7 +470,8 @@ def gen_model_code(model_codegen_dir,
                    fast_conv,
                    obfuscate,
                    model_output_dir,
-                   model_load_type):
+                   model_load_type,
+                   gpu_data_type):
    print("* Genearte model code")
    bazel_build_common("//mace/python/tools:converter")
 
@@ -499,6 +500,7 @@ def gen_model_code(model_codegen_dir,
              "--codegen_output=%s/model.cc" % model_codegen_dir,
              "--pb_output=%s/%s.pb" % (model_output_dir, model_tag),
              "--model_load_type=%s" % model_load_type,
+             "--gpu_data_type=%s" % gpu_data_type,
              _out=process_output,
              _bg=True,
              _err_to_out=True)
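
A note on the `TFOpType` definition introduced in tensorflow_converter.py: building the enum through the functional `Enum` API with a `str` mixin makes each member's name and value the raw TensorFlow op-type string, so the existing string-keyed converter dicts keep working unchanged, while a misspelled op name now fails loudly at import time instead of silently never matching. A minimal sketch of the pattern, using a shortened, hypothetical stand-in for the real op list:

    from enum import Enum

    ops = ['Conv2D', 'Relu6']  # stand-in for the full TFSupportedOps list
    OpType = Enum('OpType', [(op, op) for op in ops], type=str)

    print(OpType.Conv2D.name)         # 'Conv2D', usable as a dict key
    print(OpType.Conv2D == 'Conv2D')  # True: the str mixin compares as a plain string
    # A typo such as OpType.Conv2d raises AttributeError immediately, whereas a
    # mistyped raw string key would only surface as a missed lookup at runtime.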
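The repeated change from `self._skip_tensor.update(...)` to `self._skip_tensor.add(...)` is a behavioral fix, not a rename: `set.update` expects an iterable, and a Python string is an iterable of characters, so each call was inserting individual characters rather than the tensor name, and the skip check never matched. A quick illustration:

    skip = set()
    skip.update('shape:0')    # wrong: inserts 's', 'h', 'a', 'p', 'e', ':', '0'
    print('shape:0' in skip)  # False, so the tensor was never actually skipped

    skip = set()
    skip.add('shape:0')       # right: inserts the whole name as one element
    print('shape:0' in skip)  # True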
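The new `FLATTEN_ATROUS_CONV` pass in transformer.py relies on the identity TensorFlow itself uses to implement atrous (dilated) convolution: a `SpaceToBatchND` -> stride-1 `Conv2D` -> `BatchToSpaceND` chain computes the same result as a single dilated convolution, with the block shape playing the role of the dilation rate, so the three ops can be collapsed into one `Conv2D` whose dilation argument is taken from the block shape. A sketch of the equivalence, assuming TensorFlow 1.x (1.5 or later, where `tf.nn.conv2d` accepts `dilations`):

    import numpy as np
    import tensorflow as tf

    x = np.random.rand(1, 32, 32, 4).astype(np.float32)
    w = np.random.rand(3, 3, 4, 8).astype(np.float32)

    # tf.nn.atrous_conv2d emits the SpaceToBatchND -> Conv2D -> BatchToSpaceND
    # pattern that flatten_atrous_conv matches in the converted graph ...
    chained = tf.nn.atrous_conv2d(x, w, rate=2, padding='SAME')
    # ... and a single stride-1 Conv2D with dilations is what it is rewritten to.
    flattened = tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME',
                             dilations=[1, 2, 2, 1])

    with tf.Session() as sess:
        a, b = sess.run([chained, flattened])
    print(np.allclose(a, b, atol=1e-5))  # True: one op replaces three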
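Finally, the `--gpu_data_type` flag threaded through mace_tools.py, sh_commands.py, and tensor_util.py controls whether float32 weights are stored, and later computed on, as half-precision floats when targeting the GPU: `half` (the default) halves the weight footprint at the cost of roughly three significant decimal digits of precision, while `float` keeps full fp32 accuracy for models that are sensitive to it. A standalone sketch of what `TensorInfo` does with the flag:

    import numpy as np

    weights = np.array([0.123456789, 2.718281828], dtype=np.float32)

    half_data = bytearray(weights.astype(np.float16).tobytes())  # gpu_data_type == 'half'
    full_data = bytearray(weights.astype(np.float32).tobytes())  # gpu_data_type == 'float'

    print(len(half_data), len(full_data))  # 4 vs. 8 bytes: fp16 halves the stored size
    print(np.float16(0.123456789))         # ~0.1235: the fp16 precision loss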