diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel
index fcb7b20773fafa2c48b9e1636271308ec161d8cf..971b2a271c389b11c61f37e1def1ce49b4537a2e 100644
--- a/mace/core/BUILD.bazel
+++ b/mace/core/BUILD.bazel
@@ -28,21 +28,37 @@ cc_library(
             "*.cc",
             "runtime/cpu/*.cc",
         ],
+        exclude = [
+            "rpcmem.cc",
+        ],
     ) + if_opencl_enabled(glob(
         [
             "runtime/opencl/*.cc",
         ],
-    )) + if_hexagon_enabled([
+    )) + if_hexagon_or_hta_enabled([
+        "runtime/hexagon/hexagon_allocator.h",
+        "runtime/hexagon/hexagon_allocator.cc",
+        "runtime/hexagon/hexagon_device.cc",
+    ]) + if_hexagon_enabled([
         "runtime/hexagon/hexagon_dsp_wrapper.cc",
     ]) + if_hta_enabled([
+        "runtime/hexagon/hexagon_hta_transformer.h",
+        "runtime/hexagon/hexagon_hta_transformer.cc",
         "runtime/hexagon/hexagon_hta_wrapper.cc",
     ]) + if_apu_enabled(glob([
         "runtime/apu/*.cc",
-    ])),
-    hdrs = glob([
-        "*.h",
-        "runtime/cpu/*.h",
-    ]) + if_opencl_enabled(glob([
+    ])) + if_rpcmem_enabled([
+        "rpcmem.cc",
+    ]),
+    hdrs = glob(
+        [
+            "*.h",
+            "runtime/cpu/*.h",
+        ],
+        exclude = [
+            "rpcmem.h",
+        ],
+    ) + if_opencl_enabled(glob([
         "runtime/opencl/*.h",
     ])) + if_hexagon_or_hta_enabled(glob([
         "runtime/hexagon/hexagon_control_wrapper.h",
@@ -53,7 +69,9 @@ cc_library(
         "runtime/hexagon/*hta*.h",
     ])) + if_apu_enabled(glob([
         "runtime/apu/*.h"
-    ])),
+    ])) + if_rpcmem_enabled([
+        "rpcmem.h",
+    ]),
     copts = [
         "-Werror",
         "-Wextra",
diff --git a/mace/core/CMakeLists.txt b/mace/core/CMakeLists.txt
index 75b74bb9005d321355332de46f5921ba7d638822..25ab20bff9167b3936f8fb2101c3c9165016ea46 100644
--- a/mace/core/CMakeLists.txt
+++ b/mace/core/CMakeLists.txt
@@ -17,11 +17,17 @@ set(CORE_SRCS
   runtime/cpu/cpu_runtime.cc
 )
 
+if(MACE_ENABLE_RPCMEM)
+  set(CORE_SRCS ${CORE_SRCS} rpcmem.cc)
+  set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} rpcmem)
+endif(MACE_ENABLE_RPCMEM)
+
 if(MACE_ENABLE_OPENCL)
   set(CORE_SRCS ${CORE_SRCS}
     runtime/opencl/gpu_device.cc
     runtime/opencl/gpu_runtime.cc
     runtime/opencl/opencl_allocator.cc
+    runtime/opencl/opencl_helper.cc
     runtime/opencl/opencl_runtime.cc
     runtime/opencl/opencl_util.cc
     runtime/opencl/opencl_wrapper.cc
@@ -30,12 +36,20 @@ if(MACE_ENABLE_OPENCL)
   set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} generated_opencl_kernel dl)
 endif(MACE_ENABLE_OPENCL)
 
+if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
+  set(CORE_SRCS ${CORE_SRCS}
+    runtime/hexagon/hexagon_allocator.cc
+    runtime/hexagon/hexagon_device.cc
+  )
+endif(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
+
 if(MACE_ENABLE_HEXAGON_DSP)
   set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_dsp_wrapper.cc)
   set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hexagon_controller)
 endif(MACE_ENABLE_HEXAGON_DSP)
 
 if(MACE_ENABLE_HEXAGON_HTA)
+  set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_hta_transformer.cc)
   set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_hta_wrapper.cc)
   set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hta_hexagon_runtime)
 endif(MACE_ENABLE_HEXAGON_HTA)
@@ -45,10 +59,6 @@ if(MACE_ENABLE_MTK_APU)
   set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} apu-frontend)
 endif(MACE_ENABLE_MTK_APU)
 
-if(MACE_ENABLE_RPCMEM)
-    set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} rpcmem)
-endif(MACE_ENABLE_RPCMEM)
-
 add_library(core STATIC ${CORE_SRCS})
 target_link_libraries(core PRIVATE
   proto
diff --git a/mace/core/allocator.cc b/mace/core/allocator.cc
index 003b1c2c02e2907651c56193180439a08223f97a..28ceeedcd4828154ab4a53cc2a9198849252ae94 100644
--- a/mace/core/allocator.cc
+++ b/mace/core/allocator.cc
@@ -16,6 +16,54 @@
 
 namespace mace {
 
+MaceStatus Allocator::NewImage(const std::vector<size_t> &image_shape,
+                               const DataType dt,
+                               void **result) {
+  MACE_UNUSED(image_shape);
+  MACE_UNUSED(dt);
+  MACE_UNUSED(result);
+  MACE_NOT_IMPLEMENTED;
+  return MaceStatus::MACE_SUCCESS;
+}
+
+void Allocator::DeleteImage(void *data) {
+  MACE_UNUSED(data);
+  MACE_NOT_IMPLEMENTED;
+}
+
+void *Allocator::Map(void *buffer,
+                     size_t offset,
+                     size_t nbytes,
+                     bool finish_cmd_queue) {
+  MACE_UNUSED(nbytes);
+  MACE_UNUSED(finish_cmd_queue);
+  return reinterpret_cast<char*>(buffer) + offset;
+}
+
+void *Allocator::MapImage(void *buffer,
+                          const std::vector<size_t> &image_shape,
+                          std::vector<size_t> *mapped_image_pitch,
+                          bool finish_cmd_queue) {
+  MACE_UNUSED(buffer);
+  MACE_UNUSED(image_shape);
+  MACE_UNUSED(mapped_image_pitch);
+  MACE_UNUSED(finish_cmd_queue);
+  MACE_NOT_IMPLEMENTED;
+  return nullptr;
+}
+
+void Allocator::Unmap(void *buffer, void *mapper_ptr) {
+  MACE_UNUSED(buffer);
+  MACE_UNUSED(mapper_ptr);
+}
+
+#ifdef MACE_ENABLE_RPCMEM
+Rpcmem *Allocator::rpcmem() {
+  MACE_NOT_IMPLEMENTED;
+  return nullptr;
+}
+#endif  // MACE_ENABLE_RPCMEM
+
 Allocator *GetCPUAllocator() {
   static CPUAllocator allocator;
   return &allocator;
diff --git a/mace/core/allocator.h b/mace/core/allocator.h
index e4e7b35f0b58a9e98a70bfa6f3559ab1e039463b..c9f8f8f6c10db6e27047aa9aedf42dd8eb217a18 100644
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -27,6 +27,10 @@
 #include "mace/public/mace.h"
 #include "mace/utils/logging.h"
 
+#ifdef MACE_ENABLE_RPCMEM
+#include "mace/core/rpcmem.h"
+#endif  // MACE_ENABLE_RPCMEM
+
 namespace mace {
 
 #if defined(__hexagon__)
@@ -50,19 +54,22 @@ class Allocator {
   virtual MaceStatus New(size_t nbytes, void **result) = 0;
   virtual MaceStatus NewImage(const std::vector<size_t> &image_shape,
                               const DataType dt,
-                              void **result) = 0;
+                              void **result);
   virtual void Delete(void *data) = 0;
-  virtual void DeleteImage(void *data) = 0;
+  virtual void DeleteImage(void *data);
   virtual void *Map(void *buffer,
                     size_t offset,
                     size_t nbytes,
-                    bool finish_cmd_queue) const = 0;
+                    bool finish_cmd_queue);
   virtual void *MapImage(void *buffer,
                          const std::vector<size_t> &image_shape,
                          std::vector<size_t> *mapped_image_pitch,
-                         bool finish_cmd_queue) const = 0;
-  virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
+                         bool finish_cmd_queue);
+  virtual void Unmap(void *buffer, void *mapper_ptr);
   virtual bool OnHost() const = 0;
+#ifdef MACE_ENABLE_RPCMEM
+  virtual Rpcmem *rpcmem();
+#endif  // MACE_ENABLE_RPCMEM
 };
 
 class CPUAllocator : public Allocator {
@@ -84,46 +91,12 @@ class CPUAllocator : public Allocator {
     return MaceStatus::MACE_SUCCESS;
   }
 
-  MaceStatus NewImage(const std::vector<size_t> &shape,
-                      const DataType dt,
-                      void **result) override {
-    MACE_UNUSED(shape);
-    MACE_UNUSED(dt);
-    MACE_UNUSED(result);
-    LOG(FATAL) << "Allocate CPU image";
-    return MaceStatus::MACE_SUCCESS;
-  }
-
   void Delete(void *data) override {
     MACE_CHECK_NOTNULL(data);
     VLOG(3) << "Free CPU buffer";
     free(data);
   }
-  void DeleteImage(void *data) override {
-    LOG(FATAL) << "Free CPU image";
-    free(data);
-  };
-  void *Map(void *buffer,
-            size_t offset,
-            size_t nbytes,
-            bool finish_cmd_queue) const override {
-    MACE_UNUSED(nbytes);
-    MACE_UNUSED(finish_cmd_queue);
-    return reinterpret_cast<char*>(buffer) + offset;
-  }
-  void *MapImage(void *buffer,
-                 const std::vector<size_t> &image_shape,
-                 std::vector<size_t> *mapped_image_pitch,
-                 bool finish_cmd_queue) const override {
-    MACE_UNUSED(image_shape);
-    MACE_UNUSED(mapped_image_pitch);
-    MACE_UNUSED(finish_cmd_queue);
-    return buffer;
-  }
-  void Unmap(void *buffer, void *mapper_ptr) const override {
-    MACE_UNUSED(buffer);
-    MACE_UNUSED(mapper_ptr);
-  }
+
   bool OnHost() const override { return true; }
 };
 
diff --git a/mace/core/quantize.h b/mace/core/quantize.h
index c7b6666dda378e99ad7a99714a6c7dca74b41619..439c9522b20212495e3fc7d0e2a5a9b79df012f0 100644
--- a/mace/core/quantize.h
+++ b/mace/core/quantize.h
@@ -117,9 +117,12 @@ inline void GetOutputMultiplierAndShift(
 template<typename F, typename Q>
 class QuantizeUtil {
  public:
+  QuantizeUtil() = default;
   explicit QuantizeUtil(utils::ThreadPool *thread_pool)
       : thread_pool_(thread_pool) {}
 
+  void Init(utils::ThreadPool *thread_pool) { thread_pool_ = thread_pool; }
+
   void QuantizeWithScaleAndZeropoint(const float *input,
                                      const index_t size,
                                      float scale,
diff --git a/mace/core/rpcmem.cc b/mace/core/rpcmem.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3b3ed396ab1f7199a2f2f2dad109c7fc582d634
--- /dev/null
+++ b/mace/core/rpcmem.cc
@@ -0,0 +1,54 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/rpcmem.h"
+
+#include "mace/utils/logging.h"
+
+namespace mace {
+
+Rpcmem::Rpcmem() {
+  rpcmem_init(&rm);
+  MACE_CHECK(rm.flag == 0, "rpcmem_init failed!");
+}
+
+Rpcmem::~Rpcmem() {
+  rpcmem_deinit(&rm);
+}
+
+void *Rpcmem::New(int heapid, uint32_t flags, int nbytes) {
+  return rpcmem_alloc(&rm, heapid, flags, nbytes);
+}
+
+void *Rpcmem::New(int nbytes) {
+  return New(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED, nbytes);
+}
+
+void Rpcmem::Delete(void *data) {
+  rpcmem_free(&rm, data);
+}
+
+int Rpcmem::ToFd(void *data) {
+  return rpcmem_to_fd(&rm, data);
+}
+
+int Rpcmem::SyncCacheStart(void *data) {
+  return rpcmem_sync_cache(&rm, data, RPCMEM_SYNC_START);
+}
+
+int Rpcmem::SyncCacheEnd(void *data) {
+  return rpcmem_sync_cache(&rm, data, RPCMEM_SYNC_END);
+}
+
+}  // namespace mace
diff --git a/mace/core/rpcmem.h b/mace/core/rpcmem.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d5795c21c0541705f3bcf0d9b04f1e74fd024e2
--- /dev/null
+++ b/mace/core/rpcmem.h
@@ -0,0 +1,36 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_RPCMEM_H_
+#define MACE_CORE_RPCMEM_H_
+
+#include "third_party/rpcmem/rpcmem.h"
+
+namespace mace {
+class Rpcmem {
+ public:
+  Rpcmem();
+  ~Rpcmem();
+  void *New(int heapid, uint32_t flags, int nbytes);
+  void *New(int nbytes);
+  void Delete(void *data);
+  int ToFd(void *data);
+  int SyncCacheStart(void *data);
+  int SyncCacheEnd(void *data);
+
+ private:
+  rpcmem rm;
+};
+}  // namespace mace
+#endif  // MACE_CORE_RPCMEM_H_
diff --git a/mace/core/runtime/hexagon/hexagon_allocator.cc b/mace/core/runtime/hexagon/hexagon_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c1f034ca5a0db2aca3295b84449077d71fe62d1
--- /dev/null
+++ b/mace/core/runtime/hexagon/hexagon_allocator.cc
@@ -0,0 +1,38 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/hexagon/hexagon_allocator.h"
+
+namespace mace {
+
+MaceStatus HexagonAllocator::New(size_t nbytes, void **result) {
+  *result = rpcmem_.New(nbytes);
+  MACE_CHECK_NOTNULL(*result);
+  memset(*result, 0, nbytes);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+void HexagonAllocator::Delete(void *data) {
+  rpcmem_.Delete(data);
+}
+
+bool HexagonAllocator::OnHost() const {
+    return true;
+}
+
+Rpcmem *HexagonAllocator::rpcmem() {
+  return &rpcmem_;
+}
+
+}  // namespace mace
diff --git a/mace/core/runtime/hexagon/hexagon_allocator.h b/mace/core/runtime/hexagon/hexagon_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4143996d424141335032eb31abb1efe91134c4a
--- /dev/null
+++ b/mace/core/runtime/hexagon/hexagon_allocator.h
@@ -0,0 +1,38 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_
+
+#include "mace/core/allocator.h"
+
+namespace mace {
+
+class HexagonAllocator : public Allocator {
+ public:
+  MaceStatus New(size_t nbytes, void **result) override;
+
+  void Delete(void *buffer) override;
+
+  bool OnHost() const override;
+
+  Rpcmem *rpcmem() override;
+
+ private:
+  Rpcmem rpcmem_;
+};
+
+}  // namespace mace
+
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_
diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.h b/mace/core/runtime/hexagon/hexagon_control_wrapper.h
index 0ab7e3f9e8cfa46a80e684d9fd66cc45b3bf6a12..801c095ddf4b20df0db3b6a7daa3bbc370f3ac35 100644
--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.h
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.h
@@ -25,27 +25,15 @@
 #include "mace/public/mace.h"
 
 namespace mace {
-
 struct InOutInfo {
   InOutInfo(const std::string &name,
             const std::vector<index_t> &shape,
-            const DataType data_type,
-            const float scale,
-            const int32_t zero_point,
-            std::unique_ptr<Tensor> tensor_u8)
-      :  name(name),
-         shape(shape),
-         data_type(data_type),
-         scale(scale),
-         zero_point(zero_point),
-         tensor_u8(std::move(tensor_u8)) {}
+            const DataType data_type)
+      :  name(name), shape(shape), data_type(data_type) {}
 
   std::string name;
   std::vector<index_t> shape;
   DataType data_type;
-  float scale;
-  int32_t zero_point;
-  std::unique_ptr<Tensor> tensor_u8;
 };
 
 class HexagonControlWrapper {
@@ -79,8 +67,6 @@ class HexagonControlWrapper {
 
   int nn_id_;
 
-  std::vector<InOutInfo> input_info_;
-  std::vector<InOutInfo> output_info_;
   int num_inputs_;
   int num_outputs_;
 
diff --git a/mace/core/runtime/hexagon/hexagon_device.cc b/mace/core/runtime/hexagon/hexagon_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8a0aa4e49873a306dcfa795a2a96d351f534dc5
--- /dev/null
+++ b/mace/core/runtime/hexagon/hexagon_device.cc
@@ -0,0 +1,71 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/hexagon/hexagon_device.h"
+
+namespace mace {
+
+HexagonDevice::HexagonDevice(DeviceType device_type,
+                             utils::ThreadPool *thread_pool
+#ifdef MACE_ENABLE_OPENCL
+                             , std::unique_ptr<GPUDevice> gpu_device
+#endif  // MACE_ENABLE_OPENCL
+                             )
+      : CPUDevice(0, AFFINITY_NONE, thread_pool),
+        allocator_(make_unique<HexagonAllocator>()),
+        device_type_(device_type)
+#ifdef MACE_ENABLE_OPENCL
+        , gpu_device_(std::move(gpu_device))
+#endif  // MACE_ENABLE_OPENCL
+         {}
+
+#ifdef MACE_ENABLE_OPENCL
+GPURuntime *HexagonDevice::gpu_runtime() {
+  return gpu_device_->gpu_runtime();
+}
+#endif  // MACE_ENABLE_OPENCL
+
+Allocator *HexagonDevice::allocator() {
+#ifdef MACE_ENABLE_OPENCL
+  return gpu_device_->allocator();
+#else
+  return allocator_.get();
+#endif  // MACE_ENABLE_OPENCL
+}
+
+DeviceType HexagonDevice::device_type() const {
+  return device_type_;
+}
+
+std::unique_ptr<HexagonControlWrapper> CreateHexagonControlWrapper(
+    Device *device) {
+  std::unique_ptr<HexagonControlWrapper> hexagon_controller;
+  auto device_type = device->device_type();
+  switch (device_type) {
+#ifdef MACE_ENABLE_HEXAGON
+    case HEXAGON:
+      hexagon_controller = make_unique<HexagonDSPWrapper>();
+      break;
+#endif
+#ifdef MACE_ENABLE_HTA
+    case HTA:
+      hexagon_controller = make_unique<HexagonHTAWrapper>(device);
+      break;
+#endif
+    default:LOG(FATAL) << "Not supported Hexagon device type: " << device_type;
+  }
+
+  return hexagon_controller;
+}
+}  // namespace mace
diff --git a/mace/core/runtime/hexagon/hexagon_device.h b/mace/core/runtime/hexagon/hexagon_device.h
index b17b19e5469cb5bb01e42f9beecdba286d8454af..660965eb9f89c46a395b0087ea6f864304a2f668 100644
--- a/mace/core/runtime/hexagon/hexagon_device.h
+++ b/mace/core/runtime/hexagon/hexagon_device.h
@@ -19,6 +19,7 @@
 #include <utility>
 
 #include "mace/core/device.h"
+#include "mace/core/runtime/hexagon/hexagon_allocator.h"
 #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
 #ifdef MACE_ENABLE_HEXAGON
 #include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h"
@@ -26,44 +27,39 @@
 #ifdef MACE_ENABLE_HTA
 #include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"
 #endif
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/gpu_device.h"
+#include "mace/core/runtime/opencl/gpu_runtime.h"
+#endif
 
 namespace mace {
 
 class HexagonDevice : public CPUDevice {
  public:
-  explicit HexagonDevice(DeviceType device_type,
-                         utils::ThreadPool *thread_pool)
-      : CPUDevice(0, AFFINITY_NONE, thread_pool),
-        device_type_(device_type) {}
+  HexagonDevice(DeviceType device_type,
+#ifdef MACE_ENABLE_OPENCL
+                utils::ThreadPool *thread_pool,
+                std::unique_ptr<GPUDevice> gpu_device);
+#else
+                utils::ThreadPool *thread_pool);
+#endif  // MACE_ENABLE_OPENCL
 
-  DeviceType device_type() const override {
-    return device_type_;
-  };
+#ifdef MACE_ENABLE_OPENCL
+  GPURuntime *gpu_runtime() override;
+#endif  // MACE_ENABLE_OPENCL
+  Allocator *allocator() override;
+  DeviceType device_type() const override;
 
  private:
+  std::unique_ptr<HexagonAllocator> allocator_;
   DeviceType device_type_;
+#ifdef MACE_ENABLE_OPENCL
+  std::unique_ptr<GPUDevice> gpu_device_;
+#endif  // MACE_ENABLE_OPENCL
 };
 
 std::unique_ptr<HexagonControlWrapper> CreateHexagonControlWrapper(
-    Device *device) {
-  std::unique_ptr<HexagonControlWrapper> hexagon_controller;
-  auto device_type = device->device_type();
-  switch (device_type) {
-#ifdef MACE_ENABLE_HEXAGON
-    case HEXAGON:
-      hexagon_controller = make_unique<HexagonDSPWrapper>();
-      break;
-#endif
-#ifdef MACE_ENABLE_HTA
-    case HTA:
-      hexagon_controller = make_unique<HexagonHTAWrapper>(device);
-      break;
-#endif
-    default:LOG(FATAL) << "Not supported Hexagon device type: " << device_type;
-  }
-
-  return hexagon_controller;
-}
+    Device *device);
 
 }  // namespace mace
 #endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_
diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc
index 9d17ab44c18f406c86d1c23fcd60ca22041cb462..dedff7f908daf2afadca967b01517333b2314a75 100644
--- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc
@@ -107,8 +107,6 @@ hexagon_nn_corner_type TransformCornerType(HexagonNNCornerType corner) {
 }  // namespace
 
 HexagonDSPWrapper::HexagonDSPWrapper() {
-  hexnn_controller_init();
-
   std::string env_log_execute_time_str;
   GetEnv("MACE_DSP_LOG_EXECUTE_TIME", &env_log_execute_time_str);
   if (env_log_execute_time_str.empty()) {
@@ -119,9 +117,7 @@ HexagonDSPWrapper::HexagonDSPWrapper() {
   }
 }
 
-HexagonDSPWrapper::~HexagonDSPWrapper() {
-  hexnn_controller_deinit();
-}
+HexagonDSPWrapper::~HexagonDSPWrapper() {}
 
 int HexagonDSPWrapper::GetVersion() {
   int version;
@@ -258,10 +254,7 @@ bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def,
     }
     input_info_.emplace_back(input_info.name(),
                              input_shape,
-                             input_info.data_type(),
-                             input_info.scale(),
-                             input_info.zero_point(),
-                             make_unique<Tensor>());
+                             input_info.data_type());
   }
 
   // output info
@@ -275,10 +268,7 @@ bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def,
     }
     output_info_.emplace_back(output_info.name(),
                               output_shape,
-                              output_info.data_type(),
-                              output_info.scale(),
-                              output_info.zero_point(),
-                              make_unique<Tensor>());
+                              output_info.data_type());
     VLOG(1) << "OutputInfo: "
             << "\n\t shape: " << output_shape[0] << " " << output_shape[1]
             << " " << output_shape[2] << " " << output_shape[3]
diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h
index 831d163f8ac02ac692f4edc3a3d0ff5c402a6bf6..2ec98e7ab6e590b8d44fbdb9c18e4da20cb66a8a 100644
--- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h
+++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h
@@ -56,7 +56,8 @@ class HexagonDSPWrapper : public HexagonControlWrapper {
   uint64_t GetLastExecuteCycles();
 
   bool log_execute_time_;
-
+  std::vector<InOutInfo> input_info_;
+  std::vector<InOutInfo> output_info_;
   MACE_DISABLE_COPY_AND_ASSIGN(HexagonDSPWrapper);
 };
 }  // namespace mace
diff --git a/mace/core/runtime/hexagon/hexagon_hta_transformer.cc b/mace/core/runtime/hexagon/hexagon_hta_transformer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf0b753122bf1624dc7e37c59dfe090204d00d41
--- /dev/null
+++ b/mace/core/runtime/hexagon/hexagon_hta_transformer.cc
@@ -0,0 +1,644 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
+
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mace/core/op_context.h"
+#include "mace/core/quantize.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/utils/math.h"
+
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/opencl_helper.h"
+#endif  // MACE_ENABLE_OPENCL
+
+namespace mace {
+namespace {
+template <DeviceType D>
+class QuantizeTransformer;
+
+template <>
+class QuantizeTransformer<DeviceType::CPU> : public BaseTransformer {
+ public:
+  void Init(Device *device) override {
+    device_ = device;
+    quantize_util_.Init(&device_->cpu_runtime()->thread_pool());
+  }
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "Quantize on CPU");
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+    output->SetScale(input->scale());
+    output->SetZeroPoint(input->zero_point());
+    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    auto input_data = input->data<float>();
+    auto output_data = output->mutable_data<uint8_t>();
+    quantize_util_.QuantizeWithScaleAndZeropoint(
+        input_data, input->size(), input->scale(), input->zero_point(),
+        output_data);
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  QuantizeUtil<float, uint8_t> quantize_util_;
+};
+
+#ifdef MACE_ENABLE_OPENCL
+template <>
+class QuantizeTransformer<DeviceType::GPU> : public BaseTransformer {
+ public:
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "Quantize on GPU");
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+    output->SetScale(input->scale());
+    output->SetZeroPoint(input->zero_point());
+    const uint32_t gws = static_cast<uint32_t>(RoundUpDiv4(output->size()));
+    OpenCLRuntime *runtime = device_->gpu_runtime()->opencl_runtime();
+    if (kernel_.get() == nullptr) {
+      std::set<std::string> built_options;
+      std::string kernel_name = MACE_OBFUSCATE_SYMBOL("buffer_quantize");
+      built_options.emplace("-Dbuffer_quantize=" + kernel_name);
+      built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
+      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype()));
+      MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name,
+                                                built_options, &kernel_));
+    }
+
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, gws);
+    kernel_.setArg(idx++, input->scale());
+    kernel_.setArg(idx++, input->zero_point());
+    kernel_.setArg(idx++, *(input->opencl_buffer()));
+    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
+               "buffer offset not aligned");
+    kernel_.setArg(idx++,
+                   static_cast<uint32_t>(input->buffer_offset() /
+                                         GetEnumTypeSize(input->dtype())));
+    kernel_.setArg(idx++, *(output->opencl_buffer()));
+
+    const uint32_t lws = static_cast<uint32_t>(
+        RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
+    cl::Event event;
+    cl_int error;
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange, cl::NDRange(gws), cl::NDRange(lws), nullptr,
+          &event);
+    } else {
+      uint32_t roundup_gws = RoundUp(gws, lws);
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange, cl::NDRange(roundup_gws), cl::NDRange(lws),
+          nullptr, &event);
+    }
+    MACE_CL_RET_STATUS(error);
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  cl::Kernel kernel_;
+};
+#endif  // MACE_ENABLE_OPENCL
+
+template <DeviceType D>
+class DequantizeTransformer;
+
+template <>
+class DequantizeTransformer<DeviceType::CPU> : public BaseTransformer {
+ public:
+  void Init(Device *device) override {
+    device_ = device;
+    quantize_util_.Init(&device_->cpu_runtime()->thread_pool());
+  }
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "Dequantize on CPU");
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+    output->SetScale(input->scale());
+    output->SetZeroPoint(input->zero_point());
+    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    auto input_data = input->data<uint8_t>();
+    auto output_data = output->mutable_data<float>();
+    quantize_util_.Dequantize(input_data, input->size(), input->scale(),
+                              input->zero_point(), output_data);
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  QuantizeUtil<float, uint8_t> quantize_util_;
+};
+
+#ifdef MACE_ENABLE_OPENCL
+template <>
+class DequantizeTransformer<DeviceType::GPU> : public BaseTransformer {
+ public:
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "Dequantize on GPU");
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+    output->SetScale(input->scale());
+    output->SetZeroPoint(input->zero_point());
+    const uint32_t gws = static_cast<uint32_t>(RoundUpDiv4(output->size()));
+    OpenCLRuntime *runtime = device_->gpu_runtime()->opencl_runtime();
+    if (kernel_.get() == nullptr) {
+      std::set<std::string> built_options;
+      std::string kernel_name = MACE_OBFUSCATE_SYMBOL("buffer_dequantize");
+      built_options.emplace("-Dbuffer_dequantize=" + kernel_name);
+      built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
+      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype()));
+      MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name,
+                                                built_options, &kernel_));
+    }
+
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, gws);
+    kernel_.setArg(idx++, input->scale());
+    kernel_.setArg(idx++, input->zero_point());
+    kernel_.setArg(idx++, *(input->opencl_buffer()));
+    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
+               "buffer offset not aligned");
+    kernel_.setArg(idx++,
+                   static_cast<uint32_t>(input->buffer_offset() /
+                                         GetEnumTypeSize(input->dtype())));
+    kernel_.setArg(idx++, *(output->opencl_buffer()));
+
+    const uint32_t lws = static_cast<uint32_t>(
+        RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
+    cl::Event event;
+    cl_int error;
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange, cl::NDRange(gws), cl::NDRange(lws), nullptr,
+          &event);
+    } else {
+      uint32_t roundup_gws = RoundUp(gws, lws);
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange, cl::NDRange(roundup_gws), cl::NDRange(lws),
+          nullptr, &event);
+    }
+    MACE_CL_RET_STATUS(error);
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  cl::Kernel kernel_;
+};
+#endif  // MACE_ENABLE_OPENCL
+
+template <DeviceType D>
+class NHWCToNCHW32Transformer;
+template <>
+class NHWCToNCHW32Transformer<DeviceType::CPU> : public BaseTransformer {
+ public:
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "NHWCToNCHW32Transformer on CPU");
+    int32_t padding_value = input->zero_point();
+    index_t batch = input->dim(0);
+    index_t height = input->dim(1);
+    index_t width = input->dim(2);
+    index_t channels = input->dim(3);
+    index_t height_stride = width * channels;
+    index_t batch_stride = height * width * channels;
+
+    index_t output_width = RoundUp<index_t>(width, 32);
+    index_t output_channel_stride = height * output_width;
+    index_t output_batch_stride = channels * height * output_width;
+
+    output->Resize({batch, channels, height, output_width});
+
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard output_mapper(output);
+    const auto input_data = input->data<uint8_t>();
+    auto output_data = output->mutable_data<uint8_t>();
+
+    device_->cpu_runtime()->thread_pool().Compute2D(
+        [=](index_t start0, index_t end0, index_t step0, index_t start1,
+            index_t end1, index_t step1) {
+          for (index_t b = start0; b < end0; b += step0) {
+            for (index_t c = start1; c < end1; c += step1) {
+              index_t input_offset = b * batch_stride + c;
+              index_t output_offset =
+                  b * output_batch_stride + c * output_channel_stride;
+              for (index_t h = 0; h < height; ++h) {
+                for (index_t w = 0; w < width; ++w) {
+                  output_data[output_offset + w] =
+                      input_data[input_offset + w * channels];
+                }
+                std::fill_n(output_data + output_offset + width,
+                            output_width - width, padding_value);
+                input_offset += height_stride;
+                output_offset += output_width;
+              }
+            }
+          }
+        },
+        0, batch, 1, 0, channels, 1);
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+};
+
+#ifdef MACE_ENABLE_OPENCL
+template <>
+class NHWCToNCHW32Transformer<DeviceType::GPU> : public BaseTransformer {
+ public:
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "NHWCToNCHW32Transformer on GPU");
+    const index_t batch = input->dim(0);
+    const index_t height = input->dim(1);
+    const index_t width = input->dim(2);
+    const index_t channels = input->dim(3);
+    const index_t output_width = RoundUp<index_t>(width, 32);
+    std::vector<index_t> transformed_shape = {batch, channels, height,
+                                              output_width};
+    uint32_t gws[3];
+    gws[0] = static_cast<uint32_t>(RoundUpDiv4<index_t>(output_width));
+    gws[1] = static_cast<uint32_t>(height);
+    gws[2] = static_cast<uint32_t>(batch * channels);
+    MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
+
+    if (kernel_.get() == nullptr) {
+      std::set<std::string> built_options;
+      std::string kernel_name =
+          MACE_OBFUSCATE_SYMBOL("transform_nhwc_to_nchw32");
+      built_options.emplace("-Dtransform_nhwc_to_nchw32=" + kernel_name);
+      std::string data_dt = DtToCLDt(input->dtype());
+      built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
+      built_options.emplace("-DDATA_TYPE=" + data_dt);
+      MACE_RETURN_IF_ERROR(
+          device_->gpu_runtime()->opencl_runtime()->BuildKernel(
+              "buffer_transform", kernel_name, built_options, &kernel_));
+    }
+    uint32_t idx = 0;
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input->opencl_buffer()));
+    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
+               "buffer offset not aligned");
+    kernel_.setArg(idx++,
+                   static_cast<uint32_t>(input->buffer_offset() /
+                                         GetEnumTypeSize(input->dtype())));
+    kernel_.setArg(idx++, input->zero_point());
+    kernel_.setArg(idx++, *(output->opencl_buffer()));
+    kernel_.setArg(idx++, static_cast<int32_t>(batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(height));
+    kernel_.setArg(idx++, static_cast<int32_t>(width));
+    kernel_.setArg(idx++, static_cast<int32_t>(channels));
+
+    std::string tuning_key = Concat("transform_nhwc_to_nchw32",
+                                    transformed_shape[0], transformed_shape[1],
+                                    transformed_shape[2], transformed_shape[3]);
+    std::vector<uint32_t> lws = {4, 4, 4, 0};
+    MACE_RETURN_IF_ERROR(
+        TuningOrRun3DKernel(device_->gpu_runtime()->opencl_runtime(), kernel_,
+                            tuning_key, gws, lws, nullptr));
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  cl::Kernel kernel_;
+};
+#endif  // MACE_ENABLE_OPENCL
+
+template <DeviceType D>
+class NHWCToD32Transformer;
+
+template <>
+class NHWCToD32Transformer<DeviceType::CPU> : public BaseTransformer {
+ public:
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "NHWCToD32Transformer on CPU");
+    index_t batch = input->dim(0);
+    index_t height = input->dim(1);
+    index_t width = input->dim(2);
+    index_t channels = input->dim(3);
+    index_t height_stride = width * channels;
+    index_t batch_stride = height * width * channels;
+
+    index_t channel_slices = RoundUpDiv(channels, static_cast<index_t>(32));
+    index_t output_channel_slices_stride = width * 32;
+    index_t output_height_stride = channel_slices * width * 32;
+    index_t output_batch_stride = height * channel_slices * width * 32;
+
+    std::vector<index_t> output_shape{batch, height, channel_slices, width, 32};
+    output->Resize(output_shape);
+
+    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    auto input_data = input->data<uint8_t>();
+    auto output_data = output->mutable_data<uint8_t>();
+    std::fill_n(output_data, output->size(), input->zero_point());
+
+    device_->cpu_runtime()->thread_pool().Compute2D(
+        [=](index_t start0, index_t end0, index_t step0, index_t start1,
+            index_t end1, index_t step1) {
+          for (index_t b = start0; b < end0; b += step0) {
+            for (index_t h = start1; h < end1; h += step1) {
+              index_t input_offset = b * batch_stride + h * height_stride;
+              index_t output_offset =
+                  b * output_batch_stride + h * output_height_stride;
+              for (index_t w = 0; w < width; ++w) {
+                for (index_t c = 0; c < channels; ++c) {
+                  output_data[output_offset +
+                              c / 32 * output_channel_slices_stride + c % 32] =
+                      input_data[input_offset + c];
+                }
+                input_offset += channels;
+                output_offset += 32;
+              }
+            }
+          }
+        },
+        0, batch, 1, 0, height, 1);
+    return MaceStatus::MACE_SUCCESS;
+  }
+};
+
+#ifdef MACE_ENABLE_OPENCL
+template <>
+class NHWCToD32Transformer<DeviceType::GPU> : public BaseTransformer {
+ public:
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "D32ToNHWCTransformer on GPU");
+    const index_t batch = input->dim(0);
+    const index_t height = input->dim(1);
+    const index_t width = input->dim(2);
+    const index_t channels = input->dim(3);
+    const index_t channel_slices = RoundUpDiv<index_t>(channels, 32);
+    std::vector<index_t> output_shape{batch, height, channel_slices, width, 32};
+    output->Resize(output_shape);
+
+    uint32_t gws[3];
+    gws[0] = static_cast<uint32_t>(RoundUpDiv4<index_t>(width * 32));
+    gws[1] = static_cast<uint32_t>(channel_slices);
+    gws[2] = static_cast<uint32_t>(batch * height);
+
+    if (kernel_.get() == nullptr) {
+      std::set<std::string> built_options;
+      std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_nhwc_to_d32");
+      built_options.emplace("-Dtransform_nhwc_to_d32=" + kernel_name);
+      std::string data_dt = DtToCLDt(input->dtype());
+      built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
+      built_options.emplace("-DDATA_TYPE=" + data_dt);
+      MACE_RETURN_IF_ERROR(
+          device_->gpu_runtime()->opencl_runtime()->BuildKernel(
+              "buffer_transform", kernel_name, built_options, &kernel_));
+    }
+
+    uint32_t idx = 0;
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input->opencl_buffer()));
+    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
+               "buffer offset not aligned");
+    kernel_.setArg(idx++,
+                   static_cast<uint32_t>(input->buffer_offset() /
+                                         GetEnumTypeSize(input->dtype())));
+    kernel_.setArg(idx++, input->zero_point());
+    kernel_.setArg(idx++, *(output->opencl_buffer()));
+    kernel_.setArg(idx++, static_cast<int32_t>(batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(height));
+    kernel_.setArg(idx++, static_cast<int32_t>(width));
+    kernel_.setArg(idx++, static_cast<int32_t>(channels));
+    kernel_.setArg(idx++, static_cast<int32_t>(channel_slices));
+
+
+    std::string tuning_key =
+        Concat("transform_nhwc_to_d32", batch, height, width, channels);
+    std::vector<uint32_t> lws = {4, 4, 4, 0};
+    MACE_RETURN_IF_ERROR(
+        TuningOrRun3DKernel(device_->gpu_runtime()->opencl_runtime(), kernel_,
+                            tuning_key, gws, lws, nullptr));
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  cl::Kernel kernel_;
+};
+#endif  // MACE_ENABLE_OPENCL
+
+template <DeviceType D>
+class D32ToNHWCTransformer;
+
+template <>
+class D32ToNHWCTransformer<DeviceType::CPU> : public BaseTransformer {
+ public:
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "D32ToNHWCTransformer on CPU");
+    index_t batch = output->dim(0);
+    index_t height = output->dim(1);
+    index_t width = output->dim(2);
+    index_t channel = output->dim(3);
+    index_t height_stride = width * channel;
+    index_t batch_stride = height * width * channel;
+
+    index_t channel_slices = RoundUpDiv(channel, static_cast<index_t>(32));
+    index_t input_channel_slices_stride = width * 32;
+    index_t input_height_stride = channel_slices * width * 32;
+    index_t input_batch_stride = height * channel_slices * width * 32;
+
+    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    auto input_data = input->data<uint8_t>();
+    auto output_data = output->mutable_data<uint8_t>();
+
+    device_->cpu_runtime()->thread_pool().Compute2D(
+        [=](index_t start0, index_t end0, index_t step0, index_t start1,
+            index_t end1, index_t step1) {
+          for (index_t b = start0; b < end0; b += step0) {
+            for (index_t h = start1; h < end1; h += step1) {
+              index_t input_offset =
+                  b * input_batch_stride + h * input_height_stride;
+              index_t output_offset = b * batch_stride + h * height_stride;
+              for (index_t w = 0; w < width; ++w) {
+                for (index_t c = 0; c < channel; ++c) {
+                  output_data[output_offset + c] =
+                      input_data[input_offset +
+                                 c / 32 * input_channel_slices_stride + c % 32];
+                }
+                input_offset += 32;
+                output_offset += channel;
+              }
+            }
+          }
+        },
+        0, batch, 1, 0, height, 1);
+    return MaceStatus::MACE_SUCCESS;
+  }
+};
+
+#ifdef MACE_ENABLE_OPENCL
+template <>
+class D32ToNHWCTransformer<DeviceType::GPU> : public BaseTransformer {
+ public:
+  MaceStatus Compute(const Tensor *input, Tensor *output) override {
+    MACE_LATENCY_LOGGER(1, "D32ToNHWCTransformer on GPU");
+    const index_t batch = output->dim(0);
+    const index_t height = output->dim(1);
+    const index_t width = output->dim(2);
+    const index_t channels = output->dim(3);
+    const index_t channel_slices = RoundUpDiv<index_t>(channels, 32);
+
+    uint32_t gws[3];
+    gws[0] = static_cast<uint32_t>(RoundUpDiv4<index_t>(channels));
+    gws[1] = static_cast<uint32_t>(width);
+    gws[2] = static_cast<uint32_t>(batch * height);
+
+    if (kernel_.get() == nullptr) {
+      std::set<std::string> built_options;
+      std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_d32_to_nhwc");
+      built_options.emplace("-Dtransform_d32_to_nhwc=" + kernel_name);
+      std::string data_dt = DtToCLDt(input->dtype());
+      built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
+      built_options.emplace("-DDATA_TYPE=" + data_dt);
+      MACE_RETURN_IF_ERROR(
+          device_->gpu_runtime()->opencl_runtime()->BuildKernel(
+              "buffer_transform", kernel_name, built_options, &kernel_));
+    }
+
+    uint32_t idx = 0;
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input->opencl_buffer()));
+    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
+               "buffer offset not aligned");
+    kernel_.setArg(idx++,
+                   static_cast<uint32_t>(input->buffer_offset() /
+                                         GetEnumTypeSize(input->dtype())));
+    kernel_.setArg(idx++, *(output->opencl_buffer()));
+    kernel_.setArg(idx++, static_cast<int32_t>(batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(height));
+    kernel_.setArg(idx++, static_cast<int32_t>(width));
+    kernel_.setArg(idx++, static_cast<int32_t>(channels));
+    kernel_.setArg(idx++, static_cast<int32_t>(channel_slices));
+
+
+    std::string tuning_key =
+        Concat("transform_d32_to_nhwc", batch, height, width, channels);
+    std::vector<uint32_t> lws = {4, 4, 4, 0};
+    MACE_RETURN_IF_ERROR(
+        TuningOrRun3DKernel(device_->gpu_runtime()->opencl_runtime(), kernel_,
+                            tuning_key, gws, lws, nullptr));
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  cl::Kernel kernel_;
+};
+#endif  // MACE_ENABLE_OPENCL
+}  // namespace
+
+template <DeviceType D>
+void HexagonHTATranformer<D>::Init(Device *device) {
+  device_ = device;
+  quantizer_ = make_unique<QuantizeTransformer<D>>();
+  quantizer_->Init(device);
+  dequantizer_ = make_unique<DequantizeTransformer<D>>();
+  dequantizer_->Init(device);
+}
+
+template <DeviceType D>
+MaceStatus HexagonHTATranformer<D>::SetInputTransformer(
+    hexagon_hta_hw_layout format) {
+  switch (format) {
+    case HEXAGON_HTA_HW_FORMAT_D32:
+      input_transformers_.push_back(make_unique<NHWCToD32Transformer<D>>());
+      break;
+    case HEXAGON_HTA_HW_FORMAT_PLANAR:
+      input_transformers_.push_back(make_unique<NHWCToNCHW32Transformer<D>>());
+      break;
+    case HEXAGON_HTA_HW_FORMAT_DEPTH_FIRST:
+    default:
+      MACE_NOT_IMPLEMENTED;
+      break;
+  }
+  input_transformers_.back()->Init(device_);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+template <DeviceType D>
+MaceStatus HexagonHTATranformer<D>::SetOutputTransformer(
+    hexagon_hta_hw_layout format) {
+  switch (format) {
+    case HEXAGON_HTA_HW_FORMAT_D32:
+      output_transformers_.push_back(make_unique<D32ToNHWCTransformer<D>>());
+      break;
+    case HEXAGON_HTA_HW_FORMAT_PLANAR:
+    case HEXAGON_HTA_HW_FORMAT_DEPTH_FIRST:
+    default:
+      MACE_NOT_IMPLEMENTED;
+      break;
+  }
+  output_transformers_.back()->Init(device_);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+template <DeviceType D>
+MaceStatus HexagonHTATranformer<D>::TransformInput(const Tensor *input,
+                                                   Tensor *output,
+                                                   int index) {
+  return input_transformers_[index]->Compute(input, output);
+}
+
+template <DeviceType D>
+MaceStatus HexagonHTATranformer<D>::TransformOutput(const Tensor *input,
+                                                    Tensor *output,
+                                                    int index) {
+  return output_transformers_[index]->Compute(input, output);
+}
+
+template <DeviceType D>
+MaceStatus HexagonHTATranformer<D>::Quantize(const Tensor *input,
+                                             Tensor *output) {
+  return quantizer_->Compute(input, output);
+}
+
+template <DeviceType D>
+MaceStatus HexagonHTATranformer<D>::Dequantize(const Tensor *input,
+                                               Tensor *output) {
+  return dequantizer_->Compute(input, output);
+}
+
+template void HexagonHTATranformer<CPU>::Init(Device *device);
+template MaceStatus HexagonHTATranformer<CPU>::Quantize(const Tensor *input,
+                                                        Tensor *output);
+template MaceStatus HexagonHTATranformer<CPU>::Dequantize(const Tensor *input,
+                                                          Tensor *output);
+template MaceStatus HexagonHTATranformer<CPU>::SetInputTransformer(
+    hexagon_hta_hw_layout format);
+template MaceStatus HexagonHTATranformer<CPU>::SetOutputTransformer(
+    hexagon_hta_hw_layout format);
+template MaceStatus HexagonHTATranformer<CPU>::TransformInput(
+    const Tensor *input, Tensor *output, int index);
+template MaceStatus HexagonHTATranformer<CPU>::TransformOutput(
+    const Tensor *input, Tensor *output, int index);
+
+#ifdef MACE_ENABLE_OPENCL
+template void HexagonHTATranformer<GPU>::Init(Device *device);
+template MaceStatus HexagonHTATranformer<GPU>::Quantize(const Tensor *input,
+                                                        Tensor *output);
+template MaceStatus HexagonHTATranformer<GPU>::Dequantize(const Tensor *input,
+                                                          Tensor *output);
+template MaceStatus HexagonHTATranformer<GPU>::SetInputTransformer(
+    hexagon_hta_hw_layout format);
+template MaceStatus HexagonHTATranformer<GPU>::SetOutputTransformer(
+    hexagon_hta_hw_layout format);
+template MaceStatus HexagonHTATranformer<GPU>::TransformInput(
+    const Tensor *input, Tensor *output, int index);
+template MaceStatus HexagonHTATranformer<GPU>::TransformOutput(
+    const Tensor *input, Tensor *output, int index);
+#endif  // MACE_ENABLE_OPENCL
+}  // namespace mace
diff --git a/mace/core/runtime/hexagon/hexagon_hta_transformer.h b/mace/core/runtime/hexagon/hexagon_hta_transformer.h
new file mode 100644
index 0000000000000000000000000000000000000000..74369a4b3ff9634c82d4f0680e7180a6db6c4ec4
--- /dev/null
+++ b/mace/core/runtime/hexagon/hexagon_hta_transformer.h
@@ -0,0 +1,85 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/core/device.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/utils/math.h"
+#include "mace/utils/thread_pool.h"
+#include "third_party/hta/hta_hexagon_api.h"
+
+namespace mace {
+class BaseTransformer {
+ public:
+  BaseTransformer() = default;
+  virtual ~BaseTransformer() = default;
+
+  virtual void Init(Device *device) { device_ = device; }
+  virtual MaceStatus Compute(const Tensor *input, Tensor *output) = 0;
+
+ protected:
+  Device *device_;
+};
+
+class HexagonHTATranformerBase {
+ public:
+  HexagonHTATranformerBase() = default;
+  virtual ~HexagonHTATranformerBase() = default;
+
+  virtual void Init(Device *device) = 0;
+  virtual MaceStatus SetInputTransformer(
+      const hexagon_hta_hw_layout format) = 0;
+  virtual MaceStatus SetOutputTransformer(
+      const hexagon_hta_hw_layout format) = 0;
+  virtual MaceStatus Quantize(const Tensor *input, Tensor *output) = 0;
+  virtual MaceStatus Dequantize(const Tensor *input, Tensor *output) = 0;
+  virtual MaceStatus TransformInput(const Tensor *input,
+                                    Tensor *output,
+                                    int index) = 0;
+  virtual MaceStatus TransformOutput(const Tensor *input,
+                                     Tensor *output,
+                                     int index) = 0;
+};
+
+template <DeviceType D>
+class HexagonHTATranformer : public HexagonHTATranformerBase {
+ public:
+  void Init(Device *device) override;
+  MaceStatus SetInputTransformer(const hexagon_hta_hw_layout format) override;
+  MaceStatus SetOutputTransformer(const hexagon_hta_hw_layout format) override;
+
+  MaceStatus Quantize(const Tensor *input, Tensor *output) override;
+  MaceStatus Dequantize(const Tensor *input, Tensor *output) override;
+  MaceStatus TransformInput(const Tensor *input,
+                            Tensor *output,
+                            int index) override;
+  MaceStatus TransformOutput(const Tensor *input,
+                             Tensor *output,
+                             int index) override;
+
+ private:
+  Device *device_;
+  std::unique_ptr<BaseTransformer> quantizer_;
+  std::unique_ptr<BaseTransformer> dequantizer_;
+  std::vector<std::unique_ptr<BaseTransformer>> input_transformers_;
+  std::vector<std::unique_ptr<BaseTransformer>> output_transformers_;
+};
+}  // namespace mace
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_
diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc
index 069eab30c923053f1effdf1ee3e38ae5df0e2fe8..24a5eb82f015b8a66203d2071b1a1190b2aa4b4b 100644
--- a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"
-
+#include <sys/types.h>
 #include <algorithm>
 #include <iomanip>
 #include <map>
@@ -24,48 +24,59 @@
 #include <utility>
 
 #include "mace/core/runtime/hexagon/hexagon_hta_ops.h"
+#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
 #include "mace/core/types.h"
 #include "mace/utils/memory.h"
-#include "mace/core/quantize.h"
 #include "third_party/hta/hta_hexagon_api.h"
-
 namespace mace {
 
 namespace {
-struct InputOutputMetadata {
-  void Init(float min_val, float max_val, int needs_quantization) {
-    this->min_val = min_val;
-    this->max_val = max_val;
-    this->needs_quantization = needs_quantization;
+int GetHTAEnv(const std::string &name, int default_value) {
+  int value = default_value;
+  std::string env_str;
+  MaceStatus status = GetEnv(name.c_str(), &env_str);
+  if (status == MaceStatus::MACE_SUCCESS && !env_str.empty()) {
+    value = std::atoi(env_str.c_str());
   }
-  float min_val;
-  float max_val;
-  int needs_quantization;
-};
-
-template<typename T>
-void AddInputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
-  tensor->batches = 1;
-  tensor->height = 1;
-  tensor->width = 1;
-  tensor->depth = 1;
-  tensor->data = const_cast<unsigned char *>(
-      reinterpret_cast<const unsigned char *>(&data));
-  tensor->dataLen = sizeof(data);
-  tensor->data_valid_len = sizeof(data);
-  tensor->unused = 0;
+  return value;
 }
 
-template<typename T>
-void AddOutputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
-  tensor->data = const_cast<unsigned char *>(
-      reinterpret_cast<const unsigned char *>(&data));
-  tensor->dataLen = sizeof(data);
+// Print the API logs to standard output.
+void HtaApiLog(hexagon_hta_nn_nn_id id,
+               const char *api_op,
+               const char *const format,
+                 ...) {
+  va_list arg;
+  va_start(arg, format);
+  if (api_op != NULL) {
+    printf("Graph ID: %d\t", id);
+  }
+  vfprintf(stdout, format, arg);
+  va_end(arg);
+}
+// Print the performance stats to standard output.
+void HtaPerformanceLog(int log_level,
+                       uint32_t network_handle,
+                       uint32_t thread_id,
+                       const char *const format,
+                       ...) {
+  va_list arg;
+  va_start(arg, format);
+  printf("Log Level: %d, Network Handle: %d, Thread ID: %d - ", log_level,
+         network_handle, thread_id);
+  vfprintf(stdout, format, arg);
+  va_end(arg);
 }
 }  // namespace
 
 HexagonHTAWrapper::HexagonHTAWrapper(Device *device)
-    : quantize_util_(&device->cpu_runtime()->thread_pool()) {
+    : allocator_(device->allocator()),
+#ifdef MACE_ENABLE_OPENCL
+      transformer_(make_unique<HexagonHTATranformer<GPU>>()) {
+#else
+      transformer_(make_unique<HexagonHTATranformer<CPU>>()) {
+#endif
+  transformer_->Init(device);
 }
 
 int HexagonHTAWrapper::GetVersion() {
@@ -84,6 +95,40 @@ bool HexagonHTAWrapper::Init() {
   LOG(INFO) << "Hexagon init";
   MACE_CHECK(hexagon_hta_nn_init(&nn_id_) == 0, "hexagon_nn_init failed");
   ResetPerfInfo();
+
+  int ret;
+  int power_level = GetHTAEnv("MACE_HTA_POWER_LEVEL", -1);
+  if (power_level != -1) {
+    ret = hexagon_hta_nn_set_config_params(nn_id_, HTA_NN_CONFIG_POWER_LEVEL,
+                                           &power_level, sizeof(power_level));
+    LOG(INFO) << "HTA_NN_CONFIG_POWER_LEVEL: " << power_level
+              << " returns: " << ret;
+  }
+
+  int is_compress = GetHTAEnv("MACE_HTA_BANDWIDTH_COMPRESSION", 1);
+  if (is_compress) {
+    ret = hexagon_hta_nn_set_config_params(nn_id_,
+                                           HTA_NN_CONFIG_BANDWIDTH_COMPRESSION,
+                                           &is_compress, sizeof(is_compress));
+    LOG(INFO) << "HTA_NN_CONFIG_BANDWIDTH_COMPRESSION: " << is_compress
+              << " returns: " << ret;
+  }
+
+  if (VLOG_IS_ON(2)) {
+    ret = hexagon_hta_nn_set_config_params(
+        nn_id_, HTA_NN_CONFIG_PERFORMANCE_LOG,
+        reinterpret_cast<void *>(&HtaPerformanceLog),
+        sizeof(&HtaPerformanceLog));
+    MACE_CHECK(ret == 0, "HTA_NN_CONFIG_PERFORMANCE_LOG returns: " , ret);
+  }
+
+  if (VLOG_IS_ON(3)) {
+    ret = hexagon_hta_nn_set_config_params(nn_id_, HTA_NN_CONFIG_API_LOG,
+                                           reinterpret_cast<void *>(&HtaApiLog),
+                                           sizeof(&HtaApiLog));
+    MACE_CHECK(ret == 0, "HTA_NN_CONFIG_API_LOG returns: ", ret);
+  }
+
   return true;
 }
 
@@ -172,52 +217,107 @@ bool HexagonHTAWrapper::SetupGraph(const NetDef &net_def,
                                outputs.size());
   }
 
+  int64_t t1 = NowMicros();
+
+  MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed");
+
+  int64_t t2 = NowMicros();
+
+  VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1;
+
   // input info
   num_inputs_ = net_def.input_info_size();
   input_info_.reserve(num_inputs_);
-  for (const InputOutputInfo &input_info : net_def.input_info()) {
+  input_tensordef_.resize(num_inputs_);
+  for (int index = 0; index < num_inputs_; ++index) {
+    auto input_info = net_def.input_info(index);
     std::vector<index_t> input_shape(input_info.dims().begin(),
                                      input_info.dims().end());
     while (input_shape.size() < 4) {
       input_shape.insert(input_shape.begin(), 1);
     }
-    input_info_.emplace_back(input_info.name(),
-                             input_shape,
-                             input_info.data_type(),
-                             input_info.scale(),
-                             input_info.zero_point(),
-                             make_unique<Tensor>());
+
+    auto quantized_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
+    auto hta_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
+    hexagon_hta_nn_hw_tensordef &input_tensordef = input_tensordef_[index];
+    memset(&input_tensordef, 0, sizeof(input_tensordef));
+    MACE_CHECK(hexagon_hta_nn_get_memory_layout(nn_id_, 0, index,
+                                                &input_tensordef) == 0);
+    input_tensordef.dataLen = input_tensordef.batchStride;
+    VLOG(1) << input_tensordef.format << " " << input_tensordef.elementSize
+            << " " << input_tensordef.numDims << " "
+            << input_tensordef.batchStride;
+    for (uint32_t i = 0; i < input_tensordef.numDims; ++i) {
+      VLOG(1) << input_tensordef.dim[i].length << " "
+              << input_tensordef.dim[i].lpadding << " "
+              << input_tensordef.dim[i].valid;
+    }
+    hta_tensor->Resize({input_tensordef.dataLen});
+    MACE_CHECK(hta_tensor->raw_size() == input_tensordef.dataLen);
+    Tensor::MappingGuard input_guard(hta_tensor.get());
+    input_tensordef.fd =
+        allocator_->rpcmem()->ToFd(hta_tensor->mutable_data<void>());
+    MACE_CHECK(hexagon_hta_nn_register_tensor(nn_id_, &input_tensordef) == 0);
+
+    transformer_->SetInputTransformer(input_tensordef.format);
+
+    input_info_.emplace_back(
+        input_info.name(), input_shape, input_info.data_type(),
+        input_info.scale(), input_info.zero_point(),
+        std::move(quantized_tensor),
+        std::move(hta_tensor));
   }
 
   // output info
   num_outputs_ = net_def.output_info_size();
   output_info_.reserve(num_outputs_);
-  for (const InputOutputInfo &output_info : net_def.output_info()) {
+  output_tensordef_.resize(num_outputs_);
+  for (int index = 0; index < num_outputs_; ++index) {
+    auto output_info = net_def.output_info(index);
     std::vector<index_t> output_shape(output_info.dims().begin(),
                                       output_info.dims().end());
     while (output_shape.size() < 4) {
       output_shape.insert(output_shape.begin(), 1);
     }
-    output_info_.emplace_back(output_info.name(),
-                              output_shape,
-                              output_info.data_type(),
-                              output_info.scale(),
-                              output_info.zero_point(),
-                              make_unique<Tensor>());
+
+    auto quantized_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
+    auto hta_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
+    quantized_tensor->SetScale(output_info.scale());
+    quantized_tensor->SetZeroPoint(output_info.zero_point());
+
+    hexagon_hta_nn_hw_tensordef &output_tensordef = output_tensordef_[index];
+    memset(&output_tensordef, 0, sizeof(output_tensordef));
+    MACE_CHECK(hexagon_hta_nn_get_memory_layout(nn_id_, 1, index,
+                                                &output_tensordef) == 0);
+    output_tensordef.dataLen = output_tensordef.batchStride;
+    VLOG(1) << output_tensordef.format << " " << output_tensordef.elementSize
+            << " " << output_tensordef.numDims << " "
+            << output_tensordef.batchStride;
+    for (uint32_t i = 0; i < output_tensordef.numDims; ++i) {
+      VLOG(1) << output_tensordef.dim[i].length << " "
+              << output_tensordef.dim[i].lpadding << " "
+              << output_tensordef.dim[i].valid;
+    }
+    hta_tensor->Resize({output_tensordef.batchStride});
+    MACE_CHECK(hta_tensor->raw_size() == output_tensordef.dataLen);
+    Tensor::MappingGuard output_guard(hta_tensor.get());
+    output_tensordef.fd =
+        allocator_->rpcmem()->ToFd(hta_tensor->mutable_data<void>());
+    MACE_CHECK(hexagon_hta_nn_register_tensor(nn_id_, &output_tensordef) == 0);
+
+    transformer_->SetOutputTransformer(output_tensordef.format);
+
+    output_info_.emplace_back(
+        output_info.name(), output_shape, output_info.data_type(),
+        output_info.scale(), output_info.zero_point(),
+        std::move(quantized_tensor), std::move(hta_tensor));
+
     VLOG(1) << "OutputInfo: "
             << "\n\t shape: " << output_shape[0] << " " << output_shape[1]
             << " " << output_shape[2] << " " << output_shape[3]
             << "\n\t type: " << output_info.data_type();
   }
 
-  int64_t t1 = NowMicros();
-
-  MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed");
-
-  int64_t t2 = NowMicros();
-
-  VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1;
-
   return true;
 }
 
@@ -266,78 +366,41 @@ bool HexagonHTAWrapper::ExecuteGraphNew(
   MACE_CHECK(num_outputs_ == static_cast<int>(num_outputs),
              "Wrong outputs num");
 
-  std::vector<hexagon_hta_nn_tensordef> inputs(num_inputs * kNumMetaData);
-  std::vector<hexagon_hta_nn_tensordef> outputs(num_outputs * kNumMetaData);
-  std::vector<InputOutputMetadata> input_metadata(num_inputs);
-  std::vector<InputOutputMetadata> output_metadata(num_outputs);
-
   for (size_t i = 0; i < num_inputs; ++i) {
     const auto input_tensor = input_tensors.at(input_info_[i].name);
-    const auto &input_shape = input_tensor->shape();
-    size_t index = i * kNumMetaData;
-    inputs[index].batches = static_cast<uint32_t>(input_shape[0]);
-    inputs[index].height = static_cast<uint32_t>(input_shape[1]);
-    inputs[index].width = static_cast<uint32_t>(input_shape[2]);
-    inputs[index].depth = static_cast<uint32_t>(input_shape[3]);
-    inputs[index].data = const_cast<unsigned char *>(
-        reinterpret_cast<const unsigned char *>(input_tensor->raw_data()));
-    inputs[index].dataLen = static_cast<int>(input_tensor->raw_size());
-    inputs[index].data_valid_len =
-        static_cast<uint32_t>(input_tensor->raw_size());
-    inputs[index].unused = 0;
-    input_metadata[i].Init(.0f, .0f, 1);
-    AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]);
-    AddInputMetadata(input_metadata[i].max_val, &inputs[index + 2]);
-    AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]);
-  }
+    input_tensor->SetScale(input_info_[i].scale);
+    input_tensor->SetZeroPoint(input_info_[i].zero_point);
+    MACE_CHECK_SUCCESS(
+        transformer_->Quantize(input_tensors.at(input_info_[i].name),
+                               input_info_[i].quantized_tensor.get()));
 
-  // transform mace output to hexagon output
-  for (size_t i = 0; i < num_outputs; ++i) {
-    auto output_tensor = output_tensors->at(output_info_[i].name);
-    size_t index = i * kNumMetaData;
-    output_tensor->SetDtype(output_info_[i].data_type);
-    output_tensor->Resize(output_info_[i].shape);
-
-    outputs[index].data = reinterpret_cast<unsigned char *>(
-        output_tensor->raw_mutable_data());
-    outputs[index].dataLen = static_cast<int>(output_tensor->raw_size());
-    output_metadata[i].Init(.0f, .0f, 1);
-
-    AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]);
-    AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]);
-    AddOutputMetadata(output_metadata[i].needs_quantization,
-                      &outputs[index + 3]);
+    MACE_CHECK_SUCCESS(transformer_->TransformInput(
+        input_info_[i].quantized_tensor.get(),
+        input_info_[i].hta_tensor.get(), i));
+
+    Tensor::MappingGuard input_guard(input_info_[i].hta_tensor.get());
   }
 
-  int res = hexagon_hta_nn_execute_new(nn_id_,
-                                       inputs.data(),
-                                       num_inputs * kNumMetaData,
-                                       outputs.data(),
-                                       num_outputs * kNumMetaData);
-  MACE_CHECK(res == 0, "execute error");
+  MACE_CHECK(hexagon_hta_nn_execute_hw(nn_id_,
+                                       input_tensordef_.data(), num_inputs,
+                                       output_tensordef_.data(), num_outputs,
+                                       nullptr, nullptr) == 0);
 
   for (size_t i = 0; i < num_outputs; ++i) {
-    size_t index = i * kNumMetaData;
-    std::vector<uint32_t> output_shape{
-        outputs[index].batches, outputs[index].height, outputs[index].width,
-        outputs[index].depth};
-    MACE_CHECK(output_shape.size() == output_info_[i].shape.size(),
-               output_shape.size(), " vs ", output_info_[i].shape.size(),
-               " wrong output shape inferred");
-    for (size_t j = 0; j < output_shape.size(); ++j) {
-      MACE_CHECK(static_cast<index_t>(output_shape[j])
-                     == output_info_[i].shape[j],
-                 output_shape[j], " vs ", output_info_[i].shape[j],
-                 " wrong output shape[", j, "] inferred");
+    { // To sync cache
+      Tensor::MappingGuard output_guard(output_info_[i].hta_tensor.get());
     }
+    output_info_[i].quantized_tensor->Resize(output_info_[i].shape);
+    transformer_->TransformOutput(output_info_[i].hta_tensor.get(),
+                                  output_info_[i].quantized_tensor.get(), i);
+
+
     auto output_tensor = output_tensors->at(output_info_[i].name);
-    MACE_CHECK(static_cast<index_t>(outputs[index].data_valid_len)
-                   == output_tensor->raw_size(),
-               outputs[index].data_valid_len, " vs ", output_tensor->raw_size(),
-               " wrong output bytes inferred.");
+    MaceStatus st = transformer_->Dequantize(
+        output_info_[i].quantized_tensor.get(), output_tensor);
   }
 
-  return res == 0;
+  return true;
 }
 
 }  // namespace mace
diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h
index 6b33514ccc8b1d628c856ca04c8b26f8d0af9006..aabcdc61c6aec244fcb1858d5c80f6b016b3e88b 100644
--- a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h
+++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h
@@ -16,18 +16,40 @@
 #define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
 
 #include <map>
+#include <memory>
 #include <string>
 #include <vector>
+#include <utility>
 
-#include "mace/utils/thread_pool.h"
-#include "mace/core/quantize.h"
+#include "mace/core/device.h"
 #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
 #include "mace/core/tensor.h"
-#include "mace/core/device.h"
 #include "mace/public/mace.h"
+#include "third_party/hta/hta_hexagon_api.h"
 
 namespace mace {
 
+struct HTAInOutInfo : public InOutInfo {
+  HTAInOutInfo(const std::string &name,
+               const std::vector<index_t> &shape,
+               const DataType data_type,
+               const float scale,
+               const int32_t zero_point,
+               std::unique_ptr<Tensor> quantized_tensor,
+               std::unique_ptr<Tensor> hta_tensor)
+      :  InOutInfo(name, shape, data_type),
+         scale(scale),
+         zero_point(zero_point),
+         quantized_tensor(std::move(quantized_tensor)),
+         hta_tensor(std::move(hta_tensor)) {}
+
+  float scale;
+  int32_t zero_point;
+  std::unique_ptr<Tensor> quantized_tensor;
+  std::unique_ptr<Tensor> hta_tensor;
+};
+
 class HexagonHTAWrapper : public HexagonControlWrapper {
  public:
   explicit HexagonHTAWrapper(Device *device);
@@ -50,7 +72,12 @@ class HexagonHTAWrapper : public HexagonControlWrapper {
   void SetDebugLevel(int level) override;
 
  private:
-  QuantizeUtil<float, uint8_t> quantize_util_;
+  Allocator *allocator_;
+  std::vector<HTAInOutInfo> input_info_;
+  std::vector<HTAInOutInfo> output_info_;
+  std::vector<hexagon_hta_nn_hw_tensordef> input_tensordef_;
+  std::vector<hexagon_hta_nn_hw_tensordef> output_tensordef_;
+  std::unique_ptr<HexagonHTATranformerBase> transformer_;
   MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper);
 };
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc
index b0bf041b6883000057a63608a82761abfa2037a3..a0865958893885c247ec2d5addef7d308c4ea3b5 100644
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -16,9 +16,6 @@
 
 #include "mace/core/runtime/opencl/opencl_allocator.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#ifdef MACE_ENABLE_RPCMEM
-#include "third_party/rpcmem/rpcmem.h"
-#endif  // MACE_ENABLE_RPCMEM
 
 namespace mace {
 namespace {
@@ -39,24 +36,10 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   }
 }
 
-#ifdef MACE_ENABLE_RPCMEM
-std::once_flag ion_prepared;
-void PrepareQualcommION() {
-  rpcmem_init();
-  std::atexit(rpcmem_deinit);
-}
-#endif  // MACE_ENABLE_RPCMEM
-
 }  // namespace
 
 OpenCLAllocator::OpenCLAllocator(
-    OpenCLRuntime *opencl_runtime): opencl_runtime_(opencl_runtime) {
-#ifdef MACE_ENABLE_RPCMEM
-  if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
-    std::call_once(ion_prepared, PrepareQualcommION);
-  }
-#endif  // MACE_ENABLE_RPCMEM
-}
+    OpenCLRuntime *opencl_runtime): opencl_runtime_(opencl_runtime) {}
 
 OpenCLAllocator::~OpenCLAllocator() {}
 
@@ -168,7 +151,7 @@ void OpenCLAllocator::Delete(void *buffer) {
     if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
       auto it = cl_to_host_map_.find(buffer);
       MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL buffer not found!");
-      rpcmem_free(it->second);
+      rpcmem_.Delete(it->second);
       cl_to_host_map_.erase(buffer);
     }
 #endif  // MACE_ENABLE_RPCMEM
@@ -184,7 +167,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
     if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
       auto it = cl_to_host_map_.find(buffer);
       MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL image not found!");
-      rpcmem_free(it->second);
+      rpcmem_.Delete(it->second);
       cl_to_host_map_.erase(buffer);
     }
 #endif  // MACE_ENABLE_RPCMEM
@@ -194,7 +177,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
 void *OpenCLAllocator::Map(void *buffer,
                            size_t offset,
                            size_t nbytes,
-                           bool finish_cmd_queue) const {
+                           bool finish_cmd_queue) {
   MACE_LATENCY_LOGGER(1, "Map OpenCL buffer");
   void *mapped_ptr = nullptr;
 #ifdef MACE_ENABLE_RPCMEM
@@ -209,7 +192,7 @@ void *OpenCLAllocator::Map(void *buffer,
 
     if (opencl_runtime_->qcom_host_cache_policy() ==
         CL_MEM_HOST_WRITEBACK_QCOM) {
-      MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0);
+      MACE_CHECK(rpcmem_.SyncCacheStart(mapped_ptr) == 0);
     }
   } else {
 #endif  // MACE_ENABLE_RPCMEM
@@ -234,7 +217,7 @@ void *OpenCLAllocator::Map(void *buffer,
 void *OpenCLAllocator::MapImage(void *buffer,
                                 const std::vector<size_t> &image_shape,
                                 std::vector<size_t> *mapped_image_pitch,
-                                bool finish_cmd_queue) const {
+                                bool finish_cmd_queue) {
   MACE_LATENCY_LOGGER(1, "Map OpenCL Image");
   MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
   void *mapped_ptr = nullptr;
@@ -251,7 +234,7 @@ void *OpenCLAllocator::MapImage(void *buffer,
 
     if (opencl_runtime_->qcom_host_cache_policy() ==
         CL_MEM_HOST_WRITEBACK_QCOM) {
-      MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0);
+      MACE_CHECK(rpcmem_.SyncCacheStart(mapped_ptr) == 0);
     }
   } else {
 #endif  // MACE_ENABLE_RPCMEM
@@ -275,13 +258,13 @@ void *OpenCLAllocator::MapImage(void *buffer,
   return mapped_ptr;
 }
 
-void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
+void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
   MACE_LATENCY_LOGGER(1, "Unmap OpenCL buffer/Image");
 #ifdef MACE_ENABLE_RPCMEM
   if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
     if (opencl_runtime_->qcom_host_cache_policy() ==
         CL_MEM_HOST_WRITEBACK_QCOM) {
-      MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_END) == 0);
+      MACE_CHECK(rpcmem_.SyncCacheEnd(mapped_ptr) == 0);
     }
   } else {
 #endif  // MACE_ENABLE_RPCMEM
@@ -301,17 +284,20 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
 bool OpenCLAllocator::OnHost() const { return false; }
 
 #ifdef MACE_ENABLE_RPCMEM
+Rpcmem *OpenCLAllocator::rpcmem() {
+  return &rpcmem_;
+}
+
 void OpenCLAllocator::CreateQualcommBufferIONHostPtr(
     const size_t nbytes,
     cl_mem_ion_host_ptr *ion_host) {
-  void *host = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED,
-                            nbytes + opencl_runtime_->qcom_ext_mem_padding());
+  void *host = rpcmem_.New(nbytes + opencl_runtime_->qcom_ext_mem_padding());
   MACE_CHECK_NOTNULL(host);
   auto host_addr = reinterpret_cast<std::uintptr_t>(host);
   auto page_size = opencl_runtime_->qcom_page_size();
   MACE_CHECK(host_addr % page_size == 0, "ION memory address: ", host_addr,
              " must be aligned to page size: ", page_size);
-  int fd = rpcmem_to_fd(host);
+  int fd = rpcmem_.ToFd(host);
   MACE_CHECK(fd >= 0, "Invalid rpcmem file descriptor: ", fd);
 
   ion_host->ext_host_ptr.allocation_type = CL_MEM_ION_HOST_PTR_QCOM;
diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h
index 0c2783a137ffc89aacc52e5d72a0a0a05a53d1d5..5411a93e710d08518879516f76b77fcfe1bdbe50 100644
--- a/mace/core/runtime/opencl/opencl_allocator.h
+++ b/mace/core/runtime/opencl/opencl_allocator.h
@@ -48,17 +48,21 @@ class OpenCLAllocator : public Allocator {
   void *Map(void *buffer,
             size_t offset,
             size_t nbytes,
-            bool finish_cmd_queue) const override;
+            bool finish_cmd_queue) override;
 
   void *MapImage(void *buffer,
                  const std::vector<size_t> &image_shape,
                  std::vector<size_t> *mapped_image_pitch,
-                 bool finish_cmd_queue) const override;
+                 bool finish_cmd_queue) override;
 
-  void Unmap(void *buffer, void *mapped_ptr) const override;
+  void Unmap(void *buffer, void *mapped_ptr) override;
 
   bool OnHost() const override;
 
+#ifdef MACE_ENABLE_RPCMEM
+  Rpcmem *rpcmem() override;
+#endif  // MACE_ENABLE_RPCMEM
+
  private:
 #ifdef MACE_ENABLE_RPCMEM
   void CreateQualcommBufferIONHostPtr(const size_t nbytes,
@@ -69,6 +73,7 @@ class OpenCLAllocator : public Allocator {
                                      cl_mem_ion_host_ptr *ion_host);
 
   std::unordered_map<void *, void *> cl_to_host_map_;
+  Rpcmem rpcmem_;
 #endif  // MACE_ENABLE_RPCMEM
   OpenCLRuntime *opencl_runtime_;
 };
diff --git a/mace/ops/opencl/helper.cc b/mace/core/runtime/opencl/opencl_helper.cc
similarity index 98%
rename from mace/ops/opencl/helper.cc
rename to mace/core/runtime/opencl/opencl_helper.cc
index 16acafb5a244583c3d9b34df25755eb3d50284f7..a216b6215a22618e7a298ff4e915299b8f5ffb0e 100644
--- a/mace/ops/opencl/helper.cc
+++ b/mace/core/runtime/opencl/opencl_helper.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 #include <algorithm>
 #include <string>
@@ -22,7 +22,6 @@
 #include "mace/utils/math.h"
 
 namespace mace {
-namespace ops {
 
 std::vector<index_t> FormatBufferShape(
     const std::vector<index_t> &buffer_shape,
@@ -59,8 +58,10 @@ std::string DtToCLDt(const DataType dt) {
       return "float";
     case DT_HALF:
       return "half";
+    case DT_UINT8:
+      return "uchar";
     default:
-      LOG(FATAL) << "Unsupported data type";
+      LOG(FATAL) << "Unsupported data type: " << dt;
       return "";
   }
 }
@@ -365,5 +366,4 @@ MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
   return MaceStatus::MACE_SUCCESS;
 }
 
-}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/opencl/helper.h b/mace/core/runtime/opencl/opencl_helper.h
similarity index 97%
rename from mace/ops/opencl/helper.h
rename to mace/core/runtime/opencl/opencl_helper.h
index a9e9866c31e85bd82efb1d1b2622d429f8639c5a..4d10862a1da42d0b581e0e6a5e2e4d019425d0ca 100644
--- a/mace/ops/opencl/helper.h
+++ b/mace/core/runtime/opencl/opencl_helper.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_OPS_OPENCL_HELPER_H_
-#define MACE_OPS_OPENCL_HELPER_H_
+#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_
+#define MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_
 
 #include <memory>
 #include <string>
@@ -30,7 +30,6 @@
 #include "mace/utils/math.h"
 
 namespace mace {
-namespace ops {
 // oorc for 'Out Of Range Check'
 #define MACE_OUT_OF_RANGE_DEFINITION           \
   std::shared_ptr<BufferBase> oorc_flag;
@@ -161,6 +160,5 @@ std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
                                        const uint32_t *gws,
                                        const uint32_t kwg_size);
 
-}  // namespace ops
 }  // namespace mace
-#endif  // MACE_OPS_OPENCL_HELPER_H_
+#endif  // MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_
diff --git a/mace/libmace/BUILD.bazel b/mace/libmace/BUILD.bazel
index 2d16fd94b17fbde485f0fa678a6fc74303e046c8..8b540b53d946df2751df3ba957d9bc0bdda2534a 100644
--- a/mace/libmace/BUILD.bazel
+++ b/mace/libmace/BUILD.bazel
@@ -21,6 +21,7 @@ load(
     "if_opencl_enabled",
     "if_openmp_enabled",
     "if_quantize_enabled",
+    "if_rpcmem_enabled",
 )
 
 cc_library(
@@ -47,6 +48,8 @@ cc_library(
         "-DMACE_ENABLE_HTA",
     ]) + if_apu_enabled([
         "-DMACE_ENABLE_APU",
+    ]) + if_rpcmem_enabled([
+        "-DMACE_ENABLE_RPCMEM",
     ]),
     deps = [
         "//mace/ops",
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 3ab4b13ee1374b6db60c6d7fa2042051c4d1aeeb..b9d3b13c24f1490c688d775f51534c2094c6f377 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -520,9 +520,23 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
 #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   if (device_type_ == DeviceType::HEXAGON
       || device_type_ == DeviceType::HTA) {
+#ifdef MACE_ENABLE_OPENCL
+    device_.reset(new HexagonDevice(
+        device_type_, thread_pool_.get(),
+        make_unique<GPUDevice>(
+            config.impl_->gpu_context()->opencl_tuner(),
+            config.impl_->gpu_context()->opencl_cache_storage(),
+            config.impl_->gpu_priority_hint(),
+            config.impl_->gpu_perf_hint(),
+            config.impl_->gpu_context()->opencl_binary_storage(),
+            config.impl_->num_threads(),
+            config.impl_->cpu_affinity_policy(),
+            thread_pool_.get())));
+#else
     device_.reset(new HexagonDevice(device_type_, thread_pool_.get()));
+#endif  // MACE_ENABLE_OPENCL
   }
-#endif
+#endif  // defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
 #ifdef MACE_ENABLE_APU
   if (device_type_ == DeviceType::APU) {
     device_.reset(new ApuDevice(thread_pool_.get()));
@@ -579,15 +593,19 @@ MaceStatus MaceEngine::Impl::Init(
                  << MakeString(MapKeys(output_info_map_));
     }
 #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
-    DataType output_dt = output_info_map_[output_name].data_type();
-    Tensor *output_tensor =
-        ws_->CreateTensor(output_name, device_->allocator(), output_dt);
-    output_tensor->set_data_format(DataFormat::NHWC);
+    if (device_type_ == HEXAGON || device_type_ == HTA) {
+      DataType output_dt = output_info_map_[output_name].data_type();
+      Tensor *output_tensor =
+          ws_->CreateTensor(output_name, device_->allocator(), output_dt);
+      output_tensor->set_data_format(DataFormat::NHWC);
+    }
 #endif
 #if defined(MACE_ENABLE_APU)
-    Tensor *output_tensor =
-        ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
-    output_tensor->set_data_format(DataFormat::NHWC);
+    if (device_type_ == DeviceType::APU) {
+      Tensor *output_tensor =
+          ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
+      output_tensor->set_data_format(DataFormat::NHWC);
+    }
 #endif
   }
 #ifdef MACE_ENABLE_HEXAGON
diff --git a/mace/ops/opencl/buffer/buffer_transform.h b/mace/ops/opencl/buffer/buffer_transform.h
index c32ccbb13069ea800aa30b2ac8cd8a2eb6cac2b5..25415877e676707aab857fd09e81d4821ae99361 100644
--- a/mace/ops/opencl/buffer/buffer_transform.h
+++ b/mace/ops/opencl/buffer/buffer_transform.h
@@ -21,7 +21,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc
index 2cb3ae0043df20ddfa25421572db5377f0c12363..688ded664fa7dac533fd7fbafcfc7d1d8fbf9cdc 100644
--- a/mace/ops/opencl/buffer/buffer_type_transform.cc
+++ b/mace/ops/opencl/buffer/buffer_type_transform.cc
@@ -15,7 +15,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/buffer/conv_2d.h b/mace/ops/opencl/buffer/conv_2d.h
index c50752c3bc6abeaaabc961084d72e8f7afba9f76..563b835861ea76c6cb90b8ad27f2fa4c9d09e955 100644
--- a/mace/ops/opencl/buffer/conv_2d.h
+++ b/mace/ops/opencl/buffer/conv_2d.h
@@ -21,7 +21,7 @@
 #include <vector>
 
 #include "mace/ops/opencl/buffer/utils.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/utils/memory.h"
 
 namespace mace {
diff --git a/mace/ops/opencl/buffer/conv_2d_1x1.cc b/mace/ops/opencl/buffer/conv_2d_1x1.cc
index 6eeb0f1d1584eb4eb14fd749602895437286e766..95c85b17dd24438a8c9bd45c974b7c23c46c85be 100644
--- a/mace/ops/opencl/buffer/conv_2d_1x1.cc
+++ b/mace/ops/opencl/buffer/conv_2d_1x1.cc
@@ -15,7 +15,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/ops/common/activation_type.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/buffer/conv_2d_general.cc b/mace/ops/opencl/buffer/conv_2d_general.cc
index b19b702083bbdeb2f94b2d6ab8e7e13a02c3ab12..4c03ee2af0c5b5452878db16067fff114088884c 100644
--- a/mace/ops/opencl/buffer/conv_2d_general.cc
+++ b/mace/ops/opencl/buffer/conv_2d_general.cc
@@ -15,7 +15,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/ops/common/activation_type.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.h b/mace/ops/opencl/buffer/depthwise_conv2d.h
index 98dffa12734b8404221869d147420a2e76866224..60d680777ba06af2aec2c04ff42dcad6a5bd5caa 100644
--- a/mace/ops/opencl/buffer/depthwise_conv2d.h
+++ b/mace/ops/opencl/buffer/depthwise_conv2d.h
@@ -21,7 +21,7 @@
 #include <vector>
 
 #include "mace/ops/opencl/buffer/utils.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/utils/memory.h"
 
 namespace mace {
diff --git a/mace/ops/opencl/buffer/pooling.h b/mace/ops/opencl/buffer/pooling.h
index 9e675e29aa14bd12409f0a1315fe34c023a73b5d..952a8f5e4b7ef4ea743c64dba48575e77560502d 100644
--- a/mace/ops/opencl/buffer/pooling.h
+++ b/mace/ops/opencl/buffer/pooling.h
@@ -23,7 +23,7 @@
 #include <vector>
 
 #include "mace/ops/opencl/buffer/utils.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/utils/memory.h"
 
 namespace mace {
diff --git a/mace/ops/opencl/buffer/reshape.h b/mace/ops/opencl/buffer/reshape.h
index f030f1e759b8fc7bf837d4b1054c062dcdac8338..bc572acfdf1bf42b18cd1f326b0d3d455ec9ad50 100644
--- a/mace/ops/opencl/buffer/reshape.h
+++ b/mace/ops/opencl/buffer/reshape.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/buffer/softmax.h b/mace/ops/opencl/buffer/softmax.h
index 05d27cac7f4bdcd408c6b25b958e6414bde8249a..0acae465953c75fc6d053b8d6c90040a17f75818 100644
--- a/mace/ops/opencl/buffer/softmax.h
+++ b/mace/ops/opencl/buffer/softmax.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/buffer/utils.cc b/mace/ops/opencl/buffer/utils.cc
index 536a7e0df8e4a875bb80c46a67613a246c9e221b..3e15115cc909465ccf1ddf3eed0501f70c0b4267 100644
--- a/mace/ops/opencl/buffer/utils.cc
+++ b/mace/ops/opencl/buffer/utils.cc
@@ -19,7 +19,7 @@
 #include <vector>
 
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/cl/buffer_transform.cl b/mace/ops/opencl/cl/buffer_transform.cl
index 0a7674d66f5fbc08b4443da288f26ac5ff5c1d47..9e554dcfe41b9503a0b6cf649c5f190710a480fe 100644
--- a/mace/ops/opencl/cl/buffer_transform.cl
+++ b/mace/ops/opencl/cl/buffer_transform.cl
@@ -209,3 +209,202 @@ __kernel void transform_data_type(BUFFER_OUT_OF_RANGE_PARAMS
   DATA_TYPE4 input_value = CONVERT4(vload4(out_idx, input + input_offset));
   vstore4(input_value, out_idx, output);
 }
+
+__kernel void buffer_quantize(BUFFER_OUT_OF_RANGE_PARAMS
+                              __private const int global_size_dim0,
+                              __private const float scale,
+                              __private const int zero_point,
+                              __global float *input,
+                              __private const int input_offset,
+                              __global uchar *output) {
+  const int out_idx = get_global_id(0);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_idx >= global_size_dim0) {
+    return;
+  }
+#endif
+
+  uchar4 output_value =
+      convert_uchar4_sat_rte(vload4(out_idx, input) / scale + zero_point);
+  vstore4(output_value, out_idx, output);
+}
+
+__kernel void buffer_dequantize(BUFFER_OUT_OF_RANGE_PARAMS
+                                __private const int global_size_dim0,
+                                __private const float scale,
+                                __private const int zero_point,
+                                __global uchar *input,
+                                __private const int input_offset,
+                                __global float *output) {
+  const int out_idx = get_global_id(0);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_idx >= global_size_dim0) {
+    return;
+  }
+#endif
+  float4 output_value =
+      convert_float4(convert_int4(vload4(out_idx, input)) - zero_point) * scale;
+  vstore4(output_value, out_idx, output);
+}
+
+
+// NHWC -> NCHW (W roundup to 32)
+__kernel void transform_nhwc_to_nchw32(BUFFER_OUT_OF_RANGE_PARAMS
+                                       GLOBAL_WORK_GROUP_SIZE_DIM3
+                                       __global uchar *input,  // NHWC
+                                       __private const int input_offset,
+                                       __private const int zero_point,
+                                       __global uchar *output,
+                                       __private const int batch,
+                                       __private const int height,
+                                       __private const int width,
+                                       __private const int channels) {
+  const int width_blk_idx = get_global_id(0);
+  const int h_idx = get_global_id(1);
+  const int bc_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (width_blk_idx >= global_size_dim0 ||
+      h_idx >= global_size_dim1 ||
+      bc_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+  const int b_idx = bc_idx / channels;
+  const int chan_idx = bc_idx - mul24(b_idx, channels);
+  const int w_idx = width_blk_idx << 2;
+  const int padded_width = global_size_dim0 << 2;
+
+  const int in_offset = mad24(mad24(mad24(b_idx, height, h_idx),
+      width, w_idx), channels, chan_idx) + input_offset;
+  const int out_offset = (mad24(mad24(mad24(b_idx, channels, chan_idx),
+      height, h_idx), padded_width, w_idx));
+
+  uchar4 value = zero_point;
+  if (w_idx + 3 < width) {
+    value.x = input[in_offset];
+    value.y = input[in_offset + channels];
+    value.z = input[in_offset + 2 * channels];
+    value.w = input[in_offset + 3 * channels];
+  } else if (w_idx < width) {
+    const int diff = width - w_idx;
+    switch(diff) {
+      case 3:
+        value.z = input[in_offset + 2 * channels];
+      case 2:
+        value.y = input[in_offset + channels];
+      case 1:
+        value.x = input[in_offset];
+    }
+  }
+  VSTORE4(value, output, out_offset);
+}
+
+
+// N H ceil(C/32) W 32 -> NHWC
+__kernel void transform_d32_to_nhwc(BUFFER_OUT_OF_RANGE_PARAMS
+                                    GLOBAL_WORK_GROUP_SIZE_DIM3
+                                    __global uchar *input,  // D32
+                                    __private const int input_offset,
+                                    __global uchar *output, // NHWC
+                                    __private const int batch,
+                                    __private const int height,
+                                    __private const int width,
+                                    __private const int channels,
+                                    __private const int channel_slices) {
+  const int chan_blk_idx = get_global_id(0);
+  const int w_idx = get_global_id(1);
+  const int bh_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 ||
+      w_idx >= global_size_dim1 ||
+      bh_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+  const int b_idx = bh_idx / height;
+  const int h_idx = bh_idx - mul24(b_idx, height);
+  const int c_idx = chan_blk_idx << 2;
+  const int c_slice = c_idx >> 5;
+  const int c_slice_idx = c_idx & 31;
+
+  const int in_offset = mad24(mad24(mad24(mad24(b_idx, height, h_idx),
+      channel_slices, c_slice), width, w_idx), 32, c_slice_idx) + input_offset;
+  const int out_offset = (mad24(mad24(mad24(b_idx, height, h_idx),
+      width, w_idx), channels, c_idx));
+
+  uchar4 value = vload4(0, input + in_offset);
+  if (c_idx + 3 < channels) {
+    VSTORE4(value, output, out_offset);
+  } else {
+    const int diff = channels - c_idx;
+    switch(diff) {
+      case 3:
+        vstore3(value.xyz, 0, output + out_offset);
+        break;
+      case 2:
+        vstore2(value.xy, 0, output + out_offset);
+        break;
+      case 1:
+        output[out_offset] = value.x;
+        break;
+    }
+  }
+}
+
+// NHWC -> N H ceil(C/32) W 32
+__kernel void transform_nhwc_to_d32(BUFFER_OUT_OF_RANGE_PARAMS
+                                    GLOBAL_WORK_GROUP_SIZE_DIM3
+                                    __global uchar *input,  // NHWC
+                                    __private const int input_offset,
+                                    __private const int zero_point,
+                                    __global uchar *output, // D32
+                                    __private const int batch,
+                                    __private const int height,
+                                    __private const int width,
+                                    __private const int channels,
+                                    __private const int channel_slices) {
+  const int w_32_idx = get_global_id(0);
+  const int c_slice = get_global_id(1);
+  const int bh_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w_32_idx >= global_size_dim0 ||
+      c_slice >= global_size_dim1 ||
+      bh_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+  const int b_idx = bh_idx / height;
+  const int h_idx = bh_idx - mul24(b_idx, height);
+  const int w_idx = w_32_idx >> 3;
+  const int c_slice_blk_idx = w_32_idx & 7;
+  const int c_slice_idx = c_slice_blk_idx << 2;
+  const int c_idx = (c_slice << 5) + c_slice_idx;
+
+  const int in_offset = (mad24(mad24(mad24(b_idx, height, h_idx),
+      width, w_idx), channels, c_idx)) + input_offset;
+  const int out_offset = mad24(mad24(mad24(mad24(b_idx, height, h_idx),
+      channel_slices, c_slice), width, w_idx), 32, c_slice_idx);
+
+  uchar4 value = zero_point;
+  if (c_idx + 3 < channels) {
+    value = vload4(0, input + in_offset);
+  } else if (c_idx < channels) {
+    value = vload4(0, input + in_offset);
+    const int diff = channels - c_idx;
+    switch(diff) {
+      case 3:
+        value.w = zero_point; break;
+      case 2:
+        value.zw = zero_point; break;
+      case 1:
+        value.yzw = zero_point; break;
+    }
+  }  // else value = zero_point
+
+  VSTORE4(value, output, out_offset);
+}
diff --git a/mace/ops/opencl/image/activation.h b/mace/ops/opencl/image/activation.h
index e98b5e9daefe0cf988b6cb39ee7e0cf4903ea89b..929d267ddd2860161c45eb63b3be465e870298ed 100644
--- a/mace/ops/opencl/image/activation.h
+++ b/mace/ops/opencl/image/activation.h
@@ -24,7 +24,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/ops/common/activation_type.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/addn.h b/mace/ops/opencl/image/addn.h
index b163152bf15838c385b38690c75f8f92499b5ae2..575dee22764af5e856ec19792f5fce60634f906b 100644
--- a/mace/ops/opencl/image/addn.h
+++ b/mace/ops/opencl/image/addn.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/batch_norm.h b/mace/ops/opencl/image/batch_norm.h
index b2201a96631fef8ddd3b1a1748550aa96897e646..6b7773682ff546753b75f2f94f0fb2282a0b39fc 100644
--- a/mace/ops/opencl/image/batch_norm.h
+++ b/mace/ops/opencl/image/batch_norm.h
@@ -24,7 +24,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/ops/common/activation_type.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h
index a0aced7c021cdee7dfe55b3800e9da324e7abf59..a9d047aa2a7af096b535f2086afe9450beed46c3 100644
--- a/mace/ops/opencl/image/batch_to_space.h
+++ b/mace/ops/opencl/image/batch_to_space.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/bias_add.h b/mace/ops/opencl/image/bias_add.h
index 7c25662da81b183ca88588dc756b724b50ed33ac..67644d6a1f58b99fc0c3d4d2d4021c1e2e178adb 100644
--- a/mace/ops/opencl/image/bias_add.h
+++ b/mace/ops/opencl/image/bias_add.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h
index 493f6579db7ced93681ad2b8b80b491edd934b8d..3389118279f3cdf7c8050e1bb5fd17c9e154530d 100644
--- a/mace/ops/opencl/image/buffer_to_image.h
+++ b/mace/ops/opencl/image/buffer_to_image.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/channel_shuffle.h b/mace/ops/opencl/image/channel_shuffle.h
index 371ecf22a6cf61e3e7c60b8af4abe981f3a1264e..94448d805ccb86887bb1b9e12bce0cfba66db4a4 100644
--- a/mace/ops/opencl/image/channel_shuffle.h
+++ b/mace/ops/opencl/image/channel_shuffle.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/concat.h b/mace/ops/opencl/image/concat.h
index f1e51fd96e7312d30419eafada40796f000c55c2..e5cd297779f7adb583653e31d25aa5816a377d4f 100644
--- a/mace/ops/opencl/image/concat.h
+++ b/mace/ops/opencl/image/concat.h
@@ -21,7 +21,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/conv_2d.h b/mace/ops/opencl/image/conv_2d.h
index a1ee3301b373d43980d38b1fd38fb7876c5c47d2..6044c1a7235535cc0f67dcdc716b25189ed7a3d4 100644
--- a/mace/ops/opencl/image/conv_2d.h
+++ b/mace/ops/opencl/image/conv_2d.h
@@ -21,7 +21,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc
index 718240152f6a74e0835c75edbae7782a1fb8c23f..494672a4447cf0ed9e8611e11a241f9cc1387816 100644
--- a/mace/ops/opencl/image/conv_2d_1x1.cc
+++ b/mace/ops/opencl/image/conv_2d_1x1.cc
@@ -15,7 +15,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/ops/common/activation_type.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc
index d8a8b9cfbba611e5d0a320e8708f7f08c0a2b844..8bfc988c8ebf4057b9a2942f632594d14cfcf7d0 100644
--- a/mace/ops/opencl/image/conv_2d_3x3.cc
+++ b/mace/ops/opencl/image/conv_2d_3x3.cc
@@ -15,7 +15,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/ops/common/activation_type.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/utils/math.h"
 
 namespace mace {
diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc
index bf4baea72da0e0776eb3223e71650b475c994906..9964c5f25cba4b39e401ac39764bd6d29b6f62f1 100644
--- a/mace/ops/opencl/image/conv_2d_general.cc
+++ b/mace/ops/opencl/image/conv_2d_general.cc
@@ -14,7 +14,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/ops/common/activation_type.h"
 #include "mace/utils/math.h"
 
diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h
index c2f1c53aa2383ab89669be2520a9af3c1f2a27c8..33a5d2603e56e297b0c0271ad806009b38550a07 100644
--- a/mace/ops/opencl/image/crop.h
+++ b/mace/ops/opencl/image/crop.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h
index aa3b9d249b58cd3982866c71ef07b30ee24c75bc..4f1db7e66fa4580690bd648c259543dce292083d 100644
--- a/mace/ops/opencl/image/deconv_2d.h
+++ b/mace/ops/opencl/image/deconv_2d.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h
index ac68cbdb02adffad9b7bfc911363b1c95d2a7a86..383a4c6f3fed98c2b4cec5b36121004a78a0109f 100644
--- a/mace/ops/opencl/image/depth_to_space.h
+++ b/mace/ops/opencl/image/depth_to_space.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/depthwise_conv2d.h b/mace/ops/opencl/image/depthwise_conv2d.h
index f4bc4f2a0c92adc2edca4c5eb820c2f00f63d680..c72170acdb1c15ebf27dbd327d64b5b73d40de2e 100644
--- a/mace/ops/opencl/image/depthwise_conv2d.h
+++ b/mace/ops/opencl/image/depthwise_conv2d.h
@@ -21,7 +21,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h
index 2055511678d8340da655b298c2a8a163279c95a3..fe039cb679c449f0d432b86531d17795cb3e83e6 100644
--- a/mace/ops/opencl/image/depthwise_deconv2d.h
+++ b/mace/ops/opencl/image/depthwise_deconv2d.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h
index 5678f9c72ffbe2ec706e2b44bd73457f938cb585..a9298cc6582e4e5f8d805c1a0d00f9f65e99de0b 100644
--- a/mace/ops/opencl/image/eltwise.h
+++ b/mace/ops/opencl/image/eltwise.h
@@ -25,7 +25,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/ops/common/eltwise_type.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h
index 9f1bae647f33c5906f31f312c5a094d64ef322e6..010edcac9979c659e6d926e076d941d9fea426dd 100644
--- a/mace/ops/opencl/image/fully_connected.h
+++ b/mace/ops/opencl/image/fully_connected.h
@@ -24,7 +24,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/ops/common/activation_type.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h
index 85893f6b283da5b659d30466568c65c52d931954..5d5c524884c0ccb6ce976ee8cd45d345c445e20d 100644
--- a/mace/ops/opencl/image/image_to_buffer.h
+++ b/mace/ops/opencl/image/image_to_buffer.h
@@ -21,7 +21,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/ops/opencl/buffer_transform_kernel.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/lpnorm.h b/mace/ops/opencl/image/lpnorm.h
index ae517c28f018b595e78e56b6a59ea939e23276df..cac641125d99d4e93495d67b45e00f0f27bb3c7c 100644
--- a/mace/ops/opencl/image/lpnorm.h
+++ b/mace/ops/opencl/image/lpnorm.h
@@ -17,7 +17,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/ops/opencl/lpnorm.h"
 
 namespace mace {
diff --git a/mace/ops/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h
index 006374f9d099df01f866231f1756c97ec4b16190..998d8147675c2dd1b3ade1b782055a86117aea83 100644
--- a/mace/ops/opencl/image/lstm_cell.h
+++ b/mace/ops/opencl/image/lstm_cell.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h
index afd4792cba2eb3a33ccbf88959481ebb0cb3f225..8ee05239b798d8c8b6f660fa4aea335ded3549b7 100644
--- a/mace/ops/opencl/image/matmul.h
+++ b/mace/ops/opencl/image/matmul.h
@@ -24,7 +24,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/mvnorm.h b/mace/ops/opencl/image/mvnorm.h
index 9ff6d47f4547227dc54c22e4709b1ca26ff4d3b9..f6e609d27240612a0c53141ce409790b6b826234 100644
--- a/mace/ops/opencl/image/mvnorm.h
+++ b/mace/ops/opencl/image/mvnorm.h
@@ -20,7 +20,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/ops/opencl/mvnorm.h"
 
 namespace mace {
diff --git a/mace/ops/opencl/image/pad.h b/mace/ops/opencl/image/pad.h
index f4b8278bbbfc4f67e1e16622baac4517ac441fb6..3df88f34e31020a848ee34d9c958cf8bc0200b32 100644
--- a/mace/ops/opencl/image/pad.h
+++ b/mace/ops/opencl/image/pad.h
@@ -24,7 +24,7 @@
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
 #include "mace/ops/common/pad_type.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h
index 8d709368c8f9d2154cbd60eb07c1a9742fc2f506..5c0e14a52b544e65af82bfd05bcc2a939e9d2a1b 100644
--- a/mace/ops/opencl/image/pooling.h
+++ b/mace/ops/opencl/image/pooling.h
@@ -24,7 +24,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h
index 992ac1b1491c1ccfeba27ad39b743ab568354797..0dfb48b427a25df89e475e45873d0ec69197f95a 100644
--- a/mace/ops/opencl/image/reduce.h
+++ b/mace/ops/opencl/image/reduce.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/ops/common/reduce_type.h"
 
 namespace mace {
diff --git a/mace/ops/opencl/image/reshape.h b/mace/ops/opencl/image/reshape.h
index 4004fb5e904c4105f3ae5615e8dde37c557a62e1..60be5fe0272c8827ce95003613ba0e07ab025396 100644
--- a/mace/ops/opencl/image/reshape.h
+++ b/mace/ops/opencl/image/reshape.h
@@ -21,7 +21,7 @@
 #include <memory>
 
 #include "mace/core/operator.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/ops/opencl/buffer_transform_kernel.h"
 
 namespace mace {
diff --git a/mace/ops/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h
index cb215f19aa6a22fb3f919b2048b85e084c35667e..5abc553974e0c3fb1a4c2056ec140baf70e736cd 100644
--- a/mace/ops/opencl/image/resize_bicubic.h
+++ b/mace/ops/opencl/image/resize_bicubic.h
@@ -24,7 +24,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h
index 68b1478dc81d620cc2bde198b02c221913b7939f..ca3602d33942da03de3aa3f3cb093513af74a324 100644
--- a/mace/ops/opencl/image/resize_bilinear.h
+++ b/mace/ops/opencl/image/resize_bilinear.h
@@ -24,7 +24,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/resize_nearest_neighbor.h b/mace/ops/opencl/image/resize_nearest_neighbor.h
index 9e2cec61a822e4e86e139e6bfe299771a94794d6..8bb10d4b2fd56046a689beae3e9abb3f0671f05e 100644
--- a/mace/ops/opencl/image/resize_nearest_neighbor.h
+++ b/mace/ops/opencl/image/resize_nearest_neighbor.h
@@ -24,7 +24,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/softmax.h b/mace/ops/opencl/image/softmax.h
index 505dff57c9a7caf718a4a7f98ab3d6ffe58a5565..525f1edc51ee8bc7637a2c9c83ffa876d67ab4b2 100644
--- a/mace/ops/opencl/image/softmax.h
+++ b/mace/ops/opencl/image/softmax.h
@@ -24,7 +24,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h
index 6ad5d22833e2ff2104c974bd77f6da5c76af1ad3..20777dc88453bc1746aab4e50c2c20f98babecec 100644
--- a/mace/ops/opencl/image/space_to_batch.h
+++ b/mace/ops/opencl/image/space_to_batch.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/space_to_depth.h b/mace/ops/opencl/image/space_to_depth.h
index 324977ea45c518a4a7a46520f0b5626c82716ea2..661e09af222ebf8ae07082d4192878d8e4703f36 100644
--- a/mace/ops/opencl/image/space_to_depth.h
+++ b/mace/ops/opencl/image/space_to_depth.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h
index 956ff6573a60ed2050d5b526f58734cdc8fdff43..20e1936207dca72126efba0a1b80a3bafa149012 100644
--- a/mace/ops/opencl/image/split.h
+++ b/mace/ops/opencl/image/split.h
@@ -24,7 +24,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h
index bd2d1e7f394e7ed98eb8bd4e948da32e615349be..5acddb2556946b42ad1062ce6ec8c7bcf255e2cf 100644
--- a/mace/ops/opencl/image/sqrdiff_mean.h
+++ b/mace/ops/opencl/image/sqrdiff_mean.h
@@ -23,7 +23,7 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc
index 1ea2634a022e7614bcc600e3e34827e7a4aa8338..fd7cdfe67f1f37b4f1701d77d28f0759829594dc 100644
--- a/mace/ops/opencl/image/winograd_conv2d.cc
+++ b/mace/ops/opencl/image/winograd_conv2d.cc
@@ -16,7 +16,7 @@
 #include "mace/core/op_context.h"
 #include "mace/ops/common/activation_type.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/utils/memory.h"
 #include "mace/utils/math.h"
 
diff --git a/mace/tools/mace_run.cc b/mace/tools/mace_run.cc
index 55f2931795893a3181e86828c19046380d7c991d..6d025026d728825c8c55dd44b60832840d8879c9 100644
--- a/mace/tools/mace_run.cc
+++ b/mace/tools/mace_run.cc
@@ -175,9 +175,9 @@ bool RunModel(const std::string &model_name,
   if (status != MaceStatus::MACE_SUCCESS) {
     LOG(WARNING) << "Set openmp or cpu affinity failed.";
   }
-#ifdef MACE_ENABLE_OPENCL
+#if defined(MACE_ENABLE_OPENCL) || defined(MACE_ENABLE_HTA)
   std::shared_ptr<GPUContext> gpu_context;
-  if (device_type == DeviceType::GPU) {
+  if (device_type == DeviceType::GPU || device_type == DeviceType::HTA) {
     const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
     const std::string storage_path =
         std::string(storage_path_ptr == nullptr ?
diff --git a/test/ccunit/BUILD.bazel b/test/ccunit/BUILD.bazel
index d3039c0067793111783cda70398f8ac7a35f094d..ce4d268cd3ba86b2badb605a4b21d1d10422afbc 100644
--- a/test/ccunit/BUILD.bazel
+++ b/test/ccunit/BUILD.bazel
@@ -9,6 +9,7 @@ load(
     "if_android",
     "if_android_armv7",
     "if_hexagon_enabled",
+    "if_hta_enabled",
     "if_neon_enabled",
     "if_opencl_enabled",
     "if_openmp_enabled",
@@ -41,7 +42,9 @@ cc_test(
         [
             "mace/ops/opencl/*.cc",
         ]
-    )),
+    )) + if_hta_enabled([
+        "mace/core/runtime/hexagon/hta_transform_test.cc",
+    ]),
     copts = [
         "-Werror",
         "-Wextra",
@@ -59,6 +62,8 @@ cc_test(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]),
     linkopts = if_openmp_enabled([
         "-fopenmp",
diff --git a/test/ccunit/CMakeLists.txt b/test/ccunit/CMakeLists.txt
index 21107815e9b604e4a0eb616db4d9c20c80186c6c..ae83e1a3aee54e0bb0bc9998607e38f1a22887f6 100644
--- a/test/ccunit/CMakeLists.txt
+++ b/test/ccunit/CMakeLists.txt
@@ -7,6 +7,10 @@ file(GLOB MACE_CC_TEST_SRCS
   mace/libmace/*.cc
 )
 
+if(MACE_ENABLE_HTA)
+  set(MACE_CC_TEST_SRCS ${MACE_CC_TEST_SRCS} mace/core/runtime/hexagon/hta_transform_test.cc)
+endif(MACE_ENABLE_HTA)
+
 add_executable(mace_cc_test ${MACE_CC_TEST_SRCS})
 target_link_libraries(mace_cc_test PUBLIC
   mace_cc_test_utils
diff --git a/test/ccunit/mace/core/runtime/hexagon/hta_transform_test.cc b/test/ccunit/mace/core/runtime/hexagon/hta_transform_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d123c8bd1e9955b58a4dd761e9705846e7d0af9
--- /dev/null
+++ b/test/ccunit/mace/core/runtime/hexagon/hta_transform_test.cc
@@ -0,0 +1,152 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/quantize.h"
+#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+class HTATransformTest : public OpsTestBase {};
+
+namespace {
+template <DeviceType D>
+void TestHTAQuantizeDequantize(const std::vector<float> &input) {
+  float min_val, max_val;
+  FindMinMax(input.data(), input.size(), &min_val, &max_val);
+  float scale;
+  int32_t zero;
+  AdjustRange<uint8_t>(min_val, max_val, false, &scale, &zero);
+
+  OpsTestNet net;
+    Device *device = OpTestContext::Get()->GetDevice(D);
+
+  net.AddInputFromArray<D, float>("Input",
+                                  {static_cast<index_t>(input.size())},
+                                  input);
+  Tensor *input_tensor = net.GetOutput("Input");
+  input_tensor->SetScale(scale);
+  input_tensor->SetZeroPoint(zero);
+  Tensor *quantized_output = net.ws()->CreateTensor(
+      "QuantizedOutput", device->allocator(), DT_UINT8);
+  Tensor *dequantized_output = net.ws()->CreateTensor(
+      "DequantizedOutput", device->allocator(), DT_FLOAT);
+
+  mace::HexagonHTATranformer<D> transformer;
+  transformer.Init(device);
+  transformer.Quantize(input_tensor, quantized_output);
+  transformer.Dequantize(quantized_output, dequantized_output);
+
+  ExpectTensorNear<float>(*input_tensor,
+                          *dequantized_output,
+                          0.1);
+}
+
+}  // namespace
+
+TEST_F(HTATransformTest, TestHTAQuantize) {
+  TestHTAQuantizeDequantize<CPU>({-2, -1, 0, 1, 2, 3, 4});
+  TestHTAQuantizeDequantize<GPU>({-2, -1, 0, 1, 2, 3, 4});
+}
+
+namespace {
+void TestHTAInputTransform(const std::vector<index_t> &input_shape,
+                           const hexagon_hta_hw_layout format) {
+  OpsTestNet net;
+  Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
+  net.AddRandomInput<GPU, uint8_t>("Input", input_shape);
+  Tensor *input_tensor = net.GetOutput("Input");
+  input_tensor->SetScale(0.1);
+  input_tensor->SetZeroPoint(1);
+  Tensor *cpu_transformed_tensor = net.ws()->CreateTensor(
+      "CpuTransformedOutput", device->allocator(), DT_UINT8);
+  Tensor *gpu_transformed_tensor = net.ws()->CreateTensor(
+      "GpuTransformedOutput", device->allocator(), DT_UINT8);
+
+  mace::HexagonHTATranformer<CPU> cpu_transformer;
+  mace::HexagonHTATranformer<GPU> gpu_transformer;
+  cpu_transformer.Init(device);
+  gpu_transformer.Init(device);
+  cpu_transformer.SetInputTransformer(format);
+  gpu_transformer.SetInputTransformer(format);
+  cpu_transformer.TransformInput(input_tensor, cpu_transformed_tensor, 0);
+  gpu_transformer.TransformInput(input_tensor, gpu_transformed_tensor, 0);
+
+  net.Sync();
+  ExpectTensorNear<uint8_t>(*cpu_transformed_tensor, *gpu_transformed_tensor);
+}
+
+}  // namespace
+
+TEST_F(HTATransformTest, TestHTAInputTransform) {
+  TestHTAInputTransform({1, 15, 33, 2}, HEXAGON_HTA_HW_FORMAT_PLANAR);
+  TestHTAInputTransform({1, 19, 31, 3}, HEXAGON_HTA_HW_FORMAT_PLANAR);
+  TestHTAInputTransform({1, 224, 224, 3}, HEXAGON_HTA_HW_FORMAT_PLANAR);
+  TestHTAInputTransform({1, 19, 31, 3}, HEXAGON_HTA_HW_FORMAT_D32);
+  TestHTAInputTransform({1, 15, 33, 27}, HEXAGON_HTA_HW_FORMAT_D32);
+  TestHTAInputTransform({1, 15, 33, 35}, HEXAGON_HTA_HW_FORMAT_D32);
+  TestHTAInputTransform({1, 224, 224, 3}, HEXAGON_HTA_HW_FORMAT_D32);
+}
+
+
+namespace {
+void TestHTAOutputTransform(const std::vector<index_t> &output_shape,
+                            const hexagon_hta_hw_layout format) {
+  index_t batch = output_shape[0];
+  index_t height = output_shape[1];
+  index_t width = output_shape[2];
+  index_t channels = output_shape[3];
+  MACE_CHECK(format == HEXAGON_HTA_HW_FORMAT_D32);
+  std::vector<index_t> input_shape {
+      batch, height, RoundUpDiv<index_t>(channels, 32), width, 32};
+
+  OpsTestNet net;
+  Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
+  net.AddRandomInput<GPU, uint8_t>("Input", input_shape);
+  Tensor *input_tensor = net.GetOutput("Input");
+  Tensor *cpu_transformed_tensor = net.ws()->CreateTensor(
+      "CpuTransformedOutput", device->allocator(), DT_UINT8);
+  Tensor *gpu_transformed_tensor = net.ws()->CreateTensor(
+      "GpuTransformedOutput", device->allocator(), DT_UINT8);
+  cpu_transformed_tensor->Resize(output_shape);
+  gpu_transformed_tensor->Resize(output_shape);
+
+  mace::HexagonHTATranformer<CPU> cpu_transformer;
+  mace::HexagonHTATranformer<GPU> gpu_transformer;
+  cpu_transformer.Init(device);
+  gpu_transformer.Init(device);
+  cpu_transformer.SetOutputTransformer(format);
+  gpu_transformer.SetOutputTransformer(format);
+  cpu_transformer.TransformOutput(input_tensor, cpu_transformed_tensor, 0);
+  gpu_transformer.TransformOutput(input_tensor, gpu_transformed_tensor, 0);
+
+  net.Sync();
+  ExpectTensorNear<uint8_t>(*cpu_transformed_tensor, *gpu_transformed_tensor);
+}
+
+}  // namespace
+
+TEST_F(HTATransformTest, TestHTAOutputTransform) {
+  TestHTAOutputTransform({1, 15, 33, 2}, HEXAGON_HTA_HW_FORMAT_D32);
+  TestHTAOutputTransform({1, 19, 31, 27}, HEXAGON_HTA_HW_FORMAT_D32);
+  TestHTAOutputTransform({1, 19, 31, 35}, HEXAGON_HTA_HW_FORMAT_D32);
+  TestHTAOutputTransform({1, 224, 224, 2}, HEXAGON_HTA_HW_FORMAT_D32);
+  TestHTAOutputTransform({1, 384, 384, 3}, HEXAGON_HTA_HW_FORMAT_D32);
+}
+
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
diff --git a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
index 5ee423d335b14eeda11c2148222e0e9fd854e925..3dfe468a8db889418c48a15776e79adccadf9319 100644
--- a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
+++ b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
@@ -21,7 +21,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
-#include "mace/ops/opencl/helper.h"
+#include "mace/core/runtime/opencl/opencl_helper.h"
 #include "mace/utils/memory.h"
 
 namespace mace {
diff --git a/test/ccunit/mace/ops/quantize_test.cc b/test/ccunit/mace/ops/quantize_test.cc
index ecfe5f5d07b638154fabb481348f4164b505fe5f..b6a86841717614ca2287909c296ad0573198cc0e 100644
--- a/test/ccunit/mace/ops/quantize_test.cc
+++ b/test/ccunit/mace/ops/quantize_test.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "mace/core/quantize.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/test/ccutils/mace/ops/testing/test_utils.h b/test/ccutils/mace/ops/testing/test_utils.h
index 6a0a045b6326a67689f9755bc911a2f54fbc798a..ef830781d7fd597599180d8882c47eda1800a3e8 100644
--- a/test/ccutils/mace/ops/testing/test_utils.h
+++ b/test/ccutils/mace/ops/testing/test_utils.h
@@ -250,8 +250,24 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
     Tensor::MappingGuard y_mapper(&y);
     auto a = x.data<EXP_TYPE>();
     auto b = y.data<RES_TYPE>();
-    for (int i = 0; i < x.size(); ++i) {
-      ExpectEqual(a[i], b[i]);
+    if (x.dim_size() == 4) {
+      for (int n = 0; n < x.dim(0); ++n) {
+        for (int h = 0; h < x.dim(1); ++h) {
+          for (int w = 0; w < x.dim(2); ++w) {
+            for (int c = 0; c < x.dim(3); ++c) {
+              EXPECT_EQ(*a, *b) << "with index = [" << n << ", " << h << ", "
+                                << w << ", " << c << "]";
+              a++;
+              b++;
+            }
+          }
+        }
+      }
+    } else {
+      for (int i = 0; i < x.size(); ++i) {
+        EXPECT_EQ(a[i], b[i])
+            << "a = " << a << " b = " << b << " index = " << i;
+      }
     }
   }
 
diff --git a/third_party/nnlib/arm64-v8a/libhexagon_controller.so b/third_party/nnlib/arm64-v8a/libhexagon_controller.so
index 5b17a007b440e9e068925ca3f347dc8d94ccb55a..5e2e85a1341ba040885086e45d8b03e85ae5c6d6 100755
Binary files a/third_party/nnlib/arm64-v8a/libhexagon_controller.so and b/third_party/nnlib/arm64-v8a/libhexagon_controller.so differ
diff --git a/third_party/nnlib/armeabi-v7a/libhexagon_controller.so b/third_party/nnlib/armeabi-v7a/libhexagon_controller.so
index fc3e89b99f531afbfddba918c5ee4c8ffddb05ac..1ac9ae9d00ed1e3f8ee0acb04bb006a1d625bbcc 100755
Binary files a/third_party/nnlib/armeabi-v7a/libhexagon_controller.so and b/third_party/nnlib/armeabi-v7a/libhexagon_controller.so differ
diff --git a/third_party/nnlib/hexnn_dsp_controller.h b/third_party/nnlib/hexnn_dsp_controller.h
index 671a6c68a13889a7179acad5f709f49dad68e292..8818217d220729238af7d0b18bd1d6c06d8bfe6d 100644
--- a/third_party/nnlib/hexnn_dsp_controller.h
+++ b/third_party/nnlib/hexnn_dsp_controller.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2019, The Linux Foundation. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted (subject to the limitations in the
@@ -42,10 +42,6 @@ extern "C" {
 
 int hexnn_controller_request_unsigned_pd();
 
-int hexnn_controller_init();
-
-int hexnn_controller_deinit();
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/third_party/rpcmem/arm64-v8a/rpcmem.a b/third_party/rpcmem/arm64-v8a/rpcmem.a
index 982714b75645f2a408f7fde000376cf32d18055b..9bc7dea567e21127a9eede04904b678c881d2b7f 100644
Binary files a/third_party/rpcmem/arm64-v8a/rpcmem.a and b/third_party/rpcmem/arm64-v8a/rpcmem.a differ
diff --git a/third_party/rpcmem/armeabi-v7a/rpcmem.a b/third_party/rpcmem/armeabi-v7a/rpcmem.a
index faa1baa5f8e7a689e66ac126df0446c27d08e061..85f991e897b53977cfd02ce9fa12edb3953bfece 100644
Binary files a/third_party/rpcmem/armeabi-v7a/rpcmem.a and b/third_party/rpcmem/armeabi-v7a/rpcmem.a differ
diff --git a/third_party/rpcmem/rpcmem.h b/third_party/rpcmem/rpcmem.h
index f0296a2212d8fc57e64dbcee1c6cde3b5b0e821c..ac1889e02db582e702fe119a3be4d0e166416e46 100755
--- a/third_party/rpcmem/rpcmem.h
+++ b/third_party/rpcmem/rpcmem.h
@@ -6,6 +6,8 @@
 #ifndef RPCMEM_H
 #define RPCMEM_H
 
+#include <stdint.h>
+
 /**
  * RPCMEM_DEFAULT_HEAP
  * Dynamicaly select the heap to use.  This should be ok for most usecases.
@@ -77,16 +79,25 @@
 extern "C" {
 #endif
 
+typedef struct rpcmem rpcmem;
+struct rpcmem {
+  void *lst;  // QList*
+  void *mt;   // pthread_mutex_t*
+  int ionfd;
+  int flag;
+  int ionversion;
+};
+
 /**
  * call once to initialize the library
- * NOTE: rpcmem_init is not thread safe
+ * NOTE: rpcmem_init is now thread safe
  */
-void rpcmem_init(void);
+void rpcmem_init(rpcmem *rm);
 /**
  * call once for cleanup
- * NOTE: rpcmem_deinit is not thread safe
+ * NOTE: rpcmem_deinit is now thread safe
  */
-void rpcmem_deinit(void);
+void rpcmem_deinit(rpcmem *rm);
 
 /**
  * Allocate via ION a buffer of size
@@ -96,10 +107,10 @@ void rpcmem_deinit(void);
  * @retval, 0 on failure, pointer to buffer on success
  *
  * For example:
- *    buf = rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
+ *    buf = rpcmem_alloc(rm, RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
  */
 
-void* rpcmem_alloc(int heapid, unsigned int flags, int size);
+void* rpcmem_alloc(rpcmem *rm, int heapid, uint32_t flags, int size);
 
 /**
  * allocate with default settings
@@ -107,24 +118,24 @@ void* rpcmem_alloc(int heapid, unsigned int flags, int size);
  #if !defined(WINNT) && !defined (_WIN32_WINNT)
 __attribute__((unused))
 #endif
-static __inline void* rpcmem_alloc_def(int size) {
-   return rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
+static __inline void* rpcmem_alloc_def(rpcmem *rm, int size) {
+  return rpcmem_alloc(rm, RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
 }
 
 /**
  * free buffer, ignores invalid buffers
  */
-void rpcmem_free(void* po);
+void rpcmem_free(rpcmem *rm, void* po);
 
 /**
  * returns associated fd
  */
-int rpcmem_to_fd(void* po);
+int rpcmem_to_fd(rpcmem *rm, void* po);
 
 /**
  * cache coherency management
  */
-int rpcmem_sync_cache(void* po, unsigned int flags);
+int rpcmem_sync_cache(rpcmem *rm, void* po, uint32_t flags);
 
 #ifdef __cplusplus
 }
diff --git a/tools/bazel-build-standalone-lib.sh b/tools/bazel-build-standalone-lib.sh
index 8a07811373c996793e7652fab21379086c146945..f01dd06510d19f747f44303a5f949276962ba0a1 100755
--- a/tools/bazel-build-standalone-lib.sh
+++ b/tools/bazel-build-standalone-lib.sh
@@ -21,6 +21,11 @@ mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_dsp
 mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu
 mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_apu
 
+if [[ "$BUILD_HTA" == "1" ]]; then
+    mkdir -p $LIB_DIR/armeabi-v7a/cpu_gpu_hta
+    mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_hta
+fi
+
 rm -rf $LIB_DIR/linux-x86-64
 mkdir -p $LIB_DIR/linux-x86-64
 
@@ -33,6 +38,18 @@ mkdir -p $LIB_DIR/aarch64_linux_gnu/cpu_gpu
 
 
 # build shared libraries
+if [[ "$BUILD_HTA" == "1" ]]; then
+	echo "build shared lib for armeabi-v7a + cpu_gpu_hta"
+	bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
+	cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
+	cp third_party/hta/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
+
+	echo "build shared lib for arm64-v8a + cpu_gpu_hta"
+	bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
+	cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu_hta/
+	cp third_party/hta/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_hta/
+fi
+
 echo "build shared lib for armeabi-v7a + cpu_gpu_dsp"
 bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
@@ -71,6 +88,18 @@ if [[ "$OSTYPE" != "darwin"* ]];then
 fi
 
 # build static libraries
+if [[ "$BUILD_HTA" == "1" ]]; then
+	echo "build static lib for armeabi-v7a + cpu_gpu_hta"
+	bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
+	cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
+	cp third_party/hta/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
+
+	echo "build static lib for arm64-v8a + cpu_gpu_hta"
+	bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
+	cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu_hta/
+	cp third_party/hta/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_hta/
+fi
+
 echo "build static lib for armeabi-v7a + cpu_gpu_dsp"
 bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 1679f604d85cac55b47865a22fd87f021931a227..a0bed2496ad02066a14c7bbb4f89474c726c9a04 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -100,6 +100,11 @@ def parse_args():
         type=str2bool,
         default=True,
         help="Whether to use rpcmem")
+    parser.add_argument(
+        "--enable_hta",
+        type=str2bool,
+        default=False,
+        help="Whether to use hta")
     parser.add_argument(
         '--address_sanitizer',
         action="store_true",
@@ -170,6 +175,7 @@ def main(unused_args):
             enable_neon=FLAGS.enable_neon,
             enable_quantize=FLAGS.enable_quantize,
             enable_rpcmem=FLAGS.enable_rpcmem,
+            enable_hta=FLAGS.enable_hta,
             address_sanitizer=FLAGS.address_sanitizer,
             debug_mode=FLAGS.debug_mode)
         if FLAGS.run_target:
diff --git a/tools/converter.py b/tools/converter.py
index 324907943c9076d4291be201eb77847d84013564..dd9a6cbc9218eef8fa771210b9e2ae0eb2ebbbc5 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -218,7 +218,8 @@ def get_opencl_mode(configs):
                 YAMLKeyword.runtime, "")
         runtime_list.append(model_runtime.lower())
 
-    if RuntimeType.gpu in runtime_list or RuntimeType.cpu_gpu in runtime_list:
+    if RuntimeType.gpu in runtime_list or RuntimeType.cpu_gpu in runtime_list \
+            or RuntimeType.hta in runtime_list:
         return True
     return False
 
diff --git a/tools/python/transform/hexagon_converter.py b/tools/python/transform/hexagon_converter.py
index 3a99c5cfa1a23c4be1a567897de26071859bb936..6d6d8eaf27cc65015291e19f6bb55d0c3b02ea5e 100644
--- a/tools/python/transform/hexagon_converter.py
+++ b/tools/python/transform/hexagon_converter.py
@@ -295,6 +295,20 @@ class HexagonConverter(base_converter.ConverterInterface):
             else:
                 index += 1
 
+        if self._option.device == DeviceType.HTA.value:
+            # replace QuantizeINPUT_f_to_8 with INPUT
+            quantize_input_op.type = HexagonOp.INPUT.name
+            del quantize_input_op.output_shape[1:]
+            del quantize_input_op.output_type[1:]
+            del quantize_input_op.out_max_byte_size[1:]
+
+            # replace first op's input min max with constant
+            self.add_constant_min_max_for_first_op(self._model.op[1])
+
+            # replace DequantizeOUTPUT_8tof with OUTPUT
+            dequantize_output_op.type = HexagonOp.OUTPUT.name
+            del dequantize_output_op.input[1:]
+
         return quantize_input_op.output
 
     def add_node_id(self, model_inputs):
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 831d015d1b3613ce5088ff91adb5aab36f3bae55..219b8502fac4dd44f18886c2727b1633abd907ab 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -605,18 +605,20 @@ def create_internal_storage_dir(serialno, phone_data_dir):
 
 def push_depended_so_libs(libmace_dynamic_library_path,
                           abi, phone_data_dir, serialno):
-    dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends",
-                          libmace_dynamic_library_path)
-    src_file = ""
-    for dep in split_stdout(dep_so_libs):
-        if dep == "libgnustl_shared.so":
-            src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \
-                       "%s/libgnustl_shared.so" \
-                       % (os.environ["ANDROID_NDK_HOME"], abi)
-        elif dep == "libc++_shared.so":
-            src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \
-                       "%s/libc++_shared.so"\
-                       % (os.environ["ANDROID_NDK_HOME"], abi)
+    src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \
+               "%s/libc++_shared.so" \
+               % (os.environ["ANDROID_NDK_HOME"], abi)
+    try:
+        dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends",
+                              libmace_dynamic_library_path)
+    except sh.ErrorReturnCode_127:
+        print("Find no ndk-depends, use default libc++_shared.so")
+    else:
+        for dep in split_stdout(dep_so_libs):
+            if dep == "libgnustl_shared.so":
+                src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \
+                           "%s/libgnustl_shared.so" \
+                           % (os.environ["ANDROID_NDK_HOME"], abi)
     print("push %s to %s" % (src_file, phone_data_dir))
     adb_push(src_file, phone_data_dir, serialno)