Commit 28954099 authored by Bin Li, committed by luxuhui

Support HTA custom API

Parent f4061000
......@@ -28,21 +28,37 @@ cc_library(
"*.cc",
"runtime/cpu/*.cc",
],
exclude = [
"rpcmem.cc",
],
) + if_opencl_enabled(glob(
[
"runtime/opencl/*.cc",
],
)) + if_hexagon_enabled([
)) + if_hexagon_or_hta_enabled([
"runtime/hexagon/hexagon_allocator.h",
"runtime/hexagon/hexagon_allocator.cc",
"runtime/hexagon/hexagon_device.cc",
]) + if_hexagon_enabled([
"runtime/hexagon/hexagon_dsp_wrapper.cc",
]) + if_hta_enabled([
"runtime/hexagon/hexagon_hta_transformer.h",
"runtime/hexagon/hexagon_hta_transformer.cc",
"runtime/hexagon/hexagon_hta_wrapper.cc",
]) + if_apu_enabled(glob([
"runtime/apu/*.cc",
])),
hdrs = glob([
"*.h",
"runtime/cpu/*.h",
]) + if_opencl_enabled(glob([
])) + if_rpcmem_enabled([
"rpcmem.cc",
]),
hdrs = glob(
[
"*.h",
"runtime/cpu/*.h",
],
exclude = [
"rpcmem.h",
],
) + if_opencl_enabled(glob([
"runtime/opencl/*.h",
])) + if_hexagon_or_hta_enabled(glob([
"runtime/hexagon/hexagon_control_wrapper.h",
......@@ -53,7 +69,9 @@ cc_library(
"runtime/hexagon/*hta*.h",
])) + if_apu_enabled(glob([
"runtime/apu/*.h"
])),
])) + if_rpcmem_enabled([
"rpcmem.h",
]),
copts = [
"-Werror",
"-Wextra",
......
......@@ -17,11 +17,17 @@ set(CORE_SRCS
runtime/cpu/cpu_runtime.cc
)
if(MACE_ENABLE_RPCMEM)
set(CORE_SRCS ${CORE_SRCS} rpcmem.cc)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} rpcmem)
endif(MACE_ENABLE_RPCMEM)
if(MACE_ENABLE_OPENCL)
set(CORE_SRCS ${CORE_SRCS}
runtime/opencl/gpu_device.cc
runtime/opencl/gpu_runtime.cc
runtime/opencl/opencl_allocator.cc
runtime/opencl/opencl_helper.cc
runtime/opencl/opencl_runtime.cc
runtime/opencl/opencl_util.cc
runtime/opencl/opencl_wrapper.cc
......@@ -30,12 +36,20 @@ if(MACE_ENABLE_OPENCL)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} generated_opencl_kernel dl)
endif(MACE_ENABLE_OPENCL)
if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
set(CORE_SRCS ${CORE_SRCS}
runtime/hexagon/hexagon_allocator.cc
runtime/hexagon/hexagon_device.cc
)
endif(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
if(MACE_ENABLE_HEXAGON_DSP)
set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_dsp_wrapper.cc)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hexagon_controller)
endif(MACE_ENABLE_HEXAGON_DSP)
if(MACE_ENABLE_HEXAGON_HTA)
set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_hta_transformer.cc)
set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_hta_wrapper.cc)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hta_hexagon_runtime)
endif(MACE_ENABLE_HEXAGON_HTA)
......@@ -45,10 +59,6 @@ if(MACE_ENABLE_MTK_APU)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} apu-frontend)
endif(MACE_ENABLE_MTK_APU)
if(MACE_ENABLE_RPCMEM)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} rpcmem)
endif(MACE_ENABLE_RPCMEM)
add_library(core STATIC ${CORE_SRCS})
target_link_libraries(core PRIVATE
proto
......
......@@ -16,6 +16,54 @@
namespace mace {
MaceStatus Allocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt,
void **result) {
MACE_UNUSED(image_shape);
MACE_UNUSED(dt);
MACE_UNUSED(result);
MACE_NOT_IMPLEMENTED;
return MaceStatus::MACE_SUCCESS;
}
void Allocator::DeleteImage(void *data) {
MACE_UNUSED(data);
MACE_NOT_IMPLEMENTED;
}
void *Allocator::Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) {
MACE_UNUSED(nbytes);
MACE_UNUSED(finish_cmd_queue);
return reinterpret_cast<char*>(buffer) + offset;
}
void *Allocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) {
MACE_UNUSED(buffer);
MACE_UNUSED(image_shape);
MACE_UNUSED(mapped_image_pitch);
MACE_UNUSED(finish_cmd_queue);
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void Allocator::Unmap(void *buffer, void *mapper_ptr) {
MACE_UNUSED(buffer);
MACE_UNUSED(mapper_ptr);
}
#ifdef MACE_ENABLE_RPCMEM
Rpcmem *Allocator::rpcmem() {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
#endif // MACE_ENABLE_RPCMEM
Allocator *GetCPUAllocator() {
static CPUAllocator allocator;
return &allocator;
......
......@@ -27,6 +27,10 @@
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_RPCMEM
#include "mace/core/rpcmem.h"
#endif // MACE_ENABLE_RPCMEM
namespace mace {
#if defined(__hexagon__)
......@@ -50,19 +54,22 @@ class Allocator {
virtual MaceStatus New(size_t nbytes, void **result) = 0;
virtual MaceStatus NewImage(const std::vector<size_t> &image_shape,
const DataType dt,
void **result) = 0;
void **result);
virtual void Delete(void *data) = 0;
virtual void DeleteImage(void *data) = 0;
virtual void DeleteImage(void *data);
virtual void *Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) const = 0;
bool finish_cmd_queue);
virtual void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) const = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
bool finish_cmd_queue);
virtual void Unmap(void *buffer, void *mapper_ptr);
virtual bool OnHost() const = 0;
#ifdef MACE_ENABLE_RPCMEM
virtual Rpcmem *rpcmem();
#endif // MACE_ENABLE_RPCMEM
};
class CPUAllocator : public Allocator {
......@@ -84,46 +91,12 @@ class CPUAllocator : public Allocator {
return MaceStatus::MACE_SUCCESS;
}
MaceStatus NewImage(const std::vector<size_t> &shape,
const DataType dt,
void **result) override {
MACE_UNUSED(shape);
MACE_UNUSED(dt);
MACE_UNUSED(result);
LOG(FATAL) << "Allocate CPU image";
return MaceStatus::MACE_SUCCESS;
}
void Delete(void *data) override {
MACE_CHECK_NOTNULL(data);
VLOG(3) << "Free CPU buffer";
free(data);
}
void DeleteImage(void *data) override {
LOG(FATAL) << "Free CPU image";
free(data);
};
void *Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) const override {
MACE_UNUSED(nbytes);
MACE_UNUSED(finish_cmd_queue);
return reinterpret_cast<char*>(buffer) + offset;
}
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) const override {
MACE_UNUSED(image_shape);
MACE_UNUSED(mapped_image_pitch);
MACE_UNUSED(finish_cmd_queue);
return buffer;
}
void Unmap(void *buffer, void *mapper_ptr) const override {
MACE_UNUSED(buffer);
MACE_UNUSED(mapper_ptr);
}
bool OnHost() const override { return true; }
};
......
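The change above demotes NewImage, DeleteImage, Map, MapImage, and Unmap from pure virtuals to overridable base-class defaults (the MACE_NOT_IMPLEMENTED stubs added in allocator.cc), so a device allocator now only has to implement the calls it actually supports. A minimal sketch under that assumption; MyBufferAllocator is a hypothetical name, not part of this commit:

#include <cstdlib>
#include "mace/core/allocator.h"

namespace mace {
class MyBufferAllocator : public Allocator {
 public:
  MaceStatus New(size_t nbytes, void **result) override {
    *result = malloc(nbytes);
    return *result != nullptr ? MaceStatus::MACE_SUCCESS
                              : MaceStatus::MACE_OUT_OF_RESOURCES;
  }
  void Delete(void *data) override { free(data); }
  bool OnHost() const override { return true; }
  // NewImage/DeleteImage/MapImage inherit the MACE_NOT_IMPLEMENTED defaults;
  // Map/Unmap inherit plain pointer arithmetic from the base class.
};
}  // namespace mace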
......@@ -117,9 +117,12 @@ inline void GetOutputMultiplierAndShift(
template<typename F, typename Q>
class QuantizeUtil {
public:
QuantizeUtil() = default;
explicit QuantizeUtil(utils::ThreadPool *thread_pool)
: thread_pool_(thread_pool) {}
void Init(utils::ThreadPool *thread_pool) { thread_pool_ = thread_pool; }
void QuantizeWithScaleAndZeropoint(const float *input,
const index_t size,
float scale,
......
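QuantizeUtil gains a default constructor and an Init() method, enabling two-phase initialization: HexagonHTAWrapper no longer has to bind a thread pool in its member initializer list now that quantization has moved into the HTA transformer. A hedged usage sketch, where device stands in for any Device* with a CPU runtime:

mace::QuantizeUtil<float, uint8_t> quantize_util;           // thread pool unset
quantize_util.Init(&device->cpu_runtime()->thread_pool());  // bound later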
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/rpcmem.h"
#include "mace/utils/logging.h"
namespace mace {
Rpcmem::Rpcmem() {
rpcmem_init(&rm);
MACE_CHECK(rm.flag == 0, "rpcmem_init failed!");
}
Rpcmem::~Rpcmem() {
rpcmem_deinit(&rm);
}
void *Rpcmem::New(int heapid, uint32_t flags, int nbytes) {
return rpcmem_alloc(&rm, heapid, flags, nbytes);
}
void *Rpcmem::New(int nbytes) {
return New(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED, nbytes);
}
void Rpcmem::Delete(void *data) {
rpcmem_free(&rm, data);
}
int Rpcmem::ToFd(void *data) {
return rpcmem_to_fd(&rm, data);
}
int Rpcmem::SyncCacheStart(void *data) {
return rpcmem_sync_cache(&rm, data, RPCMEM_SYNC_START);
}
int Rpcmem::SyncCacheEnd(void *data) {
return rpcmem_sync_cache(&rm, data, RPCMEM_SYNC_END);
}
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RPCMEM_H_
#define MACE_CORE_RPCMEM_H_
#include "third_party/rpcmem/rpcmem.h"
namespace mace {
class Rpcmem {
public:
Rpcmem();
~Rpcmem();
void *New(int heapid, uint32_t flags, int nbytes);
void *New(int nbytes);
void Delete(void *data);
int ToFd(void *data);
int SyncCacheStart(void *data);
int SyncCacheEnd(void *data);
private:
rpcmem rm;
};
} // namespace mace
#endif // MACE_CORE_RPCMEM_H_
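The two new files wrap the third_party rpcmem C API in an RAII class: rpcmem_init runs in the constructor and rpcmem_deinit in the destructor, replacing the std::call_once/std::atexit pairing previously buried in OpenCLAllocator. A minimal lifecycle sketch, assuming a device where ION allocation succeeds:

#include "mace/core/rpcmem.h"

void RpcmemLifecycleSketch() {
  mace::Rpcmem rpcmem;            // rpcmem_init
  void *buf = rpcmem.New(4096);   // RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED
  int fd = rpcmem.ToFd(buf);      // ION fd an accelerator driver can import
  (void)fd;
  rpcmem.SyncCacheStart(buf);     // open a CPU cache window ...
  // ... CPU reads/writes buf here ...
  rpcmem.SyncCacheEnd(buf);       // ... then flush for the accelerator
  rpcmem.Delete(buf);             // rpcmem_free
}                                 // ~Rpcmem -> rpcmem_deinit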
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/hexagon/hexagon_allocator.h"
namespace mace {
MaceStatus HexagonAllocator::New(size_t nbytes, void **result) {
*result = rpcmem_.New(nbytes);
MACE_CHECK_NOTNULL(*result);
memset(*result, 0, nbytes);
return MaceStatus::MACE_SUCCESS;
}
void HexagonAllocator::Delete(void *data) {
rpcmem_.Delete(data);
}
bool HexagonAllocator::OnHost() const {
return true;
}
Rpcmem *HexagonAllocator::rpcmem() {
return &rpcmem_;
}
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_
#include "mace/core/allocator.h"
namespace mace {
class HexagonAllocator : public Allocator {
public:
MaceStatus New(size_t nbytes, void **result) override;
void Delete(void *buffer) override;
bool OnHost() const override;
Rpcmem *rpcmem() override;
private:
Rpcmem rpcmem_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_
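HexagonAllocator plugs ION memory into the generic Allocator interface: New() returns zero-filled rpcmem buffers, and the rpcmem() accessor lets code that only holds a buffer as a void* recover its file descriptor, which is how HexagonHTAWrapper fills hexagon_hta_nn_hw_tensordef::fd below. A hedged sketch of that pattern:

#include "mace/core/runtime/hexagon/hexagon_allocator.h"

void HexagonAllocatorSketch() {
  mace::HexagonAllocator allocator;
  void *buffer = nullptr;
  if (allocator.New(1024, &buffer) == mace::MaceStatus::MACE_SUCCESS) {
    // The Rpcmem instance that allocated the buffer can also translate it.
    int fd = allocator.rpcmem()->ToFd(buffer);
    (void)fd;  // e.g. registered with the HTA via a tensordef
    allocator.Delete(buffer);
  }
}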
......@@ -25,27 +25,15 @@
#include "mace/public/mace.h"
namespace mace {
struct InOutInfo {
InOutInfo(const std::string &name,
const std::vector<index_t> &shape,
const DataType data_type,
const float scale,
const int32_t zero_point,
std::unique_ptr<Tensor> tensor_u8)
: name(name),
shape(shape),
data_type(data_type),
scale(scale),
zero_point(zero_point),
tensor_u8(std::move(tensor_u8)) {}
const DataType data_type)
: name(name), shape(shape), data_type(data_type) {}
std::string name;
std::vector<index_t> shape;
DataType data_type;
float scale;
int32_t zero_point;
std::unique_ptr<Tensor> tensor_u8;
};
class HexagonControlWrapper {
......@@ -79,8 +67,6 @@ class HexagonControlWrapper {
int nn_id_;
std::vector<InOutInfo> input_info_;
std::vector<InOutInfo> output_info_;
int num_inputs_;
int num_outputs_;
......
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/hexagon/hexagon_device.h"
namespace mace {
HexagonDevice::HexagonDevice(DeviceType device_type,
utils::ThreadPool *thread_pool
#ifdef MACE_ENABLE_OPENCL
, std::unique_ptr<GPUDevice> gpu_device
#endif // MACE_ENABLE_OPENCL
)
: CPUDevice(0, AFFINITY_NONE, thread_pool),
allocator_(make_unique<HexagonAllocator>()),
device_type_(device_type)
#ifdef MACE_ENABLE_OPENCL
, gpu_device_(std::move(gpu_device))
#endif // MACE_ENABLE_OPENCL
{}
#ifdef MACE_ENABLE_OPENCL
GPURuntime *HexagonDevice::gpu_runtime() {
return gpu_device_->gpu_runtime();
}
#endif // MACE_ENABLE_OPENCL
Allocator *HexagonDevice::allocator() {
#ifdef MACE_ENABLE_OPENCL
return gpu_device_->allocator();
#else
return allocator_.get();
#endif // MACE_ENABLE_OPENCL
}
DeviceType HexagonDevice::device_type() const {
return device_type_;
}
std::unique_ptr<HexagonControlWrapper> CreateHexagonControlWrapper(
Device *device) {
std::unique_ptr<HexagonControlWrapper> hexagon_controller;
auto device_type = device->device_type();
switch (device_type) {
#ifdef MACE_ENABLE_HEXAGON
case HEXAGON:
hexagon_controller = make_unique<HexagonDSPWrapper>();
break;
#endif
#ifdef MACE_ENABLE_HTA
case HTA:
hexagon_controller = make_unique<HexagonHTAWrapper>(device);
break;
#endif
default:
  LOG(FATAL) << "Unsupported Hexagon device type: " << device_type;
}
return hexagon_controller;
}
} // namespace mace
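CreateHexagonControlWrapper moves out of the header and now receives the device itself, so the HTA branch can hand the device's allocator (and optional GPU runtime) to HexagonHTAWrapper. A hedged construction sketch, assuming an HTA-enabled build; thread_pool and gpu_device stand in for objects the engine already owns:

auto device = mace::make_unique<mace::HexagonDevice>(
    mace::DeviceType::HTA, thread_pool
#ifdef MACE_ENABLE_OPENCL
    , std::move(gpu_device)
#endif  // MACE_ENABLE_OPENCL
    );
auto controller = mace::CreateHexagonControlWrapper(device.get());
controller->Init();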
......@@ -19,6 +19,7 @@
#include <utility>
#include "mace/core/device.h"
#include "mace/core/runtime/hexagon/hexagon_allocator.h"
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#ifdef MACE_ENABLE_HEXAGON
#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h"
......@@ -26,44 +27,39 @@
#ifdef MACE_ENABLE_HTA
#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"
#endif
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/gpu_runtime.h"
#endif
namespace mace {
class HexagonDevice : public CPUDevice {
public:
explicit HexagonDevice(DeviceType device_type,
utils::ThreadPool *thread_pool)
: CPUDevice(0, AFFINITY_NONE, thread_pool),
device_type_(device_type) {}
HexagonDevice(DeviceType device_type,
#ifdef MACE_ENABLE_OPENCL
utils::ThreadPool *thread_pool,
std::unique_ptr<GPUDevice> gpu_device);
#else
utils::ThreadPool *thread_pool);
#endif // MACE_ENABLE_OPENCL
DeviceType device_type() const override {
return device_type_;
};
#ifdef MACE_ENABLE_OPENCL
GPURuntime *gpu_runtime() override;
#endif // MACE_ENABLE_OPENCL
Allocator *allocator() override;
DeviceType device_type() const override;
private:
std::unique_ptr<HexagonAllocator> allocator_;
DeviceType device_type_;
#ifdef MACE_ENABLE_OPENCL
std::unique_ptr<GPUDevice> gpu_device_;
#endif // MACE_ENABLE_OPENCL
};
std::unique_ptr<HexagonControlWrapper> CreateHexagonControlWrapper(
Device *device) {
std::unique_ptr<HexagonControlWrapper> hexagon_controller;
auto device_type = device->device_type();
switch (device_type) {
#ifdef MACE_ENABLE_HEXAGON
case HEXAGON:
hexagon_controller = make_unique<HexagonDSPWrapper>();
break;
#endif
#ifdef MACE_ENABLE_HTA
case HTA:
hexagon_controller = make_unique<HexagonHTAWrapper>(device);
break;
#endif
default:
  LOG(FATAL) << "Unsupported Hexagon device type: " << device_type;
}
return hexagon_controller;
}
Device *device);
} // namespace mace
#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_
......@@ -107,8 +107,6 @@ hexagon_nn_corner_type TransformCornerType(HexagonNNCornerType corner) {
} // namespace
HexagonDSPWrapper::HexagonDSPWrapper() {
hexnn_controller_init();
std::string env_log_execute_time_str;
GetEnv("MACE_DSP_LOG_EXECUTE_TIME", &env_log_execute_time_str);
if (env_log_execute_time_str.empty()) {
......@@ -119,9 +117,7 @@ HexagonDSPWrapper::HexagonDSPWrapper() {
}
}
HexagonDSPWrapper::~HexagonDSPWrapper() {
hexnn_controller_deinit();
}
HexagonDSPWrapper::~HexagonDSPWrapper() {}
int HexagonDSPWrapper::GetVersion() {
int version;
......@@ -258,10 +254,7 @@ bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def,
}
input_info_.emplace_back(input_info.name(),
input_shape,
input_info.data_type(),
input_info.scale(),
input_info.zero_point(),
make_unique<Tensor>());
input_info.data_type());
}
// output info
......@@ -275,10 +268,7 @@ bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def,
}
output_info_.emplace_back(output_info.name(),
output_shape,
output_info.data_type(),
output_info.scale(),
output_info.zero_point(),
make_unique<Tensor>());
output_info.data_type());
VLOG(1) << "OutputInfo: "
<< "\n\t shape: " << output_shape[0] << " " << output_shape[1]
<< " " << output_shape[2] << " " << output_shape[3]
......
......@@ -56,7 +56,8 @@ class HexagonDSPWrapper : public HexagonControlWrapper {
uint64_t GetLastExecuteCycles();
bool log_execute_time_;
std::vector<InOutInfo> input_info_;
std::vector<InOutInfo> output_info_;
MACE_DISABLE_COPY_AND_ASSIGN(HexagonDSPWrapper);
};
} // namespace mace
......
This diff is collapsed.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_
#include <memory>
#include <vector>
#include "mace/core/device.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/utils/math.h"
#include "mace/utils/thread_pool.h"
#include "third_party/hta/hta_hexagon_api.h"
namespace mace {
class BaseTransformer {
public:
BaseTransformer() = default;
virtual ~BaseTransformer() = default;
virtual void Init(Device *device) { device_ = device; }
virtual MaceStatus Compute(const Tensor *input, Tensor *output) = 0;
protected:
Device *device_;
};
class HexagonHTATranformerBase {
public:
HexagonHTATranformerBase() = default;
virtual ~HexagonHTATranformerBase() = default;
virtual void Init(Device *device) = 0;
virtual MaceStatus SetInputTransformer(
const hexagon_hta_hw_layout format) = 0;
virtual MaceStatus SetOutputTransformer(
const hexagon_hta_hw_layout format) = 0;
virtual MaceStatus Quantize(const Tensor *input, Tensor *output) = 0;
virtual MaceStatus Dequantize(const Tensor *input, Tensor *output) = 0;
virtual MaceStatus TransformInput(const Tensor *input,
Tensor *output,
int index) = 0;
virtual MaceStatus TransformOutput(const Tensor *input,
Tensor *output,
int index) = 0;
};
template <DeviceType D>
class HexagonHTATranformer : public HexagonHTATranformerBase {
public:
void Init(Device *device) override;
MaceStatus SetInputTransformer(const hexagon_hta_hw_layout format) override;
MaceStatus SetOutputTransformer(const hexagon_hta_hw_layout format) override;
MaceStatus Quantize(const Tensor *input, Tensor *output) override;
MaceStatus Dequantize(const Tensor *input, Tensor *output) override;
MaceStatus TransformInput(const Tensor *input,
Tensor *output,
int index) override;
MaceStatus TransformOutput(const Tensor *input,
Tensor *output,
int index) override;
private:
Device *device_;
std::unique_ptr<BaseTransformer> quantizer_;
std::unique_ptr<BaseTransformer> dequantizer_;
std::vector<std::unique_ptr<BaseTransformer>> input_transformers_;
std::vector<std::unique_ptr<BaseTransformer>> output_transformers_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_
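The transformer interface splits HTA host-side pre/post-processing into four pluggable stages, chosen per tensor from the hexagon_hta_hw_layout reported by hexagon_hta_nn_get_memory_layout: Quantize (float to uint8), TransformInput (uint8 NHWC to the hardware layout), TransformOutput (hardware layout back to uint8 NHWC), and Dequantize (uint8 back to float). The template parameter selects whether the layout transforms run as GPU kernels or on the CPU. A hedged sketch of the input half of the pipeline, mirroring ExecuteGraphNew below:

mace::MaceStatus FeedOneInput(mace::HexagonHTATranformerBase *transformer,
                              const mace::Tensor *input_f32,
                              mace::Tensor *quantized_u8,
                              mace::Tensor *hta_input,
                              int index) {
  // float NHWC -> uint8 NHWC; scale/zero_point are read from the tensor.
  MACE_RETURN_IF_ERROR(transformer->Quantize(input_f32, quantized_u8));
  // uint8 NHWC -> whatever layout the HTA reported for this input index.
  return transformer->TransformInput(quantized_u8, hta_input, index);
}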
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"
#include <sys/types.h>
#include <algorithm>
#include <iomanip>
#include <map>
......@@ -24,48 +24,59 @@
#include <utility>
#include "mace/core/runtime/hexagon/hexagon_hta_ops.h"
#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
#include "mace/core/types.h"
#include "mace/utils/memory.h"
#include "mace/core/quantize.h"
#include "third_party/hta/hta_hexagon_api.h"
namespace mace {
namespace {
struct InputOutputMetadata {
void Init(float min_val, float max_val, int needs_quantization) {
this->min_val = min_val;
this->max_val = max_val;
this->needs_quantization = needs_quantization;
int GetHTAEnv(const std::string &name, int default_value) {
int value = default_value;
std::string env_str;
MaceStatus status = GetEnv(name.c_str(), &env_str);
if (status == MaceStatus::MACE_SUCCESS && !env_str.empty()) {
value = std::atoi(env_str.c_str());
}
float min_val;
float max_val;
int needs_quantization;
};
template<typename T>
void AddInputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
tensor->batches = 1;
tensor->height = 1;
tensor->width = 1;
tensor->depth = 1;
tensor->data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(&data));
tensor->dataLen = sizeof(data);
tensor->data_valid_len = sizeof(data);
tensor->unused = 0;
return value;
}
template<typename T>
void AddOutputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
tensor->data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(&data));
tensor->dataLen = sizeof(data);
// Print the API logs to standard output.
void HtaApiLog(hexagon_hta_nn_nn_id id,
const char *api_op,
const char *const format,
...) {
va_list arg;
va_start(arg, format);
if (api_op != NULL) {
printf("Graph ID: %d\t", id);
}
vfprintf(stdout, format, arg);
va_end(arg);
}
// Print the performance stats to standard output.
void HtaPerformanceLog(int log_level,
uint32_t network_handle,
uint32_t thread_id,
const char *const format,
...) {
va_list arg;
va_start(arg, format);
printf("Log Level: %d, Network Handle: %d, Thread ID: %d - ", log_level,
network_handle, thread_id);
vfprintf(stdout, format, arg);
va_end(arg);
}
} // namespace
HexagonHTAWrapper::HexagonHTAWrapper(Device *device)
: quantize_util_(&device->cpu_runtime()->thread_pool()) {
: allocator_(device->allocator()),
#ifdef MACE_ENABLE_OPENCL
transformer_(make_unique<HexagonHTATranformer<GPU>>()) {
#else
transformer_(make_unique<HexagonHTATranformer<CPU>>()) {
#endif
transformer_->Init(device);
}
int HexagonHTAWrapper::GetVersion() {
......@@ -84,6 +95,40 @@ bool HexagonHTAWrapper::Init() {
LOG(INFO) << "Hexagon init";
MACE_CHECK(hexagon_hta_nn_init(&nn_id_) == 0, "hexagon_nn_init failed");
ResetPerfInfo();
int ret;
int power_level = GetHTAEnv("MACE_HTA_POWER_LEVEL", -1);
if (power_level != -1) {
ret = hexagon_hta_nn_set_config_params(nn_id_, HTA_NN_CONFIG_POWER_LEVEL,
&power_level, sizeof(power_level));
LOG(INFO) << "HTA_NN_CONFIG_POWER_LEVEL: " << power_level
<< " returns: " << ret;
}
int is_compress = GetHTAEnv("MACE_HTA_BANDWIDTH_COMPRESSION", 1);
if (is_compress) {
ret = hexagon_hta_nn_set_config_params(nn_id_,
HTA_NN_CONFIG_BANDWIDTH_COMPRESSION,
&is_compress, sizeof(is_compress));
LOG(INFO) << "HTA_NN_CONFIG_BANDWIDTH_COMPRESSION: " << is_compress
<< " returns: " << ret;
}
if (VLOG_IS_ON(2)) {
ret = hexagon_hta_nn_set_config_params(
nn_id_, HTA_NN_CONFIG_PERFORMANCE_LOG,
reinterpret_cast<void *>(&HtaPerformanceLog),
sizeof(&HtaPerformanceLog));
MACE_CHECK(ret == 0, "HTA_NN_CONFIG_PERFORMANCE_LOG returns: " , ret);
}
if (VLOG_IS_ON(3)) {
ret = hexagon_hta_nn_set_config_params(nn_id_, HTA_NN_CONFIG_API_LOG,
reinterpret_cast<void *>(&HtaApiLog),
sizeof(&HtaApiLog));
MACE_CHECK(ret == 0, "HTA_NN_CONFIG_API_LOG returns: ", ret);
}
return true;
}
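Init() now exposes three tuning knobs: MACE_HTA_POWER_LEVEL and MACE_HTA_BANDWIDTH_COMPRESSION are read from the environment through GetHTAEnv (compression defaults to on), while the API and performance log callbacks are only installed when the VLOG threshold reaches 3 and 2 respectively. A hedged sketch of driving them in-process before engine construction; the power level value is illustrative, and using MACE_CPP_MIN_VLOG_LEVEL as the VLOG switch is an assumption about MACE's logging setup:

#include <cstdlib>

void ConfigureHtaKnobs() {
  setenv("MACE_HTA_POWER_LEVEL", "4", 1);            // -> HTA_NN_CONFIG_POWER_LEVEL
  setenv("MACE_HTA_BANDWIDTH_COMPRESSION", "1", 1);  // already the default
  setenv("MACE_CPP_MIN_VLOG_LEVEL", "3", 1);         // >=2 perf log, >=3 API log
}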
......@@ -172,52 +217,107 @@ bool HexagonHTAWrapper::SetupGraph(const NetDef &net_def,
outputs.size());
}
int64_t t1 = NowMicros();
MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed");
int64_t t2 = NowMicros();
VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1;
// input info
num_inputs_ = net_def.input_info_size();
input_info_.reserve(num_inputs_);
for (const InputOutputInfo &input_info : net_def.input_info()) {
input_tensordef_.resize(num_inputs_);
for (int index = 0; index < num_inputs_; ++index) {
auto input_info = net_def.input_info(index);
std::vector<index_t> input_shape(input_info.dims().begin(),
input_info.dims().end());
while (input_shape.size() < 4) {
input_shape.insert(input_shape.begin(), 1);
}
input_info_.emplace_back(input_info.name(),
input_shape,
input_info.data_type(),
input_info.scale(),
input_info.zero_point(),
make_unique<Tensor>());
auto quantized_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
auto hta_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
hexagon_hta_nn_hw_tensordef &input_tensordef = input_tensordef_[index];
memset(&input_tensordef, 0, sizeof(input_tensordef));
MACE_CHECK(hexagon_hta_nn_get_memory_layout(nn_id_, 0, index,
&input_tensordef) == 0);
input_tensordef.dataLen = input_tensordef.batchStride;
VLOG(1) << input_tensordef.format << " " << input_tensordef.elementSize
<< " " << input_tensordef.numDims << " "
<< input_tensordef.batchStride;
for (uint32_t i = 0; i < input_tensordef.numDims; ++i) {
VLOG(1) << input_tensordef.dim[i].length << " "
<< input_tensordef.dim[i].lpadding << " "
<< input_tensordef.dim[i].valid;
}
hta_tensor->Resize({input_tensordef.dataLen});
MACE_CHECK(hta_tensor->raw_size() == input_tensordef.dataLen);
Tensor::MappingGuard input_guard(hta_tensor.get());
input_tensordef.fd =
allocator_->rpcmem()->ToFd(hta_tensor->mutable_data<void>());
MACE_CHECK(hexagon_hta_nn_register_tensor(nn_id_, &input_tensordef) == 0);
transformer_->SetInputTransformer(input_tensordef.format);
input_info_.emplace_back(
input_info.name(), input_shape, input_info.data_type(),
input_info.scale(), input_info.zero_point(),
std::move(quantized_tensor),
std::move(hta_tensor));
}
// output info
num_outputs_ = net_def.output_info_size();
output_info_.reserve(num_outputs_);
for (const InputOutputInfo &output_info : net_def.output_info()) {
output_tensordef_.resize(num_outputs_);
for (int index = 0; index < num_outputs_; ++index) {
auto output_info = net_def.output_info(index);
std::vector<index_t> output_shape(output_info.dims().begin(),
output_info.dims().end());
while (output_shape.size() < 4) {
output_shape.insert(output_shape.begin(), 1);
}
output_info_.emplace_back(output_info.name(),
output_shape,
output_info.data_type(),
output_info.scale(),
output_info.zero_point(),
make_unique<Tensor>());
auto quantized_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
auto hta_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
quantized_tensor->SetScale(output_info.scale());
quantized_tensor->SetZeroPoint(output_info.zero_point());
hexagon_hta_nn_hw_tensordef &output_tensordef = output_tensordef_[index];
memset(&output_tensordef, 0, sizeof(output_tensordef));
MACE_CHECK(hexagon_hta_nn_get_memory_layout(nn_id_, 1, index,
&output_tensordef) == 0);
output_tensordef.dataLen = output_tensordef.batchStride;
VLOG(1) << output_tensordef.format << " " << output_tensordef.elementSize
<< " " << output_tensordef.numDims << " "
<< output_tensordef.batchStride;
for (uint32_t i = 0; i < output_tensordef.numDims; ++i) {
VLOG(1) << output_tensordef.dim[i].length << " "
<< output_tensordef.dim[i].lpadding << " "
<< output_tensordef.dim[i].valid;
}
hta_tensor->Resize({output_tensordef.batchStride});
MACE_CHECK(hta_tensor->raw_size() == output_tensordef.dataLen);
Tensor::MappingGuard output_guard(hta_tensor.get());
output_tensordef.fd =
allocator_->rpcmem()->ToFd(hta_tensor->mutable_data<void>());
MACE_CHECK(hexagon_hta_nn_register_tensor(nn_id_, &output_tensordef) == 0);
transformer_->SetOutputTransformer(output_tensordef.format);
output_info_.emplace_back(
output_info.name(), output_shape, output_info.data_type(),
output_info.scale(), output_info.zero_point(),
std::move(quantized_tensor), std::move(hta_tensor));
VLOG(1) << "OutputInfo: "
<< "\n\t shape: " << output_shape[0] << " " << output_shape[1]
<< " " << output_shape[2] << " " << output_shape[3]
<< "\n\t type: " << output_info.data_type();
}
int64_t t1 = NowMicros();
MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed");
int64_t t2 = NowMicros();
VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1;
return true;
}
......@@ -266,78 +366,41 @@ bool HexagonHTAWrapper::ExecuteGraphNew(
MACE_CHECK(num_outputs_ == static_cast<int>(num_outputs),
"Wrong outputs num");
std::vector<hexagon_hta_nn_tensordef> inputs(num_inputs * kNumMetaData);
std::vector<hexagon_hta_nn_tensordef> outputs(num_outputs * kNumMetaData);
std::vector<InputOutputMetadata> input_metadata(num_inputs);
std::vector<InputOutputMetadata> output_metadata(num_outputs);
for (size_t i = 0; i < num_inputs; ++i) {
const auto input_tensor = input_tensors.at(input_info_[i].name);
const auto &input_shape = input_tensor->shape();
size_t index = i * kNumMetaData;
inputs[index].batches = static_cast<uint32_t>(input_shape[0]);
inputs[index].height = static_cast<uint32_t>(input_shape[1]);
inputs[index].width = static_cast<uint32_t>(input_shape[2]);
inputs[index].depth = static_cast<uint32_t>(input_shape[3]);
inputs[index].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensor->raw_data()));
inputs[index].dataLen = static_cast<int>(input_tensor->raw_size());
inputs[index].data_valid_len =
static_cast<uint32_t>(input_tensor->raw_size());
inputs[index].unused = 0;
input_metadata[i].Init(.0f, .0f, 1);
AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]);
AddInputMetadata(input_metadata[i].max_val, &inputs[index + 2]);
AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]);
}
input_tensor->SetScale(input_info_[i].scale);
input_tensor->SetZeroPoint(input_info_[i].zero_point);
MACE_CHECK_SUCCESS(
transformer_->Quantize(input_tensors.at(input_info_[i].name),
input_info_[i].quantized_tensor.get()));
// transform mace output to hexagon output
for (size_t i = 0; i < num_outputs; ++i) {
auto output_tensor = output_tensors->at(output_info_[i].name);
size_t index = i * kNumMetaData;
output_tensor->SetDtype(output_info_[i].data_type);
output_tensor->Resize(output_info_[i].shape);
outputs[index].data = reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data());
outputs[index].dataLen = static_cast<int>(output_tensor->raw_size());
output_metadata[i].Init(.0f, .0f, 1);
AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]);
AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]);
AddOutputMetadata(output_metadata[i].needs_quantization,
&outputs[index + 3]);
MACE_CHECK_SUCCESS(transformer_->TransformInput(
input_info_[i].quantized_tensor.get(),
input_info_[i].hta_tensor.get(), i));
Tensor::MappingGuard input_guard(input_info_[i].hta_tensor.get());
}
int res = hexagon_hta_nn_execute_new(nn_id_,
inputs.data(),
num_inputs * kNumMetaData,
outputs.data(),
num_outputs * kNumMetaData);
MACE_CHECK(res == 0, "execute error");
MACE_CHECK(hexagon_hta_nn_execute_hw(nn_id_,
input_tensordef_.data(), num_inputs,
output_tensordef_.data(), num_outputs,
nullptr, nullptr) == 0);
for (size_t i = 0; i < num_outputs; ++i) {
size_t index = i * kNumMetaData;
std::vector<uint32_t> output_shape{
outputs[index].batches, outputs[index].height, outputs[index].width,
outputs[index].depth};
MACE_CHECK(output_shape.size() == output_info_[i].shape.size(),
output_shape.size(), " vs ", output_info_[i].shape.size(),
" wrong output shape inferred");
for (size_t j = 0; j < output_shape.size(); ++j) {
MACE_CHECK(static_cast<index_t>(output_shape[j])
== output_info_[i].shape[j],
output_shape[j], " vs ", output_info_[i].shape[j],
" wrong output shape[", j, "] inferred");
{  // Map and unmap once, only to sync the rpcmem cache
Tensor::MappingGuard output_guard(output_info_[i].hta_tensor.get());
}
output_info_[i].quantized_tensor->Resize(output_info_[i].shape);
transformer_->TransformOutput(output_info_[i].hta_tensor.get(),
output_info_[i].quantized_tensor.get(), i);
auto output_tensor = output_tensors->at(output_info_[i].name);
MACE_CHECK(static_cast<index_t>(outputs[index].data_valid_len)
== output_tensor->raw_size(),
outputs[index].data_valid_len, " vs ", output_tensor->raw_size(),
" wrong output bytes inferred.");
MaceStatus st = transformer_->Dequantize(
output_info_[i].quantized_tensor.get(), output_tensor);
}
return res == 0;
return true;
}
} // namespace mace
......@@ -16,18 +16,40 @@
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <utility>
#include "mace/utils/thread_pool.h"
#include "mace/core/quantize.h"
#include "mace/core/device.h"
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
#include "mace/core/tensor.h"
#include "mace/core/device.h"
#include "mace/public/mace.h"
#include "third_party/hta/hta_hexagon_api.h"
namespace mace {
struct HTAInOutInfo : public InOutInfo {
HTAInOutInfo(const std::string &name,
const std::vector<index_t> &shape,
const DataType data_type,
const float scale,
const int32_t zero_point,
std::unique_ptr<Tensor> quantized_tensor,
std::unique_ptr<Tensor> hta_tensor)
: InOutInfo(name, shape, data_type),
scale(scale),
zero_point(zero_point),
quantized_tensor(std::move(quantized_tensor)),
hta_tensor(std::move(hta_tensor)) {}
float scale;
int32_t zero_point;
std::unique_ptr<Tensor> quantized_tensor;
std::unique_ptr<Tensor> hta_tensor;
};
class HexagonHTAWrapper : public HexagonControlWrapper {
public:
explicit HexagonHTAWrapper(Device *device);
......@@ -50,7 +72,12 @@ class HexagonHTAWrapper : public HexagonControlWrapper {
void SetDebugLevel(int level) override;
private:
QuantizeUtil<float, uint8_t> quantize_util_;
Allocator *allocator_;
std::vector<HTAInOutInfo> input_info_;
std::vector<HTAInOutInfo> output_info_;
std::vector<hexagon_hta_nn_hw_tensordef> input_tensordef_;
std::vector<hexagon_hta_nn_hw_tensordef> output_tensordef_;
std::unique_ptr<HexagonHTATranformerBase> transformer_;
MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper);
};
} // namespace mace
......
......@@ -16,9 +16,6 @@
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#ifdef MACE_ENABLE_RPCMEM
#include "third_party/rpcmem/rpcmem.h"
#endif // MACE_ENABLE_RPCMEM
namespace mace {
namespace {
......@@ -39,24 +36,10 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
}
}
#ifdef MACE_ENABLE_RPCMEM
std::once_flag ion_prepared;
void PrepareQualcommION() {
rpcmem_init();
std::atexit(rpcmem_deinit);
}
#endif // MACE_ENABLE_RPCMEM
} // namespace
OpenCLAllocator::OpenCLAllocator(
OpenCLRuntime *opencl_runtime): opencl_runtime_(opencl_runtime) {
#ifdef MACE_ENABLE_RPCMEM
if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
std::call_once(ion_prepared, PrepareQualcommION);
}
#endif // MACE_ENABLE_RPCMEM
}
OpenCLRuntime *opencl_runtime): opencl_runtime_(opencl_runtime) {}
OpenCLAllocator::~OpenCLAllocator() {}
......@@ -168,7 +151,7 @@ void OpenCLAllocator::Delete(void *buffer) {
if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
auto it = cl_to_host_map_.find(buffer);
MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL buffer not found!");
rpcmem_free(it->second);
rpcmem_.Delete(it->second);
cl_to_host_map_.erase(buffer);
}
#endif // MACE_ENABLE_RPCMEM
......@@ -184,7 +167,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
auto it = cl_to_host_map_.find(buffer);
MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL image not found!");
rpcmem_free(it->second);
rpcmem_.Delete(it->second);
cl_to_host_map_.erase(buffer);
}
#endif // MACE_ENABLE_RPCMEM
......@@ -194,7 +177,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
void *OpenCLAllocator::Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) const {
bool finish_cmd_queue) {
MACE_LATENCY_LOGGER(1, "Map OpenCL buffer");
void *mapped_ptr = nullptr;
#ifdef MACE_ENABLE_RPCMEM
......@@ -209,7 +192,7 @@ void *OpenCLAllocator::Map(void *buffer,
if (opencl_runtime_->qcom_host_cache_policy() ==
CL_MEM_HOST_WRITEBACK_QCOM) {
MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0);
MACE_CHECK(rpcmem_.SyncCacheStart(mapped_ptr) == 0);
}
} else {
#endif // MACE_ENABLE_RPCMEM
......@@ -234,7 +217,7 @@ void *OpenCLAllocator::Map(void *buffer,
void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) const {
bool finish_cmd_queue) {
MACE_LATENCY_LOGGER(1, "Map OpenCL Image");
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
void *mapped_ptr = nullptr;
......@@ -251,7 +234,7 @@ void *OpenCLAllocator::MapImage(void *buffer,
if (opencl_runtime_->qcom_host_cache_policy() ==
CL_MEM_HOST_WRITEBACK_QCOM) {
MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0);
MACE_CHECK(rpcmem_.SyncCacheStart(mapped_ptr) == 0);
}
} else {
#endif // MACE_ENABLE_RPCMEM
......@@ -275,13 +258,13 @@ void *OpenCLAllocator::MapImage(void *buffer,
return mapped_ptr;
}
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
MACE_LATENCY_LOGGER(1, "Unmap OpenCL buffer/Image");
#ifdef MACE_ENABLE_RPCMEM
if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
if (opencl_runtime_->qcom_host_cache_policy() ==
CL_MEM_HOST_WRITEBACK_QCOM) {
MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_END) == 0);
MACE_CHECK(rpcmem_.SyncCacheEnd(mapped_ptr) == 0);
}
} else {
#endif // MACE_ENABLE_RPCMEM
......@@ -301,17 +284,20 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
bool OpenCLAllocator::OnHost() const { return false; }
#ifdef MACE_ENABLE_RPCMEM
Rpcmem *OpenCLAllocator::rpcmem() {
return &rpcmem_;
}
void OpenCLAllocator::CreateQualcommBufferIONHostPtr(
const size_t nbytes,
cl_mem_ion_host_ptr *ion_host) {
void *host = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED,
nbytes + opencl_runtime_->qcom_ext_mem_padding());
void *host = rpcmem_.New(nbytes + opencl_runtime_->qcom_ext_mem_padding());
MACE_CHECK_NOTNULL(host);
auto host_addr = reinterpret_cast<std::uintptr_t>(host);
auto page_size = opencl_runtime_->qcom_page_size();
MACE_CHECK(host_addr % page_size == 0, "ION memory address: ", host_addr,
" must be aligned to page size: ", page_size);
int fd = rpcmem_to_fd(host);
int fd = rpcmem_.ToFd(host);
MACE_CHECK(fd >= 0, "Invalid rpcmem file descriptor: ", fd);
ion_host->ext_host_ptr.allocation_type = CL_MEM_ION_HOST_PTR_QCOM;
......
......@@ -48,17 +48,21 @@ class OpenCLAllocator : public Allocator {
void *Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) const override;
bool finish_cmd_queue) override;
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) const override;
bool finish_cmd_queue) override;
void Unmap(void *buffer, void *mapped_ptr) const override;
void Unmap(void *buffer, void *mapped_ptr) override;
bool OnHost() const override;
#ifdef MACE_ENABLE_RPCMEM
Rpcmem *rpcmem() override;
#endif // MACE_ENABLE_RPCMEM
private:
#ifdef MACE_ENABLE_RPCMEM
void CreateQualcommBufferIONHostPtr(const size_t nbytes,
......@@ -69,6 +73,7 @@ class OpenCLAllocator : public Allocator {
cl_mem_ion_host_ptr *ion_host);
std::unordered_map<void *, void *> cl_to_host_map_;
Rpcmem rpcmem_;
#endif // MACE_ENABLE_RPCMEM
OpenCLRuntime *opencl_runtime_;
};
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include <algorithm>
#include <string>
......@@ -22,7 +22,6 @@
#include "mace/utils/math.h"
namespace mace {
namespace ops {
std::vector<index_t> FormatBufferShape(
const std::vector<index_t> &buffer_shape,
......@@ -59,8 +58,10 @@ std::string DtToCLDt(const DataType dt) {
return "float";
case DT_HALF:
return "half";
case DT_UINT8:
return "uchar";
default:
LOG(FATAL) << "Unsupported data type";
LOG(FATAL) << "Unsupported data type: " << dt;
return "";
}
}
......@@ -365,5 +366,4 @@ MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
return MaceStatus::MACE_SUCCESS;
}
} // namespace ops
} // namespace mace
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_OPENCL_HELPER_H_
#define MACE_OPS_OPENCL_HELPER_H_
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_
#include <memory>
#include <string>
......@@ -30,7 +30,6 @@
#include "mace/utils/math.h"
namespace mace {
namespace ops {
// oorc for 'Out Of Range Check'
#define MACE_OUT_OF_RANGE_DEFINITION \
std::shared_ptr<BufferBase> oorc_flag;
......@@ -161,6 +160,5 @@ std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size);
} // namespace ops
} // namespace mace
#endif // MACE_OPS_OPENCL_HELPER_H_
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_
......@@ -21,6 +21,7 @@ load(
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
"if_rpcmem_enabled",
)
cc_library(
......@@ -47,6 +48,8 @@ cc_library(
"-DMACE_ENABLE_HTA",
]) + if_apu_enabled([
"-DMACE_ENABLE_APU",
]) + if_rpcmem_enabled([
"-DMACE_ENABLE_RPCMEM",
]),
deps = [
"//mace/ops",
......
......@@ -520,9 +520,23 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
if (device_type_ == DeviceType::HEXAGON
|| device_type_ == DeviceType::HTA) {
#ifdef MACE_ENABLE_OPENCL
device_.reset(new HexagonDevice(
device_type_, thread_pool_.get(),
make_unique<GPUDevice>(
config.impl_->gpu_context()->opencl_tuner(),
config.impl_->gpu_context()->opencl_cache_storage(),
config.impl_->gpu_priority_hint(),
config.impl_->gpu_perf_hint(),
config.impl_->gpu_context()->opencl_binary_storage(),
config.impl_->num_threads(),
config.impl_->cpu_affinity_policy(),
thread_pool_.get())));
#else
device_.reset(new HexagonDevice(device_type_, thread_pool_.get()));
#endif // MACE_ENABLE_OPENCL
}
#endif
#endif // defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
#ifdef MACE_ENABLE_APU
if (device_type_ == DeviceType::APU) {
device_.reset(new ApuDevice(thread_pool_.get()));
......@@ -579,15 +593,19 @@ MaceStatus MaceEngine::Impl::Init(
<< MakeString(MapKeys(output_info_map_));
}
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
DataType output_dt = output_info_map_[output_name].data_type();
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), output_dt);
output_tensor->set_data_format(DataFormat::NHWC);
if (device_type_ == HEXAGON || device_type_ == HTA) {
DataType output_dt = output_info_map_[output_name].data_type();
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), output_dt);
output_tensor->set_data_format(DataFormat::NHWC);
}
#endif
#if defined(MACE_ENABLE_APU)
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
output_tensor->set_data_format(DataFormat::NHWC);
if (device_type_ == DeviceType::APU) {
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
output_tensor->set_data_format(DataFormat::NHWC);
}
#endif
}
#ifdef MACE_ENABLE_HEXAGON
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include <vector>
#include "mace/ops/opencl/buffer/utils.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
namespace mace {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include <vector>
#include "mace/ops/opencl/buffer/utils.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
namespace mace {
......
......@@ -23,7 +23,7 @@
#include <vector>
#include "mace/ops/opencl/buffer/utils.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
namespace mace {
......
......@@ -19,7 +19,7 @@
#include <vector>
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -19,7 +19,7 @@
#include <vector>
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -209,3 +209,202 @@ __kernel void transform_data_type(BUFFER_OUT_OF_RANGE_PARAMS
DATA_TYPE4 input_value = CONVERT4(vload4(out_idx, input + input_offset));
vstore4(input_value, out_idx, output);
}
__kernel void buffer_quantize(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__private const float scale,
__private const int zero_point,
__global float *input,
__private const int input_offset,
__global uchar *output) {
const int out_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_idx >= global_size_dim0) {
return;
}
#endif
uchar4 output_value =
convert_uchar4_sat_rte(vload4(out_idx, input) / scale + zero_point);
vstore4(output_value, out_idx, output);
}
__kernel void buffer_dequantize(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__private const float scale,
__private const int zero_point,
__global uchar *input,
__private const int input_offset,
__global float *output) {
const int out_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_idx >= global_size_dim0) {
return;
}
#endif
float4 output_value =
convert_float4(convert_int4(vload4(out_idx, input)) - zero_point) * scale;
vstore4(output_value, out_idx, output);
}
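Both kernels handle four elements per work item and implement the standard affine quantization pair (note that both accept an input_offset parameter they do not apply). A host-side reference of the per-element math, as a hedged C++ sketch; the kernels round to nearest-even and saturate via convert_uchar4_sat_rte:

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t QuantizeOne(float x, float scale, int zero_point) {
  const int q = static_cast<int>(std::nearbyint(x / scale)) + zero_point;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));  // saturate to [0, 255]
}

float DequantizeOne(uint8_t q, float scale, int zero_point) {
  return static_cast<float>(static_cast<int>(q) - zero_point) * scale;
}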
// NHWC -> NCHW (W roundup to 32)
__kernel void transform_nhwc_to_nchw32(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global uchar *input, // NHWC
__private const int input_offset,
__private const int zero_point,
__global uchar *output,
__private const int batch,
__private const int height,
__private const int width,
__private const int channels) {
const int width_blk_idx = get_global_id(0);
const int h_idx = get_global_id(1);
const int bc_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_blk_idx >= global_size_dim0 ||
h_idx >= global_size_dim1 ||
bc_idx >= global_size_dim2) {
return;
}
#endif
const int b_idx = bc_idx / channels;
const int chan_idx = bc_idx - mul24(b_idx, channels);
const int w_idx = width_blk_idx << 2;
const int padded_width = global_size_dim0 << 2;
const int in_offset = mad24(mad24(mad24(b_idx, height, h_idx),
width, w_idx), channels, chan_idx) + input_offset;
const int out_offset = (mad24(mad24(mad24(b_idx, channels, chan_idx),
height, h_idx), padded_width, w_idx));
uchar4 value = zero_point;
if (w_idx + 3 < width) {
value.x = input[in_offset];
value.y = input[in_offset + channels];
value.z = input[in_offset + 2 * channels];
value.w = input[in_offset + 3 * channels];
} else if (w_idx < width) {
const int diff = width - w_idx;
switch(diff) {
case 3:
value.z = input[in_offset + 2 * channels];
case 2:
value.y = input[in_offset + channels];
case 1:
value.x = input[in_offset];
}
}
VSTORE4(value, output, out_offset);
}
// N H ceil(C/32) W 32 -> NHWC
__kernel void transform_d32_to_nhwc(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global uchar *input, // D32
__private const int input_offset,
__global uchar *output, // NHWC
__private const int batch,
__private const int height,
__private const int width,
__private const int channels,
__private const int channel_slices) {
const int chan_blk_idx = get_global_id(0);
const int w_idx = get_global_id(1);
const int bh_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 ||
w_idx >= global_size_dim1 ||
bh_idx >= global_size_dim2) {
return;
}
#endif
const int b_idx = bh_idx / height;
const int h_idx = bh_idx - mul24(b_idx, height);
const int c_idx = chan_blk_idx << 2;
const int c_slice = c_idx >> 5;
const int c_slice_idx = c_idx & 31;
const int in_offset = mad24(mad24(mad24(mad24(b_idx, height, h_idx),
channel_slices, c_slice), width, w_idx), 32, c_slice_idx) + input_offset;
const int out_offset = (mad24(mad24(mad24(b_idx, height, h_idx),
width, w_idx), channels, c_idx));
uchar4 value = vload4(0, input + in_offset);
if (c_idx + 3 < channels) {
VSTORE4(value, output, out_offset);
} else {
const int diff = channels - c_idx;
switch(diff) {
case 3:
vstore3(value.xyz, 0, output + out_offset);
break;
case 2:
vstore2(value.xy, 0, output + out_offset);
break;
case 1:
output[out_offset] = value.x;
break;
}
}
}
// NHWC -> N H ceil(C/32) W 32
__kernel void transform_nhwc_to_d32(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global uchar *input, // NHWC
__private const int input_offset,
__private const int zero_point,
__global uchar *output, // D32
__private const int batch,
__private const int height,
__private const int width,
__private const int channels,
__private const int channel_slices) {
const int w_32_idx = get_global_id(0);
const int c_slice = get_global_id(1);
const int bh_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (w_32_idx >= global_size_dim0 ||
c_slice >= global_size_dim1 ||
bh_idx >= global_size_dim2) {
return;
}
#endif
const int b_idx = bh_idx / height;
const int h_idx = bh_idx - mul24(b_idx, height);
const int w_idx = w_32_idx >> 3;
const int c_slice_blk_idx = w_32_idx & 7;
const int c_slice_idx = c_slice_blk_idx << 2;
const int c_idx = (c_slice << 5) + c_slice_idx;
const int in_offset = (mad24(mad24(mad24(b_idx, height, h_idx),
width, w_idx), channels, c_idx)) + input_offset;
const int out_offset = mad24(mad24(mad24(mad24(b_idx, height, h_idx),
channel_slices, c_slice), width, w_idx), 32, c_slice_idx);
uchar4 value = zero_point;
if (c_idx + 3 < channels) {
value = vload4(0, input + in_offset);
} else if (c_idx < channels) {
value = vload4(0, input + in_offset);
const int diff = channels - c_idx;
switch(diff) {
case 3:
value.w = zero_point; break;
case 2:
value.zw = zero_point; break;
case 1:
value.yzw = zero_point; break;
}
} // else value = zero_point
VSTORE4(value, output, out_offset);
}
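The D32 kernels use the layout N x H x ceil(C/32) x W x 32: channels are split into slices of 32, each slice stores a full W row of 32-channel groups, and the channel tail is padded with zero_point. The index math from the mad24 chains above, restated as a hedged host-side helper:

// Offset of element (b, h, c, w) in a D32 buffer;
// channel_slices == ceil(channels / 32).
inline int D32Offset(int b, int h, int c, int w,
                     int height, int width, int channel_slices) {
  const int c_slice = c >> 5;     // c / 32
  const int c_in_slice = c & 31;  // c % 32
  return ((((b * height + h) * channel_slices + c_slice) * width + w) << 5)
         + c_in_slice;
}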
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/math.h"
namespace mace {
......
......@@ -14,7 +14,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/common/activation_type.h"
#include "mace/utils/math.h"
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -25,7 +25,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/eltwise_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/ops/opencl/buffer_transform_kernel.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -17,7 +17,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/opencl/lpnorm.h"
namespace mace {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -20,7 +20,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/opencl/mvnorm.h"
namespace mace {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/pad_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/common/reduce_type.h"
namespace mace {
......
......@@ -21,7 +21,7 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/opencl/buffer_transform_kernel.h"
namespace mace {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -16,7 +16,7 @@
#include "mace/core/op_context.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
#include "mace/utils/math.h"
......
......@@ -175,9 +175,9 @@ bool RunModel(const std::string &model_name,
if (status != MaceStatus::MACE_SUCCESS) {
LOG(WARNING) << "Failed to set OpenMP or CPU affinity.";
}
#ifdef MACE_ENABLE_OPENCL
#if defined(MACE_ENABLE_OPENCL) || defined(MACE_ENABLE_HTA)
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) {
if (device_type == DeviceType::GPU || device_type == DeviceType::HTA) {
const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string storage_path =
std::string(storage_path_ptr == nullptr ?
......
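The hunk above routes HTA runs through the same GPUContext used for OpenCL, since the HTA input/output transforms execute OpenCL kernels. A minimal sketch of the completed branch, assuming MACE's public GPUContextBuilder API (the fallback storage path shown is a placeholder, not the truncated value above):
// Sketch only: how the GPU/HTA context setup typically completes.
// GPUContextBuilder is MACE's public API; the fallback path is a placeholder.
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU || device_type == DeviceType::HTA) {
  const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
  const std::string storage_path(storage_path_ptr == nullptr
                                     ? "/path/to/internal/storage"  // placeholder
                                     : storage_path_ptr);
  gpu_context = GPUContextBuilder()
                    .SetStoragePath(storage_path)  // caches compiled OpenCL programs
                    .Finalize();
}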
......@@ -9,6 +9,7 @@ load(
"if_android",
"if_android_armv7",
"if_hexagon_enabled",
"if_hta_enabled",
"if_neon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
......@@ -41,7 +42,9 @@ cc_test(
[
"mace/ops/opencl/*.cc",
]
)),
)) + if_hta_enabled([
"mace/core/runtime/hexagon/hta_transform_test.cc",
]),
copts = [
"-Werror",
"-Wextra",
......@@ -59,6 +62,8 @@ cc_test(
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]),
linkopts = if_openmp_enabled([
"-fopenmp",
......
......@@ -7,6 +7,10 @@ file(GLOB MACE_CC_TEST_SRCS
mace/libmace/*.cc
)
if(MACE_ENABLE_HTA)
set(MACE_CC_TEST_SRCS ${MACE_CC_TEST_SRCS} mace/core/runtime/hexagon/hta_transform_test.cc)
endif(MACE_ENABLE_HTA)
add_executable(mace_cc_test ${MACE_CC_TEST_SRCS})
target_link_libraries(mace_cc_test PUBLIC
mace_cc_test_utils
......
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/quantize.h"
#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class HTATransformTest : public OpsTestBase {};
namespace {
template <DeviceType D>
void TestHTAQuantizeDequantize(const std::vector<float> &input) {
float min_val, max_val;
FindMinMax(input.data(), input.size(), &min_val, &max_val);
float scale;
int32_t zero;
AdjustRange<uint8_t>(min_val, max_val, false, &scale, &zero);
OpsTestNet net;
Device *device = OpTestContext::Get()->GetDevice(D);
net.AddInputFromArray<D, float>("Input",
{static_cast<index_t>(input.size())},
input);
Tensor *input_tensor = net.GetOutput("Input");
input_tensor->SetScale(scale);
input_tensor->SetZeroPoint(zero);
Tensor *quantized_output = net.ws()->CreateTensor(
"QuantizedOutput", device->allocator(), DT_UINT8);
Tensor *dequantized_output = net.ws()->CreateTensor(
"DequantizedOutput", device->allocator(), DT_FLOAT);
mace::HexagonHTATranformer<D> transformer;
transformer.Init(device);
transformer.Quantize(input_tensor, quantized_output);
transformer.Dequantize(quantized_output, dequantized_output);
ExpectTensorNear<float>(*input_tensor,
*dequantized_output,
0.1);
}
} // namespace
TEST_F(HTATransformTest, TestHTAQuantize) {
TestHTAQuantizeDequantize<CPU>({-2, -1, 0, 1, 2, 3, 4});
TestHTAQuantizeDequantize<GPU>({-2, -1, 0, 1, 2, 3, 4});
}
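The round trip exercised above follows standard uint8 affine quantization, q = round(x / scale) + zero_point. A self-contained sketch of the scheme (the range nudging mirrors the usual TF-Lite convention and is an assumption about AdjustRange's exact behavior):
#include <algorithm>
#include <cmath>
#include <cstdint>

// Standalone sketch of uint8 affine quantize/dequantize; not MACE's
// AdjustRange itself, whose rounding details may differ.
float QuantizeDequantize(float x, float min_val, float max_val) {
  min_val = std::min(min_val, 0.f);  // representable range must contain 0
  max_val = std::max(max_val, 0.f);
  const float scale = std::max((max_val - min_val) / 255.f, 1e-9f);
  const int32_t zero = static_cast<int32_t>(std::round(-min_val / scale));
  const float q = std::min(255.f, std::max(0.f, std::round(x / scale) + zero));
  return scale * (q - zero);  // reconstruction error is at most ~scale
}
For the inputs above ({-2, ..., 4}), scale is roughly 6/255, about 0.024, so the reconstruction error stays well inside the 0.1 tolerance passed to ExpectTensorNear.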
namespace {
void TestHTAInputTransform(const std::vector<index_t> &input_shape,
const hexagon_hta_hw_layout format) {
OpsTestNet net;
Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
net.AddRandomInput<GPU, uint8_t>("Input", input_shape);
Tensor *input_tensor = net.GetOutput("Input");
input_tensor->SetScale(0.1);
input_tensor->SetZeroPoint(1);
Tensor *cpu_transformed_tensor = net.ws()->CreateTensor(
"CpuTransformedOutput", device->allocator(), DT_UINT8);
Tensor *gpu_transformed_tensor = net.ws()->CreateTensor(
"GpuTransformedOutput", device->allocator(), DT_UINT8);
mace::HexagonHTATranformer<CPU> cpu_transformer;
mace::HexagonHTATranformer<GPU> gpu_transformer;
cpu_transformer.Init(device);
gpu_transformer.Init(device);
cpu_transformer.SetInputTransformer(format);
gpu_transformer.SetInputTransformer(format);
cpu_transformer.TransformInput(input_tensor, cpu_transformed_tensor, 0);
gpu_transformer.TransformInput(input_tensor, gpu_transformed_tensor, 0);
net.Sync();
ExpectTensorNear<uint8_t>(*cpu_transformed_tensor, *gpu_transformed_tensor);
}
} // namespace
TEST_F(HTATransformTest, TestHTAInputTransform) {
TestHTAInputTransform({1, 15, 33, 2}, HEXAGON_HTA_HW_FORMAT_PLANAR);
TestHTAInputTransform({1, 19, 31, 3}, HEXAGON_HTA_HW_FORMAT_PLANAR);
TestHTAInputTransform({1, 224, 224, 3}, HEXAGON_HTA_HW_FORMAT_PLANAR);
TestHTAInputTransform({1, 19, 31, 3}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAInputTransform({1, 15, 33, 27}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAInputTransform({1, 15, 33, 35}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAInputTransform({1, 224, 224, 3}, HEXAGON_HTA_HW_FORMAT_D32);
}
namespace {
void TestHTAOutputTransform(const std::vector<index_t> &output_shape,
const hexagon_hta_hw_layout format) {
index_t batch = output_shape[0];
index_t height = output_shape[1];
index_t width = output_shape[2];
index_t channels = output_shape[3];
MACE_CHECK(format == HEXAGON_HTA_HW_FORMAT_D32);
std::vector<index_t> input_shape {
batch, height, RoundUpDiv<index_t>(channels, 32), width, 32};
OpsTestNet net;
Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
net.AddRandomInput<GPU, uint8_t>("Input", input_shape);
Tensor *input_tensor = net.GetOutput("Input");
Tensor *cpu_transformed_tensor = net.ws()->CreateTensor(
"CpuTransformedOutput", device->allocator(), DT_UINT8);
Tensor *gpu_transformed_tensor = net.ws()->CreateTensor(
"GpuTransformedOutput", device->allocator(), DT_UINT8);
cpu_transformed_tensor->Resize(output_shape);
gpu_transformed_tensor->Resize(output_shape);
mace::HexagonHTATranformer<CPU> cpu_transformer;
mace::HexagonHTATranformer<GPU> gpu_transformer;
cpu_transformer.Init(device);
gpu_transformer.Init(device);
cpu_transformer.SetOutputTransformer(format);
gpu_transformer.SetOutputTransformer(format);
cpu_transformer.TransformOutput(input_tensor, cpu_transformed_tensor, 0);
gpu_transformer.TransformOutput(input_tensor, gpu_transformed_tensor, 0);
net.Sync();
ExpectTensorNear<uint8_t>(*cpu_transformed_tensor, *gpu_transformed_tensor);
}
} // namespace
TEST_F(HTATransformTest, TestHTAOutputTransform) {
TestHTAOutputTransform({1, 15, 33, 2}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAOutputTransform({1, 19, 31, 27}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAOutputTransform({1, 19, 31, 35}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAOutputTransform({1, 224, 224, 2}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAOutputTransform({1, 384, 384, 3}, HEXAGON_HTA_HW_FORMAT_D32);
}
} // namespace test
} // namespace ops
} // namespace mace
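The D32 shapes used in these tests encode Hexagon's depth-32 layout: the channel dimension is split into RoundUpDiv(C, 32) groups of 32, turning an NHWC tensor into {N, H, ceil(C/32), W, 32}. A sketch of the offset mapping this implies (illustrative only; the real transformers also handle the padded tail channels and vectorized copies):
#include <cstdint>

// Sketch: flat offset of NHWC element (n, h, w, c) inside the D32 layout
// {N, H, ceil(C / 32), W, 32} constructed in TestHTAOutputTransform above.
// Channels c >= C in the last group are padding and carry no data.
int64_t OffsetD32(int64_t n, int64_t h, int64_t w, int64_t c,
                  int64_t H, int64_t W, int64_t C) {
  const int64_t groups = (C + 31) / 32;  // RoundUpDiv<index_t>(C, 32)
  return ((((n * H + h) * groups + c / 32) * W + w) * 32) + c % 32;
}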
......@@ -21,7 +21,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
namespace mace {
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/quantize.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -250,8 +250,24 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
Tensor::MappingGuard y_mapper(&y);
auto a = x.data<EXP_TYPE>();
auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a[i], b[i]);
if (x.dim_size() == 4) {
for (int n = 0; n < x.dim(0); ++n) {
for (int h = 0; h < x.dim(1); ++h) {
for (int w = 0; w < x.dim(2); ++w) {
for (int c = 0; c < x.dim(3); ++c) {
EXPECT_EQ(*a, *b) << "with index = [" << n << ", " << h << ", "
<< w << ", " << c << "]";
a++;
b++;
}
}
}
}
} else {
for (int i = 0; i < x.size(); ++i) {
EXPECT_EQ(a[i], b[i])
<< "a[i] = " << a[i] << " b[i] = " << b[i] << " index = " << i;
}
}
}
......
/*
* Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
* Copyright (c) 2019, The Linux Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted (subject to the limitations in the
......@@ -42,10 +42,6 @@ extern "C" {
int hexnn_controller_request_unsigned_pd();
int hexnn_controller_init();
int hexnn_controller_deinit();
#ifdef __cplusplus
}
#endif
......
......@@ -6,6 +6,8 @@
#ifndef RPCMEM_H
#define RPCMEM_H
#include <stdint.h>
/**
* RPCMEM_DEFAULT_HEAP
* Dynamically select the heap to use. This should be OK for most use cases.
......@@ -77,16 +79,25 @@
extern "C" {
#endif
typedef struct rpcmem rpcmem;
struct rpcmem {
void *lst; // QList*
void *mt; // pthread_mutex_t*
int ionfd;
int flag;
int ionversion;
};
/**
* call once to initialize the library
* NOTE: rpcmem_init is not thread safe
* NOTE: rpcmem_init is now thread safe
*/
void rpcmem_init(void);
void rpcmem_init(rpcmem *rm);
/**
* call once for cleanup
* NOTE: rpcmem_deinit is not thread safe
* NOTE: rpcmem_deinit is now thread safe
*/
void rpcmem_deinit(void);
void rpcmem_deinit(rpcmem *rm);
/**
* Allocate via ION a buffer of size
......@@ -96,10 +107,10 @@ void rpcmem_deinit(void);
* @retval, 0 on failure, pointer to buffer on success
*
* For example:
* buf = rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
* buf = rpcmem_alloc(rm, RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
*/
void* rpcmem_alloc(int heapid, unsigned int flags, int size);
void* rpcmem_alloc(rpcmem *rm, int heapid, uint32_t flags, int size);
/**
* allocate with default settings
......@@ -107,24 +118,24 @@ void* rpcmem_alloc(int heapid, unsigned int flags, int size);
#if !defined(WINNT) && !defined (_WIN32_WINNT)
__attribute__((unused))
#endif
static __inline void* rpcmem_alloc_def(int size) {
return rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
static __inline void* rpcmem_alloc_def(rpcmem *rm, int size) {
return rpcmem_alloc(rm, RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
}
/**
* free buffer, ignores invalid buffers
*/
void rpcmem_free(void* po);
void rpcmem_free(rpcmem *rm, void* po);
/**
* returns associated fd
*/
int rpcmem_to_fd(void* po);
int rpcmem_to_fd(rpcmem *rm, void* po);
/**
* cache coherency management
*/
int rpcmem_sync_cache(void* po, unsigned int flags);
int rpcmem_sync_cache(rpcmem *rm, void* po, uint32_t flags);
#ifdef __cplusplus
}
......
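The rpcmem changes above replace hidden process-global state with an explicit rpcmem handle threaded through every call, which is what makes init/deinit thread safe. A minimal usage sketch of the handle-based API (error handling trimmed; the size and flags values are arbitrary):
#include "rpcmem.h"

// Sketch: each user now owns an rpcmem instance instead of sharing globals.
void RpcmemExample() {
  rpcmem rm;
  rpcmem_init(&rm);                          // per-handle init, thread safe
  void *buf = rpcmem_alloc_def(&rm, 4096);   // ION-backed shared buffer
  if (buf != nullptr) {
    int fd = rpcmem_to_fd(&rm, buf);         // fd to hand to the DSP/HTA
    (void)fd;
    rpcmem_sync_cache(&rm, buf, 0);          // flags value is illustrative
    rpcmem_free(&rm, buf);
  }
  rpcmem_deinit(&rm);
}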
......@@ -21,6 +21,11 @@ mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_dsp
mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu
mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_apu
if [[ "$BUILD_HTA" == "1" ]]; then
mkdir -p $LIB_DIR/armeabi-v7a/cpu_gpu_hta
mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_hta
fi
rm -rf $LIB_DIR/linux-x86-64
mkdir -p $LIB_DIR/linux-x86-64
......@@ -33,6 +38,18 @@ mkdir -p $LIB_DIR/aarch64_linux_gnu/cpu_gpu
# build shared libraries
if [[ "$BUILD_HTA" == "1" ]]; then
echo "build shared lib for armeabi-v7a + cpu_gpu_hta"
bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
cp third_party/hta/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
echo "build shared lib for arm64-v8a + cpu_gpu_hta"
bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu_hta/
cp third_party/hta/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_hta/
fi
echo "build shared lib for armeabi-v7a + cpu_gpu_dsp"
bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
......@@ -71,6 +88,18 @@ if [[ "$OSTYPE" != "darwin"* ]];then
fi
# build static libraries
if [[ "$BUILD_HTA" == "1" ]]; then
echo "build static lib for armeabi-v7a + cpu_gpu_hta"
bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
cp third_party/hta/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
echo "build static lib for arm64-v8a + cpu_gpu_hta"
bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu_hta/
cp third_party/hta/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_hta/
fi
echo "build static lib for armeabi-v7a + cpu_gpu_dsp"
bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
......
......@@ -100,6 +100,11 @@ def parse_args():
type=str2bool,
default=True,
help="Whether to use rpcmem")
parser.add_argument(
"--enable_hta",
type=str2bool,
default=False,
help="Whether to use hta")
parser.add_argument(
'--address_sanitizer',
action="store_true",
......@@ -170,6 +175,7 @@ def main(unused_args):
enable_neon=FLAGS.enable_neon,
enable_quantize=FLAGS.enable_quantize,
enable_rpcmem=FLAGS.enable_rpcmem,
enable_hta=FLAGS.enable_hta,
address_sanitizer=FLAGS.address_sanitizer,
debug_mode=FLAGS.debug_mode)
if FLAGS.run_target:
......
......@@ -218,7 +218,8 @@ def get_opencl_mode(configs):
YAMLKeyword.runtime, "")
runtime_list.append(model_runtime.lower())
if RuntimeType.gpu in runtime_list or RuntimeType.cpu_gpu in runtime_list:
if RuntimeType.gpu in runtime_list or RuntimeType.cpu_gpu in runtime_list \
or RuntimeType.hta in runtime_list:
return True
return False
......
......@@ -295,6 +295,20 @@ class HexagonConverter(base_converter.ConverterInterface):
else:
index += 1
if self._option.device == DeviceType.HTA.value:
# replace QuantizeINPUT_f_to_8 with INPUT
quantize_input_op.type = HexagonOp.INPUT.name
del quantize_input_op.output_shape[1:]
del quantize_input_op.output_type[1:]
del quantize_input_op.out_max_byte_size[1:]
# replace first op's input min max with constant
self.add_constant_min_max_for_first_op(self._model.op[1])
# replace DequantizeOUTPUT_8tof with OUTPUT
dequantize_output_op.type = HexagonOp.OUTPUT.name
del dequantize_output_op.input[1:]
return quantize_input_op.output
def add_node_id(self, model_inputs):
......
......@@ -605,18 +605,20 @@ def create_internal_storage_dir(serialno, phone_data_dir):
def push_depended_so_libs(libmace_dynamic_library_path,
abi, phone_data_dir, serialno):
dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends",
libmace_dynamic_library_path)
src_file = ""
for dep in split_stdout(dep_so_libs):
if dep == "libgnustl_shared.so":
src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \
"%s/libgnustl_shared.so" \
% (os.environ["ANDROID_NDK_HOME"], abi)
elif dep == "libc++_shared.so":
src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \
"%s/libc++_shared.so"\
% (os.environ["ANDROID_NDK_HOME"], abi)
src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \
"%s/libc++_shared.so" \
% (os.environ["ANDROID_NDK_HOME"], abi)
try:
dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends",
libmace_dynamic_library_path)
except sh.ErrorReturnCode_127:
print("Find no ndk-depends, use default libc++_shared.so")
else:
for dep in split_stdout(dep_so_libs):
if dep == "libgnustl_shared.so":
src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \
"%s/libgnustl_shared.so" \
% (os.environ["ANDROID_NDK_HOME"], abi)
print("push %s to %s" % (src_file, phone_data_dir))
adb_push(src_file, phone_data_dir, serialno)
......