Commit bd7e156e authored by Bin Li

Integrate HTA

Parent 4b8897bc
@@ -78,6 +78,17 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "hta_enabled",
+    define_values = {
+        "hta": "true",
+    },
+    values = {
+        "crosstool_top": "//external:android/crosstool",
+    },
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "openmp_enabled",
     define_values = {
...
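Note: the hta_enabled setting matches when Bazel runs with --define hta=true on the Android crosstool; the if_hta_enabled helper added to mace.bzl later in this commit uses it to inject HTA sources, deps, and the -DMACE_ENABLE_HTA copt. A minimal, hedged C++ sketch of what that define toggles (the probe function here is hypothetical):

// Hypothetical probe; MACE_ENABLE_HTA is set only under --define hta=true.
bool HtaCompiledIn() {
#ifdef MACE_ENABLE_HTA
  return true;   // HTA wrapper sources and //third_party/hta were linked in
#else
  return false;  // the HTA backend was never compiled
#endif
}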
@@ -12,6 +12,8 @@ load(
     "if_android",
     "if_android_armv7",
     "if_hexagon_enabled",
+    "if_hta_enabled",
+    "if_hexagon_or_hta_enabled",
     "if_neon_enabled",
     "if_not_hexagon_enabled",
     "if_opencl_enabled",
@@ -33,17 +35,24 @@ cc_library(
         [
             "runtime/opencl/*.cc",
         ],
-    )) + if_hexagon_enabled(glob([
-        "runtime/hexagon/*.cc",
-    ])),
+    )) + if_hexagon_enabled([
+        "runtime/hexagon/hexagon_dsp_wrapper.cc",
+    ]) + if_hta_enabled([
+        "runtime/hexagon/hexagon_hta_wrapper.cc",
+    ]),
     hdrs = glob([
         "*.h",
         "runtime/cpu/*.h",
-    ]) + if_opencl_enabled(glob(
-        [
-            "runtime/opencl/*.h",
-        ],
-    )) + if_hexagon_enabled(glob(["runtime/hexagon/*.h"])),
+    ]) + if_opencl_enabled(glob([
+        "runtime/opencl/*.h",
+    ])) + if_hexagon_or_hta_enabled(glob([
+        "runtime/hexagon/hexagon_control_wrapper.h",
+        "runtime/hexagon/hexagon_device.h",
+    ])) + if_hexagon_enabled(glob([
+        "runtime/hexagon/*dsp*.h",
+    ])) + if_hta_enabled(glob([
+        "runtime/hexagon/*hta*.h",
+    ])),
     copts = [
         "-Werror",
         "-Wextra",
@@ -57,6 +66,8 @@ cc_library(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
@@ -77,6 +88,8 @@ cc_library(
         "@gemmlowp",
     ]) + if_hexagon_enabled([
         "//third_party/nnlib:libhexagon",
+    ]) + if_hta_enabled([
+        "//third_party/hta",
     ]),
 )
...
@@ -16,50 +16,67 @@
 #define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_
 
 #include <memory>
+#include <utility>
 #include <vector>
 
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
-#include "third_party/nnlib/hexagon_nn.h"
 
 namespace mace {
 
+struct InOutInfo {
+  InOutInfo(const std::vector<index_t> &shape,
+            const DataType data_type,
+            const float scale,
+            const int32_t zero_point,
+            std::unique_ptr<Tensor> tensor_u8)
+      : shape(shape),
+        data_type(data_type),
+        scale(scale),
+        zero_point(zero_point),
+        tensor_u8(std::move(tensor_u8)) {}
+
+  std::vector<index_t> shape;
+  DataType data_type;
+  float scale;
+  int32_t zero_point;
+  std::unique_ptr<Tensor> tensor_u8;
+};
+
 class HexagonControlWrapper {
  public:
-  HexagonControlWrapper() {}
-  int GetVersion();
-  bool Config();
-  bool Init();
-  bool Finalize();
-  bool SetupGraph(const NetDef &net_def, const unsigned char *model_data);
-  bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
-  bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
-                       std::vector<Tensor *> *output_tensors,
-                       bool hexagon_quantize);
-  bool TeardownGraph();
-  void PrintLog();
-  void PrintGraph();
-  void GetPerfInfo();
-  void ResetPerfInfo();
-  void SetDebugLevel(int level);
+  HexagonControlWrapper() = default;
+  virtual ~HexagonControlWrapper() = default;
+
+  virtual int GetVersion() = 0;
+  virtual bool Config() = 0;
+  virtual bool Init() = 0;
+  virtual bool Finalize() = 0;
+  virtual bool SetupGraph(const NetDef &net_def,
+                          const unsigned char *model_data) = 0;
+  virtual bool ExecuteGraph(const Tensor &input_tensor,
+                            Tensor *output_tensor) = 0;
+  virtual bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
+                               std::vector<Tensor *> *output_tensors) = 0;
+  virtual bool TeardownGraph() = 0;
+  virtual void PrintLog() = 0;
+  virtual void PrintGraph() = 0;
+  virtual void GetPerfInfo() = 0;
+  virtual void ResetPerfInfo() = 0;
+  virtual void SetDebugLevel(int level) = 0;
 
- private:
-  static constexpr int NODE_ID_OFFSET = 10000;
-  static constexpr int NUM_METADATA = 4;
+ protected:
+  static constexpr int kNodeIdOffset = 10000;
+  static constexpr int kNumMetaData = 4;
 
-  inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; }
+  inline uint32_t node_id(uint32_t nodeid) { return kNodeIdOffset + nodeid; }
 
   int nn_id_;
 
-  std::vector<std::vector<index_t>> input_shapes_;
-  std::vector<std::vector<index_t>> output_shapes_;
-  std::vector<DataType> input_data_types_;
-  std::vector<DataType> output_data_types_;
-  uint32_t num_inputs_;
-  uint32_t num_outputs_;
-  std::vector<std::unique_ptr<Tensor>> input_tensors_u8_;
-  std::vector<std::unique_ptr<Tensor>> output_tensors_u8_;
+  std::vector<InOutInfo> input_info_;
+  std::vector<InOutInfo> output_info_;
+  int num_inputs_;
+  int num_outputs_;
 
   MACE_DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
 };
...
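The new InOutInfo struct bundles, per graph input or output, the shape, data type, the model-provided quantization parameters, and an owned uint8 staging tensor. A hedged sketch of how a SetupGraph implementation fills it (mirrors the emplace_back calls later in this commit; the shape and quantization values here are hypothetical examples):

std::vector<index_t> shape = {1, 224, 224, 3};   // example NHWC shape
input_info_.emplace_back(shape,
                         DT_FLOAT,
                         0.007843f,               // example scale
                         128,                     // example zero point
                         make_unique<Tensor>());  // staging buffer, moved in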
@@ -15,18 +15,55 @@
 #ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_
 #define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_
 
+#include <memory>
+#include <utility>
+
 #include "mace/core/device.h"
+#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#ifdef MACE_ENABLE_HEXAGON
+#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h"
+#endif
+#ifdef MACE_ENABLE_HTA
+#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"
+#endif
 
 namespace mace {
 
 class HexagonDevice : public CPUDevice {
  public:
-  HexagonDevice() : CPUDevice(0, AFFINITY_NONE, false) {}
+  explicit HexagonDevice(DeviceType device_type)
+      : CPUDevice(0, AFFINITY_NONE, false),
+        device_type_(device_type) {}
 
   DeviceType device_type() const override {
-    return DeviceType::HEXAGON;
+    return device_type_;
   };
+
+ private:
+  DeviceType device_type_;
 };
+
+std::unique_ptr<HexagonControlWrapper> CreateHexagonControlWrapper(
+    DeviceType device_type) {
+  std::unique_ptr<HexagonControlWrapper> hexagon_controller;
+  switch (device_type) {
+#ifdef MACE_ENABLE_HEXAGON
+    case HEXAGON:
+      hexagon_controller = make_unique<HexagonDSPWrapper>();
+      break;
+#endif
+#ifdef MACE_ENABLE_HTA
+    case HTA:
+      hexagon_controller = make_unique<HexagonHTAWrapper>();
+      break;
+#endif
+    default:
+      LOG(FATAL) << "Not supported Hexagon device type: " << device_type;
+  }
+
+  return hexagon_controller;
+}
 }  // namespace mace
 #endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_
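Callers go through the factory once and then use only the abstract interface; a hedged usage sketch (the same Config/Init sequence appears in MaceEngine::Impl::Init later in this commit):

std::unique_ptr<HexagonControlWrapper> controller =
    CreateHexagonControlWrapper(DeviceType::HTA);
MACE_CHECK(controller->Config(), "hexagon config error");
MACE_CHECK(controller->Init(), "hexagon init error");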
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_
-#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_
 
 #include <string>
 #include <unordered_map>
@@ -57,4 +57,4 @@ class OpMap {
 };
 }  // namespace mace
 
-#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_
@@ -14,17 +14,19 @@
 
 #include <algorithm>
 #include <iomanip>
+#include <memory>
 #include <thread>  // NOLINT(build/c++11)
 #include <vector>
 #include <unordered_map>
 #include <string>
 #include <utility>
 
-#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
-#include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
+#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h"
+#include "mace/core/runtime/hexagon/hexagon_dsp_ops.h"
 #include "mace/core/types.h"
 #include "mace/port/env.h"
-#include "mace/utils/quantize.h"
+#include "mace/utils/memory.h"
+#include "third_party/nnlib/hexagon_nn.h"
 
 namespace mace {
@@ -85,33 +87,33 @@ std::string FloatToString(const FloatType v, const int32_t precision) {
 }
 }  // namespace
 
-int HexagonControlWrapper::GetVersion() {
+int HexagonDSPWrapper::GetVersion() {
   int version;
   MACE_CHECK(hexagon_nn_version(&version) == 0, "get version error");
   return version;
 }
 
-bool HexagonControlWrapper::Config() {
+bool HexagonDSPWrapper::Config() {
   LOG(INFO) << "Hexagon config";
   MACE_CHECK(hexagon_nn_set_powersave_level(0) == 0, "hexagon power error");
   MACE_CHECK(hexagon_nn_config() == 0, "hexagon config error");
   return true;
 }
 
-bool HexagonControlWrapper::Init() {
+bool HexagonDSPWrapper::Init() {
   LOG(INFO) << "Hexagon init";
   MACE_CHECK(hexagon_nn_init(&nn_id_) == 0, "hexagon_nn_init failed");
   ResetPerfInfo();
   return true;
 }
 
-bool HexagonControlWrapper::Finalize() {
+bool HexagonDSPWrapper::Finalize() {
   LOG(INFO) << "Hexagon finalize";
   return hexagon_nn_set_powersave_level(1) == 0;
 }
 
-bool HexagonControlWrapper::SetupGraph(const NetDef &net_def,
-                                       unsigned const char *model_data) {
+bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def,
+                                   unsigned const char *model_data) {
   LOG(INFO) << "Hexagon setup graph";
 
   int64_t t0 = NowMicros();
@@ -229,36 +231,40 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def,
   cached_outputs.clear();
 
   // input info
-  num_inputs_ = 0;
-  for (const InputInfo &input_info : net_def.input_info()) {
+  num_inputs_ = net_def.input_info_size();
+  input_info_.reserve(num_inputs_);
+  for (const InputOutputInfo &input_info : net_def.input_info()) {
     std::vector<index_t> input_shape(input_info.dims().begin(),
                                      input_info.dims().end());
     while (input_shape.size() < 4) {
       input_shape.insert(input_shape.begin(), 1);
     }
-    input_shapes_.push_back(input_shape);
-    input_data_types_.push_back(input_info.data_type());
-    num_inputs_ += 1;
+    input_info_.emplace_back(input_shape,
+                             input_info.data_type(),
+                             input_info.scale(),
+                             input_info.zero_point(),
+                             make_unique<Tensor>());
   }
-  input_tensors_u8_.reserve(num_inputs_);
 
   // output info
-  num_outputs_ = 0;
-  for (const OutputInfo &output_info : net_def.output_info()) {
+  num_outputs_ = net_def.output_info_size();
+  output_info_.reserve(num_outputs_);
+  for (const InputOutputInfo &output_info : net_def.output_info()) {
     std::vector<index_t> output_shape(output_info.dims().begin(),
                                       output_info.dims().end());
     while (output_shape.size() < 4) {
       output_shape.insert(output_shape.begin(), 1);
     }
-    output_shapes_.push_back(output_shape);
-    output_data_types_.push_back(output_info.data_type());
-    num_outputs_ += 1;
+    output_info_.emplace_back(output_shape,
+                              output_info.data_type(),
+                              output_info.scale(),
+                              output_info.zero_point(),
+                              make_unique<Tensor>());
     VLOG(1) << "OutputInfo: "
             << "\n\t shape: " << output_shape[0] << " " << output_shape[1]
            << " " << output_shape[2] << " " << output_shape[3]
            << "\n\t type: " << output_info.data_type();
   }
-  output_tensors_u8_.reserve(num_outputs_);
 
   int64_t t1 = NowMicros();
@@ -271,14 +277,14 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def,
   return true;
 }
 
-bool HexagonControlWrapper::TeardownGraph() {
+bool HexagonDSPWrapper::TeardownGraph() {
   LOG(INFO) << "Hexagon teardown graph";
   return hexagon_nn_teardown(nn_id_) == 0;
 }
 
 #define MACE_PRINT_BUFSIZE (2 * 1024 * 1024)
 
-void HexagonControlWrapper::PrintLog() {
+void HexagonDSPWrapper::PrintLog() {
   char *buf;
   if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
   MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
@@ -288,7 +294,7 @@ void HexagonControlWrapper::PrintLog() {
   delete[] buf;
 }
 
-void HexagonControlWrapper::PrintGraph() {
+void HexagonDSPWrapper::PrintGraph() {
   LOG(INFO) << "Print Graph";
   char *buf;
   if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
@@ -299,13 +305,13 @@ void HexagonControlWrapper::PrintGraph() {
   delete[] buf;
 }
 
-void HexagonControlWrapper::SetDebugLevel(int level) {
+void HexagonDSPWrapper::SetDebugLevel(int level) {
   LOG(INFO) << "Set debug level: " << level;
   MACE_CHECK(hexagon_nn_set_debug_level(nn_id_, level) == 0,
              "set debug level error");
 }
 
-void HexagonControlWrapper::GetPerfInfo() {
+void HexagonDSPWrapper::GetPerfInfo() {
   LOG(INFO) << "Get perf info";
   std::vector<hexagon_nn_perfinfo> perf_info(MACE_MAX_NODE);
   unsigned int n_items = 0;
@@ -380,20 +386,20 @@ void HexagonControlWrapper::GetPerfInfo() {
   LOG(INFO) << "total duration: " << std::fixed << total_duration;
 }
 
-void HexagonControlWrapper::ResetPerfInfo() {
+void HexagonDSPWrapper::ResetPerfInfo() {
   LOG(INFO) << "Reset perf info";
   MACE_CHECK(hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME) == 0,
              "reset perf error");
 }
 
-bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
-                                         Tensor *output_tensor) {
+bool HexagonDSPWrapper::ExecuteGraph(const Tensor &input_tensor,
+                                     Tensor *output_tensor) {
   VLOG(2) << "Execute graph: " << nn_id_;
   // single input and single output
   MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
   MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
-  output_tensor->SetDtype(output_data_types_[0]);
-  output_tensor->Resize(output_shapes_[0]);
+  output_tensor->SetDtype(output_info_[0].data_type);
+  output_tensor->Resize(output_info_[0].shape);
   std::vector<uint32_t> output_shape(4);
   uint32_t output_bytes;
   int res = hexagon_nn_execute(
@@ -413,10 +419,11 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
       &output_bytes);
   MACE_CHECK(res == 0, "execute error");
 
-  MACE_ASSERT(output_shape.size() == output_shapes_[0].size(),
+  MACE_ASSERT(output_shape.size() == output_info_[0].shape.size(),
               "wrong output shape inferred");
   for (size_t i = 0; i < output_shape.size(); ++i) {
-    MACE_ASSERT(static_cast<index_t>(output_shape[i]) == output_shapes_[0][i],
+    MACE_ASSERT(static_cast<index_t>(output_shape[i])
+                    == output_info_[0].shape[i],
                 "wrong output shape inferred");
   }
   MACE_ASSERT(output_bytes == output_tensor->raw_size(),
@@ -424,59 +431,35 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
   return res == 0;
 }
 
-bool HexagonControlWrapper::ExecuteGraphNew(
+bool HexagonDSPWrapper::ExecuteGraphNew(
     const std::vector<Tensor *> &input_tensors,
-    std::vector<Tensor *> *output_tensors,
-    bool hexagon_quantize) {
+    std::vector<Tensor *> *output_tensors) {
   VLOG(2) << "Execute graph new: " << nn_id_;
   uint32_t num_inputs = static_cast<uint32_t>(input_tensors.size());
   uint32_t num_outputs = static_cast<uint32_t>(output_tensors->size());
   MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
   MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
 
-  std::vector<hexagon_nn_tensordef> inputs(num_inputs * NUM_METADATA);
-  std::vector<hexagon_nn_tensordef> outputs(num_outputs * NUM_METADATA);
+  std::vector<hexagon_nn_tensordef> inputs(num_inputs * kNumMetaData);
+  std::vector<hexagon_nn_tensordef> outputs(num_outputs * kNumMetaData);
   std::vector<InputOutputMetadata> input_metadata(num_inputs);
   std::vector<InputOutputMetadata> output_metadata(num_outputs);
 
   // transform mace input to hexagon input
   for (size_t i = 0; i < num_inputs; ++i) {
     std::vector<index_t> input_shape = input_tensors[i]->shape();
-    size_t index = i * NUM_METADATA;
+    size_t index = i * kNumMetaData;
     inputs[index].batches = static_cast<uint32_t>(input_shape[0]);
     inputs[index].height = static_cast<uint32_t>(input_shape[1]);
     inputs[index].width = static_cast<uint32_t>(input_shape[2]);
     inputs[index].depth = static_cast<uint32_t>(input_shape[3]);
-    if (hexagon_quantize) {
-      inputs[index].data =
-          const_cast<unsigned char *>(reinterpret_cast<const unsigned char *>(
-              input_tensors[i]->raw_data()));
-      inputs[index].dataLen = static_cast<int>(input_tensors[i]->raw_size());
-      inputs[index].data_valid_len =
-          static_cast<uint32_t>(input_tensors[i]->raw_size());
-      input_metadata[i].Init(.0f, .0f, 1);
-    } else {
-      if (input_tensors_u8_.size() < i + 1) {
-        input_tensors_u8_.emplace_back(new Tensor());
-        input_tensors_u8_[i]->SetDtype(DT_UINT8);
-        input_tensors_u8_[i]->Resize(input_shape);
-      }
-      Quantize<uint8_t>(*input_tensors[i],
-                        input_tensors_u8_[i].get(),
-                        &input_metadata[i].min_val,
-                        &input_metadata[i].max_val);
-      inputs[index].data =
-          const_cast<unsigned char *>(reinterpret_cast<const unsigned char *>(
-              input_tensors_u8_[i]->raw_data()));
-      inputs[index].dataLen =
-          static_cast<int>(input_tensors_u8_[i]->raw_size());
-      inputs[index].data_valid_len =
-          static_cast<uint32_t>(input_tensors_u8_[i]->raw_size());
-      input_metadata[i].needs_quantization = 0;
-    }
+    inputs[index].data = const_cast<unsigned char *>(
+        reinterpret_cast<const unsigned char *>(input_tensors[i]->raw_data()));
+    inputs[index].dataLen = static_cast<int>(input_tensors[i]->raw_size());
+    inputs[index].data_valid_len =
+        static_cast<uint32_t>(input_tensors[i]->raw_size());
     inputs[index].unused = 0;
+    input_metadata[i].Init(.0f, .0f, 1);
     AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]);
     AddInputMetadata(input_metadata[i].max_val, &inputs[index + 2]);
     AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]);
@@ -484,29 +467,14 @@ bool HexagonControlWrapper::ExecuteGraphNew(
 
   // transform mace output to hexagon output
   for (size_t i = 0; i < num_outputs; ++i) {
-    size_t index = i * NUM_METADATA;
-    (*output_tensors)[i]->SetDtype(output_data_types_[i]);
-    (*output_tensors)[i]->Resize(output_shapes_[i]);
-    if (hexagon_quantize) {
-      outputs[index].data = reinterpret_cast<unsigned char *>(
-          (*output_tensors)[i]->raw_mutable_data());
-      outputs[index].dataLen =
-          static_cast<int>((*output_tensors)[i]->raw_size());
-      output_metadata[i].Init(.0f, .0f, 1);
-    } else {
-      if (output_tensors_u8_.size() < i + 1) {
-        output_tensors_u8_.emplace_back(new Tensor());
-        output_tensors_u8_[i]->SetDtype(DT_UINT8);
-        output_tensors_u8_[i]->Resize(output_shapes_[i]);
-      }
-      outputs[index].data = reinterpret_cast<unsigned char *>(
-          output_tensors_u8_[i]->raw_mutable_data());
-      outputs[index].dataLen =
-          static_cast<int>(output_tensors_u8_[i]->raw_size());
-      output_metadata[i].Init(.0f, .0f, 0);
-    }
+    size_t index = i * kNumMetaData;
+    (*output_tensors)[i]->SetDtype(output_info_[i].data_type);
+    (*output_tensors)[i]->Resize(output_info_[i].shape);
+    outputs[index].data = reinterpret_cast<unsigned char *>(
+        (*output_tensors)[i]->raw_mutable_data());
+    outputs[index].dataLen = static_cast<int>((*output_tensors)[i]->raw_size());
+    output_metadata[i].Init(.0f, .0f, 1);
     AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]);
     AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]);
@@ -517,38 +485,27 @@ bool HexagonControlWrapper::ExecuteGraphNew(
 
   // Execute graph
   int res = hexagon_nn_execute_new(nn_id_,
                                    inputs.data(),
-                                   num_inputs * NUM_METADATA,
+                                   num_inputs * kNumMetaData,
                                    outputs.data(),
-                                   num_outputs * NUM_METADATA);
+                                   num_outputs * kNumMetaData);
 
   // handle hexagon output
   for (size_t i = 0; i < num_outputs; ++i) {
-    size_t index = i * NUM_METADATA;
+    size_t index = i * kNumMetaData;
     std::vector<uint32_t> output_shape{
         outputs[index].batches, outputs[index].height, outputs[index].width,
         outputs[index].depth};
-    MACE_ASSERT(output_shape.size() == output_shapes_[i].size(),
+    MACE_ASSERT(output_shape.size() == output_info_[i].shape.size(),
                 "wrong output shape inferred");
     for (size_t j = 0; j < output_shape.size(); ++j) {
       MACE_ASSERT(static_cast<index_t>(output_shape[j])
-                      == output_shapes_[i][j],
+                      == output_info_[i].shape[j],
                   "wrong output shape inferred");
     }
-    if (hexagon_quantize) {
-      MACE_ASSERT(static_cast<index_t>(outputs[index].data_valid_len)
-                      == (*output_tensors)[i]->raw_size(),
-                 "wrong output bytes inferred.");
-    } else {
-      MACE_ASSERT(static_cast<index_t>(outputs[index].data_valid_len)
-                      == output_tensors_u8_[i]->raw_size(),
-                 "wrong output bytes inferred.");
-      DeQuantize<uint8_t>(*output_tensors_u8_[i],
-                          output_metadata[i].min_val,
-                          output_metadata[i].max_val,
-                          (*output_tensors)[i]);
-    }
+    MACE_ASSERT(static_cast<index_t>(outputs[index].data_valid_len)
+                    == (*output_tensors)[i]->raw_size(),
+                "wrong output bytes inferred.");
   }
 
   return res == 0;
...
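With the hexagon_quantize flag removed, the DSP path now always passes raw float buffers and sets needs_quantization to 1, so quantization happens on-device against a dynamic range instead of on the host. A hedged, self-contained sketch of the per-element min/max quantization the deleted Quantize<uint8_t> host path performed (not MACE's exact implementation):

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t QuantizeMinMax(float v, float min_val, float max_val) {
  const float scale = (max_val - min_val) / 255.0f;   // uint8 step size
  const long q = std::lround((v - min_val) / scale);  // nearest level
  return static_cast<uint8_t>(std::min(255L, std::max(0L, q)));
}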
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_

#include <vector>

#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"

namespace mace {

class HexagonDSPWrapper : public HexagonControlWrapper {
 public:
  HexagonDSPWrapper() = default;

  int GetVersion() override;
  bool Config() override;
  bool Init() override;
  bool Finalize() override;
  bool SetupGraph(const NetDef &net_def,
                  const unsigned char *model_data) override;
  bool ExecuteGraph(const Tensor &input_tensor,
                    Tensor *output_tensor) override;
  bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
                       std::vector<Tensor *> *output_tensors) override;
  bool TeardownGraph() override;
  void PrintLog() override;
  void PrintGraph() override;
  void GetPerfInfo() override;
  void ResetPerfInfo() override;
  void SetDebugLevel(int level) override;

  MACE_DISABLE_COPY_AND_ASSIGN(HexagonDSPWrapper);
};
}  // namespace mace

#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_
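Since both backends override the same pure-virtual interface, engine code stays backend-agnostic; a hedged sketch of the dispatch (the helper function is hypothetical):

bool RunOnce(HexagonControlWrapper *controller,
             const std::vector<Tensor *> &inputs,
             std::vector<Tensor *> *outputs) {
  // Resolves to HexagonDSPWrapper or HexagonHTAWrapper through the vtable.
  return controller->ExecuteGraphNew(inputs, outputs);
}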
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_

#include <string>
#include <unordered_map>

#include "mace/utils/logging.h"
#include "third_party/hta/hta_hexagon_nn_ops.h"

namespace mace {

class OpMap {
 public:
  void Init() {
#define HTA_DEF_OP(NAME) op_map_[#NAME] = HTA_OP_##NAME;

#include "third_party/hta/hta_ops.h"

#undef HTA_DEF_OP
  }

  hta_op_type GetOpId(const std::string &op_type) {
    if (op_map_.find(op_type) != end(op_map_)) {
      return op_map_[op_type];
    } else {
      LOG(ERROR) << "HTA unsupported op type: " << op_type;
      return HTA_NN_OPS_MAX;
    }
  }

 private:
  std::unordered_map<std::string, hta_op_type> op_map_;
};
}  // namespace mace

#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_
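OpMap::Init relies on the X-macro pattern: third_party/hta/hta_ops.h is assumed to be a flat list of HTA_DEF_OP(NAME) invocations, so redefining HTA_DEF_OP before including it expands the list into map insertions. A self-contained, hedged illustration with a stand-in two-op list (hypothetical ops and names):

#include <string>
#include <unordered_map>

// Stand-in for third_party/hta/hta_ops.h: a flat HTA_DEF_OP list.
#define HTA_OPS_STAND_IN \
  HTA_DEF_OP(INPUT)      \
  HTA_DEF_OP(OUTPUT)

// First expansion: build the op-id enum.
#define HTA_DEF_OP(NAME) HTA_OP_##NAME,
enum hta_op_type_demo { HTA_OPS_STAND_IN HTA_NN_OPS_MAX_DEMO };
#undef HTA_DEF_OP

// Second expansion: build the name -> id map, as OpMap::Init does.
std::unordered_map<std::string, hta_op_type_demo> BuildDemoOpMap() {
  std::unordered_map<std::string, hta_op_type_demo> m;
#define HTA_DEF_OP(NAME) m[#NAME] = HTA_OP_##NAME;
  HTA_OPS_STAND_IN
#undef HTA_DEF_OP
  return m;
}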
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"

#include <algorithm>
#include <iomanip>
#include <memory>
#include <string>
#include <vector>
#include <unordered_map>
#include <utility>

#include "mace/core/runtime/hexagon/hexagon_hta_ops.h"
#include "mace/core/types.h"
#include "mace/utils/memory.h"
#include "mace/utils/quantize.h"
#include "third_party/hta/hta_hexagon_api.h"

namespace mace {

int HexagonHTAWrapper::GetVersion() {
  int version;
  MACE_CHECK(hexagon_hta_nn_version(&version) == 0, "get version error");
  return version;
}

bool HexagonHTAWrapper::Config() {
  LOG(INFO) << "HTA config";
  MACE_CHECK(hexagon_hta_nn_config() == 0, "hexagon config error");
  return true;
}

bool HexagonHTAWrapper::Init() {
  LOG(INFO) << "Hexagon init";
  MACE_CHECK(hexagon_hta_nn_init(&nn_id_) == 0, "hexagon_nn_init failed");
  ResetPerfInfo();
  return true;
}

bool HexagonHTAWrapper::Finalize() {
  LOG(INFO) << "Hexagon finalize";
  return true;
}

bool HexagonHTAWrapper::SetupGraph(const NetDef &net_def,
                                   unsigned const char *model_data) {
  LOG(INFO) << "Hexagon setup graph";

  int64_t t0 = NowMicros();

  // const node
  for (const ConstTensor &const_tensor : net_def.tensors()) {
    std::vector<int> tensor_shape(const_tensor.dims().begin(),
                                  const_tensor.dims().end());
    while (tensor_shape.size() < 4) {
      tensor_shape.insert(tensor_shape.begin(), 1);
    }

    hexagon_nn_const_node const_node;
    const_node.node_id = node_id(const_tensor.node_id());
    const_node.tensor.batches = tensor_shape[0];
    const_node.tensor.height = tensor_shape[1];
    const_node.tensor.width = tensor_shape[2];
    const_node.tensor.depth = tensor_shape[3];

    if (const_tensor.data_type() == DataType::DT_INT32 &&
        const_tensor.data_size() == 0) {
      const_node.tensor.data = NULL;
      const_node.tensor.dataLen = 0;
    } else {
      const_node.tensor.data =
          const_cast<unsigned char *>(model_data + const_tensor.offset());
      const_node.tensor.dataLen = const_tensor.data_size() *
          GetEnumTypeSize(const_tensor.data_type());
    }

    hexagon_hta_nn_append_const_node(nn_id_,
                                     const_node.node_id,
                                     const_node.tensor.batches,
                                     const_node.tensor.height,
                                     const_node.tensor.width,
                                     const_node.tensor.depth,
                                     const_node.tensor.data,
                                     const_node.tensor.dataLen);
  }

  // op node
  OpMap op_map;
  op_map.Init();
  std::vector<std::vector<hexagon_hta_nn_input>> cached_inputs;
  std::vector<std::vector<hexagon_hta_nn_output>> cached_outputs;
  std::vector<hexagon_hta_nn_input> inputs;
  std::vector<hexagon_hta_nn_output> outputs;

  for (const OperatorDef &op : net_def.op()) {
    hta_op_type op_id = op_map.GetOpId(op.type());
    inputs.resize(op.node_input().size());
    for (int i = 0; i < op.node_input().size(); ++i) {
      inputs[i].src_id = node_id(op.node_input()[i].node_id());
      inputs[i].output_idx = op.node_input()[i].output_port();
    }
    outputs.resize(op.output_shape().size());
    for (int i = 0; i < op.output_shape().size(); ++i) {
      outputs[i].rank = op.output_shape()[i].dims().size();
      for (size_t j = 0; j < outputs[i].rank; ++j) {
        outputs[i].max_sizes[j] = op.output_shape()[i].dims()[j];
      }
      if (outputs[i].rank == 0) {
        outputs[i].rank = 1;
        outputs[i].max_sizes[0] = 1;
      }
      outputs[i].max_sizes[outputs[i].rank] = 0;
      outputs[i].elementsize = GetEnumTypeSize(
          static_cast<DataType>(op.output_type()[i]));
      outputs[i].zero_offset = 0;
      outputs[i].stepsize = 0;
    }
    cached_inputs.push_back(inputs);
    cached_outputs.push_back(outputs);

    auto padding_type = static_cast<hta_padding_type>(op.padding());

    hexagon_nn_op_node op_node;
    op_node.node_id = node_id(op.node_id());
    op_node.operation = op_id;
    op_node.padding = padding_type;
    op_node.inputs = cached_inputs.back().data();
    op_node.inputsLen = inputs.size();
    op_node.outputs = cached_outputs.back().data();
    op_node.outputsLen = outputs.size();

    hexagon_hta_nn_append_node(nn_id_,
                               op_node.node_id,
                               op_node.operation,
                               op_node.padding,
                               op_node.inputs,
                               op_node.inputsLen,
                               op_node.outputs,
                               op_node.outputsLen);
  }

  // input info
  num_inputs_ = net_def.input_info_size();
  input_info_.reserve(num_inputs_);
  for (const InputOutputInfo &input_info : net_def.input_info()) {
    std::vector<index_t> input_shape(input_info.dims().begin(),
                                     input_info.dims().end());
    while (input_shape.size() < 4) {
      input_shape.insert(input_shape.begin(), 1);
    }
    input_info_.emplace_back(input_shape,
                             input_info.data_type(),
                             input_info.scale(),
                             input_info.zero_point(),
                             make_unique<Tensor>());
  }

  // output info
  num_outputs_ = net_def.output_info_size();
  output_info_.reserve(num_outputs_);
  for (const InputOutputInfo &output_info : net_def.output_info()) {
    std::vector<index_t> output_shape(output_info.dims().begin(),
                                      output_info.dims().end());
    while (output_shape.size() < 4) {
      output_shape.insert(output_shape.begin(), 1);
    }
    output_info_.emplace_back(output_shape,
                              output_info.data_type(),
                              output_info.scale(),
                              output_info.zero_point(),
                              make_unique<Tensor>());
    VLOG(1) << "OutputInfo: "
            << "\n\t shape: " << output_shape[0] << " " << output_shape[1]
            << " " << output_shape[2] << " " << output_shape[3]
            << "\n\t type: " << output_info.data_type();
  }

  int64_t t1 = NowMicros();

  MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed");

  int64_t t2 = NowMicros();

  VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1;

  return true;
}

bool HexagonHTAWrapper::TeardownGraph() {
  LOG(INFO) << "Hexagon teardown graph";
  return hexagon_hta_nn_teardown(nn_id_) == 0;
}

void HexagonHTAWrapper::PrintLog() {
  LOG(INFO) << "Print Log";
}

void HexagonHTAWrapper::PrintGraph() {
  LOG(INFO) << "Print Graph";
}

void HexagonHTAWrapper::SetDebugLevel(int level) {
  LOG(INFO) << "Set debug level: " << level;
  MACE_CHECK(hexagon_hta_nn_set_debug_level(nn_id_, level) == 0,
             "set debug level error");
}

void HexagonHTAWrapper::GetPerfInfo() {
  LOG(INFO) << "Get perf info";
}

void HexagonHTAWrapper::ResetPerfInfo() {
  LOG(INFO) << "Reset perf info";
}

bool HexagonHTAWrapper::ExecuteGraph(const Tensor &input_tensor,
                                     Tensor *output_tensor) {
  MACE_UNUSED(input_tensor);
  MACE_UNUSED(output_tensor);
  MACE_NOT_IMPLEMENTED;
  return false;
}

bool HexagonHTAWrapper::ExecuteGraphNew(
    const std::vector<Tensor *> &input_tensors,
    std::vector<Tensor *> *output_tensors) {
  VLOG(2) << "Execute graph new: " << nn_id_;
  uint32_t num_inputs = static_cast<uint32_t>(input_tensors.size());
  uint32_t num_outputs = static_cast<uint32_t>(output_tensors->size());
  MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
  MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");

  std::vector<hexagon_hta_nn_tensordef> inputs(num_inputs);
  std::vector<hexagon_hta_nn_tensordef> outputs(num_outputs);

  for (size_t i = 0; i < num_inputs; ++i) {
    std::vector<index_t> input_shape = input_tensors[i]->shape();
    inputs[i].batches = static_cast<uint32_t>(input_shape[0]);
    inputs[i].height = static_cast<uint32_t>(input_shape[1]);
    inputs[i].width = static_cast<uint32_t>(input_shape[2]);
    inputs[i].depth = static_cast<uint32_t>(input_shape[3]);

    input_info_[i].tensor_u8->SetDtype(DT_UINT8);
    input_info_[i].tensor_u8->Resize(input_shape);
    const float *input_data = input_tensors[i]->data<float>();
    uint8_t *input_data_u8 = input_info_[i].tensor_u8->mutable_data<uint8_t>();
    QuantizeWithScaleAndZeropoint(input_data,
                                  input_tensors[i]->size(),
                                  input_info_[i].scale,
                                  input_info_[i].zero_point,
                                  input_data_u8);

    inputs[i].data = const_cast<unsigned char *>(
        reinterpret_cast<const unsigned char *>(
            input_info_[i].tensor_u8->raw_data()));
    inputs[i].dataLen = static_cast<int>(input_info_[i].tensor_u8->raw_size());
    inputs[i].data_valid_len = static_cast<uint32_t>(
        input_info_[i].tensor_u8->raw_size());
    inputs[i].unused = 0;
  }

  for (size_t i = 0; i < num_outputs; ++i) {
    (*output_tensors)[i]->SetDtype(output_info_[i].data_type);
    (*output_tensors)[i]->Resize(output_info_[i].shape);
    output_info_[i].tensor_u8->SetDtype(DT_UINT8);
    output_info_[i].tensor_u8->Resize(output_info_[i].shape);
    outputs[i].data = reinterpret_cast<unsigned char *>(
        output_info_[i].tensor_u8->raw_mutable_data());
    outputs[i].dataLen =
        static_cast<int>(output_info_[i].tensor_u8->raw_size());
  }

  int res = hexagon_hta_nn_execute_new(nn_id_,
                                       inputs.data(),
                                       num_inputs,
                                       outputs.data(),
                                       num_outputs);

  for (size_t i = 0; i < num_outputs; ++i) {
    std::vector<uint32_t> output_shape{
        outputs[i].batches, outputs[i].height, outputs[i].width,
        outputs[i].depth};
    MACE_ASSERT(output_shape.size() == output_info_[i].shape.size(),
                "wrong output shape inferred");
    for (size_t j = 0; j < output_shape.size(); ++j) {
      MACE_ASSERT(static_cast<index_t>(output_shape[j])
                      == output_info_[i].shape[j],
                  "wrong output shape inferred");
    }
    MACE_ASSERT(static_cast<index_t>(outputs[i].data_valid_len)
                    == (*output_tensors)[i]->raw_size(),
                "wrong output bytes inferred.");

    const uint8_t *output_data_u8 = output_info_[i].tensor_u8->data<uint8_t>();
    float *output_data = (*output_tensors)[i]->mutable_data<float>();
    Dequantize(output_data_u8,
               output_info_[i].tensor_u8->size(),
               output_info_[i].scale,
               output_info_[i].zero_point,
               output_data);
  }

  return res == 0;
}
}  // namespace mace
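Unlike the DSP path, the HTA wrapper quantizes on the CPU using the fixed scale and zero_point baked into the model, so no per-run min/max search is needed. A hedged sketch of the affine transform that QuantizeWithScaleAndZeropoint and Dequantize are expected to apply per element (not MACE's exact code):

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t QuantizeAffine(float v, float scale, int32_t zero_point) {
  const long q = std::lround(v / scale) + zero_point;  // nearest level
  return static_cast<uint8_t>(std::min(255L, std::max(0L, q)));
}

float DequantizeAffine(uint8_t q, float scale, int32_t zero_point) {
  return scale * (static_cast<int32_t>(q) - zero_point);
}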
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_

#include <vector>

#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"

namespace mace {

class HexagonHTAWrapper : public HexagonControlWrapper {
 public:
  HexagonHTAWrapper() = default;

  int GetVersion() override;
  bool Config() override;
  bool Init() override;
  bool Finalize() override;
  bool SetupGraph(const NetDef &net_def,
                  const unsigned char *model_data) override;
  bool ExecuteGraph(const Tensor &input_tensor,
                    Tensor *output_tensor) override;
  bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
                       std::vector<Tensor *> *output_tensors) override;
  bool TeardownGraph() override;
  void PrintLog() override;
  void PrintGraph() override;
  void GetPerfInfo() override;
  void ResetPerfInfo() override;
  void SetDebugLevel(int level) override;

  MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper);
};
}  // namespace mace

#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
@@ -3,6 +3,7 @@ load(
     "//mace:mace.bzl",
     "if_android",
     "if_hexagon_enabled",
+    "if_hta_enabled",
     "if_opencl_enabled",
     "if_openmp_enabled",
 )
@@ -36,6 +37,8 @@ cc_binary(
         "//mace/utils:utils_hdrs",
     ] + if_hexagon_enabled([
         "//third_party/nnlib:libhexagon",
+    ]) + if_hta_enabled([
+        "//third_party/hta",
     ]),
 )
...
@@ -79,6 +79,8 @@ DeviceType ParseDeviceType(const std::string &device_str) {
     return DeviceType::GPU;
   } else if (device_str.compare("HEXAGON") == 0) {
     return DeviceType::HEXAGON;
+  } else if (device_str.compare("HTA") == 0) {
+    return DeviceType::HTA;
   } else {
     return DeviceType::CPU;
   }
...
@@ -16,6 +16,7 @@ load(
     "if_openmp_enabled",
     "if_android_armv7",
     "if_hexagon_enabled",
+    "if_hta_enabled",
     "if_opencl_enabled",
     "if_quantize_enabled",
 )
@@ -40,6 +41,8 @@ cc_library(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]),
     deps = [
         "//mace/ops",
...
@@ -142,7 +142,7 @@ void BMNet::SetUp() {
   // Add input and output information
   for (size_t i = 0; i < input_names_.size(); ++i) {
-    InputInfo *info = net_.add_input_info();
+    InputOutputInfo *info = net_.add_input_info();
     info->set_data_format(DataFormat::NHWC);
     info->set_name(input_names_[i]);
     for (auto d : input_shapes_[i]) {
@@ -150,7 +150,7 @@ void BMNet::SetUp() {
     }
   }
   for (auto output_name : output_names_) {
-    OutputInfo *info = net_.add_output_info();
+    InputOutputInfo *info = net_.add_output_info();
     info->set_name(output_name);
   }
   // allocate weight data
...
@@ -33,10 +33,9 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #endif  // MACE_ENABLE_OPENCL
 
-#ifdef MACE_ENABLE_HEXAGON
-#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
 #include "mace/core/runtime/hexagon/hexagon_device.h"
-#endif  // MACE_ENABLE_HEXAGON
+#endif
 
 namespace mace {
 namespace {
@@ -387,11 +386,11 @@ class MaceEngine::Impl {
   std::unique_ptr<Workspace> ws_;
   std::unique_ptr<NetBase> net_;
   bool is_quantized_model_;
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
 #endif
 
-  std::map<std::string, mace::InputInfo> input_info_map_;
-  std::map<std::string, mace::OutputInfo> output_info_map_;
+  std::map<std::string, mace::InputOutputInfo> input_info_map_;
+  std::map<std::string, mace::InputOutputInfo> output_info_map_;
 
   MACE_DISABLE_COPY_AND_ASSIGN(Impl);
 };
@@ -404,7 +403,7 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
       ws_(new Workspace()),
       net_(nullptr),
       is_quantized_model_(false)
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
       , hexagon_controller_(nullptr)
 #endif
 {
@@ -427,9 +426,9 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
                                   config.impl_->use_gemmlowp()));
   }
 #endif
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == DeviceType::HEXAGON) {
-    device_.reset(new HexagonDevice());
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == DeviceType::HEXAGON || device_type_ == DeviceType::HTA) {
+    device_.reset(new HexagonDevice(device_type_));
   }
 #endif
   MACE_CHECK_NOTNULL(device_);
@@ -481,13 +480,13 @@ MaceStatus MaceEngine::Impl::Init(
                  << "' does not belong to model's outputs "
                  << MakeString(MapKeys(output_info_map_));
     }
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
     ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
 #endif
   }
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == HEXAGON) {
-    hexagon_controller_.reset(new HexagonControlWrapper());
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == HEXAGON || device_type_ == HTA) {
+    hexagon_controller_ = CreateHexagonControlWrapper(device_type_);
     MACE_CHECK(hexagon_controller_->Config(), "hexagon config error");
     MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
     hexagon_controller_->SetDebugLevel(
@@ -519,7 +518,7 @@ MaceStatus MaceEngine::Impl::Init(
       ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
     }
     MACE_RETURN_IF_ERROR(net_->Init());
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   }
 #endif
@@ -541,6 +540,7 @@ MaceStatus MaceEngine::Impl::Init(
       reinterpret_cast<const unsigned char *>(model_data_->data())));
 
   if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON ||
+      device_type_ == DeviceType::HTA ||
       (device_type_ == DeviceType::CPU && ws_->diffused_buffer())) {
     model_data_.reset();
   }
@@ -549,8 +549,8 @@ MaceStatus MaceEngine::Impl::Init(
 MaceEngine::Impl::~Impl() {
   LOG(INFO) << "Destroying MaceEngine";
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == HEXAGON) {
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == HEXAGON || device_type_ == HTA) {
     if (VLOG_IS_ON(2)) {
       hexagon_controller_->GetPerfInfo();
       hexagon_controller_->PrintLog();
@@ -699,15 +699,15 @@ MaceStatus MaceEngine::Impl::Run(
     Tensor *output_tensor = ws_->GetTensor(output.first);
     output_tensors.push_back(output_tensor);
   }
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == HEXAGON) {
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == HEXAGON || device_type_ == HTA) {
     MACE_CHECK(input_tensors.size() == 1 && output_tensors.size() == 1,
                "HEXAGON not support multiple inputs and outputs yet.");
-    hexagon_controller_->ExecuteGraphNew(input_tensors, &output_tensors, true);
+    hexagon_controller_->ExecuteGraphNew(input_tensors, &output_tensors);
   } else {
 #endif
     MACE_RETURN_IF_ERROR(net_->Run(run_metadata));
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   }
 #endif
...
@@ -15,8 +15,7 @@ mace {
     *mace*NetDef*;
     *mace*MemoryType*;
     *mace*DataType*;
-    *mace*InputInfo*;
-    *mace*OutputInfo*;
+    *mace*InputOutputInfo*;
     *mace*OutputShape*;
     *mace*OperatorDef*;
     *mace*ConstTensor*;
...
@@ -60,6 +60,19 @@ def if_not_hexagon_enabled(a):
         "//conditions:default": a,
     })
 
+def if_hta_enabled(a):
+    return select({
+        "//mace:hta_enabled": a,
+        "//conditions:default": [],
+    })
+
+def if_hexagon_or_hta_enabled(a):
+    return select({
+        "//mace:hexagon_enabled": a,
+        "//mace:hta_enabled": a,
+        "//conditions:default": [],
+    })
+
 def if_openmp_enabled(a):
     return select({
         "//mace:openmp_enabled": a,
...
@@ -86,21 +86,15 @@ message OperatorDef {
 }
 
 // for hexagon mace-nnlib
-message InputInfo {
-  optional string name = 1;
-  optional int32 node_id = 2;
-  repeated int32 dims = 3;
-  optional int32 max_byte_size = 4;  // only support 32-bit len
-  optional DataType data_type = 5 [default = DT_FLOAT];
-  optional int32 data_format = 6 [default = 1];  // NHWC
-}
-
-message OutputInfo {
+message InputOutputInfo {
   optional string name = 1;
   optional int32 node_id = 2;
   repeated int32 dims = 3;
   optional int32 max_byte_size = 4;  // only support 32-bit len
   optional DataType data_type = 5 [default = DT_FLOAT];
   optional int32 data_format = 6 [default = 1];  // NHWC
+  optional float scale = 7;
+  optional int32 zero_point = 8;
 }
 
 message NetDef {
@@ -109,6 +103,6 @@ message NetDef {
   repeated ConstTensor tensors = 3;
 
   // for hexagon mace-nnlib
-  repeated InputInfo input_info = 100;
-  repeated OutputInfo output_info = 101;
+  repeated InputOutputInfo input_info = 100;
+  repeated InputOutputInfo output_info = 101;
 }
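The two new proto fields let the converter bake quantization parameters into the serialized graph, which the HTA wrapper reads back as real = scale * (quantized - zero_point). A hedged sketch of populating them from C++ (the net_def instance and the numeric values are hypothetical):

mace::InputOutputInfo *info = net_def.add_input_info();
info->set_scale(0.007843f);  // hypothetical example
info->set_zero_point(128);   // hypothetical example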
@@ -32,7 +32,7 @@ namespace mace {
 
 class NetDef;
 
-enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 };
+enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 };
 
 enum DataFormat { DF_NONE = 0, NHWC = 1, NCHW = 2};
...
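The explicit value HTA = 4 is load-bearing: the Python converter's DeviceType enum below mirrors it, and serialized models rely on the numbers agreeing. A hedged compile-time guard one could add (not part of this commit):

#include "mace/public/mace.h"

static_assert(static_cast<int>(mace::HTA) == 4,
              "DeviceType values are shared with the Python converter");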
@@ -37,6 +37,7 @@ FLAGS = None
 device_type_map = {'cpu': cvt.DeviceType.CPU.value,
                    'gpu': cvt.DeviceType.GPU.value,
                    'dsp': cvt.DeviceType.HEXAGON.value,
+                   'hta': cvt.DeviceType.HTA.value,
                    'cpu+gpu': cvt.DeviceType.CPU.value}
 
 data_format_map = {
@@ -53,10 +54,11 @@ def parse_data_type(data_type, device_type):
             return mace_pb2.DT_FLOAT
         else:
             return mace_pb2.DT_HALF
-    elif device_type == cvt.DeviceType.HEXAGON.value:
+    elif device_type == cvt.DeviceType.HEXAGON.value or \
+            device_type == cvt.DeviceType.HTA.value:
         return mace_pb2.DT_FLOAT
     else:
-        print("Invalid device type: " + device_type)
+        print("Invalid device type: " + str(device_type))
 
 
 def file_checksum(fname):
@@ -121,7 +123,7 @@ def main(unused_args):
         six.print_("platform %s is not supported." % FLAGS.platform,
                    file=sys.stderr)
         sys.exit(-1)
-    if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', 'cpu+gpu']:
+    if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', 'hta', 'cpu+gpu']:
         six.print_("runtime %s is not supported." % FLAGS.runtime,
                    file=sys.stderr)
         sys.exit(-1)
@@ -220,7 +222,8 @@ def main(unused_args):
             option, output_graph_def)
         output_graph_def, quantize_activation_info = mace_transformer.run()
 
-        if FLAGS.runtime == 'dsp':
+        if option.device in [cvt.DeviceType.HEXAGON.value,
+                             cvt.DeviceType.HTA.value]:
             from mace.python.tools.converter_tool import hexagon_converter
             converter = hexagon_converter.HexagonConverter(
                 option, output_graph_def, quantize_activation_info)
...
@@ -22,6 +22,7 @@ class DeviceType(Enum):
     CPU = 0
     GPU = 2
     HEXAGON = 3
+    HTA = 4
 
 
 class DataFormat(Enum):
...
@@ -20,6 +20,7 @@ from operator import mul
 from mace.proto import mace_pb2
 from mace.python.tools.converter_tool import base_converter
 from mace.python.tools.converter_tool.base_converter import ConverterUtil
+from mace.python.tools.converter_tool.base_converter import DeviceType
 from mace.python.tools.converter_tool.base_converter import EltwiseType
 from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.converter_tool.base_converter import MaceOp
@@ -36,6 +37,8 @@ HexagonSupportedOps = [
     'BatchToSpaceND_8',
     'DepthwiseSupernode_8x8p32to8',
     'DequantizeOUTPUT_8tof',
+    'INPUT',
+    'OUTPUT',
     'QuantizedAdd_8p8to8',
     'QuantizedAvgPool_8',
     'QuantizedConcat_8',
@@ -332,7 +335,7 @@ class HexagonConverter(base_converter.ConverterInterface):
         else:
             op.type = self._hexagon_ops.map_nn_op(op.type)
 
-    def add_min_max(self, name, val):
+    def add_const_node(self, name, val):
         if name not in self._consts:
             tensor = self._model.tensors.add()
             self._consts[name] = tensor
@@ -364,14 +367,14 @@ class HexagonConverter(base_converter.ConverterInterface):
                 min_tensor_name = op + ':1'
             else:
                 min_tensor_name = op + '_min:0'
-                self.add_min_max(min_tensor_name, minval)
+                self.add_const_node(min_tensor_name, minval)
             this_op.input.extend([min_tensor_name])
         if add_max:
             if is_activation and diff_port:
                 max_tensor_name = op + ':2'
             else:
                 max_tensor_name = op + '_max:0'
-                self.add_min_max(max_tensor_name, maxval)
+                self.add_const_node(max_tensor_name, maxval)
             this_op.input.extend([max_tensor_name])
 
     def add_shape_const_node(self, op, values, name):
@@ -382,27 +385,48 @@ class HexagonConverter(base_converter.ConverterInterface):
         tensor.dims.extend(values)
         return tensor.name
 
-    def add_input_output_node(self):
-        for op in self._model.op:
-            if op.name.startswith(MaceKeyword.mace_input_node_name):
-                del op.input[0]
-                break
-
-        output_node = None
-        if not self._option.check_nodes:
-            output_name = list(self._option.output_nodes.values())[0].name
-        else:
-            output_name = list(self._option.check_nodes.values())[0].name
-        output_name = normalize_name(output_name)
-        for op in self._model.op:
-            if op.name == output_name:
-                output_node = op
-                break
-        mace_check(output_node is not None,
-                   "mace_output_node_* not found.")
-        del output_node.output_shape[:]
-        del output_node.output_type[:]
-        del output_node.out_max_byte_size[:]
+    def add_constant_min_max_for_first_op(self, op):
+        minval = self._quantize_activation_info[op.input[0]].minval
+        maxval = self._quantize_activation_info[op.input[0]].maxval
+        input_op, _ = get_op_and_port_from_tensor(op.input[0])
+        input_min = input_op + '_min:0'
+        input_max = input_op + '_max:0'
+        self.add_const_node(input_min, minval)
+        self.add_const_node(input_max, maxval)
+        for i in range(len(op.input)):
+            if op.input[i] == input_op + ':1':
+                op.input[i] = input_min
+            elif op.input[i] == input_op + ':2':
+                op.input[i] = input_max
+
+    def add_input_output_node(self):
+        mace_check(
+            self._model.op[0].type == HexagonOp.QuantizeINPUT_f_to_8.name,
+            "Not started with Quantize op.")
+        quantize_input_op = self._model.op[0]
+        del quantize_input_op.input[:]
+
+        mace_check(
+            self._model.op[-1].type == HexagonOp.DequantizeOUTPUT_8tof.name,
+            "Not ended with Dequantize op.")
+        dequantize_output_op = self._model.op[-1]
+        del dequantize_output_op.output_shape[:]
+        del dequantize_output_op.output_type[:]
+        del dequantize_output_op.out_max_byte_size[:]
+
+        if self._option.device == DeviceType.HTA.value:
+            # replace QuantizeINPUT_f_to_8 with INPUT
+            quantize_input_op.type = HexagonOp.INPUT.name
+            del quantize_input_op.output_shape[1:]
+            del quantize_input_op.output_type[1:]
+            del quantize_input_op.out_max_byte_size[1:]
+
+            # replace first op's input min max with constant
+            self.add_constant_min_max_for_first_op(self._model.op[1])
+
+            # replace DequantizeOUTPUT_8tof with OUTPUT
+            dequantize_output_op.type = HexagonOp.OUTPUT.name
+            del dequantize_output_op.input[1:]
 
     def add_node_id(self):
         node_id_counter = 0
...
...@@ -1174,7 +1174,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1174,7 +1174,8 @@ class Transformer(base_converter.ConverterInterface):
self.set_filter_format(FilterFormat.OHWI) self.set_filter_format(FilterFormat.OHWI)
elif self._option.quantize and \ elif self._option.quantize and \
self._option.device == DeviceType.HEXAGON.value: (self._option.device == DeviceType.HEXAGON.value or
self._option.device == DeviceType.HTA.value):
print("Transpose filters to HWIO/HWIM") print("Transpose filters to HWIO/HWIM")
mace_check(filter_format == FilterFormat.HWIO, mace_check(filter_format == FilterFormat.HWIO,
"HEXAGON only support HWIO/HWIM filter format.") "HEXAGON only support HWIO/HWIM filter format.")
...@@ -1456,7 +1457,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1456,7 +1457,7 @@ class Transformer(base_converter.ConverterInterface):
% (op.name, op.type, % (op.name, op.type,
mace_pb2.DataType.Name(data_type_arg.i))) mace_pb2.DataType.Name(data_type_arg.i)))
for input_node in self._option.input_nodes.values(): for i, input_node in enumerate(self._option.input_nodes.values()):
new_input_name = self.input_name_map[input_node.name] new_input_name = self.input_name_map[input_node.name]
op_def = self._model.op.add() op_def = self._model.op.add()
op_def.name = self.normalize_op_name(new_input_name) op_def.name = self.normalize_op_name(new_input_name)
...@@ -1465,8 +1466,10 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1465,8 +1466,10 @@ class Transformer(base_converter.ConverterInterface):
op_def.output.extend([new_input_name]) op_def.output.extend([new_input_name])
output_shape = op_def.output_shape.add() output_shape = op_def.output_shape.add()
output_shape.dims.extend(input_node.shape) output_shape.dims.extend(input_node.shape)
self.copy_quantize_info( quantize_info = self._quantize_activation_info[new_input_name]
op_def, self._quantize_activation_info[new_input_name]) self.copy_quantize_info(op_def, quantize_info)
self._model.input_info[i].scale = quantize_info.scale
self._model.input_info[i].zero_point = quantize_info.zero_point
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC)
...@@ -1477,16 +1480,19 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1477,16 +1480,19 @@ class Transformer(base_converter.ConverterInterface):
find_range_every_time_arg.i = 1 find_range_every_time_arg.i = 1
output_nodes = self._option.check_nodes.values() output_nodes = self._option.check_nodes.values()
for output_node in output_nodes: for i, output_node in enumerate(output_nodes):
op_def = self._model.op.add() op_def = self._model.op.add()
op_def.name = self.normalize_op_name(output_node.name) op_def.name = self.normalize_op_name(output_node.name)
op_def.type = MaceOp.Dequantize.name op_def.type = MaceOp.Dequantize.name
op_def.input.extend([self.output_name_map[output_node.name]]) op_def.input.extend([self.output_name_map[output_node.name]])
op_def.output.extend([output_node.name]) op_def.output.extend([output_node.name])
output_shape = op_def.output_shape.add() output_shape = op_def.output_shape.add()
output_shape.dims.extend( producer_op = self._producer[output_node.name]
self._producer[output_node.name].output_shape[0].dims) output_shape.dims.extend(producer_op.output_shape[0].dims)
op_def.output_type.extend([mace_pb2.DT_FLOAT]) op_def.output_type.extend([mace_pb2.DT_FLOAT])
quantize_info = producer_op.quantize_info[0]
self._model.output_info[i].scale = quantize_info.scale
self._model.output_info[i].zero_point = quantize_info.zero_point
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
...@@ -1533,7 +1539,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1533,7 +1539,8 @@ class Transformer(base_converter.ConverterInterface):
quantized_tensor = \ quantized_tensor = \
quantize_util.quantize_with_scale_and_zero( quantize_util.quantize_with_scale_and_zero(
tensor.float_data, scale, 0) tensor.float_data, scale, 0)
elif self._option.device == DeviceType.HEXAGON.value: elif self._option.device == DeviceType.HEXAGON.value or \
self._option.device == DeviceType.HTA.value:
quantized_tensor = \ quantized_tensor = \
quantize_util.quantize_bias_for_hexagon( quantize_util.quantize_bias_for_hexagon(
tensor.float_data) tensor.float_data)
...@@ -1691,7 +1698,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1691,7 +1698,7 @@ class Transformer(base_converter.ConverterInterface):
return False return False
print("Add default quantize info for input") print("Add default quantize info for input")
for input_node in self._option.input_nodes.values(): for i, input_node in enumerate(self._option.input_nodes.values()):
if input_node.name not in self._quantize_activation_info: if input_node.name not in self._quantize_activation_info:
print("Input range %s: %s" % (input_node.name, print("Input range %s: %s" % (input_node.name,
str(input_node.range))) str(input_node.range)))
......
...@@ -75,7 +75,7 @@ void CreateNetArg(NetDef *net_def) { ...@@ -75,7 +75,7 @@ void CreateNetArg(NetDef *net_def) {
{% if net.input_info | length > 0 %} {% if net.input_info | length > 0 %}
void CreateInputInfo(NetDef *net_def) { void CreateInputInfo(NetDef *net_def) {
net_def->mutable_input_info()->Reserve({{ net.input_info | length }}); net_def->mutable_input_info()->Reserve({{ net.input_info | length }});
InputInfo *input_info = nullptr; InputOutputInfo *input_info = nullptr;
{% for idx in range(net.input_info|length) %} {% for idx in range(net.input_info|length) %}
input_info = net_def->add_input_info(); input_info = net_def->add_input_info();
input_info->set_name({{ net.input_info[idx].name|tojson }}); input_info->set_name({{ net.input_info[idx].name|tojson }});
...@@ -92,7 +92,7 @@ void CreateInputInfo(NetDef *net_def) { ...@@ -92,7 +92,7 @@ void CreateInputInfo(NetDef *net_def) {
{% if net.output_info | length > 0 %} {% if net.output_info | length > 0 %}
void CreateOutputInfo(NetDef *net_def) { void CreateOutputInfo(NetDef *net_def) {
net_def->mutable_output_info()->Reserve({{ net.output_info | length }}); net_def->mutable_output_info()->Reserve({{ net.output_info | length }});
OutputInfo *output_info = nullptr; InputOutputInfo *output_info = nullptr;
{% for idx in range(net.output_info|length) %} {% for idx in range(net.output_info|length) %}
output_info = net_def->add_output_info(); output_info = net_def->add_output_info();
output_info->set_name({{ net.output_info[idx].name|tojson }}); output_info->set_name({{ net.output_info[idx].name|tojson }});
......
...@@ -11,6 +11,7 @@ load( ...@@ -11,6 +11,7 @@ load(
"if_openmp_enabled", "if_openmp_enabled",
"if_android_armv7", "if_android_armv7",
"if_hexagon_enabled", "if_hexagon_enabled",
"if_hta_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_quantize_enabled", "if_quantize_enabled",
) )
...@@ -45,6 +46,8 @@ cc_test( ...@@ -45,6 +46,8 @@ cc_test(
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]), ]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
...@@ -78,6 +81,8 @@ cc_test( ...@@ -78,6 +81,8 @@ cc_test(
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]), ]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
...@@ -111,6 +116,8 @@ cc_test( ...@@ -111,6 +116,8 @@ cc_test(
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]), ]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
...@@ -143,6 +150,8 @@ cc_test( ...@@ -143,6 +150,8 @@ cc_test(
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]), ]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
......
...@@ -29,7 +29,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) { ...@@ -29,7 +29,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
std::shared_ptr<NetDef> net_def(new NetDef()); std::shared_ptr<NetDef> net_def(new NetDef());
for (size_t i = 0; i < input_names.size(); ++i) { for (size_t i = 0; i < input_names.size(); ++i) {
InputInfo *info = net_def->add_input_info(); InputOutputInfo *info = net_def->add_input_info();
info->set_name(input_names[i]); info->set_name(input_names[i]);
} }
......
...@@ -45,7 +45,7 @@ void MaceRunFunc(const int in_out_size) { ...@@ -45,7 +45,7 @@ void MaceRunFunc(const int in_out_size) {
filter_tensor_name, filter_shape, 0, data.size(), net_def.get()); filter_tensor_name, filter_shape, 0, data.size(), net_def.get());
for (size_t i = 0; i < input_names.size(); ++i) { for (size_t i = 0; i < input_names.size(); ++i) {
InputInfo *info = net_def->add_input_info(); InputOutputInfo *info = net_def->add_input_info();
info->set_data_format(DataFormat::NHWC); info->set_data_format(DataFormat::NHWC);
info->set_name(input_names[i]); info->set_name(input_names[i]);
for (auto d : input_shapes[0]) { for (auto d : input_shapes[0]) {
...@@ -53,7 +53,7 @@ void MaceRunFunc(const int in_out_size) { ...@@ -53,7 +53,7 @@ void MaceRunFunc(const int in_out_size) {
} }
} }
for (size_t i = 0; i < output_names.size(); ++i) { for (size_t i = 0; i < output_names.size(); ++i) {
OutputInfo *info = net_def->add_output_info(); InputOutputInfo *info = net_def->add_output_info();
info->set_name(output_names[i]); info->set_name(output_names[i]);
} }
for (size_t i = 0; i < output_names.size(); ++i) { for (size_t i = 0; i < output_names.size(); ++i) {
......
...@@ -44,7 +44,7 @@ void MaceRun(const int in_out_size, ...@@ -44,7 +44,7 @@ void MaceRun(const int in_out_size,
AddTensor<T>(filter_tensor_name, filter_shape, 0, data.size(), net_def.get()); AddTensor<T>(filter_tensor_name, filter_shape, 0, data.size(), net_def.get());
for (size_t i = 0; i < input_names.size(); ++i) { for (size_t i = 0; i < input_names.size(); ++i) {
InputInfo *info = net_def->add_input_info(); InputOutputInfo *info = net_def->add_input_info();
info->set_data_format(DataFormat::NHWC); info->set_data_format(DataFormat::NHWC);
info->set_name(input_names[i]); info->set_name(input_names[i]);
for (auto d : max_shape) { for (auto d : max_shape) {
...@@ -52,7 +52,7 @@ void MaceRun(const int in_out_size, ...@@ -52,7 +52,7 @@ void MaceRun(const int in_out_size,
} }
} }
for (size_t i = 0; i < output_names.size(); ++i) { for (size_t i = 0; i < output_names.size(); ++i) {
OutputInfo *info = net_def->add_output_info(); InputOutputInfo *info = net_def->add_output_info();
info->set_name(output_names[i]); info->set_name(output_names[i]);
} }
for (size_t i = 0; i < output_names.size(); ++i) { for (size_t i = 0; i < output_names.size(); ++i) {
......
...@@ -76,6 +76,8 @@ DeviceType ParseDeviceType(const std::string &device_str) { ...@@ -76,6 +76,8 @@ DeviceType ParseDeviceType(const std::string &device_str) {
return DeviceType::GPU; return DeviceType::GPU;
} else if (device_str.compare("HEXAGON") == 0) { } else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON; return DeviceType::HEXAGON;
} else if (device_str.compare("HTA") == 0) {
return DeviceType::HTA;
} else { } else {
return DeviceType::CPU; return DeviceType::CPU;
} }
......
# These files are generated from the nnlib project
licenses(["notice"])
exports_files(["license.txt"])
load(
"//mace:mace.bzl",
"if_android_armv7",
"if_android_arm64",
)
cc_library(
name = "hta",
srcs = if_android_armv7([
"armeabi-v7a/libhta_controller.so",
"armeabi-v7a/libhta_hexagon_runtime.so",
"armeabi-v7a/libnpu.so",
]) + if_android_arm64([
"arm64-v8a/libcdsprpc.so",
"arm64-v8a/libhta_controller.so",
"arm64-v8a/libhta_hexagon_runtime.so",
"arm64-v8a/libnpu.so",
]),
hdrs = [
"hta_hexagon_api.h",
"hta_hexagon_nn_ops.h",
"hta_ops.h",
],
visibility = ["//visibility:public"],
)
/*
* Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted (subject to the limitations in the
* disclaimer below) provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* * Neither the name of The Linux Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
* GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
* HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
* IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef THIRD_PARTY_HTA_HEXAGON_API_H_
#define THIRD_PARTY_HTA_HEXAGON_API_H_
#include "hta_hexagon_nn_ops.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef int hexagon_hta_nn_nn_id;
struct input {
uint32_t src_id;
uint32_t output_idx;
};
#define NODE_ID_RESERVED_CONSTANT 0
#define MAX_DIMENSIONS 8
struct output {
uint32_t rank; // dimensions in the tensor
uint32_t max_sizes[MAX_DIMENSIONS]; // max num elements in each dimension
uint32_t elementsize; // size of each element
int32_t zero_offset; // 0 for float / integer values
float stepsize; // 0 for float/integer values
};
struct perfinfo {
uint32_t node_id;
uint32_t executions;
union {
uint64_t counter;
struct {
uint32_t counter_lo;
uint32_t counter_hi;
};
};
};
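/* Note (added for clarity): the anonymous union gives two views of the
   same execution counter — read it as the 64-bit `counter`, or as the
   {counter_lo, counter_hi} pair, which on a little-endian target
   recombine as ((uint64_t)counter_hi << 32) | counter_lo. */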
typedef struct input hexagon_hta_nn_input;
typedef struct output hexagon_hta_nn_output;
typedef struct perfinfo hexagon_hta_nn_perfinfo;
typedef int32_t hexagon_hta_nn_padding_type;
typedef enum padding_type_enum {
HTA_NN_PAD_NA = 0,
HTA_NN_PAD_SAME,
HTA_NN_PAD_VALID,
HTA_NN_PAD_MIRROR_REFLECT,
HTA_NN_PAD_MIRROR_SYMMETRIC,
HTA_NN_PAD_SAME_CAFFE,
} hta_padding_type;
typedef struct {
unsigned int batches;
unsigned int height;
unsigned int width;
unsigned int depth;
unsigned char *data;
int dataLen; /* For input and output */
unsigned int data_valid_len; /* for output only */
unsigned int unused;
} hexagon_hta_nn_tensordef;
typedef struct hexagon_nn_op_node hexagon_nn_op_node;
struct hexagon_nn_op_node {
unsigned int node_id;
hta_op_type operation;
hta_padding_type padding;
hexagon_hta_nn_input* inputs;
int inputsLen;
hexagon_hta_nn_output* outputs;
int outputsLen;
};
typedef struct hexagon_nn_const_node hexagon_nn_const_node;
struct hexagon_nn_const_node {
unsigned int node_id;
hexagon_hta_nn_tensordef tensor;
};
/* Actual functions in the interface */
/* Returns 0 on success, nonzero on error unless otherwise noted */
/* Configure the hardware and software environment. Should be called once before doing anything */
int hexagon_hta_nn_config( void );
/* Initialize a new graph, returns a new nn_id or -1 on error */
int hexagon_hta_nn_init(hexagon_hta_nn_nn_id *g);
/* Set debug verbosity. Default is 0, higher values are more verbose */
int hexagon_hta_nn_set_debug_level(hexagon_hta_nn_nn_id id, int level);
/* Append a node to the graph. Nodes are executed in the appended order. */
int hexagon_hta_nn_append_node(
hexagon_hta_nn_nn_id id,
uint32_t node_id,
hta_op_type operation,
hta_padding_type padding,
const struct input *inputs,
uint32_t num_inputs,
const struct output *outputs,
uint32_t num_outputs);
/*
* Append a const node into the graph. The data is copied locally during this
* call, the caller does not need it to persist.
*/
int hexagon_hta_nn_append_const_node(
hexagon_hta_nn_nn_id id,
uint32_t node_id,
uint32_t batches,
uint32_t height,
uint32_t width,
uint32_t depth,
const uint8_t *data,
uint32_t data_len);
/*
* Prepare a graph for execution. Must be done before attempting to execute the graph.
*/
int hexagon_hta_nn_prepare(hexagon_hta_nn_nn_id id);
/* Execute the graph with a single input and a single output. */
int hexagon_hta_nn_execute(
hexagon_hta_nn_nn_id id,
uint32_t batches_in,
uint32_t height_in,
uint32_t width_in,
uint32_t depth_in,
const uint8_t *data_in,
uint32_t data_len_in,
uint32_t *batches_out,
uint32_t *height_out,
uint32_t *width_out,
uint32_t *depth_out,
uint8_t *data_out,
uint32_t data_out_max,
uint32_t *data_out_size);
/* Tear down a graph, destroying it and freeing resources. */
int hexagon_hta_nn_teardown(hexagon_hta_nn_nn_id id);
/* Get the version of the library */
int hexagon_hta_nn_version(int *ver);
/* Execute the graph with multiple inputs and multiple outputs. */
int hexagon_hta_nn_execute_new(
hexagon_hta_nn_nn_id id,
const hexagon_hta_nn_tensordef *inputs,
uint32_t n_inputs,
hexagon_hta_nn_tensordef *outputs,
uint32_t n_outputs);
int hexagon_hta_nn_serialize_size(hexagon_hta_nn_nn_id id, unsigned int *serialized_obj_size_out);
int hexagon_hta_nn_serialize(hexagon_hta_nn_nn_id id, void *buf, unsigned int buf_len);
int hexagon_hta_nn_deserialize(void *buf, unsigned len, hexagon_hta_nn_nn_id *g);
#ifdef __cplusplus
}
#endif
#endif  // THIRD_PARTY_HTA_HEXAGON_API_H_
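For orientation, a minimal usage sketch of the lifecycle this header implies: configure once, create a graph, append const and op nodes, prepare, execute, tear down. The node IDs, tensor shape, and the use of HTA_OP_Nop as a pass-through op are illustrative assumptions, not taken from this commit, and error handling is reduced to early returns.

#include <stdint.h>
#include "hta_hexagon_api.h"

int run_minimal_graph(void) {
    hexagon_hta_nn_nn_id graph_id;

    /* One-time environment setup, then create an empty graph. */
    if (hexagon_hta_nn_config() != 0) return -1;
    if (hexagon_hta_nn_init(&graph_id) != 0) return -1;

    /* Append a constant tensor; per the header comment, the data is
       copied locally, so `weights` need not outlive this call. */
    static const uint8_t weights[16] = {0};
    hexagon_hta_nn_append_const_node(graph_id, /*node_id=*/10,
                                     /*batches=*/1, /*height=*/1,
                                     /*width=*/4, /*depth=*/4,
                                     weights, sizeof(weights));

    /* Append an op node whose single input is the const node above. */
    struct input in = { .src_id = 10, .output_idx = 0 };
    struct output out = {
        .rank = 4,
        .max_sizes = {1, 1, 4, 4},
        .elementsize = sizeof(uint8_t),
        .zero_offset = 0,
        .stepsize = 0.0f,
    };
    hexagon_hta_nn_append_node(graph_id, /*node_id=*/20, HTA_OP_Nop,
                               HTA_NN_PAD_NA, &in, 1, &out, 1);

    /* Prepare must precede execution. */
    if (hexagon_hta_nn_prepare(graph_id) != 0) return -1;

    /* Run via the multi-tensor entry point. */
    uint8_t in_data[16] = {0}, out_data[16] = {0};
    hexagon_hta_nn_tensordef input_td = {
        .batches = 1, .height = 1, .width = 4, .depth = 4,
        .data = in_data, .dataLen = sizeof(in_data),
    };
    hexagon_hta_nn_tensordef output_td = {
        .batches = 1, .height = 1, .width = 4, .depth = 4,
        .data = out_data, .dataLen = sizeof(out_data),
    };
    if (hexagon_hta_nn_execute_new(graph_id, &input_td, 1,
                                   &output_td, 1) != 0) return -1;

    /* Destroy the graph and free its resources. */
    return hexagon_hta_nn_teardown(graph_id);
}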
/*
* Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted (subject to the limitations in the
* disclaimer below) provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* * Neither the name of The Linux Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
* GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
* HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
* IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef THIRD_PARTY_HTA_HEXAGON_NN_OPS_H_
#define THIRD_PARTY_HTA_HEXAGON_NN_OPS_H_
typedef enum hta_op_type_enum {
#define HTA_DEF_OP(NAME, ...) HTA_OP_##NAME,
#include "hta_ops.h"
HTA_NN_OPS_MAX
#undef HTA_DEF_OP
} hta_op_type;
#endif // THIRD_PARTY_HTA_HEXAGON_NN_OPS_H_
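The enum above is generated with the X-macro pattern: hta_ops.h lists every operation once as HTA_DEF_OP(NAME), and each includer supplies its own definition of HTA_DEF_OP before including the list. A sketch of a second expansion under the same convention — the hta_op_names table and hta_op_name() helper are hypothetical, not part of this commit:

#include "hta_hexagon_nn_ops.h"

/* Expand the same op list into strings; both expansions walk
   hta_ops.h in the same order, so hta_op_names[i] names enum value i. */
static const char *const hta_op_names[] = {
#define HTA_DEF_OP(NAME, ...) #NAME,
#include "hta_ops.h"
#undef HTA_DEF_OP
};

const char *hta_op_name(hta_op_type op) {
    return ((unsigned)op < (unsigned)HTA_NN_OPS_MAX)
               ? hta_op_names[op] : "unknown";
}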
/*
* Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted (subject to the limitations in the
* disclaimer below) provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* * Neither the name of The Linux Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
* GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
* HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
* IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
/*
* You probably want to
*
* ## ##### #####
* # # # # # #
* # # # # # #
* ###### # # # #
* # # # # # #
* # # ##### #####
*
*
* # # #### ##### ###### ####
* ## # # # # # # #
* # # # # # # # ##### ####
* # # # # # # # # #
* # ## # # # # # # #
* # # #### ##### ###### ####
*
*
* ## #####
* # # #
* # # #
* ###### #
* # # #
* # # #
*
*
* ##### # # ######
* # # # #
* # ###### #####
* # # # #
* # # # #
* # # # ######
*
*
* ###### # # #####
* # ## # # #
* ##### # # # # #
* # # # # # #
* # # ## # #
* ###### # # #####
*
* otherwise the interface becomes incompatible.
*/
HTA_DEF_OP(INPUT)
HTA_DEF_OP(OUTPUT)
HTA_DEF_OP(Nop)
HTA_DEF_OP(Const)
HTA_DEF_OP(Check)
HTA_DEF_OP(Close_f)
HTA_DEF_OP(Close_quint8)
HTA_DEF_OP(Close_q_quint8)
HTA_DEF_OP(Close_int32)
HTA_DEF_OP(Close_qint32)
HTA_DEF_OP(PPrint_8)
HTA_DEF_OP(PPrint_32)
HTA_DEF_OP(PPrint_f)
HTA_DEF_OP(PreFree)
HTA_DEF_OP(Flatten)
#ifndef HTA_DEF_OP_WREF
#define HTA_DEF_OP_WREF(NAME) HTA_DEF_OP(NAME) HTA_DEF_OP(NAME##_ref)
#define __SELF_HTA_DEF_OP_WREF
#endif
HTA_DEF_OP_WREF(QuantizedConv2d_8x8to32)
HTA_DEF_OP_WREF(QuantizedMatMul_8x8to32)
HTA_DEF_OP_WREF(QuantizeDownAndShrinkRange_32to8)
HTA_DEF_OP_WREF(QuantizedRelu_8)
HTA_DEF_OP_WREF(QuantizedReluX_8)
HTA_DEF_OP_WREF(QuantizedMaxPool_8)
HTA_DEF_OP_WREF(QuantizedAvgPool_8)
HTA_DEF_OP_WREF(QuantizedL2Pool_8)
HTA_DEF_OP_WREF(QuantizedConcat_8)
HTA_DEF_OP_WREF(QuantizedBiasAdd_8p8to32)
HTA_DEF_OP_WREF(Min_f)
HTA_DEF_OP_WREF(Max_f)
HTA_DEF_OP_WREF(Quantize)
HTA_DEF_OP_WREF(Dequantize)
HTA_DEF_OP_WREF(Supernode_8x8p8to8)
HTA_DEF_OP(QuantizedFlatten)
HTA_DEF_OP(Softmax_f)
HTA_DEF_OP(Conv2d_f)
HTA_DEF_OP(MatMul_f)
HTA_DEF_OP(Relu_f)
HTA_DEF_OP(ReluX_f)
HTA_DEF_OP(AvgPool_f)
HTA_DEF_OP(L2Pool_f)
HTA_DEF_OP(MaxPool_f)
HTA_DEF_OP(Concat_f)
HTA_DEF_OP(BiasAdd_f)
HTA_DEF_OP(LRN_f)
HTA_DEF_OP(Variable)
HTA_DEF_OP(Assign)
HTA_DEF_OP(Reshape)
HTA_DEF_OP(QuantizedReshape)
HTA_DEF_OP(Tanh_f)
HTA_DEF_OP(Sigmoid_f)
HTA_DEF_OP(Slice_8)
HTA_DEF_OP(Slice_f)
HTA_DEF_OP(QuantizedSlice_8)
HTA_DEF_OP(Add_f)
HTA_DEF_OP(Mul_f)
HTA_DEF_OP(Minimum_f)
HTA_DEF_OP(Maximum_f)
HTA_DEF_OP_WREF(Requantize_32to8)
HTA_DEF_OP_WREF(RequantizationRange_32)
HTA_DEF_OP(Neg_f)
HTA_DEF_OP(Sub_f)
HTA_DEF_OP(AddN_f)
HTA_DEF_OP(Range_int32)
HTA_DEF_OP(Rank_int32)
HTA_DEF_OP(Transpose_int32)
HTA_DEF_OP(Transpose_f)
HTA_DEF_OP(InstanceNorm_f)
HTA_DEF_OP_WREF(QuantizedInstanceNorm_8)
HTA_DEF_OP(Sub_int32)
HTA_DEF_OP(Add_int32)
HTA_DEF_OP(Split_f)
HTA_DEF_OP(Dequantize_qint32_f)
HTA_DEF_OP(PRelu_f)
HTA_DEF_OP_WREF(QuantizedPRelu_8)
HTA_DEF_OP(Sum_f)
HTA_DEF_OP(Prod_f)
HTA_DEF_OP(Mul_int32)
HTA_DEF_OP(LogicalAnd_int32)
HTA_DEF_OP(LogicalOr_int32)
HTA_DEF_OP(LogicalXor_int32)
HTA_DEF_OP(Shape_int32)
HTA_DEF_OP(Pack_int32)
HTA_DEF_OP(MirrorPad_f)
HTA_DEF_OP(ResizeNearestNeighbor_f)
HTA_DEF_OP(StridedSlice_int32)
HTA_DEF_OP(StridedSlice_f)
HTA_DEF_OP(ExpandDims_int32)
HTA_DEF_OP(ExpandDims_f)
HTA_DEF_OP(LogSoftmax_f)
HTA_DEF_OP(Split_int32)
HTA_DEF_OP(QuantizedSplit_8)
HTA_DEF_OP(Deconv_f)
HTA_DEF_OP_WREF(QuantizedDeconv_8x8to32)
HTA_DEF_OP_WREF(QuantizedMul_8x8to32)
HTA_DEF_OP_WREF(QuantizedAdd_8p8to32)
HTA_DEF_OP_WREF(QuantizedSigmoid_8)
HTA_DEF_OP_WREF(QuantizedTanh_8)
HTA_DEF_OP_WREF(QuantizedSoftmax_8)
HTA_DEF_OP_WREF(QuantizedLRN_8)
HTA_DEF_OP_WREF(Quantizedpad2d_frame_8p)
HTA_DEF_OP_WREF(QuantizedSub_8p8to32)
HTA_DEF_OP_WREF(QuantizedMaximum_8)
HTA_DEF_OP_WREF(QuantizedMinimum_8)
HTA_DEF_OP(Pad_f)
HTA_DEF_OP(SpaceToBatchND_f)
HTA_DEF_OP(BatchToSpaceND_f)
HTA_DEF_OP(QuantizedPad_8)
HTA_DEF_OP(ResizeBilinear_f)
HTA_DEF_OP(ConcatV2_f)
HTA_DEF_OP(ConcatV2_int32)
HTA_DEF_OP(Prod_int32)
HTA_DEF_OP(Slice_int32)
HTA_DEF_OP(QuantizedAdd_8p8to8)
HTA_DEF_OP(QuantizedResizeBilinear_8)
HTA_DEF_OP(Supernode_8x8p8to8_d32)
HTA_DEF_OP(Convert_to_d32)
HTA_DEF_OP(Convert_from_d32)
HTA_DEF_OP_WREF(QuantizedMaxPool_8_d32)
HTA_DEF_OP_WREF(QuantizedConcat_8_d32)
HTA_DEF_OP_WREF(QuantizedAvgPool_8_d32)
HTA_DEF_OP(Sink)
HTA_DEF_OP_WREF(QuantizedPRelu_8_d32)
HTA_DEF_OP_WREF(AutoQuantize)
HTA_DEF_OP_WREF(QuantizedDepthwiseConv2d_8x8to32)
HTA_DEF_OP_WREF(DepthwiseConv2d_f)
HTA_DEF_OP(DepthwiseSupernode_8x8p8to8)
HTA_DEF_OP(DepthwiseSupernode_8x8p8to8_d32)
HTA_DEF_OP_WREF(QuantizedMul_8x8to8_d32)
HTA_DEF_OP(FullyConnected_u8)
#if 0
HTA_DEF_OP_WREF(QuantizedFC_8x8p8to8)
#endif
HTA_DEF_OP_WREF(QuantizedAdd_8p8to8_d32)
HTA_DEF_OP_WREF(QuantizedClamp_8)
HTA_DEF_OP(Clamp_f)
HTA_DEF_OP(QuantizeForTest_d32)
HTA_DEF_OP(Close_d32)
HTA_DEF_OP_WREF(QuantizedSub_8p8to8_d32)
HTA_DEF_OP(InputSupernode_8x8p8to8_outd32)
HTA_DEF_OP(QuantizedLRN_8_d32)
HTA_DEF_OP_WREF(QuantizedBiasAdd_32p32to32)
HTA_DEF_OP_WREF(Quantize_int32)
HTA_DEF_OP(Supernode_8x8p32to8)
HTA_DEF_OP(DepthwiseSupernode_8x8p32to8)
HTA_DEF_OP(Supernode_8x8p32to8_d32)
HTA_DEF_OP(DepthwiseSupernode_8x8p32to8_d32)
HTA_DEF_OP(InputSupernode_8x8p32to8_outd32)
HTA_DEF_OP(PPrint_8_d32)
HTA_DEF_OP(PPrintWithPadding_8_d32)
HTA_DEF_OP_WREF(AutoQuantize_d32)
HTA_DEF_OP_WREF(QuantizedTanh_8_d32)
HTA_DEF_OP_WREF(QuantizedSigmoid_8_d32)
HTA_DEF_OP_WREF(QuantizedSoftmax_8_d32)
HTA_DEF_OP_WREF(QuantizedL2Pool_8_d32)
HTA_DEF_OP(Gather_f)
HTA_DEF_OP(Gather_int32)
HTA_DEF_OP(Gather_8)
HTA_DEF_OP(Table_f)
HTA_DEF_OP(Table_int32)
HTA_DEF_OP(Table_8)
HTA_DEF_OP(FillPadding_8_d32)
HTA_DEF_OP(QuantizedResizeBilinear_8_d32)
HTA_DEF_OP(QuantizeINPUT_f_to_8)
HTA_DEF_OP_WREF(DeconvBias_8x8to32)
HTA_DEF_OP(SpaceToBatchND_8)
HTA_DEF_OP(BatchToSpaceND_8)
HTA_DEF_OP(SpaceToDepth_f)
HTA_DEF_OP(DepthToSpace_f)
HTA_DEF_OP(SpaceToDepth_8)
HTA_DEF_OP(DepthToSpace_8)
HTA_DEF_OP(DequantizeOUTPUT_8tof)
HTA_DEF_OP(QuantizedBatchNorm_8x8p8to8)
HTA_DEF_OP(QuantizedBatchNorm_8x8p32to8)
HTA_DEF_OP(QuantizedBatchNorm_8x8p8to8_d32)
HTA_DEF_OP(QuantizedBatchNorm_8x8p32to8_d32)
HTA_DEF_OP_WREF(QuantizedInstanceNorm_8_d32)
HTA_DEF_OP_WREF(QuantizedInstanceNormBG_8)
HTA_DEF_OP_WREF(QuantizedInstanceNormBG_8_d32)
HTA_DEF_OP(SuperFC_8x8p32to8)
HTA_DEF_OP(SuperFC_8x8p32to8_ref)
HTA_DEF_OP(SuperFC_8x8p32to8_d32)
HTA_DEF_OP(ChannelShuffle_f)
HTA_DEF_OP(ChannelShuffle_int32)
HTA_DEF_OP_WREF(QuantizedChannelShuffle_8)
HTA_DEF_OP(QuantizedChannelShuffle_8_d32)
/* this is in op_chanshuffle_d32.c */
HTA_DEF_OP(QuantizedSplit_8_d32)
HTA_DEF_OP(QuantizedCrop_8)
HTA_DEF_OP(ResizeUnitSquare_f)
HTA_DEF_OP_WREF(ResizeUnitSquare_8)
HTA_DEF_OP_WREF(Nv21ToRgb_8)
HTA_DEF_OP_WREF(RgbaToRgb_8)
HTA_DEF_OP_WREF(Argb32ToRgb_8)
HTA_DEF_OP(Permute_f)
HTA_DEF_OP(QuantizedPermute_8)
HTA_DEF_OP_WREF(QuantizedRoiPool_8)
HTA_DEF_OP(Proposal_f)
HTA_DEF_OP(RoiAlign_f)
HTA_DEF_OP_WREF(QuantizedRoiAlign_8)
HTA_DEF_OP_WREF(Implode_8)
HTA_DEF_OP(QuantizedConcat_8_nond32)
HTA_DEF_OP(Close_16tof)
HTA_DEF_OP(QuantizedLstmInput_16x16to16)
HTA_DEF_OP(QuantizedLstmOutput_16x16to8)
HTA_DEF_OP(Quantize_16)
HTA_DEF_OP(Dequantize_16)
HTA_DEF_OP(Convert_8_16)
HTA_DEF_OP(QuantizedTanh_16)
HTA_DEF_OP(QuantizedSigmoid_16)
HTA_DEF_OP_WREF(QuantizeDownAndShrinkRange_32to16)
HTA_DEF_OP_WREF(Requantize_32to16)
HTA_DEF_OP_WREF(QuantizedMatMul_8x8p32to16)
HTA_DEF_OP(QuantizedStridedSlice_8)
HTA_DEF_OP(Bbox_Transform_f)
HTA_DEF_OP(Softmax_uint8)
HTA_DEF_OP(QuantizedFakeConcat_8_d32)
HTA_DEF_OP(DepthToSpace_8_d32)
HTA_DEF_OP(OemNode)
HTA_DEF_OP(QuantizedPad_8_d32)
// Add new operations above this line
#ifdef __SELF_HTA_DEF_OP_WREF
#undef __SELF_HTA_DEF_OP_WREF
#undef HTA_DEF_OP_WREF
#endif
/*
* Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted (subject to the limitations in the
* disclaimer below) provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* * Neither the name of The Linux Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
* GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
* HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
* IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
...@@ -129,6 +129,7 @@ class DeviceType(object): ...@@ -129,6 +129,7 @@ class DeviceType(object):
CPU = 'CPU' CPU = 'CPU'
GPU = 'GPU' GPU = 'GPU'
HEXAGON = 'HEXAGON' HEXAGON = 'HEXAGON'
HTA = 'HTA'
class DataFormat(object): class DataFormat(object):
...@@ -199,6 +200,8 @@ def parse_device_type(runtime): ...@@ -199,6 +200,8 @@ def parse_device_type(runtime):
if runtime == RuntimeType.dsp: if runtime == RuntimeType.dsp:
device_type = DeviceType.HEXAGON device_type = DeviceType.HEXAGON
elif runtime == RuntimeType.hta:
device_type = DeviceType.HTA
elif runtime == RuntimeType.gpu: elif runtime == RuntimeType.gpu:
device_type = DeviceType.GPU device_type = DeviceType.GPU
elif runtime == RuntimeType.cpu: elif runtime == RuntimeType.cpu:
...@@ -513,6 +516,7 @@ class RuntimeType(object): ...@@ -513,6 +516,7 @@ class RuntimeType(object):
cpu = 'cpu' cpu = 'cpu'
gpu = 'gpu' gpu = 'gpu'
dsp = 'dsp' dsp = 'dsp'
hta = 'hta'
cpu_gpu = 'cpu+gpu' cpu_gpu = 'cpu+gpu'
......
...@@ -61,6 +61,7 @@ RuntimeTypeStrs = [ ...@@ -61,6 +61,7 @@ RuntimeTypeStrs = [
"cpu", "cpu",
"gpu", "gpu",
"dsp", "dsp",
"hta",
"cpu+gpu" "cpu+gpu"
] ]
...@@ -142,6 +143,8 @@ def parse_device_type(runtime): ...@@ -142,6 +143,8 @@ def parse_device_type(runtime):
if runtime == RuntimeType.dsp: if runtime == RuntimeType.dsp:
device_type = DeviceType.HEXAGON device_type = DeviceType.HEXAGON
elif runtime == RuntimeType.hta:
device_type = DeviceType.HTA
elif runtime == RuntimeType.gpu: elif runtime == RuntimeType.gpu:
device_type = DeviceType.GPU device_type = DeviceType.GPU
elif runtime == RuntimeType.cpu: elif runtime == RuntimeType.cpu:
...@@ -163,6 +166,19 @@ def get_hexagon_mode(configs): ...@@ -163,6 +166,19 @@ def get_hexagon_mode(configs):
return False return False
def get_hta_mode(configs):
runtime_list = []
for model_name in configs[YAMLKeyword.models]:
model_runtime = \
configs[YAMLKeyword.models][model_name].get(
YAMLKeyword.runtime, "")
runtime_list.append(model_runtime.lower())
if RuntimeType.hta in runtime_list:
return True
return False
def get_opencl_mode(configs): def get_opencl_mode(configs):
runtime_list = [] runtime_list = []
for model_name in configs[YAMLKeyword.models]: for model_name in configs[YAMLKeyword.models]:
...@@ -452,6 +468,8 @@ def format_model_config(flags): ...@@ -452,6 +468,8 @@ def format_model_config(flags):
DeviceType.GPU: ValidationThreshold.gpu_threshold, DeviceType.GPU: ValidationThreshold.gpu_threshold,
DeviceType.HEXAGON + "_QUANTIZE": DeviceType.HEXAGON + "_QUANTIZE":
ValidationThreshold.hexagon_threshold, ValidationThreshold.hexagon_threshold,
DeviceType.HTA + "_QUANTIZE":
ValidationThreshold.hexagon_threshold,
DeviceType.CPU + "_QUANTIZE": DeviceType.CPU + "_QUANTIZE":
ValidationThreshold.cpu_quantize_threshold, ValidationThreshold.cpu_quantize_threshold,
} }
...@@ -461,6 +479,7 @@ def format_model_config(flags): ...@@ -461,6 +479,7 @@ def format_model_config(flags):
if k.upper() not in (DeviceType.CPU, if k.upper() not in (DeviceType.CPU,
DeviceType.GPU, DeviceType.GPU,
DeviceType.HEXAGON, DeviceType.HEXAGON,
DeviceType.HTA,
DeviceType.CPU + "_QUANTIZE"): DeviceType.CPU + "_QUANTIZE"):
raise argparse.ArgumentTypeError( raise argparse.ArgumentTypeError(
'Unsupported validation threshold runtime: %s' % k) 'Unsupported validation threshold runtime: %s' % k)
...@@ -740,7 +759,6 @@ def build_model_lib(configs, address_sanitizer): ...@@ -740,7 +759,6 @@ def build_model_lib(configs, address_sanitizer):
# create model library dir # create model library dir
library_name = configs[YAMLKeyword.library_name] library_name = configs[YAMLKeyword.library_name]
for target_abi in configs[YAMLKeyword.target_abis]: for target_abi in configs[YAMLKeyword.target_abis]:
hexagon_mode = get_hexagon_mode(configs)
model_lib_output_path = get_model_lib_output_path(library_name, model_lib_output_path = get_model_lib_output_path(library_name,
target_abi) target_abi)
library_out_dir = os.path.dirname(model_lib_output_path) library_out_dir = os.path.dirname(model_lib_output_path)
...@@ -751,7 +769,8 @@ def build_model_lib(configs, address_sanitizer): ...@@ -751,7 +769,8 @@ def build_model_lib(configs, address_sanitizer):
MODEL_LIB_TARGET, MODEL_LIB_TARGET,
abi=target_abi, abi=target_abi,
toolchain=toolchain, toolchain=toolchain,
hexagon_mode=hexagon_mode, enable_hexagon=get_hexagon_mode(configs),
enable_hta=get_hta_mode(configs),
enable_opencl=get_opencl_mode(configs), enable_opencl=get_opencl_mode(configs),
enable_quantize=get_quantize_mode(configs), enable_quantize=get_quantize_mode(configs),
address_sanitizer=address_sanitizer, address_sanitizer=address_sanitizer,
...@@ -842,7 +861,6 @@ def report_run_statistics(stdout, ...@@ -842,7 +861,6 @@ def report_run_statistics(stdout,
def build_mace_run(configs, target_abi, toolchain, enable_openmp, def build_mace_run(configs, target_abi, toolchain, enable_openmp,
address_sanitizer, mace_lib_type): address_sanitizer, mace_lib_type):
library_name = configs[YAMLKeyword.library_name] library_name = configs[YAMLKeyword.library_name]
hexagon_mode = get_hexagon_mode(configs)
build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi) build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
if os.path.exists(build_tmp_binary_dir): if os.path.exists(build_tmp_binary_dir):
...@@ -865,7 +883,8 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp, ...@@ -865,7 +883,8 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
mace_run_target, mace_run_target,
abi=target_abi, abi=target_abi,
toolchain=toolchain, toolchain=toolchain,
hexagon_mode=hexagon_mode, enable_hexagon=get_hexagon_mode(configs),
enable_hta=get_hta_mode(configs),
enable_openmp=enable_openmp, enable_openmp=enable_openmp,
enable_opencl=get_opencl_mode(configs), enable_opencl=get_opencl_mode(configs),
enable_quantize=get_quantize_mode(configs), enable_quantize=get_quantize_mode(configs),
...@@ -880,7 +899,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp, ...@@ -880,7 +899,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
def build_example(configs, target_abi, toolchain, def build_example(configs, target_abi, toolchain,
enable_openmp, mace_lib_type, cl_binary_to_code, device): enable_openmp, mace_lib_type, cl_binary_to_code, device):
library_name = configs[YAMLKeyword.library_name] library_name = configs[YAMLKeyword.library_name]
hexagon_mode = get_hexagon_mode(configs)
build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi) build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
if os.path.exists(build_tmp_binary_dir): if os.path.exists(build_tmp_binary_dir):
...@@ -914,7 +932,8 @@ def build_example(configs, target_abi, toolchain, ...@@ -914,7 +932,8 @@ def build_example(configs, target_abi, toolchain,
enable_openmp=enable_openmp, enable_openmp=enable_openmp,
enable_opencl=get_opencl_mode(configs), enable_opencl=get_opencl_mode(configs),
enable_quantize=get_quantize_mode(configs), enable_quantize=get_quantize_mode(configs),
hexagon_mode=hexagon_mode, enable_hexagon=get_hexagon_mode(configs),
enable_hta=get_hta_mode(configs),
address_sanitizer=flags.address_sanitizer, address_sanitizer=flags.address_sanitizer,
symbol_hidden=symbol_hidden) symbol_hidden=symbol_hidden)
...@@ -945,7 +964,8 @@ def build_example(configs, target_abi, toolchain, ...@@ -945,7 +964,8 @@ def build_example(configs, target_abi, toolchain,
enable_openmp=enable_openmp, enable_openmp=enable_openmp,
enable_opencl=get_opencl_mode(configs), enable_opencl=get_opencl_mode(configs),
enable_quantize=get_quantize_mode(configs), enable_quantize=get_quantize_mode(configs),
hexagon_mode=hexagon_mode, enable_hexagon=get_hexagon_mode(configs),
enable_hta=get_hta_mode(configs),
address_sanitizer=flags.address_sanitizer, address_sanitizer=flags.address_sanitizer,
extra_args=build_arg) extra_args=build_arg)
...@@ -1028,7 +1048,6 @@ def build_benchmark_model(configs, ...@@ -1028,7 +1048,6 @@ def build_benchmark_model(configs,
enable_openmp, enable_openmp,
mace_lib_type): mace_lib_type):
library_name = configs[YAMLKeyword.library_name] library_name = configs[YAMLKeyword.library_name]
hexagon_mode = get_hexagon_mode(configs)
link_dynamic = mace_lib_type == MACELibType.dynamic link_dynamic = mace_lib_type == MACELibType.dynamic
if link_dynamic: if link_dynamic:
...@@ -1051,7 +1070,8 @@ def build_benchmark_model(configs, ...@@ -1051,7 +1070,8 @@ def build_benchmark_model(configs,
enable_openmp=enable_openmp, enable_openmp=enable_openmp,
enable_opencl=get_opencl_mode(configs), enable_opencl=get_opencl_mode(configs),
enable_quantize=get_quantize_mode(configs), enable_quantize=get_quantize_mode(configs),
hexagon_mode=hexagon_mode, enable_hexagon=get_hexagon_mode(configs),
enable_hta=get_hta_mode(configs),
symbol_hidden=symbol_hidden, symbol_hidden=symbol_hidden,
extra_args=build_arg) extra_args=build_arg)
# clear tmp binary dir # clear tmp binary dir
......
...@@ -263,7 +263,8 @@ def find_simpleperf_library(abi, simpleperf_path=''): ...@@ -263,7 +263,8 @@ def find_simpleperf_library(abi, simpleperf_path=''):
def bazel_build(target, def bazel_build(target,
abi="armeabi-v7a", abi="armeabi-v7a",
toolchain='android', toolchain='android',
hexagon_mode=False, enable_hexagon=False,
enable_hta=False,
enable_openmp=True, enable_openmp=True,
enable_neon=True, enable_neon=True,
enable_opencl=True, enable_opencl=True,
...@@ -299,7 +300,9 @@ def bazel_build(target, ...@@ -299,7 +300,9 @@ def bazel_build(target,
"--define", "--define",
"quantize=%s" % str(enable_quantize).lower(), "quantize=%s" % str(enable_quantize).lower(),
"--define", "--define",
"hexagon=%s" % str(hexagon_mode).lower()) "hexagon=%s" % str(enable_hexagon).lower(),
"--define",
"hta=%s" % str(enable_hta).lower())
if address_sanitizer: if address_sanitizer:
bazel_args += ("--config", "asan") bazel_args += ("--config", "asan")
else: else:
......