diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a86f2eb1729d784135f5499da6dcc7ba66af776..89916b0959ed482f0c9dcdfd767945d43643f21b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -106,7 +106,7 @@ mace_cc_test: GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - - python tools/bazel_adb_run.py --target="//test/ccunit:mace_cc_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS + - python tools/bazel_adb_run.py --target="//test/ccunit:mace_cc_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS - python tools/bazel_adb_run.py --target="//micro/test/ccunit:micro_ops_test" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=arm64-v8a mace_cc_benchmark: @@ -133,7 +133,7 @@ model_tests: fi - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file --cl_mem_type=buffer - - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file + - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml - python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file --address_sanitizer diff --git a/include/mace/public/mace.h b/include/mace/public/mace.h index 77488e77ba7097692aea12ef11f760b9df56dfb2..52cda774df420bd155b01cc540fede6879a3fccd 100644 --- a/include/mace/public/mace.h +++ b/include/mace/public/mace.h @@ -102,6 +102,17 @@ enum HexagonNNCornerType { HEXAGON_NN_CORNER_SVS2, }; +// APU Initial Cache Policy: +// NONE: Compile model using the information from net_def and model_data. +// STORE: Compile model using the information from net_def and model_data and +// store the compiled model. +// LOAD: Get input/output information from net_def and load pre-compiled model. +enum APUCachePolicy { + APU_CACHE_NONE = 0, + APU_CACHE_STORE = 1, + APU_CACHE_LOAD = 2, +}; + struct CallStats { int64_t start_micros; int64_t end_micros; @@ -355,6 +366,21 @@ class MACE_API MaceEngineConfig { bool dcvs_enable, int latency); + /// \brief Set MTK APU initial cache + /// + /// \param policy the policy for loading or storing the APU initial cache. + /// \param binary_file the path the cache file is loaded from. + /// \param storage_file the path the cache file is stored to. + /// + /// The cache file holds the compiled model, + /// which can speed up APU initialization. + /// If this API is not called, APU initialization may be slow. + /// + /// \return MaceStatus::MACE_SUCCESS for success, other for failure. 
+ MaceStatus SetAPUCache(APUCachePolicy policy, + const std::string &binary_file, + const std::string &storage_file); + private: class Impl; std::unique_ptr<Impl> impl_; diff --git a/mace/codegen/model_version_script.lds b/mace/codegen/model_version_script.lds index d1cc9dad28cab6cdb42aa503c54c0500287f806d..4a215dfc48d9cb7e52ab28d00fda2433f9ab3cbb 100644 --- a/mace/codegen/model_version_script.lds +++ b/mace/codegen/model_version_script.lds @@ -1,6 +1,7 @@ mace { global: *LoadModelData*; + *GetModelSize*; *CreateNet*; *ModelName*; *ModelChecksum*; diff --git a/mace/core/runtime/apu/apu_wrapper.cc b/mace/core/runtime/apu/apu_wrapper.cc index 42b8956af504b26df903b58cde01ec8f9ce4bd42..6feac5c699cab4503904037833b925a5ddb0f545 100644 --- a/mace/core/runtime/apu/apu_wrapper.cc +++ b/mace/core/runtime/apu/apu_wrapper.cc @@ -32,10 +32,14 @@ apu_data_type ApuWrapper::MapToApuDataType(DataType mace_type) { return APU_DATA_TYPE_INT32; case DT_HALF: return APU_DATA_TYPE_HALF; + case DT_FLOAT16: + return APU_DATA_TYPE_HALF; case DT_UINT8: return APU_DATA_TYPE_UINT8; + case DT_INT16: + return APU_DATA_TYPE_INT16; default: - MACE_CHECK(true, "unsupport mace data type"); + MACE_CHECK(false, "unsupported mace data type"); break; } return APU_DATA_TYPE_UNDEFINED; @@ -48,7 +52,7 @@ apu_pooling_mode ApuWrapper::MapToApuPoolingMode(int mace_mode) { case 2: return APU_POOLING_MAX; default: - MACE_CHECK(true, "unsupport mace pooling mode"); + MACE_CHECK(false, "unsupported mace pooling mode"); break; } return APU_POOLING_UNDEFINED; @@ -67,62 +71,37 @@ apu_eltwise_mode ApuWrapper::MapToApuEltwiseMode(int mace_mode) { case 5: return APU_ELTWISE_MAX; default: - MACE_CHECK(true, "unsupport mace eltwise mode"); + MACE_CHECK(false, "unsupported mace eltwise mode"); break; } return APU_ELTWISE_UNDEFINED; } -bool ApuWrapper::Init(const NetDef &net_def, - unsigned const char *model_data, - const index_t model_data_size) { +bool ApuWrapper::Init(const NetDef &net_def, unsigned const char *model_data, + const char *file_name, bool load, bool store) { frontend = new ApuFrontend(); + MACE_CHECK(!(load && store), + "Should not load and store the model simultaneously."); + // parse model argument int const_data_num = 0; + int apu_data_type = -1; for (auto arg : net_def.arg()) { if (arg.name().compare("const_data_num") == 0) { const_data_num = arg.i(); + } else if (arg.name().compare("apu_data_type") == 0) { + apu_data_type = arg.i(); } } - - // const tensors - std::vector<apu_tensor> const_tensors; - for (auto const_tensor : net_def.tensors()) { - apu_tensor tensor; - tensor.tensor_id = const_tensor.node_id(); - tensor.tensor_type = (tensor.tensor_id < const_data_num) ? - APU_TENSOR_CONST_DATA : - APU_TENSOR_CONST_ARGUMENT; - tensor.data_type = MapToApuDataType(const_tensor.data_type()); - tensor.scale = const_tensor.has_scale() ? const_tensor.scale() : 0.0f; - tensor.zero_point = const_tensor.has_zero_point() ? 
- const_tensor.zero_point() : 0; - tensor.dim_size = const_tensor.dims_size(); - MACE_CHECK(tensor.dim_size <= APU_TENSOR_MAX_DIMS, - "tensor dimension size not supported"); - for (auto i = 0; i < tensor.dim_size; i++) { - tensor.dims[i] = const_tensor.dims(i); - } - const auto tensor_end = const_tensor.offset() + - const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type()); - if (model_data_size >= 0) { - MACE_CHECK(tensor_end <= model_data_size, "tensor_end (", tensor_end, - ") should <= ", model_data_size); - } - tensor.data_buf = - const_cast<unsigned char *>(model_data + const_tensor.offset()); - const_tensors.push_back(tensor); - } - // input tensors std::vector<apu_tensor> input_tensors; for (auto input_info : net_def.input_info()) { apu_tensor tensor; tensor.tensor_id = input_info.node_id(); tensor.tensor_type = APU_TENSOR_MODEL_INPUT; - tensor.data_type = APU_DATA_TYPE_UINT8; // will do quantize in Run() - tensor.scale = input_info.has_scale() ? input_info.scale() : 0.0f; + tensor.data_type = MapToApuDataType(static_cast<DataType>(apu_data_type)); + tensor.scale = input_info.has_scale() ? input_info.scale() : -1.0f; tensor.zero_point = input_info.has_zero_point() ? input_info.zero_point() : 0; tensor.dim_size = input_info.dims_size(); @@ -131,114 +110,156 @@ bool ApuWrapper::Init(const NetDef &net_def, ApuTensorInfo info; info.name = input_info.name(); info.size = 1; - for (auto i = 0; i < tensor.dim_size; i++) { + info.data_type = tensor.data_type; + int byte_per_element = GetByteNum(tensor.data_type); + for (auto i = 0 ; i < tensor.dim_size ; i++) { tensor.dims[i] = input_info.dims(i); info.size *= input_info.dims(i); info.shape.push_back(input_info.dims(i)); } - info.buf = std::shared_ptr<uint8_t>(new uint8_t[info.size], - std::default_delete<uint8_t[]>()); + info.buf + = std::shared_ptr<uint8_t>(new uint8_t[info.size * byte_per_element], + std::default_delete<uint8_t[]>()); info.scale = tensor.scale; info.zero_point = tensor.zero_point; input_infos.push_back(info); tensor.data_buf = info.buf.get(); input_tensors.push_back(tensor); } - // output tensors - std::vector<int> output_tensor_ids; - std::vector<void *> output_buffers; + std::vector<apu_tensor> output_tensors; for (auto output_info : net_def.output_info()) { - output_tensor_ids.push_back(output_info.node_id()); + apu_tensor tensor; + tensor.tensor_id = output_info.node_id(); + tensor.tensor_type = APU_TENSOR_MODEL_OUTPUT; + tensor.data_type = MapToApuDataType(static_cast<DataType>(apu_data_type)); + tensor.dim_size = output_info.dims_size(); ApuTensorInfo info; info.name = output_info.name(); info.size = 1; - for (auto i = 0; i < output_info.dims().size(); i++) { + info.data_type = tensor.data_type; + int byte_per_element = GetByteNum(tensor.data_type); + for (auto i = 0 ; i < tensor.dim_size ; i++) { + tensor.dims[i] = output_info.dims(i); info.size *= output_info.dims(i); info.shape.push_back(output_info.dims(i)); } - info.buf = std::shared_ptr<uint8_t>(new uint8_t[info.size], - std::default_delete<uint8_t[]>()); + info.buf = + std::shared_ptr<uint8_t>(new uint8_t[info.size * byte_per_element], + std::default_delete<uint8_t[]>()); for (auto op_def : net_def.op()) { if (output_info.name() == op_def.output(0)) { - info.scale = op_def.quantize_info(0).scale(); - info.zero_point = op_def.quantize_info(0).zero_point(); + if (info.data_type == static_cast<apu_data_type>(APU_DATA_TYPE_UINT8) || + info.data_type == static_cast<apu_data_type>(APU_DATA_TYPE_INT16)) { + info.scale = op_def.quantize_info(0).scale(); + info.zero_point = op_def.quantize_info(0).zero_point(); + } else { + info.scale = 0.0; + info.zero_point = 0; + } } } output_infos.push_back(info); - 
output_buffers.push_back(info.buf.get()); + tensor.data_buf = info.buf.get(); + output_tensors.push_back(tensor); } - + // const tensors + std::vector<apu_tensor> const_tensors; // operators std::vector<apu_operator> ops; std::vector<std::vector<int>> cached_op_inputs; - for (auto op_def : net_def.op()) { - apu_operator op; - strncpy(op.type, op_def.type().c_str(), APU_OP_TYPE_MAX_SIZE); - op.input_size = op_def.node_input_size(); - std::vector<int> input_ids; - for (auto i = 0; i < op.input_size; i++) { - input_ids.push_back(op_def.node_input(i).node_id()); - } - cached_op_inputs.push_back(input_ids); - op.input_ids = cached_op_inputs.back().data(); - op.output.tensor_id = op_def.node_id(); - op.output.tensor_type = APU_TENSOR_OP_OUTPUT; - op.output.data_type = MapToApuDataType(op_def.output_type(0)); - if (op.output.data_type == APU_DATA_TYPE_UINT8) { - op.output.scale = op_def.quantize_info(0).scale(); - op.output.zero_point = op_def.quantize_info(0).zero_point(); - } else { - op.output.scale = 0.0f; - op.output.zero_point = 0; - } - op.output.dim_size = op_def.output_shape(0).dims_size(); - MACE_CHECK(op.output.dim_size <= APU_TENSOR_MAX_DIMS, - "tensor dimension size not supported"); - for (auto i = 0; i < op.output.dim_size; i++) { - op.output.dims[i] = op_def.output_shape(0).dims(i); + if (!load) { + // const tensors + for (auto const_tensor : net_def.tensors()) { + apu_tensor tensor; + tensor.tensor_id = const_tensor.node_id(); + tensor.tensor_type = (tensor.tensor_id < const_data_num) ? + APU_TENSOR_CONST_DATA : + APU_TENSOR_CONST_ARGUMENT; + tensor.data_type = MapToApuDataType(const_tensor.data_type()); + tensor.scale = const_tensor.has_scale() ? const_tensor.scale() : 0.0f; + tensor.zero_point = const_tensor.has_zero_point() ? + const_tensor.zero_point() : 0; + tensor.dim_size = const_tensor.dims_size(); + MACE_CHECK(tensor.dim_size <= APU_TENSOR_MAX_DIMS, + "tensor dimension size not supported"); + for (auto i = 0 ; i < tensor.dim_size ; i++) { + tensor.dims[i] = const_tensor.dims(i); + } + tensor.data_buf = + const_cast<unsigned char *>(model_data + const_tensor.offset()); + const_tensors.push_back(tensor); } - op.output.data_buf = nullptr; - // get op mode and activation mode - bool is_pooling = (strcmp(op.type, "Pooling") == 0); - bool is_eltwise = (strcmp(op.type, "Eltwise") == 0); - std::string activation; - float max_limit = 0.0f; - for (auto arg : op_def.arg()) { - if (arg.name().compare("activation") == 0) { - activation = arg.s(); + // operators + for (auto op_def : net_def.op()) { + apu_operator op; + strncpy(op.type, op_def.type().c_str(), APU_OP_TYPE_MAX_SIZE); + op.input_size = op_def.node_input_size(); + std::vector<int> input_ids; + for (auto i = 0 ; i < op.input_size ; i++) { + input_ids.push_back(op_def.node_input(i).node_id()); } - if (arg.name().compare("max_limit") == 0) { - max_limit = arg.f(); + cached_op_inputs.push_back(input_ids); + op.input_ids = cached_op_inputs.back().data(); + op.output.tensor_id = op_def.node_id(); + op.output.tensor_type = APU_TENSOR_OP_OUTPUT; + op.output.data_type = MapToApuDataType(op_def.output_type(0)); + if (op.output.data_type == APU_DATA_TYPE_UINT8 || + op.output.data_type == APU_DATA_TYPE_INT16) { + op.output.scale = op_def.quantize_info(0).scale(); + op.output.zero_point = op_def.quantize_info(0).zero_point(); + } else { + op.output.scale = 0.0f; + op.output.zero_point = 0; } - if (is_pooling && arg.name().compare("pooling_type") == 0) { - op.op_mode = static_cast<int>(MapToApuPoolingMode(arg.i())); + op.output.dim_size = op_def.output_shape(0).dims_size(); + MACE_CHECK(op.output.dim_size <= 
APU_TENSOR_MAX_DIMS, + "tensor dimension size not supported"); + for (auto i = 0 ; i < op.output.dim_size ; i++) { + op.output.dims[i] = op_def.output_shape(0).dims(i); } - if (is_eltwise && arg.name().compare("type") == 0) { - op.op_mode = static_cast<int>(MapToApuEltwiseMode(arg.i())); + op.output.data_buf = nullptr; + // get op mode and activation mode + bool is_pooling = (strcmp(op.type, "Pooling") == 0); + bool is_eltwise = (strcmp(op.type, "Eltwise") == 0); + std::string activation; + float max_limit = 0.0f; + for (auto arg : op_def.arg()) { + if (arg.name().compare("activation") == 0) { + activation = arg.s(); + } + if (arg.name().compare("max_limit") == 0) { + max_limit = arg.f(); + } + if (is_pooling && arg.name().compare("pooling_type") == 0) { + op.op_mode = static_cast<int>(MapToApuPoolingMode(arg.i())); + } + if (is_eltwise && arg.name().compare("type") == 0) { + op.op_mode = static_cast<int>(MapToApuEltwiseMode(arg.i())); + } } + if (activation.compare("RELU") == 0) { + op.act_mode = APU_ACT_RELU; + } else if (activation.compare("RELUX") == 0 && max_limit == 6.0) { + op.act_mode = APU_ACT_RELU6; + } else if (activation.compare("SIGMOID") == 0) { + op.act_mode = APU_ACT_SIGMOID; + } else if (activation.compare("TANH") == 0) { + op.act_mode = APU_ACT_TANH; + } else { + op.act_mode = APU_ACT_NONE; + } + ops.push_back(op); } - if (activation.compare("RELU") == 0) { - op.act_mode = APU_ACT_RELU; - } else if (activation.compare("RELUX") == 0 && max_limit == 6.0) { - op.act_mode = APU_ACT_RELU6; - } else { - op.act_mode = APU_ACT_NONE; - } - ops.push_back(op); } - bool print_model = false; bool ret = frontend->InitGraph( - const_tensors.size(), const_tensors.data(), - input_tensors.size(), input_tensors.data(), - output_tensor_ids.size(), output_tensor_ids.data(), - output_buffers.data(), - ops.size(), ops.data(), - print_model); + const_tensors.size(), const_tensors.data(), + input_tensors.size(), input_tensors.data(), + output_tensors.size(), output_tensors.data(), + ops.size(), ops.data(), + print_model, file_name, load, store); cached_op_inputs.clear(); - MACE_CHECK(ret == true, "apu init graph failed"); - return ret; } @@ -247,22 +268,35 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors, MACE_ASSERT(input_tensors.size() == input_infos.size(), "Wrong inputs num"); MACE_ASSERT(output_tensors.size() == output_infos.size(), "Wrong outputs num"); - // prepare input for (int i = 0 ; i < static_cast<int>(input_tensors.size()) ; i++) { Tensor *tensor = input_tensors.at(input_infos[i].name); // check size - int size = input_infos[i].size; - MACE_ASSERT(size == static_cast<int>(tensor->size()), "Wrong input size"); - + int element_size = input_infos[i].size; + int byte_per_element = GetByteNum(input_infos[i].data_type); + MACE_ASSERT(element_size == static_cast<int>(tensor->size()), + "Wrong input size"); // quantize - quantize_util_.QuantizeWithScaleAndZeropoint( - tensor->data<float>(), - size, - input_infos[i].scale, - input_infos[i].zero_point, - input_infos[i].buf.get()); + if (input_infos[i].data_type == APU_DATA_TYPE_INT16) { + quantize_util_.QuantizeWithScaleAndZeropoint( + (const float*)tensor->raw_data(), + element_size, + input_infos[i].scale, + input_infos[i].zero_point, + reinterpret_cast<int16_t *>(input_infos[i].buf.get())); + } else if (input_infos[i].data_type == APU_DATA_TYPE_FLOAT) { + std::memcpy(input_infos[i].buf.get(), + (const float*)tensor->raw_data(), + element_size * byte_per_element); + } else { + quantize_util_.QuantizeWithScaleAndZeropoint( + (const float*)tensor->raw_data(), + element_size, + 
input_infos[i].scale, + input_infos[i].zero_point, + input_infos[i].buf.get()); + } } // run model @@ -276,16 +310,30 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors, // prepare out buffer tensor->SetDtype(DT_FLOAT); tensor->Resize(output_infos[i].shape); - int size = output_infos[i].size; - MACE_ASSERT(size == static_cast<int>(tensor->size()), "Wrong output size"); - + int element_size = output_infos[i].size; + int byte_per_element = GetByteNum(output_infos[i].data_type); + MACE_ASSERT(element_size == static_cast<int>(tensor->size()), + "Wrong output size"); // dequantize - quantize_util_.Dequantize( - output_infos[i].buf.get(), - size, - output_infos[i].scale, - output_infos[i].zero_point, - tensor->mutable_data<float>()); + if (output_infos[i].data_type == APU_DATA_TYPE_INT16) { + quantize_util_.Dequantize( + reinterpret_cast<int16_t *>(output_infos[i].buf.get()), + element_size, + output_infos[i].scale, + output_infos[i].zero_point, + reinterpret_cast<float *>(tensor->raw_mutable_data())); + } else if (output_infos[i].data_type == APU_DATA_TYPE_FLOAT) { + std::memcpy(reinterpret_cast<float *>(tensor->raw_mutable_data()), + output_infos[i].buf.get(), + element_size * byte_per_element); + } else { + quantize_util_.Dequantize( + output_infos[i].buf.get(), + element_size, + output_infos[i].scale, + output_infos[i].zero_point, + reinterpret_cast<float *>(tensor->raw_mutable_data())); + } } return true; @@ -299,4 +347,20 @@ bool ApuWrapper::Uninit() { return ret; } +int ApuWrapper::GetByteNum(apu_data_type data_type) { + int byte_per_element; + if (data_type == APU_DATA_TYPE_FLOAT || data_type == APU_DATA_TYPE_INT32) { + byte_per_element = 4; + } else if (data_type == APU_DATA_TYPE_HALF || + data_type == APU_DATA_TYPE_INT16) { + byte_per_element = 2; + } else if (data_type == APU_DATA_TYPE_UINT8) { + byte_per_element = 1; + } else { + byte_per_element = 1; + MACE_CHECK(false, "unsupported data type"); + } + return byte_per_element; +} + } // namespace mace diff --git a/mace/core/runtime/apu/apu_wrapper.h b/mace/core/runtime/apu/apu_wrapper.h old mode 100755 new mode 100644 index 7b87e56c8500a854904111c6aed2678cf9d13ce3..a18694edd0681f0b9a65a2ceaa922a1e6bf0582c --- a/mace/core/runtime/apu/apu_wrapper.h +++ b/mace/core/runtime/apu/apu_wrapper.h @@ -37,12 +37,14 @@ class ApuWrapper { int size; float scale; int zero_point; + apu_data_type data_type; }; public: explicit ApuWrapper(Device *device); - bool Init(const NetDef &net_def, unsigned const char *model_data, - const index_t model_data_size); + bool Init(const NetDef& net_def, unsigned const char *model_data = nullptr, + const char *file_name = nullptr, + bool load = false, bool store = false); bool Run(const std::map<std::string, Tensor *> &input_tensors, std::map<std::string, Tensor *> *output_tensors); bool Uninit(); @@ -51,6 +53,7 @@ class ApuWrapper { apu_data_type MapToApuDataType(DataType mace_type); apu_pooling_mode MapToApuPoolingMode(int mace_mode); apu_eltwise_mode MapToApuEltwiseMode(int mace_mode); + int GetByteNum(apu_data_type data_type); private: ApuFrontend *frontend; diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 743d10c6acdad0559c7aa310f0f653ef6eb897c8..d31f9eb56e1415f3691da8593926263abfb6b846 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -201,6 +201,10 @@ class MaceEngineConfig::Impl { bool dcvs_enable, int latency); + MaceStatus SetAPUCache(APUCachePolicy policy, + const std::string &binary_file, + const std::string &storage_file); + inline DeviceType device_type() const { return device_type_; } @@ -237,6 +241,18 @@ class MaceEngineConfig::Impl { return hexagon_latency_; } + inline 
APUCachePolicy apu_cache_policy() const { return apu_cache_policy_; } + + inline std::string apu_binary_file() const { return apu_binary_file_; } + + inline std::string apu_storage_file() const { return apu_storage_file_; } + private: DeviceType device_type_; int num_threads_; @@ -247,6 +263,9 @@ class MaceEngineConfig::Impl { HexagonNNCornerType hexagon_corner_; bool hexagon_dcvs_enable_; int hexagon_latency_; + APUCachePolicy apu_cache_policy_; + std::string apu_binary_file_; + std::string apu_storage_file_; }; MaceEngineConfig::Impl::Impl(const DeviceType device_type) @@ -258,7 +277,10 @@ MaceEngineConfig::Impl::Impl(const DeviceType device_type) gpu_perf_hint_(GPUPerfHint::PERF_NORMAL), hexagon_corner_(HexagonNNCornerType::HEXAGON_NN_CORNER_TURBO), hexagon_dcvs_enable_(true), - hexagon_latency_(100) {} + hexagon_latency_(100), + apu_cache_policy_(APUCachePolicy::APU_CACHE_NONE), + apu_binary_file_(""), + apu_storage_file_("") {} MaceStatus MaceEngineConfig::Impl::SetGPUContext( std::shared_ptr<GPUContext> context) { @@ -282,14 +304,15 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy( return MaceStatus::MACE_SUCCESS; } +#ifdef MACE_ENABLE_HEXAGON MaceStatus MaceEngineConfig::Impl::SetHexagonToUnsignedPD() { bool ret = false; -#ifdef MACE_ENABLE_HEXAGON ret = HexagonDSPWrapper::RequestUnsignedPD(); -#endif return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR; } +#endif +#ifdef MACE_ENABLE_HEXAGON MaceStatus MaceEngineConfig::Impl::SetHexagonPower( HexagonNNCornerType corner, bool dcvs_enable, @@ -298,11 +321,24 @@ MaceStatus MaceEngineConfig::Impl::SetHexagonPower( hexagon_dcvs_enable_ = dcvs_enable; hexagon_latency_ = latency; bool ret = false; -#ifdef MACE_ENABLE_HEXAGON ret = HexagonDSPWrapper::SetPower(corner, dcvs_enable, latency); + return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR; +} #endif + +#ifdef MACE_ENABLE_APU +MaceStatus MaceEngineConfig::Impl::SetAPUCache( + APUCachePolicy policy, + const std::string &binary_file, + const std::string &storage_file) { + bool ret = false; + apu_cache_policy_ = policy; + apu_binary_file_ = binary_file; + apu_storage_file_ = storage_file; + ret = true; return ret ? 
MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR; } +#endif MaceEngineConfig::MaceEngineConfig( const DeviceType device_type) @@ -338,6 +374,13 @@ MaceStatus MaceEngineConfig::SetHexagonPower( return impl_->SetHexagonPower(corner, dcvs_enable, latency); } +MaceStatus MaceEngineConfig::SetAPUCache( + APUCachePolicy policy, + const std::string &binary_file, + const std::string &storage_file) { + return impl_->SetAPUCache(policy, binary_file, storage_file); +} + // Mace Tensor class MaceTensor::Impl { public: @@ -478,6 +521,9 @@ class MaceEngine::Impl { #endif #ifdef MACE_ENABLE_APU std::unique_ptr<ApuWrapper> apu_controller_; + APUCachePolicy apu_cache_policy_; + std::string apu_binary_file_; + std::string apu_storage_file_; #endif MACE_DISABLE_COPY_AND_ASSIGN(Impl); @@ -504,6 +550,9 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) #endif #ifdef MACE_ENABLE_APU , apu_controller_(nullptr) + , apu_cache_policy_(config.impl_->apu_cache_policy()) + , apu_binary_file_(config.impl_->apu_binary_file()) + , apu_storage_file_(config.impl_->apu_storage_file()) #endif { LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion(); @@ -660,8 +709,21 @@ MaceStatus MaceEngine::Impl::Init( #ifdef MACE_ENABLE_APU if (device_type_ == APU) { apu_controller_.reset(new ApuWrapper(device_.get())); - MACE_CHECK(apu_controller_->Init( - *net_def, model_data, model_data_size), "apu init error"); + bool cache_load = apu_cache_policy_ == APUCachePolicy::APU_CACHE_LOAD; + bool cache_store = apu_cache_policy_ == APUCachePolicy::APU_CACHE_STORE; + const char* file_name = cache_store ? + apu_storage_file_.c_str() : apu_binary_file_.c_str(); + bool ret = false; + if (cache_load || cache_store) { + VLOG(1) << "Loading/Storing init cache"; + ret = apu_controller_->Init( + *net_def, model_data, file_name, cache_load, cache_store); + } + if (!ret && !cache_store) { + VLOG(1) << "Do not use init cache"; + ret = apu_controller_->Init(*net_def, model_data); + } + MACE_CHECK(ret, "apu init error", cache_load, cache_store); } else { #endif MACE_RETURN_IF_ERROR(ws_->LoadModelTensor( diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index d92f9b13d244a88754579de12cabf35fe6476fc7..0997046095be6325f70a70f4626f97ba32e81857 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -15,6 +15,7 @@ enum DataType { DT_INT32 = 4; DT_FLOAT16 = 5; DT_BFLOAT16 = 6; + DT_INT16 = 7; } enum MemoryType { diff --git a/mace/tools/BUILD.bazel b/mace/tools/BUILD.bazel index 43201a290903a3597f1fad90c555ca38da68e358..6f66158a1c8f7e144a8ee09f014584897f1dd012 100644 --- a/mace/tools/BUILD.bazel +++ b/mace/tools/BUILD.bazel @@ -5,6 +5,7 @@ load( "if_android", "if_hexagon_enabled", "if_opencl_enabled", + "if_apu_enabled", ) licenses(["notice"]) # Apache 2.0 @@ -22,6 +23,8 @@ cc_binary( "-DMACE_ENABLE_OPENCL", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", + ]) + if_apu_enabled([ + "-DMACE_ENABLE_APU", ]), linkstatic = 1, deps = [ @@ -46,6 +49,8 @@ cc_binary( "-DMACE_ENABLE_OPENCL", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", + ]) + if_apu_enabled([ + "-DMACE_ENABLE_APU", ]), linkopts = [ "-lm", diff --git a/mace/tools/mace_run.cc b/mace/tools/mace_run.cc index 25b054111cdf8f88bab16bd11cdaaa56aa1e7002..61fc3369e5a9b4bf350cc2ae547bde778b6bd333 100644 --- a/mace/tools/mace_run.cc +++ b/mace/tools/mace_run.cc @@ -144,6 +144,12 @@ DEFINE_string(model_data_file, DEFINE_string(model_file, "", "model file name, used when load mace model in pb"); +DEFINE_string(apu_binary_file, + "", + "apu init cache path, used when loading apu 
init cache"); +DEFINE_string(apu_storage_file, + "", + "apu init cache path, used when store apu init cache"); DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON/APU"); DEFINE_int32(round, 1, "round"); DEFINE_int32(restart_round, 1, "restart round"); @@ -153,6 +159,7 @@ DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); DEFINE_int32(num_threads, -1, "num of threads"); DEFINE_int32(cpu_affinity_policy, 1, "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); +DEFINE_int32(apu_cache_policy, 0, "0:NONE/1:STORE/2:LOAD"); DEFINE_bool(benchmark, false, "enable benchmark op"); bool RunModel(const std::string &model_name, @@ -201,6 +208,11 @@ bool RunModel(const std::string &model_name, // firmware) or 8250 family above to run hexagon nn on unsigned PD. // config.SetHexagonToUnsignedPD(); config.SetHexagonPower(HEXAGON_NN_CORNER_TURBO, true, 100); +#endif +#ifdef MACE_ENABLE_APU + config.SetAPUCache(static_cast(FLAGS_apu_cache_policy), + FLAGS_apu_binary_file, + FLAGS_apu_storage_file); #endif std::unique_ptr model_graph_data = make_unique(); @@ -539,6 +551,9 @@ int Main(int argc, char **argv) { LOG(INFO) << "output dir: " << FLAGS_output_dir; LOG(INFO) << "model_data_file: " << FLAGS_model_data_file; LOG(INFO) << "model_file: " << FLAGS_model_file; + LOG(INFO) << "apu_cache_policy: " << FLAGS_apu_cache_policy; + LOG(INFO) << "apu_binary_file: " << FLAGS_apu_binary_file; + LOG(INFO) << "apu_storage_file: " << FLAGS_apu_storage_file; LOG(INFO) << "device: " << FLAGS_device; LOG(INFO) << "round: " << FLAGS_round; LOG(INFO) << "restart_round: " << FLAGS_restart_round; diff --git a/third_party/apu/ApuFrontend.h b/third_party/apu/ApuFrontend.h old mode 100755 new mode 100644 index a715f1dc93d64e3790a220851b4101115f45a3d9..352185f5c08fa84c187a200d8b9312b0c82f9ce2 --- a/third_party/apu/ApuFrontend.h +++ b/third_party/apu/ApuFrontend.h @@ -6,6 +6,8 @@ enum apu_act_mode { APU_ACT_NONE = 0, APU_ACT_RELU = 1, APU_ACT_RELU6 = 2, + APU_ACT_SIGMOID = 3, + APU_ACT_TANH = 4, }; enum apu_pooling_mode { @@ -29,6 +31,7 @@ enum apu_data_type { APU_DATA_TYPE_UINT8 = 2, APU_DATA_TYPE_HALF = 3, APU_DATA_TYPE_INT32 = 4, + APU_DATA_TYPE_INT16 = 5, }; enum apu_tensor_type { @@ -37,6 +40,7 @@ enum apu_tensor_type { APU_TENSOR_CONST_ARGUMENT = 2, APU_TENSOR_MODEL_INPUT = 3, APU_TENSOR_OP_OUTPUT = 4, + APU_TENSOR_MODEL_OUTPUT = 5, }; #define APU_TENSOR_MAX_DIMS 4 @@ -70,10 +74,10 @@ class ApuFrontend { bool InitGraph(int const_tensor_size, const apu_tensor* const_tensors, int input_tensor_size, const apu_tensor* input_tensors, - int output_tensor_size, const int* output_tensor_ids, - void** output_buffers, + int output_tensor_size, const apu_tensor* output_tensors, int operator_size, const apu_operator* operators, - bool print_model); + bool print_model, const char *file_name, + bool load, bool store); bool RunGraph(); bool UninitGraph(); diff --git a/third_party/apu/libapu-frontend.so b/third_party/apu/libapu-frontend.so old mode 100755 new mode 100644 index a6ffaa76aa685c29c224bf61138da004c533309a..79a1be2ef424c7fa473f153c7a66281001d41023 Binary files a/third_party/apu/libapu-frontend.so and b/third_party/apu/libapu-frontend.so differ diff --git a/third_party/apu/mt6853/libapu-frontend.so b/third_party/apu/mt6853/libapu-frontend.so new file mode 100644 index 0000000000000000000000000000000000000000..2144858555caa0c0926de15a16eeab9ce3aabf46 Binary files /dev/null and b/third_party/apu/mt6853/libapu-frontend.so differ diff --git a/third_party/apu/mt6853/libapu-platform.so 
b/third_party/apu/mt6853/libapu-platform.so new file mode 100644 index 0000000000000000000000000000000000000000..7537371553ec0daf3c97c6277a2ba16a3275b173 Binary files /dev/null and b/third_party/apu/mt6853/libapu-platform.so differ diff --git a/third_party/apu/mt6873/libapu-frontend.so b/third_party/apu/mt6873/libapu-frontend.so new file mode 100644 index 0000000000000000000000000000000000000000..453f5388c1986bd749fec9d0249dc7c0fbe7e530 Binary files /dev/null and b/third_party/apu/mt6873/libapu-frontend.so differ diff --git a/third_party/apu/mt6873/libapu-platform.so b/third_party/apu/mt6873/libapu-platform.so new file mode 100644 index 0000000000000000000000000000000000000000..af29cee6a9f6554595fd8c9066dbaff12a4fe07c Binary files /dev/null and b/third_party/apu/mt6873/libapu-platform.so differ diff --git a/third_party/apu/mt6885/libapu-frontend.so b/third_party/apu/mt6885/libapu-frontend.so new file mode 100644 index 0000000000000000000000000000000000000000..453f5388c1986bd749fec9d0249dc7c0fbe7e530 Binary files /dev/null and b/third_party/apu/mt6885/libapu-frontend.so differ diff --git a/third_party/apu/mt6885/libapu-platform.so b/third_party/apu/mt6885/libapu-platform.so new file mode 100644 index 0000000000000000000000000000000000000000..af29cee6a9f6554595fd8c9066dbaff12a4fe07c Binary files /dev/null and b/third_party/apu/mt6885/libapu-platform.so differ diff --git a/tools/common.py b/tools/common.py index fd17f80fb7d06059a6dcdb3a1e31c6ed67b3a9d9..a736960d3789c090793e1d62b1caec5f70c7fa61 100644 --- a/tools/common.py +++ b/tools/common.py @@ -415,6 +415,7 @@ class YAMLKeyword(object): quantize_large_weights = 'quantize_large_weights' quantize_range_file = 'quantize_range_file' quantize_stat = 'quantize_stat' + quantize_schema = 'quantize_schema' change_concat_ranges = 'change_concat_ranges' validation_inputs_data = 'validation_inputs_data' validation_threshold = 'validation_threshold' diff --git a/tools/converter.py b/tools/converter.py index 3f1601d1e53644c9e01332007cadae95ecdd3375..0b5d81095d06eac6addbc9ace79eef46d72ca848 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -122,6 +122,7 @@ class DefaultValues(object): cpu_affinity_policy = 1, gpu_perf_hint = 3, gpu_priority_hint = 3, + apu_cache_policy = 0, class ValidationThreshold(object): @@ -1175,6 +1176,21 @@ def parse_args(): "--benchmark", action="store_true", help="enable op benchmark.") + run.add_argument( + "--apu_cache_policy", + type=int, + default=DefaultValues.apu_cache_policy, + help="0:NONE/1:STORE/2:LOAD") + run.add_argument( + "--apu_binary_file", + type=str, + default="", + help="apu cache load path.") + run.add_argument( + "--apu_storage_file", + type=str, + default="", + help="apu cache store path.") return parser.parse_known_args() diff --git a/tools/device.py b/tools/device.py index 312cbb0855ca21c8bc8363fb00c1ddbc13be1d34..fc49a5dc15cbe36a04ab678d27cfd9ffa9879264 100644 --- a/tools/device.py +++ b/tools/device.py @@ -177,6 +177,9 @@ class DeviceWrapper: cpu_affinity_policy=1, gpu_perf_hint=3, gpu_priority_hint=3, + apu_cache_policy=0, + apu_binary_file="", + apu_storage_file="", input_file_name='model_input', output_file_name='model_out', input_dir="", @@ -282,7 +285,20 @@ class DeviceWrapper: "third_party/nnlib/%s/libhexagon_controller.so" % abi, self.data_dir) + apu_storage_cpy = False if device_type == common.DeviceType.APU: + if apu_cache_policy == 1: + if not apu_storage_file: + apu_storage_cpy = True + apu_src_file = model_tag + ".bin" + apu_storage_file = os.path.join(self.data_dir, + apu_src_file) 
+ elif apu_cache_policy == 2: + if os.path.exists(apu_binary_file): + self.push(apu_binary_file, self.data_dir) + apu_binary_file = os.path.join(self.data_dir, + os.path.basename( + apu_binary_file)) self.push("third_party/apu/libapu-frontend.so", self.data_dir) @@ -345,6 +361,9 @@ class DeviceWrapper: (self.data_dir, os.path.basename(opencl_binary_file)), "--opencl_parameter_file=%s/%s" % (self.data_dir, os.path.basename(opencl_parameter_file)), + "--apu_cache_policy=%s" % apu_cache_policy, + "--apu_binary_file=%s" % apu_binary_file, + "--apu_storage_file=%s" % apu_storage_file, ]) if benchmark: cmd.append("--benchmark=%s" % benchmark) @@ -364,6 +383,11 @@ class DeviceWrapper: _out=process_output, _err_to_out=True) self.stdout = "".join(stdout_buff) + + if apu_storage_cpy: + self.pull_from_data_dir( + apu_src_file, '{}/apu_init_cache/'.format(mace_model_dir)) + if not sh_commands.stdout_success(self.stdout): common.MaceLogger.error("Mace Run", "Mace run failed.") @@ -545,6 +569,9 @@ class DeviceWrapper: cpu_affinity_policy=flags.cpu_affinity_policy, gpu_perf_hint=flags.gpu_perf_hint, gpu_priority_hint=flags.gpu_priority_hint, + apu_cache_policy=flags.apu_cache_policy, + apu_binary_file=flags.apu_binary_file, + apu_storage_file=flags.apu_storage_file, runtime_failure_ratio=flags.runtime_failure_ratio, address_sanitizer=flags.address_sanitizer, opencl_binary_file=model_opencl_output_bin_path, diff --git a/tools/python/convert.py b/tools/python/convert.py index f12e613dbe559c00431ab0e8d563cb4004543cab..cde07cffdbf8f375cfd2762a06270b03229762b7 100644 --- a/tools/python/convert.py +++ b/tools/python/convert.py @@ -113,6 +113,8 @@ def convert_model(conf, quantize_stat): option.winograd = conf[ModelKeys.winograd] if ModelKeys.quantize in conf: option.quantize = conf[ModelKeys.quantize] + if ModelKeys.quantize_schema in conf: + option.quantize_schema = conf[ModelKeys.quantize_schema] if ModelKeys.quantize_large_weights in conf: option.quantize_large_weights = conf[ModelKeys.quantize_large_weights] if ModelKeys.quantize_range_file in conf: diff --git a/tools/python/quantize/quantize_util.py b/tools/python/quantize/quantize_util.py index bb80e0dc41ddecb6f424cc3e55d112397d67c7c6..410c049300605718b35eccb5b9ff25a78a4efb6d 100644 --- a/tools/python/quantize/quantize_util.py +++ b/tools/python/quantize/quantize_util.py @@ -171,6 +171,24 @@ def quantize(data, device, non_zero): return quantized_data +# only support int16 symmetric quantization. 
+def quantize_int16(data): + np_data = np.array(data).astype(float) + max_val = max(abs(np_data.min()), abs(np_data.max())) + scale = max_val / 2**15 + zero = 0 + output = np.clip((np.round(zero + np_data / scale).astype(np.int32)), + -2**15, 2**15 - 1) + + quantized_data = QuantizedData() + quantized_data.data = output + quantized_data.scale = scale + quantized_data.zero = zero + quantized_data.minval = -max_val + quantized_data.maxval = max_val + return quantized_data + + def quantize_bias_for_hexagon(data): np_data = np.array(data).astype(float) max_val = max(abs(np_data.min()), abs(np_data.max())) diff --git a/tools/python/template/model_header.jinja2 b/tools/python/template/model_header.jinja2 index ea1c5f6ce3fbf1290e3268793a8b083cdf3e1bc1..cd8d2035902b2b022b77c8e07ee464522f3dbe84 100644 --- a/tools/python/template/model_header.jinja2 +++ b/tools/python/template/model_header.jinja2 @@ -26,6 +26,7 @@ namespace {{tag}} { MACE_API extern const unsigned char *LoadModelData(); +MACE_API extern const int64_t GetModelSize(); MACE_API extern const std::shared_ptr<NetDef> CreateNet(); diff --git a/tools/python/transform/apu_converter.py b/tools/python/transform/apu_converter.py index 12f302ec1752a53117f4855e1bc21e228452cdb1..faeb0be688010cbbe776f635f4b9545b4444e931 100644 --- a/tools/python/transform/apu_converter.py +++ b/tools/python/transform/apu_converter.py @@ -24,6 +24,7 @@ from transform.base_converter import EltwiseType from transform.base_converter import MaceKeyword from transform.base_converter import MaceOp from transform.base_converter import PaddingMode +from transform.base_converter import PadType from transform.base_converter import PoolingType from transform.base_converter import ReduceType from transform.base_converter import DataFormat @@ -32,16 +33,17 @@ from utils.util import mace_check ApuSupportedOps = [ + 'Activation', 'Concat', 'Conv2D', 'DepthwiseConv2d', 'Eltwise', + 'Pad', 'Pooling', 'Reduce', 'ResizeBilinear', 'Reshape', 'Softmax', - 'Squeeze', ] ApuOp = Enum('ApuOp', [(op, op) for op in ApuSupportedOps], type=str) @@ -50,16 +52,18 @@ ApuOp = Enum('ApuOp', [(op, op) for op in ApuSupportedOps], type=str) class ApuOps(object): def __init__(self): self.apu_ops = { + MaceOp.Activation.name: ApuOp.Activation.name, MaceOp.Concat.name: ApuOp.Concat.name, MaceOp.Conv2D.name: ApuOp.Conv2D.name, MaceOp.DepthwiseConv2d.name: ApuOp.DepthwiseConv2d.name, MaceOp.Eltwise.name: ApuOp.Eltwise.name, + MaceOp.Pad.name: ApuOp.Pad.name, MaceOp.Pooling.name: ApuOp.Pooling.name, MaceOp.Reduce.name: ApuOp.Reduce.name, MaceOp.ResizeBilinear.name: ApuOp.ResizeBilinear.name, MaceOp.Reshape.name: ApuOp.Reshape.name, MaceOp.Softmax.name: ApuOp.Softmax.name, - MaceOp.Squeeze.name: ApuOp.Squeeze.name, + MaceOp.Squeeze.name: ApuOp.Reshape.name, } def has_op(self, op_name): @@ -78,17 +82,30 @@ class ApuConverter(base_converter.ConverterInterface): self._apu_ops = ApuOps() def run(self): - self.use_uint8_in_out() + if self._option.quantize: + self.use_quant_in_out() self.add_op_output_type() self.ensure_bias_vector() + self.ensure_binary_input() self.common_check() if ConverterUtil.get_arg(self._model.op[0], MaceKeyword.mace_framework_type_str).i == \ FrameworkType.TENSORFLOW.value: self.add_tensorflow_padding_value() + # Calculate the number of apu constant tensors + # Any tensors which will be apu constant tensors should be added + # above this line const_data_num_arg = self._model.arg.add() const_data_num_arg.name = MaceKeyword.mace_const_data_num_arg_str const_data_num_arg.i = len(self._model.tensors) + 
apu_data_type_arg = self._model.arg.add() + apu_data_type_arg.name = MaceKeyword.mace_apu_data_type_arg_str + if self._option.quantize_schema == 'mace_apu_16bit_per_tensor': + apu_data_type_arg.i = mace_pb2.DT_INT16 + elif self._option.quantize: + apu_data_type_arg.i = mace_pb2.DT_UINT8 + else: + apu_data_type_arg.i = mace_pb2.DT_FLOAT self.convert_ops() self.add_node_id() return self._model @@ -104,9 +121,11 @@ class ApuConverter(base_converter.ConverterInterface): ' match') mace_check(len(op.output_shape[0].dims) <= 4, op.name + ': apu only support 1D~4D tensor') - mace_check(len(op.output) == len(op.quantize_info), - op.name + ': length of output and quantize_info not' - ' match') + if op.output_type[0] == mace_pb2.DT_UINT8 \ + or op.output_type[0] == mace_pb2.DT_INT16: + mace_check(len(op.output) == len(op.quantize_info), + op.name + ': length of output and quantize_info not' + ' match') data_format = ConverterUtil.data_format(op) if data_format is not None and len(op.output_shape[0].dims) == 4: mace_check((data_format == DataFormat.NHWC) @@ -117,9 +136,11 @@ class ApuConverter(base_converter.ConverterInterface): op, MaceKeyword.mace_activation_type_str) if act_mode_arg is not None: mace_check(act_mode_arg.s == b'RELU' - or act_mode_arg.s == b'RELUX', - op.name + ': apu only support activation RELU and' - ' RELUX') + or act_mode_arg.s == b'RELUX' + or act_mode_arg.s == b'TANH' + or act_mode_arg.s == b'SIGMOID', + op.name + ': apu only support activation RELU,' + ' RELUX, TANH and SIGMOID') for tensor in self._model.tensors: mace_check(len(tensor.dims) <= 4, tensor.name + ': apu only support 1D~4D tensor') @@ -138,7 +159,6 @@ class ApuConverter(base_converter.ConverterInterface): for op in self._model.op: if not self._apu_ops.has_op(op.type): raise Exception('Unsupported op: ', op) - if op.type == MaceOp.Conv2D.name \ or op.type == MaceOp.DepthwiseConv2d.name: mace_check(len(op.input) == 3, @@ -146,7 +166,7 @@ class ApuConverter(base_converter.ConverterInterface): ' with 3 input') self.add_size_tensor_from_arg( op, MaceKeyword.mace_strides_str) - self.add_padding_tensor_from_arg(op) + self.add_padding_value_tensor_from_arg(op) self.add_size_tensor_from_arg( op, MaceKeyword.mace_dilations_str) if op.type == MaceOp.DepthwiseConv2d.name: @@ -160,22 +180,64 @@ class ApuConverter(base_converter.ConverterInterface): break op.input.extend([multiplier.name]) elif op.type == MaceOp.Eltwise.name: + eltwise_type = ConverterUtil.get_arg( + op, MaceKeyword.mace_element_type_str).i + # We only handle SUM and PROD operators now which are + # commutative mace_check(len(op.input) == 2, op.name + ': apu only support eltwise op with 2' ' input') - eltwise_type = ConverterUtil.get_arg( - op, MaceKeyword.mace_element_type_str).i - mace_check(eltwise_type == EltwiseType.SUM.value, - op.name + ': apu only support eltwise type SUM') + mace_check(eltwise_type == EltwiseType.SUM.value + or eltwise_type == EltwiseType.PROD.value, + op.name + + ': apu only support eltwise type SUM or PROD') + elif op.type == MaceOp.Pad.name: + mace_check(len(op.input) == 1, + op.name + ': apu only support Pad op with 1' + ' input') + pad_type_arg = \ + ConverterUtil.get_arg(op, MaceKeyword.mace_pad_type_str) + if pad_type_arg is not None: + mace_check(PadType(pad_type_arg.i) == + PadType.CONSTANT, op.name + + ': apu only support Pad type CONSTANT') + + padding_arg = ConverterUtil.get_arg( + op, MaceKeyword.mace_paddings_str) + mace_check(len(padding_arg.ints) == 8, + op.name + ': paddings does not have size 8') + mace_check({0} == 
+ {padding_arg.ints[0], padding_arg.ints[1], + padding_arg.ints[6], padding_arg.ints[7]}, + op.name + ': apu only support Pad op with padding' + ' in H/W dimensions') + data_type = ConverterUtil.get_arg(op, 'T').i + constant_value_arg = ConverterUtil.get_arg( + op, MaceKeyword.mace_constant_value_str) + if constant_value_arg is not None: + if data_type in [mace_pb2.DT_FLOAT, mace_pb2.DT_HALF]: + constant_value = constant_value_arg.f + elif data_type == mace_pb2.DT_INT32: + constant_value = constant_value_arg.i + else: + mace_check(False, "Not supported data type") + + mace_check(constant_value == 0, + op.name + ': apu only support Pad op with zero' + ' padding') + self.add_paddings_tensor_from_arg(op) + elif op.type == MaceOp.Pooling.name: mace_check(len(op.input) == 1, op.name + ': apu only support pooling op with 1' ' input') pooling_type_arg = ConverterUtil.get_arg( op, MaceKeyword.mace_pooling_type_str) - mace_check(PoolingType(pooling_type_arg.i) == PoolingType.AVG, - op.name + ': apu only support pooling type AVG') - self.add_padding_tensor_from_arg(op) + mace_check(PoolingType(pooling_type_arg.i) in + [PoolingType.AVG, PoolingType.MAX], + op.name + ': apu only support pooling type AVG,' + ' MAX') + self.add_padding_value_tensor_from_arg(op) self.add_size_tensor_from_arg( op, MaceKeyword.mace_strides_str) self.add_size_tensor_from_arg(op, MaceKeyword.mace_kernel_str) @@ -213,8 +275,7 @@ class ApuConverter(base_converter.ConverterInterface): mace_check(len(op.input) == 1, op.name + ': apu only support squeeze op with 1' ' input') - self.add_int_list_tensor_from_arg( - op, MaceKeyword.mace_axis_str) + self.add_shape_tensor_from_axis_arg(op) op.type = self._apu_ops.map_nn_op(op.type) @@ -222,7 +283,12 @@ class ApuConverter(base_converter.ConverterInterface): type_map = {} for input_info in self._model.input_info: # will do input quantize in wrapper - type_map[input_info.name] = mace_pb2.DT_UINT8 + if self._option.quantize_schema == 'mace_apu_16bit_per_tensor': + type_map[input_info.name] = mace_pb2.DT_INT16 + elif self._option.quantize: + type_map[input_info.name] = mace_pb2.DT_UINT8 + else: + type_map[input_info.name] = mace_pb2.DT_FLOAT for op in self._model.op: if len(op.output_type) >= 1: @@ -239,8 +305,11 @@ class ApuConverter(base_converter.ConverterInterface): op.name + ': length of output and output_type not' ' match') mace_check(op.output_type[0] == mace_pb2.DT_UINT8 - or op.output_type[0] == mace_pb2.DT_INT32, - op.name + ': apu only support quantized node') + or op.output_type[0] == mace_pb2.DT_INT16 + or op.output_type[0] == mace_pb2.DT_INT32 + or op.output_type[0] == mace_pb2.DT_FLOAT, + op.name + ': apu only support quantized or float16' + ' node') def add_node_id(self): node_id_counter = 0 @@ -266,7 +335,7 @@ class ApuConverter(base_converter.ConverterInterface): for output_info in self._model.output_info: output_info.node_id = node_id_map[output_info.name] - def add_padding_tensor_from_arg(self, op): + def add_padding_value_tensor_from_arg(self, op): padding_value_arg = ConverterUtil.get_arg( op, MaceKeyword.mace_padding_values_str) mace_check(len(padding_value_arg.ints) == 4, @@ -278,6 +347,19 @@ class ApuConverter(base_converter.ConverterInterface): padding_value_tensor.int32_data.extend(padding_value_arg.ints) op.input.extend([padding_value_tensor.name]) + def add_paddings_tensor_from_arg(self, op): + padding_value_arg = ConverterUtil.get_arg( + op, MaceKeyword.mace_paddings_str) + padding_value_tensor = self._model.tensors.add() + padding_value_tensor.name = op.name 
+ '/padding:0' + padding_value_tensor.data_type = mace_pb2.DT_INT32 + mace_check(len(padding_value_arg.ints) % 2 == 0, + op.name + ': the rank of paddings should be even') + padding_value_tensor.dims.extend( + [int(len(padding_value_arg.ints) / 2), 2]) + padding_value_tensor.int32_data.extend(padding_value_arg.ints) + op.input.extend([padding_value_tensor.name]) + def add_size_tensor_from_arg(self, op, keyword): size_value_arg = ConverterUtil.get_arg(op, keyword) mace_check(len(size_value_arg.ints) == 2, @@ -311,6 +393,27 @@ class ApuConverter(base_converter.ConverterInterface): list_value_tensor.int32_data.extend(list_value_arg.ints) op.input.extend([list_value_tensor.name]) + def add_shape_tensor_from_axis_arg(self, op): + list_value_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str) + mace_check(list_value_arg.ints is not None, + op.name + ': ' + MaceKeyword.mace_axis_str + + ' value ints should not be None') + axes = list_value_arg.ints + for producer in self._model.op: + if producer.output[0] == op.input[0]: + input_tensor_shape = producer.output_shape[0].dims + break + + shape_tensor = self._model.tensors.add() + shape_tensor.name = op.name + '/' + MaceKeyword.mace_axis_str + ':0' + shape_tensor.data_type = mace_pb2.DT_INT32 + shape_tensor.dims.extend([len(input_tensor_shape) - len(axes)]) + shape_tensor.int32_data.extend(input_tensor_shape) + for axis in sorted(axes, reverse=True): + del shape_tensor.int32_data[axis] + op.input.extend([shape_tensor.name]) + ConverterUtil.del_arg(op, MaceKeyword.mace_axis_str) + def add_tensorflow_padding_value(self): for op in self._model.op: padding_type = ConverterUtil.get_arg( @@ -374,7 +477,8 @@ class ApuConverter(base_converter.ConverterInterface): tensor = self._model.tensors.add() tensor.name = _op.name + '/add/bias_add' tensor.dims.extend([_op.output_shape[0].dims[-1]]) - if _op.output_type[0] == mace_pb2.DT_UINT8: + if _op.output_type[0] == mace_pb2.DT_UINT8 or \ + _op.output_type[0] == mace_pb2.DT_INT16: tensor.data_type = mace_pb2.DT_INT32 input_name = _op.input[0] for input_op in self._model.op: @@ -395,7 +499,46 @@ class ApuConverter(base_converter.ConverterInterface): tensor.float_data.extend([0.0] * tensor.dims[0]) _op.input.extend([tensor.name]) - def use_uint8_in_out(self): + def ensure_binary_input(self): + for _op in self._model.op: + if _op.type != MaceOp.Eltwise.name: + continue + if len(_op.input) != 1: + continue + eltwise_type = ConverterUtil.get_arg( + _op, MaceKeyword.mace_element_type_str).i + if eltwise_type != EltwiseType.SUM.value and \ + eltwise_type != EltwiseType.PROD.value: + continue + + float_value_arg = ConverterUtil.get_arg( + _op, MaceKeyword.mace_scalar_input_str) + mace_check(float_value_arg.f is not None, + _op.name + ': ' + + MaceKeyword.mace_scalar_input_str + + ' value float should not be None') + scalar = float_value_arg.f + const_tensor = self._model.tensors.add() + const_tensor.name = _op.name + '/' + \ + MaceKeyword.mace_scalar_input_str + ':0' + const_tensor.dims.extend([1]) + if _op.output_type[0] == mace_pb2.DT_UINT8 or \ + _op.output_type[0] == mace_pb2.DT_INT16: + const_tensor.data_type = _op.output_type[0] + const_tensor.scale = scalar + const_tensor.zero_point = 0 + const_tensor.quantized = True + const_tensor.int32_data.extend([1]) + elif _op.output_type[0] == mace_pb2.DT_FLOAT: + const_tensor.data_type = mace_pb2.DT_FLOAT + const_tensor.float_data.extend([scalar]) + _op.input.extend([const_tensor.name]) + ConverterUtil.del_arg( + _op, MaceKeyword.mace_scalar_input_str) + 
ConverterUtil.del_arg( + _op, MaceKeyword.mace_scalar_input_index_str) + + def use_quant_in_out(self): replace_dict = {} for input_info in self._model.input_info: if input_info.data_type == mace_pb2.DT_FLOAT: diff --git a/tools/python/transform/base_converter.py b/tools/python/transform/base_converter.py index 4c23e65fbf9efa190c6f8e87e86a507f92b3af8e..38edafb3879d48a177bb64b9c4d28d81c637936b 100644 --- a/tools/python/transform/base_converter.py +++ b/tools/python/transform/base_converter.py @@ -288,6 +288,8 @@ class MaceKeyword(object): mace_p_str = 'p' mace_nor_var_str = 'normalize_variance' mace_across_ch_str = 'across_channels' + mace_apu_16bit_per_tensor = 'mace_apu_16bit_per_tensor' + mace_apu_data_type_arg_str = 'apu_data_type' class TransformerRule(Enum): @@ -335,6 +337,7 @@ class TransformerRule(Enum): FP16_GATHER_WEIGHT = 42 QUANTIZE_LARGE_WEIGHTS = 43 TRANSPOSE_SHAPE_TENSOR_TO_PARAM = 44 + TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV = 45 class ConverterInterface(object): @@ -409,6 +412,7 @@ class ConverterOption(object): self._device = DeviceType.CPU.value self._winograd = 0 self._quantize = False + self._quantize_schema = "" self._quantize_large_weights = False self._quantize_range_file = "" self._change_concat_ranges = False @@ -444,6 +448,10 @@ class ConverterOption(object): def quantize(self): return self._quantize + @property + def quantize_schema(self): + return self._quantize_schema + @property def quantize_large_weights(self): return self._quantize_large_weights @@ -508,6 +516,10 @@ class ConverterOption(object): def quantize(self, quantize): self._quantize = quantize + @quantize_schema.setter + def quantize_schema(self, quantize_schema): + self._quantize_schema = quantize_schema + @quantize_large_weights.setter def quantize_large_weights(self, quantize_large_weights): self._quantize_large_weights = quantize_large_weights @@ -593,6 +605,10 @@ class ConverterOption(object): # Need to be put after SORT_BY_EXECUTION TransformerRule.ADD_QUANTIZE_TENSOR_RANGE, ] + if self._device == DeviceType.APU.value: + self._transformer_option = self._transformer_option + [ + TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV, + ] if self.quantize_large_weights: self._transformer_option = self._transformer_option + [ TransformerRule.QUANTIZE_LARGE_WEIGHTS diff --git a/tools/python/transform/transformer.py b/tools/python/transform/transformer.py index c35973a0622ad84f6407bfb4c6a8adde8bcb42a6..136bc9cd5d1051357b931ee22ed8d19edc0e3528 100644 --- a/tools/python/transform/transformer.py +++ b/tools/python/transform/transformer.py @@ -115,6 +115,8 @@ class Transformer(base_converter.ConverterInterface): self.fp16_gather_weight, TransformerRule.QUANTIZE_LARGE_WEIGHTS: self.quantize_large_weights, + TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV: + self.transform_single_bn_to_depthwise_conv, } self._option = option @@ -736,7 +738,6 @@ class Transformer(base_converter.ConverterInterface): net.tensors.remove(scale) self.replace_quantize_info(op, consumer_op) self.safe_remove_node(consumer_op, op) - return True return False @@ -1099,9 +1100,9 @@ class Transformer(base_converter.ConverterInterface): transposed_filter = set() transposed_deconv_filter = set() - if self._option.quantize and \ - (self._option.device == DeviceType.CPU.value or - self._option.device == DeviceType.APU.value): + if ((self._option.quantize and + self._option.device == DeviceType.CPU.value) or + self._option.device == DeviceType.APU.value): print("Transpose filters to OHWI") if filter_format == DataFormat.HWIO: 
transpose_order = [3, 0, 1, 2] @@ -1620,12 +1621,23 @@ class Transformer(base_converter.ConverterInterface): mace_check(data_type_arg, "Data type does not exist for %s(%s)" % (op.name, op.type)) if data_type_arg.i == mace_pb2.DT_FLOAT: - data_type_arg.i = mace_pb2.DT_UINT8 + if self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + data_type_arg.i = mace_pb2.DT_INT16 + else: + data_type_arg.i = mace_pb2.DT_UINT8 elif data_type_arg.i == mace_pb2.DT_UINT8: mace_check(op.type == MaceOp.Quantize.name or op.type == MaceOp.Dequantize.name, "Only Quantization ops support uint8, " "but got %s(%s)" % (op.name, op.type)) + elif data_type_arg.i == mace_pb2.DT_INT16 \ + and self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + mace_check(op.type == MaceOp.Quantize.name + or op.type == MaceOp.Dequantize.name, + "Only Quantization ops support int16, " + "but got %s(%s)" % (op.name, op.type)) else: mace_check(op.type == MaceOp.Quantize.name, "Quantization only support float ops, " @@ -1647,7 +1659,11 @@ class Transformer(base_converter.ConverterInterface): self._model.input_info[i].scale = quantize_info.scale self._model.input_info[i].zero_point = quantize_info.zero_point - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) + if self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_INT16) + else: + ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) ConverterUtil.add_data_format_arg(op_def, input_node.data_format) # use actual ranges for model input quantize find_range_every_time_arg = op_def.arg.add() @@ -1670,7 +1686,11 @@ class Transformer(base_converter.ConverterInterface): self._model.output_info[i].scale = quantize_info.scale self._model.output_info[i].zero_point = quantize_info.zero_point - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) + if self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_INT16) + else: + ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) ConverterUtil.add_data_format_arg(op_def, output_node.data_format) quantize_flag_arg = self._model.arg.add() @@ -1725,6 +1745,11 @@ class Transformer(base_converter.ConverterInterface): else: mace_check(False, "wrong device.") tensor.data_type = mace_pb2.DT_INT32 + elif self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + quantized_tensor = \ + quantize_util.quantize_int16(tensor.float_data) + tensor.data_type = mace_pb2.DT_INT16 else: non_zero = self._option.device == DeviceType.CPU.value quantized_tensor = quantize_util.quantize(tensor.float_data, @@ -1781,9 +1806,16 @@ class Transformer(base_converter.ConverterInterface): return False def add_quantize_info(self, op, minval, maxval): - scale, zero, minval, maxval = \ - quantize_util.adjust_range(minval, maxval, self._option.device, - non_zero=False) + quantize_schema = self._option.quantize_schema + if quantize_schema == MaceKeyword.mace_apu_16bit_per_tensor: + maxval = max(abs(minval), abs(maxval)) + minval = -maxval + scale = maxval / 2**15 + zero = 0 + else: + scale, zero, minval, maxval = \ + quantize_util.adjust_range(minval, maxval, self._option.device, + non_zero=False) quantize_info = op.quantize_info.add() quantize_info.minval = minval quantize_info.maxval = maxval @@ -1876,6 +1908,7 @@ class Transformer(base_converter.ConverterInterface): def add_quantize_tensor_range(self): # Quantize info from range statistics range_file = 
self._option.quantize_range_file + quantize_schema = self._option.quantize_schema if range_file: print("Add quantize tensor range") post_quantize_info = {} @@ -1884,10 +1917,17 @@ tensor_name, minmax = line.split("@@")[:2] min_val, max_val = [float(i) for i in minmax.strip().split(",")] - scale, zero, min_val, max_val = \ - quantize_util.adjust_range(min_val, max_val, - self._option.device, - non_zero=False) + if (quantize_schema == + MaceKeyword.mace_apu_16bit_per_tensor): + max_val = max(abs(min_val), abs(max_val)) + min_val = -max_val + scale = max_val / 2**15 + zero = 0 + else: + scale, zero, min_val, max_val = \ + quantize_util.adjust_range(min_val, max_val, + self._option.device, + non_zero=False) activation_info = mace_pb2.QuantizeActivationInfo() activation_info.minval = min_val activation_info.maxval = max_val @@ -1918,11 +1958,18 @@ print("Input range %s: %s" % (input_node.name, str(input_node.range))) new_input_name = self.input_name_map[input_node.name] - scale, zero, minval, maxval = \ - quantize_util.adjust_range(input_node.range[0], - input_node.range[1], - self._option.device, - non_zero=False) + if quantize_schema == MaceKeyword.mace_apu_16bit_per_tensor: + maxval = max(abs(input_node.range[0]), + abs(input_node.range[1])) + minval = -maxval + scale = maxval / 2**15 + zero = 0 + else: + scale, zero, minval, maxval = \ + quantize_util.adjust_range(input_node.range[0], + input_node.range[1], + self._option.device, + non_zero=False) quantize_info = \ mace_pb2.QuantizeActivationInfo() quantize_info.minval = minval @@ -2396,3 +2443,37 @@ return True return False + + def transform_single_bn_to_depthwise_conv(self): + for op in self._model.op: + if op.type != MaceOp.BatchNorm.name: + continue + + if len(op.input) != 3: + continue + + producer = self._producer[op.input[0]] + if producer.type in [MaceOp.Conv2D.name, + MaceOp.Deconv2D.name, + MaceOp.DepthwiseDeconv2d.name, + MaceOp.DepthwiseConv2d.name, + MaceOp.BatchToSpaceND.name]: + continue + + op.type = MaceOp.DepthwiseConv2d.name + padding_arg = op.arg.add() + padding_arg.name = MaceKeyword.mace_padding_str + padding_arg.i = PaddingMode.VALID.value + strides_arg = op.arg.add() + strides_arg.name = MaceKeyword.mace_strides_str + strides_arg.ints.extend([1, 1]) + dilation_arg = op.arg.add() + dilation_arg.name = MaceKeyword.mace_dilations_str + dilation_arg.ints.extend([1, 1]) + for tensor in self._model.tensors: + if tensor.name == op.input[1]: + tensor.dims[:] = [1, 1, 1, tensor.dims[0]] + break + return True + + return False diff --git a/tools/python/utils/config_parser.py b/tools/python/utils/config_parser.py index c3805da22ed63a8f3935382d5ad372d0d04eb2f4..5d78de49bd48a1e6e3f364af456fa6175d8f4166 100644 --- a/tools/python/utils/config_parser.py +++ b/tools/python/utils/config_parser.py @@ -92,6 +92,7 @@ class ModelKeys(object): weight_sha256_checksum = "weight_sha256_checksum" quantize_range_file = "quantize_range_file" quantize = "quantize" + quantize_schema = "quantize_schema" quantize_large_weights = "quantize_large_weights" quantize_stat = "quantize_stat" change_concat_ranges = "change_concat_ranges" diff --git a/tools/python/utils/device.py b/tools/python/utils/device.py index 9037d0c882f26d5962df68f44e2cf97194237ea3..53d2535804857be857de0a6ae1ae7dabe350c87d 100644 --- a/tools/python/utils/device.py +++ b/tools/python/utils/device.py @@ -36,10 +36,11 @@ def 
execute(cmd, verbose=True): universal_newlines=True) if not verbose: - # use p.communicate instead of p.wait to avoid such situation: pipe is filled and the child process is blocked. + # use p.communicate instead of p.wait to avoid such situation: + # pipe is filled and the child process is blocked. out, err = p.communicate() if p.returncode != 0: - raise Exception("errorcode: {}".format(p.returncode) ) + raise Exception("errorcode: {}".format(p.returncode)) return out buf = []
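
Usage note: the sketch below shows how a client might drive the new cache API from include/mace/public/mace.h, mirroring the store-then-load flow that mace_run.cc and tools/device.py implement in this patch. It is a minimal sketch, not part of the patch itself; the cache path is illustrative and BuildApuConfig is a hypothetical helper.

#include <string>
#include "mace/public/mace.h"

// Hypothetical helper: pick the APU cache policy for a given run.
// On the first run, APU_CACHE_STORE compiles the model from
// net_def/model_data and persists the compiled blob; on later runs,
// APU_CACHE_LOAD skips compilation. Per mace.cc above, MaceEngine falls
// back to a plain Init() if loading the cache fails.
mace::MaceEngineConfig BuildApuConfig(bool first_run) {
  mace::MaceEngineConfig config(mace::DeviceType::APU);
  const std::string cache_path = "/data/local/tmp/model.apu.bin";  // illustrative
  if (first_run) {
    config.SetAPUCache(mace::APU_CACHE_STORE,
                       /*binary_file=*/"",
                       /*storage_file=*/cache_path);
  } else {
    config.SetAPUCache(mace::APU_CACHE_LOAD,
                       /*binary_file=*/cache_path,
                       /*storage_file=*/"");
  }
  return config;
}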
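
For reference, the int16 quantization added in quantize_util.py and transformer.py above is symmetric per-tensor: the zero point is pinned to 0 and scale = max|x| / 2^15, with values rounded and clipped to [-2^15, 2^15 - 1]. Below is a minimal C++ restatement of that arithmetic; the zero-scale guard for all-zero tensors is an addition not present in the Python helper.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

struct QuantizedInt16 {
  std::vector<int16_t> data;
  float scale;     // max|x| / 2^15
  int zero_point;  // always 0 in this schema
};

QuantizedInt16 QuantizeInt16(const std::vector<float> &input) {
  float max_abs = 0.0f;
  for (float v : input) max_abs = std::max(max_abs, std::fabs(v));
  QuantizedInt16 q;
  q.scale = max_abs / 32768.0f;  // 2^15, as in quantize_int16 above
  q.zero_point = 0;
  q.data.reserve(input.size());
  for (float v : input) {
    // Round, then clip to [-2^15, 2^15 - 1]; dequantization is v ~= scale * q.
    float r = (q.scale == 0.0f) ? 0.0f : std::round(v / q.scale);
    r = std::min(std::max(r, -32768.0f), 32767.0f);
    q.data.push_back(static_cast<int16_t>(r));
  }
  return q;
}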