Unverified · Commit b2e7d97c authored by hong19860320 and committed by GitHub


[RKNPU] Adapt to the changes in the quantization parameters, and support the fully quantized MobileNetV1 model (#4046)

* [RKNPU] Adapt to the changes in the quantization parameters, and support the fully quantized MobileNetV1 model
test=develop

* [Doc] Update the docs for APU and RKNPU
test=develop
Parent 898792ca
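The substantive change in the converter hunks below is the move from per-op scale attributes ("input_scale", "output_scale", "weight_scale") to the per-tensor scale API on OpInfo. A minimal before/after sketch of that pattern, with the tensor names (x_name, out_name, filter_name) assumed from the surrounding converter code:
```c++
// Old pattern (removed by this patch): scales stored as op attributes.
// float input_scale  = op_info->GetAttr<float>("input_scale");
// float output_scale = op_info->GetAttr<float>("output_scale");
// auto  weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");

// New pattern: scales are queried per tensor name; activations carry a
// single scale, while weights may carry a per-channel vector of scales.
CHECK(op_info->HasInputScale(x_name));
float input_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasOutputScale(out_name));
float output_scale = op_info->GetOutputScale(out_name)[0];
CHECK(op_info->HasInputScale(filter_name));
std::vector<float> weight_scale = op_info->GetInputScale(filter_name);
```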
......@@ -159,12 +159,18 @@ $ git checkout <release-version-tag>
$ wget https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz
$ tar -xvf apu_ddk.tar.gz
```
- Build tiny_publish for MT8168-P2V1 Tablet
- Build tiny_publish for MT8168-P2V1 Tablet and Smart TVs (S900)
```shell
$ ./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared --build_extra=ON --with_log=ON --build_apu=ON --apu_ddk_root=./apu_ddk tiny_publish
For MT8168-P2V1 Tablet
$ ./lite/tools/build_android.sh --android_stl=c++_shared --with_extra=ON --with_log=ON --with_mediatek_apu=ON --mediatek_apu_sdk_root=./apu_ddk
For Smart TVs (S900)
$ ./lite/tools/build_android.sh --arch=armv7 --android_stl=c++_shared --with_extra=ON --with_log=ON --with_mediatek_apu=ON --mediatek_apu_sdk_root=./apu_ddk
```
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/include directory with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/include;
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so.
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so;
- Replace the PaddleLite-android-demo/libs/PaddleLite/armeabi-v7a/include directory with the generated build.lite.android.armv7.gcc/inference_lite_lib.android.armv7.apu/cxx/include;
- Replace the PaddleLite-android-demo/libs/PaddleLite/armeabi-v7a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv7.gcc/inference_lite_lib.android.armv7.apu/cxx/lib/libpaddle_light_api_shared.so.
## Other notes
......
......@@ -137,20 +137,26 @@ $ cd Paddle-Lite
$ git checkout <release-version-tag>
$ git clone https://github.com/airockchip/rknpu_ddk.git
```
- Build full_publish and tiny_publish for RK1808 and RK1806 EVB
- Build tiny_publish and full_publish for RK1808 and RK1806 EVB
```shell
For RK1808 EVB
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
tiny_publish
$ ./lite/tools/build_linux.sh --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk
full_publish
$ ./lite/tools/build_linux.sh --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk full_publish
For RK1806 EVB
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
tiny_publish
$ ./lite/tools/build_linux.sh --arch=armv7 --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk
full_publish
$ ./lite/tools/build_linux.sh --arch=armv7 --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk full_publish
```
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/include directory with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so file with the build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so generated in tiny_publish mode;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so file with the build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_full_api_shared.so generated in full_publish mode;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/include directory with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so.
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so file with the build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so generated in tiny_publish mode;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so file with the build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_full_api_shared.so generated in full_publish mode.
## Other notes
......
......@@ -32,30 +32,18 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto scale_name = op_info->Input("Scale").front();
auto scale_type = kernel->GetInputDeclType("Scale");
CHECK(scale_type->layout() == DATALAYOUT(kNCHW));
auto scale = scope->FindMutableTensor(scale_name);
auto bias_name = op_info->Input("Bias").front();
auto bias_type = kernel->GetInputDeclType("Bias");
CHECK(bias_type->layout() == DATALAYOUT(kNCHW));
auto bias = scope->FindMutableTensor(bias_name);
auto mean_name = op_info->Input("Mean").front();
auto mean_type = kernel->GetInputDeclType("Mean");
CHECK(mean_type->layout() == DATALAYOUT(kNCHW));
auto mean = scope->FindMutableTensor(mean_name);
auto variance_name = op_info->Input("Variance").front();
auto variance_type = kernel->GetInputDeclType("Variance");
CHECK(variance_type->layout() == DATALAYOUT(kNCHW));
auto variance = scope->FindMutableTensor(variance_name);
auto y_name = op_info->Output("Y").front();
auto y_type = kernel->GetOutputDeclType("Y");
auto y = scope->FindMutableTensor(y_name);
CHECK(y_type->layout() == DATALAYOUT(kNCHW));
float momentum = op_info->GetAttr<float>("momentum");
float epsilon = op_info->GetAttr<float>("epsilon");
int mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1
......@@ -71,9 +59,11 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
CHECK(op_info->HasInputScale(x_name));
input_scale = op_info->GetInputScale(x_name)[0];
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
CHECK(op_info->HasOutputScale(y_name));
output_scale = op_info->GetOutputScale(y_name)[0];
if (enable_int8) {
precision = PRECISION(kInt8);
......
......@@ -32,9 +32,7 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Get input and output vars and op attributes
auto x_names = op_info->Input("X");
auto x_type = kernel->GetInputDeclType("X");
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
auto output = scope->FindMutableTensor(out_name);
auto axis = op_info->GetAttr<int>("axis");
......@@ -50,9 +48,9 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
CHECK(op_info->HasOutputScale(out_name));
output_scale = op_info->GetOutputScale(out_name)[0];
if (enable_int8) {
precision = PRECISION(kInt8);
......@@ -77,12 +75,13 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
qnt.enable_int8 = enable_int8;
if (enable_int8) {
CHECK(op_info->HasInputScale(x_name));
input_scale = op_info->GetInputScale(x_name)[0];
qnt.quant_bits = bit_length;
qnt.scale.push_back(input_scale);
x->mutable_data<int8_t>();
}
x_node =
graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt);
x_node = graph->Add(x_name, *x, precision, layout, qnt);
}
inputs.push_back(x_node->data());
......
......@@ -59,7 +59,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK_EQ(dilations.size(), 2L);
// Check depthwise mode
bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1);
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
CHECK(op_info->HasInputScale(filter_name));
auto weight_scale = op_info->GetInputScale(filter_name);
// for quantization
bool enable_int8 = false;
......@@ -71,9 +72,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
CHECK(op_info->HasInputScale(input_name));
input_scale = op_info->GetInputScale(input_name)[0];
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
CHECK(op_info->HasOutputScale(output_name));
output_scale = op_info->GetOutputScale(output_name)[0];
if (enable_int8) {
precision = PRECISION(kInt8);
......
......@@ -56,11 +56,9 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front();
auto y_type = kernel->GetInputDeclType("Y");
auto y = scope->FindMutableTensor(y_name);
auto y_dims = y->dims();
auto out_name = op_info->Output("Out").front();
......@@ -78,9 +76,11 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
CHECK(op_info->HasInputScale(x_name));
input_scale = op_info->GetInputScale(x_name)[0];
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
CHECK(op_info->HasOutputScale(out_name));
output_scale = op_info->GetOutputScale(out_name)[0];
if (enable_int8) {
precision = PRECISION(kInt8);
......@@ -100,7 +100,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
qnt.scale.push_back(input_scale);
qnt.quant_bits = op_info->GetAttr<int>("bit_length");
}
x_node = graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt);
x_node = graph->Add(x_name, *x, precision, layout, qnt);
}
// Y node
......@@ -118,7 +118,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
qnt.scale.clear();
qnt.scale.push_back(input_scale);
}
y_node = graph->Add(y_name, *y, y_type->precision(), y_type->layout(), qnt);
y_node = graph->Add(y_name, *y, precision, layout, qnt);
}
std::shared_ptr<Node> output_node = nullptr;
......@@ -133,8 +133,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
output->mutable_data<int8_t>();
}
output_node = graph->Add(
out_name, *output, x_type->precision(), x_type->layout(), output_qnt);
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
......
......@@ -31,17 +31,14 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
auto input_name = op_info->Input("Input").front();
auto input_type = kernel->GetInputDeclType("Input");
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
CHECK_GE(input_dims.size(), 2UL);
auto w_name = op_info->Input("W").front();
auto w_type = kernel->GetInputDeclType("W");
auto w = scope->FindMutableTensor(w_name);
auto w_dims = w->dims();
CHECK_EQ(w_dims.size(), 2UL);
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
auto output = scope->FindMutableTensor(out_name);
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
int m = input_dims.Slice(0, in_num_col_dims).production();
......@@ -61,9 +58,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
CHECK(op_info->HasInputScale(input_name));
input_scale = op_info->GetInputScale(input_name)[0];
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
CHECK(op_info->HasOutputScale(out_name));
output_scale = op_info->GetOutputScale(out_name)[0];
if (enable_int8) {
precision = PRECISION(kInt8);
}
......@@ -86,7 +85,8 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (enable_int8) {
QuantizationInfo filter_qnt;
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
CHECK(op_info->HasInputScale(w_name));
auto weight_scale = op_info->GetInputScale(w_name);
filter_qnt.enable_int8 = enable_int8;
filter_qnt.scale = weight_scale;
filter_qnt.quant_bits = bit_length;
......@@ -99,8 +99,8 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
transpose_w_data[j * k + i] = w_data[i * n + j];
}
}
trans_w_node = graph->Add(
w_name, *transpose_w, precision, w_type->layout(), filter_qnt);
trans_w_node =
graph->Add(w_name, *transpose_w, precision, layout, filter_qnt);
} else {
auto transpose_w_data = transpose_w->mutable_data<float>();
auto w_data = w->mutable_data<float>();
......@@ -110,8 +110,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
transpose_w_data[j * k + i] = w_data[i * n + j];
}
}
trans_w_node =
graph->Add(w_name, *transpose_w, precision, w_type->layout());
trans_w_node = graph->Add(w_name, *transpose_w, precision, layout);
}
// Add bias node if bias tensor exists
......@@ -132,8 +131,8 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (enable_int8) {
auto bias_name_qnt = bias_name + "/qnt";
auto* bias_qnt = scope->NewTensor(bias_name_qnt);
auto weight_scale =
op_info->GetAttr<std::vector<float>>("weight_scale");
CHECK(op_info->HasInputScale(w_name));
auto weight_scale = op_info->GetInputScale(w_name);
bias_qnt->Resize(bias_shape);
bias_qnt->set_persistable(true);
......@@ -176,7 +175,8 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
bias->set_persistable(true);
if (enable_int8) {
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
CHECK(op_info->HasInputScale(w_name));
auto weight_scale = op_info->GetInputScale(w_name);
bias->set_precision(PrecisionType::kInt32);
auto* bias_data = bias->mutable_data<int32_t>();
......
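In the fully quantized path, the per-channel weight scales retrieved above are combined with the input scale to quantize the float bias to int32. The helper below is an illustrative sketch of that standard computation, not code from this patch; the function name and the per-layer fallback are assumptions:
```c++
#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical helper: symmetric int32 quantization of a float bias, using
// scale = input_scale * weight_scale (per output channel when weight_scale
// holds one value per channel, otherwise falling back to a per-layer scale).
std::vector<int32_t> QuantizeBias(const std::vector<float>& bias,
                                  float input_scale,
                                  const std::vector<float>& weight_scale) {
  std::vector<int32_t> quantized(bias.size());
  for (size_t i = 0; i < bias.size(); ++i) {
    const float ws =
        i < weight_scale.size() ? weight_scale[i] : weight_scale.front();
    quantized[i] =
        static_cast<int32_t>(std::round(bias[i] / (input_scale * ws)));
  }
  return quantized;
}
```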
......@@ -55,9 +55,11 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (x->precision() == PRECISION(kInt8)) {
// enable_int8 = op_info->GetAttr<bool>("enable_int8");
enable_int8 = true;
input_scale = op_info->GetAttr<float>("input_scale");
CHECK(op_info->HasInputScale(x_name));
input_scale = op_info->GetInputScale(x_name)[0];
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
CHECK(op_info->HasOutputScale(out_name));
output_scale = op_info->GetOutputScale(out_name)[0];
if (enable_int8) {
precision = PRECISION(kInt8);
......@@ -132,18 +134,16 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
auto output_node =
graph->Add(out_name, *output, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
......
......@@ -32,14 +32,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto x_rank = x_dims.size();
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto output = scope->FindMutableTensor(out_name);
auto axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
......@@ -56,9 +52,11 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
CHECK(op_info->HasInputScale(x_name));
input_scale = op_info->GetInputScale(x_name)[0];
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
CHECK(op_info->HasOutputScale(out_name));
output_scale = op_info->GetOutputScale(out_name)[0];
if (enable_int8) {
precision = PRECISION(kInt8);
......
......@@ -25,6 +25,9 @@ WITH_STRIP=OFF
# options of compiling NPU lib.
WITH_HUAWEI_KIRIN_NPU=OFF
HUAWEI_KIRIN_NPU_SDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/
# options of compiling APU lib.
WITH_MEDIATEK_APU=OFF
MEDIATEK_APU_SDK_ROOT="$(pwd)/apu_ddk" # Download APU SDK from https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz
# options of compiling OPENCL lib.
WITH_OPENCL=OFF
# options of adding training ops
......@@ -154,6 +157,8 @@ function make_tiny_publish_so {
-DLITE_WITH_CV=$WITH_CV \
-DLITE_WITH_NPU=$WITH_HUAWEI_KIRIN_NPU \
-DNPU_DDK_ROOT=$HUAWEI_KIRIN_NPU_SDK_ROOT \
-DLITE_WITH_APU=$WITH_MEDIATEK_APU \
-DAPU_DDK_ROOT=$MEDIATEK_APU_SDK_ROOT \
-DLITE_WITH_OPENCL=$WITH_OPENCL \
-DARM_TARGET_ARCH_ABI=$ARCH \
-DARM_TARGET_LANG=$TOOLCHAIN \
......@@ -204,6 +209,8 @@ function make_full_publish_so {
-DLITE_WITH_CV=$WITH_CV \
-DLITE_WITH_NPU=$WITH_HUAWEI_KIRIN_NPU \
-DNPU_DDK_ROOT=$HUAWEI_KIRIN_NPU_SDK_ROOT \
-DLITE_WITH_APU=$WITH_MEDIATEK_APU \
-DAPU_DDK_ROOT=$MEDIATEK_APU_SDK_ROOT \
-DLITE_WITH_OPENCL=$WITH_OPENCL \
-DARM_TARGET_ARCH_ABI=$ARCH \
-DARM_TARGET_LANG=$TOOLCHAIN \
......@@ -257,6 +264,13 @@ function print_usage {
echo -e "| you can download huawei HiAi DDK from: https://developer.huawei.com/consumer/cn/hiai/ |"
echo -e "| detailed information about Paddle-Lite NPU: https://paddle-lite.readthedocs.io/zh/latest/demo_guides/npu.html |"
echo -e "| |"
echo -e "| arguments of apu library compiling:(armv8, gcc, c++_static) |"
echo -e "| ./lite/tools/build_android.sh --with_mediatek_apu=ON --mediatek_apu_sdk_root=YourApuSdkPath |"
echo -e "| --with_mediatek_apu: (OFF|ON); controls whether to compile lib for mediatek_apu, default is OFF |"
echo -e "| --mediatek_apu_sdk_root: (path to mediatek APU SDK file) required when compiling apu library |"
echo -e "| you can download mediatek APU SDK from: https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz |"
echo -e "| detailed information about Paddle-Lite APU: https://paddle-lite.readthedocs.io/zh/latest/demo_guides/mediatek_apu.html |"
echo -e "| |"
echo -e "| arguments of opencl library compiling:(armv8, gcc, c++_static) |"
echo -e "| ./lite/tools/build_android.sh --with_opencl=ON |"
echo -e "| --with_opencl: (OFF|ON); controls whether to compile lib for opencl, default is OFF |"
......@@ -351,6 +365,15 @@ function main {
HUAWEI_KIRIN_NPU_SDK_ROOT="${i#*=}"
shift
;;
# compiling lib which can operate on mediatek apu.
--with_mediatek_apu=*)
WITH_MEDIATEK_APU="${i#*=}"
shift
;;
--mediatek_apu_sdk_root=*)
MEDIATEK_APU_SDK_ROOT="${i#*=}"
shift
;;
# compiling result contains both light_api and cxx_api lib.
full_publish)
make_full_publish_so
......
......@@ -26,7 +26,7 @@ OPTMODEL_DIR=""
WITH_OPENCL=OFF
# options of compiling rockchip NPU lib.
WITH_ROCKCHIP_NPU=OFF
ROCKCHIP_NPU_SDK_ROOT=""
ROCKCHIP_NPU_SDK_ROOT="$(pwd)/rknpu_ddk" # Download RKNPU SDK from https://github.com/airockchip/rknpu_ddk.git
# options of compiling baidu XPU lib.
WITH_BAIDU_XPU=OFF
BAIDU_XPU_SDK_ROOT=""
......@@ -229,6 +229,8 @@ function print_usage {
echo -e "| ./lite/tools/build_linux.sh --with_rockchip_npu=ON --rockchip_npu_sdk_root=YourRockchipNpuSdkPath |"
echo -e "| --with_rockchip_npu: (OFF|ON); controls whether to compile lib for rockchip_npu, default is OFF |"
echo -e "| --rockchip_npu_sdk_root: (path to rockchip_npu DDK file) required when compiling rockchip_npu library |"
echo -e "| you can download rockchip NPU SDK from: https://github.com/airockchip/rknpu_ddk.git |"
echo -e "| detailed information about Paddle-Lite RKNPU: https://paddle-lite.readthedocs.io/zh/latest/demo_guides/rockchip_npu.html |"
echo -e "| |"
echo -e "| arguments of baidu xpu library compiling: |"
echo -e "| ./lite/tools/build_linux.sh --with_baidu_xpu=ON --baidu_xpu_sdk_root=YourBaiduXpuSdkPath |"
......