Commit 4b0a4230 authored by 开心的小妮

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into fix-opencl-concat

@@ -159,12 +159,18 @@ $ git checkout <release-version-tag>
 $ wget https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz
 $ tar -xvf apu_ddk.tar.gz
 ```
-- Build tiny_publish for MT8168-P2V1 Tablet
+- Build tiny_publish for MT8168-P2V1 Tablet and Smart TVs (S900)
 ```shell
-$ ./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared --build_extra=ON --with_log=ON --build_apu=ON --apu_ddk_root=./apu_ddk tiny_publish
+For MT8168-P2V1 Tablet
+$ ./lite/tools/build_android.sh --android_stl=c++_shared --with_extra=ON --with_log=ON --with_mediatek_apu=ON --mediatek_apu_sdk_root=./apu_ddk
+For Smart TVs (S900)
+$ ./lite/tools/build_android.sh --arch=armv7 --android_stl=c++_shared --with_extra=ON --with_log=ON --with_mediatek_apu=ON --mediatek_apu_sdk_root=./apu_ddk
 ```
 - Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/include directory with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/include;
-- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so.
+- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so;
+- Replace the PaddleLite-android-demo/libs/PaddleLite/armeabi-v7a/include directory with the generated build.lite.android.armv7.gcc/inference_lite_lib.android.armv7.apu/cxx/include;
+- Replace the PaddleLite-android-demo/libs/PaddleLite/armeabi-v7a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv7.gcc/inference_lite_lib.android.armv7.apu/cxx/lib/libpaddle_light_api_shared.so.
 ## Other Notes
......
@@ -137,20 +137,26 @@ $ cd Paddle-Lite
 $ git checkout <release-version-tag>
 $ git clone https://github.com/airockchip/rknpu_ddk.git
 ```
-- Build full_publish and tiny_publish for RK1808 and RK1806 EVB
+- Build tiny_publish and full_publish for RK1808 and RK1806 EVB
 ```shell
 For RK1808 EVB
-$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
-$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
+tiny_publish
+$ ./lite/tools/build_linux.sh --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk
+full_publish
+$ ./lite/tools/build_linux.sh --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk full_publish
 For RK1806 EVB
-$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
-$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
+tiny_publish
+$ ./lite/tools/build_linux.sh --arch=armv7 --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk
+full_publish
+$ ./lite/tools/build_linux.sh --arch=armv7 --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk full_publish
 ```
 - Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/include directory with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/include;
-- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so;
+- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so file with the build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so generated in tiny_publish mode;
+- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so file with the build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_full_api_shared.so generated in full_publish mode;
 - Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/include directory with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/include;
-- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so.
+- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so file with the build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so generated in tiny_publish mode;
+- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so file with the build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_full_api_shared.so generated in full_publish mode.
 ## Other Notes
......
@@ -25,6 +25,8 @@
 #include "lite/core/profile/basic_profiler.h"
 #endif  // LITE_WITH_PROFILE
 #include <gflags/gflags.h>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
 using paddle::lite::profile::Timer;
......
@@ -137,10 +137,11 @@ void ElementwiseSubCompute::Run() {
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
   int pre, n, post;
-  if (x_dims.size() < y_dims.size()) {
-    LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
-  }
-  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+  if (x_dims.size() < y_dims.size() &&
+      is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
+    lite::arm::math::elementwise_sub_broadcast(
+        y_data, x_data, out_data, pre, n, post);
+  } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     lite::arm::math::elementwise_sub_broadcast(
         x_data, y_data, out_data, pre, n, post);
   } else {
@@ -158,24 +159,21 @@ void ElementwiseSubActivationCompute::Run() {
   std::string act_type = param.act_type;
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
-  if (x_dims.size() < y_dims.size()) {
-    LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
-  }
   int pre, n, post;
-  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
-    if (act_type == "relu") {
-      lite::arm::math::elementwise_sub_relu_broadcast(
-          x_data, y_data, out_data, pre, n, post);
-    } else {
-      LOG(FATAL) << "unsupported Activation type: " << act_type;
-    }
+  if (act_type != "relu") {
+    LOG(FATAL) << "unsupported Activation type: " << act_type;
+  }
+  if (x_dims.size() < y_dims.size() &&
+      is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
+    lite::arm::math::elementwise_sub_relu_broadcast(
+        y_data, x_data, out_data, pre, n, post);
+  } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+    lite::arm::math::elementwise_sub_relu_broadcast(
+        x_data, y_data, out_data, pre, n, post);
   } else {
-    if (act_type == "relu") {
-      lite::arm::math::elementwise_sub_relu(
-          x_data, y_data, out_data, x_dims.production());
-    } else {
-      LOG(FATAL) << "unsupported Activation type: " << act_type;
-    }
+    lite::arm::math::elementwise_sub_relu(
+        x_data, y_data, out_data, x_dims.production());
   }
 }
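For readers unfamiliar with the `pre`/`n`/`post` decomposition that both branches above rely on, here is a minimal, self-contained sketch (not the Paddle-Lite implementation of `is_broadcast`) of how broadcasting the smaller operand over the larger one at a given `axis` reduces to three loop extents, which is why swapping the arguments lets the same broadcast kernel cover the `x_dims.size() < y_dims.size()` case:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative stand-in for an is_broadcast-style check: it succeeds when the
// smaller tensor's dims match a contiguous slice of the larger tensor's dims
// starting at `axis`, and reports the element counts before that slice (pre),
// inside it (n), and after it (post).
static bool BroadcastExtents(const std::vector<int64_t>& big,
                             const std::vector<int64_t>& small,
                             int axis,
                             int* pre,
                             int* n,
                             int* post) {
  if (small.size() > big.size()) return false;
  if (axis < 0) axis = static_cast<int>(big.size() - small.size());
  if (axis + small.size() > big.size()) return false;
  *pre = *n = *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= static_cast<int>(big[i]);
  for (size_t i = 0; i < small.size(); ++i) {
    if (big[axis + i] != small[i]) return false;
    *n *= static_cast<int>(small[i]);
  }
  for (size_t i = axis + small.size(); i < big.size(); ++i) {
    *post *= static_cast<int>(big[i]);
  }
  return true;
}

int main() {
  int pre, n, post;
  // Y has shape {2, 3, 4, 5} and X has shape {3, 4}: broadcasting X over Y at
  // axis 1 gives pre = 2, n = 12, post = 5, which is the layout the swapped
  // is_broadcast(y_dims, x_dims, ...) call in the kernel above relies on.
  bool ok = BroadcastExtents({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  std::printf("ok=%d pre=%d n=%d post=%d\n", ok, pre, n, post);
  return 0;
}
```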
......
@@ -7,6 +7,7 @@ add_kernel(squeeze_compute_host Host basic SRCS squeeze_compute.cc DEPS ${lite_k
 add_kernel(unsqueeze_compute_host Host basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(expand_compute_host Host basic SRCS expand_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(expand_as_compute_host Host basic SRCS expand_as_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/host/expand_as_compute.h"
#include <cstring>
#include <vector>
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
template <typename T, PrecisionType PType>
void ExpandAsCompute<T, PType>::Run() {
  auto& param = this->template Param<operators::ExpandAsParam>();
  const auto* x = param.X;
  auto* out = param.Out;
  const auto* target = param.Target;
  std::vector<int> expand_times;
  const T* src = x->template data<T>();
  T* dst = out->template mutable_data<T>();
  // Derive the per-dimension repeat counts from the ratio of the target shape
  // to the input shape.
  for (int i = 0; i < target->dims().size(); ++i) {
    int times = target->dims()[i] / x->dims()[i];
    expand_times.push_back(times);
  }
  int dims = target->dims().size();
  DDim in_shape = x->dims();
  // Expand the innermost dimension first by copying rows of the input into
  // the output buffer.
  int inner_num = 1;
  int pos = dims - 1;
  int outer_num = in_shape.count(0, pos);
  inner_num *= in_shape[pos];
  for (int j = 0; j < outer_num; ++j) {
    for (int k = 0; k < expand_times[pos]; ++k) {
      memcpy(dst + (j * expand_times[pos] + k) * inner_num,
             src + j * inner_num,
             sizeof(T) * inner_num);
    }
  }
  inner_num *= expand_times[pos];
  // Expand the remaining dimensions from innermost to outermost, duplicating
  // the already-expanded blocks in place; iterating backwards ensures a
  // source block is copied before lower blocks overwrite its region.
  for (int i = dims - 2; i >= 0; --i) {
    int outer_num = in_shape.count(0, i);
    inner_num *= in_shape[i];
    for (int j = outer_num - 1; j >= 0; --j) {
      for (int k = expand_times[i] - 1; k >= 0; --k) {
        memcpy(dst + (j * expand_times[i] + k) * inner_num,
               dst + j * inner_num,
               sizeof(T) * inner_num);
      }
    }
    inner_num *= expand_times[i];
  }
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
using expand_as_float =
paddle::lite::kernels::host::ExpandAsCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(expand_as, kHost, kFloat, kAny, expand_as_float, def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindInput("Target",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.Finalize();
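As a quick illustration of what the kernel above computes: each dimension of X is tiled Target_dim / X_dim times. The following is a small hedged sketch on plain arrays (independent of the Lite tensor types, with made-up shapes) that maps every output element back to the input element it repeats, much like the stride/modulo mapping used by the test's RunBaseline further below:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Input of shape {2, 1, 3}, stored row-major; the "target" only matters
  // through its shape, here {2, 4, 3}, so expand_times = {1, 4, 1}.
  std::vector<int> x = {1, 2, 3, 4, 5, 6};
  std::vector<int> x_shape = {2, 1, 3};
  std::vector<int> target_shape = {2, 4, 3};

  // Walk every output element and map it back to the input element it
  // repeats; a dimension of size 1 in x_shape simply reuses index 0.
  std::vector<int> out(target_shape[0] * target_shape[1] * target_shape[2]);
  for (int n = 0; n < target_shape[0]; ++n)
    for (int h = 0; h < target_shape[1]; ++h)
      for (int w = 0; w < target_shape[2]; ++w)
        out[(n * target_shape[1] + h) * target_shape[2] + w] =
            x[(n * x_shape[1] + h % x_shape[1]) * x_shape[2] + w];

  // Prints: 1 2 3 repeated four times, then 4 5 6 repeated four times.
  for (int v : out) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}
```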
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
template <typename T, PrecisionType PType>
class ExpandAsCompute
: public KernelLite<TARGET(kHost), PType, DATALAYOUT(kAny)> {
public:
void Run() override;
virtual ~ExpandAsCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
@@ -32,30 +32,18 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   // Get input and output vars and op attributes
   auto x_name = op_info->Input("X").front();
-  auto x_type = kernel->GetInputDeclType("X");
-  CHECK(x_type->layout() == DATALAYOUT(kNCHW));
   auto x = scope->FindMutableTensor(x_name);
   auto x_dims = x->dims();
   auto scale_name = op_info->Input("Scale").front();
-  auto scale_type = kernel->GetInputDeclType("Scale");
-  CHECK(scale_type->layout() == DATALAYOUT(kNCHW));
   auto scale = scope->FindMutableTensor(scale_name);
   auto bias_name = op_info->Input("Bias").front();
-  auto bias_type = kernel->GetInputDeclType("Bias");
-  CHECK(bias_type->layout() == DATALAYOUT(kNCHW));
   auto bias = scope->FindMutableTensor(bias_name);
   auto mean_name = op_info->Input("Mean").front();
-  auto mean_type = kernel->GetInputDeclType("Mean");
-  CHECK(mean_type->layout() == DATALAYOUT(kNCHW));
   auto mean = scope->FindMutableTensor(mean_name);
   auto variance_name = op_info->Input("Variance").front();
-  auto variance_type = kernel->GetInputDeclType("Variance");
-  CHECK(variance_type->layout() == DATALAYOUT(kNCHW));
   auto variance = scope->FindMutableTensor(variance_name);
   auto y_name = op_info->Output("Y").front();
-  auto y_type = kernel->GetOutputDeclType("Y");
   auto y = scope->FindMutableTensor(y_name);
-  CHECK(y_type->layout() == DATALAYOUT(kNCHW));
   float momentum = op_info->GetAttr<float>("momentum");
   float epsilon = op_info->GetAttr<float>("epsilon");
   int mode = 1;  // bnScale, bnBias tensor dims are 1xCx1x1
@@ -71,9 +59,11 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (op_info->HasAttr("enable_int8")) {
     enable_int8 = op_info->GetAttr<bool>("enable_int8");
-    input_scale = op_info->GetAttr<float>("input_scale");
+    CHECK(op_info->HasInputScale(x_name));
+    input_scale = op_info->GetInputScale(x_name)[0];
     bit_length = op_info->GetAttr<int>("bit_length");
-    output_scale = op_info->GetAttr<float>("output_scale");
+    CHECK(op_info->HasOutputScale(y_name));
+    output_scale = op_info->GetOutputScale(y_name)[0];
     if (enable_int8) {
       precision = PRECISION(kInt8);
......
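The converter changes in this commit repeatedly replace the old per-op `input_scale`/`output_scale` attributes with per-tensor scale lookups guarded by a presence check. A minimal sketch of that pattern, using a hypothetical `OpInfoStub` rather than the real Paddle-Lite `OpInfo` class:

```cpp
#include <cassert>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for the scale queries used above: scales are keyed by
// tensor name instead of being stored as single op-level attributes.
struct OpInfoStub {
  std::map<std::string, std::vector<float>> input_scales;
  std::map<std::string, std::vector<float>> output_scales;

  bool HasInputScale(const std::string& name) const {
    return input_scales.count(name) > 0;
  }
  std::vector<float> GetInputScale(const std::string& name) const {
    return input_scales.at(name);
  }
  bool HasOutputScale(const std::string& name) const {
    return output_scales.count(name) > 0;
  }
  std::vector<float> GetOutputScale(const std::string& name) const {
    return output_scales.at(name);
  }
};

int main() {
  OpInfoStub op_info;
  op_info.input_scales["x"] = {0.05f};   // per-tensor activation scale
  op_info.output_scales["y"] = {0.08f};

  // Same shape as the converter code: check first, then take element 0.
  assert(op_info.HasInputScale("x"));
  float input_scale = op_info.GetInputScale("x")[0];
  assert(op_info.HasOutputScale("y"));
  float output_scale = op_info.GetOutputScale("y")[0];
  (void)input_scale;
  (void)output_scale;
  return 0;
}
```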
@@ -32,9 +32,7 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   // Get input and output vars and op attributes
   auto x_names = op_info->Input("X");
-  auto x_type = kernel->GetInputDeclType("X");
   auto out_name = op_info->Output("Out").front();
-  auto out_type = kernel->GetOutputDeclType("Out");
   auto output = scope->FindMutableTensor(out_name);
   auto axis = op_info->GetAttr<int>("axis");
@@ -50,9 +48,9 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (op_info->HasAttr("enable_int8")) {
     enable_int8 = op_info->GetAttr<bool>("enable_int8");
-    input_scale = op_info->GetAttr<float>("input_scale");
     bit_length = op_info->GetAttr<int>("bit_length");
-    output_scale = op_info->GetAttr<float>("output_scale");
+    CHECK(op_info->HasOutputScale(out_name));
+    output_scale = op_info->GetOutputScale(out_name)[0];
     if (enable_int8) {
       precision = PRECISION(kInt8);
@@ -77,12 +75,13 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     qnt.enable_int8 = enable_int8;
     if (enable_int8) {
+      CHECK(op_info->HasInputScale(x_name));
+      input_scale = op_info->GetInputScale(x_name)[0];
       qnt.quant_bits = bit_length;
       qnt.scale.push_back(input_scale);
       x->mutable_data<int8_t>();
     }
-    x_node =
-        graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt);
+    x_node = graph->Add(x_name, *x, precision, layout, qnt);
   }
   inputs.push_back(x_node->data());
......
@@ -59,7 +59,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK_EQ(dilations.size(), 2L);
   // Check depthwise mode
   bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1);
-  auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
+  CHECK(op_info->HasInputScale(filter_name));
+  auto weight_scale = op_info->GetInputScale(filter_name);
   // for quantization
   bool enable_int8 = false;
@@ -71,9 +72,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (op_info->HasAttr("enable_int8")) {
     enable_int8 = op_info->GetAttr<bool>("enable_int8");
-    input_scale = op_info->GetAttr<float>("input_scale");
+    CHECK(op_info->HasInputScale(input_name));
+    input_scale = op_info->GetInputScale(input_name)[0];
     bit_length = op_info->GetAttr<int>("bit_length");
-    output_scale = op_info->GetAttr<float>("output_scale");
+    CHECK(op_info->HasOutputScale(output_name));
+    output_scale = op_info->GetOutputScale(output_name)[0];
     if (enable_int8) {
       precision = PRECISION(kInt8);
......
@@ -56,11 +56,9 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   // Get input and output vars and op attributes
   auto x_name = op_info->Input("X").front();
-  auto x_type = kernel->GetInputDeclType("X");
   auto x = scope->FindMutableTensor(x_name);
   auto x_dims = x->dims();
   auto y_name = op_info->Input("Y").front();
-  auto y_type = kernel->GetInputDeclType("Y");
   auto y = scope->FindMutableTensor(y_name);
   auto y_dims = y->dims();
   auto out_name = op_info->Output("Out").front();
@@ -78,9 +76,11 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (op_info->HasAttr("enable_int8")) {
     enable_int8 = op_info->GetAttr<bool>("enable_int8");
-    input_scale = op_info->GetAttr<float>("input_scale");
+    CHECK(op_info->HasInputScale(x_name));
+    input_scale = op_info->GetInputScale(x_name)[0];
     bit_length = op_info->GetAttr<int>("bit_length");
-    output_scale = op_info->GetAttr<float>("output_scale");
+    CHECK(op_info->HasOutputScale(out_name));
+    output_scale = op_info->GetOutputScale(out_name)[0];
     if (enable_int8) {
       precision = PRECISION(kInt8);
@@ -100,7 +100,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
       qnt.scale.push_back(input_scale);
       qnt.quant_bits = op_info->GetAttr<int>("bit_length");
     }
-    x_node = graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt);
+    x_node = graph->Add(x_name, *x, precision, layout, qnt);
   }
   // Y node
@@ -118,7 +118,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
       qnt.scale.clear();
       qnt.scale.push_back(input_scale);
     }
-    y_node = graph->Add(y_name, *y, y_type->precision(), y_type->layout(), qnt);
+    y_node = graph->Add(y_name, *y, precision, layout, qnt);
   }
   std::shared_ptr<Node> output_node = nullptr;
@@ -133,8 +133,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     output->mutable_data<int8_t>();
   }
-  output_node = graph->Add(
-      out_name, *output, x_type->precision(), x_type->layout(), output_qnt);
+  output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
   std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
   std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
......
@@ -31,17 +31,14 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   VLOG(3) << "[RKNPU] Converting " + op_type + "...";
   auto input_name = op_info->Input("Input").front();
-  auto input_type = kernel->GetInputDeclType("Input");
   auto input = scope->FindMutableTensor(input_name);
   auto input_dims = input->dims();
   CHECK_GE(input_dims.size(), 2UL);
   auto w_name = op_info->Input("W").front();
-  auto w_type = kernel->GetInputDeclType("W");
   auto w = scope->FindMutableTensor(w_name);
   auto w_dims = w->dims();
   CHECK_EQ(w_dims.size(), 2UL);
   auto out_name = op_info->Output("Out").front();
-  auto out_type = kernel->GetOutputDeclType("Out");
   auto output = scope->FindMutableTensor(out_name);
   int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
   int m = input_dims.Slice(0, in_num_col_dims).production();
@@ -61,9 +58,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (op_info->HasAttr("enable_int8")) {
     enable_int8 = op_info->GetAttr<bool>("enable_int8");
-    input_scale = op_info->GetAttr<float>("input_scale");
+    CHECK(op_info->HasInputScale(input_name));
+    input_scale = op_info->GetInputScale(input_name)[0];
     bit_length = op_info->GetAttr<int>("bit_length");
-    output_scale = op_info->GetAttr<float>("output_scale");
+    CHECK(op_info->HasOutputScale(out_name));
+    output_scale = op_info->GetOutputScale(out_name)[0];
     if (enable_int8) {
       precision = PRECISION(kInt8);
     }
@@ -86,7 +85,8 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (enable_int8) {
     QuantizationInfo filter_qnt;
-    auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
+    CHECK(op_info->HasInputScale(w_name));
+    auto weight_scale = op_info->GetInputScale(w_name);
     filter_qnt.enable_int8 = enable_int8;
     filter_qnt.scale = weight_scale;
     filter_qnt.quant_bits = bit_length;
@@ -99,8 +99,8 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
         transpose_w_data[j * k + i] = w_data[i * n + j];
       }
     }
-    trans_w_node = graph->Add(
-        w_name, *transpose_w, precision, w_type->layout(), filter_qnt);
+    trans_w_node =
+        graph->Add(w_name, *transpose_w, precision, layout, filter_qnt);
   } else {
     auto transpose_w_data = transpose_w->mutable_data<float>();
     auto w_data = w->mutable_data<float>();
@@ -110,8 +110,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
        transpose_w_data[j * k + i] = w_data[i * n + j];
      }
    }
-    trans_w_node =
-        graph->Add(w_name, *transpose_w, precision, w_type->layout());
+    trans_w_node = graph->Add(w_name, *transpose_w, precision, layout);
   }
   // Add bias node if bias tensor exists
@@ -132,8 +131,8 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     if (enable_int8) {
       auto bias_name_qnt = bias_name + "/qnt";
       auto* bias_qnt = scope->NewTensor(bias_name_qnt);
-      auto weight_scale =
-          op_info->GetAttr<std::vector<float>>("weight_scale");
+      CHECK(op_info->HasInputScale(w_name));
+      auto weight_scale = op_info->GetInputScale(w_name);
       bias_qnt->Resize(bias_shape);
       bias_qnt->set_persistable(true);
@@ -176,7 +175,8 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     bias->set_persistable(true);
     if (enable_int8) {
-      auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
+      CHECK(op_info->HasInputScale(w_name));
+      auto weight_scale = op_info->GetInputScale(w_name);
       bias->set_precision(PrecisionType::kInt32);
       auto* bias_data = bias->mutable_data<int32_t>();
......
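A side note on the weight handling in the FC converter above: the weight matrix is stored transposed before being added to the graph, using the index swap `transpose_w_data[j * k + i] = w_data[i * n + j]`. A small hedged sketch of that mapping on plain row-major arrays (the `k`/`n` names mirror the converter, but the shapes here are made up for illustration):

```cpp
#include <cstdio>
#include <vector>

int main() {
  // A k x n matrix (k = 2 rows, n = 3 columns) stored row-major.
  const int k = 2, n = 3;
  std::vector<float> w = {1, 2, 3,
                          4, 5, 6};

  // Same index swap as the converter: element (i, j) of the original lands at
  // (j, i) of the transpose, i.e. transpose_w[j * k + i] = w[i * n + j].
  std::vector<float> transpose_w(n * k);
  for (int i = 0; i < k; ++i)
    for (int j = 0; j < n; ++j)
      transpose_w[j * k + i] = w[i * n + j];

  // Prints the n x k transpose: "1 4", "2 5", "3 6".
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < k; ++i) std::printf("%.0f ", transpose_w[j * k + i]);
    std::printf("\n");
  }
  return 0;
}
```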
@@ -55,9 +55,11 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (x->precision() == PRECISION(kInt8)) {
     // enable_int8 = op_info->GetAttr<bool>("enable_int8");
     enable_int8 = true;
-    input_scale = op_info->GetAttr<float>("input_scale");
+    CHECK(op_info->HasInputScale(x_name));
+    input_scale = op_info->GetInputScale(x_name)[0];
     bit_length = op_info->GetAttr<int>("bit_length");
-    output_scale = op_info->GetAttr<float>("output_scale");
+    CHECK(op_info->HasOutputScale(out_name));
+    output_scale = op_info->GetOutputScale(out_name)[0];
     if (enable_int8) {
       precision = PRECISION(kInt8);
@@ -132,18 +134,16 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
   }
-  std::shared_ptr<Node> output_node = nullptr;
   QuantizationInfo output_qnt;
   output_qnt.enable_int8 = enable_int8;
   if (enable_int8) {
     output_qnt.quant_bits = bit_length;
     output_qnt.scale.push_back(output_scale);
     output->mutable_data<int8_t>();
   }
-  output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
+  auto output_node =
+      graph->Add(out_name, *output, precision, layout, output_qnt);
   std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
   std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
......
@@ -32,14 +32,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   // Get input and output vars and op attributes
   auto x_name = op_info->Input("X").front();
-  auto x_type = kernel->GetInputDeclType("X");
-  CHECK(x_type->layout() == DATALAYOUT(kNCHW));
   auto x = scope->FindMutableTensor(x_name);
   auto x_dims = x->dims();
   auto x_rank = x_dims.size();
   auto out_name = op_info->Output("Out").front();
-  auto out_type = kernel->GetOutputDeclType("Out");
-  CHECK(out_type->layout() == DATALAYOUT(kNCHW));
   auto output = scope->FindMutableTensor(out_name);
   auto axis = op_info->GetAttr<int>("axis");
   if (axis < 0) {
@@ -56,9 +52,11 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (op_info->HasAttr("enable_int8")) {
     enable_int8 = op_info->GetAttr<bool>("enable_int8");
-    input_scale = op_info->GetAttr<float>("input_scale");
+    CHECK(op_info->HasInputScale(x_name));
+    input_scale = op_info->GetInputScale(x_name)[0];
     bit_length = op_info->GetAttr<int>("bit_length");
-    output_scale = op_info->GetAttr<float>("output_scale");
+    CHECK(op_info->HasOutputScale(out_name));
+    output_scale = op_info->GetOutputScale(out_name)[0];
     if (enable_int8) {
       precision = PRECISION(kInt8);
......
@@ -34,6 +34,7 @@ add_operator(fake_quant extra SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op
 add_operator(fake_dequant extra SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS})
 add_operator(conv_transpose_op basic SRCS conv_transpose_op.cc DEPS ${op_DEPS})
 add_operator(expand_op_lite basic SRCS expand_op.cc DEPS ${op_DEPS})
+add_operator(expand_as_op_lite basic SRCS expand_as_op.cc DEPS ${op_DEPS})
 add_operator(squeeze_op_lite basic SRCS squeeze_op.cc DEPS ${op_DEPS})
 add_operator(unsqueeze_op_lite basic SRCS unsqueeze_op.cc DEPS ${op_DEPS})
 add_operator(stack_op basic SRCS stack_op.cc DEPS ${op_DEPS})
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/expand_as_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool ExpandAsOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Target);
CHECK_OR_FALSE(param_.Out);
int target_size = param_.Target->dims().size();
int x_dims_size = param_.X->dims().size();
CHECK_EQ(target_size, x_dims_size)
<< "The rank of Input(Target) must be equal to the rank of Input(X).";
CHECK_LE(param_.X->dims().size(), 6u)
<< "The rank of Input(X) must not be greater than 6.";
return true;
}
bool ExpandAsOpLite::InferShapeImpl() const {
DDim out_dims(param_.X->dims());
for (size_t i = 0; i < param_.Target->dims().size(); ++i) {
// out_dims[i] *= param_.expand_times[i];
out_dims[i] = param_.Target->dims()[i];
}
param_.Out->Resize(out_dims);
return true;
}
bool ExpandAsOpLite::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
auto X_name = opdesc.Input("X").front();
auto Out_name = opdesc.Output("Out").front();
param_.X = GetVar<lite::Tensor>(scope, X_name);
param_.Out = GetMutableVar<lite::Tensor>(scope, Out_name);
auto Target_name = opdesc.Input("Target").front();
param_.Target = GetVar<lite::Tensor>(scope, Target_name);
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(expand_as, paddle::lite::operators::ExpandAsOpLite);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class ExpandAsOpLite : public OpLite {
public:
ExpandAsOpLite() {}
explicit ExpandAsOpLite(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "expand_as"; }
private:
mutable ExpandAsParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
@@ -28,8 +28,13 @@ bool FusionElementwiseActivationOp::CheckShape() const {
 }
 bool FusionElementwiseActivationOp::InferShapeImpl() const {
-  CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size());
-  param_.Out->Resize(param_.X->dims());
+  size_t x_size = param_.X->dims().size();
+  size_t y_size = param_.Y->dims().size();
+  if (x_size >= y_size) {
+    param_.Out->Resize(param_.X->dims());
+  } else {
+    param_.Out->Resize(param_.Y->dims());
+  }
   return true;
 }
......
@@ -1287,6 +1287,13 @@ struct ExpandParam : ParamBase {
   std::vector<int> expand_times{};
 };
+/// ----------------------- expand as operators ----------------------
+struct ExpandAsParam : ParamBase {
+  const lite::Tensor* X{};
+  const lite::Tensor* Target{};
+  lite::Tensor* Out{};
+};
 /// ----------------------- matmul operators ----------------------
 struct MatMulParam : ParamBase {
   const lite::Tensor* X{};
......
@@ -86,6 +86,7 @@ endif()
 lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+lite_cc_test(test_kernel_expand_as_compute SRCS expand_as_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 #lite_cc_test(test_kernel_crf_decoding_compute SRCS crf_decoding_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
namespace paddle {
namespace lite {
class ExpandAsComputeTester : public arena::TestCase {
protected:
// common attributes for this op.
std::string x_ = "X";
std::string out_ = "Out";
std::string target_ = "Target";
DDim dims_;
DDim target_dims_;
public:
ExpandAsComputeTester(const Place& place,
const std::string& alias,
DDim dims,
DDim target_dims)
: TestCase(place, alias), dims_(dims), target_dims_(target_dims) {}
void RunBaseline(Scope* scope) override {
const auto* input = scope->FindTensor(x_);
CHECK(input);
auto* out = scope->NewTensor(out_);
CHECK(out);
const auto* target = scope->FindTensor(target_);
DDim out_shape(input->dims());
DDim in_shape = input->dims();
std::vector<int> expand_times_;
for (size_t i = 0; i < target->dims().size(); ++i) {
int times = target->dims()[i] / input->dims()[i];
expand_times_.push_back(times);
}
for (size_t i = 0; i < expand_times_.size(); ++i) {
out_shape[i] *= expand_times_[i];
}
out->Resize(out_shape);
float* out_data = out->mutable_data<float>();
const float* input_data = input->data<float>();
std::vector<int> in_stride(in_shape.size(), 1),
out_stride(out_shape.size(), 1);
for (int i = in_shape.size() - 2; i >= 0; --i) {
in_stride[i] = in_shape[i + 1] * in_stride[i + 1];
}
for (int i = out_shape.size() - 2; i >= 0; --i) {
out_stride[i] = out_shape[i + 1] * out_stride[i + 1];
}
for (size_t out_id = 0; out_id < out_shape.production(); ++out_id) {
int in_id = 0;
for (int i = expand_times_.size() - 1; i >= 0; --i) {
int in_j = (out_id / out_stride[i]) % in_shape[i];
in_id += in_j * in_stride[i];
}
out_data[out_id] = input_data[in_id];
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType("expand_as");
op_desc->SetInput("X", {x_});
op_desc->SetInput("Target", {target_});
op_desc->SetOutput("Out", {out_});
}
void PrepareData() override {
std::vector<float> in_data(dims_.production());
std::vector<float> target_data(target_dims_.production());
for (int i = 0; i < dims_.production(); ++i) {
in_data[i] = i;
}
for (int i = 0; i < target_dims_.production(); ++i) {
target_data[i] = i;
}
SetCommonTensor(x_, dims_, in_data.data());
SetCommonTensor(target_, target_dims_, target_data.data());
}
};
void test_expand_as_3dim(Place place, float abs_error) {
for (int C : {3}) {
for (int H : {2}) {
for (int W : {4}) {
std::unique_ptr<arena::TestCase> tester(new ExpandAsComputeTester(
place, "def", DDim({C, H, W}), DDim({C * 2, H * 3, W * 1})));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
}
}
void test_expand_as_4dim(Place place, float abs_error) {
for (int N : {2}) {
for (int C : {3}) {
for (int H : {2}) {
for (int W : {4}) {
std::unique_ptr<arena::TestCase> tester(
new ExpandAsComputeTester(place,
"def",
DDim({N, C, H, W}),
DDim({N * 2, C * 3, H * 1, W * 4})));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
}
}
}
TEST(ExpandAs, precision) {
float abs_error = 1e-5;
Place place;
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // Using fp16 in NPU
#elif defined(LITE_WITH_ARM)
place = TARGET(kHost);
#elif defined(LITE_WITH_X86)
place = TARGET(kHost);
#else
return;
#endif
test_expand_as_3dim(place, abs_error);
test_expand_as_4dim(place, abs_error);
}
} // namespace lite
} // namespace paddle
@@ -25,6 +25,9 @@ WITH_STRIP=OFF
 # options of compiling NPU lib.
 WITH_HUAWEI_KIRIN_NPU=OFF
 HUAWEI_KIRIN_NPU_SDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/
+# options of compiling APU lib.
+WITH_MEDIATEK_APU=OFF
+MEDIATEK_APU_SDK_ROOT="$(pwd)/apu_ddk" # Download APU SDK from https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz
 # options of compiling OPENCL lib.
 WITH_OPENCL=OFF
 # options of adding training ops
@@ -154,6 +157,8 @@ function make_tiny_publish_so {
       -DLITE_WITH_CV=$WITH_CV \
       -DLITE_WITH_NPU=$WITH_HUAWEI_KIRIN_NPU \
       -DNPU_DDK_ROOT=$HUAWEI_KIRIN_NPU_SDK_ROOT \
+      -DLITE_WITH_APU=$WITH_MEDIATEK_APU \
+      -DAPU_DDK_ROOT=$MEDIATEK_APU_SDK_ROOT \
       -DLITE_WITH_OPENCL=$WITH_OPENCL \
       -DARM_TARGET_ARCH_ABI=$ARCH \
       -DARM_TARGET_LANG=$TOOLCHAIN \
@@ -204,6 +209,8 @@ function make_full_publish_so {
      -DLITE_WITH_CV=$WITH_CV \
      -DLITE_WITH_NPU=$WITH_HUAWEI_KIRIN_NPU \
      -DNPU_DDK_ROOT=$HUAWEI_KIRIN_NPU_SDK_ROOT \
+     -DLITE_WITH_APU=$WITH_MEDIATEK_APU \
+     -DAPU_DDK_ROOT=$MEDIATEK_APU_SDK_ROOT \
      -DLITE_WITH_OPENCL=$WITH_OPENCL \
      -DARM_TARGET_ARCH_ABI=$ARCH \
      -DARM_TARGET_LANG=$TOOLCHAIN \
@@ -257,6 +264,13 @@ function print_usage {
     echo -e "| you can download huawei HiAi DDK from: https://developer.huawei.com/consumer/cn/hiai/ |"
     echo -e "| detailed information about Paddle-Lite NPU: https://paddle-lite.readthedocs.io/zh/latest/demo_guides/npu.html |"
     echo -e "| |"
+    echo -e "| arguments of apu library compiling:(armv8, gcc, c++_static) |"
+    echo -e "| ./lite/tools/build_android.sh --with_mediatek_apu=ON --mediatek_apu_sdk_root=YourApuSdkPath |"
+    echo -e "| --with_mediatek_apu: (OFF|ON); controls whether to compile lib for mediatek_apu, default is OFF |"
+    echo -e "| --mediatek_apu_sdk_root: (path to mediatek APU SDK file) required when compiling apu library |"
+    echo -e "| you can download mediatek APU SDK from: https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz |"
+    echo -e "| detailed information about Paddle-Lite APU: https://paddle-lite.readthedocs.io/zh/latest/demo_guides/mediatek_apu.html |"
+    echo -e "| |"
     echo -e "| arguments of opencl library compiling:(armv8, gcc, c++_static) |"
     echo -e "| ./lite/tools/build_android.sh --with_opencl=ON |"
     echo -e "| --with_opencl: (OFF|ON); controls whether to compile lib for opencl, default is OFF |"
@@ -351,6 +365,15 @@ function main {
        HUAWEI_KIRIN_NPU_SDK_ROOT="${i#*=}"
        shift
        ;;
+      # compiling lib which can operate on mediatek apu.
+      --with_mediatek_apu=*)
+        WITH_MEDIATEK_APU="${i#*=}"
+        shift
+        ;;
+      --mediatek_apu_sdk_root=*)
+        MEDIATEK_APU_SDK_ROOT="${i#*=}"
+        shift
+        ;;
      # compiling result contains both light_api and cxx_api lib.
      full_publish)
        make_full_publish_so
......
@@ -26,7 +26,7 @@ OPTMODEL_DIR=""
 WITH_OPENCL=OFF
 # options of compiling rockchip NPU lib.
 WITH_ROCKCHIP_NPU=OFF
-ROCKCHIP_NPU_SDK_ROOT=""
+ROCKCHIP_NPU_SDK_ROOT="$(pwd)/rknpu_ddk" # Download RKNPU SDK from https://github.com/airockchip/rknpu_ddk.git
 # options of compiling baidu XPU lib.
 WITH_BAIDU_XPU=OFF
 BAIDU_XPU_SDK_ROOT=""
@@ -229,6 +229,8 @@ function print_usage {
     echo -e "| ./lite/tools/build_linux.sh --with_rockchip_npu=ON --rockchip_npu_sdk_root=YourRockchipNpuSdkPath |"
     echo -e "| --with_rockchip_npu: (OFF|ON); controls whether to compile lib for rockchip_npu, default is OFF |"
     echo -e "| --rockchip_npu_sdk_root: (path to rockchip_npu DDK file) required when compiling rockchip_npu library |"
+    echo -e "| you can download rockchip NPU SDK from: https://github.com/airockchip/rknpu_ddk.git |"
+    echo -e "| detailed information about Paddle-Lite RKNPU: https://paddle-lite.readthedocs.io/zh/latest/demo_guides/rockchip_npu.html |"
     echo -e "| |"
     echo -e "| arguments of baidu xpu library compiling: |"
     echo -e "| ./lite/tools/build_linux.sh --with_baidu_xpu=ON --baidu_xpu_sdk_root=YourBaiduXpuSdkPath |"
......