Merge branch 'mul' into 'develop'

Enable Mul. See merge request paddlepaddle/paddlelitearmbackend!16

Merge branch 'mul' into 'develop'
Enable Mul. See merge request paddlepaddle/paddlelitearmbackend!16
519ef7f7 · Tensor Tang · 02029900 · 85487210 · 519ef7f7 · 519ef7f7
12 changed files
--- a/paddle/fluid/lite/core/cpu_info.cc
+++ b/paddle/fluid/lite/core/cpu_info.cc
@@ -54,15 +54,15 @@ void DeviceInfo::InitInternal(DeviceInfo* dev) {
              << ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
              << ", CPU ARCH: A" << dev->archs_[i];
  }
-  LOG(INFO) << "L1 DataCache size is: ";
+  VLOG(1) << "L1 DataCache size is: ";
  for (int i = 0; i < dev->compute_core_num_; ++i) {
-    LOG(INFO) << dev->L1_cache_[i] / 1024 << " KB";
+    VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
  }
-  LOG(INFO) << "L2 Cache size is: ";
+  VLOG(1) << "L2 Cache size is: ";
  for (int i = 0; i < dev->compute_core_num_; ++i) {
-    LOG(INFO) << dev->L2_cache_[i] / 1024 << " KB";
+    VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
  }
-  LOG(INFO) << "Total memory: " << dev->max_memory_ << "KB";
+  VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";

  dev->max_freq_ = max_freq[0];
  for (int j = 1; j < dev->compute_core_num_; ++j) {

--- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -6,7 +6,7 @@ message(STATUS "compile with lite ARM kernels")

 cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
-cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
+cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -20,6 +20,7 @@ lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_
 lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
 lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
 lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
+lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
 lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)

 set(arm_kernels

--- a/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
@@ -124,6 +124,20 @@ TEST(conv_arm, init) {

 TEST(conv_arm, compute) {
  DeviceInfo::Init();
+#if 1
+  for (auto n : {2}) {
+    for (auto ic : {6}) {
+      for (auto oc : {6}) {
+        for (auto ih : {9}) {
+          for (auto iw : {9}) {
+            for (auto flag_bias : {false, true}) {
+              for (auto flag_relu : {false, true}) {
+                for (auto depthwise : {false, true}) {
+                  for (auto dilation : {1}) {
+                    for (auto stride : {1, 2}) {
+                      for (auto padding : {0, 1, 2}) {
+                        for (auto ks : {1, 3, 5}) {
+#else
  for (auto n : {1, 2}) {
    for (auto ic : {6, 32 /*, 128*/}) {
      for (auto oc : {6, 32 /*, 128*/}) {
@@ -136,6 +150,7 @@ TEST(conv_arm, compute) {
                    for (auto stride : {1, 2}) {
                      for (auto padding : {0, 1, 2}) {
                        for (auto ks : {1, 3, 5}) {
+#endif
                          int group = 1;
                          if (depthwise) {  // depthwise convolution ?
                            group = oc = ic;

--- a/paddle/fluid/lite/kernels/arm/fc_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc
@@ -22,6 +22,10 @@ namespace lite {
 namespace kernels {
 namespace arm {

+void FcCompute::PrepareForRun() {
+  // Placeholder: no preparation is performed yet. TODO(TJ): transpose weight
+}
+
 void FcCompute::Run() {
  auto& param = this->Param<operators::FcParam>();
  auto x_dims = param.input->dims();
@@ -48,22 +52,16 @@ void FcCompute::Run() {
                              &ctx);
    lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n,
                                   x_w, false, false, false, &ctx);
-
    if (param.bias) {
      CHECK_EQ(param.bias->numel(), n);
      lite::arm::math::fill_bias_fc(o_data, b_data, x_h, n);
    }
  } else {
-    // use sgemmv
-    // sgemv((const float*)weights, (const float*)din, (float*)dout,
-    //       false, n, x_w, _param->_flag_bias, (float*)bias, false);
+    lite::arm::math::sgemv(w_data, i_data, o_data, false, n, x_w,
+                           b_data != nullptr, b_data, false);
  }
 }

-TargetType FcCompute::target() const { return TARGET(kARM); }
-
-PrecisionType FcCompute::precision() const { return PRECISION(kFloat); }
-
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite

--- a/paddle/fluid/lite/kernels/arm/fc_compute.h
+++ b/paddle/fluid/lite/kernels/arm/fc_compute.h
@@ -25,10 +25,9 @@ class FcCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
 public:
  using param_t = operators::FcParam;

-  void Run() override;
+  void PrepareForRun() override;

-  TargetType target() const override;
-  PrecisionType precision() const override;
+  void Run() override;

  virtual ~FcCompute() = default;
 };

--- a/paddle/fluid/lite/kernels/arm/mul_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc
@@ -12,57 +12,57 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <Eigen/Core>
-#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/kernels/arm/mul_compute.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"
 #include "paddle/fluid/lite/core/op_registry.h"
-#include "paddle/fluid/lite/core/types.h"
+#include "paddle/fluid/lite/core/type_system.h"

 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace arm {

-template <typename T>
-void mul_compute_eigen(const T* x, int x_h, int x_w, const T* y, int y_h,
-                       int y_w, T* out) {
-  using matrix_t =
-      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+void MulCompute::PrepareForRun() {
+  // TODO(TJ): transpose x or y if necessary
+}

-  Eigen::Map<const matrix_t> X(x, x_h, x_w);
-  Eigen::Map<const matrix_t> Y(y, y_h, y_w);
-  Eigen::Map<matrix_t> Out(out, x_h, y_w);
+void MulCompute::Run() {
+  auto& param = Param<param_t>();

-  Out = X * Y;
-}
+  const auto* x_data = param.x->data<float>();
+  const auto* y_data = param.y->data<float>();
+  auto* o_data = param.output->mutable_data<float>();

-class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  using param_t = operators::MulParam;
+  int m = static_cast<int>(
+      param.x->dims().Slice(0, param.x_num_col_dims).production());
+  int x_w =
+      static_cast<int>(param.x->dims()
+                           .Slice(param.x_num_col_dims, param.x->dims().size())
+                           .production());
+  int y_h = static_cast<int>(
+      param.y->dims().Slice(0, param.y_num_col_dims).production());
+  int n =
+      static_cast<int>(param.y->dims()
+                           .Slice(param.y_num_col_dims, param.y->dims().size())
+                           .production());

-  void Run() override {
-    auto& param = Param<operators::MulParam>();
-    core::dim2 x_shape(
-        {static_cast<int>(
-             param.x->dims().Slice(0, param.x_num_col_dims).production()),
-         static_cast<int>(
-             param.x->dims()
-                 .Slice(param.x_num_col_dims, param.x->dims().size())
-                 .production())});
-    core::dim2 y_shape(
-        {static_cast<int>(
-             param.y->dims().Slice(0, param.y_num_col_dims).production()),
-         static_cast<int>(
-             param.y->dims()
-                 .Slice(param.y_num_col_dims, param.y->dims().size())
-                 .production())});
+  CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h";
+  auto k = x_w;
+  if (n == 1) {
+    lite::arm::math::sgemv(x_data, y_data, o_data, false, m, k, false, nullptr,
+                           false);

-    mul_compute_eigen(param.x->data<float>(), x_shape.x, x_shape.y,  //
-                      param.y->data<float>(), y_shape.x, y_shape.y,  //
-                      param.output->mutable_data<float>());
-  }
+  } else {
+    constexpr bool is_tranposed_y = false;
+    auto& ctx = this->ctx_->template As<ARMContext>();

-  virtual ~MulCompute() = default;
-};
+    float* packed_x = static_cast<float*>(ctx.workspace_data<float>()) +
+                      ctx.l2_cache_size() / sizeof(float);
+    lite::arm::math::prepackA(packed_x, x_data, k, 0, m, 0, k, false, &ctx);
+    lite::arm::math::sgemm_prepack(packed_x, y_data, nullptr, o_data, m, n, k,
+                                   false, false, is_tranposed_y, &ctx);
+  }
+}

 }  // namespace arm
 }  // namespace kernels

--- a/paddle/fluid/lite/kernels/arm/mul_compute.h
+++ b/paddle/fluid/lite/kernels/arm/mul_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/types.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {  // "mul" kernel for the ARM target, float precision
+ public:
+  using param_t = operators::MulParam;  // parameter struct (x, y, output, x/y_num_col_dims) read by Run()
+
+  void PrepareForRun() override;  // defined in mul_compute.cc; currently only holds a TODO
+
+  void Run() override;  // multiplies param.x by param.y (both viewed as 2-D matrices) into param.output
+
+  virtual ~MulCompute() = default;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/kernels/arm/mul_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/mul_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/arm/mul_compute.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/lite/arm/math/funcs.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+template <typename T>  // Test helper: writes n pseudo-random values in the range [lower, upper] into a.
+void FillData(T* a, const int n, const T lower = static_cast<T>(-2.f),
+              const T upper = static_cast<T>(2.f)) {
+  static unsigned int seed = 100;  // fixed start keeps runs reproducible; bumped per call so buffers differ
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  for (int i = 0; i < n; ++i) {
+    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);  // rescale the unit draw; lower == upper yields a constant fill
+  }
+}
+
+TEST(mul_arm, retrive_op) {  // the kernel registry must return at least one ARM/float kernel for "mul"
+  auto mul =
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("mul");
+  ASSERT_FALSE(mul.empty());
+  ASSERT_TRUE(mul.front());  // first entry must evaluate to true (presumably a non-null kernel pointer)
+}
+
+TEST(mul_arm, init) {  // a default-constructed MulCompute must report float precision and the ARM target
+  MulCompute mul;
+  ASSERT_EQ(mul.precision(), PRECISION(kFloat));
+  ASSERT_EQ(mul.target(), TARGET(kARM));
+}
+
+TEST(mul_arm, compare_test) {  // compares MulCompute with mul_compute_eigen for every m, n, k in 1..4
+  using T = float;
+
+  for (int m : {1, 2, 3, 4}) {
+    for (int n : {1, 2, 3, 4}) {  // n == 1 hits the sgemv branch of MulCompute::Run, larger n the sgemm branch
+      for (int k : {1, 2, 3, 4}) {
+        VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k;
+        lite::Tensor x, y, out, ref;
+        x.Resize({m, k});  // x is m x k, y is k x n, so out and ref are m x n
+        y.Resize({k, n});
+        out.Resize({m, n});
+        ref.Resize({m, n});
+
+        auto* x_data = x.mutable_data<T>();
+        auto* y_data = y.mutable_data<T>();
+        auto* out_data = out.mutable_data<T>();
+        auto* ref_data = ref.mutable_data<T>();
+
+        FillData<T>(x_data, x.dims().production());  // inputs use the default range [-2, 2]
+        FillData<T>(y_data, y.dims().production());
+        FillData<T>(out_data, out.dims().production(), 0, 0);  // lower == upper == 0: both result buffers start zeroed
+        FillData<T>(ref_data, out.dims().production(), 0, 0);
+
+        MulCompute mul;
+        operators::MulParam param;
+
+        param.x = &x;
+        param.y = &y;
+        param.output = &out;
+
+        DeviceInfo::Init();  // NOTE(review): invoked on every iteration; confirm repeated Init() is intended
+        std::unique_ptr<KernelContext> ctx(new KernelContext);
+        ctx->As<ARMContext>();
+        mul.SetParam(param);
+        mul.SetContext(std::move(ctx));
+        mul.PrepareForRun();
+
+        mul.Run();
+
+        lite::arm::math::mul_compute_eigen(x_data, m, k, y_data, k, n,  // reference product written to ref_data
+                                           ref_data);
+        for (int i = 0; i < out.dims().production(); i++) {
+          EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);  // element-wise check with absolute tolerance 1e-3
+        }
+      }
+    }
+  }
+}
+
+TEST(mul_arm, num_col_dims) {  // checks that x/y_num_col_dims flatten 3-D inputs into the expected 2-D matrices
+  using T = float;
+
+  lite::Tensor x, y, out, ref;
+  x.Resize({2, 3, 4});  // with x_num_col_dims = 1 this is treated as 2 x 12
+  y.Resize({3, 4, 5});  // with y_num_col_dims = 2 this is treated as 12 x 5
+  out.Resize({2, 5});
+  ref.Resize({2, 5});
+
+  auto* x_data = x.mutable_data<T>();
+  auto* y_data = y.mutable_data<T>();
+  auto* out_data = out.mutable_data<T>();
+  auto* ref_data = ref.mutable_data<T>();
+
+  FillData<T>(x_data, x.dims().production());
+  FillData<T>(y_data, y.dims().production());
+  FillData<T>(out_data, out.dims().production());  // out and ref start with unrelated random values (seed changes per call),
+  FillData<T>(ref_data, out.dims().production());  // so the comparison only passes if both are fully overwritten
+
+  MulCompute mul;
+  operators::MulParam param;
+
+  param.x = &x;
+  param.y = &y;
+  param.output = &out;
+  param.x_num_col_dims = 1;
+  param.y_num_col_dims = 2;
+
+  DeviceInfo::Init();
+  std::unique_ptr<KernelContext> ctx(new KernelContext);
+  ctx->As<ARMContext>();
+  mul.SetParam(param);
+  mul.SetContext(std::move(ctx));
+  mul.PrepareForRun();
+
+  mul.Run();
+
+  lite::arm::math::mul_compute_eigen(x_data, 2, 12, y_data, 12, 5, ref_data);  // reference on the flattened 2x12 and 12x5 views
+  for (int i = 0; i < out.dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);  // element-wise check with absolute tolerance 1e-3
+  }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
--- a/paddle/fluid/lite/kernels/arm/pool_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc
@@ -182,7 +182,7 @@ TEST(pool_arm, compute) {
      for (auto stride : {2}) {
        for (auto pad : {0}) {
          for (auto n : {1, 3, 4, 11}) {
-            for (auto c : {1, 3, 11, 4, 1024}) {
+            for (auto c : {1, 3, 11 /* ,1024 */}) {  // speedup for ci
              for (auto h : {3, 1, 11, 4, 1}) {
                for (auto w : {1, 3, 4, 12, 1}) {
                  VLOG(3) << "n:" << n << " c:" << c << " h:" << h << " w:" << w

--- a/paddle/fluid/lite/kernels/arm/scale_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/scale_compute_test.cc
@@ -54,6 +54,15 @@ TEST(scale_arm, compute) {
  lite::Tensor output;
  lite::Tensor output_ref;

+#if 1  // for ci speedup
+  for (auto n : {1, 3}) {
+    for (auto c : {1, 3}) {
+      for (auto h : {3, 4}) {
+        for (auto w : {4, 3}) {
+          for (auto bias_after_scale : {true, false}) {
+            for (auto s : {-1.0f, 0.13f}) {
+              for (auto b : {-15.f, 0.11234f}) {
+#else
  for (auto n : {1, 3, 4, 11}) {
    for (auto c : {1, 3, 11, 4}) {
      for (auto h : {3, 1, 11, 4}) {
@@ -61,6 +70,8 @@ TEST(scale_arm, compute) {
          for (auto bias_after_scale : {true, false}) {
            for (auto s : {-100.25f, -1.0f, 0.13f, 3840.975f}) {
              for (auto b : {-3075.495f, -15.f, 0.11234f, 128.15f}) {
+#endif
+
                x.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
                output.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
                output_ref.Resize(DDim(std::vector<int64_t>({n, c, h, w})));

--- a/paddle/fluid/lite/operators/op_params.h
+++ b/paddle/fluid/lite/operators/op_params.h
@@ -57,6 +57,7 @@ struct FcParam {
  lite::Tensor* output{};
  lite::DDim in_mat_dims;
  int in_num_col_dims{1};
+  bool weight_transposed{false};
 };

 struct ReluParam {

--- a/paddle/fluid/lite/tools/build.sh
+++ b/paddle/fluid/lite/tools/build.sh
@@ -43,10 +43,14 @@ function cmake_arm {
        -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2
 }

+function build_single {  # builds the single make target passed as $1
+        make $1 -j$(( $(nproc) > 2 ? $(nproc) - 2 : 1 ))  # keep 2 cores free, but never pass -j0 or a negative job count
+}
+
 function build {
    file=$1
    for _test in $(cat $file); do
-        make $_test -j$(expr $(nproc) - 2)
+        build_single $_test
    done
 }

@@ -63,39 +67,6 @@ function test_lite {
    done
 }

-port_armv8=5554
-port_armv7=5556
-
-# Run test on android
-function test_lite_android {
-    local file=$1
-    local adb_abi=$2
-    local port=
-    if [[ ${adb_abi} == "armeabi-v7a" ]]; then
-        port=${port_armv7}
-    fi
-
-    if [[ ${adb_abi} == "arm64-v8a" ]]; then
-        port=${port_armv8}
-    fi
-    if [[ "${port}x" == "x" ]]; then
-        echo "Port can not be empty"
-        exit 1
-    fi
-
-    echo "file: ${file}"
-    # push all to adb and test
-    adb_work_dir="/data/local/tmp"
-    skip_list="test_model_parser_lite"
-    for _test in $(cat $file); do
-        [[ $skip_list =~ (^|[[:space:]])$_test($|[[:space:]]) ]] && continue || echo 'skip $_test'
-        testpath=$(find ./paddle/fluid -name ${_test})
-        adb -s emulator-${port} push ${testpath} ${adb_work_dir}
-        adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${_test}"
-        adb -s emulator-${port} shell "./${adb_work_dir}/${_test}"
-    done
-}
-
 # Build the code and run lite server tests. This is executed in the CI system.
 function build_test_server {
    mkdir -p ./build
@@ -108,8 +79,34 @@ function build_test_server {
    build $LIBS_FILE
 }

-# Build the code and run lite server tests. This is executed in the CI system.
+# test_arm_android <some_test_name> <adb_port_number>
+function test_arm_android {  # pushes one test binary to emulator-<port> via adb and runs it there
+    test_name=$1
+    port=$2
+    if [[ "${test_name}x" == "x" ]]; then
+        echo "test_name can not be empty"
+        exit 1
+    fi
+    if [[ "${port}x" == "x" ]]; then
+        echo "Port can not be empty"
+        exit 1
+    fi
+
+    echo "test name: ${test_name}"
+    adb_work_dir="/data/local/tmp"
+    skip_list="test_model_parser_lite" # add more with space
+    if [[ $skip_list =~ (^|[[:space:]])$test_name($|[[:space:]]) ]]; then echo "skip $test_name"; return 0; fi  # return, not continue: this function has no loop
+    testpath=$(find ./paddle/fluid -name ${test_name})
+    adb -s emulator-${port} push ${testpath} ${adb_work_dir}
+    adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
+    adb -s emulator-${port} shell "${adb_work_dir}/${test_name}"  # adb_work_dir is already an absolute device path
+}
+
+# Build the code and run lite arm tests. This is executed in the CI system.
 function build_test_arm {
+    port_armv8=5554
+    port_armv7=5556
+
    adb kill-server
    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
    # start android arm64-v8a armeabi-v7a emulators first
@@ -122,6 +119,7 @@ function build_test_arm {

    for os in "android" "armlinux" ; do
        for abi in "arm64-v8a" "armeabi-v7a" "armeabi-v7a-hf" ; do
+            # TODO(TJ): enable compile on v7-hf on android and all v7 on armlinux
            if [[ ${abi} == "armeabi-v7a-hf" ]]; then
                echo "armeabi-v7a-hf is not supported on both android and armlinux"
                continue
@@ -138,17 +136,30 @@ function build_test_arm {
            cmake_arm ${os} ${abi}
            build $TESTS_FILE

+            # armlinux tests need to run in another docker
+            # TODO(TJ): enable test with armlinux
            if [[ ${os} == "android" ]]; then
                adb_abi=${abi}
                if [[ ${adb_abi} == "armeabi-v7a-hf" ]]; then
                    adb_abi="armeabi-v7a"
                fi
                if [[ ${adb_abi} == "armeabi-v7a" ]]; then
-                    # skip v7 tests
+                    # skip all armv7 tests
+                    # TODO(TJ): enable test with armv7
                    continue
                fi
-                test_lite_android $TESTS_FILE ${adb_abi}
-                # armlinux need in another docker
+                local port=
+                if [[ ${adb_abi} == "armeabi-v7a" ]]; then
+                    port=${port_armv7}
+                fi
+
+                if [[ ${adb_abi} == "arm64-v8a" ]]; then
+                    port=${port_armv8}
+                fi
+                echo "test file: ${TESTS_FILE}"
+                for _test in $(cat $TESTS_FILE); do
+                    test_arm_android $_test $port
+                done
            fi
            cd -
        done
@@ -164,12 +175,13 @@ function print_usage {
    echo "----------------------------------------"
    echo -e "cmake_x86: run cmake with X86 mode"
    echo -e "cmake_cuda: run cmake with CUDA mode"
-    echo -e "cmake_arm: run cmake with ARM mode"
+    echo -e "--arm_os=<os> --arm_abi=<abi> cmake_arm: run cmake with ARM mode"
    echo
    echo -e "build: compile the tests"
+    echo -e "--test_name=<test_name> build_single: compile single test"
    echo
    echo -e "test_server: run server tests"
-    echo -e "test_mobile: run mobile tests"
+    echo -e "--test_name=<test_name> --adb_port_number=<adb_port_number> test_arm_android: run arm test"
    echo "----------------------------------------"
    echo
 }
@@ -182,11 +194,31 @@ function main {
                TESTS_FILE="${i#*=}"
                shift
                ;;
+            --test_name=*)
+                TEST_NAME="${i#*=}"
+                shift
+                ;;
+            --arm_os=*)
+                ARM_OS="${i#*=}"
+                shift
+                ;;
+            --arm_abi=*)
+                ARM_ABI="${i#*=}"
+                shift
+                ;;
+            --arm_port=*)
+                ARM_PORT="${i#*=}"
+                shift
+                ;;
            build)
                build $TESTS_FILE
                build $LIBS_FILE
                shift
                ;;
+            build_single)
+                build_single $TEST_NAME
+                shift
+                ;;
            cmake_x86)
                cmake_x86
                shift
@@ -196,15 +228,15 @@ function main {
                shift
                ;;
            cmake_arm)
-                cmake_arm $2 $3
+                cmake_arm $ARM_OS $ARM_ABI
                shift
                ;;
            test_server)
                test_lite $TESTS_FILE
                shift
                ;;
-            test_mobile)
-                test_lite $TESTS_FILE
+            test_arm_android)
+                test_arm_android $TEST_NAME $ARM_PORT
                shift
                ;;
            build_test_server)
@@ -224,7 +256,5 @@ function main {
    done
 }

-print_usage
-
 main $@