Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into dev-latest

7433193f · hjchen2 · 59fd4c58 · 19fe94ea · 7433193f · 7433193f
12 changed file
--- a/README.md
+++ b/README.md
@@ -26,61 +26,10 @@ Paddle-Mobile是PaddlePaddle组织下的项目，是一个致力于嵌入式平
 - **ARM CPU**
-|mobilenet arm v7|1线程|2线程|4线程|
-|------------|----|-----|-----|
-|麒麟970(ms)|108.180|63.935|37.545|
-|麒麟960(ms)|108.588|63.073|36.822|
-|高通845(ms)|85.952|48.890|28.641|
-|高通835(ms)|105.434|62.752|37.131|
-|||||
-|mobilenetssd arm v7|1线程|2线程|4线程|
-|麒麟970(ms)|212.686|127.205|77.485|
-|麒麟960(ms)|212.641|125.338|75.250|
-|高通845(ms)|182.863|95.671|56.857|
-|高通835(ms)|213.849|127.717|77.006|
-|||||
-|googlenet(v1) arm v7|1线程|2线程|4线程|
-|麒麟970(ms)|335.288|234.559|161.295|
-|麒麟960(ms)|354.443|232.642|157.815|
-|高通845(ms)|282.007|173.146|122.148|
-|高通835(ms)|341.250|233.354|158.554|
-|||||
-|squeezenet arm v7|1线程|2线程|4线程|
-|麒麟970(ms)|83.726|57.944|36.923|
-|麒麟960(ms)|85.835|55.762|36.496|
-|高通845(ms)|71.301|41.618|28.785|
-|高通835(ms)|82.407|56.176|36.455|
-|||||
-|yolo arm v7|1线程|2线程|4线程|
-|麒麟970(ms)|129.658|79.993|49.969|
-|麒麟960(ms)|130.208|78.791|48.390|
-|高通845(ms)|109.244|61.736|40.600|
-|高通835(ms)|130.402|80.863|50.359|
-    测试机型信息：
-    麒麟970:荣耀v10     (2.36GHz * 4 + 1.8GHz * 4)
-    麒麟960:华为mate9   (2.36GHz * 4 + 1.8GHz * 4)
-    骁龙835:小米6       (2.45GHz * 4 + 1.9GHz * 4)
-    骁龙845:OPPO FindX  (2.80GHz * 4 + 1.8GHz * 4)
 - **Mali GPU**
-    Mali GPU是百度和ARM合作开发的，双方团队近期都在致力于将paddle的op能无缝运行在ACL(arm compute library)。目前已经支持squeezenet，googlenet，resnet等几个网络模型，后续会继续加大力度。使全部移动端paddle op能高效运行在mali gpu上。 
 - **苹果设备的GPU Metal实现**
-|mobilenetfssd|速度|
-|------------|-----|
-|A9(ms)|33.78|
-|A10(ms)|24.05|
-|A11(ms)|17.15|
-|||
-|genet|速度|
-|A9(ms) |3.49|
-|A10(ms)|2.54|
-|A11(ms)|1.43|
 - **FPGA**
    目前已经支持 ZCU102 开发板。

--- a/doc/development_fpga.md
+++ b/doc/development_fpga.md
@@ -27,8 +27,9 @@ ___
 ## 准备模型和数据
 ___
 1. 模型文件放在./test/models/resnet50中。将[\_\_model\_\_](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)文件复制到此文件夹下。
-2. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。
+2. 如果不存在，则创建文件夹./test/models/resnet50 和 ./test/images。
-3. 将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到/test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG，分类标签为80， 对应着"black grouse".
+3. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。
+4. 将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到./test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG，分类标签为80， 对应着"black grouse"。
 ## 运行程序
 ___

--- a/doc/development_ios.md
+++ b/doc/development_ios.md
@@ -34,7 +34,7 @@ cd ../build/release/ios/build
 libpaddle-mobile.a
 /src/ios_io/ 下的
-PaddleMobile.h
+PaddleMobileCPU.h
 ```
 拖入工程

--- a/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h
+++ b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h
@@ -26,7 +26,7 @@ void PolygonBoxTransformCompute(const PolygonBoxTransformParam<CPU>& param) {
  const auto& input_dims = input->dims();
  const auto* input_data = input->data<float>();
  auto* output = param.Output();
-  auto* output_data = output->mutable_data<float>();
+  auto* output_data = output->mutable_data<float>(input_dims);
  int64_t batch_size = input_dims[0];
  int64_t geo_channel = input_dims[1];

--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -209,12 +209,18 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                      int32_t lda, int8_t *buffer);
  void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
                      int32_t ldb, int8_t *buffer);
+  void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
+                          int32_t lda, int8_t *buffer);
+  void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
+                          int32_t ldb, int8_t *buffer);
  // 8 bits int matrix product
  void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
             int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
             int32_t ldc, bool relu, int8_t *bias);
+  void Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
+                 int32_t lda, const int8_t *B, int32_t ldb, int8_t beta,
+                 int32_t *C, int32_t ldc, bool relu, int8_t *bias);
  // 8 bits int write back
  // C = alpha * A * B + beta * C
  void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,

--- a/src/operators/math/gemm_int8.cpp
+++ b/src/operators/math/gemm_int8.cpp
@@ -30,7 +30,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                     int32_t ldc) {
 #if __ARM_NEON
 #if __aarch64__
-// TODO
+// TODO(wzzju)
 #else
  const int8_t *a_ptr, *b_ptr;
  a_ptr = a;
@@ -246,7 +246,7 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                     int32_t ldc) {
 #if __ARM_NEON
 #if __aarch64__
-// TODO
+// TODO(wzzju)
 #else
  const int8_t *a_ptr, *b_ptr;
  a_ptr = a;
@@ -546,8 +546,12 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
 #pragma omp parallel for
  for (int32_t j = 0; j < nc; j += NR) {
    for (int32_t i = 0; i < mc; i += MR_INT8) {
+#if __aarch64__
+    // TODO(wzzju)
+#else
      //      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+#endif  // __aarch64__
    }
  }
  if (alpha != 1) {
@@ -682,7 +686,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
      const int8_t *b0 = &B(i, j);
 #if __ARM_NEON
 #if __aarch64__
-      // TODO
+      // TODO(wzzju)
 #else
      asm volatile(
          //          "pld        [%[b0]]                     \n\t"
@@ -791,7 +795,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
                      int32_t ldc) {
 #if __ARM_NEON
 #if __aarch64__
-// TODO
+// TODO(wzzju)
 #else
  int32_t nc1 = nc >> 4;
  int32_t _nc1 = nc & 15;

--- a/src/operators/math/gemm_omp_int8.cpp
+++ b/src/operators/math/gemm_omp_int8.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <string.h>
+#include "common/log.h"
+#include "memory/t_malloc.h"
+#include "operators/math/gemm.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+// 8 bits int matrix product (m*k x k*n)
+void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha,
+                     const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
+                     int8_t beta, int32_t *C, int32_t ldc, bool relu,
+                     int8_t *bias) {
+#ifdef _OPENMP
+  int32_t max_threads = omp_get_max_threads();
+#else
+  int32_t max_threads = 1;
+#endif
+  int32_t L1 = 64 / max_threads * 1024;
+  KC = k;
+  zero_int8 =
+      static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
+  memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
+  if (m > n) {
+    // 对 A 分块
+    MC = L1 / (KC * sizeof(int8_t));
+    if (MC == 0) {
+      MC = MR_INT8;
+    } else {
+      int32_t mblock_num = (m + MC - 1) / MC;
+      MC = (m + mblock_num - 1) / mblock_num;
+      MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+    }
+    // 补齐 B
+    NC = (n + NR - 1) / NR * NR;
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8);
+#endif
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
+  } else {
+    // 对 B 分块
+    NC = L1 / (KC * sizeof(int8_t));
+    if (NC == 0) {
+      NC = NR;
+    } else {
+      int32_t nblock_num = (n + NC - 1) / NC;
+      NC = (n + nblock_num - 1) / nblock_num;
+      NC = (NC + NR - 1) / NR * NR;
+    }
+    // 补齐 A
+    MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8);
+#endif
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
+  }
+  packedC_int8 = static_cast<int32_t *>(
+      paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
+  if (m > n) {
+#pragma omp parallel for
+    for (int32_t i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+      int32_t mc;
+      mc = s_min(m - i, MC);
+      int8_t *local_A = packedA_int8 + MC * KC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A);
+#endif
+      InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
+                          &C(i, 0), ldc, relu, bias + i);
+    }
+  } else {
+#pragma omp parallel for
+    for (int32_t j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+      int32_t nc;
+      nc = s_min(n - j, NC);
+      int8_t *local_B = packedB_int8 + KC * NC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+#endif
+      InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
+                          &C(0, j), ldc, relu, bias);
+    }
+  }
+  paddle_mobile::memory::Free(packedA_int8);
+  paddle_mobile::memory::Free(packedB_int8);
+  paddle_mobile::memory::Free(packedC_int8);
+  paddle_mobile::memory::Free(zero_int8);
+}
+void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
+                              const int8_t *B, int32_t ldb, int8_t *buffer) {
+  const int32_t j_length = n - n_tail;
+#pragma omp parallel for
+  for (int32_t j = 0; j < j_length; j += NR) {
+    int8_t *local_buffer = buffer + j * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j);
+#if __ARM_NEON
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      asm volatile(
+          //          "pld        [%[b0]]                     \n\t"
+          "vld1.s8    {d0},   [%[b0]]         \n\t"
+          "vst1.s8    {d0},   [%[local_buffer]]!    \n\t"
+          : [local_buffer] "+r"(local_buffer)
+          : [b0] "r"(b0)
+          : "memory", "q0");
+#endif  // __aarch64__
+#else
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+#endif  // __ARM_NEON
+    }
+  }
+  if (n_tail != 0) {
+    int8_t *local_buffer = buffer + j_length * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j_length);
+      for (int32_t j = j_length; j < n; ++j) {
+        *local_buffer++ = *b0++;
+      }
+      for (int32_t j = n; j < j_length + NR; ++j) {
+        *local_buffer++ = 0;
+      }
+    }
+  }
+}
+void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
+                              const int8_t *A, int32_t lda, int8_t *buffer) {
+  const int i_length = m - m_tail;
+#pragma omp parallel for
+  for (int32_t i = 0; i < i_length; i += MR_INT8) {
+    const int8_t *a0 = A + i * lda;
+    const int8_t *a1 = A + (i + 1) * lda;
+    const int8_t *a2 = A + (i + 2) * lda;
+    const int8_t *a3 = A + (i + 3) * lda;
+    int8_t *local_buffer = buffer + i * k;
+    for (int32_t j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+  if (m_tail != 0) {
+    const int8_t *a0 = &A(i_length, 0);
+    const int8_t *a1 = a0 + lda;
+    const int8_t *a2 = a0 + 2 * lda;
+    const int8_t *a3 = a0 + 3 * lda;
+    int8_t *local_buffer = buffer + i_length * k;
+    switch (m_tail) {
+      case 1:
+        a1 = zero_int8;
+      case 2:
+        a2 = zero_int8;
+      case 3:
+        a3 = zero_int8;
+        break;
+      default:
+        break;
+    }
+    for (int j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/math/math_function_int8.cpp
+++ b/src/operators/math/math_function_int8.cpp
@@ -51,12 +51,23 @@ void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
      }
    }
+#ifdef _OPENMP
+    gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
+                   matrix_out->data<int32_t>(), N, relu, bias);
+#else
    gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
               matrix_out->data<int32_t>(), N, relu, bias);
+#endif
  } else {
+#ifdef _OPENMP
+    gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
+                   matrix_b.data<int8_t>(), N, beta,
+                   matrix_out->data<int32_t>(), N, relu, bias);
+#else
    gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
               matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
               relu, bias);
+#endif
  }
 }
 }  // namespace math

--- a/test/common/test_gemm_int8_accuracy.cpp
+++ b/test/common/test_gemm_int8_accuracy.cpp
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "common/log.h"
 #include "memory/t_malloc.h"
 #include "operators/math/gemm.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
 #define a(i, j) a[(i)*lda + (j)]
 #define b(i, j) b[(i)*ldb + (j)]
@@ -84,8 +87,13 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
  }
  paddle_mobile::operators::math::Gemm gemm;
+#ifdef _OPENMP
+  gemm.Sgemm_omp(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
+                 static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#else
  gemm.Sgemm(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
             static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#endif
  int eq = 0;
  int neq = 0;
  for (int i = 0; i < m * n; ++i) {
@@ -119,12 +127,17 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
 }
 int main() {
-  do_sgemm(9, 9, 9, false, 10);
+#ifdef _OPENMP
+  omp_set_num_threads(8);
+#endif
+  do_sgemm(9, 9, 9, false, 1);
  do_sgemm(10, 6, 12, false, 0);
  do_sgemm(512, 256, 384, false, 0);
  do_sgemm(1366, 768, 256, false, 0);
  do_sgemm(1255, 755, 333, false, 0);
-  do_sgemm(555, 777, 999, false, 0);
+  do_sgemm(599, 1133, 393, false, 0);
+  do_sgemm(777, 555, 999, false, 0);
+  do_sgemm(333, 797, 939, false, 0);
  do_sgemm(1024, 1024, 1024, false, 0);
  return 0;

--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -28,7 +28,7 @@ limitations under the License. */
 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(1);
+  paddle_mobile.SetThreadNum(8);
  Tensor aa, bb, cc;
  auto aaptr = aa.mutable_data<float>({m, k});
  auto bbptr = bb.mutable_data<float>({k, n});

--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
@@ -93,6 +93,8 @@ int TestMulOP() {
 }  // namespace paddle_mobile
 int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(8);
  paddle_mobile::TestMulOP<int8_t, int32_t>();
  paddle_mobile::TestMulOP<float, float>();
  return 0;

--- a/tools/build.sh
+++ b/tools/build.sh
@@ -160,7 +160,7 @@ build_for_ios() {
    fi
    cd "${BUILD_DIR}"
    make -j 8
-    cp ../../../src/ios_io/PaddleMobile.h ./build/PaddleMobile.h
+    cp ../../../src/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
    cd ./build
    # 生成符号表
    ranlib *.a