diff --git a/README.md b/README.md
index 1a478db3770e1f5e518594fd2fefabb686cf3c38..ee4e20513186979fe76c1259e7fc3ca962426843 100644
--- a/README.md
+++ b/README.md
@@ -26,61 +26,10 @@ Paddle-Mobile is a project under the PaddlePaddle organization, a deep-learning framework focused on embedded platforms
 
 - **ARM CPU**
 
-|mobilenet arm v7|1 thread|2 threads|4 threads|
-|------------|----|-----|-----|
-|Kirin 970 (ms)|108.180|63.935|37.545|
-|Kirin 960 (ms)|108.588|63.073|36.822|
-|Snapdragon 845 (ms)|85.952|48.890|28.641|
-|Snapdragon 835 (ms)|105.434|62.752|37.131|
-|||||
-|mobilenetssd arm v7|1 thread|2 threads|4 threads|
-|Kirin 970 (ms)|212.686|127.205|77.485|
-|Kirin 960 (ms)|212.641|125.338|75.250|
-|Snapdragon 845 (ms)|182.863|95.671|56.857|
-|Snapdragon 835 (ms)|213.849|127.717|77.006|
-|||||
-|googlenet(v1) arm v7|1 thread|2 threads|4 threads|
-|Kirin 970 (ms)|335.288|234.559|161.295|
-|Kirin 960 (ms)|354.443|232.642|157.815|
-|Snapdragon 845 (ms)|282.007|173.146|122.148|
-|Snapdragon 835 (ms)|341.250|233.354|158.554|
-|||||
-|squeezenet arm v7|1 thread|2 threads|4 threads|
-|Kirin 970 (ms)|83.726|57.944|36.923|
-|Kirin 960 (ms)|85.835|55.762|36.496|
-|Snapdragon 845 (ms)|71.301|41.618|28.785|
-|Snapdragon 835 (ms)|82.407|56.176|36.455|
-|||||
-|yolo arm v7|1 thread|2 threads|4 threads|
-|Kirin 970 (ms)|129.658|79.993|49.969|
-|Kirin 960 (ms)|130.208|78.791|48.390|
-|Snapdragon 845 (ms)|109.244|61.736|40.600|
-|Snapdragon 835 (ms)|130.402|80.863|50.359|
-
- Test devices:
- Kirin 970: Honor V10 (2.36 GHz * 4 + 1.8 GHz * 4)
- Kirin 960: Huawei Mate 9 (2.36 GHz * 4 + 1.8 GHz * 4)
- Snapdragon 835: Xiaomi Mi 6 (2.45 GHz * 4 + 1.9 GHz * 4)
- Snapdragon 845: OPPO Find X (2.80 GHz * 4 + 1.8 GHz * 4)
-
 - **Mali GPU**
 
- Mali GPU support is being developed jointly by Baidu and ARM; both teams have recently been working to run Paddle ops seamlessly on ACL (the ARM Compute Library). Several models, including squeezenet, googlenet, and resnet, are already supported, and this effort will keep expanding until all mobile Paddle ops run efficiently on Mali GPUs.
-
 - **GPU Metal implementation for Apple devices**
 
-|mobilenetfssd|speed|
-|------------|-----|
-|A9 (ms)|33.78|
-|A10 (ms)|24.05|
-|A11 (ms)|17.15|
-|||
-|genet|speed|
-|A9 (ms)|3.49|
-|A10 (ms)|2.54|
-|A11 (ms)|1.43|
-
 - **FPGA**
 
   The ZCU102 development board is currently supported.
diff --git a/doc/development_fpga.md b/doc/development_fpga.md
index 14cc57c6b4055e8c4e45d8b673eb1e3be22ae256..3389ddde676a5d1c7b452dc734880eb50170bd3e 100644
--- a/doc/development_fpga.md
+++ b/doc/development_fpga.md
@@ -27,8 +27,20 @@ ___
 ## Prepare the model and data
 ___
 1. Model files live in ./test/models/resnet50. Copy the [\_\_model\_\_](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz) file into this folder.
-2. Also download the model [weight file](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar) and, after unpacking, put it in ./test/models/resnet50 as well.
-3. Copy the data file [image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz) to /test/images. This file corresponds to ILSVRC2012_val_00000885.JPEG in the standard dataset; its class label is 80, i.e. "black grouse".
+2. If they do not exist, create the folders ./test/models/resnet50 and ./test/images.
+3. Also download the model [weight file](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar) and, after unpacking, put it in ./test/models/resnet50 as well.
+4. Copy the data file [image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz) to ./test/images. This file corresponds to ILSVRC2012_val_00000885.JPEG in the standard dataset; its class label is 80, i.e. "black grouse". A scripted version of these steps is sketched right below.
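+
+   For reference, steps 1-4 can be scripted roughly as follows (a minimal sketch only; it assumes the downloads sit in the current working directory and that the archives unpack to the file names used in the links above):
+
+   ```bash
+   mkdir -p ./test/models/resnet50 ./test/images
+   # files.tar.gz is assumed to bundle both __model__ and image_src_float
+   tar -xzf files.tar.gz
+   cp __model__ ./test/models/resnet50/
+   cp image_src_float ./test/images/
+   tar -xf resnet_50_model.tar -C ./test/models/resnet50/
+   ```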
 
 ## Run the program
 ___
diff --git a/doc/development_ios.md b/doc/development_ios.md
index 1d4f28bd5bcde1c3068ddeae87627ae6686d886a..1dbc7555e8ed6db94071c571673212d0ce2b7a71 100644
--- a/doc/development_ios.md
+++ b/doc/development_ios.md
@@ -34,7 +34,7 @@ cd ../build/release/ios/build
 
 libpaddle-mobile.a
 and, under /src/ios_io/,
-PaddleMobile.h
+PaddleMobileCPU.h
 ```
 then drag them into your project
diff --git a/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h
index 6db4297046fba8cbb8028f1c70d8214b703158b6..9cbac1035faf4cdc5109a08ea78dfafa8e1df7f2 100644
--- a/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h
+++ b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h
@@ -26,7 +26,7 @@ void PolygonBoxTransformCompute(const PolygonBoxTransformParam<CPU>& param) {
   const auto& input_dims = input->dims();
   const auto* input_data = input->data<float>();
   auto* output = param.Output();
-  auto* output_data = output->mutable_data<float>();
+  auto* output_data = output->mutable_data<float>(input_dims);
 
   int64_t batch_size = input_dims[0];
   int64_t geo_channel = input_dims[1];
diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h
index ea023bc134033aee6577ebf06c95f2a762d08bca..8498992fcecbcb2c9a773fba874e108c013a04fc 100644
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -209,12 +209,22 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                           int32_t lda, int8_t *buffer);
   void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
                       int32_t ldb, int8_t *buffer);
+  void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
+                          int32_t lda, int8_t *buffer);
+  void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
+                          int32_t ldb, int8_t *buffer);
 
   // 8 bits int matrix product
   void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
              int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
             int32_t ldc, bool relu, int8_t *bias);
-
+  void Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
+                 int32_t lda, const int8_t *B, int32_t ldb, int8_t beta,
+                 int32_t *C, int32_t ldc, bool relu, int8_t *bias);
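+  // Minimal usage sketch (assuming the same parameter contract as Sgemm
+  // above, with row-major A, B, C and lda = k, ldb = ldc = n):
+  //   Gemm gemm;
+  //   gemm.Sgemm_omp(m, n, k, 1, A, k, B, n, 0, C, n, false, nullptr);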
   // 8 bits int write back
   // C = alpha * A * B + beta * C
   void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
diff --git a/src/operators/math/gemm_int8.cpp b/src/operators/math/gemm_int8.cpp
index 5dd8a7c3131543f426f32e258efb3181be9b2f61..b16db7fe6acf0c3c7fb2902c9fb3f6e3dc81a65f 100644
--- a/src/operators/math/gemm_int8.cpp
+++ b/src/operators/math/gemm_int8.cpp
@@ -30,7 +30,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                      int32_t ldc) {
 #if __ARM_NEON
 #if __aarch64__
-// TODO
+// TODO(wzzju)
 #else
   const int8_t *a_ptr, *b_ptr;
   a_ptr = a;
@@ -246,7 +246,7 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                      int32_t ldc) {
 #if __ARM_NEON
 #if __aarch64__
-// TODO
+// TODO(wzzju)
 #else
   const int8_t *a_ptr, *b_ptr;
   a_ptr = a;
@@ -546,8 +546,12 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
 #pragma omp parallel for
   for (int32_t j = 0; j < nc; j += NR) {
     for (int32_t i = 0; i < mc; i += MR_INT8) {
+#if __aarch64__
+      // TODO(wzzju)
+#else
       // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
       AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+#endif  // __aarch64__
     }
   }
   if (alpha != 1) {
@@ -682,7 +686,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
       const int8_t *b0 = &B(i, j);
 #if __ARM_NEON
 #if __aarch64__
-  // TODO
+  // TODO(wzzju)
 #else
       asm volatile(
           //  "pld [%[b0]]                     \n\t"
@@ -791,7 +795,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
                       int32_t ldc) {
 #if __ARM_NEON
 #if __aarch64__
-// TODO
+// TODO(wzzju)
 #else
   int32_t nc1 = nc >> 4;
   int32_t _nc1 = nc & 15;
diff --git a/src/operators/math/gemm_omp_int8.cpp b/src/operators/math/gemm_omp_int8.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..21256cccfcc6dcc647f34a2129616b70804d398f
--- /dev/null
+++ b/src/operators/math/gemm_omp_int8.cpp
@@ -0,0 +1,243 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string.h>
+#include "common/log.h"
+#include "memory/t_malloc.h"
+#include "operators/math/gemm.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+// 8 bits int matrix product (m*k x k*n)
+void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha,
+                     const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
+                     int8_t beta, int32_t *C, int32_t ldc, bool relu,
+                     int8_t *bias) {
+#ifdef _OPENMP
+  int32_t max_threads = omp_get_max_threads();
+#else
+  int32_t max_threads = 1;
+#endif
+
+  int32_t L1 = 64 / max_threads * 1024;
+  KC = k;
+  zero_int8 =
+      static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
+  memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
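+  // Blocking heuristic: L1 is a per-thread working-set budget (64 KiB split
+  // across the threads), not the exact L1 size of any given core. When m > n,
+  // A is cut into MC x KC panels and B is packed once as a whole; otherwise
+  // B is cut into KC x NC panels. Panel sizes are rounded up to multiples of
+  // MR_INT8 / NR so they tile the int8 micro-kernel exactly.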
+  if (m > n) {
+    // block matrix A
+    MC = L1 / (KC * sizeof(int8_t));
+    if (MC == 0) {
+      MC = MR_INT8;
+    } else {
+      int32_t mblock_num = (m + MC - 1) / MC;
+      MC = (m + mblock_num - 1) / mblock_num;
+      MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+    }
+    // pad matrix B
+    NC = (n + NR - 1) / NR * NR;
+
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8);
+#endif
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
+  } else {
+    // block matrix B
+    NC = L1 / (KC * sizeof(int8_t));
+    if (NC == 0) {
+      NC = NR;
+    } else {
+      int32_t nblock_num = (n + NC - 1) / NC;
+      NC = (n + nblock_num - 1) / nblock_num;
+      NC = (NC + NR - 1) / NR * NR;
+    }
+    // pad matrix A
+    MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8);
+#endif
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
+  }
+  packedC_int8 = static_cast<int32_t *>(
+      paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
+
+  if (m > n) {
+#pragma omp parallel for
+    for (int32_t i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+
+      int32_t mc;
+      mc = s_min(m - i, MC);
+      int8_t *local_A = packedA_int8 + MC * KC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A);
+#endif
+      InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
+                          &C(i, 0), ldc, relu, bias + i);
+    }
+  } else {
+#pragma omp parallel for
+    for (int32_t j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+      int32_t nc;
+      nc = s_min(n - j, NC);
+      int8_t *local_B = packedB_int8 + KC * NC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+#endif
+      InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
+                          &C(0, j), ldc, relu, bias);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA_int8);
+  paddle_mobile::memory::Free(packedB_int8);
+  paddle_mobile::memory::Free(packedC_int8);
+  paddle_mobile::memory::Free(zero_int8);
+}
+
+void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
+                              const int8_t *B, int32_t ldb, int8_t *buffer) {
+  const int32_t j_length = n - n_tail;
+#pragma omp parallel for
+  for (int32_t j = 0; j < j_length; j += NR) {
+    int8_t *local_buffer = buffer + j * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j);
+#if __ARM_NEON
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      asm volatile(
+          //  "pld [%[b0]]                     \n\t"
+          "vld1.s8 {d0},   [%[b0]]         \n\t"
+          "vst1.s8 {d0},   [%[local_buffer]]!    \n\t"
+          : [local_buffer] "+r"(local_buffer)
+          : [b0] "r"(b0)
+          : "memory", "q0");
+#endif  // __aarch64__
+#else
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+#endif  // __ARM_NEON
+    }
+  }
+  if (n_tail != 0) {
+    int8_t *local_buffer = buffer + j_length * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j_length);
+      for (int32_t j = j_length; j < n; ++j) {
+        *local_buffer++ = *b0++;
+      }
+      for (int32_t j = n; j < j_length + NR; ++j) {
+        *local_buffer++ = 0;
+      }
+    }
+  }
+}
+
+void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
+                              const int8_t *A, int32_t lda, int8_t *buffer) {
+  const int i_length = m - m_tail;
+#pragma omp parallel for
+  for (int32_t i = 0; i < i_length; i += MR_INT8) {
+    const int8_t *a0 = A + i * lda;
+    const int8_t *a1 = A + (i + 1) * lda;
+    const int8_t *a2 = A + (i + 2) * lda;
+    const int8_t *a3 = A + (i + 3) * lda;
+    int8_t *local_buffer = buffer + i * k;
+    for (int32_t j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+
+  if (m_tail != 0) {
+    const int8_t *a0 = &A(i_length, 0);
+    const int8_t *a1 = a0 + lda;
+    const int8_t *a2 = a0 + 2 * lda;
+    const int8_t *a3 = a0 + 3 * lda;
+    int8_t *local_buffer = buffer + i_length * k;
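+    // Intentional switch fall-through below: for a tail of 1..3 rows, each
+    // missing row pointer is redirected to zero_int8, so the last packed
+    // panel is zero-padded up to the full MR_INT8 rows the kernel reads.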
+    switch (m_tail) {
+      case 1:
+        a1 = zero_int8;
+      case 2:
+        a2 = zero_int8;
+      case 3:
+        a3 = zero_int8;
+        break;
+      default:
+        break;
+    }
+    for (int j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/math/math_function_int8.cpp b/src/operators/math/math_function_int8.cpp
index 70677223d12ded2da07ab53bc371f1e8da9fe293..e02824b290ebc0080613e2ae2365626d79576c9e 100644
--- a/src/operators/math/math_function_int8.cpp
+++ b/src/operators/math/math_function_int8.cpp
@@ -51,12 +51,25 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
     }
   }
 
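+    // When built with OpenMP, route the int8 matmul through the
+    // multi-threaded Sgemm_omp; otherwise fall back to the
+    // single-threaded Sgemm.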
+#ifdef _OPENMP
+    gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
+                   matrix_out->data<int32_t>(), N, relu, bias);
+#else
     gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
                matrix_out->data<int32_t>(), N, relu, bias);
+#endif
   } else {
+#ifdef _OPENMP
+    gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
+                   matrix_b.data<int8_t>(), N, beta,
+                   matrix_out->data<int32_t>(), N, relu, bias);
+#else
     gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
                matrix_b.data<int8_t>(), N, beta,
                matrix_out->data<int32_t>(), N, relu, bias);
+#endif
   }
 }
 }  // namespace math
diff --git a/test/common/test_gemm_int8_accuracy.cpp b/test/common/test_gemm_int8_accuracy.cpp
index 80ddd40e121c81032c903955bd7116cf52695569..87f8d945648577ef1414417b57f4013d288dc043 100644
--- a/test/common/test_gemm_int8_accuracy.cpp
+++ b/test/common/test_gemm_int8_accuracy.cpp
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "common/log.h"
 #include "memory/t_malloc.h"
 #include "operators/math/gemm.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
 
 #define a(i, j) a[(i)*lda + (j)]
 #define b(i, j) b[(i)*ldb + (j)]
@@ -84,8 +87,13 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
   }
 
   paddle_mobile::operators::math::Gemm gemm;
+#ifdef _OPENMP
+  gemm.Sgemm_omp(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
+                 static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#else
   gemm.Sgemm(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
              static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#endif
   int eq = 0;
   int neq = 0;
   for (int i = 0; i < m * n; ++i) {
@@ -119,12 +127,17 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
 }
 
 int main() {
-  do_sgemm(9, 9, 9, false, 10);
+#ifdef _OPENMP
+  omp_set_num_threads(8);
+#endif
+  do_sgemm(9, 9, 9, false, 1);
   do_sgemm(10, 6, 12, false, 0);
   do_sgemm(512, 256, 384, false, 0);
   do_sgemm(1366, 768, 256, false, 0);
   do_sgemm(1255, 755, 333, false, 0);
-  do_sgemm(555, 777, 999, false, 0);
+  do_sgemm(599, 1133, 393, false, 0);
+  do_sgemm(777, 555, 999, false, 0);
+  do_sgemm(333, 797, 939, false, 0);
   do_sgemm(1024, 1024, 1024, false, 0);
 
   return 0;
diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp
index 89f0012ae8effaab383719c1b85748c24eb2bf73..14da4ba284b5ac7b0660bd15de871fdf5ed04cdd 100644
--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -28,7 +28,7 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(1);
+  paddle_mobile.SetThreadNum(8);
 
   Tensor aa, bb, cc;
   auto aaptr = aa.mutable_data<float>({m, k});
   auto bbptr = bb.mutable_data<float>({k, n});
diff --git a/test/operators/test_mul_op.cpp b/test/operators/test_mul_op.cpp
index 10dab2cda1b3c692f42cf8760eb2b48ae6451f39..262ee960e1c777d369d3b510eb31e5ed47b3493c 100644
--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
@@ -93,6 +93,8 @@ int TestMulOP() {
 }  // namespace paddle_mobile
 
 int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(8);
   paddle_mobile::TestMulOP<int8_t, int32_t>();
   paddle_mobile::TestMulOP<float, float>();
   return 0;
diff --git a/tools/build.sh b/tools/build.sh
index 1408822e46850752bcd448350fc483c25f70ae9a..c6554105718304c195bb4a3326c80947719033a0 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -160,7 +160,7 @@ build_for_ios() {
     fi
     cd "${BUILD_DIR}"
     make -j 8
-    cp ../../../src/ios_io/PaddleMobile.h ./build/PaddleMobile.h
+    cp ../../../src/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
     cd ./build
     # generate the symbol table
     ranlib *.a