diff --git a/README.md b/README.md index 91b4f886a31bb839a7e513185464260b1e95c453..69362734116fd8af78442a07dd31600aa46b7935 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Paddle-Moible是PaddlePaddle组织下的项目,是一个致力于嵌入式平 ![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg) arm cpu是paddle-mobile的主要支持方向,cpu的通用性一直是其优势。嵌入式深度学习,需要大量的cpu汇编实现。我们正在紧锣密鼓的编码,为的是能充分硬件的每一点加速能力。 - arm cpu的优化工作还在进行中,现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是120+ms,显然这不是我们的最终目标,我们正在用大量的汇编改写,后续性能仍会有巨大提升空间。 + arm cpu的优化工作还在进行中,现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是120+ms,显然这不是我们的最终目标,我们正在用大量的汇编改写,后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。 - **Mali GPU** diff --git a/src/io/executor.cpp b/src/io/executor.cpp index 510fc8d7db0d957b619e4dbeb25fc14b9768327c..480f48290cc1bbf4888832d76187a13a4915ec40 100644 --- a/src/io/executor.cpp +++ b/src/io/executor.cpp @@ -26,9 +26,6 @@ limitations under the License. */ #include "framework/program/var_desc.h" #include "framework/scope.h" #include "framework/tensor.h" -#ifdef _OPENMP -#include -#endif // _OPENMP #ifdef PADDLE_EXECUTOR_MULTITHREAD #include #include @@ -407,14 +404,6 @@ std::vector::Ptype> Executor::Predict( return result_vector; } -template -void Executor::SetThreadNum(int num) { -#ifdef _OPENMP - // omp_set_dynamic(0); - omp_set_num_threads(num); -#endif -} - template class Executor; template class Executor; template class Executor; diff --git a/src/io/executor.h b/src/io/executor.h index 28b0d65181355fd76e4ec09aa5964130aee2ab68..f8f2a8ad5657fdb3cf6cb249e32537bd5e866913 100644 --- a/src/io/executor.h +++ b/src/io/executor.h @@ -58,8 +58,6 @@ class Executor { std::vector Predict(const std::vector &input, const std::vector &dims); - void SetThreadNum(int num); - protected: Executor() = default; void InitMemory(); diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 3d5735f8da66db6f4b5f139f8261a4cd9cf0f796..cabdd799a0e7d561d8bc56c0913f1389c38f8907 100644 --- 
a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -16,6 +16,14 @@ limitations under the License. */ namespace paddle_mobile { +template +void PaddleMobile::SetThreadNum(int num) { +#ifdef _OPENMP + // omp_set_dynamic(0); + omp_set_num_threads(num); +#endif +} + template bool PaddleMobile::Load(const std::string &dirname, bool optimize, int batch_size) { @@ -81,7 +89,9 @@ PaddleMobile::~PaddleMobile() { } template class PaddleMobile; + template class PaddleMobile; + template class PaddleMobile; } // namespace paddle_mobile diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index 3ce39e0ae1ffc7e193f6f4308a911875fdf95076..74c11471566c3db8a37ea2d62e0496e5d40cb3b7 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -17,6 +17,9 @@ limitations under the License. */ #include #include #include +#ifdef _OPENMP +#include <omp.h> +#endif // _OPENMP #include "common/types.h" #include "framework/tensor.h" @@ -44,6 +47,7 @@ class PaddleMobile { * */ bool Load(const std::string &model_path, const std::string &para_path, bool optimize = false, int batch_size = 1); + void SetThreadNum(int num); /* * @b to predict
*/ #ifndef X86 #include #endif +#ifdef _OPENMP +#include <omp.h> +#endif namespace paddle_mobile { namespace operators { @@ -158,6 +161,7 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, // 分块矩阵乘法 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C, int ldc, bool relu) { +#pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); @@ -187,6 +191,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C, int ldc, bool relu, float *new_scale, float *new_bias) { +#pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index 1851f2668dee3a10e72b5dbeeadb9f51827a2729..2ab24736397c1e71350335561abbcabcba6e27a4 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -17,26 +17,21 @@ limitations under the License.
*/ #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); bool optimize = true; auto time1 = time(); - // auto program = loader.Load(g_googlenet, optimize); - auto program = loader.Load(g_googlenet_combine + "/model", - g_googlenet_combine + "/params", optimize); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; - paddle_mobile::Executor executor(program, 1, optimize); - executor.SetThreadNum(4); - std::vector input; - std::vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224, &input, dims); - auto time3 = time(); - int count = 1; - for (int i = 0; i < count; ++i) { - executor.Predict(input, dims); - } + if (paddle_mobile.Load(g_googlenet, optimize)) { + auto time2 = time(); + DLOG << "load cost :" << time_diff(time1, time2) << "ms"; + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224, &input, dims); + auto time3 = time(); + auto vec_result = paddle_mobile.Predict(input, dims); + auto time4 = time(); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n"; + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + } return 0; } diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 1f38dc5d19d0e7bb54faf75a41419941e8b1f412..2e285695fb79f3ed5471a653c71a10b36ef4e7f2 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -18,6 +18,7 @@ limitations under the License. */ int main() { paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); auto time1 = time(); if (paddle_mobile.Load(g_mobilenet, true)) { auto time2 = time();