Unverified · Commit 4acdfa17 · authored by Ruilong Liu, committed by GitHub

Merge branch 'develop' into develop

@@ -30,7 +30,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms
 ![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg)
 The arm cpu is paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning requires extensive CPU assembly implementations, and we are coding at full speed to squeeze every bit of acceleration out of the hardware.
-Optimization of the arm cpu is still in progress; only conventional CPU optimizations are used so far. On an arm A73, paddle-mobile arm-v7 currently runs one pass of mobilenet1.0 on a single core in 120+ ms. This is clearly not our final goal: we are rewriting large portions in assembly, so there is still huge room for performance gains.
+Optimization of the arm cpu is still in progress; only conventional CPU optimizations are used so far. On an arm A73, paddle-mobile arm-v7 currently runs one pass of mobilenet1.0 on a single core in 120+ ms. This is clearly not our final goal: we are rewriting large portions in assembly, so there is still huge room for performance gains. Currently only armv7 is supported; we will also support armv8 in the future.
 - **Mali GPU**
......
@@ -26,9 +26,6 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
-#ifdef _OPENMP
-#include <omp.h>
-#endif  // _OPENMP
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include <queue>
 #include <utility>
@@ -407,14 +404,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
   return result_vector;
 }
-
-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::SetThreadNum(int num) {
-#ifdef _OPENMP
-  // omp_set_dynamic(0);
-  omp_set_num_threads(num);
-#endif
-}
 
 template class Executor<CPU, Precision::FP32>;
 template class Executor<FPGA, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;
......
@@ -58,8 +58,6 @@ class Executor {
   std::vector<Ptype> Predict(const std::vector<Ptype> &input,
                              const std::vector<int64_t> &dims);
-  void SetThreadNum(int num);
-
  protected:
   Executor() = default;
   void InitMemory();
......
@@ -16,6 +16,14 @@ limitations under the License. */
 
 namespace paddle_mobile {
 
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
+#ifdef _OPENMP
+  // omp_set_dynamic(0);
+  omp_set_num_threads(num);
+#endif
+}
+
 template <typename Dtype, Precision P>
 bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
                                   int batch_size) {
@@ -81,7 +89,9 @@ PaddleMobile<Dtype, P>::~PaddleMobile() {
 }
 
 template class PaddleMobile<CPU, Precision::FP32>;
 template class PaddleMobile<FPGA, Precision::FP32>;
 template class PaddleMobile<GPU_MALI, Precision::FP32>;
 
 }  // namespace paddle_mobile
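
The commented-out `omp_set_dynamic(0)` kept in the new `SetThreadNum` is worth a note: with dynamic adjustment enabled, an OpenMP runtime is allowed to give a parallel region fewer threads than requested, so uncommenting it would pin the team size exactly. A minimal standalone sketch of that behavior, using only standard OpenMP calls (illustrative, not code from this commit):

```cpp
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main() {
#ifdef _OPENMP
  omp_set_dynamic(0);      // forbid the runtime from shrinking the thread team
  omp_set_num_threads(4);  // request exactly 4 threads for later regions
#endif
#pragma omp parallel
  {
#ifdef _OPENMP
    // With OpenMP enabled, expect ids 0..3 here.
    std::printf("thread %d of %d\n", omp_get_thread_num(),
                omp_get_num_threads());
#else
    std::printf("built without OpenMP, running single-threaded\n");
#endif
  }
  return 0;
}
```

Build with `-fopenmp` (gcc/clang) to define `_OPENMP`; without it the pragma is ignored and the guarded calls compile out, which is exactly why the library wraps them in `#ifdef _OPENMP`.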
@@ -17,6 +17,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
 
 #include "common/types.h"
 #include "framework/tensor.h"
@@ -44,6 +47,7 @@ class PaddleMobile {
    * */
   bool Load(const std::string &model_path, const std::string &para_path,
             bool optimize = false, int batch_size = 1);
+  void SetThreadNum(int num);
 
   /*
    * @b to predict
......
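
With the declaration public on `PaddleMobile`, the thread count becomes part of the user-facing API. A hypothetical minimal caller (the header path and model directory are assumptions, not taken from this diff):

```cpp
#include <cstdint>
#include <vector>

#include "io.h"  // assumed include path for the PaddleMobile facade

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  // Ask for 4 worker threads before loading; a no-op if the
  // library was built without OpenMP support.
  paddle_mobile.SetThreadNum(4);
  std::vector<float> input(1 * 3 * 224 * 224, 0.0f);  // dummy image data
  std::vector<int64_t> dims{1, 3, 224, 224};
  if (paddle_mobile.Load("./models/googlenet", /*optimize=*/true)) {
    auto output = paddle_mobile.Predict(input, dims);
  }
  return 0;
}
```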
@@ -18,6 +18,9 @@ limitations under the License. */
 #ifndef X86
 #include <arm_neon.h>
 #endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 
 namespace paddle_mobile {
 namespace operators {
@@ -158,6 +161,7 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
 // blocked matrix multiplication
 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
                  float beta, float *c, float *C, int ldc, bool relu) {
+#pragma omp parallel for
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
@@ -187,6 +191,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
 void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
                        const float *b, float beta, float *c, float *C, int ldc,
                        bool relu, float *new_scale, float *new_bias) {
+#pragma omp parallel for
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
......
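
The two pragmas above parallelize only the outer loop over NR-wide column panels of the output, so each thread writes a disjoint set of columns and no locking is needed. A self-contained sketch of the same pattern on an unpacked row-major GEMM (tile sizes and memory layout are simplified assumptions, not the paddle-mobile kernel itself):

```cpp
// Illustrative panel widths; paddle-mobile derives MR/NR from its
// NEON micro-kernel, so these exact values are assumptions.
constexpr int MR = 4;
constexpr int NR = 4;

// C[m x n] += A[m x k] * B[k x n], row-major, blocked over MR x NR tiles.
void BlockedGemm(int m, int n, int k, const float *A, const float *B,
                 float *C) {
#pragma omp parallel for  // column panels are independent: disjoint C writes
  for (int j = 0; j < n; j += NR) {
    for (int i = 0; i < m; i += MR) {
      // Micro-kernel over one MR x NR tile (scalar stand-in for AddDot4x4).
      for (int jj = j; jj < j + NR && jj < n; ++jj) {
        for (int ii = i; ii < i + MR && ii < m; ++ii) {
          float acc = 0.0f;
          for (int p = 0; p < k; ++p) acc += A[ii * k + p] * B[p * n + jj];
          C[ii * n + jj] += acc;
        }
      }
    }
  }
}
```

Keeping the `i` loop and the micro-kernel serial inside each thread mirrors the diff's choice: column panels are coarse enough to amortize thread start-up, and each thread streams through its own slice of C with good cache locality.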
@@ -17,26 +17,21 @@ limitations under the License. */
 #include "../test_include.h"
 
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
   bool optimize = true;
   auto time1 = time();
-  // auto program = loader.Load(g_googlenet, optimize);
-  auto program = loader.Load(g_googlenet_combine + "/model",
-                             g_googlenet_combine + "/params", optimize);
-  auto time2 = time();
-  DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
-  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
-  executor.SetThreadNum(4);
-  std::vector<float> input;
-  std::vector<int64_t> dims{1, 3, 224, 224};
-  GetInput<float>(g_test_image_1x3x224x224, &input, dims);
-  auto time3 = time();
-  int count = 1;
-  for (int i = 0; i < count; ++i) {
-    executor.Predict(input, dims);
-  }
-  auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n";
+  if (paddle_mobile.Load(g_googlenet, optimize)) {
+    auto time2 = time();
+    DLOG << "load cost :" << time_diff(time1, time2) << "ms";
+    std::vector<float> input;
+    std::vector<int64_t> dims{1, 3, 224, 224};
+    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+    auto time3 = time();
+    auto vec_result = paddle_mobile.Predict(input, dims);
+    auto time4 = time();
+    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  }
   return 0;
 }
@@ -18,6 +18,7 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
   auto time1 = time();
   if (paddle_mobile.Load(g_mobilenet, true)) {
     auto time2 = time();
......
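
Both tests hard-code `SetThreadNum(4)`. On big.LITTLE SoCs such as the A73-class parts mentioned in the README, asking for more threads than there are big cores can hurt latency rather than help it; a hypothetical helper for choosing a safer default (not part of this commit):

```cpp
#include <algorithm>
#ifdef _OPENMP
#include <omp.h>
#endif

// Pick a conservative default thread count: at most `cap` threads and
// never more than the processors OpenMP reports. On big.LITTLE SoCs the
// big-core count (often 2 or 4) is usually the better ceiling.
int DefaultThreadNum(int cap = 4) {
#ifdef _OPENMP
  return std::max(1, std::min(cap, omp_get_num_procs()));
#else
  return 1;  // built without OpenMP: SetThreadNum is a no-op anyway
#endif
}
```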