提交 b13c947e 编写于 作者: W wangliu

accelerate with openmp

上级 cab2d143
......@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)
option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" OFF)
option(USE_OPENMP "openmp support" ON)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "io/executor.h"
#include <operators/math/gemm.h>
#include <algorithm>
#include <vector>
#include "common/enforce.h"
......@@ -25,6 +26,9 @@ limitations under the License. */
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#ifdef _OPENMP
#include <omp.h>
#endif // _OPENMP
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include <utility>
......@@ -403,6 +407,14 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
return result_vector;
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::SetThreadNum(int num) {
#ifdef _OPENMP
// omp_set_dynamic(0);
omp_set_num_threads(num);
#endif
}
template class Executor<CPU, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
......
......@@ -58,6 +58,8 @@ class Executor {
std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims);
void SetThreadNum(int num);
protected:
Executor() = default;
void InitMemory();
......
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/operator.h"
#include "operators/op_param.h"
......@@ -47,6 +49,7 @@ struct LRNFunctor {
std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);
for (int a = 0; a < N; a++) {
#pragma parallel for
for (int b = 0; b < C; b++) {
for (int index = start; index < end; index++) {
int channel = b + index;
......
此差异已折叠。
......@@ -15,10 +15,13 @@ limitations under the License. */
#ifdef POOL_OP
#pragma once
#ifdef _OPENMP
#include <omp.h>
#endif
#include <algorithm>
#include <vector>
#include "framework/tensor.h"
#ifdef __ARM_NEON
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
......
......@@ -14,10 +14,11 @@ limitations under the License. */
#ifdef POOL_OP
#include "operators/math/pooling.h"
#include <algorithm>
#include <vector>
#include "pooling.h"
#include "common/types.h"
#ifdef _OPENMP
#include <omp.h>
#endif
namespace paddle_mobile {
namespace operators {
......@@ -59,7 +60,7 @@ class PoolFunctor<CPU, PoolProcess, T> {
T *output_data = output->mutable_data<T>();
for (int i = 0; i < batch_size; i++) {
// #pragma omp parallel for
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
......
......@@ -26,16 +26,17 @@ int main() {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
executor.SetThreadNum(4);
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time();
for (int i = 0; i < 10; ++i) {
int count = 1;
for (int i = 0; i < count; ++i) {
executor.Predict(input, dims);
}
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n";
return 0;
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册