提交 3047c4b4 编写于 作者: H hjchen2

Replace the OpenMP directive to optimize elementwise add when batch size > 1

上级 33e1e2dd
......@@ -63,7 +63,12 @@ struct PaddleMobileException : public std::exception {
#else
#define PADDLE_MOBILE_THROW_EXCEPTION(...)
#define PADDLE_MOBILE_ENFORCE(stat, ...)
// Definition of PADDLE_MOBILE_ENFORCE used in this `#else` branch.
// It still evaluates `stat` as an `if` condition, but both branches are empty
// and the variadic arguments are never expanded, so no action is taken
// whichever way the condition goes.
// NOTE(review): the condition guarding this `#else` is not visible here —
// presumably this is the build without exception support; confirm.
#define PADDLE_MOBILE_ENFORCE(stat, ...) \
{ \
if (stat) { \
} else { \
} \
}
#endif
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#ifdef GRU_OP
#include "operators/gru_op.h"
#include <iostream>
#include <vector>
#include "common/enforce.h"
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#ifdef CAST_OP
#include <algorithm>
#include <iostream>
#include <vector>
#include "framework/data_type.h"
#include "operators/kernel/kernels.h"
......
......@@ -26,18 +26,12 @@ namespace paddle_mobile {
namespace operators {
/// Binary function object that adds two values of type T.
template <typename T>
struct AddFunctor {
  /// Returns the result of `lhs + rhs`, converted to T.
  inline T operator()(T lhs, T rhs) const {
    return lhs + rhs;
  }
};
template <typename P>
void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
inline void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
const framework::Tensor *input_x = param.InputX();
const framework::Tensor *input_y = param.InputY();
framework::Tensor *Out = param.Out();
int axis = param.Axis();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
const auto &x_dims = input_x->dims();
const auto &y_dims = input_y->dims();
/// axis = -1 represent the last dimensions.
......@@ -57,18 +51,20 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
const float *bias_data = input_y->data<float>();
const float *input_data = input_x->data<float>();
float *output_data = Out->mutable_data<float>();
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
#pragma omp parallel for
for (int j = 0; j < channels; ++j) {
size_t offset = (i * channels + j) * elementwise_num;
const float *input = input_data + offset;
const float *bias = bias_data + j;
const float bias = bias_data[j];
float *output = output_data + offset;
int remain = elementwise_num;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = elementwise_num >> 0x4;
int remain = elementwise_num & 0xF;
remain = elementwise_num & 0xF;
for (int k = 0; k < loop; ++k) {
float32x4_t rb = vdupq_n_f32(*bias);
float32x4_t rb = vdupq_n_f32(bias);
float32x4_t r0 = vld1q_f32(input);
float32x4_t r1 = vld1q_f32(input + 4);
float32x4_t r2 = vld1q_f32(input + 8);
......@@ -84,15 +80,12 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
input += 16;
output += 16;
}
#endif
for (int k = 0; k < remain; ++k) {
output[k] = input[k] + *bias;
output[k] = input[k] + bias;
}
}
}
#else
ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
AddFunctor<float>(), Out);
#endif
}
template class ElementwiseAddKernel<CPU, float>;
......
......@@ -19,7 +19,7 @@ limitations under the License. */
int main(int argc, char* argv[]) {
if (argc < 2) {
std::cout << "Usage: ./test_benchmark feed_shape [thread_num] [use_fuse]\n"
<< "feed_shape: input tensor shape, such as 1,3,224,224.\n"
<< "feed_shape: input tensor shape, such as 3,224,224.\n"
<< "thread_num: optional int, threads count, default is 1.\n"
<< "use_fuse: optional bool, default is 0.\n";
return 1;
......@@ -52,7 +52,7 @@ int main(int argc, char* argv[]) {
sscanf(feed_shape, "%d,%d,%d", &dims[1], &dims[2], &dims[3]);
}
std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", "
<< dims[2] << ", " << dims[3] << "]\n";
<< dims[2] << ", " << dims[3] << "]" << std::endl;
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
// warmup
for (int i = 0; i < 10; ++i) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册