diff --git a/src/common/enforce.h b/src/common/enforce.h
index 092afe0b794a91fd2fa5bc82095943f45872a333..1bacfb88d328c85de9b284249c8d9d58e7fc8e5e 100644
--- a/src/common/enforce.h
+++ b/src/common/enforce.h
@@ -63,7 +63,12 @@ struct PaddleMobileException : public std::exception {
 #else
 
 #define PADDLE_MOBILE_THROW_EXCEPTION(...)
-#define PADDLE_MOBILE_ENFORCE(stat, ...)
+#define PADDLE_MOBILE_ENFORCE(stat, ...) \
+  {                                      \
+    if (stat) {                          \
+    } else {                             \
+    }                                    \
+  }
 
 #endif
diff --git a/src/operators/gru_op.cpp b/src/operators/gru_op.cpp
index cb1b0b335b2f2eebcb6863c3f1ff8cff2c2fb74c..db0936d00c1dcb0e90e8664660f92ed004c258b7 100644
--- a/src/operators/gru_op.cpp
+++ b/src/operators/gru_op.cpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef GRU_OP
 
 #include "operators/gru_op.h"
-#include 
 #include 
 #include "common/enforce.h"
 
diff --git a/src/operators/kernel/arm/cast_kernel.cpp b/src/operators/kernel/arm/cast_kernel.cpp
index 8c4028b0d5b1a88c9962c029afb0b958ad9ee34e..ec7e133d2b8c1f88601ff6db2ad15b9e6d8040f7 100644
--- a/src/operators/kernel/arm/cast_kernel.cpp
+++ b/src/operators/kernel/arm/cast_kernel.cpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef CAST_OP
 
 #include 
-#include 
 #include 
 #include "framework/data_type.h"
 #include "operators/kernel/kernels.h"
diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
index b6288380a04c71b3d6467f7f6648db046ae9acc9..19561d6b84e82b96463abc553042aa56ae38036d 100644
--- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
@@ -26,18 +26,12 @@ namespace paddle_mobile {
 namespace operators {
 
 template <typename T>
-struct AddFunctor {
-  inline T operator()(T a, T b) const { return a + b; }
-};
-
-template <typename P>
-void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
-  const Tensor *input_x = param.InputX();
-  const Tensor *input_y = param.InputY();
-  Tensor *Out = param.Out();
-  Out->mutable_data<float>();
+inline void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
+  const framework::Tensor *input_x = param.InputX();
+  const framework::Tensor *input_y = param.InputY();
+  framework::Tensor *Out = param.Out();
   int axis = param.Axis();
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
   const auto &x_dims = input_x->dims();
   const auto &y_dims = input_y->dims();
   /// axis = -1 represent the last dimensions.
@@ -57,18 +51,20 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
   const float *bias_data = input_y->data<float>();
   const float *input_data = input_x->data<float>();
   float *output_data = Out->mutable_data<float>();
+
+  #pragma omp parallel for collapse(2)
   for (int i = 0; i < batch; ++i) {
-    #pragma omp parallel for
     for (int j = 0; j < channels; ++j) {
       size_t offset = (i * channels + j) * elementwise_num;
       const float *input = input_data + offset;
-      const float *bias = bias_data + j;
+      const float bias = bias_data[j];
       float *output = output_data + offset;
-
+      int remain = elementwise_num;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       int loop = elementwise_num >> 0x4;
-      int remain = elementwise_num & 0xF;
+      remain = elementwise_num & 0xF;
       for (int k = 0; k < loop; ++k) {
-        float32x4_t rb = vdupq_n_f32(*bias);
+        float32x4_t rb = vdupq_n_f32(bias);
         float32x4_t r0 = vld1q_f32(input);
         float32x4_t r1 = vld1q_f32(input + 4);
         float32x4_t r2 = vld1q_f32(input + 8);
@@ -84,15 +80,12 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
         input += 16;
         output += 16;
       }
+#endif
       for (int k = 0; k < remain; ++k) {
-        output[k] = input[k] + *bias;
+        output[k] = input[k] + bias;
       }
     }
   }
-#else
-  ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
-                                                 AddFunctor<float>(), Out);
-#endif
 }
 
 template class ElementwiseAddKernel<CPU, float>;
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index 2b87d0bb4e1eda086b37bb12a05797fe0f659506..24e74ffeaca6c0b27f04109721ffdab61d7fbaee 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 int main(int argc, char* argv[]) {
   if (argc < 2) {
     std::cout << "Usage: ./test_benchmark feed_shape [thread_num] [use_fuse]\n"
-              << "feed_shape: input tensor shape, such as 1,3,224,224.\n"
+              << "feed_shape: input tensor shape, such as 3,224,224.\n"
               << "thread_num: optional int, threads count, default is 1.\n"
               << "use_fuse: optional bool, default is 0.\n";
     return 1;
@@ -52,7 +52,7 @@ int main(int argc, char* argv[]) {
     sscanf(feed_shape, "%d,%d,%d", &dims[1], &dims[2], &dims[3]);
   }
   std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", "
-            << dims[2] << ", " << dims[3] << "]\n";
+            << dims[2] << ", " << dims[3] << "]" << std::endl;
   GetInput<float>(g_test_image_1x3x224x224, &input, dims);
   // warmup
   for (int i = 0; i < 10; ++i) {
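For reference, below is a minimal standalone sketch (not part of the patch) of the per-channel bias-broadcast add that the rewritten ElementwiseAddCompute performs: a NEON fast path over 16-float blocks, a scalar tail that doubles as the non-NEON fallback, and the loop pair collapsed for OpenMP. The function name BiasBroadcastAdd and the shapes in main() are illustrative, not taken from the repository; compile with -fopenmp to enable the parallel loop.

// Standalone sketch, assuming made-up shapes; batch/channels/elementwise_num
// mirror the variable names in the diff above.
#include <cstddef>
#include <cstdio>
#include <vector>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif

// Adds one bias value per channel across elementwise_num contiguous elements,
// i.e. out[n][c][i] = x[n][c][i] + y[c], the broadcast case the kernel handles.
void BiasBroadcastAdd(const float *input, const float *bias, float *output,
                      int batch, int channels, int elementwise_num) {
  #pragma omp parallel for collapse(2)
  for (int i = 0; i < batch; ++i) {
    for (int j = 0; j < channels; ++j) {
      size_t offset = (size_t)(i * channels + j) * elementwise_num;
      const float *in = input + offset;
      float *out = output + offset;
      const float b = bias[j];
      int remain = elementwise_num;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      int loop = elementwise_num >> 4;  // 16 floats per iteration
      remain = elementwise_num & 0xF;
      float32x4_t rb = vdupq_n_f32(b);  // broadcast the bias into all 4 lanes
      for (int k = 0; k < loop; ++k) {
        vst1q_f32(out, vaddq_f32(vld1q_f32(in), rb));
        vst1q_f32(out + 4, vaddq_f32(vld1q_f32(in + 4), rb));
        vst1q_f32(out + 8, vaddq_f32(vld1q_f32(in + 8), rb));
        vst1q_f32(out + 12, vaddq_f32(vld1q_f32(in + 12), rb));
        in += 16;
        out += 16;
      }
#endif
      // Scalar tail; on non-NEON builds this handles every element.
      for (int k = 0; k < remain; ++k) {
        out[k] = in[k] + b;
      }
    }
  }
}

int main() {
  const int batch = 1, channels = 2, elementwise_num = 5;
  std::vector<float> x(batch * channels * elementwise_num, 1.0f);
  std::vector<float> y = {10.0f, 20.0f};  // one bias per channel
  std::vector<float> out(x.size());
  BiasBroadcastAdd(x.data(), y.data(), out.data(), batch, channels,
                   elementwise_num);
  printf("%.1f %.1f\n", out[0], out[out.size() - 1]);  // expect 11.0 21.0
}

Hoisting the bias into a scalar (bias_data[j] instead of a pointer) and guarding only the vector body with the NEON macro, as the patch does, lets the same scalar tail serve both the remainder and the portable fallback, which is why the #else branch with ElementwiseComputeEx could be dropped.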